1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or https://opensource.org/licenses/CDDL-1.0. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 23 /* 24 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 25 * Copyright (c) 2011, 2024 by Delphix. All rights reserved. 26 * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved. 27 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 28 * Copyright 2013 Saso Kiselkov. All rights reserved. 29 * Copyright (c) 2014 Integros [integros.com] 30 * Copyright 2016 Toomas Soome <tsoome@me.com> 31 * Copyright (c) 2016 Actifio, Inc. All rights reserved. 32 * Copyright 2018 Joyent, Inc. 33 * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. 34 * Copyright 2017 Joyent, Inc. 35 * Copyright (c) 2017, Intel Corporation. 36 * Copyright (c) 2021, Colm Buckley <colm@tuatha.org> 37 * Copyright (c) 2023 Hewlett Packard Enterprise Development LP. 38 * Copyright (c) 2023, 2024, Klara Inc. 39 */ 40 41 /* 42 * SPA: Storage Pool Allocator 43 * 44 * This file contains all the routines used when modifying on-disk SPA state. 45 * This includes opening, importing, destroying, exporting a pool, and syncing a 46 * pool. 
47 */ 48 49 #include <sys/zfs_context.h> 50 #include <sys/fm/fs/zfs.h> 51 #include <sys/spa_impl.h> 52 #include <sys/zio.h> 53 #include <sys/zio_checksum.h> 54 #include <sys/dmu.h> 55 #include <sys/dmu_tx.h> 56 #include <sys/zap.h> 57 #include <sys/zil.h> 58 #include <sys/brt.h> 59 #include <sys/ddt.h> 60 #include <sys/vdev_impl.h> 61 #include <sys/vdev_removal.h> 62 #include <sys/vdev_indirect_mapping.h> 63 #include <sys/vdev_indirect_births.h> 64 #include <sys/vdev_initialize.h> 65 #include <sys/vdev_rebuild.h> 66 #include <sys/vdev_trim.h> 67 #include <sys/vdev_disk.h> 68 #include <sys/vdev_raidz.h> 69 #include <sys/vdev_draid.h> 70 #include <sys/metaslab.h> 71 #include <sys/metaslab_impl.h> 72 #include <sys/mmp.h> 73 #include <sys/uberblock_impl.h> 74 #include <sys/txg.h> 75 #include <sys/avl.h> 76 #include <sys/bpobj.h> 77 #include <sys/dmu_traverse.h> 78 #include <sys/dmu_objset.h> 79 #include <sys/unique.h> 80 #include <sys/dsl_pool.h> 81 #include <sys/dsl_dataset.h> 82 #include <sys/dsl_dir.h> 83 #include <sys/dsl_prop.h> 84 #include <sys/dsl_synctask.h> 85 #include <sys/fs/zfs.h> 86 #include <sys/arc.h> 87 #include <sys/callb.h> 88 #include <sys/systeminfo.h> 89 #include <sys/zfs_ioctl.h> 90 #include <sys/dsl_scan.h> 91 #include <sys/zfeature.h> 92 #include <sys/dsl_destroy.h> 93 #include <sys/zvol.h> 94 95 #ifdef _KERNEL 96 #include <sys/fm/protocol.h> 97 #include <sys/fm/util.h> 98 #include <sys/callb.h> 99 #include <sys/zone.h> 100 #include <sys/vmsystm.h> 101 #endif /* _KERNEL */ 102 103 #include "zfs_crrd.h" 104 #include "zfs_prop.h" 105 #include "zfs_comutil.h" 106 #include <cityhash.h> 107 108 /* 109 * spa_thread() existed on Illumos as a parent thread for the various worker 110 * threads that actually run the pool, as a way to both reference the entire 111 * pool work as a single object, and to share properties like scheduling 112 * options. It has not yet been adapted to Linux or FreeBSD. 
This define is
 * used to mark related parts of the code to make things easier for the reader,
 * and to compile this code out. It can be removed when someone implements it,
 * moves it to some Illumos-specific place, or removes it entirely.
 */
#undef	HAVE_SPA_THREAD

/*
 * The "System Duty Cycle" scheduling class is an Illumos feature to help
 * prevent CPU-intensive kernel threads from affecting latency on interactive
 * threads. It doesn't exist on Linux or FreeBSD, so the supporting code is
 * gated behind a define. On Illumos SDC depends on spa_thread(), but
 * spa_thread() also has other uses, so this is a separate define.
 */
#undef	HAVE_SYSDC

/*
 * The interval, in seconds, at which failed configuration cache file writes
 * should be retried.
 */
int zfs_ccw_retry_interval = 300;

/*
 * Sizing policy for one slot of the zio_taskqs table; stored in
 * zio_taskq_info_t.zti_mode.
 */
typedef enum zti_modes {
	ZTI_MODE_FIXED,			/* value is # of threads (min 1) */
	ZTI_MODE_SCALE,			/* Taskqs scale with CPUs. */
	ZTI_MODE_SYNC,			/* sync thread assigned */
	ZTI_MODE_NULL,			/* don't create a taskq */
	ZTI_NMODES
} zti_modes_t;

/*
 * Brace initializers for zio_taskq_info_t, in field order
 * { zti_mode, zti_value, zti_count }.
 *
 * NOTE(review): ZTI_PCT() names ZTI_MODE_ONLINE_PERCENT, which is not a
 * member of zti_modes_t above, so any expansion of it would fail to compile.
 * It is not used in the visible part of this file -- confirm it is dead and
 * consider removing it.
 */
#define	ZTI_P(n, q)	{ ZTI_MODE_FIXED, (n), (q) }
#define	ZTI_PCT(n)	{ ZTI_MODE_ONLINE_PERCENT, (n), 1 }
#define	ZTI_SCALE(min)	{ ZTI_MODE_SCALE, (min), 1 }
#define	ZTI_SYNC	{ ZTI_MODE_SYNC, 0, 1 }
#define	ZTI_NULL	{ ZTI_MODE_NULL, 0, 0 }

/* Convenience forms: a single taskq with n threads, or with one thread. */
#define	ZTI_N(n)	ZTI_P(n, 1)
#define	ZTI_ONE		ZTI_N(1)

/* One cell of the zio_taskqs table. */
typedef struct zio_taskq_info {
	zti_modes_t zti_mode;	/* sizing policy, see zti_modes_t */
	uint_t zti_value;	/* mode-specific value (first macro argument) */
	uint_t zti_count;	/* number of taskqs (second ZTI_P() argument) */
} zio_taskq_info_t;

/*
 * Short name for each taskq type, in the same column order as the
 * zio_taskqs table below (issue, issue-high, interrupt, interrupt-high).
 */
static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"iss", "iss_h", "int", "int_h"
};

/*
 * This table defines the taskq settings for each ZFS I/O type. When
 * initializing a pool, we use this table to create an appropriately sized
 * taskq. Some operations are low volume and therefore have a small, static
 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
 * macros. Other operations process a large amount of data; the ZTI_SCALE
 * macro causes us to create a taskq oriented for throughput. Some operations
 * are so high frequency and short-lived that the taskq itself can become a
 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
 * additional degree of parallelism specified by the number of threads per-
 * taskq and the number of taskqs; when dispatching an event in this case, the
 * particular taskq is chosen at random. ZTI_SCALE uses a number of taskqs
 * that scales with the number of CPUs.
 *
 * The different taskq priorities are to handle the different contexts (issue
 * and interrupt) and then to reserve threads for high priority I/Os that
 * need to be handled with minimum delay. Illumos taskq has unfair TQ_FRONT
 * implementation, so separate high priority threads are used there.
 */
static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* NULL */
	{ ZTI_N(8),	ZTI_NULL,	ZTI_SCALE(0),	ZTI_NULL }, /* READ */
#ifdef illumos
	{ ZTI_SYNC,	ZTI_N(5),	ZTI_SCALE(0),	ZTI_N(5) }, /* WRITE */
#else
	{ ZTI_SYNC,	ZTI_NULL,	ZTI_SCALE(0),	ZTI_NULL }, /* WRITE */
#endif
	{ ZTI_SCALE(32), ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FREE */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* CLAIM */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FLUSH */
	{ ZTI_N(4),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* TRIM */
};

/* Forward declarations for functions defined later in this file. */
static void spa_sync_version(void *arg, dmu_tx_t *tx);
static void spa_sync_props(void *arg, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, spa_import_type_t type,
    const char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);

/*
 * Percentage of all CPUs that can be used by the metaslab preload taskq.
 */
static uint_t metaslab_preload_pct = 50;

static uint_t	zio_taskq_batch_pct = 80;	/* 1 thread per cpu in pset */
static uint_t	zio_taskq_batch_tpq;		/* threads per taskq */

#ifdef HAVE_SYSDC
static const boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
static const uint_t zio_taskq_basedc = 80;	/* base duty cycle */
#endif

#ifdef HAVE_SPA_THREAD
static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */
#endif

/*
 * NOTE(review): the name suggests a threads-per-taskq target for the write
 * taskqs; the consumer is outside the visible part of this file -- confirm.
 */
static uint_t zio_taskq_write_tpq = 16;

/*
 * Report any spa_load_verify errors found, but do not fail spa_load.
 * This is used by zdb to analyze non-idle pools.
 */
boolean_t spa_load_verify_dryrun = B_FALSE;

/*
 * Allow read spacemaps in case of readonly import (spa_mode == SPA_MODE_READ).
 * This is used by zdb for spacemaps verification.
*/
boolean_t spa_mode_readable_spacemaps = B_FALSE;

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

/*
 * For debugging purposes: print out vdev tree during pool import.
 */
static int spa_load_print_vdev_tree = B_FALSE;

/*
 * A non-zero value for zfs_max_missing_tvds means that we allow importing
 * pools with missing top-level vdevs. This is strictly intended for advanced
 * pool recovery cases since missing data is almost inevitable. Pools with
 * missing devices can only be imported read-only for safety reasons, and their
 * fail-mode will be automatically set to "continue".
 *
 * With 1 missing vdev we should be able to import the pool and mount all
 * datasets. User data that was not modified after the missing device has been
 * added should be recoverable. This means that snapshots created prior to the
 * addition of that device should be completely intact.
 *
 * With 2 missing vdevs, some datasets may fail to mount since there are
 * dataset statistics that are stored as regular metadata. Some data might be
 * recoverable if those vdevs were added recently.
 *
 * With 3 or more missing vdevs, the pool is severely damaged and MOS entries
 * may be missing entirely. Chances of data recovery are very low. Note that
 * there are also risks of performing an inadvertent rewind as we might be
 * missing all the vdevs with the latest uberblocks.
 */
uint64_t zfs_max_missing_tvds = 0;

/*
 * The parameters below are similar to zfs_max_missing_tvds but are only
 * intended for a preliminary open of the pool with an untrusted config which
 * might be incomplete or out-dated.
 *
 * We are more tolerant for pools opened from a cachefile since we could have
 * an out-dated cachefile where a device removal was not registered.
 * We could have set the limit arbitrarily high but in the case where devices
 * are really missing we would want to return the proper error codes; we chose
 * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available
 * and we get a chance to retrieve the trusted config.
 */
uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1;

/*
 * In the case where config was assembled by scanning device paths (/dev/dsk
 * by default) we are less tolerant since all the existing devices should have
 * been detected and we want spa_load to return the right error codes.
 */
uint64_t zfs_max_missing_tvds_scan = 0;

/*
 * Debugging aid that pauses spa_sync() towards the end.
 */
static const boolean_t zfs_pause_spa_sync = B_FALSE;

/*
 * Variables to indicate the livelist condense zthr func should wait at certain
 * points for the livelist to be removed - used to test condense/destroy races
 */
static int zfs_livelist_condense_zthr_pause = 0;
static int zfs_livelist_condense_sync_pause = 0;

/*
 * Variables to track whether or not condense cancellation has been
 * triggered in testing.
 */
static int zfs_livelist_condense_sync_cancel = 0;
static int zfs_livelist_condense_zthr_cancel = 0;

/*
 * Variable to track whether or not extra ALLOC blkptrs were added to a
 * livelist entry while it was being condensed (caused by the way we track
 * remapped blkptrs in dbuf_remap_impl)
 */
static int zfs_livelist_condense_new_alloc = 0;

/*
 * Time variable to decide how often the txg should be added into the
 * database (in seconds).
 * The smallest available resolution is in minutes, which means an update occurs
 * each time we reach `spa_note_txg_time` and the txg has changed. We provide
 * a 256-slot ring buffer for minute-level resolution. The number is limited by
 * the size of the structure we use and the maximum amount of bytes we can write
 * into ZAP. Setting `spa_note_txg_time` to 10 minutes results in approximately
 * 144 records per day. Given the 256 slots, this provides roughly 1.5 days of
 * high-resolution data.
 *
 * The user can decrease `spa_note_txg_time` to increase resolution within
 * a day, at the cost of retaining fewer days of data. Alternatively, increasing
 * the interval allows storing data over a longer period, but with lower
 * frequency.
 *
 * This parameter does not affect the daily or monthly databases, as those only
 * store one record per day and per month, respectively.
 */
static uint_t spa_note_txg_time = 10 * 60;

/*
 * How often to flush the txg database to disk (in seconds).
 * We flush data every time we write to it, making it the most reliable option.
 * Since this happens every 10 minutes, it shouldn't introduce any noticeable
 * overhead for the system. In case of failure, we will always have an
 * up-to-date version of the database.
 *
 * The user can adjust the flush interval to a lower value, but it probably
 * doesn't make sense to flush more often than the database is updated.
 * The user can also increase the interval if they're concerned about the
 * performance of writing the entire database to disk.
 */
static uint_t spa_flush_txg_time = 10 * 60;

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
357 */ 358 static void 359 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, const char *strval, 360 uint64_t intval, zprop_source_t src) 361 { 362 const char *propname = zpool_prop_to_name(prop); 363 nvlist_t *propval; 364 365 propval = fnvlist_alloc(); 366 fnvlist_add_uint64(propval, ZPROP_SOURCE, src); 367 368 if (strval != NULL) 369 fnvlist_add_string(propval, ZPROP_VALUE, strval); 370 else 371 fnvlist_add_uint64(propval, ZPROP_VALUE, intval); 372 373 fnvlist_add_nvlist(nvl, propname, propval); 374 nvlist_free(propval); 375 } 376 377 static int 378 spa_prop_add(spa_t *spa, const char *propname, nvlist_t *outnvl) 379 { 380 zpool_prop_t prop = zpool_name_to_prop(propname); 381 zprop_source_t src = ZPROP_SRC_NONE; 382 uint64_t intval; 383 int err; 384 385 /* 386 * NB: Not all properties lookups via this API require 387 * the spa props lock, so they must explicitly grab it here. 388 */ 389 switch (prop) { 390 case ZPOOL_PROP_DEDUPCACHED: 391 err = ddt_get_pool_dedup_cached(spa, &intval); 392 if (err != 0) 393 return (SET_ERROR(err)); 394 break; 395 default: 396 return (SET_ERROR(EINVAL)); 397 } 398 399 spa_prop_add_list(outnvl, prop, NULL, intval, src); 400 401 return (0); 402 } 403 404 int 405 spa_prop_get_nvlist(spa_t *spa, char **props, unsigned int n_props, 406 nvlist_t *outnvl) 407 { 408 int err = 0; 409 410 if (props == NULL) 411 return (0); 412 413 for (unsigned int i = 0; i < n_props && err == 0; i++) { 414 err = spa_prop_add(spa, props[i], outnvl); 415 } 416 417 return (err); 418 } 419 420 /* 421 * Add a user property (source=src, propname=propval) to an nvlist. 
422 */ 423 static void 424 spa_prop_add_user(nvlist_t *nvl, const char *propname, char *strval, 425 zprop_source_t src) 426 { 427 nvlist_t *propval; 428 429 VERIFY0(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP)); 430 VERIFY0(nvlist_add_uint64(propval, ZPROP_SOURCE, src)); 431 VERIFY0(nvlist_add_string(propval, ZPROP_VALUE, strval)); 432 VERIFY0(nvlist_add_nvlist(nvl, propname, propval)); 433 nvlist_free(propval); 434 } 435 436 /* 437 * Get property values from the spa configuration. 438 */ 439 static void 440 spa_prop_get_config(spa_t *spa, nvlist_t *nv) 441 { 442 vdev_t *rvd = spa->spa_root_vdev; 443 dsl_pool_t *pool = spa->spa_dsl_pool; 444 uint64_t size, alloc, cap, version; 445 const zprop_source_t src = ZPROP_SRC_NONE; 446 spa_config_dirent_t *dp; 447 metaslab_class_t *mc = spa_normal_class(spa); 448 449 ASSERT(MUTEX_HELD(&spa->spa_props_lock)); 450 451 if (rvd != NULL) { 452 alloc = metaslab_class_get_alloc(mc); 453 alloc += metaslab_class_get_alloc(spa_special_class(spa)); 454 alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); 455 alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa)); 456 alloc += metaslab_class_get_alloc( 457 spa_special_embedded_log_class(spa)); 458 459 size = metaslab_class_get_space(mc); 460 size += metaslab_class_get_space(spa_special_class(spa)); 461 size += metaslab_class_get_space(spa_dedup_class(spa)); 462 size += metaslab_class_get_space(spa_embedded_log_class(spa)); 463 size += metaslab_class_get_space( 464 spa_special_embedded_log_class(spa)); 465 466 spa_prop_add_list(nv, ZPOOL_PROP_NAME, spa_name(spa), 0, src); 467 spa_prop_add_list(nv, ZPOOL_PROP_SIZE, NULL, size, src); 468 spa_prop_add_list(nv, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); 469 spa_prop_add_list(nv, ZPOOL_PROP_FREE, NULL, 470 size - alloc, src); 471 spa_prop_add_list(nv, ZPOOL_PROP_CHECKPOINT, NULL, 472 spa->spa_checkpoint_info.sci_dspace, src); 473 474 spa_prop_add_list(nv, ZPOOL_PROP_FRAGMENTATION, NULL, 475 metaslab_class_fragmentation(mc), 
src); 476 spa_prop_add_list(nv, ZPOOL_PROP_EXPANDSZ, NULL, 477 metaslab_class_expandable_space(mc), src); 478 spa_prop_add_list(nv, ZPOOL_PROP_READONLY, NULL, 479 (spa_mode(spa) == SPA_MODE_READ), src); 480 481 cap = (size == 0) ? 0 : (alloc * 100 / size); 482 spa_prop_add_list(nv, ZPOOL_PROP_CAPACITY, NULL, cap, src); 483 484 spa_prop_add_list(nv, ZPOOL_PROP_DEDUPRATIO, NULL, 485 ddt_get_pool_dedup_ratio(spa), src); 486 spa_prop_add_list(nv, ZPOOL_PROP_BCLONEUSED, NULL, 487 brt_get_used(spa), src); 488 spa_prop_add_list(nv, ZPOOL_PROP_BCLONESAVED, NULL, 489 brt_get_saved(spa), src); 490 spa_prop_add_list(nv, ZPOOL_PROP_BCLONERATIO, NULL, 491 brt_get_ratio(spa), src); 492 493 spa_prop_add_list(nv, ZPOOL_PROP_DEDUP_TABLE_SIZE, NULL, 494 ddt_get_ddt_dsize(spa), src); 495 spa_prop_add_list(nv, ZPOOL_PROP_HEALTH, NULL, 496 rvd->vdev_state, src); 497 spa_prop_add_list(nv, ZPOOL_PROP_LAST_SCRUBBED_TXG, NULL, 498 spa_get_last_scrubbed_txg(spa), src); 499 500 version = spa_version(spa); 501 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) { 502 spa_prop_add_list(nv, ZPOOL_PROP_VERSION, NULL, 503 version, ZPROP_SRC_DEFAULT); 504 } else { 505 spa_prop_add_list(nv, ZPOOL_PROP_VERSION, NULL, 506 version, ZPROP_SRC_LOCAL); 507 } 508 spa_prop_add_list(nv, ZPOOL_PROP_LOAD_GUID, 509 NULL, spa_load_guid(spa), src); 510 } 511 512 if (pool != NULL) { 513 /* 514 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, 515 * when opening pools before this version freedir will be NULL. 
516 */ 517 if (pool->dp_free_dir != NULL) { 518 spa_prop_add_list(nv, ZPOOL_PROP_FREEING, NULL, 519 dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes, 520 src); 521 } else { 522 spa_prop_add_list(nv, ZPOOL_PROP_FREEING, 523 NULL, 0, src); 524 } 525 526 if (pool->dp_leak_dir != NULL) { 527 spa_prop_add_list(nv, ZPOOL_PROP_LEAKED, NULL, 528 dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes, 529 src); 530 } else { 531 spa_prop_add_list(nv, ZPOOL_PROP_LEAKED, 532 NULL, 0, src); 533 } 534 } 535 536 spa_prop_add_list(nv, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); 537 538 if (spa->spa_comment != NULL) { 539 spa_prop_add_list(nv, ZPOOL_PROP_COMMENT, spa->spa_comment, 540 0, ZPROP_SRC_LOCAL); 541 } 542 543 if (spa->spa_compatibility != NULL) { 544 spa_prop_add_list(nv, ZPOOL_PROP_COMPATIBILITY, 545 spa->spa_compatibility, 0, ZPROP_SRC_LOCAL); 546 } 547 548 if (spa->spa_root != NULL) 549 spa_prop_add_list(nv, ZPOOL_PROP_ALTROOT, spa->spa_root, 550 0, ZPROP_SRC_LOCAL); 551 552 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { 553 spa_prop_add_list(nv, ZPOOL_PROP_MAXBLOCKSIZE, NULL, 554 MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE); 555 } else { 556 spa_prop_add_list(nv, ZPOOL_PROP_MAXBLOCKSIZE, NULL, 557 SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); 558 } 559 560 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { 561 spa_prop_add_list(nv, ZPOOL_PROP_MAXDNODESIZE, NULL, 562 DNODE_MAX_SIZE, ZPROP_SRC_NONE); 563 } else { 564 spa_prop_add_list(nv, ZPOOL_PROP_MAXDNODESIZE, NULL, 565 DNODE_MIN_SIZE, ZPROP_SRC_NONE); 566 } 567 568 if ((dp = list_head(&spa->spa_config_list)) != NULL) { 569 if (dp->scd_path == NULL) { 570 spa_prop_add_list(nv, ZPOOL_PROP_CACHEFILE, 571 "none", 0, ZPROP_SRC_LOCAL); 572 } else if (strcmp(dp->scd_path, spa_config_path) != 0) { 573 spa_prop_add_list(nv, ZPOOL_PROP_CACHEFILE, 574 dp->scd_path, 0, ZPROP_SRC_LOCAL); 575 } 576 } 577 } 578 579 /* 580 * Get zpool property values. 
581 */ 582 int 583 spa_prop_get(spa_t *spa, nvlist_t *nv) 584 { 585 objset_t *mos = spa->spa_meta_objset; 586 zap_cursor_t zc; 587 zap_attribute_t *za; 588 dsl_pool_t *dp; 589 int err = 0; 590 591 dp = spa_get_dsl(spa); 592 dsl_pool_config_enter(dp, FTAG); 593 za = zap_attribute_alloc(); 594 mutex_enter(&spa->spa_props_lock); 595 596 /* 597 * Get properties from the spa config. 598 */ 599 spa_prop_get_config(spa, nv); 600 601 /* If no pool property object, no more prop to get. */ 602 if (mos == NULL || spa->spa_pool_props_object == 0) 603 goto out; 604 605 /* 606 * Get properties from the MOS pool property object. 607 */ 608 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 609 (err = zap_cursor_retrieve(&zc, za)) == 0; 610 zap_cursor_advance(&zc)) { 611 uint64_t intval = 0; 612 char *strval = NULL; 613 zprop_source_t src = ZPROP_SRC_DEFAULT; 614 zpool_prop_t prop; 615 616 if ((prop = zpool_name_to_prop(za->za_name)) == 617 ZPOOL_PROP_INVAL && !zfs_prop_user(za->za_name)) 618 continue; 619 620 switch (za->za_integer_length) { 621 case 8: 622 /* integer property */ 623 if (za->za_first_integer != 624 zpool_prop_default_numeric(prop)) 625 src = ZPROP_SRC_LOCAL; 626 627 if (prop == ZPOOL_PROP_BOOTFS) { 628 dsl_dataset_t *ds = NULL; 629 630 err = dsl_dataset_hold_obj(dp, 631 za->za_first_integer, FTAG, &ds); 632 if (err != 0) 633 break; 634 635 strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, 636 KM_SLEEP); 637 dsl_dataset_name(ds, strval); 638 dsl_dataset_rele(ds, FTAG); 639 } else { 640 strval = NULL; 641 intval = za->za_first_integer; 642 } 643 644 spa_prop_add_list(nv, prop, strval, intval, src); 645 646 if (strval != NULL) 647 kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN); 648 649 break; 650 651 case 1: 652 /* string property */ 653 strval = kmem_alloc(za->za_num_integers, KM_SLEEP); 654 err = zap_lookup(mos, spa->spa_pool_props_object, 655 za->za_name, 1, za->za_num_integers, strval); 656 if (err) { 657 kmem_free(strval, za->za_num_integers); 658 break; 659 
} 660 if (prop != ZPOOL_PROP_INVAL) { 661 spa_prop_add_list(nv, prop, strval, 0, src); 662 } else { 663 src = ZPROP_SRC_LOCAL; 664 spa_prop_add_user(nv, za->za_name, strval, 665 src); 666 } 667 kmem_free(strval, za->za_num_integers); 668 break; 669 670 default: 671 break; 672 } 673 } 674 zap_cursor_fini(&zc); 675 out: 676 mutex_exit(&spa->spa_props_lock); 677 dsl_pool_config_exit(dp, FTAG); 678 zap_attribute_free(za); 679 680 if (err && err != ENOENT) 681 return (err); 682 683 return (0); 684 } 685 686 /* 687 * Validate the given pool properties nvlist and modify the list 688 * for the property values to be set. 689 */ 690 static int 691 spa_prop_validate(spa_t *spa, nvlist_t *props) 692 { 693 nvpair_t *elem; 694 int error = 0, reset_bootfs = 0; 695 uint64_t objnum = 0; 696 boolean_t has_feature = B_FALSE; 697 698 elem = NULL; 699 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 700 uint64_t intval; 701 const char *strval, *slash, *check, *fname; 702 const char *propname = nvpair_name(elem); 703 zpool_prop_t prop = zpool_name_to_prop(propname); 704 705 switch (prop) { 706 case ZPOOL_PROP_INVAL: 707 /* 708 * Sanitize the input. 
709 */ 710 if (zfs_prop_user(propname)) { 711 if (strlen(propname) >= ZAP_MAXNAMELEN) { 712 error = SET_ERROR(ENAMETOOLONG); 713 break; 714 } 715 716 if (strlen(fnvpair_value_string(elem)) >= 717 ZAP_MAXVALUELEN) { 718 error = SET_ERROR(E2BIG); 719 break; 720 } 721 } else if (zpool_prop_feature(propname)) { 722 if (nvpair_type(elem) != DATA_TYPE_UINT64) { 723 error = SET_ERROR(EINVAL); 724 break; 725 } 726 727 if (nvpair_value_uint64(elem, &intval) != 0) { 728 error = SET_ERROR(EINVAL); 729 break; 730 } 731 732 if (intval != 0) { 733 error = SET_ERROR(EINVAL); 734 break; 735 } 736 737 fname = strchr(propname, '@') + 1; 738 if (zfeature_lookup_name(fname, NULL) != 0) { 739 error = SET_ERROR(EINVAL); 740 break; 741 } 742 743 has_feature = B_TRUE; 744 } else { 745 error = SET_ERROR(EINVAL); 746 break; 747 } 748 break; 749 750 case ZPOOL_PROP_VERSION: 751 error = nvpair_value_uint64(elem, &intval); 752 if (!error && 753 (intval < spa_version(spa) || 754 intval > SPA_VERSION_BEFORE_FEATURES || 755 has_feature)) 756 error = SET_ERROR(EINVAL); 757 break; 758 759 case ZPOOL_PROP_DEDUP_TABLE_QUOTA: 760 error = nvpair_value_uint64(elem, &intval); 761 break; 762 763 case ZPOOL_PROP_DELEGATION: 764 case ZPOOL_PROP_AUTOREPLACE: 765 case ZPOOL_PROP_LISTSNAPS: 766 case ZPOOL_PROP_AUTOEXPAND: 767 case ZPOOL_PROP_AUTOTRIM: 768 error = nvpair_value_uint64(elem, &intval); 769 if (!error && intval > 1) 770 error = SET_ERROR(EINVAL); 771 break; 772 773 case ZPOOL_PROP_MULTIHOST: 774 error = nvpair_value_uint64(elem, &intval); 775 if (!error && intval > 1) 776 error = SET_ERROR(EINVAL); 777 778 if (!error) { 779 uint32_t hostid = zone_get_hostid(NULL); 780 if (hostid) 781 spa->spa_hostid = hostid; 782 else 783 error = SET_ERROR(ENOTSUP); 784 } 785 786 break; 787 788 case ZPOOL_PROP_BOOTFS: 789 /* 790 * If the pool version is less than SPA_VERSION_BOOTFS, 791 * or the pool is still being created (version == 0), 792 * the bootfs property cannot be set. 
793 */ 794 if (spa_version(spa) < SPA_VERSION_BOOTFS) { 795 error = SET_ERROR(ENOTSUP); 796 break; 797 } 798 799 /* 800 * Make sure the vdev config is bootable 801 */ 802 if (!vdev_is_bootable(spa->spa_root_vdev)) { 803 error = SET_ERROR(ENOTSUP); 804 break; 805 } 806 807 reset_bootfs = 1; 808 809 error = nvpair_value_string(elem, &strval); 810 811 if (!error) { 812 objset_t *os; 813 814 if (strval == NULL || strval[0] == '\0') { 815 objnum = zpool_prop_default_numeric( 816 ZPOOL_PROP_BOOTFS); 817 break; 818 } 819 820 error = dmu_objset_hold(strval, FTAG, &os); 821 if (error != 0) 822 break; 823 824 /* Must be ZPL. */ 825 if (dmu_objset_type(os) != DMU_OST_ZFS) { 826 error = SET_ERROR(ENOTSUP); 827 } else { 828 objnum = dmu_objset_id(os); 829 } 830 dmu_objset_rele(os, FTAG); 831 } 832 break; 833 834 case ZPOOL_PROP_FAILUREMODE: 835 error = nvpair_value_uint64(elem, &intval); 836 if (!error && intval > ZIO_FAILURE_MODE_PANIC) 837 error = SET_ERROR(EINVAL); 838 839 /* 840 * This is a special case which only occurs when 841 * the pool has completely failed. This allows 842 * the user to change the in-core failmode property 843 * without syncing it out to disk (I/Os might 844 * currently be blocked). We do this by returning 845 * EIO to the caller (spa_prop_set) to trick it 846 * into thinking we encountered a property validation 847 * error. 
848 */ 849 if (!error && spa_suspended(spa)) { 850 spa->spa_failmode = intval; 851 error = SET_ERROR(EIO); 852 } 853 break; 854 855 case ZPOOL_PROP_CACHEFILE: 856 if ((error = nvpair_value_string(elem, &strval)) != 0) 857 break; 858 859 if (strval[0] == '\0') 860 break; 861 862 if (strcmp(strval, "none") == 0) 863 break; 864 865 if (strval[0] != '/') { 866 error = SET_ERROR(EINVAL); 867 break; 868 } 869 870 slash = strrchr(strval, '/'); 871 ASSERT(slash != NULL); 872 873 if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 874 strcmp(slash, "/..") == 0) 875 error = SET_ERROR(EINVAL); 876 break; 877 878 case ZPOOL_PROP_COMMENT: 879 if ((error = nvpair_value_string(elem, &strval)) != 0) 880 break; 881 for (check = strval; *check != '\0'; check++) { 882 if (!isprint(*check)) { 883 error = SET_ERROR(EINVAL); 884 break; 885 } 886 } 887 if (strlen(strval) > ZPROP_MAX_COMMENT) 888 error = SET_ERROR(E2BIG); 889 break; 890 891 default: 892 break; 893 } 894 895 if (error) 896 break; 897 } 898 899 (void) nvlist_remove_all(props, 900 zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO)); 901 902 if (!error && reset_bootfs) { 903 error = nvlist_remove(props, 904 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 905 906 if (!error) { 907 error = nvlist_add_uint64(props, 908 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 909 } 910 } 911 912 return (error); 913 } 914 915 void 916 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) 917 { 918 const char *cachefile; 919 spa_config_dirent_t *dp; 920 921 if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), 922 &cachefile) != 0) 923 return; 924 925 dp = kmem_alloc(sizeof (spa_config_dirent_t), 926 KM_SLEEP); 927 928 if (cachefile[0] == '\0') 929 dp->scd_path = spa_strdup(spa_config_path); 930 else if (strcmp(cachefile, "none") == 0) 931 dp->scd_path = NULL; 932 else 933 dp->scd_path = spa_strdup(cachefile); 934 935 list_insert_head(&spa->spa_config_list, dp); 936 if (need_sync) 937 spa_async_request(spa, 
SPA_ASYNC_CONFIG_UPDATE); 938 } 939 940 int 941 spa_prop_set(spa_t *spa, nvlist_t *nvp) 942 { 943 int error; 944 nvpair_t *elem = NULL; 945 boolean_t need_sync = B_FALSE; 946 947 if ((error = spa_prop_validate(spa, nvp)) != 0) 948 return (error); 949 950 while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { 951 zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); 952 953 if (prop == ZPOOL_PROP_CACHEFILE || 954 prop == ZPOOL_PROP_ALTROOT || 955 prop == ZPOOL_PROP_READONLY) 956 continue; 957 958 if (prop == ZPOOL_PROP_INVAL && 959 zfs_prop_user(nvpair_name(elem))) { 960 need_sync = B_TRUE; 961 break; 962 } 963 964 if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) { 965 uint64_t ver = 0; 966 967 if (prop == ZPOOL_PROP_VERSION) { 968 VERIFY0(nvpair_value_uint64(elem, &ver)); 969 } else { 970 ASSERT(zpool_prop_feature(nvpair_name(elem))); 971 ver = SPA_VERSION_FEATURES; 972 need_sync = B_TRUE; 973 } 974 975 /* Save time if the version is already set. */ 976 if (ver == spa_version(spa)) 977 continue; 978 979 /* 980 * In addition to the pool directory object, we might 981 * create the pool properties object, the features for 982 * read object, the features for write object, or the 983 * feature descriptions object. 984 */ 985 error = dsl_sync_task(spa->spa_name, NULL, 986 spa_sync_version, &ver, 987 6, ZFS_SPACE_CHECK_RESERVED); 988 if (error) 989 return (error); 990 continue; 991 } 992 993 need_sync = B_TRUE; 994 break; 995 } 996 997 if (need_sync) { 998 return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, 999 nvp, 6, ZFS_SPACE_CHECK_RESERVED)); 1000 } 1001 1002 return (0); 1003 } 1004 1005 /* 1006 * If the bootfs property value is dsobj, clear it. 
1007 */ 1008 void 1009 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 1010 { 1011 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 1012 VERIFY(zap_remove(spa->spa_meta_objset, 1013 spa->spa_pool_props_object, 1014 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 1015 spa->spa_bootfs = 0; 1016 } 1017 } 1018 1019 static int 1020 spa_change_guid_check(void *arg, dmu_tx_t *tx) 1021 { 1022 uint64_t *newguid __maybe_unused = arg; 1023 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 1024 vdev_t *rvd = spa->spa_root_vdev; 1025 uint64_t vdev_state; 1026 1027 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 1028 int error = (spa_has_checkpoint(spa)) ? 1029 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 1030 return (SET_ERROR(error)); 1031 } 1032 1033 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 1034 vdev_state = rvd->vdev_state; 1035 spa_config_exit(spa, SCL_STATE, FTAG); 1036 1037 if (vdev_state != VDEV_STATE_HEALTHY) 1038 return (SET_ERROR(ENXIO)); 1039 1040 ASSERT3U(spa_guid(spa), !=, *newguid); 1041 1042 return (0); 1043 } 1044 1045 static void 1046 spa_change_guid_sync(void *arg, dmu_tx_t *tx) 1047 { 1048 uint64_t *newguid = arg; 1049 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 1050 uint64_t oldguid; 1051 vdev_t *rvd = spa->spa_root_vdev; 1052 1053 oldguid = spa_guid(spa); 1054 1055 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 1056 rvd->vdev_guid = *newguid; 1057 rvd->vdev_guid_sum += (*newguid - oldguid); 1058 vdev_config_dirty(rvd); 1059 spa_config_exit(spa, SCL_STATE, FTAG); 1060 1061 spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", 1062 (u_longlong_t)oldguid, (u_longlong_t)*newguid); 1063 } 1064 1065 /* 1066 * Change the GUID for the pool. This is done so that we can later 1067 * re-import a pool built from a clone of our own vdevs. We will modify 1068 * the root vdev's guid, our own pool guid, and then mark all of our 1069 * vdevs dirty. 
Note that we must make sure that all our vdevs are 1070 * online when we do this, or else any vdevs that weren't present 1071 * would be orphaned from our pool. We are also going to issue a 1072 * sysevent to update any watchers. 1073 * 1074 * The GUID of the pool will be changed to the value pointed to by guidp. 1075 * The GUID may not be set to the reserverd value of 0. 1076 * The new GUID will be generated if guidp is NULL. 1077 */ 1078 int 1079 spa_change_guid(spa_t *spa, const uint64_t *guidp) 1080 { 1081 uint64_t guid; 1082 int error; 1083 1084 mutex_enter(&spa->spa_vdev_top_lock); 1085 spa_namespace_enter(FTAG); 1086 1087 if (guidp != NULL) { 1088 guid = *guidp; 1089 if (guid == 0) { 1090 error = SET_ERROR(EINVAL); 1091 goto out; 1092 } 1093 1094 if (spa_guid_exists(guid, 0)) { 1095 error = SET_ERROR(EEXIST); 1096 goto out; 1097 } 1098 } else { 1099 guid = spa_generate_guid(NULL); 1100 } 1101 1102 error = dsl_sync_task(spa->spa_name, spa_change_guid_check, 1103 spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); 1104 1105 if (error == 0) { 1106 /* 1107 * Clear the kobj flag from all the vdevs to allow 1108 * vdev_cache_process_kobj_evt() to post events to all the 1109 * vdevs since GUID is updated. 
1110 */ 1111 vdev_clear_kobj_evt(spa->spa_root_vdev); 1112 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 1113 vdev_clear_kobj_evt(spa->spa_l2cache.sav_vdevs[i]); 1114 1115 spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); 1116 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); 1117 } 1118 1119 out: 1120 spa_namespace_exit(FTAG); 1121 mutex_exit(&spa->spa_vdev_top_lock); 1122 1123 return (error); 1124 } 1125 1126 /* 1127 * ========================================================================== 1128 * SPA state manipulation (open/create/destroy/import/export) 1129 * ========================================================================== 1130 */ 1131 1132 static int 1133 spa_error_entry_compare(const void *a, const void *b) 1134 { 1135 const spa_error_entry_t *sa = (const spa_error_entry_t *)a; 1136 const spa_error_entry_t *sb = (const spa_error_entry_t *)b; 1137 int ret; 1138 1139 ret = memcmp(&sa->se_bookmark, &sb->se_bookmark, 1140 sizeof (zbookmark_phys_t)); 1141 1142 return (TREE_ISIGN(ret)); 1143 } 1144 1145 /* 1146 * Utility function which retrieves copies of the current logs and 1147 * re-initializes them in the process. 
1148 */ 1149 void 1150 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 1151 { 1152 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 1153 1154 memcpy(last, &spa->spa_errlist_last, sizeof (avl_tree_t)); 1155 memcpy(scrub, &spa->spa_errlist_scrub, sizeof (avl_tree_t)); 1156 1157 avl_create(&spa->spa_errlist_scrub, 1158 spa_error_entry_compare, sizeof (spa_error_entry_t), 1159 offsetof(spa_error_entry_t, se_avl)); 1160 avl_create(&spa->spa_errlist_last, 1161 spa_error_entry_compare, sizeof (spa_error_entry_t), 1162 offsetof(spa_error_entry_t, se_avl)); 1163 } 1164 1165 static void 1166 spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) 1167 { 1168 const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; 1169 enum zti_modes mode = ztip->zti_mode; 1170 uint_t value = ztip->zti_value; 1171 uint_t count = ztip->zti_count; 1172 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1173 uint_t cpus, threads, flags = TASKQ_DYNAMIC; 1174 1175 switch (mode) { 1176 case ZTI_MODE_FIXED: 1177 ASSERT3U(value, >, 0); 1178 break; 1179 1180 case ZTI_MODE_SYNC: 1181 1182 /* 1183 * Create one wr_iss taskq for every 'zio_taskq_write_tpq' CPUs, 1184 * not to exceed the number of spa allocators, and align to it. 1185 */ 1186 threads = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100); 1187 count = MAX(1, threads / MAX(1, zio_taskq_write_tpq)); 1188 count = MAX(count, (zio_taskq_batch_pct + 99) / 100); 1189 count = MIN(count, spa->spa_alloc_count); 1190 while (spa->spa_alloc_count % count != 0 && 1191 spa->spa_alloc_count < count * 2) 1192 count--; 1193 1194 /* 1195 * zio_taskq_batch_pct is unbounded and may exceed 100%, but no 1196 * single taskq may have more threads than 100% of online cpus. 
1197 */ 1198 value = (zio_taskq_batch_pct + count / 2) / count; 1199 value = MIN(value, 100); 1200 flags |= TASKQ_THREADS_CPU_PCT; 1201 break; 1202 1203 case ZTI_MODE_SCALE: 1204 /* 1205 * We want more taskqs to reduce lock contention, but we want 1206 * less for better request ordering and CPU utilization. 1207 */ 1208 threads = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100); 1209 threads = MAX(threads, value); 1210 if (zio_taskq_batch_tpq > 0) { 1211 count = MAX(1, (threads + zio_taskq_batch_tpq / 2) / 1212 zio_taskq_batch_tpq); 1213 } else { 1214 /* 1215 * Prefer 6 threads per taskq, but no more taskqs 1216 * than threads in them on large systems. For 80%: 1217 * 1218 * taskq taskq total 1219 * cpus taskqs percent threads threads 1220 * ------- ------- ------- ------- ------- 1221 * 1 1 80% 1 1 1222 * 2 1 80% 1 1 1223 * 4 1 80% 3 3 1224 * 8 2 40% 3 6 1225 * 16 3 27% 4 12 1226 * 32 5 16% 5 25 1227 * 64 7 11% 7 49 1228 * 128 10 8% 10 100 1229 * 256 14 6% 15 210 1230 */ 1231 cpus = MIN(threads, boot_ncpus); 1232 count = 1 + threads / 6; 1233 while (count * count > cpus) 1234 count--; 1235 } 1236 1237 /* 1238 * Try to represent the number of threads per taskq as percent 1239 * of online CPUs to allow scaling with later online/offline. 1240 * Fall back to absolute numbers if can't. 
1241 */ 1242 value = (threads * 100 + boot_ncpus * count / 2) / 1243 (boot_ncpus * count); 1244 if (value < 5 || value > 100) 1245 value = MAX(1, (threads + count / 2) / count); 1246 else 1247 flags |= TASKQ_THREADS_CPU_PCT; 1248 break; 1249 1250 case ZTI_MODE_NULL: 1251 tqs->stqs_count = 0; 1252 tqs->stqs_taskq = NULL; 1253 return; 1254 1255 default: 1256 panic("unrecognized mode for %s_%s taskq (%u:%u) in " 1257 "spa_taskqs_init()", 1258 zio_type_name[t], zio_taskq_types[q], mode, value); 1259 break; 1260 } 1261 1262 ASSERT3U(count, >, 0); 1263 tqs->stqs_count = count; 1264 tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); 1265 1266 for (uint_t i = 0; i < count; i++) { 1267 taskq_t *tq; 1268 char name[32]; 1269 1270 if (count > 1) 1271 (void) snprintf(name, sizeof (name), "%s_%s_%u", 1272 zio_type_name[t], zio_taskq_types[q], i); 1273 else 1274 (void) snprintf(name, sizeof (name), "%s_%s", 1275 zio_type_name[t], zio_taskq_types[q]); 1276 1277 #ifdef HAVE_SYSDC 1278 if (zio_taskq_sysdc && spa->spa_proc != &p0) { 1279 (void) zio_taskq_basedc; 1280 tq = taskq_create_sysdc(name, value, 50, INT_MAX, 1281 spa->spa_proc, zio_taskq_basedc, flags); 1282 } else { 1283 #endif 1284 /* 1285 * The write issue taskq can be extremely CPU 1286 * intensive. Run it at slightly less important 1287 * priority than the other taskqs. 1288 */ 1289 const pri_t pri = (t == ZIO_TYPE_WRITE && 1290 q == ZIO_TASKQ_ISSUE) ? 
1291 wtqclsyspri : maxclsyspri; 1292 tq = taskq_create_proc(name, value, pri, 50, 1293 INT_MAX, spa->spa_proc, flags); 1294 #ifdef HAVE_SYSDC 1295 } 1296 #endif 1297 1298 tqs->stqs_taskq[i] = tq; 1299 } 1300 } 1301 1302 static void 1303 spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) 1304 { 1305 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1306 1307 if (tqs->stqs_taskq == NULL) { 1308 ASSERT0(tqs->stqs_count); 1309 return; 1310 } 1311 1312 for (uint_t i = 0; i < tqs->stqs_count; i++) { 1313 ASSERT3P(tqs->stqs_taskq[i], !=, NULL); 1314 taskq_destroy(tqs->stqs_taskq[i]); 1315 } 1316 1317 kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *)); 1318 tqs->stqs_taskq = NULL; 1319 } 1320 1321 #ifdef _KERNEL 1322 /* 1323 * The READ and WRITE rows of zio_taskqs are configurable at module load time 1324 * by setting zio_taskq_read or zio_taskq_write. 1325 * 1326 * Example (the defaults for READ and WRITE) 1327 * zio_taskq_read='fixed,1,8 null scale null' 1328 * zio_taskq_write='sync null scale null' 1329 * 1330 * Each sets the entire row at a time. 1331 * 1332 * 'fixed' is parameterised: fixed,Q,T where Q is number of taskqs, T is number 1333 * of threads per taskq. 1334 * 1335 * 'null' can only be set on the high-priority queues (queue selection for 1336 * high-priority queues will fall back to the regular queue if the high-pri 1337 * is NULL. 1338 */ 1339 static const char *const modes[ZTI_NMODES] = { 1340 "fixed", "scale", "sync", "null" 1341 }; 1342 1343 /* Parse the incoming config string. Modifies cfg */ 1344 static int 1345 spa_taskq_param_set(zio_type_t t, char *cfg) 1346 { 1347 int err = 0; 1348 1349 zio_taskq_info_t row[ZIO_TASKQ_TYPES] = {{0}}; 1350 1351 char *next = cfg, *tok, *c; 1352 1353 /* 1354 * Parse out each element from the string and fill `row`. The entire 1355 * row has to be set at once, so any errors are flagged by just 1356 * breaking out of this loop early. 
1357 */ 1358 uint_t q; 1359 for (q = 0; q < ZIO_TASKQ_TYPES; q++) { 1360 /* `next` is the start of the config */ 1361 if (next == NULL) 1362 break; 1363 1364 /* Eat up leading space */ 1365 while (isspace(*next)) 1366 next++; 1367 if (*next == '\0') 1368 break; 1369 1370 /* Mode ends at space or end of string */ 1371 tok = next; 1372 next = strchr(tok, ' '); 1373 if (next != NULL) *next++ = '\0'; 1374 1375 /* Parameters start after a comma */ 1376 c = strchr(tok, ','); 1377 if (c != NULL) *c++ = '\0'; 1378 1379 /* Match mode string */ 1380 uint_t mode; 1381 for (mode = 0; mode < ZTI_NMODES; mode++) 1382 if (strcmp(tok, modes[mode]) == 0) 1383 break; 1384 if (mode == ZTI_NMODES) 1385 break; 1386 1387 /* Invalid canary */ 1388 row[q].zti_mode = ZTI_NMODES; 1389 1390 /* Per-mode setup */ 1391 switch (mode) { 1392 1393 /* 1394 * FIXED is parameterised: number of queues, and number of 1395 * threads per queue. 1396 */ 1397 case ZTI_MODE_FIXED: { 1398 /* No parameters? */ 1399 if (c == NULL || *c == '\0') 1400 break; 1401 1402 /* Find next parameter */ 1403 tok = c; 1404 c = strchr(tok, ','); 1405 if (c == NULL) 1406 break; 1407 1408 /* Take digits and convert */ 1409 unsigned long long nq; 1410 if (!(isdigit(*tok))) 1411 break; 1412 err = ddi_strtoull(tok, &tok, 10, &nq); 1413 /* Must succeed and also end at the next param sep */ 1414 if (err != 0 || tok != c) 1415 break; 1416 1417 /* Move past the comma */ 1418 tok++; 1419 /* Need another number */ 1420 if (!(isdigit(*tok))) 1421 break; 1422 /* Remember start to make sure we moved */ 1423 c = tok; 1424 1425 /* Take digits */ 1426 unsigned long long ntpq; 1427 err = ddi_strtoull(tok, &tok, 10, &ntpq); 1428 /* Must succeed, and moved forward */ 1429 if (err != 0 || tok == c || *tok != '\0') 1430 break; 1431 1432 /* 1433 * sanity; zero queues/threads make no sense, and 1434 * 16K is almost certainly more than anyone will ever 1435 * need and avoids silly numbers like UINT32_MAX 1436 */ 1437 if (nq == 0 || nq >= 16384 || 
1438 ntpq == 0 || ntpq >= 16384) 1439 break; 1440 1441 const zio_taskq_info_t zti = ZTI_P(ntpq, nq); 1442 row[q] = zti; 1443 break; 1444 } 1445 1446 /* 1447 * SCALE is optionally parameterised by minimum number of 1448 * threads. 1449 */ 1450 case ZTI_MODE_SCALE: { 1451 unsigned long long mint = 0; 1452 if (c != NULL && *c != '\0') { 1453 /* Need a number */ 1454 if (!(isdigit(*c))) 1455 break; 1456 tok = c; 1457 1458 /* Take digits */ 1459 err = ddi_strtoull(tok, &tok, 10, &mint); 1460 /* Must succeed, and moved forward */ 1461 if (err != 0 || tok == c || *tok != '\0') 1462 break; 1463 1464 /* Sanity check */ 1465 if (mint >= 16384) 1466 break; 1467 } 1468 1469 const zio_taskq_info_t zti = ZTI_SCALE(mint); 1470 row[q] = zti; 1471 break; 1472 } 1473 1474 case ZTI_MODE_SYNC: { 1475 const zio_taskq_info_t zti = ZTI_SYNC; 1476 row[q] = zti; 1477 break; 1478 } 1479 1480 case ZTI_MODE_NULL: { 1481 /* 1482 * Can only null the high-priority queues; the general- 1483 * purpose ones have to exist. 1484 */ 1485 if (q != ZIO_TASKQ_ISSUE_HIGH && 1486 q != ZIO_TASKQ_INTERRUPT_HIGH) 1487 break; 1488 1489 const zio_taskq_info_t zti = ZTI_NULL; 1490 row[q] = zti; 1491 break; 1492 } 1493 1494 default: 1495 break; 1496 } 1497 1498 /* Ensure we set a mode */ 1499 if (row[q].zti_mode == ZTI_NMODES) 1500 break; 1501 } 1502 1503 /* Didn't get a full row, fail */ 1504 if (q < ZIO_TASKQ_TYPES) 1505 return (SET_ERROR(EINVAL)); 1506 1507 /* Eat trailing space */ 1508 if (next != NULL) 1509 while (isspace(*next)) 1510 next++; 1511 1512 /* If there's anything left over then fail */ 1513 if (next != NULL && *next != '\0') 1514 return (SET_ERROR(EINVAL)); 1515 1516 /* Success! 
Copy it into the real config */ 1517 for (q = 0; q < ZIO_TASKQ_TYPES; q++) 1518 zio_taskqs[t][q] = row[q]; 1519 1520 return (0); 1521 } 1522 1523 static int 1524 spa_taskq_param_get(zio_type_t t, char *buf, boolean_t add_newline) 1525 { 1526 int pos = 0; 1527 1528 /* Build paramater string from live config */ 1529 const char *sep = ""; 1530 for (uint_t q = 0; q < ZIO_TASKQ_TYPES; q++) { 1531 const zio_taskq_info_t *zti = &zio_taskqs[t][q]; 1532 if (zti->zti_mode == ZTI_MODE_FIXED) 1533 pos += sprintf(&buf[pos], "%s%s,%u,%u", sep, 1534 modes[zti->zti_mode], zti->zti_count, 1535 zti->zti_value); 1536 else if (zti->zti_mode == ZTI_MODE_SCALE && zti->zti_value > 0) 1537 pos += sprintf(&buf[pos], "%s%s,%u", sep, 1538 modes[zti->zti_mode], zti->zti_value); 1539 else 1540 pos += sprintf(&buf[pos], "%s%s", sep, 1541 modes[zti->zti_mode]); 1542 sep = " "; 1543 } 1544 1545 if (add_newline) 1546 buf[pos++] = '\n'; 1547 buf[pos] = '\0'; 1548 1549 return (pos); 1550 } 1551 1552 #ifdef __linux__ 1553 static int 1554 spa_taskq_read_param_set(const char *val, zfs_kernel_param_t *kp) 1555 { 1556 char *cfg = kmem_strdup(val); 1557 int err = spa_taskq_param_set(ZIO_TYPE_READ, cfg); 1558 kmem_strfree(cfg); 1559 return (-err); 1560 } 1561 1562 static int 1563 spa_taskq_read_param_get(char *buf, zfs_kernel_param_t *kp) 1564 { 1565 return (spa_taskq_param_get(ZIO_TYPE_READ, buf, TRUE)); 1566 } 1567 1568 static int 1569 spa_taskq_write_param_set(const char *val, zfs_kernel_param_t *kp) 1570 { 1571 char *cfg = kmem_strdup(val); 1572 int err = spa_taskq_param_set(ZIO_TYPE_WRITE, cfg); 1573 kmem_strfree(cfg); 1574 return (-err); 1575 } 1576 1577 static int 1578 spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp) 1579 { 1580 return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf, TRUE)); 1581 } 1582 1583 static int 1584 spa_taskq_free_param_set(const char *val, zfs_kernel_param_t *kp) 1585 { 1586 char *cfg = kmem_strdup(val); 1587 int err = spa_taskq_param_set(ZIO_TYPE_FREE, cfg); 1588 
kmem_strfree(cfg); 1589 return (-err); 1590 } 1591 1592 static int 1593 spa_taskq_free_param_get(char *buf, zfs_kernel_param_t *kp) 1594 { 1595 return (spa_taskq_param_get(ZIO_TYPE_FREE, buf, TRUE)); 1596 } 1597 #else 1598 /* 1599 * On FreeBSD load-time parameters can be set up before malloc() is available, 1600 * so we have to do all the parsing work on the stack. 1601 */ 1602 #define SPA_TASKQ_PARAM_MAX (128) 1603 1604 static int 1605 spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS) 1606 { 1607 char buf[SPA_TASKQ_PARAM_MAX]; 1608 int err; 1609 1610 (void) spa_taskq_param_get(ZIO_TYPE_READ, buf, FALSE); 1611 err = sysctl_handle_string(oidp, buf, sizeof (buf), req); 1612 if (err || req->newptr == NULL) 1613 return (err); 1614 return (spa_taskq_param_set(ZIO_TYPE_READ, buf)); 1615 } 1616 1617 static int 1618 spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS) 1619 { 1620 char buf[SPA_TASKQ_PARAM_MAX]; 1621 int err; 1622 1623 (void) spa_taskq_param_get(ZIO_TYPE_WRITE, buf, FALSE); 1624 err = sysctl_handle_string(oidp, buf, sizeof (buf), req); 1625 if (err || req->newptr == NULL) 1626 return (err); 1627 return (spa_taskq_param_set(ZIO_TYPE_WRITE, buf)); 1628 } 1629 1630 static int 1631 spa_taskq_free_param(ZFS_MODULE_PARAM_ARGS) 1632 { 1633 char buf[SPA_TASKQ_PARAM_MAX]; 1634 int err; 1635 1636 (void) spa_taskq_param_get(ZIO_TYPE_FREE, buf, FALSE); 1637 err = sysctl_handle_string(oidp, buf, sizeof (buf), req); 1638 if (err || req->newptr == NULL) 1639 return (err); 1640 return (spa_taskq_param_set(ZIO_TYPE_FREE, buf)); 1641 } 1642 #endif 1643 #endif /* _KERNEL */ 1644 1645 /* 1646 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. 1647 * Note that a type may have multiple discrete taskqs to avoid lock contention 1648 * on the taskq itself. 
1649 */ 1650 void 1651 spa_taskq_dispatch(spa_t *spa, zio_type_t t, zio_taskq_type_t q, 1652 task_func_t *func, zio_t *zio, boolean_t cutinline) 1653 { 1654 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1655 taskq_t *tq; 1656 1657 ASSERT3P(tqs->stqs_taskq, !=, NULL); 1658 ASSERT3U(tqs->stqs_count, !=, 0); 1659 1660 /* 1661 * NB: We are assuming that the zio can only be dispatched 1662 * to a single taskq at a time. It would be a grievous error 1663 * to dispatch the zio to another taskq at the same time. 1664 */ 1665 ASSERT(zio); 1666 ASSERT(taskq_empty_ent(&zio->io_tqent)); 1667 1668 if (tqs->stqs_count == 1) { 1669 tq = tqs->stqs_taskq[0]; 1670 } else if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) && 1671 ZIO_HAS_ALLOCATOR(zio)) { 1672 tq = tqs->stqs_taskq[zio->io_allocator % tqs->stqs_count]; 1673 } else { 1674 tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; 1675 } 1676 1677 taskq_dispatch_ent(tq, func, zio, cutinline ? TQ_FRONT : 0, 1678 &zio->io_tqent); 1679 } 1680 1681 static void 1682 spa_create_zio_taskqs(spa_t *spa) 1683 { 1684 for (int t = 0; t < ZIO_TYPES; t++) { 1685 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 1686 spa_taskqs_init(spa, t, q); 1687 } 1688 } 1689 } 1690 1691 #if defined(_KERNEL) && defined(HAVE_SPA_THREAD) 1692 static void 1693 spa_thread(void *arg) 1694 { 1695 psetid_t zio_taskq_psrset_bind = PS_NONE; 1696 callb_cpr_t cprinfo; 1697 1698 spa_t *spa = arg; 1699 user_t *pu = PTOU(curproc); 1700 1701 CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, 1702 spa->spa_name); 1703 1704 ASSERT(curproc != &p0); 1705 (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), 1706 "zpool-%s", spa->spa_name); 1707 (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); 1708 1709 /* bind this thread to the requested psrset */ 1710 if (zio_taskq_psrset_bind != PS_NONE) { 1711 pool_lock(); 1712 mutex_enter(&cpu_lock); 1713 mutex_enter(&pidlock); 1714 mutex_enter(&curproc->p_lock); 1715 1716 if 
(cpupart_bind_thread(curthread, zio_taskq_psrset_bind, 1717 0, NULL, NULL) == 0) { 1718 curthread->t_bind_pset = zio_taskq_psrset_bind; 1719 } else { 1720 cmn_err(CE_WARN, 1721 "Couldn't bind process for zfs pool \"%s\" to " 1722 "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); 1723 } 1724 1725 mutex_exit(&curproc->p_lock); 1726 mutex_exit(&pidlock); 1727 mutex_exit(&cpu_lock); 1728 pool_unlock(); 1729 } 1730 1731 #ifdef HAVE_SYSDC 1732 if (zio_taskq_sysdc) { 1733 sysdc_thread_enter(curthread, 100, 0); 1734 } 1735 #endif 1736 1737 spa->spa_proc = curproc; 1738 spa->spa_did = curthread->t_did; 1739 1740 spa_create_zio_taskqs(spa); 1741 1742 mutex_enter(&spa->spa_proc_lock); 1743 ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); 1744 1745 spa->spa_proc_state = SPA_PROC_ACTIVE; 1746 cv_broadcast(&spa->spa_proc_cv); 1747 1748 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1749 while (spa->spa_proc_state == SPA_PROC_ACTIVE) 1750 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1751 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); 1752 1753 ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); 1754 spa->spa_proc_state = SPA_PROC_GONE; 1755 spa->spa_proc = &p0; 1756 cv_broadcast(&spa->spa_proc_cv); 1757 CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ 1758 1759 mutex_enter(&curproc->p_lock); 1760 lwp_exit(); 1761 } 1762 #endif 1763 1764 extern metaslab_ops_t *metaslab_allocator(spa_t *spa); 1765 1766 /* 1767 * Activate an uninitialized pool. 
1768 */ 1769 static void 1770 spa_activate(spa_t *spa, spa_mode_t mode) 1771 { 1772 metaslab_ops_t *msp = metaslab_allocator(spa); 1773 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 1774 1775 spa->spa_state = POOL_STATE_ACTIVE; 1776 spa->spa_final_txg = UINT64_MAX; 1777 spa->spa_mode = mode; 1778 spa->spa_read_spacemaps = spa_mode_readable_spacemaps; 1779 1780 spa->spa_normal_class = metaslab_class_create(spa, "normal", 1781 msp, B_FALSE); 1782 spa->spa_log_class = metaslab_class_create(spa, "log", msp, B_TRUE); 1783 spa->spa_embedded_log_class = metaslab_class_create(spa, 1784 "embedded_log", msp, B_TRUE); 1785 spa->spa_special_class = metaslab_class_create(spa, "special", 1786 msp, B_FALSE); 1787 spa->spa_special_embedded_log_class = metaslab_class_create(spa, 1788 "special_embedded_log", msp, B_TRUE); 1789 spa->spa_dedup_class = metaslab_class_create(spa, "dedup", 1790 msp, B_FALSE); 1791 1792 /* Try to create a covering process */ 1793 mutex_enter(&spa->spa_proc_lock); 1794 ASSERT(spa->spa_proc_state == SPA_PROC_NONE); 1795 ASSERT(spa->spa_proc == &p0); 1796 spa->spa_did = 0; 1797 1798 #ifdef HAVE_SPA_THREAD 1799 /* Only create a process if we're going to be around a while. */ 1800 if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { 1801 if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, 1802 NULL, 0) == 0) { 1803 spa->spa_proc_state = SPA_PROC_CREATED; 1804 while (spa->spa_proc_state == SPA_PROC_CREATED) { 1805 cv_wait(&spa->spa_proc_cv, 1806 &spa->spa_proc_lock); 1807 } 1808 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1809 ASSERT(spa->spa_proc != &p0); 1810 ASSERT(spa->spa_did != 0); 1811 } else { 1812 #ifdef _KERNEL 1813 cmn_err(CE_WARN, 1814 "Couldn't create process for zfs pool \"%s\"\n", 1815 spa->spa_name); 1816 #endif 1817 } 1818 } 1819 #endif /* HAVE_SPA_THREAD */ 1820 mutex_exit(&spa->spa_proc_lock); 1821 1822 /* If we didn't create a process, we need to create our taskqs. 
*/ 1823 if (spa->spa_proc == &p0) { 1824 spa_create_zio_taskqs(spa); 1825 } 1826 1827 for (size_t i = 0; i < TXG_SIZE; i++) { 1828 spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, 1829 ZIO_FLAG_CANFAIL); 1830 } 1831 1832 list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), 1833 offsetof(vdev_t, vdev_config_dirty_node)); 1834 list_create(&spa->spa_evicting_os_list, sizeof (objset_t), 1835 offsetof(objset_t, os_evicting_node)); 1836 list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), 1837 offsetof(vdev_t, vdev_state_dirty_node)); 1838 1839 txg_list_create(&spa->spa_vdev_txg_list, spa, 1840 offsetof(struct vdev, vdev_txg_node)); 1841 1842 avl_create(&spa->spa_errlist_scrub, 1843 spa_error_entry_compare, sizeof (spa_error_entry_t), 1844 offsetof(spa_error_entry_t, se_avl)); 1845 avl_create(&spa->spa_errlist_last, 1846 spa_error_entry_compare, sizeof (spa_error_entry_t), 1847 offsetof(spa_error_entry_t, se_avl)); 1848 avl_create(&spa->spa_errlist_healed, 1849 spa_error_entry_compare, sizeof (spa_error_entry_t), 1850 offsetof(spa_error_entry_t, se_avl)); 1851 1852 spa_activate_os(spa); 1853 1854 spa_keystore_init(&spa->spa_keystore); 1855 1856 /* 1857 * This taskq is used to perform zvol-minor-related tasks 1858 * asynchronously. This has several advantages, including easy 1859 * resolution of various deadlocks. 1860 * 1861 * The taskq must be single threaded to ensure tasks are always 1862 * processed in the order in which they were dispatched. 1863 * 1864 * A taskq per pool allows one to keep the pools independent. 1865 * This way if one pool is suspended, it will not impact another. 1866 * 1867 * The preferred location to dispatch a zvol minor task is a sync 1868 * task. In this context, there is easy access to the spa_t and minimal 1869 * error handling is required because the sync task must succeed. 1870 */ 1871 spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri, 1872 1, INT_MAX, 0); 1873 1874 /* 1875 * The taskq to preload metaslabs. 
1876 */ 1877 spa->spa_metaslab_taskq = taskq_create("z_metaslab", 1878 metaslab_preload_pct, maxclsyspri, 1, INT_MAX, 1879 TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); 1880 1881 /* 1882 * Taskq dedicated to prefetcher threads: this is used to prevent the 1883 * pool traverse code from monopolizing the global (and limited) 1884 * system_taskq by inappropriately scheduling long running tasks on it. 1885 */ 1886 spa->spa_prefetch_taskq = taskq_create("z_prefetch", 100, 1887 defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); 1888 1889 /* 1890 * The taskq to upgrade datasets in this pool. Currently used by 1891 * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA. 1892 */ 1893 spa->spa_upgrade_taskq = taskq_create("z_upgrade", 100, 1894 defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); 1895 } 1896 1897 /* 1898 * Opposite of spa_activate(). 1899 */ 1900 static void 1901 spa_deactivate(spa_t *spa) 1902 { 1903 ASSERT(spa->spa_sync_on == B_FALSE); 1904 ASSERT0P(spa->spa_dsl_pool); 1905 ASSERT0P(spa->spa_root_vdev); 1906 ASSERT0P(spa->spa_async_zio_root); 1907 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 1908 1909 spa_evicting_os_wait(spa); 1910 1911 if (spa->spa_zvol_taskq) { 1912 taskq_destroy(spa->spa_zvol_taskq); 1913 spa->spa_zvol_taskq = NULL; 1914 } 1915 1916 if (spa->spa_metaslab_taskq) { 1917 taskq_destroy(spa->spa_metaslab_taskq); 1918 spa->spa_metaslab_taskq = NULL; 1919 } 1920 1921 if (spa->spa_prefetch_taskq) { 1922 taskq_destroy(spa->spa_prefetch_taskq); 1923 spa->spa_prefetch_taskq = NULL; 1924 } 1925 1926 if (spa->spa_upgrade_taskq) { 1927 taskq_destroy(spa->spa_upgrade_taskq); 1928 spa->spa_upgrade_taskq = NULL; 1929 } 1930 1931 txg_list_destroy(&spa->spa_vdev_txg_list); 1932 1933 list_destroy(&spa->spa_config_dirty_list); 1934 list_destroy(&spa->spa_evicting_os_list); 1935 list_destroy(&spa->spa_state_dirty_list); 1936 1937 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); 1938 1939 for (int t = 0; t < 
ZIO_TYPES; t++) { 1940 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 1941 spa_taskqs_fini(spa, t, q); 1942 } 1943 } 1944 1945 for (size_t i = 0; i < TXG_SIZE; i++) { 1946 ASSERT3P(spa->spa_txg_zio[i], !=, NULL); 1947 VERIFY0(zio_wait(spa->spa_txg_zio[i])); 1948 spa->spa_txg_zio[i] = NULL; 1949 } 1950 1951 metaslab_class_destroy(spa->spa_normal_class); 1952 spa->spa_normal_class = NULL; 1953 1954 metaslab_class_destroy(spa->spa_log_class); 1955 spa->spa_log_class = NULL; 1956 1957 metaslab_class_destroy(spa->spa_embedded_log_class); 1958 spa->spa_embedded_log_class = NULL; 1959 1960 metaslab_class_destroy(spa->spa_special_class); 1961 spa->spa_special_class = NULL; 1962 1963 metaslab_class_destroy(spa->spa_special_embedded_log_class); 1964 spa->spa_special_embedded_log_class = NULL; 1965 1966 metaslab_class_destroy(spa->spa_dedup_class); 1967 spa->spa_dedup_class = NULL; 1968 1969 /* 1970 * If this was part of an import or the open otherwise failed, we may 1971 * still have errors left in the queues. Empty them just in case. 
1972 */ 1973 spa_errlog_drain(spa); 1974 avl_destroy(&spa->spa_errlist_scrub); 1975 avl_destroy(&spa->spa_errlist_last); 1976 avl_destroy(&spa->spa_errlist_healed); 1977 1978 spa_keystore_fini(&spa->spa_keystore); 1979 1980 spa->spa_state = POOL_STATE_UNINITIALIZED; 1981 1982 mutex_enter(&spa->spa_proc_lock); 1983 if (spa->spa_proc_state != SPA_PROC_NONE) { 1984 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1985 spa->spa_proc_state = SPA_PROC_DEACTIVATE; 1986 cv_broadcast(&spa->spa_proc_cv); 1987 while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { 1988 ASSERT(spa->spa_proc != &p0); 1989 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1990 } 1991 ASSERT(spa->spa_proc_state == SPA_PROC_GONE); 1992 spa->spa_proc_state = SPA_PROC_NONE; 1993 } 1994 ASSERT(spa->spa_proc == &p0); 1995 mutex_exit(&spa->spa_proc_lock); 1996 1997 /* 1998 * We want to make sure spa_thread() has actually exited the ZFS 1999 * module, so that the module can't be unloaded out from underneath 2000 * it. 2001 */ 2002 if (spa->spa_did != 0) { 2003 thread_join(spa->spa_did); 2004 spa->spa_did = 0; 2005 } 2006 2007 spa_deactivate_os(spa); 2008 2009 } 2010 2011 /* 2012 * Verify a pool configuration, and construct the vdev tree appropriately. This 2013 * will create all the necessary vdevs in the appropriate layout, with each vdev 2014 * in the CLOSED state. This will prep the pool before open/creation/import. 2015 * All vdev validation is done by the vdev_alloc() routine. 
2016 */ 2017 int 2018 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 2019 uint_t id, int atype) 2020 { 2021 nvlist_t **child; 2022 uint_t children; 2023 int error; 2024 2025 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 2026 return (error); 2027 2028 if ((*vdp)->vdev_ops->vdev_op_leaf) 2029 return (0); 2030 2031 error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 2032 &child, &children); 2033 2034 if (error == ENOENT) 2035 return (0); 2036 2037 if (error) { 2038 vdev_free(*vdp); 2039 *vdp = NULL; 2040 return (SET_ERROR(EINVAL)); 2041 } 2042 2043 for (int c = 0; c < children; c++) { 2044 vdev_t *vd; 2045 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 2046 atype)) != 0) { 2047 vdev_free(*vdp); 2048 *vdp = NULL; 2049 return (error); 2050 } 2051 } 2052 2053 ASSERT(*vdp != NULL); 2054 2055 return (0); 2056 } 2057 2058 static boolean_t 2059 spa_should_flush_logs_on_unload(spa_t *spa) 2060 { 2061 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) 2062 return (B_FALSE); 2063 2064 if (!spa_writeable(spa)) 2065 return (B_FALSE); 2066 2067 if (!spa->spa_sync_on) 2068 return (B_FALSE); 2069 2070 if (spa_state(spa) != POOL_STATE_EXPORTED) 2071 return (B_FALSE); 2072 2073 if (zfs_keep_log_spacemaps_at_export) 2074 return (B_FALSE); 2075 2076 return (B_TRUE); 2077 } 2078 2079 /* 2080 * Opens a transaction that will set the flag that will instruct 2081 * spa_sync to attempt to flush all the metaslabs for that txg. 
2082 */ 2083 static void 2084 spa_unload_log_sm_flush_all(spa_t *spa) 2085 { 2086 dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 2087 VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT | DMU_TX_SUSPEND)); 2088 2089 ASSERT0(spa->spa_log_flushall_txg); 2090 spa->spa_log_flushall_txg = dmu_tx_get_txg(tx); 2091 2092 dmu_tx_commit(tx); 2093 txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg); 2094 } 2095 2096 static void 2097 spa_unload_log_sm_metadata(spa_t *spa) 2098 { 2099 void *cookie = NULL; 2100 spa_log_sm_t *sls; 2101 log_summary_entry_t *e; 2102 2103 while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg, 2104 &cookie)) != NULL) { 2105 VERIFY0(sls->sls_mscount); 2106 kmem_free(sls, sizeof (spa_log_sm_t)); 2107 } 2108 2109 while ((e = list_remove_head(&spa->spa_log_summary)) != NULL) { 2110 VERIFY0(e->lse_mscount); 2111 kmem_free(e, sizeof (log_summary_entry_t)); 2112 } 2113 2114 spa->spa_unflushed_stats.sus_nblocks = 0; 2115 spa->spa_unflushed_stats.sus_memused = 0; 2116 spa->spa_unflushed_stats.sus_blocklimit = 0; 2117 } 2118 2119 static void 2120 spa_destroy_aux_threads(spa_t *spa) 2121 { 2122 if (spa->spa_condense_zthr != NULL) { 2123 zthr_destroy(spa->spa_condense_zthr); 2124 spa->spa_condense_zthr = NULL; 2125 } 2126 if (spa->spa_checkpoint_discard_zthr != NULL) { 2127 zthr_destroy(spa->spa_checkpoint_discard_zthr); 2128 spa->spa_checkpoint_discard_zthr = NULL; 2129 } 2130 if (spa->spa_livelist_delete_zthr != NULL) { 2131 zthr_destroy(spa->spa_livelist_delete_zthr); 2132 spa->spa_livelist_delete_zthr = NULL; 2133 } 2134 if (spa->spa_livelist_condense_zthr != NULL) { 2135 zthr_destroy(spa->spa_livelist_condense_zthr); 2136 spa->spa_livelist_condense_zthr = NULL; 2137 } 2138 if (spa->spa_raidz_expand_zthr != NULL) { 2139 zthr_destroy(spa->spa_raidz_expand_zthr); 2140 spa->spa_raidz_expand_zthr = NULL; 2141 } 2142 } 2143 2144 static void 2145 spa_sync_time_logger(spa_t *spa, uint64_t txg) 2146 { 2147 uint64_t curtime; 2148 dmu_tx_t *tx; 2149 2150 
if (!spa_writeable(spa)) { 2151 return; 2152 } 2153 curtime = gethrestime_sec(); 2154 if (curtime < spa->spa_last_noted_txg_time + spa_note_txg_time) { 2155 return; 2156 } 2157 2158 if (txg > spa->spa_last_noted_txg) { 2159 spa->spa_last_noted_txg_time = curtime; 2160 spa->spa_last_noted_txg = txg; 2161 2162 mutex_enter(&spa->spa_txg_log_time_lock); 2163 dbrrd_add(&spa->spa_txg_log_time, curtime, txg); 2164 mutex_exit(&spa->spa_txg_log_time_lock); 2165 } 2166 2167 if (curtime < spa->spa_last_flush_txg_time + spa_flush_txg_time) { 2168 return; 2169 } 2170 spa->spa_last_flush_txg_time = curtime; 2171 2172 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 2173 2174 VERIFY0(zap_update(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, 2175 DMU_POOL_TXG_LOG_TIME_MINUTES, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM, 2176 &spa->spa_txg_log_time.dbr_minutes, tx)); 2177 VERIFY0(zap_update(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, 2178 DMU_POOL_TXG_LOG_TIME_DAYS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM, 2179 &spa->spa_txg_log_time.dbr_days, tx)); 2180 VERIFY0(zap_update(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, 2181 DMU_POOL_TXG_LOG_TIME_MONTHS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM, 2182 &spa->spa_txg_log_time.dbr_months, tx)); 2183 dmu_tx_commit(tx); 2184 } 2185 2186 static void 2187 spa_unload_sync_time_logger(spa_t *spa) 2188 { 2189 uint64_t txg; 2190 dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 2191 VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT)); 2192 2193 txg = dmu_tx_get_txg(tx); 2194 spa->spa_last_noted_txg_time = 0; 2195 spa->spa_last_flush_txg_time = 0; 2196 spa_sync_time_logger(spa, txg); 2197 2198 dmu_tx_commit(tx); 2199 } 2200 2201 static void 2202 spa_load_txg_log_time(spa_t *spa) 2203 { 2204 int error; 2205 2206 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2207 DMU_POOL_TXG_LOG_TIME_MINUTES, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM, 2208 &spa->spa_txg_log_time.dbr_minutes); 2209 if (error != 0 && error != ENOENT) { 2210 spa_load_note(spa, "unable 
to load a txg time database with " 2211 "minute resolution [error=%d]", error); 2212 } 2213 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2214 DMU_POOL_TXG_LOG_TIME_DAYS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM, 2215 &spa->spa_txg_log_time.dbr_days); 2216 if (error != 0 && error != ENOENT) { 2217 spa_load_note(spa, "unable to load a txg time database with " 2218 "day resolution [error=%d]", error); 2219 } 2220 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2221 DMU_POOL_TXG_LOG_TIME_MONTHS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM, 2222 &spa->spa_txg_log_time.dbr_months); 2223 if (error != 0 && error != ENOENT) { 2224 spa_load_note(spa, "unable to load a txg time database with " 2225 "month resolution [error=%d]", error); 2226 } 2227 } 2228 2229 static boolean_t 2230 spa_should_sync_time_logger_on_unload(spa_t *spa) 2231 { 2232 2233 if (!spa_writeable(spa)) 2234 return (B_FALSE); 2235 2236 if (!spa->spa_sync_on) 2237 return (B_FALSE); 2238 2239 if (spa_state(spa) != POOL_STATE_EXPORTED) 2240 return (B_FALSE); 2241 2242 if (spa->spa_last_noted_txg == 0) 2243 return (B_FALSE); 2244 2245 return (B_TRUE); 2246 } 2247 2248 2249 /* 2250 * Opposite of spa_load(). 2251 */ 2252 static void 2253 spa_unload(spa_t *spa) 2254 { 2255 ASSERT(spa_namespace_held() || 2256 spa->spa_export_thread == curthread); 2257 ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED); 2258 2259 spa_import_progress_remove(spa_guid(spa)); 2260 spa_load_note(spa, "UNLOADING"); 2261 2262 spa_wake_waiters(spa); 2263 2264 /* 2265 * If we have set the spa_final_txg, we have already performed the 2266 * tasks below in spa_export_common(). We should not redo it here since 2267 * we delay the final TXGs beyond what spa_final_txg is set at. 
2268 */ 2269 if (spa->spa_final_txg == UINT64_MAX) { 2270 if (spa_should_sync_time_logger_on_unload(spa)) 2271 spa_unload_sync_time_logger(spa); 2272 2273 /* 2274 * If the log space map feature is enabled and the pool is 2275 * getting exported (but not destroyed), we want to spend some 2276 * time flushing as many metaslabs as we can in an attempt to 2277 * destroy log space maps and save import time. 2278 */ 2279 if (spa_should_flush_logs_on_unload(spa)) 2280 spa_unload_log_sm_flush_all(spa); 2281 2282 /* 2283 * Stop async tasks. 2284 */ 2285 spa_async_suspend(spa); 2286 2287 if (spa->spa_root_vdev) { 2288 vdev_t *root_vdev = spa->spa_root_vdev; 2289 vdev_initialize_stop_all(root_vdev, 2290 VDEV_INITIALIZE_ACTIVE); 2291 vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE); 2292 vdev_autotrim_stop_all(spa); 2293 vdev_rebuild_stop_all(spa); 2294 l2arc_spa_rebuild_stop(spa); 2295 } 2296 2297 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2298 spa->spa_final_txg = spa_last_synced_txg(spa) + 2299 TXG_DEFER_SIZE + 1; 2300 spa_config_exit(spa, SCL_ALL, FTAG); 2301 } 2302 2303 /* 2304 * Stop syncing. 2305 */ 2306 if (spa->spa_sync_on) { 2307 txg_sync_stop(spa->spa_dsl_pool); 2308 spa->spa_sync_on = B_FALSE; 2309 } 2310 2311 /* 2312 * This ensures that there is no async metaslab prefetching 2313 * while we attempt to unload the spa. 2314 */ 2315 taskq_wait(spa->spa_metaslab_taskq); 2316 2317 if (spa->spa_mmp.mmp_thread) 2318 mmp_thread_stop(spa); 2319 2320 /* 2321 * Wait for any outstanding async I/O to complete. 
2322 */ 2323 if (spa->spa_async_zio_root != NULL) { 2324 for (int i = 0; i < max_ncpus; i++) 2325 (void) zio_wait(spa->spa_async_zio_root[i]); 2326 kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *)); 2327 spa->spa_async_zio_root = NULL; 2328 } 2329 2330 if (spa->spa_vdev_removal != NULL) { 2331 spa_vdev_removal_destroy(spa->spa_vdev_removal); 2332 spa->spa_vdev_removal = NULL; 2333 } 2334 2335 spa_destroy_aux_threads(spa); 2336 2337 spa_condense_fini(spa); 2338 2339 bpobj_close(&spa->spa_deferred_bpobj); 2340 2341 spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); 2342 2343 /* 2344 * Close all vdevs. 2345 */ 2346 if (spa->spa_root_vdev) 2347 vdev_free(spa->spa_root_vdev); 2348 ASSERT0P(spa->spa_root_vdev); 2349 2350 /* 2351 * Close the dsl pool. 2352 */ 2353 if (spa->spa_dsl_pool) { 2354 dsl_pool_close(spa->spa_dsl_pool); 2355 spa->spa_dsl_pool = NULL; 2356 spa->spa_meta_objset = NULL; 2357 } 2358 2359 ddt_unload(spa); 2360 brt_unload(spa); 2361 spa_unload_log_sm_metadata(spa); 2362 2363 /* 2364 * Drop and purge level 2 cache 2365 */ 2366 spa_l2cache_drop(spa); 2367 2368 if (spa->spa_spares.sav_vdevs) { 2369 for (int i = 0; i < spa->spa_spares.sav_count; i++) 2370 vdev_free(spa->spa_spares.sav_vdevs[i]); 2371 kmem_free(spa->spa_spares.sav_vdevs, 2372 spa->spa_spares.sav_count * sizeof (void *)); 2373 spa->spa_spares.sav_vdevs = NULL; 2374 } 2375 if (spa->spa_spares.sav_config) { 2376 nvlist_free(spa->spa_spares.sav_config); 2377 spa->spa_spares.sav_config = NULL; 2378 } 2379 spa->spa_spares.sav_count = 0; 2380 2381 if (spa->spa_l2cache.sav_vdevs) { 2382 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { 2383 vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); 2384 vdev_free(spa->spa_l2cache.sav_vdevs[i]); 2385 } 2386 kmem_free(spa->spa_l2cache.sav_vdevs, 2387 spa->spa_l2cache.sav_count * sizeof (void *)); 2388 spa->spa_l2cache.sav_vdevs = NULL; 2389 } 2390 if (spa->spa_l2cache.sav_config) { 2391 nvlist_free(spa->spa_l2cache.sav_config); 2392 
spa->spa_l2cache.sav_config = NULL; 2393 } 2394 spa->spa_l2cache.sav_count = 0; 2395 2396 spa->spa_async_suspended = 0; 2397 2398 spa->spa_indirect_vdevs_loaded = B_FALSE; 2399 2400 if (spa->spa_comment != NULL) { 2401 spa_strfree(spa->spa_comment); 2402 spa->spa_comment = NULL; 2403 } 2404 if (spa->spa_compatibility != NULL) { 2405 spa_strfree(spa->spa_compatibility); 2406 spa->spa_compatibility = NULL; 2407 } 2408 2409 spa->spa_raidz_expand = NULL; 2410 spa->spa_checkpoint_txg = 0; 2411 2412 spa_config_exit(spa, SCL_ALL, spa); 2413 } 2414 2415 /* 2416 * Load (or re-load) the current list of vdevs describing the active spares for 2417 * this pool. When this is called, we have some form of basic information in 2418 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and 2419 * then re-generate a more complete list including status information. 2420 */ 2421 void 2422 spa_load_spares(spa_t *spa) 2423 { 2424 nvlist_t **spares; 2425 uint_t nspares; 2426 int i; 2427 vdev_t *vd, *tvd; 2428 2429 #ifndef _KERNEL 2430 /* 2431 * zdb opens both the current state of the pool and the 2432 * checkpointed state (if present), with a different spa_t. 2433 * 2434 * As spare vdevs are shared among open pools, we skip loading 2435 * them when we load the checkpointed state of the pool. 2436 */ 2437 if (!spa_writeable(spa)) 2438 return; 2439 #endif 2440 2441 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 2442 2443 /* 2444 * First, close and free any existing spare vdevs. 
2445 */ 2446 if (spa->spa_spares.sav_vdevs) { 2447 for (i = 0; i < spa->spa_spares.sav_count; i++) { 2448 vd = spa->spa_spares.sav_vdevs[i]; 2449 2450 /* Undo the call to spa_activate() below */ 2451 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 2452 B_FALSE)) != NULL && tvd->vdev_isspare) 2453 spa_spare_remove(tvd); 2454 vdev_close(vd); 2455 vdev_free(vd); 2456 } 2457 2458 kmem_free(spa->spa_spares.sav_vdevs, 2459 spa->spa_spares.sav_count * sizeof (void *)); 2460 } 2461 2462 if (spa->spa_spares.sav_config == NULL) 2463 nspares = 0; 2464 else 2465 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 2466 ZPOOL_CONFIG_SPARES, &spares, &nspares)); 2467 2468 spa->spa_spares.sav_count = (int)nspares; 2469 spa->spa_spares.sav_vdevs = NULL; 2470 2471 if (nspares == 0) 2472 return; 2473 2474 /* 2475 * Construct the array of vdevs, opening them to get status in the 2476 * process. For each spare, there is potentially two different vdev_t 2477 * structures associated with it: one in the list of spares (used only 2478 * for basic validation purposes) and one in the active vdev 2479 * configuration (if it's spared in). During this phase we open and 2480 * validate each vdev on the spare list. If the vdev also exists in the 2481 * active configuration, then we also mark this vdev as an active spare. 2482 */ 2483 spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *), 2484 KM_SLEEP); 2485 for (i = 0; i < spa->spa_spares.sav_count; i++) { 2486 VERIFY0(spa_config_parse(spa, &vd, spares[i], NULL, 0, 2487 VDEV_ALLOC_SPARE)); 2488 ASSERT(vd != NULL); 2489 2490 spa->spa_spares.sav_vdevs[i] = vd; 2491 2492 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 2493 B_FALSE)) != NULL) { 2494 if (!tvd->vdev_isspare) 2495 spa_spare_add(tvd); 2496 2497 /* 2498 * We only mark the spare active if we were successfully 2499 * able to load the vdev. 
Otherwise, importing a pool 2500 * with a bad active spare would result in strange 2501 * behavior, because multiple pool would think the spare 2502 * is actively in use. 2503 * 2504 * There is a vulnerability here to an equally bizarre 2505 * circumstance, where a dead active spare is later 2506 * brought back to life (onlined or otherwise). Given 2507 * the rarity of this scenario, and the extra complexity 2508 * it adds, we ignore the possibility. 2509 */ 2510 if (!vdev_is_dead(tvd)) 2511 spa_spare_activate(tvd); 2512 } 2513 2514 vd->vdev_top = vd; 2515 vd->vdev_aux = &spa->spa_spares; 2516 2517 if (vdev_open(vd) != 0) 2518 continue; 2519 2520 if (vdev_validate_aux(vd) == 0) 2521 spa_spare_add(vd); 2522 } 2523 2524 /* 2525 * Recompute the stashed list of spares, with status information 2526 * this time. 2527 */ 2528 fnvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES); 2529 2530 spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), 2531 KM_SLEEP); 2532 for (i = 0; i < spa->spa_spares.sav_count; i++) 2533 spares[i] = vdev_config_generate(spa, 2534 spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); 2535 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 2536 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, 2537 spa->spa_spares.sav_count); 2538 for (i = 0; i < spa->spa_spares.sav_count; i++) 2539 nvlist_free(spares[i]); 2540 kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); 2541 } 2542 2543 /* 2544 * Load (or re-load) the current list of vdevs describing the active l2cache for 2545 * this pool. When this is called, we have some form of basic information in 2546 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and 2547 * then re-generate a more complete list including status information. 2548 * Devices which are already active have their details maintained, and are 2549 * not re-opened. 
2550 */ 2551 void 2552 spa_load_l2cache(spa_t *spa) 2553 { 2554 nvlist_t **l2cache = NULL; 2555 uint_t nl2cache; 2556 int i, j, oldnvdevs; 2557 uint64_t guid; 2558 vdev_t *vd, **oldvdevs, **newvdevs; 2559 spa_aux_vdev_t *sav = &spa->spa_l2cache; 2560 2561 #ifndef _KERNEL 2562 /* 2563 * zdb opens both the current state of the pool and the 2564 * checkpointed state (if present), with a different spa_t. 2565 * 2566 * As L2 caches are part of the ARC which is shared among open 2567 * pools, we skip loading them when we load the checkpointed 2568 * state of the pool. 2569 */ 2570 if (!spa_writeable(spa)) 2571 return; 2572 #endif 2573 2574 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 2575 2576 oldvdevs = sav->sav_vdevs; 2577 oldnvdevs = sav->sav_count; 2578 sav->sav_vdevs = NULL; 2579 sav->sav_count = 0; 2580 2581 if (sav->sav_config == NULL) { 2582 nl2cache = 0; 2583 newvdevs = NULL; 2584 goto out; 2585 } 2586 2587 VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, 2588 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); 2589 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 2590 2591 /* 2592 * Process new nvlist of vdevs. 2593 */ 2594 for (i = 0; i < nl2cache; i++) { 2595 guid = fnvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID); 2596 2597 newvdevs[i] = NULL; 2598 for (j = 0; j < oldnvdevs; j++) { 2599 vd = oldvdevs[j]; 2600 if (vd != NULL && guid == vd->vdev_guid) { 2601 /* 2602 * Retain previous vdev for add/remove ops. 2603 */ 2604 newvdevs[i] = vd; 2605 oldvdevs[j] = NULL; 2606 break; 2607 } 2608 } 2609 2610 if (newvdevs[i] == NULL) { 2611 /* 2612 * Create new vdev 2613 */ 2614 VERIFY0(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 2615 VDEV_ALLOC_L2CACHE)); 2616 ASSERT(vd != NULL); 2617 newvdevs[i] = vd; 2618 2619 /* 2620 * Commit this vdev as an l2cache device, 2621 * even if it fails to open. 
2622 */ 2623 spa_l2cache_add(vd); 2624 2625 vd->vdev_top = vd; 2626 vd->vdev_aux = sav; 2627 2628 spa_l2cache_activate(vd); 2629 2630 if (vdev_open(vd) != 0) 2631 continue; 2632 2633 (void) vdev_validate_aux(vd); 2634 2635 if (!vdev_is_dead(vd)) 2636 l2arc_add_vdev(spa, vd); 2637 2638 /* 2639 * Upon cache device addition to a pool or pool 2640 * creation with a cache device or if the header 2641 * of the device is invalid we issue an async 2642 * TRIM command for the whole device which will 2643 * execute if l2arc_trim_ahead > 0. 2644 */ 2645 spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); 2646 } 2647 } 2648 2649 sav->sav_vdevs = newvdevs; 2650 sav->sav_count = (int)nl2cache; 2651 2652 /* 2653 * Recompute the stashed list of l2cache devices, with status 2654 * information this time. 2655 */ 2656 fnvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE); 2657 2658 if (sav->sav_count > 0) 2659 l2cache = kmem_alloc(sav->sav_count * sizeof (void *), 2660 KM_SLEEP); 2661 for (i = 0; i < sav->sav_count; i++) 2662 l2cache[i] = vdev_config_generate(spa, 2663 sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); 2664 fnvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 2665 (const nvlist_t * const *)l2cache, sav->sav_count); 2666 2667 out: 2668 /* 2669 * Purge vdevs that were dropped 2670 */ 2671 if (oldvdevs) { 2672 for (i = 0; i < oldnvdevs; i++) { 2673 uint64_t pool; 2674 2675 vd = oldvdevs[i]; 2676 if (vd != NULL) { 2677 ASSERT(vd->vdev_isl2cache); 2678 2679 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 2680 pool != 0ULL && l2arc_vdev_present(vd)) 2681 l2arc_remove_vdev(vd); 2682 vdev_clear_stats(vd); 2683 vdev_free(vd); 2684 } 2685 } 2686 2687 kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 2688 } 2689 2690 for (i = 0; i < sav->sav_count; i++) 2691 nvlist_free(l2cache[i]); 2692 if (sav->sav_count) 2693 kmem_free(l2cache, sav->sav_count * sizeof (void *)); 2694 } 2695 2696 static int 2697 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 2698 { 2699 
dmu_buf_t *db; 2700 char *packed = NULL; 2701 size_t nvsize = 0; 2702 int error; 2703 *value = NULL; 2704 2705 error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db); 2706 if (error) 2707 return (error); 2708 2709 nvsize = *(uint64_t *)db->db_data; 2710 dmu_buf_rele(db, FTAG); 2711 2712 packed = vmem_alloc(nvsize, KM_SLEEP); 2713 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, 2714 DMU_READ_PREFETCH); 2715 if (error == 0) 2716 error = nvlist_unpack(packed, nvsize, value, 0); 2717 vmem_free(packed, nvsize); 2718 2719 return (error); 2720 } 2721 2722 /* 2723 * Concrete top-level vdevs that are not missing and are not logs. At every 2724 * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds. 2725 */ 2726 static uint64_t 2727 spa_healthy_core_tvds(spa_t *spa) 2728 { 2729 vdev_t *rvd = spa->spa_root_vdev; 2730 uint64_t tvds = 0; 2731 2732 for (uint64_t i = 0; i < rvd->vdev_children; i++) { 2733 vdev_t *vd = rvd->vdev_child[i]; 2734 if (vd->vdev_islog) 2735 continue; 2736 if (vdev_is_concrete(vd) && !vdev_is_dead(vd)) 2737 tvds++; 2738 } 2739 2740 return (tvds); 2741 } 2742 2743 /* 2744 * Checks to see if the given vdev could not be opened, in which case we post a 2745 * sysevent to notify the autoreplace code that the device has been removed. 2746 */ 2747 static void 2748 spa_check_removed(vdev_t *vd) 2749 { 2750 for (uint64_t c = 0; c < vd->vdev_children; c++) 2751 spa_check_removed(vd->vdev_child[c]); 2752 2753 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && 2754 vdev_is_concrete(vd)) { 2755 zfs_post_autoreplace(vd->vdev_spa, vd); 2756 spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK); 2757 } 2758 } 2759 2760 static int 2761 spa_check_for_missing_logs(spa_t *spa) 2762 { 2763 vdev_t *rvd = spa->spa_root_vdev; 2764 2765 /* 2766 * If we're doing a normal import, then build up any additional 2767 * diagnostic information about missing log devices. 2768 * We'll pass this up to the user for further processing. 
2769 */ 2770 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { 2771 nvlist_t **child, *nv; 2772 uint64_t idx = 0; 2773 2774 child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t *), 2775 KM_SLEEP); 2776 nv = fnvlist_alloc(); 2777 2778 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 2779 vdev_t *tvd = rvd->vdev_child[c]; 2780 2781 /* 2782 * We consider a device as missing only if it failed 2783 * to open (i.e. offline or faulted is not considered 2784 * as missing). 2785 */ 2786 if (tvd->vdev_islog && 2787 tvd->vdev_state == VDEV_STATE_CANT_OPEN) { 2788 child[idx++] = vdev_config_generate(spa, tvd, 2789 B_FALSE, VDEV_CONFIG_MISSING); 2790 } 2791 } 2792 2793 if (idx > 0) { 2794 fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 2795 (const nvlist_t * const *)child, idx); 2796 fnvlist_add_nvlist(spa->spa_load_info, 2797 ZPOOL_CONFIG_MISSING_DEVICES, nv); 2798 2799 for (uint64_t i = 0; i < idx; i++) 2800 nvlist_free(child[i]); 2801 } 2802 nvlist_free(nv); 2803 kmem_free(child, rvd->vdev_children * sizeof (char **)); 2804 2805 if (idx > 0) { 2806 spa_load_failed(spa, "some log devices are missing"); 2807 vdev_dbgmsg_print_tree(rvd, 2); 2808 return (SET_ERROR(ENXIO)); 2809 } 2810 } else { 2811 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 2812 vdev_t *tvd = rvd->vdev_child[c]; 2813 2814 if (tvd->vdev_islog && 2815 tvd->vdev_state == VDEV_STATE_CANT_OPEN) { 2816 spa_set_log_state(spa, SPA_LOG_CLEAR); 2817 spa_load_note(spa, "some log devices are " 2818 "missing, ZIL is dropped."); 2819 vdev_dbgmsg_print_tree(rvd, 2); 2820 break; 2821 } 2822 } 2823 } 2824 2825 return (0); 2826 } 2827 2828 /* 2829 * Check for missing log devices 2830 */ 2831 static boolean_t 2832 spa_check_logs(spa_t *spa) 2833 { 2834 boolean_t rv = B_FALSE; 2835 dsl_pool_t *dp = spa_get_dsl(spa); 2836 2837 switch (spa->spa_log_state) { 2838 default: 2839 break; 2840 case SPA_LOG_MISSING: 2841 /* need to recheck in case slog has been restored */ 2842 case SPA_LOG_UNKNOWN: 2843 rv = 
(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 2844 zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0); 2845 if (rv) 2846 spa_set_log_state(spa, SPA_LOG_MISSING); 2847 break; 2848 } 2849 return (rv); 2850 } 2851 2852 /* 2853 * Passivate any log vdevs (note, does not apply to embedded log metaslabs). 2854 */ 2855 static boolean_t 2856 spa_passivate_log(spa_t *spa) 2857 { 2858 vdev_t *rvd = spa->spa_root_vdev; 2859 boolean_t slog_found = B_FALSE; 2860 2861 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 2862 2863 for (int c = 0; c < rvd->vdev_children; c++) { 2864 vdev_t *tvd = rvd->vdev_child[c]; 2865 2866 if (tvd->vdev_islog) { 2867 ASSERT0P(tvd->vdev_log_mg); 2868 metaslab_group_passivate(tvd->vdev_mg); 2869 slog_found = B_TRUE; 2870 } 2871 } 2872 2873 return (slog_found); 2874 } 2875 2876 /* 2877 * Activate any log vdevs (note, does not apply to embedded log metaslabs). 2878 */ 2879 static void 2880 spa_activate_log(spa_t *spa) 2881 { 2882 vdev_t *rvd = spa->spa_root_vdev; 2883 2884 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 2885 2886 for (int c = 0; c < rvd->vdev_children; c++) { 2887 vdev_t *tvd = rvd->vdev_child[c]; 2888 2889 if (tvd->vdev_islog) { 2890 ASSERT0P(tvd->vdev_log_mg); 2891 metaslab_group_activate(tvd->vdev_mg); 2892 } 2893 } 2894 } 2895 2896 int 2897 spa_reset_logs(spa_t *spa) 2898 { 2899 int error; 2900 2901 error = dmu_objset_find(spa_name(spa), zil_reset, 2902 NULL, DS_FIND_CHILDREN); 2903 if (error == 0) { 2904 /* 2905 * We successfully offlined the log device, sync out the 2906 * current txg so that the "stubby" block can be removed 2907 * by zil_sync(). 
2908 */ 2909 txg_wait_synced(spa->spa_dsl_pool, 0); 2910 } 2911 return (error); 2912 } 2913 2914 static void 2915 spa_aux_check_removed(spa_aux_vdev_t *sav) 2916 { 2917 for (int i = 0; i < sav->sav_count; i++) 2918 spa_check_removed(sav->sav_vdevs[i]); 2919 } 2920 2921 void 2922 spa_claim_notify(zio_t *zio) 2923 { 2924 spa_t *spa = zio->io_spa; 2925 2926 if (zio->io_error) 2927 return; 2928 2929 mutex_enter(&spa->spa_props_lock); /* any mutex will do */ 2930 if (spa->spa_claim_max_txg < BP_GET_BIRTH(zio->io_bp)) 2931 spa->spa_claim_max_txg = BP_GET_BIRTH(zio->io_bp); 2932 mutex_exit(&spa->spa_props_lock); 2933 } 2934 2935 typedef struct spa_load_error { 2936 boolean_t sle_verify_data; 2937 uint64_t sle_meta_count; 2938 uint64_t sle_data_count; 2939 } spa_load_error_t; 2940 2941 static void 2942 spa_load_verify_done(zio_t *zio) 2943 { 2944 blkptr_t *bp = zio->io_bp; 2945 spa_load_error_t *sle = zio->io_private; 2946 dmu_object_type_t type = BP_GET_TYPE(bp); 2947 int error = zio->io_error; 2948 spa_t *spa = zio->io_spa; 2949 2950 abd_free(zio->io_abd); 2951 if (error) { 2952 if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && 2953 type != DMU_OT_INTENT_LOG) 2954 atomic_inc_64(&sle->sle_meta_count); 2955 else 2956 atomic_inc_64(&sle->sle_data_count); 2957 } 2958 2959 mutex_enter(&spa->spa_scrub_lock); 2960 spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp); 2961 cv_broadcast(&spa->spa_scrub_io_cv); 2962 mutex_exit(&spa->spa_scrub_lock); 2963 } 2964 2965 /* 2966 * Maximum number of inflight bytes is the log2 fraction of the arc size. 2967 * By default, we set it to 1/16th of the arc. 
2968 */ 2969 static uint_t spa_load_verify_shift = 4; 2970 static int spa_load_verify_metadata = B_TRUE; 2971 static int spa_load_verify_data = B_TRUE; 2972 2973 static int 2974 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 2975 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 2976 { 2977 zio_t *rio = arg; 2978 spa_load_error_t *sle = rio->io_private; 2979 2980 (void) zilog, (void) dnp; 2981 2982 /* 2983 * Note: normally this routine will not be called if 2984 * spa_load_verify_metadata is not set. However, it may be useful 2985 * to manually set the flag after the traversal has begun. 2986 */ 2987 if (!spa_load_verify_metadata) 2988 return (0); 2989 2990 /* 2991 * Sanity check the block pointer in order to detect obvious damage 2992 * before using the contents in subsequent checks or in zio_read(). 2993 * When damaged consider it to be a metadata error since we cannot 2994 * trust the BP_GET_TYPE and BP_GET_LEVEL values. 2995 */ 2996 if (zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { 2997 atomic_inc_64(&sle->sle_meta_count); 2998 return (0); 2999 } 3000 3001 if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || 3002 BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp)) 3003 return (0); 3004 3005 if (!BP_IS_METADATA(bp) && 3006 (!spa_load_verify_data || !sle->sle_verify_data)) 3007 return (0); 3008 3009 uint64_t maxinflight_bytes = 3010 arc_target_bytes() >> spa_load_verify_shift; 3011 size_t size = BP_GET_PSIZE(bp); 3012 3013 mutex_enter(&spa->spa_scrub_lock); 3014 while (spa->spa_load_verify_bytes >= maxinflight_bytes) 3015 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 3016 spa->spa_load_verify_bytes += size; 3017 mutex_exit(&spa->spa_scrub_lock); 3018 3019 zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size, 3020 spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, 3021 ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | 3022 ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); 3023 return (0); 3024 } 3025 3026 
static int 3027 verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) 3028 { 3029 (void) dp, (void) arg; 3030 3031 if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN) 3032 return (SET_ERROR(ENAMETOOLONG)); 3033 3034 return (0); 3035 } 3036 3037 static int 3038 spa_load_verify(spa_t *spa) 3039 { 3040 zio_t *rio; 3041 spa_load_error_t sle = { 0 }; 3042 zpool_load_policy_t policy; 3043 boolean_t verify_ok = B_FALSE; 3044 int error = 0; 3045 3046 zpool_get_load_policy(spa->spa_config, &policy); 3047 3048 if (policy.zlp_rewind & ZPOOL_NEVER_REWIND || 3049 policy.zlp_maxmeta == UINT64_MAX) 3050 return (0); 3051 3052 dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); 3053 error = dmu_objset_find_dp(spa->spa_dsl_pool, 3054 spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL, 3055 DS_FIND_CHILDREN); 3056 dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); 3057 if (error != 0) 3058 return (error); 3059 3060 /* 3061 * Verify data only if we are rewinding or error limit was set. 3062 * Otherwise nothing except dbgmsg care about it to waste time. 3063 */ 3064 sle.sle_verify_data = (policy.zlp_rewind & ZPOOL_REWIND_MASK) || 3065 (policy.zlp_maxdata < UINT64_MAX); 3066 3067 rio = zio_root(spa, NULL, &sle, 3068 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 3069 3070 if (spa_load_verify_metadata) { 3071 if (spa->spa_extreme_rewind) { 3072 spa_load_note(spa, "performing a complete scan of the " 3073 "pool since extreme rewind is on. 
This may take " 3074 "a very long time.\n (spa_load_verify_data=%u, " 3075 "spa_load_verify_metadata=%u)", 3076 spa_load_verify_data, spa_load_verify_metadata); 3077 } 3078 3079 error = traverse_pool(spa, spa->spa_verify_min_txg, 3080 TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | 3081 TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio); 3082 } 3083 3084 (void) zio_wait(rio); 3085 ASSERT0(spa->spa_load_verify_bytes); 3086 3087 spa->spa_load_meta_errors = sle.sle_meta_count; 3088 spa->spa_load_data_errors = sle.sle_data_count; 3089 3090 if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) { 3091 spa_load_note(spa, "spa_load_verify found %llu metadata errors " 3092 "and %llu data errors", (u_longlong_t)sle.sle_meta_count, 3093 (u_longlong_t)sle.sle_data_count); 3094 } 3095 3096 if (spa_load_verify_dryrun || 3097 (!error && sle.sle_meta_count <= policy.zlp_maxmeta && 3098 sle.sle_data_count <= policy.zlp_maxdata)) { 3099 int64_t loss = 0; 3100 3101 verify_ok = B_TRUE; 3102 spa->spa_load_txg = spa->spa_uberblock.ub_txg; 3103 spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; 3104 3105 loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; 3106 fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_TIME, 3107 spa->spa_load_txg_ts); 3108 fnvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME, 3109 loss); 3110 fnvlist_add_uint64(spa->spa_load_info, 3111 ZPOOL_CONFIG_LOAD_META_ERRORS, sle.sle_meta_count); 3112 fnvlist_add_uint64(spa->spa_load_info, 3113 ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count); 3114 } else { 3115 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; 3116 } 3117 3118 if (spa_load_verify_dryrun) 3119 return (0); 3120 3121 if (error) { 3122 if (error != ENXIO && error != EIO) 3123 error = SET_ERROR(EIO); 3124 return (error); 3125 } 3126 3127 return (verify_ok ? 0 : EIO); 3128 } 3129 3130 /* 3131 * Find a value in the pool props object. 
3132 */ 3133 static void 3134 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 3135 { 3136 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 3137 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 3138 } 3139 3140 /* 3141 * Find a value in the pool directory object. 3142 */ 3143 static int 3144 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent) 3145 { 3146 int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 3147 name, sizeof (uint64_t), 1, val); 3148 3149 if (error != 0 && (error != ENOENT || log_enoent)) { 3150 spa_load_failed(spa, "couldn't get '%s' value in MOS directory " 3151 "[error=%d]", name, error); 3152 } 3153 3154 return (error); 3155 } 3156 3157 static int 3158 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 3159 { 3160 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 3161 return (SET_ERROR(err)); 3162 } 3163 3164 boolean_t 3165 spa_livelist_delete_check(spa_t *spa) 3166 { 3167 return (spa->spa_livelists_to_delete != 0); 3168 } 3169 3170 static boolean_t 3171 spa_livelist_delete_cb_check(void *arg, zthr_t *z) 3172 { 3173 (void) z; 3174 spa_t *spa = arg; 3175 return (spa_livelist_delete_check(spa)); 3176 } 3177 3178 static int 3179 delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 3180 { 3181 spa_t *spa = arg; 3182 zio_free(spa, tx->tx_txg, bp); 3183 dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, 3184 -bp_get_dsize_sync(spa, bp), 3185 -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); 3186 return (0); 3187 } 3188 3189 static int 3190 dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp) 3191 { 3192 int err; 3193 zap_cursor_t zc; 3194 zap_attribute_t *za = zap_attribute_alloc(); 3195 zap_cursor_init(&zc, os, zap_obj); 3196 err = zap_cursor_retrieve(&zc, za); 3197 zap_cursor_fini(&zc); 3198 if (err == 0) 3199 *llp = za->za_first_integer; 3200 zap_attribute_free(za); 3201 return (err); 3202 } 3203 3204 /* 3205 * Components of 
livelist deletion that must be performed in syncing 3206 * context: freeing block pointers and updating the pool-wide data 3207 * structures to indicate how much work is left to do 3208 */ 3209 typedef struct sublist_delete_arg { 3210 spa_t *spa; 3211 dsl_deadlist_t *ll; 3212 uint64_t key; 3213 bplist_t *to_free; 3214 } sublist_delete_arg_t; 3215 3216 static void 3217 sublist_delete_sync(void *arg, dmu_tx_t *tx) 3218 { 3219 sublist_delete_arg_t *sda = arg; 3220 spa_t *spa = sda->spa; 3221 dsl_deadlist_t *ll = sda->ll; 3222 uint64_t key = sda->key; 3223 bplist_t *to_free = sda->to_free; 3224 3225 bplist_iterate(to_free, delete_blkptr_cb, spa, tx); 3226 dsl_deadlist_remove_entry(ll, key, tx); 3227 } 3228 3229 typedef struct livelist_delete_arg { 3230 spa_t *spa; 3231 uint64_t ll_obj; 3232 uint64_t zap_obj; 3233 } livelist_delete_arg_t; 3234 3235 static void 3236 livelist_delete_sync(void *arg, dmu_tx_t *tx) 3237 { 3238 livelist_delete_arg_t *lda = arg; 3239 spa_t *spa = lda->spa; 3240 uint64_t ll_obj = lda->ll_obj; 3241 uint64_t zap_obj = lda->zap_obj; 3242 objset_t *mos = spa->spa_meta_objset; 3243 uint64_t count; 3244 3245 /* free the livelist and decrement the feature count */ 3246 VERIFY0(zap_remove_int(mos, zap_obj, ll_obj, tx)); 3247 dsl_deadlist_free(mos, ll_obj, tx); 3248 spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx); 3249 VERIFY0(zap_count(mos, zap_obj, &count)); 3250 if (count == 0) { 3251 /* no more livelists to delete */ 3252 VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, 3253 DMU_POOL_DELETED_CLONES, tx)); 3254 VERIFY0(zap_destroy(mos, zap_obj, tx)); 3255 spa->spa_livelists_to_delete = 0; 3256 spa_notify_waiters(spa); 3257 } 3258 } 3259 3260 /* 3261 * Load in the value for the livelist to be removed and open it. Then, 3262 * load its first sublist and determine which block pointers should actually 3263 * be freed. Then, call a synctask which performs the actual frees and updates 3264 * the pool-wide livelist data. 
3265 */ 3266 static void 3267 spa_livelist_delete_cb(void *arg, zthr_t *z) 3268 { 3269 spa_t *spa = arg; 3270 uint64_t ll_obj = 0, count; 3271 objset_t *mos = spa->spa_meta_objset; 3272 uint64_t zap_obj = spa->spa_livelists_to_delete; 3273 /* 3274 * Determine the next livelist to delete. This function should only 3275 * be called if there is at least one deleted clone. 3276 */ 3277 VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj)); 3278 VERIFY0(zap_count(mos, ll_obj, &count)); 3279 if (count > 0) { 3280 dsl_deadlist_t *ll; 3281 dsl_deadlist_entry_t *dle; 3282 bplist_t to_free; 3283 ll = kmem_zalloc(sizeof (dsl_deadlist_t), KM_SLEEP); 3284 VERIFY0(dsl_deadlist_open(ll, mos, ll_obj)); 3285 dle = dsl_deadlist_first(ll); 3286 ASSERT3P(dle, !=, NULL); 3287 bplist_create(&to_free); 3288 int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free, 3289 z, NULL); 3290 if (err == 0) { 3291 sublist_delete_arg_t sync_arg = { 3292 .spa = spa, 3293 .ll = ll, 3294 .key = dle->dle_mintxg, 3295 .to_free = &to_free 3296 }; 3297 zfs_dbgmsg("deleting sublist (id %llu) from" 3298 " livelist %llu, %lld remaining", 3299 (u_longlong_t)dle->dle_bpobj.bpo_object, 3300 (u_longlong_t)ll_obj, (longlong_t)count - 1); 3301 VERIFY0(dsl_sync_task(spa_name(spa), NULL, 3302 sublist_delete_sync, &sync_arg, 0, 3303 ZFS_SPACE_CHECK_DESTROY)); 3304 } else { 3305 VERIFY3U(err, ==, EINTR); 3306 } 3307 bplist_clear(&to_free); 3308 bplist_destroy(&to_free); 3309 dsl_deadlist_close(ll); 3310 kmem_free(ll, sizeof (dsl_deadlist_t)); 3311 } else { 3312 livelist_delete_arg_t sync_arg = { 3313 .spa = spa, 3314 .ll_obj = ll_obj, 3315 .zap_obj = zap_obj 3316 }; 3317 zfs_dbgmsg("deletion of livelist %llu completed", 3318 (u_longlong_t)ll_obj); 3319 VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync, 3320 &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY)); 3321 } 3322 } 3323 3324 static void 3325 spa_start_livelist_destroy_thread(spa_t *spa) 3326 { 3327 ASSERT0P(spa->spa_livelist_delete_zthr); 3328 
spa->spa_livelist_delete_zthr = 3329 zthr_create("z_livelist_destroy", 3330 spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa, 3331 minclsyspri); 3332 } 3333 3334 typedef struct livelist_new_arg { 3335 bplist_t *allocs; 3336 bplist_t *frees; 3337 } livelist_new_arg_t; 3338 3339 static int 3340 livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 3341 dmu_tx_t *tx) 3342 { 3343 ASSERT0P(tx); 3344 livelist_new_arg_t *lna = arg; 3345 if (bp_freed) { 3346 bplist_append(lna->frees, bp); 3347 } else { 3348 bplist_append(lna->allocs, bp); 3349 zfs_livelist_condense_new_alloc++; 3350 } 3351 return (0); 3352 } 3353 3354 typedef struct livelist_condense_arg { 3355 spa_t *spa; 3356 bplist_t to_keep; 3357 uint64_t first_size; 3358 uint64_t next_size; 3359 } livelist_condense_arg_t; 3360 3361 static void 3362 spa_livelist_condense_sync(void *arg, dmu_tx_t *tx) 3363 { 3364 livelist_condense_arg_t *lca = arg; 3365 spa_t *spa = lca->spa; 3366 bplist_t new_frees; 3367 dsl_dataset_t *ds = spa->spa_to_condense.ds; 3368 3369 /* Have we been cancelled? */ 3370 if (spa->spa_to_condense.cancelled) { 3371 zfs_livelist_condense_sync_cancel++; 3372 goto out; 3373 } 3374 3375 dsl_deadlist_entry_t *first = spa->spa_to_condense.first; 3376 dsl_deadlist_entry_t *next = spa->spa_to_condense.next; 3377 dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist; 3378 3379 /* 3380 * It's possible that the livelist was changed while the zthr was 3381 * running. Therefore, we need to check for new blkptrs in the two 3382 * entries being condensed and continue to track them in the livelist. 3383 * Because of the way we handle remapped blkptrs (see dbuf_remap_impl), 3384 * it's possible that the newly added blkptrs are FREEs or ALLOCs so 3385 * we need to sort them into two different bplists. 
3386 */ 3387 uint64_t first_obj = first->dle_bpobj.bpo_object; 3388 uint64_t next_obj = next->dle_bpobj.bpo_object; 3389 uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs; 3390 uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs; 3391 3392 bplist_create(&new_frees); 3393 livelist_new_arg_t new_bps = { 3394 .allocs = &lca->to_keep, 3395 .frees = &new_frees, 3396 }; 3397 3398 if (cur_first_size > lca->first_size) { 3399 VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj, 3400 livelist_track_new_cb, &new_bps, lca->first_size)); 3401 } 3402 if (cur_next_size > lca->next_size) { 3403 VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj, 3404 livelist_track_new_cb, &new_bps, lca->next_size)); 3405 } 3406 3407 dsl_deadlist_clear_entry(first, ll, tx); 3408 ASSERT(bpobj_is_empty(&first->dle_bpobj)); 3409 dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx); 3410 3411 bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx); 3412 bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx); 3413 bplist_destroy(&new_frees); 3414 3415 char dsname[ZFS_MAX_DATASET_NAME_LEN]; 3416 dsl_dataset_name(ds, dsname); 3417 zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu " 3418 "(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu " 3419 "(%llu blkptrs)", (u_longlong_t)tx->tx_txg, dsname, 3420 (u_longlong_t)ds->ds_object, (u_longlong_t)first_obj, 3421 (u_longlong_t)cur_first_size, (u_longlong_t)next_obj, 3422 (u_longlong_t)cur_next_size, 3423 (u_longlong_t)first->dle_bpobj.bpo_object, 3424 (u_longlong_t)first->dle_bpobj.bpo_phys->bpo_num_blkptrs); 3425 out: 3426 dmu_buf_rele(ds->ds_dbuf, spa); 3427 spa->spa_to_condense.ds = NULL; 3428 bplist_clear(&lca->to_keep); 3429 bplist_destroy(&lca->to_keep); 3430 kmem_free(lca, sizeof (livelist_condense_arg_t)); 3431 spa->spa_to_condense.syncing = B_FALSE; 3432 } 3433 3434 static void 3435 spa_livelist_condense_cb(void *arg, zthr_t *t) 3436 { 3437 while 
(zfs_livelist_condense_zthr_pause && 3438 !(zthr_has_waiters(t) || zthr_iscancelled(t))) 3439 delay(1); 3440 3441 spa_t *spa = arg; 3442 dsl_deadlist_entry_t *first = spa->spa_to_condense.first; 3443 dsl_deadlist_entry_t *next = spa->spa_to_condense.next; 3444 uint64_t first_size, next_size; 3445 3446 livelist_condense_arg_t *lca = 3447 kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP); 3448 bplist_create(&lca->to_keep); 3449 3450 /* 3451 * Process the livelists (matching FREEs and ALLOCs) in open context 3452 * so we have minimal work in syncing context to condense. 3453 * 3454 * We save bpobj sizes (first_size and next_size) to use later in 3455 * syncing context to determine if entries were added to these sublists 3456 * while in open context. This is possible because the clone is still 3457 * active and open for normal writes and we want to make sure the new, 3458 * unprocessed blockpointers are inserted into the livelist normally. 3459 * 3460 * Note that dsl_process_sub_livelist() both stores the size number of 3461 * blockpointers and iterates over them while the bpobj's lock held, so 3462 * the sizes returned to us are consistent which what was actually 3463 * processed. 3464 */ 3465 int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t, 3466 &first_size); 3467 if (err == 0) 3468 err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep, 3469 t, &next_size); 3470 3471 if (err == 0) { 3472 while (zfs_livelist_condense_sync_pause && 3473 !(zthr_has_waiters(t) || zthr_iscancelled(t))) 3474 delay(1); 3475 3476 dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 3477 dmu_tx_mark_netfree(tx); 3478 dmu_tx_hold_space(tx, 1); 3479 err = dmu_tx_assign(tx, DMU_TX_NOWAIT | DMU_TX_NOTHROTTLE); 3480 if (err == 0) { 3481 /* 3482 * Prevent the condense zthr restarting before 3483 * the synctask completes. 
3484 */ 3485 spa->spa_to_condense.syncing = B_TRUE; 3486 lca->spa = spa; 3487 lca->first_size = first_size; 3488 lca->next_size = next_size; 3489 dsl_sync_task_nowait(spa_get_dsl(spa), 3490 spa_livelist_condense_sync, lca, tx); 3491 dmu_tx_commit(tx); 3492 return; 3493 } 3494 } 3495 /* 3496 * Condensing can not continue: either it was externally stopped or 3497 * we were unable to assign to a tx because the pool has run out of 3498 * space. In the second case, we'll just end up trying to condense 3499 * again in a later txg. 3500 */ 3501 ASSERT(err != 0); 3502 bplist_clear(&lca->to_keep); 3503 bplist_destroy(&lca->to_keep); 3504 kmem_free(lca, sizeof (livelist_condense_arg_t)); 3505 dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa); 3506 spa->spa_to_condense.ds = NULL; 3507 if (err == EINTR) 3508 zfs_livelist_condense_zthr_cancel++; 3509 } 3510 3511 /* 3512 * Check that there is something to condense but that a condense is not 3513 * already in progress and that condensing has not been cancelled. 
3514 */ 3515 static boolean_t 3516 spa_livelist_condense_cb_check(void *arg, zthr_t *z) 3517 { 3518 (void) z; 3519 spa_t *spa = arg; 3520 if ((spa->spa_to_condense.ds != NULL) && 3521 (spa->spa_to_condense.syncing == B_FALSE) && 3522 (spa->spa_to_condense.cancelled == B_FALSE)) { 3523 return (B_TRUE); 3524 } 3525 return (B_FALSE); 3526 } 3527 3528 static void 3529 spa_start_livelist_condensing_thread(spa_t *spa) 3530 { 3531 spa->spa_to_condense.ds = NULL; 3532 spa->spa_to_condense.first = NULL; 3533 spa->spa_to_condense.next = NULL; 3534 spa->spa_to_condense.syncing = B_FALSE; 3535 spa->spa_to_condense.cancelled = B_FALSE; 3536 3537 ASSERT0P(spa->spa_livelist_condense_zthr); 3538 spa->spa_livelist_condense_zthr = 3539 zthr_create("z_livelist_condense", 3540 spa_livelist_condense_cb_check, 3541 spa_livelist_condense_cb, spa, minclsyspri); 3542 } 3543 3544 static void 3545 spa_spawn_aux_threads(spa_t *spa) 3546 { 3547 ASSERT(spa_writeable(spa)); 3548 3549 spa_start_raidz_expansion_thread(spa); 3550 spa_start_indirect_condensing_thread(spa); 3551 spa_start_livelist_destroy_thread(spa); 3552 spa_start_livelist_condensing_thread(spa); 3553 3554 ASSERT0P(spa->spa_checkpoint_discard_zthr); 3555 spa->spa_checkpoint_discard_zthr = 3556 zthr_create("z_checkpoint_discard", 3557 spa_checkpoint_discard_thread_check, 3558 spa_checkpoint_discard_thread, spa, minclsyspri); 3559 } 3560 3561 /* 3562 * Fix up config after a partly-completed split. This is done with the 3563 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 3564 * pool have that entry in their config, but only the splitting one contains 3565 * a list of all the guids of the vdevs that are being split off. 3566 * 3567 * This function determines what to do with that list: either rejoin 3568 * all the disks to the pool, or complete the splitting process. To attempt 3569 * the rejoin, each disk that is offlined is marked online again, and 3570 * we do a reopen() call. 
If the vdev label for every disk that was 3571 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 3572 * then we call vdev_split() on each disk, and complete the split. 3573 * 3574 * Otherwise we leave the config alone, with all the vdevs in place in 3575 * the original pool. 3576 */ 3577 static void 3578 spa_try_repair(spa_t *spa, nvlist_t *config) 3579 { 3580 uint_t extracted; 3581 uint64_t *glist; 3582 uint_t i, gcount; 3583 nvlist_t *nvl; 3584 vdev_t **vd; 3585 boolean_t attempt_reopen; 3586 3587 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 3588 return; 3589 3590 /* check that the config is complete */ 3591 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 3592 &glist, &gcount) != 0) 3593 return; 3594 3595 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 3596 3597 /* attempt to online all the vdevs & validate */ 3598 attempt_reopen = B_TRUE; 3599 for (i = 0; i < gcount; i++) { 3600 if (glist[i] == 0) /* vdev is hole */ 3601 continue; 3602 3603 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 3604 if (vd[i] == NULL) { 3605 /* 3606 * Don't bother attempting to reopen the disks; 3607 * just do the split. 3608 */ 3609 attempt_reopen = B_FALSE; 3610 } else { 3611 /* attempt to re-online it */ 3612 vd[i]->vdev_offline = B_FALSE; 3613 } 3614 } 3615 3616 if (attempt_reopen) { 3617 vdev_reopen(spa->spa_root_vdev); 3618 3619 /* check each device to see what state it's in */ 3620 for (extracted = 0, i = 0; i < gcount; i++) { 3621 if (vd[i] != NULL && 3622 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 3623 break; 3624 ++extracted; 3625 } 3626 } 3627 3628 /* 3629 * If every disk has been moved to the new pool, or if we never 3630 * even attempted to look at them, then we split them off for 3631 * good. 
3632 */ 3633 if (!attempt_reopen || gcount == extracted) { 3634 for (i = 0; i < gcount; i++) 3635 if (vd[i] != NULL) 3636 vdev_split(vd[i]); 3637 vdev_reopen(spa->spa_root_vdev); 3638 } 3639 3640 kmem_free(vd, gcount * sizeof (vdev_t *)); 3641 } 3642 3643 static int 3644 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) 3645 { 3646 const char *ereport = FM_EREPORT_ZFS_POOL; 3647 int error; 3648 3649 spa->spa_load_state = state; 3650 (void) spa_import_progress_set_state(spa_guid(spa), 3651 spa_load_state(spa)); 3652 spa_import_progress_set_notes(spa, "spa_load()"); 3653 3654 gethrestime(&spa->spa_loaded_ts); 3655 error = spa_load_impl(spa, type, &ereport); 3656 3657 /* 3658 * Don't count references from objsets that are already closed 3659 * and are making their way through the eviction process. 3660 */ 3661 spa_evicting_os_wait(spa); 3662 spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); 3663 if (error) { 3664 if (error != EEXIST) { 3665 spa->spa_loaded_ts.tv_sec = 0; 3666 spa->spa_loaded_ts.tv_nsec = 0; 3667 } 3668 if (error != EBADF) { 3669 (void) zfs_ereport_post(ereport, spa, 3670 NULL, NULL, NULL, 0); 3671 } 3672 } 3673 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 3674 spa->spa_ena = 0; 3675 3676 (void) spa_import_progress_set_state(spa_guid(spa), 3677 spa_load_state(spa)); 3678 3679 return (error); 3680 } 3681 3682 #ifdef ZFS_DEBUG 3683 /* 3684 * Count the number of per-vdev ZAPs associated with all of the vdevs in the 3685 * vdev tree rooted in the given vd, and ensure that each ZAP is present in the 3686 * spa's per-vdev ZAP list. 
3687 */ 3688 static uint64_t 3689 vdev_count_verify_zaps(vdev_t *vd) 3690 { 3691 spa_t *spa = vd->vdev_spa; 3692 uint64_t total = 0; 3693 3694 if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2) && 3695 vd->vdev_root_zap != 0) { 3696 total++; 3697 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 3698 spa->spa_all_vdev_zaps, vd->vdev_root_zap)); 3699 } 3700 if (vd->vdev_top_zap != 0) { 3701 total++; 3702 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 3703 spa->spa_all_vdev_zaps, vd->vdev_top_zap)); 3704 } 3705 if (vd->vdev_leaf_zap != 0) { 3706 total++; 3707 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 3708 spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); 3709 } 3710 3711 for (uint64_t i = 0; i < vd->vdev_children; i++) { 3712 total += vdev_count_verify_zaps(vd->vdev_child[i]); 3713 } 3714 3715 return (total); 3716 } 3717 #else 3718 #define vdev_count_verify_zaps(vd) ((void) sizeof (vd), 0) 3719 #endif 3720 3721 /* 3722 * Determine whether the activity check is required. 3723 */ 3724 static boolean_t 3725 spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label, 3726 nvlist_t *config) 3727 { 3728 uint64_t state = 0; 3729 uint64_t hostid = 0; 3730 uint64_t tryconfig_txg = 0; 3731 uint64_t tryconfig_timestamp = 0; 3732 uint16_t tryconfig_mmp_seq = 0; 3733 nvlist_t *nvinfo; 3734 3735 if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { 3736 nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); 3737 (void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG, 3738 &tryconfig_txg); 3739 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 3740 &tryconfig_timestamp); 3741 (void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ, 3742 &tryconfig_mmp_seq); 3743 } 3744 3745 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state); 3746 3747 /* 3748 * Disable the MMP activity check - This is used by zdb which 3749 * is intended to be used on potentially active pools. 
3750 */ 3751 if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) 3752 return (B_FALSE); 3753 3754 /* 3755 * Skip the activity check when the MMP feature is disabled. 3756 */ 3757 if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0) 3758 return (B_FALSE); 3759 3760 /* 3761 * If the tryconfig_ values are nonzero, they are the results of an 3762 * earlier tryimport. If they all match the uberblock we just found, 3763 * then the pool has not changed and we return false so we do not test 3764 * a second time. 3765 */ 3766 if (tryconfig_txg && tryconfig_txg == ub->ub_txg && 3767 tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp && 3768 tryconfig_mmp_seq && tryconfig_mmp_seq == 3769 (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) 3770 return (B_FALSE); 3771 3772 /* 3773 * Allow the activity check to be skipped when importing the pool 3774 * on the same host which last imported it. Since the hostid from 3775 * configuration may be stale use the one read from the label. 3776 */ 3777 if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID)) 3778 hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID); 3779 3780 if (hostid == spa_get_hostid(spa)) 3781 return (B_FALSE); 3782 3783 /* 3784 * Skip the activity test when the pool was cleanly exported. 3785 */ 3786 if (state != POOL_STATE_ACTIVE) 3787 return (B_FALSE); 3788 3789 return (B_TRUE); 3790 } 3791 3792 /* 3793 * Nanoseconds the activity check must watch for changes on-disk. 3794 */ 3795 static uint64_t 3796 spa_activity_check_duration(spa_t *spa, uberblock_t *ub) 3797 { 3798 uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1); 3799 uint64_t multihost_interval = MSEC2NSEC( 3800 MMP_INTERVAL_OK(zfs_multihost_interval)); 3801 uint64_t import_delay = MAX(NANOSEC, import_intervals * 3802 multihost_interval); 3803 3804 /* 3805 * Local tunables determine a minimum duration except for the case 3806 * where we know when the remote host will suspend the pool if MMP 3807 * writes do not land. 
3808 * 3809 * See Big Theory comment at the top of mmp.c for the reasoning behind 3810 * these cases and times. 3811 */ 3812 3813 ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100); 3814 3815 if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && 3816 MMP_FAIL_INT(ub) > 0) { 3817 3818 /* MMP on remote host will suspend pool after failed writes */ 3819 import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) * 3820 MMP_IMPORT_SAFETY_FACTOR / 100; 3821 3822 zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp " 3823 "mmp_fails=%llu ub_mmp mmp_interval=%llu " 3824 "import_intervals=%llu", (u_longlong_t)import_delay, 3825 (u_longlong_t)MMP_FAIL_INT(ub), 3826 (u_longlong_t)MMP_INTERVAL(ub), 3827 (u_longlong_t)import_intervals); 3828 3829 } else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && 3830 MMP_FAIL_INT(ub) == 0) { 3831 3832 /* MMP on remote host will never suspend pool */ 3833 import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) + 3834 ub->ub_mmp_delay) * import_intervals); 3835 3836 zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp " 3837 "mmp_interval=%llu ub_mmp_delay=%llu " 3838 "import_intervals=%llu", (u_longlong_t)import_delay, 3839 (u_longlong_t)MMP_INTERVAL(ub), 3840 (u_longlong_t)ub->ub_mmp_delay, 3841 (u_longlong_t)import_intervals); 3842 3843 } else if (MMP_VALID(ub)) { 3844 /* 3845 * zfs-0.7 compatibility case 3846 */ 3847 3848 import_delay = MAX(import_delay, (multihost_interval + 3849 ub->ub_mmp_delay) * import_intervals); 3850 3851 zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu " 3852 "import_intervals=%llu leaves=%u", 3853 (u_longlong_t)import_delay, 3854 (u_longlong_t)ub->ub_mmp_delay, 3855 (u_longlong_t)import_intervals, 3856 vdev_count_leaves(spa)); 3857 } else { 3858 /* Using local tunings is the only reasonable option */ 3859 zfs_dbgmsg("pool last imported on non-MMP aware " 3860 "host using import_delay=%llu multihost_interval=%llu " 3861 "import_intervals=%llu", (u_longlong_t)import_delay, 3862 
(u_longlong_t)multihost_interval, 3863 (u_longlong_t)import_intervals); 3864 } 3865 3866 return (import_delay); 3867 } 3868 3869 /* 3870 * Remote host activity check. 3871 * 3872 * error results: 3873 * 0 - no activity detected 3874 * EREMOTEIO - remote activity detected 3875 * EINTR - user canceled the operation 3876 */ 3877 static int 3878 spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config, 3879 boolean_t importing) 3880 { 3881 uint64_t txg = ub->ub_txg; 3882 uint64_t timestamp = ub->ub_timestamp; 3883 uint64_t mmp_config = ub->ub_mmp_config; 3884 uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0; 3885 uint64_t import_delay; 3886 hrtime_t import_expire, now; 3887 nvlist_t *mmp_label = NULL; 3888 vdev_t *rvd = spa->spa_root_vdev; 3889 kcondvar_t cv; 3890 kmutex_t mtx; 3891 int error = 0; 3892 3893 cv_init(&cv, NULL, CV_DEFAULT, NULL); 3894 mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL); 3895 mutex_enter(&mtx); 3896 3897 /* 3898 * If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed 3899 * during the earlier tryimport. If the txg recorded there is 0 then 3900 * the pool is known to be active on another host. 3901 * 3902 * Otherwise, the pool might be in use on another host. Check for 3903 * changes in the uberblocks on disk if necessary. 
3904 */ 3905 if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { 3906 nvlist_t *nvinfo = fnvlist_lookup_nvlist(config, 3907 ZPOOL_CONFIG_LOAD_INFO); 3908 3909 if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) && 3910 fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) { 3911 vdev_uberblock_load(rvd, ub, &mmp_label); 3912 error = SET_ERROR(EREMOTEIO); 3913 goto out; 3914 } 3915 } 3916 3917 import_delay = spa_activity_check_duration(spa, ub); 3918 3919 /* Add a small random factor in case of simultaneous imports (0-25%) */ 3920 import_delay += import_delay * random_in_range(250) / 1000; 3921 3922 import_expire = gethrtime() + import_delay; 3923 3924 if (importing) { 3925 spa_import_progress_set_notes(spa, "Checking MMP activity, " 3926 "waiting %llu ms", (u_longlong_t)NSEC2MSEC(import_delay)); 3927 } 3928 3929 int iterations = 0; 3930 while ((now = gethrtime()) < import_expire) { 3931 if (importing && iterations++ % 30 == 0) { 3932 spa_import_progress_set_notes(spa, "Checking MMP " 3933 "activity, %llu ms remaining", 3934 (u_longlong_t)NSEC2MSEC(import_expire - now)); 3935 } 3936 3937 if (importing) { 3938 (void) spa_import_progress_set_mmp_check(spa_guid(spa), 3939 NSEC2SEC(import_expire - gethrtime())); 3940 } 3941 3942 vdev_uberblock_load(rvd, ub, &mmp_label); 3943 3944 if (txg != ub->ub_txg || timestamp != ub->ub_timestamp || 3945 mmp_seq != (MMP_SEQ_VALID(ub) ? 
MMP_SEQ(ub) : 0)) { 3946 zfs_dbgmsg("multihost activity detected " 3947 "txg %llu ub_txg %llu " 3948 "timestamp %llu ub_timestamp %llu " 3949 "mmp_config %#llx ub_mmp_config %#llx", 3950 (u_longlong_t)txg, (u_longlong_t)ub->ub_txg, 3951 (u_longlong_t)timestamp, 3952 (u_longlong_t)ub->ub_timestamp, 3953 (u_longlong_t)mmp_config, 3954 (u_longlong_t)ub->ub_mmp_config); 3955 3956 error = SET_ERROR(EREMOTEIO); 3957 break; 3958 } 3959 3960 if (mmp_label) { 3961 nvlist_free(mmp_label); 3962 mmp_label = NULL; 3963 } 3964 3965 error = cv_timedwait_sig(&cv, &mtx, ddi_get_lbolt() + hz); 3966 if (error != -1) { 3967 error = SET_ERROR(EINTR); 3968 break; 3969 } 3970 error = 0; 3971 } 3972 3973 out: 3974 mutex_exit(&mtx); 3975 mutex_destroy(&mtx); 3976 cv_destroy(&cv); 3977 3978 /* 3979 * If the pool is determined to be active store the status in the 3980 * spa->spa_load_info nvlist. If the remote hostname or hostid are 3981 * available from configuration read from disk store them as well. 3982 * This allows 'zpool import' to generate a more useful message. 
3983 * 3984 * ZPOOL_CONFIG_MMP_STATE - observed pool status (mandatory) 3985 * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool 3986 * ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool 3987 */ 3988 if (error == EREMOTEIO) { 3989 if (mmp_label) { 3990 if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) { 3991 const char *hostname = fnvlist_lookup_string( 3992 mmp_label, ZPOOL_CONFIG_HOSTNAME); 3993 fnvlist_add_string(spa->spa_load_info, 3994 ZPOOL_CONFIG_MMP_HOSTNAME, hostname); 3995 } 3996 3997 if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) { 3998 uint64_t hostid = fnvlist_lookup_uint64( 3999 mmp_label, ZPOOL_CONFIG_HOSTID); 4000 fnvlist_add_uint64(spa->spa_load_info, 4001 ZPOOL_CONFIG_MMP_HOSTID, hostid); 4002 } 4003 } 4004 4005 fnvlist_add_uint64(spa->spa_load_info, 4006 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE); 4007 fnvlist_add_uint64(spa->spa_load_info, 4008 ZPOOL_CONFIG_MMP_TXG, 0); 4009 4010 error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO); 4011 } 4012 4013 if (mmp_label) 4014 nvlist_free(mmp_label); 4015 4016 return (error); 4017 } 4018 4019 /* 4020 * Called from zfs_ioc_clear for a pool that was suspended 4021 * after failing mmp write checks. 
4022 */ 4023 boolean_t 4024 spa_mmp_remote_host_activity(spa_t *spa) 4025 { 4026 ASSERT(spa_multihost(spa) && spa_suspended(spa)); 4027 4028 nvlist_t *best_label; 4029 uberblock_t best_ub; 4030 4031 /* 4032 * Locate the best uberblock on disk 4033 */ 4034 vdev_uberblock_load(spa->spa_root_vdev, &best_ub, &best_label); 4035 if (best_label) { 4036 /* 4037 * confirm that the best hostid matches our hostid 4038 */ 4039 if (nvlist_exists(best_label, ZPOOL_CONFIG_HOSTID) && 4040 spa_get_hostid(spa) != 4041 fnvlist_lookup_uint64(best_label, ZPOOL_CONFIG_HOSTID)) { 4042 nvlist_free(best_label); 4043 return (B_TRUE); 4044 } 4045 nvlist_free(best_label); 4046 } else { 4047 return (B_TRUE); 4048 } 4049 4050 if (!MMP_VALID(&best_ub) || 4051 !MMP_FAIL_INT_VALID(&best_ub) || 4052 MMP_FAIL_INT(&best_ub) == 0) { 4053 return (B_TRUE); 4054 } 4055 4056 if (best_ub.ub_txg != spa->spa_uberblock.ub_txg || 4057 best_ub.ub_timestamp != spa->spa_uberblock.ub_timestamp) { 4058 zfs_dbgmsg("txg mismatch detected during pool clear " 4059 "txg %llu ub_txg %llu timestamp %llu ub_timestamp %llu", 4060 (u_longlong_t)spa->spa_uberblock.ub_txg, 4061 (u_longlong_t)best_ub.ub_txg, 4062 (u_longlong_t)spa->spa_uberblock.ub_timestamp, 4063 (u_longlong_t)best_ub.ub_timestamp); 4064 return (B_TRUE); 4065 } 4066 4067 /* 4068 * Perform an activity check looking for any remote writer 4069 */ 4070 return (spa_activity_check(spa, &spa->spa_uberblock, spa->spa_config, 4071 B_FALSE) != 0); 4072 } 4073 4074 static int 4075 spa_verify_host(spa_t *spa, nvlist_t *mos_config) 4076 { 4077 uint64_t hostid; 4078 const char *hostname; 4079 uint64_t myhostid = 0; 4080 4081 if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config, 4082 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 4083 hostname = fnvlist_lookup_string(mos_config, 4084 ZPOOL_CONFIG_HOSTNAME); 4085 4086 myhostid = zone_get_hostid(NULL); 4087 4088 if (hostid != 0 && myhostid != 0 && hostid != myhostid) { 4089 cmn_err(CE_WARN, "pool '%s' could not be " 4090 "loaded as 
it was last accessed by " 4091 "another system (host: %s hostid: 0x%llx). " 4092 "See: https://openzfs.github.io/openzfs-docs/msg/" 4093 "ZFS-8000-EY", 4094 spa_name(spa), hostname, (u_longlong_t)hostid); 4095 spa_load_failed(spa, "hostid verification failed: pool " 4096 "last accessed by host: %s (hostid: 0x%llx)", 4097 hostname, (u_longlong_t)hostid); 4098 return (SET_ERROR(EBADF)); 4099 } 4100 } 4101 4102 return (0); 4103 } 4104 4105 static int 4106 spa_ld_parse_config(spa_t *spa, spa_import_type_t type) 4107 { 4108 int error = 0; 4109 nvlist_t *nvtree, *nvl, *config = spa->spa_config; 4110 int parse; 4111 vdev_t *rvd; 4112 uint64_t pool_guid; 4113 const char *comment; 4114 const char *compatibility; 4115 4116 /* 4117 * Versioning wasn't explicitly added to the label until later, so if 4118 * it's not present treat it as the initial version. 4119 */ 4120 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 4121 &spa->spa_ubsync.ub_version) != 0) 4122 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 4123 4124 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 4125 spa_load_failed(spa, "invalid config provided: '%s' missing", 4126 ZPOOL_CONFIG_POOL_GUID); 4127 return (SET_ERROR(EINVAL)); 4128 } 4129 4130 /* 4131 * If we are doing an import, ensure that the pool is not already 4132 * imported by checking if its pool guid already exists in the 4133 * spa namespace. 4134 * 4135 * The only case that we allow an already imported pool to be 4136 * imported again, is when the pool is checkpointed and we want to 4137 * look at its checkpointed state from userland tools like zdb. 
4138 */ 4139 #ifdef _KERNEL 4140 if ((spa->spa_load_state == SPA_LOAD_IMPORT || 4141 spa->spa_load_state == SPA_LOAD_TRYIMPORT) && 4142 spa_guid_exists(pool_guid, 0)) { 4143 #else 4144 if ((spa->spa_load_state == SPA_LOAD_IMPORT || 4145 spa->spa_load_state == SPA_LOAD_TRYIMPORT) && 4146 spa_guid_exists(pool_guid, 0) && 4147 !spa_importing_readonly_checkpoint(spa)) { 4148 #endif 4149 spa_load_failed(spa, "a pool with guid %llu is already open", 4150 (u_longlong_t)pool_guid); 4151 return (SET_ERROR(EEXIST)); 4152 } 4153 4154 spa->spa_config_guid = pool_guid; 4155 4156 nvlist_free(spa->spa_load_info); 4157 spa->spa_load_info = fnvlist_alloc(); 4158 4159 ASSERT0P(spa->spa_comment); 4160 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 4161 spa->spa_comment = spa_strdup(comment); 4162 4163 ASSERT0P(spa->spa_compatibility); 4164 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMPATIBILITY, 4165 &compatibility) == 0) 4166 spa->spa_compatibility = spa_strdup(compatibility); 4167 4168 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 4169 &spa->spa_config_txg); 4170 4171 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0) 4172 spa->spa_config_splitting = fnvlist_dup(nvl); 4173 4174 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) { 4175 spa_load_failed(spa, "invalid config provided: '%s' missing", 4176 ZPOOL_CONFIG_VDEV_TREE); 4177 return (SET_ERROR(EINVAL)); 4178 } 4179 4180 /* 4181 * Create "The Godfather" zio to hold all async IOs 4182 */ 4183 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 4184 KM_SLEEP); 4185 for (int i = 0; i < max_ncpus; i++) { 4186 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 4187 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 4188 ZIO_FLAG_GODFATHER); 4189 } 4190 4191 /* 4192 * Parse the configuration into a vdev tree. We explicitly set the 4193 * value that will be returned by spa_version() since parsing the 4194 * configuration requires knowing the version number. 
4195 */ 4196 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4197 parse = (type == SPA_IMPORT_EXISTING ? 4198 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 4199 error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse); 4200 spa_config_exit(spa, SCL_ALL, FTAG); 4201 4202 if (error != 0) { 4203 spa_load_failed(spa, "unable to parse config [error=%d]", 4204 error); 4205 return (error); 4206 } 4207 4208 ASSERT(spa->spa_root_vdev == rvd); 4209 ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); 4210 ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); 4211 4212 if (type != SPA_IMPORT_ASSEMBLE) { 4213 ASSERT(spa_guid(spa) == pool_guid); 4214 } 4215 4216 return (0); 4217 } 4218 4219 /* 4220 * Recursively open all vdevs in the vdev tree. This function is called twice: 4221 * first with the untrusted config, then with the trusted config. 4222 */ 4223 static int 4224 spa_ld_open_vdevs(spa_t *spa) 4225 { 4226 int error = 0; 4227 4228 /* 4229 * spa_missing_tvds_allowed defines how many top-level vdevs can be 4230 * missing/unopenable for the root vdev to be still considered openable. 
4231 */ 4232 if (spa->spa_trust_config) { 4233 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds; 4234 } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) { 4235 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile; 4236 } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) { 4237 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan; 4238 } else { 4239 spa->spa_missing_tvds_allowed = 0; 4240 } 4241 4242 spa->spa_missing_tvds_allowed = 4243 MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed); 4244 4245 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4246 error = vdev_open(spa->spa_root_vdev); 4247 spa_config_exit(spa, SCL_ALL, FTAG); 4248 4249 if (spa->spa_missing_tvds != 0) { 4250 spa_load_note(spa, "vdev tree has %lld missing top-level " 4251 "vdevs.", (u_longlong_t)spa->spa_missing_tvds); 4252 if (spa->spa_trust_config && (spa->spa_mode & SPA_MODE_WRITE)) { 4253 /* 4254 * Although theoretically we could allow users to open 4255 * incomplete pools in RW mode, we'd need to add a lot 4256 * of extra logic (e.g. adjust pool space to account 4257 * for missing vdevs). 4258 * This limitation also prevents users from accidentally 4259 * opening the pool in RW mode during data recovery and 4260 * damaging it further. 4261 */ 4262 spa_load_note(spa, "pools with missing top-level " 4263 "vdevs can only be opened in read-only mode."); 4264 error = SET_ERROR(ENXIO); 4265 } else { 4266 spa_load_note(spa, "current settings allow for maximum " 4267 "%lld missing top-level vdevs at this stage.", 4268 (u_longlong_t)spa->spa_missing_tvds_allowed); 4269 } 4270 } 4271 if (error != 0) { 4272 spa_load_failed(spa, "unable to open vdev tree [error=%d]", 4273 error); 4274 } 4275 if (spa->spa_missing_tvds != 0 || error != 0) 4276 vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2); 4277 4278 return (error); 4279 } 4280 4281 /* 4282 * We need to validate the vdev labels against the configuration that 4283 * we have in hand. 
This function is called twice: first with an untrusted 4284 * config, then with a trusted config. The validation is more strict when the 4285 * config is trusted. 4286 */ 4287 static int 4288 spa_ld_validate_vdevs(spa_t *spa) 4289 { 4290 int error = 0; 4291 vdev_t *rvd = spa->spa_root_vdev; 4292 4293 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4294 error = vdev_validate(rvd); 4295 spa_config_exit(spa, SCL_ALL, FTAG); 4296 4297 if (error != 0) { 4298 spa_load_failed(spa, "vdev_validate failed [error=%d]", error); 4299 return (error); 4300 } 4301 4302 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 4303 spa_load_failed(spa, "cannot open vdev tree after invalidating " 4304 "some vdevs"); 4305 vdev_dbgmsg_print_tree(rvd, 2); 4306 return (SET_ERROR(ENXIO)); 4307 } 4308 4309 return (0); 4310 } 4311 4312 static void 4313 spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub) 4314 { 4315 spa->spa_state = POOL_STATE_ACTIVE; 4316 spa->spa_ubsync = spa->spa_uberblock; 4317 spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 4318 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 4319 spa->spa_first_txg = spa->spa_last_ubsync_txg ? 4320 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 4321 spa->spa_claim_max_txg = spa->spa_first_txg; 4322 spa->spa_prev_software_version = ub->ub_software_version; 4323 } 4324 4325 static int 4326 spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) 4327 { 4328 vdev_t *rvd = spa->spa_root_vdev; 4329 nvlist_t *label; 4330 uberblock_t *ub = &spa->spa_uberblock; 4331 boolean_t activity_check = B_FALSE; 4332 4333 /* 4334 * If we are opening the checkpointed state of the pool by 4335 * rewinding to it, at this point we will have written the 4336 * checkpointed uberblock to the vdev labels, so searching 4337 * the labels will find the right uberblock. However, if 4338 * we are opening the checkpointed state read-only, we have 4339 * not modified the labels. 
Therefore, we must ignore the 4340 * labels and continue using the spa_uberblock that was set 4341 * by spa_ld_checkpoint_rewind. 4342 * 4343 * Note that it would be fine to ignore the labels when 4344 * rewinding (opening writeable) as well. However, if we 4345 * crash just after writing the labels, we will end up 4346 * searching the labels. Doing so in the common case means 4347 * that this code path gets exercised normally, rather than 4348 * just in the edge case. 4349 */ 4350 if (ub->ub_checkpoint_txg != 0 && 4351 spa_importing_readonly_checkpoint(spa)) { 4352 spa_ld_select_uberblock_done(spa, ub); 4353 return (0); 4354 } 4355 4356 /* 4357 * Find the best uberblock. 4358 */ 4359 vdev_uberblock_load(rvd, ub, &label); 4360 4361 /* 4362 * If we weren't able to find a single valid uberblock, return failure. 4363 */ 4364 if (ub->ub_txg == 0) { 4365 nvlist_free(label); 4366 spa_load_failed(spa, "no valid uberblock found"); 4367 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 4368 } 4369 4370 if (spa->spa_load_max_txg != UINT64_MAX) { 4371 (void) spa_import_progress_set_max_txg(spa_guid(spa), 4372 (u_longlong_t)spa->spa_load_max_txg); 4373 } 4374 spa_load_note(spa, "using uberblock with txg=%llu", 4375 (u_longlong_t)ub->ub_txg); 4376 if (ub->ub_raidz_reflow_info != 0) { 4377 spa_load_note(spa, "uberblock raidz_reflow_info: " 4378 "state=%u offset=%llu", 4379 (int)RRSS_GET_STATE(ub), 4380 (u_longlong_t)RRSS_GET_OFFSET(ub)); 4381 } 4382 4383 4384 /* 4385 * For pools which have the multihost property on determine if the 4386 * pool is truly inactive and can be safely imported. Prevent 4387 * hosts which don't have a hostid set from importing the pool. 
4388 */ 4389 activity_check = spa_activity_check_required(spa, ub, label, 4390 spa->spa_config); 4391 if (activity_check) { 4392 if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay && 4393 spa_get_hostid(spa) == 0) { 4394 nvlist_free(label); 4395 fnvlist_add_uint64(spa->spa_load_info, 4396 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); 4397 return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); 4398 } 4399 4400 int error = 4401 spa_activity_check(spa, ub, spa->spa_config, B_TRUE); 4402 if (error) { 4403 nvlist_free(label); 4404 return (error); 4405 } 4406 4407 fnvlist_add_uint64(spa->spa_load_info, 4408 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE); 4409 fnvlist_add_uint64(spa->spa_load_info, 4410 ZPOOL_CONFIG_MMP_TXG, ub->ub_txg); 4411 fnvlist_add_uint16(spa->spa_load_info, 4412 ZPOOL_CONFIG_MMP_SEQ, 4413 (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)); 4414 } 4415 4416 /* 4417 * If the pool has an unsupported version we can't open it. 4418 */ 4419 if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 4420 nvlist_free(label); 4421 spa_load_failed(spa, "version %llu is not supported", 4422 (u_longlong_t)ub->ub_version); 4423 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 4424 } 4425 4426 if (ub->ub_version >= SPA_VERSION_FEATURES) { 4427 nvlist_t *features; 4428 4429 /* 4430 * If we weren't able to find what's necessary for reading the 4431 * MOS in the label, return failure. 4432 */ 4433 if (label == NULL) { 4434 spa_load_failed(spa, "label config unavailable"); 4435 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 4436 ENXIO)); 4437 } 4438 4439 if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ, 4440 &features) != 0) { 4441 nvlist_free(label); 4442 spa_load_failed(spa, "invalid label: '%s' missing", 4443 ZPOOL_CONFIG_FEATURES_FOR_READ); 4444 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 4445 ENXIO)); 4446 } 4447 4448 /* 4449 * Update our in-core representation with the definitive values 4450 * from the label. 
4451 */ 4452 nvlist_free(spa->spa_label_features); 4453 spa->spa_label_features = fnvlist_dup(features); 4454 } 4455 4456 nvlist_free(label); 4457 4458 /* 4459 * Look through entries in the label nvlist's features_for_read. If 4460 * there is a feature listed there which we don't understand then we 4461 * cannot open a pool. 4462 */ 4463 if (ub->ub_version >= SPA_VERSION_FEATURES) { 4464 nvlist_t *unsup_feat; 4465 4466 unsup_feat = fnvlist_alloc(); 4467 4468 for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, 4469 NULL); nvp != NULL; 4470 nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { 4471 if (!zfeature_is_supported(nvpair_name(nvp))) { 4472 fnvlist_add_string(unsup_feat, 4473 nvpair_name(nvp), ""); 4474 } 4475 } 4476 4477 if (!nvlist_empty(unsup_feat)) { 4478 fnvlist_add_nvlist(spa->spa_load_info, 4479 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 4480 nvlist_free(unsup_feat); 4481 spa_load_failed(spa, "some features are unsupported"); 4482 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 4483 ENOTSUP)); 4484 } 4485 4486 nvlist_free(unsup_feat); 4487 } 4488 4489 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 4490 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4491 spa_try_repair(spa, spa->spa_config); 4492 spa_config_exit(spa, SCL_ALL, FTAG); 4493 nvlist_free(spa->spa_config_splitting); 4494 spa->spa_config_splitting = NULL; 4495 } 4496 4497 /* 4498 * Initialize internal SPA structures. 
4499 */ 4500 spa_ld_select_uberblock_done(spa, ub); 4501 4502 return (0); 4503 } 4504 4505 static int 4506 spa_ld_open_rootbp(spa_t *spa) 4507 { 4508 int error = 0; 4509 vdev_t *rvd = spa->spa_root_vdev; 4510 4511 error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 4512 if (error != 0) { 4513 spa_load_failed(spa, "unable to open rootbp in dsl_pool_init " 4514 "[error=%d]", error); 4515 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4516 } 4517 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 4518 4519 return (0); 4520 } 4521 4522 static int 4523 spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, 4524 boolean_t reloading) 4525 { 4526 vdev_t *mrvd, *rvd = spa->spa_root_vdev; 4527 nvlist_t *nv, *mos_config, *policy; 4528 int error = 0, copy_error; 4529 uint64_t healthy_tvds, healthy_tvds_mos; 4530 uint64_t mos_config_txg; 4531 4532 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE) 4533 != 0) 4534 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4535 4536 /* 4537 * If we're assembling a pool from a split, the config provided is 4538 * already trusted so there is nothing to do. 4539 */ 4540 if (type == SPA_IMPORT_ASSEMBLE) 4541 return (0); 4542 4543 healthy_tvds = spa_healthy_core_tvds(spa); 4544 4545 if (load_nvlist(spa, spa->spa_config_object, &mos_config) 4546 != 0) { 4547 spa_load_failed(spa, "unable to retrieve MOS config"); 4548 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4549 } 4550 4551 /* 4552 * If we are doing an open, pool owner wasn't verified yet, thus do 4553 * the verification here. 
4554 */ 4555 if (spa->spa_load_state == SPA_LOAD_OPEN) { 4556 error = spa_verify_host(spa, mos_config); 4557 if (error != 0) { 4558 nvlist_free(mos_config); 4559 return (error); 4560 } 4561 } 4562 4563 nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE); 4564 4565 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4566 4567 /* 4568 * Build a new vdev tree from the trusted config 4569 */ 4570 error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD); 4571 if (error != 0) { 4572 nvlist_free(mos_config); 4573 spa_config_exit(spa, SCL_ALL, FTAG); 4574 spa_load_failed(spa, "spa_config_parse failed [error=%d]", 4575 error); 4576 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4577 } 4578 4579 /* 4580 * Vdev paths in the MOS may be obsolete. If the untrusted config was 4581 * obtained by scanning /dev/dsk, then it will have the right vdev 4582 * paths. We update the trusted MOS config with this information. 4583 * We first try to copy the paths with vdev_copy_path_strict, which 4584 * succeeds only when both configs have exactly the same vdev tree. 4585 * If that fails, we fall back to a more flexible method that has a 4586 * best effort policy. 4587 */ 4588 copy_error = vdev_copy_path_strict(rvd, mrvd); 4589 if (copy_error != 0 || spa_load_print_vdev_tree) { 4590 spa_load_note(spa, "provided vdev tree:"); 4591 vdev_dbgmsg_print_tree(rvd, 2); 4592 spa_load_note(spa, "MOS vdev tree:"); 4593 vdev_dbgmsg_print_tree(mrvd, 2); 4594 } 4595 if (copy_error != 0) { 4596 spa_load_note(spa, "vdev_copy_path_strict failed, falling " 4597 "back to vdev_copy_path_relaxed"); 4598 vdev_copy_path_relaxed(rvd, mrvd); 4599 } 4600 4601 vdev_close(rvd); 4602 vdev_free(rvd); 4603 spa->spa_root_vdev = mrvd; 4604 rvd = mrvd; 4605 spa_config_exit(spa, SCL_ALL, FTAG); 4606 4607 /* 4608 * If 'zpool import' used a cached config, then the on-disk hostid and 4609 * hostname may be different to the cached config in ways that should 4610 * prevent import. 
Userspace can't discover this without a scan, but 4611 * we know, so we add these values to LOAD_INFO so the caller can know 4612 * the difference. 4613 * 4614 * Note that we have to do this before the config is regenerated, 4615 * because the new config will have the hostid and hostname for this 4616 * host, in readiness for import. 4617 */ 4618 if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTID)) 4619 fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_HOSTID, 4620 fnvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID)); 4621 if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTNAME)) 4622 fnvlist_add_string(spa->spa_load_info, ZPOOL_CONFIG_HOSTNAME, 4623 fnvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME)); 4624 4625 /* 4626 * We will use spa_config if we decide to reload the spa or if spa_load 4627 * fails and we rewind. We must thus regenerate the config using the 4628 * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to 4629 * pass settings on how to load the pool and is not stored in the MOS. 4630 * We copy it over to our new, trusted config. 4631 */ 4632 mos_config_txg = fnvlist_lookup_uint64(mos_config, 4633 ZPOOL_CONFIG_POOL_TXG); 4634 nvlist_free(mos_config); 4635 mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE); 4636 if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY, 4637 &policy) == 0) 4638 fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy); 4639 spa_config_set(spa, mos_config); 4640 spa->spa_config_source = SPA_CONFIG_SRC_MOS; 4641 4642 /* 4643 * Now that we got the config from the MOS, we should be more strict 4644 * in checking blkptrs and can make assumptions about the consistency 4645 * of the vdev tree. spa_trust_config must be set to true before opening 4646 * vdevs in order for them to be writeable. 
4647 */ 4648 spa->spa_trust_config = B_TRUE; 4649 4650 /* 4651 * Open and validate the new vdev tree 4652 */ 4653 error = spa_ld_open_vdevs(spa); 4654 if (error != 0) 4655 return (error); 4656 4657 error = spa_ld_validate_vdevs(spa); 4658 if (error != 0) 4659 return (error); 4660 4661 if (copy_error != 0 || spa_load_print_vdev_tree) { 4662 spa_load_note(spa, "final vdev tree:"); 4663 vdev_dbgmsg_print_tree(rvd, 2); 4664 } 4665 4666 if (spa->spa_load_state != SPA_LOAD_TRYIMPORT && 4667 !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) { 4668 /* 4669 * Sanity check to make sure that we are indeed loading the 4670 * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds 4671 * in the config provided and they happened to be the only ones 4672 * to have the latest uberblock, we could involuntarily perform 4673 * an extreme rewind. 4674 */ 4675 healthy_tvds_mos = spa_healthy_core_tvds(spa); 4676 if (healthy_tvds_mos - healthy_tvds >= 4677 SPA_SYNC_MIN_VDEVS) { 4678 spa_load_note(spa, "config provided misses too many " 4679 "top-level vdevs compared to MOS (%lld vs %lld). ", 4680 (u_longlong_t)healthy_tvds, 4681 (u_longlong_t)healthy_tvds_mos); 4682 spa_load_note(spa, "vdev tree:"); 4683 vdev_dbgmsg_print_tree(rvd, 2); 4684 if (reloading) { 4685 spa_load_failed(spa, "config was already " 4686 "provided from MOS. 
Aborting."); 4687 return (spa_vdev_err(rvd, 4688 VDEV_AUX_CORRUPT_DATA, EIO)); 4689 } 4690 spa_load_note(spa, "spa must be reloaded using MOS " 4691 "config"); 4692 return (SET_ERROR(EAGAIN)); 4693 } 4694 } 4695 4696 error = spa_check_for_missing_logs(spa); 4697 if (error != 0) 4698 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 4699 4700 if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) { 4701 spa_load_failed(spa, "uberblock guid sum doesn't match MOS " 4702 "guid sum (%llu != %llu)", 4703 (u_longlong_t)spa->spa_uberblock.ub_guid_sum, 4704 (u_longlong_t)rvd->vdev_guid_sum); 4705 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 4706 ENXIO)); 4707 } 4708 4709 return (0); 4710 } 4711 4712 static int 4713 spa_ld_open_indirect_vdev_metadata(spa_t *spa) 4714 { 4715 int error = 0; 4716 vdev_t *rvd = spa->spa_root_vdev; 4717 4718 /* 4719 * Everything that we read before spa_remove_init() must be stored 4720 * on concreted vdevs. Therefore we do this as early as possible. 4721 */ 4722 error = spa_remove_init(spa); 4723 if (error != 0) { 4724 spa_load_failed(spa, "spa_remove_init failed [error=%d]", 4725 error); 4726 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4727 } 4728 4729 /* 4730 * Retrieve information needed to condense indirect vdev mappings. 
4731 */ 4732 error = spa_condense_init(spa); 4733 if (error != 0) { 4734 spa_load_failed(spa, "spa_condense_init failed [error=%d]", 4735 error); 4736 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4737 } 4738 4739 return (0); 4740 } 4741 4742 static int 4743 spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep) 4744 { 4745 int error = 0; 4746 vdev_t *rvd = spa->spa_root_vdev; 4747 4748 if (spa_version(spa) >= SPA_VERSION_FEATURES) { 4749 boolean_t missing_feat_read = B_FALSE; 4750 nvlist_t *unsup_feat, *enabled_feat; 4751 4752 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 4753 &spa->spa_feat_for_read_obj, B_TRUE) != 0) { 4754 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4755 } 4756 4757 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 4758 &spa->spa_feat_for_write_obj, B_TRUE) != 0) { 4759 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4760 } 4761 4762 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 4763 &spa->spa_feat_desc_obj, B_TRUE) != 0) { 4764 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4765 } 4766 4767 enabled_feat = fnvlist_alloc(); 4768 unsup_feat = fnvlist_alloc(); 4769 4770 if (!spa_features_check(spa, B_FALSE, 4771 unsup_feat, enabled_feat)) 4772 missing_feat_read = B_TRUE; 4773 4774 if (spa_writeable(spa) || 4775 spa->spa_load_state == SPA_LOAD_TRYIMPORT) { 4776 if (!spa_features_check(spa, B_TRUE, 4777 unsup_feat, enabled_feat)) { 4778 *missing_feat_writep = B_TRUE; 4779 } 4780 } 4781 4782 fnvlist_add_nvlist(spa->spa_load_info, 4783 ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 4784 4785 if (!nvlist_empty(unsup_feat)) { 4786 fnvlist_add_nvlist(spa->spa_load_info, 4787 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 4788 } 4789 4790 fnvlist_free(enabled_feat); 4791 fnvlist_free(unsup_feat); 4792 4793 if (!missing_feat_read) { 4794 fnvlist_add_boolean(spa->spa_load_info, 4795 ZPOOL_CONFIG_CAN_RDONLY); 4796 } 4797 4798 /* 4799 * If the state is SPA_LOAD_TRYIMPORT, our objective is 4800 * twofold: to 
determine whether the pool is available for 4801 * import in read-write mode and (if it is not) whether the 4802 * pool is available for import in read-only mode. If the pool 4803 * is available for import in read-write mode, it is displayed 4804 * as available in userland; if it is not available for import 4805 * in read-only mode, it is displayed as unavailable in 4806 * userland. If the pool is available for import in read-only 4807 * mode but not read-write mode, it is displayed as unavailable 4808 * in userland with a special note that the pool is actually 4809 * available for open in read-only mode. 4810 * 4811 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 4812 * missing a feature for write, we must first determine whether 4813 * the pool can be opened read-only before returning to 4814 * userland in order to know whether to display the 4815 * abovementioned note. 4816 */ 4817 if (missing_feat_read || (*missing_feat_writep && 4818 spa_writeable(spa))) { 4819 spa_load_failed(spa, "pool uses unsupported features"); 4820 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 4821 ENOTSUP)); 4822 } 4823 4824 /* 4825 * Load refcounts for ZFS features from disk into an in-memory 4826 * cache during SPA initialization. 
4827 */ 4828 for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { 4829 uint64_t refcount; 4830 4831 error = feature_get_refcount_from_disk(spa, 4832 &spa_feature_table[i], &refcount); 4833 if (error == 0) { 4834 spa->spa_feat_refcount_cache[i] = refcount; 4835 } else if (error == ENOTSUP) { 4836 spa->spa_feat_refcount_cache[i] = 4837 SPA_FEATURE_DISABLED; 4838 } else { 4839 spa_load_failed(spa, "error getting refcount " 4840 "for feature %s [error=%d]", 4841 spa_feature_table[i].fi_guid, error); 4842 return (spa_vdev_err(rvd, 4843 VDEV_AUX_CORRUPT_DATA, EIO)); 4844 } 4845 } 4846 } 4847 4848 if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { 4849 if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, 4850 &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0) 4851 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4852 } 4853 4854 /* 4855 * Encryption was added before bookmark_v2, even though bookmark_v2 4856 * is now a dependency. If this pool has encryption enabled without 4857 * bookmark_v2, trigger an errata message. 4858 */ 4859 if (spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) && 4860 !spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) { 4861 spa->spa_errata = ZPOOL_ERRATA_ZOL_8308_ENCRYPTION; 4862 } 4863 4864 return (0); 4865 } 4866 4867 static int 4868 spa_ld_load_special_directories(spa_t *spa) 4869 { 4870 int error = 0; 4871 vdev_t *rvd = spa->spa_root_vdev; 4872 4873 spa->spa_is_initializing = B_TRUE; 4874 error = dsl_pool_open(spa->spa_dsl_pool); 4875 spa->spa_is_initializing = B_FALSE; 4876 if (error != 0) { 4877 spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error); 4878 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4879 } 4880 4881 return (0); 4882 } 4883 4884 static int 4885 spa_ld_get_props(spa_t *spa) 4886 { 4887 int error = 0; 4888 uint64_t obj; 4889 vdev_t *rvd = spa->spa_root_vdev; 4890 4891 /* Grab the checksum salt from the MOS. 
*/ 4892 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 4893 DMU_POOL_CHECKSUM_SALT, 1, 4894 sizeof (spa->spa_cksum_salt.zcs_bytes), 4895 spa->spa_cksum_salt.zcs_bytes); 4896 if (error == ENOENT) { 4897 /* Generate a new salt for subsequent use */ 4898 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 4899 sizeof (spa->spa_cksum_salt.zcs_bytes)); 4900 } else if (error != 0) { 4901 spa_load_failed(spa, "unable to retrieve checksum salt from " 4902 "MOS [error=%d]", error); 4903 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4904 } 4905 4906 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0) 4907 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4908 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 4909 if (error != 0) { 4910 spa_load_failed(spa, "error opening deferred-frees bpobj " 4911 "[error=%d]", error); 4912 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4913 } 4914 4915 /* 4916 * Load the bit that tells us to use the new accounting function 4917 * (raid-z deflation). If we have an older pool, this will not 4918 * be present. 4919 */ 4920 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE); 4921 if (error != 0 && error != ENOENT) 4922 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4923 4924 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 4925 &spa->spa_creation_version, B_FALSE); 4926 if (error != 0 && error != ENOENT) 4927 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4928 4929 /* Load time log */ 4930 spa_load_txg_log_time(spa); 4931 4932 /* 4933 * Load the persistent error log. If we have an older pool, this will 4934 * not be present. 
4935 */ 4936 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last, 4937 B_FALSE); 4938 if (error != 0 && error != ENOENT) 4939 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4940 4941 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 4942 &spa->spa_errlog_scrub, B_FALSE); 4943 if (error != 0 && error != ENOENT) 4944 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4945 4946 /* Load the last scrubbed txg. */ 4947 error = spa_dir_prop(spa, DMU_POOL_LAST_SCRUBBED_TXG, 4948 &spa->spa_scrubbed_last_txg, B_FALSE); 4949 if (error != 0 && error != ENOENT) 4950 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4951 4952 /* 4953 * Load the livelist deletion field. If a livelist is queued for 4954 * deletion, indicate that in the spa 4955 */ 4956 error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES, 4957 &spa->spa_livelists_to_delete, B_FALSE); 4958 if (error != 0 && error != ENOENT) 4959 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4960 4961 /* 4962 * Load the history object. If we have an older pool, this 4963 * will not be present. 4964 */ 4965 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE); 4966 if (error != 0 && error != ENOENT) 4967 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4968 4969 /* 4970 * Load the per-vdev ZAP map. If we have an older pool, this will not 4971 * be present; in this case, defer its creation to a later time to 4972 * avoid dirtying the MOS this early / out of sync context. See 4973 * spa_sync_config_object. 4974 */ 4975 4976 /* The sentinel is only available in the MOS config. 
*/ 4977 nvlist_t *mos_config; 4978 if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { 4979 spa_load_failed(spa, "unable to retrieve MOS config"); 4980 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4981 } 4982 4983 error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, 4984 &spa->spa_all_vdev_zaps, B_FALSE); 4985 4986 if (error == ENOENT) { 4987 VERIFY(!nvlist_exists(mos_config, 4988 ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 4989 spa->spa_avz_action = AVZ_ACTION_INITIALIZE; 4990 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 4991 } else if (error != 0) { 4992 nvlist_free(mos_config); 4993 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4994 } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { 4995 /* 4996 * An older version of ZFS overwrote the sentinel value, so 4997 * we have orphaned per-vdev ZAPs in the MOS. Defer their 4998 * destruction to later; see spa_sync_config_object. 4999 */ 5000 spa->spa_avz_action = AVZ_ACTION_DESTROY; 5001 /* 5002 * We're assuming that no vdevs have had their ZAPs created 5003 * before this. Better be sure of it. 
5004 */ 5005 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 5006 } 5007 nvlist_free(mos_config); 5008 5009 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 5010 5011 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object, 5012 B_FALSE); 5013 if (error && error != ENOENT) 5014 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 5015 5016 if (error == 0) { 5017 uint64_t autoreplace = 0; 5018 5019 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 5020 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 5021 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 5022 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 5023 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 5024 spa_prop_find(spa, ZPOOL_PROP_DEDUP_TABLE_QUOTA, 5025 &spa->spa_dedup_table_quota); 5026 spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost); 5027 spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim); 5028 spa->spa_autoreplace = (autoreplace != 0); 5029 } 5030 5031 /* 5032 * If we are importing a pool with missing top-level vdevs, 5033 * we enforce that the pool doesn't panic or get suspended on 5034 * error since the likelihood of missing data is extremely high. 5035 */ 5036 if (spa->spa_missing_tvds > 0 && 5037 spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE && 5038 spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 5039 spa_load_note(spa, "forcing failmode to 'continue' " 5040 "as some top level vdevs are missing"); 5041 spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE; 5042 } 5043 5044 return (0); 5045 } 5046 5047 static int 5048 spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type) 5049 { 5050 int error = 0; 5051 vdev_t *rvd = spa->spa_root_vdev; 5052 5053 /* 5054 * If we're assembling the pool from the split-off vdevs of 5055 * an existing pool, we don't want to attach the spares & cache 5056 * devices. 5057 */ 5058 5059 /* 5060 * Load any hot spares for this pool. 
5061 */ 5062 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object, 5063 B_FALSE); 5064 if (error != 0 && error != ENOENT) 5065 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 5066 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 5067 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 5068 if (load_nvlist(spa, spa->spa_spares.sav_object, 5069 &spa->spa_spares.sav_config) != 0) { 5070 spa_load_failed(spa, "error loading spares nvlist"); 5071 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 5072 } 5073 5074 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5075 spa_load_spares(spa); 5076 spa_config_exit(spa, SCL_ALL, FTAG); 5077 } else if (error == 0) { 5078 spa->spa_spares.sav_sync = B_TRUE; 5079 } 5080 5081 /* 5082 * Load any level 2 ARC devices for this pool. 5083 */ 5084 error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 5085 &spa->spa_l2cache.sav_object, B_FALSE); 5086 if (error != 0 && error != ENOENT) 5087 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 5088 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 5089 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 5090 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 5091 &spa->spa_l2cache.sav_config) != 0) { 5092 spa_load_failed(spa, "error loading l2cache nvlist"); 5093 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 5094 } 5095 5096 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5097 spa_load_l2cache(spa); 5098 spa_config_exit(spa, SCL_ALL, FTAG); 5099 } else if (error == 0) { 5100 spa->spa_l2cache.sav_sync = B_TRUE; 5101 } 5102 5103 return (0); 5104 } 5105 5106 static int 5107 spa_ld_load_vdev_metadata(spa_t *spa) 5108 { 5109 int error = 0; 5110 vdev_t *rvd = spa->spa_root_vdev; 5111 5112 /* 5113 * If the 'multihost' property is set, then never allow a pool to 5114 * be imported when the system hostid is zero. The exception to 5115 * this rule is zdb which is always allowed to access pools. 
5116 */ 5117 if (spa_multihost(spa) && spa_get_hostid(spa) == 0 && 5118 (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) { 5119 fnvlist_add_uint64(spa->spa_load_info, 5120 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); 5121 return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); 5122 } 5123 5124 /* 5125 * If the 'autoreplace' property is set, then post a resource notifying 5126 * the ZFS DE that it should not issue any faults for unopenable 5127 * devices. We also iterate over the vdevs, and post a sysevent for any 5128 * unopenable vdevs so that the normal autoreplace handler can take 5129 * over. 5130 */ 5131 if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 5132 spa_check_removed(spa->spa_root_vdev); 5133 /* 5134 * For the import case, this is done in spa_import(), because 5135 * at this point we're using the spare definitions from 5136 * the MOS config, not necessarily from the userland config. 5137 */ 5138 if (spa->spa_load_state != SPA_LOAD_IMPORT) { 5139 spa_aux_check_removed(&spa->spa_spares); 5140 spa_aux_check_removed(&spa->spa_l2cache); 5141 } 5142 } 5143 5144 /* 5145 * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc. 5146 */ 5147 error = vdev_load(rvd); 5148 if (error != 0) { 5149 spa_load_failed(spa, "vdev_load failed [error=%d]", error); 5150 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 5151 } 5152 5153 error = spa_ld_log_spacemaps(spa); 5154 if (error != 0) { 5155 spa_load_failed(spa, "spa_ld_log_spacemaps failed [error=%d]", 5156 error); 5157 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 5158 } 5159 5160 /* 5161 * Propagate the leaf DTLs we just loaded all the way up the vdev tree. 
5162 */ 5163 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5164 vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE); 5165 spa_config_exit(spa, SCL_ALL, FTAG); 5166 5167 return (0); 5168 } 5169 5170 static int 5171 spa_ld_load_dedup_tables(spa_t *spa) 5172 { 5173 int error = 0; 5174 vdev_t *rvd = spa->spa_root_vdev; 5175 5176 error = ddt_load(spa); 5177 if (error != 0) { 5178 spa_load_failed(spa, "ddt_load failed [error=%d]", error); 5179 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 5180 } 5181 5182 return (0); 5183 } 5184 5185 static int 5186 spa_ld_load_brt(spa_t *spa) 5187 { 5188 int error = 0; 5189 vdev_t *rvd = spa->spa_root_vdev; 5190 5191 error = brt_load(spa); 5192 if (error != 0) { 5193 spa_load_failed(spa, "brt_load failed [error=%d]", error); 5194 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 5195 } 5196 5197 return (0); 5198 } 5199 5200 static int 5201 spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, const char **ereport) 5202 { 5203 vdev_t *rvd = spa->spa_root_vdev; 5204 5205 if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) { 5206 boolean_t missing = spa_check_logs(spa); 5207 if (missing) { 5208 if (spa->spa_missing_tvds != 0) { 5209 spa_load_note(spa, "spa_check_logs failed " 5210 "so dropping the logs"); 5211 } else { 5212 *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 5213 spa_load_failed(spa, "spa_check_logs failed"); 5214 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, 5215 ENXIO)); 5216 } 5217 } 5218 } 5219 5220 return (0); 5221 } 5222 5223 static int 5224 spa_ld_verify_pool_data(spa_t *spa) 5225 { 5226 int error = 0; 5227 vdev_t *rvd = spa->spa_root_vdev; 5228 5229 /* 5230 * We've successfully opened the pool, verify that we're ready 5231 * to start pushing transactions. 
5232 */ 5233 if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 5234 error = spa_load_verify(spa); 5235 if (error != 0) { 5236 spa_load_failed(spa, "spa_load_verify failed " 5237 "[error=%d]", error); 5238 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 5239 error)); 5240 } 5241 } 5242 5243 return (0); 5244 } 5245 5246 static void 5247 spa_ld_claim_log_blocks(spa_t *spa) 5248 { 5249 dmu_tx_t *tx; 5250 dsl_pool_t *dp = spa_get_dsl(spa); 5251 5252 /* 5253 * Claim log blocks that haven't been committed yet. 5254 * This must all happen in a single txg. 5255 * Note: spa_claim_max_txg is updated by spa_claim_notify(), 5256 * invoked from zil_claim_log_block()'s i/o done callback. 5257 * Price of rollback is that we abandon the log. 5258 */ 5259 spa->spa_claiming = B_TRUE; 5260 5261 tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); 5262 (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 5263 zil_claim, tx, DS_FIND_CHILDREN); 5264 dmu_tx_commit(tx); 5265 5266 spa->spa_claiming = B_FALSE; 5267 5268 spa_set_log_state(spa, SPA_LOG_GOOD); 5269 } 5270 5271 static void 5272 spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg, 5273 boolean_t update_config_cache) 5274 { 5275 vdev_t *rvd = spa->spa_root_vdev; 5276 int need_update = B_FALSE; 5277 5278 /* 5279 * If the config cache is stale, or we have uninitialized 5280 * metaslabs (see spa_vdev_add()), then update the config. 5281 * 5282 * If this is a verbatim import, trust the current 5283 * in-core spa_config and update the disk labels. 
5284 */ 5285 if (update_config_cache || config_cache_txg != spa->spa_config_txg || 5286 spa->spa_load_state == SPA_LOAD_IMPORT || 5287 spa->spa_load_state == SPA_LOAD_RECOVER || 5288 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 5289 need_update = B_TRUE; 5290 5291 for (int c = 0; c < rvd->vdev_children; c++) 5292 if (rvd->vdev_child[c]->vdev_ms_array == 0) 5293 need_update = B_TRUE; 5294 5295 /* 5296 * Update the config cache asynchronously in case we're the 5297 * root pool, in which case the config cache isn't writable yet. 5298 */ 5299 if (need_update) 5300 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 5301 } 5302 5303 static void 5304 spa_ld_prepare_for_reload(spa_t *spa) 5305 { 5306 spa_mode_t mode = spa->spa_mode; 5307 int async_suspended = spa->spa_async_suspended; 5308 5309 spa_unload(spa); 5310 spa_deactivate(spa); 5311 spa_activate(spa, mode); 5312 5313 /* 5314 * We save the value of spa_async_suspended as it gets reset to 0 by 5315 * spa_unload(). We want to restore it back to the original value before 5316 * returning as we might be calling spa_async_resume() later. 
5317 */ 5318 spa->spa_async_suspended = async_suspended; 5319 } 5320 5321 static int 5322 spa_ld_read_checkpoint_txg(spa_t *spa) 5323 { 5324 uberblock_t checkpoint; 5325 int error = 0; 5326 5327 ASSERT0(spa->spa_checkpoint_txg); 5328 ASSERT(spa_namespace_held() || 5329 spa->spa_load_thread == curthread); 5330 5331 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 5332 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), 5333 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); 5334 5335 if (error == ENOENT) 5336 return (0); 5337 5338 if (error != 0) 5339 return (error); 5340 5341 ASSERT3U(checkpoint.ub_txg, !=, 0); 5342 ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0); 5343 ASSERT3U(checkpoint.ub_timestamp, !=, 0); 5344 spa->spa_checkpoint_txg = checkpoint.ub_txg; 5345 spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; 5346 5347 return (0); 5348 } 5349 5350 static int 5351 spa_ld_mos_init(spa_t *spa, spa_import_type_t type) 5352 { 5353 int error = 0; 5354 5355 ASSERT(spa_namespace_held()); 5356 ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); 5357 5358 /* 5359 * Never trust the config that is provided unless we are assembling 5360 * a pool following a split. 5361 * This means don't trust blkptrs and the vdev tree in general. This 5362 * also effectively puts the spa in read-only mode since 5363 * spa_writeable() checks for spa_trust_config to be true. 5364 * We will later load a trusted config from the MOS. 5365 */ 5366 if (type != SPA_IMPORT_ASSEMBLE) 5367 spa->spa_trust_config = B_FALSE; 5368 5369 /* 5370 * Parse the config provided to create a vdev tree. 5371 */ 5372 error = spa_ld_parse_config(spa, type); 5373 if (error != 0) 5374 return (error); 5375 5376 spa_import_progress_add(spa); 5377 5378 /* 5379 * Now that we have the vdev tree, try to open each vdev. This involves 5380 * opening the underlying physical device, retrieving its geometry and 5381 * probing the vdev with a dummy I/O. 
The state of each vdev will be set 5382 * based on the success of those operations. After this we'll be ready 5383 * to read from the vdevs. 5384 */ 5385 error = spa_ld_open_vdevs(spa); 5386 if (error != 0) 5387 return (error); 5388 5389 /* 5390 * Read the label of each vdev and make sure that the GUIDs stored 5391 * there match the GUIDs in the config provided. 5392 * If we're assembling a new pool that's been split off from an 5393 * existing pool, the labels haven't yet been updated so we skip 5394 * validation for now. 5395 */ 5396 if (type != SPA_IMPORT_ASSEMBLE) { 5397 error = spa_ld_validate_vdevs(spa); 5398 if (error != 0) 5399 return (error); 5400 } 5401 5402 /* 5403 * Read all vdev labels to find the best uberblock (i.e. latest, 5404 * unless spa_load_max_txg is set) and store it in spa_uberblock. We 5405 * get the list of features required to read blkptrs in the MOS from 5406 * the vdev label with the best uberblock and verify that our version 5407 * of zfs supports them all. 5408 */ 5409 error = spa_ld_select_uberblock(spa, type); 5410 if (error != 0) 5411 return (error); 5412 5413 /* 5414 * Pass that uberblock to the dsl_pool layer which will open the root 5415 * blkptr. This blkptr points to the latest version of the MOS and will 5416 * allow us to read its contents. 
5417 */ 5418 error = spa_ld_open_rootbp(spa); 5419 if (error != 0) 5420 return (error); 5421 5422 return (0); 5423 } 5424 5425 static int 5426 spa_ld_checkpoint_rewind(spa_t *spa) 5427 { 5428 uberblock_t checkpoint; 5429 int error = 0; 5430 5431 ASSERT(spa_namespace_held()); 5432 ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 5433 5434 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 5435 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), 5436 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); 5437 5438 if (error != 0) { 5439 spa_load_failed(spa, "unable to retrieve checkpointed " 5440 "uberblock from the MOS config [error=%d]", error); 5441 5442 if (error == ENOENT) 5443 error = ZFS_ERR_NO_CHECKPOINT; 5444 5445 return (error); 5446 } 5447 5448 ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg); 5449 ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg); 5450 5451 /* 5452 * We need to update the txg and timestamp of the checkpointed 5453 * uberblock to be higher than the latest one. This ensures that 5454 * the checkpointed uberblock is selected if we were to close and 5455 * reopen the pool right after we've written it in the vdev labels. 5456 * (also see block comment in vdev_uberblock_compare) 5457 */ 5458 checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1; 5459 checkpoint.ub_timestamp = gethrestime_sec(); 5460 5461 /* 5462 * Set current uberblock to be the checkpointed uberblock. 5463 */ 5464 spa->spa_uberblock = checkpoint; 5465 5466 /* 5467 * If we are doing a normal rewind, then the pool is open for 5468 * writing and we sync the "updated" checkpointed uberblock to 5469 * disk. Once this is done, we've basically rewound the whole 5470 * pool and there is no way back. 5471 * 5472 * There are cases when we don't want to attempt and sync the 5473 * checkpointed uberblock to disk because we are opening a 5474 * pool as read-only. 
Specifically, verifying the checkpointed 5475 * state with zdb, and importing the checkpointed state to get 5476 * a "preview" of its content. 5477 */ 5478 if (spa_writeable(spa)) { 5479 vdev_t *rvd = spa->spa_root_vdev; 5480 5481 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5482 vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; 5483 int svdcount = 0; 5484 int children = rvd->vdev_children; 5485 int c0 = random_in_range(children); 5486 5487 for (int c = 0; c < children; c++) { 5488 vdev_t *vd = rvd->vdev_child[(c0 + c) % children]; 5489 5490 /* Stop when revisiting the first vdev */ 5491 if (c > 0 && svd[0] == vd) 5492 break; 5493 5494 if (vd->vdev_ms_array == 0 || vd->vdev_islog || 5495 !vdev_is_concrete(vd)) 5496 continue; 5497 5498 svd[svdcount++] = vd; 5499 if (svdcount == SPA_SYNC_MIN_VDEVS) 5500 break; 5501 } 5502 error = vdev_config_sync(svd, svdcount, spa->spa_first_txg); 5503 if (error == 0) 5504 spa->spa_last_synced_guid = rvd->vdev_guid; 5505 spa_config_exit(spa, SCL_ALL, FTAG); 5506 5507 if (error != 0) { 5508 spa_load_failed(spa, "failed to write checkpointed " 5509 "uberblock to the vdev labels [error=%d]", error); 5510 return (error); 5511 } 5512 } 5513 5514 return (0); 5515 } 5516 5517 static int 5518 spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type, 5519 boolean_t *update_config_cache) 5520 { 5521 int error; 5522 5523 /* 5524 * Parse the config for pool, open and validate vdevs, 5525 * select an uberblock, and use that uberblock to open 5526 * the MOS. 5527 */ 5528 error = spa_ld_mos_init(spa, type); 5529 if (error != 0) 5530 return (error); 5531 5532 /* 5533 * Retrieve the trusted config stored in the MOS and use it to create 5534 * a new, exact version of the vdev tree, then reopen all vdevs. 
5535 */ 5536 error = spa_ld_trusted_config(spa, type, B_FALSE); 5537 if (error == EAGAIN) { 5538 if (update_config_cache != NULL) 5539 *update_config_cache = B_TRUE; 5540 5541 /* 5542 * Redo the loading process with the trusted config if it is 5543 * too different from the untrusted config. 5544 */ 5545 spa_ld_prepare_for_reload(spa); 5546 spa_load_note(spa, "RELOADING"); 5547 error = spa_ld_mos_init(spa, type); 5548 if (error != 0) 5549 return (error); 5550 5551 error = spa_ld_trusted_config(spa, type, B_TRUE); 5552 if (error != 0) 5553 return (error); 5554 5555 } else if (error != 0) { 5556 return (error); 5557 } 5558 5559 return (0); 5560 } 5561 5562 /* 5563 * Load an existing storage pool, using the config provided. This config 5564 * describes which vdevs are part of the pool and is later validated against 5565 * partial configs present in each vdev's label and an entire copy of the 5566 * config stored in the MOS. 5567 */ 5568 static int 5569 spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) 5570 { 5571 int error = 0; 5572 boolean_t missing_feat_write = B_FALSE; 5573 boolean_t checkpoint_rewind = 5574 (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 5575 boolean_t update_config_cache = B_FALSE; 5576 hrtime_t load_start = gethrtime(); 5577 5578 ASSERT(spa_namespace_held()); 5579 ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); 5580 5581 spa_load_note(spa, "LOADING"); 5582 5583 error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache); 5584 if (error != 0) 5585 return (error); 5586 5587 /* 5588 * If we are rewinding to the checkpoint then we need to repeat 5589 * everything we've done so far in this function but this time 5590 * selecting the checkpointed uberblock and using that to open 5591 * the MOS. 5592 */ 5593 if (checkpoint_rewind) { 5594 /* 5595 * If we are rewinding to the checkpoint update config cache 5596 * anyway. 
5597 */ 5598 update_config_cache = B_TRUE; 5599 5600 /* 5601 * Extract the checkpointed uberblock from the current MOS 5602 * and use this as the pool's uberblock from now on. If the 5603 * pool is imported as writeable we also write the checkpoint 5604 * uberblock to the labels, making the rewind permanent. 5605 */ 5606 error = spa_ld_checkpoint_rewind(spa); 5607 if (error != 0) 5608 return (error); 5609 5610 /* 5611 * Redo the loading process again with the 5612 * checkpointed uberblock. 5613 */ 5614 spa_ld_prepare_for_reload(spa); 5615 spa_load_note(spa, "LOADING checkpointed uberblock"); 5616 error = spa_ld_mos_with_trusted_config(spa, type, NULL); 5617 if (error != 0) 5618 return (error); 5619 } 5620 5621 /* 5622 * Drop the namespace lock for the rest of the function. 5623 */ 5624 spa->spa_load_thread = curthread; 5625 spa_namespace_exit(FTAG); 5626 5627 /* 5628 * Retrieve the checkpoint txg if the pool has a checkpoint. 5629 */ 5630 spa_import_progress_set_notes(spa, "Loading checkpoint txg"); 5631 error = spa_ld_read_checkpoint_txg(spa); 5632 if (error != 0) 5633 goto fail; 5634 5635 /* 5636 * Retrieve the mapping of indirect vdevs. Those vdevs were removed 5637 * from the pool and their contents were re-mapped to other vdevs. Note 5638 * that everything that we read before this step must have been 5639 * rewritten on concrete vdevs after the last device removal was 5640 * initiated. Otherwise we could be reading from indirect vdevs before 5641 * we have loaded their mappings. 5642 */ 5643 spa_import_progress_set_notes(spa, "Loading indirect vdev metadata"); 5644 error = spa_ld_open_indirect_vdev_metadata(spa); 5645 if (error != 0) 5646 goto fail; 5647 5648 /* 5649 * Retrieve the full list of active features from the MOS and check if 5650 * they are all supported. 
5651 */ 5652 spa_import_progress_set_notes(spa, "Checking feature flags"); 5653 error = spa_ld_check_features(spa, &missing_feat_write); 5654 if (error != 0) 5655 goto fail; 5656 5657 /* 5658 * Load several special directories from the MOS needed by the dsl_pool 5659 * layer. 5660 */ 5661 spa_import_progress_set_notes(spa, "Loading special MOS directories"); 5662 error = spa_ld_load_special_directories(spa); 5663 if (error != 0) 5664 goto fail; 5665 5666 /* 5667 * Retrieve pool properties from the MOS. 5668 */ 5669 spa_import_progress_set_notes(spa, "Loading properties"); 5670 error = spa_ld_get_props(spa); 5671 if (error != 0) 5672 goto fail; 5673 5674 /* 5675 * Retrieve the list of auxiliary devices - cache devices and spares - 5676 * and open them. 5677 */ 5678 spa_import_progress_set_notes(spa, "Loading AUX vdevs"); 5679 error = spa_ld_open_aux_vdevs(spa, type); 5680 if (error != 0) 5681 goto fail; 5682 5683 /* 5684 * Load the metadata for all vdevs. Also check if unopenable devices 5685 * should be autoreplaced. 5686 */ 5687 spa_import_progress_set_notes(spa, "Loading vdev metadata"); 5688 error = spa_ld_load_vdev_metadata(spa); 5689 if (error != 0) 5690 goto fail; 5691 5692 spa_import_progress_set_notes(spa, "Loading dedup tables"); 5693 error = spa_ld_load_dedup_tables(spa); 5694 if (error != 0) 5695 goto fail; 5696 5697 spa_import_progress_set_notes(spa, "Loading BRT"); 5698 error = spa_ld_load_brt(spa); 5699 if (error != 0) 5700 goto fail; 5701 5702 /* 5703 * Verify the logs now to make sure we don't have any unexpected errors 5704 * when we claim log blocks later. 5705 */ 5706 spa_import_progress_set_notes(spa, "Verifying Log Devices"); 5707 error = spa_ld_verify_logs(spa, type, ereport); 5708 if (error != 0) 5709 goto fail; 5710 5711 if (missing_feat_write) { 5712 ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT); 5713 5714 /* 5715 * At this point, we know that we can open the pool in 5716 * read-only mode but not read-write mode. 
We now have enough 5717 * information and can return to userland. 5718 */ 5719 error = spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, 5720 ENOTSUP); 5721 goto fail; 5722 } 5723 5724 /* 5725 * Traverse the last txgs to make sure the pool was left off in a safe 5726 * state. When performing an extreme rewind, we verify the whole pool, 5727 * which can take a very long time. 5728 */ 5729 spa_import_progress_set_notes(spa, "Verifying pool data"); 5730 error = spa_ld_verify_pool_data(spa); 5731 if (error != 0) 5732 goto fail; 5733 5734 /* 5735 * Calculate the deflated space for the pool. This must be done before 5736 * we write anything to the pool because we'd need to update the space 5737 * accounting using the deflated sizes. 5738 */ 5739 spa_import_progress_set_notes(spa, "Calculating deflated space"); 5740 spa_update_dspace(spa); 5741 5742 /* 5743 * We have now retrieved all the information we needed to open the 5744 * pool. If we are importing the pool in read-write mode, a few 5745 * additional steps must be performed to finish the import. 5746 */ 5747 spa_import_progress_set_notes(spa, "Starting import"); 5748 if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER || 5749 spa->spa_load_max_txg == UINT64_MAX)) { 5750 uint64_t config_cache_txg = spa->spa_config_txg; 5751 5752 ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); 5753 5754 /* 5755 * Before we do any zio_write's, complete the raidz expansion 5756 * scratch space copying, if necessary. 5757 */ 5758 if (RRSS_GET_STATE(&spa->spa_uberblock) == RRSS_SCRATCH_VALID) 5759 vdev_raidz_reflow_copy_scratch(spa); 5760 5761 /* 5762 * In case of a checkpoint rewind, log the original txg 5763 * of the checkpointed uberblock. 
5764 */ 5765 if (checkpoint_rewind) { 5766 spa_history_log_internal(spa, "checkpoint rewind", 5767 NULL, "rewound state to txg=%llu", 5768 (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg); 5769 } 5770 5771 spa_import_progress_set_notes(spa, "Claiming ZIL blocks"); 5772 /* 5773 * Traverse the ZIL and claim all blocks. 5774 */ 5775 spa_ld_claim_log_blocks(spa); 5776 5777 /* 5778 * Kick-off the syncing thread. 5779 */ 5780 spa->spa_sync_on = B_TRUE; 5781 txg_sync_start(spa->spa_dsl_pool); 5782 mmp_thread_start(spa); 5783 5784 /* 5785 * Wait for all claims to sync. We sync up to the highest 5786 * claimed log block birth time so that claimed log blocks 5787 * don't appear to be from the future. spa_claim_max_txg 5788 * will have been set for us by ZIL traversal operations 5789 * performed above. 5790 */ 5791 spa_import_progress_set_notes(spa, "Syncing ZIL claims"); 5792 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 5793 5794 /* 5795 * Check if we need to request an update of the config. On the 5796 * next sync, we would update the config stored in vdev labels 5797 * and the cachefile (by default /etc/zfs/zpool.cache). 5798 */ 5799 spa_import_progress_set_notes(spa, "Updating configs"); 5800 spa_ld_check_for_config_update(spa, config_cache_txg, 5801 update_config_cache); 5802 5803 /* 5804 * Check if a rebuild was in progress and if so resume it. 5805 * Then check all DTLs to see if anything needs resilvering. 5806 * The resilver will be deferred if a rebuild was started. 5807 */ 5808 spa_import_progress_set_notes(spa, "Starting resilvers"); 5809 if (vdev_rebuild_active(spa->spa_root_vdev)) { 5810 vdev_rebuild_restart(spa); 5811 } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 5812 vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 5813 spa_async_request(spa, SPA_ASYNC_RESILVER); 5814 } 5815 5816 /* 5817 * Log the fact that we booted up (so that we can detect if 5818 * we rebooted in the middle of an operation). 
5819 */ 5820 spa_history_log_version(spa, "open", NULL); 5821 5822 spa_import_progress_set_notes(spa, 5823 "Restarting device removals"); 5824 spa_restart_removal(spa); 5825 spa_spawn_aux_threads(spa); 5826 5827 /* 5828 * Delete any inconsistent datasets. 5829 * 5830 * Note: 5831 * Since we may be issuing deletes for clones here, 5832 * we make sure to do so after we've spawned all the 5833 * auxiliary threads above (from which the livelist 5834 * deletion zthr is part of). 5835 */ 5836 spa_import_progress_set_notes(spa, 5837 "Cleaning up inconsistent objsets"); 5838 (void) dmu_objset_find(spa_name(spa), 5839 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 5840 5841 /* 5842 * Clean up any stale temporary dataset userrefs. 5843 */ 5844 spa_import_progress_set_notes(spa, 5845 "Cleaning up temporary userrefs"); 5846 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 5847 5848 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5849 spa_import_progress_set_notes(spa, "Restarting initialize"); 5850 vdev_initialize_restart(spa->spa_root_vdev); 5851 spa_import_progress_set_notes(spa, "Restarting TRIM"); 5852 vdev_trim_restart(spa->spa_root_vdev); 5853 vdev_autotrim_restart(spa); 5854 spa_config_exit(spa, SCL_CONFIG, FTAG); 5855 spa_import_progress_set_notes(spa, "Finished importing"); 5856 } 5857 zio_handle_import_delay(spa, gethrtime() - load_start); 5858 5859 spa_import_progress_remove(spa_guid(spa)); 5860 spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); 5861 5862 spa_load_note(spa, "LOADED"); 5863 fail: 5864 spa_namespace_enter(FTAG); 5865 spa->spa_load_thread = NULL; 5866 spa_namespace_broadcast(); 5867 5868 return (error); 5869 5870 } 5871 5872 static int 5873 spa_load_retry(spa_t *spa, spa_load_state_t state) 5874 { 5875 spa_mode_t mode = spa->spa_mode; 5876 5877 spa_unload(spa); 5878 spa_deactivate(spa); 5879 5880 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; 5881 5882 spa_activate(spa, mode); 5883 spa_async_suspend(spa); 5884 5885 spa_load_note(spa, 
"spa_load_retry: rewind, max txg: %llu", 5886 (u_longlong_t)spa->spa_load_max_txg); 5887 5888 return (spa_load(spa, state, SPA_IMPORT_EXISTING)); 5889 } 5890 5891 /* 5892 * If spa_load() fails this function will try loading prior txg's. If 5893 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool 5894 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this 5895 * function will not rewind the pool and will return the same error as 5896 * spa_load(). 5897 */ 5898 static int 5899 spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request, 5900 int rewind_flags) 5901 { 5902 nvlist_t *loadinfo = NULL; 5903 nvlist_t *config = NULL; 5904 int load_error, rewind_error; 5905 uint64_t safe_rewind_txg; 5906 uint64_t min_txg; 5907 5908 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 5909 spa->spa_load_max_txg = spa->spa_load_txg; 5910 spa_set_log_state(spa, SPA_LOG_CLEAR); 5911 } else { 5912 spa->spa_load_max_txg = max_request; 5913 if (max_request != UINT64_MAX) 5914 spa->spa_extreme_rewind = B_TRUE; 5915 } 5916 5917 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING); 5918 if (load_error == 0) 5919 return (0); 5920 if (load_error == ZFS_ERR_NO_CHECKPOINT) { 5921 /* 5922 * When attempting checkpoint-rewind on a pool with no 5923 * checkpoint, we should not attempt to load uberblocks 5924 * from previous txgs when spa_load fails. 
5925 */ 5926 ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 5927 spa_import_progress_remove(spa_guid(spa)); 5928 return (load_error); 5929 } 5930 5931 if (spa->spa_root_vdev != NULL) 5932 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 5933 5934 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 5935 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 5936 5937 if (rewind_flags & ZPOOL_NEVER_REWIND) { 5938 nvlist_free(config); 5939 spa_import_progress_remove(spa_guid(spa)); 5940 return (load_error); 5941 } 5942 5943 if (state == SPA_LOAD_RECOVER) { 5944 /* Price of rolling back is discarding txgs, including log */ 5945 spa_set_log_state(spa, SPA_LOG_CLEAR); 5946 } else { 5947 /* 5948 * If we aren't rolling back save the load info from our first 5949 * import attempt so that we can restore it after attempting 5950 * to rewind. 5951 */ 5952 loadinfo = spa->spa_load_info; 5953 spa->spa_load_info = fnvlist_alloc(); 5954 } 5955 5956 spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 5957 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 5958 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 
5959 TXG_INITIAL : safe_rewind_txg; 5960 5961 /* 5962 * Continue as long as we're finding errors, we're still within 5963 * the acceptable rewind range, and we're still finding uberblocks 5964 */ 5965 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 5966 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 5967 if (spa->spa_load_max_txg < safe_rewind_txg) 5968 spa->spa_extreme_rewind = B_TRUE; 5969 rewind_error = spa_load_retry(spa, state); 5970 } 5971 5972 spa->spa_extreme_rewind = B_FALSE; 5973 spa->spa_load_max_txg = UINT64_MAX; 5974 5975 if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 5976 spa_config_set(spa, config); 5977 else 5978 nvlist_free(config); 5979 5980 if (state == SPA_LOAD_RECOVER) { 5981 ASSERT0P(loadinfo); 5982 spa_import_progress_remove(spa_guid(spa)); 5983 return (rewind_error); 5984 } else { 5985 /* Store the rewind info as part of the initial load info */ 5986 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, 5987 spa->spa_load_info); 5988 5989 /* Restore the initial load info */ 5990 fnvlist_free(spa->spa_load_info); 5991 spa->spa_load_info = loadinfo; 5992 5993 spa_import_progress_remove(spa_guid(spa)); 5994 return (load_error); 5995 } 5996 } 5997 5998 /* 5999 * Pool Open/Import 6000 * 6001 * The import case is identical to an open except that the configuration is sent 6002 * down from userland, instead of grabbed from the configuration cache. For the 6003 * case of an open, the pool configuration will exist in the 6004 * POOL_STATE_UNINITIALIZED state. 6005 * 6006 * The stats information (gen/count/ustats) is used to gather vdev statistics at 6007 * the same time open the pool, without having to keep around the spa_t in some 6008 * ambiguous state. 
6009 */ 6010 static int 6011 spa_open_common(const char *pool, spa_t **spapp, const void *tag, 6012 nvlist_t *nvpolicy, nvlist_t **config) 6013 { 6014 spa_t *spa; 6015 spa_load_state_t state = SPA_LOAD_OPEN; 6016 int error; 6017 int locked = B_FALSE; 6018 int firstopen = B_FALSE; 6019 6020 *spapp = NULL; 6021 6022 /* 6023 * As disgusting as this is, we need to support recursive calls to this 6024 * function because dsl_dir_open() is called during spa_load(), and ends 6025 * up calling spa_open() again. The real fix is to figure out how to 6026 * avoid dsl_dir_open() calling this in the first place. 6027 */ 6028 if (!spa_namespace_held()) { 6029 spa_namespace_enter(FTAG); 6030 locked = B_TRUE; 6031 } 6032 6033 if ((spa = spa_lookup(pool)) == NULL) { 6034 if (locked) 6035 spa_namespace_exit(FTAG); 6036 return (SET_ERROR(ENOENT)); 6037 } 6038 6039 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 6040 zpool_load_policy_t policy; 6041 6042 firstopen = B_TRUE; 6043 6044 zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config, 6045 &policy); 6046 if (policy.zlp_rewind & ZPOOL_DO_REWIND) 6047 state = SPA_LOAD_RECOVER; 6048 6049 spa_activate(spa, spa_mode_global); 6050 6051 if (state != SPA_LOAD_RECOVER) 6052 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 6053 spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; 6054 6055 zfs_dbgmsg("spa_open_common: opening %s", pool); 6056 error = spa_load_best(spa, state, policy.zlp_txg, 6057 policy.zlp_rewind); 6058 6059 if (error == EBADF) { 6060 /* 6061 * If vdev_validate() returns failure (indicated by 6062 * EBADF), it indicates that one of the vdevs indicates 6063 * that the pool has been exported or destroyed. If 6064 * this is the case, the config cache is out of sync and 6065 * we should remove the pool from the namespace. 
6066 */ 6067 spa_unload(spa); 6068 spa_deactivate(spa); 6069 spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE); 6070 spa_remove(spa); 6071 if (locked) 6072 spa_namespace_exit(FTAG); 6073 return (SET_ERROR(ENOENT)); 6074 } 6075 6076 if (error) { 6077 /* 6078 * We can't open the pool, but we still have useful 6079 * information: the state of each vdev after the 6080 * attempted vdev_open(). Return this to the user. 6081 */ 6082 if (config != NULL && spa->spa_config) { 6083 *config = fnvlist_dup(spa->spa_config); 6084 fnvlist_add_nvlist(*config, 6085 ZPOOL_CONFIG_LOAD_INFO, 6086 spa->spa_load_info); 6087 } 6088 spa_unload(spa); 6089 spa_deactivate(spa); 6090 spa->spa_last_open_failed = error; 6091 if (locked) 6092 spa_namespace_exit(FTAG); 6093 *spapp = NULL; 6094 return (error); 6095 } 6096 } 6097 6098 spa_open_ref(spa, tag); 6099 6100 if (config != NULL) 6101 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 6102 6103 /* 6104 * If we've recovered the pool, pass back any information we 6105 * gathered while doing the load. 6106 */ 6107 if (state == SPA_LOAD_RECOVER && config != NULL) { 6108 fnvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 6109 spa->spa_load_info); 6110 } 6111 6112 if (locked) { 6113 spa->spa_last_open_failed = 0; 6114 spa->spa_last_ubsync_txg = 0; 6115 spa->spa_load_txg = 0; 6116 spa_namespace_exit(FTAG); 6117 } 6118 6119 if (firstopen) 6120 zvol_create_minors(spa_name(spa)); 6121 6122 *spapp = spa; 6123 6124 return (0); 6125 } 6126 6127 int 6128 spa_open_rewind(const char *name, spa_t **spapp, const void *tag, 6129 nvlist_t *policy, nvlist_t **config) 6130 { 6131 return (spa_open_common(name, spapp, tag, policy, config)); 6132 } 6133 6134 int 6135 spa_open(const char *name, spa_t **spapp, const void *tag) 6136 { 6137 return (spa_open_common(name, spapp, tag, NULL, NULL)); 6138 } 6139 6140 /* 6141 * Lookup the given spa_t, incrementing the inject count in the process, 6142 * preventing it from being exported or destroyed. 
6143 */ 6144 spa_t * 6145 spa_inject_addref(char *name) 6146 { 6147 spa_t *spa; 6148 6149 spa_namespace_enter(FTAG); 6150 if ((spa = spa_lookup(name)) == NULL) { 6151 spa_namespace_exit(FTAG); 6152 return (NULL); 6153 } 6154 spa->spa_inject_ref++; 6155 spa_namespace_exit(FTAG); 6156 6157 return (spa); 6158 } 6159 6160 void 6161 spa_inject_delref(spa_t *spa) 6162 { 6163 spa_namespace_enter(FTAG); 6164 spa->spa_inject_ref--; 6165 spa_namespace_exit(FTAG); 6166 } 6167 6168 /* 6169 * Add spares device information to the nvlist. 6170 */ 6171 static void 6172 spa_add_spares(spa_t *spa, nvlist_t *config) 6173 { 6174 nvlist_t **spares; 6175 uint_t i, nspares; 6176 nvlist_t *nvroot; 6177 uint64_t guid; 6178 vdev_stat_t *vs; 6179 uint_t vsc; 6180 uint64_t pool; 6181 6182 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 6183 6184 if (spa->spa_spares.sav_count == 0) 6185 return; 6186 6187 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 6188 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 6189 ZPOOL_CONFIG_SPARES, &spares, &nspares)); 6190 if (nspares != 0) { 6191 fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 6192 (const nvlist_t * const *)spares, nspares); 6193 VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 6194 &spares, &nspares)); 6195 6196 /* 6197 * Go through and find any spares which have since been 6198 * repurposed as an active spare. If this is the case, update 6199 * their status appropriately. 
6200 */ 6201 for (i = 0; i < nspares; i++) { 6202 guid = fnvlist_lookup_uint64(spares[i], 6203 ZPOOL_CONFIG_GUID); 6204 VERIFY0(nvlist_lookup_uint64_array(spares[i], 6205 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)); 6206 if (spa_spare_exists(guid, &pool, NULL) && 6207 pool != 0ULL) { 6208 vs->vs_state = VDEV_STATE_CANT_OPEN; 6209 vs->vs_aux = VDEV_AUX_SPARED; 6210 } else { 6211 vs->vs_state = 6212 spa->spa_spares.sav_vdevs[i]->vdev_state; 6213 } 6214 } 6215 } 6216 } 6217 6218 /* 6219 * Add l2cache device information to the nvlist, including vdev stats. 6220 */ 6221 static void 6222 spa_add_l2cache(spa_t *spa, nvlist_t *config) 6223 { 6224 nvlist_t **l2cache; 6225 uint_t i, j, nl2cache; 6226 nvlist_t *nvroot; 6227 uint64_t guid; 6228 vdev_t *vd; 6229 vdev_stat_t *vs; 6230 uint_t vsc; 6231 6232 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 6233 6234 if (spa->spa_l2cache.sav_count == 0) 6235 return; 6236 6237 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 6238 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 6239 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); 6240 if (nl2cache != 0) { 6241 fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 6242 (const nvlist_t * const *)l2cache, nl2cache); 6243 VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 6244 &l2cache, &nl2cache)); 6245 6246 /* 6247 * Update level 2 cache device stats. 
6248 */ 6249 6250 for (i = 0; i < nl2cache; i++) { 6251 guid = fnvlist_lookup_uint64(l2cache[i], 6252 ZPOOL_CONFIG_GUID); 6253 6254 vd = NULL; 6255 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 6256 if (guid == 6257 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 6258 vd = spa->spa_l2cache.sav_vdevs[j]; 6259 break; 6260 } 6261 } 6262 ASSERT(vd != NULL); 6263 6264 VERIFY0(nvlist_lookup_uint64_array(l2cache[i], 6265 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)); 6266 vdev_get_stats(vd, vs); 6267 vdev_config_generate_stats(vd, l2cache[i]); 6268 6269 } 6270 } 6271 } 6272 6273 static void 6274 spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features) 6275 { 6276 zap_cursor_t zc; 6277 zap_attribute_t *za = zap_attribute_alloc(); 6278 6279 if (spa->spa_feat_for_read_obj != 0) { 6280 for (zap_cursor_init(&zc, spa->spa_meta_objset, 6281 spa->spa_feat_for_read_obj); 6282 zap_cursor_retrieve(&zc, za) == 0; 6283 zap_cursor_advance(&zc)) { 6284 ASSERT(za->za_integer_length == sizeof (uint64_t) && 6285 za->za_num_integers == 1); 6286 VERIFY0(nvlist_add_uint64(features, za->za_name, 6287 za->za_first_integer)); 6288 } 6289 zap_cursor_fini(&zc); 6290 } 6291 6292 if (spa->spa_feat_for_write_obj != 0) { 6293 for (zap_cursor_init(&zc, spa->spa_meta_objset, 6294 spa->spa_feat_for_write_obj); 6295 zap_cursor_retrieve(&zc, za) == 0; 6296 zap_cursor_advance(&zc)) { 6297 ASSERT(za->za_integer_length == sizeof (uint64_t) && 6298 za->za_num_integers == 1); 6299 VERIFY0(nvlist_add_uint64(features, za->za_name, 6300 za->za_first_integer)); 6301 } 6302 zap_cursor_fini(&zc); 6303 } 6304 zap_attribute_free(za); 6305 } 6306 6307 static void 6308 spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features) 6309 { 6310 int i; 6311 6312 for (i = 0; i < SPA_FEATURES; i++) { 6313 zfeature_info_t feature = spa_feature_table[i]; 6314 uint64_t refcount; 6315 6316 if (feature_get_refcount(spa, &feature, &refcount) != 0) 6317 continue; 6318 6319 VERIFY0(nvlist_add_uint64(features, feature.fi_guid, 
refcount)); 6320 } 6321 } 6322 6323 /* 6324 * Store a list of pool features and their reference counts in the 6325 * config. 6326 * 6327 * The first time this is called on a spa, allocate a new nvlist, fetch 6328 * the pool features and reference counts from disk, then save the list 6329 * in the spa. In subsequent calls on the same spa use the saved nvlist 6330 * and refresh its values from the cached reference counts. This 6331 * ensures we don't block here on I/O on a suspended pool so 'zpool 6332 * clear' can resume the pool. 6333 */ 6334 static void 6335 spa_add_feature_stats(spa_t *spa, nvlist_t *config) 6336 { 6337 nvlist_t *features; 6338 6339 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 6340 6341 mutex_enter(&spa->spa_feat_stats_lock); 6342 features = spa->spa_feat_stats; 6343 6344 if (features != NULL) { 6345 spa_feature_stats_from_cache(spa, features); 6346 } else { 6347 VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP)); 6348 spa->spa_feat_stats = features; 6349 spa_feature_stats_from_disk(spa, features); 6350 } 6351 6352 VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 6353 features)); 6354 6355 mutex_exit(&spa->spa_feat_stats_lock); 6356 } 6357 6358 int 6359 spa_get_stats(const char *name, nvlist_t **config, 6360 char *altroot, size_t buflen) 6361 { 6362 int error; 6363 spa_t *spa; 6364 6365 *config = NULL; 6366 error = spa_open_common(name, &spa, FTAG, NULL, config); 6367 6368 if (spa != NULL) { 6369 /* 6370 * This still leaves a window of inconsistency where the spares 6371 * or l2cache devices could change and the config would be 6372 * self-inconsistent. 
6373 */ 6374 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6375 6376 if (*config != NULL) { 6377 uint64_t loadtimes[2]; 6378 6379 loadtimes[0] = spa->spa_loaded_ts.tv_sec; 6380 loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 6381 fnvlist_add_uint64_array(*config, 6382 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2); 6383 6384 fnvlist_add_uint64(*config, 6385 ZPOOL_CONFIG_ERRCOUNT, 6386 spa_approx_errlog_size(spa)); 6387 6388 if (spa_suspended(spa)) { 6389 fnvlist_add_uint64(*config, 6390 ZPOOL_CONFIG_SUSPENDED, 6391 spa->spa_failmode); 6392 fnvlist_add_uint64(*config, 6393 ZPOOL_CONFIG_SUSPENDED_REASON, 6394 spa->spa_suspended); 6395 } 6396 6397 spa_add_spares(spa, *config); 6398 spa_add_l2cache(spa, *config); 6399 spa_add_feature_stats(spa, *config); 6400 } 6401 } 6402 6403 /* 6404 * We want to get the alternate root even for faulted pools, so we cheat 6405 * and call spa_lookup() directly. 6406 */ 6407 if (altroot) { 6408 if (spa == NULL) { 6409 spa_namespace_enter(FTAG); 6410 spa = spa_lookup(name); 6411 if (spa) 6412 spa_altroot(spa, altroot, buflen); 6413 else 6414 altroot[0] = '\0'; 6415 spa = NULL; 6416 spa_namespace_exit(FTAG); 6417 } else { 6418 spa_altroot(spa, altroot, buflen); 6419 } 6420 } 6421 6422 if (spa != NULL) { 6423 spa_config_exit(spa, SCL_CONFIG, FTAG); 6424 spa_close(spa, FTAG); 6425 } 6426 6427 return (error); 6428 } 6429 6430 /* 6431 * Validate that the auxiliary device array is well formed. We must have an 6432 * array of nvlists, each which describes a valid leaf vdev. If this is an 6433 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 6434 * specified, as long as they are well-formed. 
6435 */ 6436 static int 6437 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 6438 spa_aux_vdev_t *sav, const char *config, uint64_t version, 6439 vdev_labeltype_t label) 6440 { 6441 nvlist_t **dev; 6442 uint_t i, ndev; 6443 vdev_t *vd; 6444 int error; 6445 6446 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 6447 6448 /* 6449 * It's acceptable to have no devs specified. 6450 */ 6451 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 6452 return (0); 6453 6454 if (ndev == 0) 6455 return (SET_ERROR(EINVAL)); 6456 6457 /* 6458 * Make sure the pool is formatted with a version that supports this 6459 * device type. 6460 */ 6461 if (spa_version(spa) < version) 6462 return (SET_ERROR(ENOTSUP)); 6463 6464 /* 6465 * Set the pending device list so we correctly handle device in-use 6466 * checking. 6467 */ 6468 sav->sav_pending = dev; 6469 sav->sav_npending = ndev; 6470 6471 for (i = 0; i < ndev; i++) { 6472 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 6473 mode)) != 0) 6474 goto out; 6475 6476 if (!vd->vdev_ops->vdev_op_leaf) { 6477 vdev_free(vd); 6478 error = SET_ERROR(EINVAL); 6479 goto out; 6480 } 6481 6482 vd->vdev_top = vd; 6483 6484 if ((error = vdev_open(vd)) == 0 && 6485 (error = vdev_label_init(vd, crtxg, label)) == 0) { 6486 fnvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 6487 vd->vdev_guid); 6488 } 6489 6490 vdev_free(vd); 6491 6492 if (error && 6493 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 6494 goto out; 6495 else 6496 error = 0; 6497 } 6498 6499 out: 6500 sav->sav_pending = NULL; 6501 sav->sav_npending = 0; 6502 return (error); 6503 } 6504 6505 static int 6506 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 6507 { 6508 int error; 6509 6510 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 6511 6512 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 6513 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 6514 
VDEV_LABEL_SPARE)) != 0) { 6515 return (error); 6516 } 6517 6518 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 6519 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 6520 VDEV_LABEL_L2CACHE)); 6521 } 6522 6523 static void 6524 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 6525 const char *config) 6526 { 6527 int i; 6528 6529 if (sav->sav_config != NULL) { 6530 nvlist_t **olddevs; 6531 uint_t oldndevs; 6532 nvlist_t **newdevs; 6533 6534 /* 6535 * Generate new dev list by concatenating with the 6536 * current dev list. 6537 */ 6538 VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, config, 6539 &olddevs, &oldndevs)); 6540 6541 newdevs = kmem_alloc(sizeof (void *) * 6542 (ndevs + oldndevs), KM_SLEEP); 6543 for (i = 0; i < oldndevs; i++) 6544 newdevs[i] = fnvlist_dup(olddevs[i]); 6545 for (i = 0; i < ndevs; i++) 6546 newdevs[i + oldndevs] = fnvlist_dup(devs[i]); 6547 6548 fnvlist_remove(sav->sav_config, config); 6549 6550 fnvlist_add_nvlist_array(sav->sav_config, config, 6551 (const nvlist_t * const *)newdevs, ndevs + oldndevs); 6552 for (i = 0; i < oldndevs + ndevs; i++) 6553 nvlist_free(newdevs[i]); 6554 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 6555 } else { 6556 /* 6557 * Generate a new dev list. 6558 */ 6559 sav->sav_config = fnvlist_alloc(); 6560 fnvlist_add_nvlist_array(sav->sav_config, config, 6561 (const nvlist_t * const *)devs, ndevs); 6562 } 6563 } 6564 6565 /* 6566 * Stop and drop level 2 ARC devices 6567 */ 6568 void 6569 spa_l2cache_drop(spa_t *spa) 6570 { 6571 vdev_t *vd; 6572 int i; 6573 spa_aux_vdev_t *sav = &spa->spa_l2cache; 6574 6575 for (i = 0; i < sav->sav_count; i++) { 6576 uint64_t pool; 6577 6578 vd = sav->sav_vdevs[i]; 6579 ASSERT(vd != NULL); 6580 6581 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 6582 pool != 0ULL && l2arc_vdev_present(vd)) 6583 l2arc_remove_vdev(vd); 6584 } 6585 } 6586 6587 /* 6588 * Verify encryption parameters for spa creation. 
If we are encrypting, we must 6589 * have the encryption feature flag enabled. 6590 */ 6591 static int 6592 spa_create_check_encryption_params(dsl_crypto_params_t *dcp, 6593 boolean_t has_encryption) 6594 { 6595 if (dcp->cp_crypt != ZIO_CRYPT_OFF && 6596 dcp->cp_crypt != ZIO_CRYPT_INHERIT && 6597 !has_encryption) 6598 return (SET_ERROR(ENOTSUP)); 6599 6600 return (dmu_objset_create_crypt_check(NULL, dcp, NULL)); 6601 } 6602 6603 /* 6604 * Pool Creation 6605 */ 6606 int 6607 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 6608 nvlist_t *zplprops, dsl_crypto_params_t *dcp) 6609 { 6610 spa_t *spa; 6611 const char *altroot = NULL; 6612 vdev_t *rvd; 6613 dsl_pool_t *dp; 6614 dmu_tx_t *tx; 6615 int error = 0; 6616 uint64_t txg = TXG_INITIAL; 6617 nvlist_t **spares, **l2cache; 6618 uint_t nspares, nl2cache; 6619 uint64_t version, obj, ndraid = 0; 6620 boolean_t has_features; 6621 boolean_t has_encryption; 6622 boolean_t has_allocclass; 6623 spa_feature_t feat; 6624 const char *feat_name; 6625 const char *poolname; 6626 nvlist_t *nvl; 6627 6628 if (props == NULL || 6629 nvlist_lookup_string(props, 6630 zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0) 6631 poolname = (char *)pool; 6632 6633 /* 6634 * If this pool already exists, return failure. 6635 */ 6636 spa_namespace_enter(FTAG); 6637 if (spa_lookup(poolname) != NULL) { 6638 spa_namespace_exit(FTAG); 6639 return (SET_ERROR(EEXIST)); 6640 } 6641 6642 /* 6643 * Allocate a new spa_t structure. 
6644 */ 6645 nvl = fnvlist_alloc(); 6646 fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool); 6647 (void) nvlist_lookup_string(props, 6648 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 6649 spa = spa_add(poolname, nvl, altroot); 6650 fnvlist_free(nvl); 6651 spa_activate(spa, spa_mode_global); 6652 6653 if (props && (error = spa_prop_validate(spa, props))) { 6654 spa_deactivate(spa); 6655 spa_remove(spa); 6656 spa_namespace_exit(FTAG); 6657 return (error); 6658 } 6659 6660 /* 6661 * Temporary pool names should never be written to disk. 6662 */ 6663 if (poolname != pool) 6664 spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME; 6665 6666 has_features = B_FALSE; 6667 has_encryption = B_FALSE; 6668 has_allocclass = B_FALSE; 6669 for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 6670 elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 6671 if (zpool_prop_feature(nvpair_name(elem))) { 6672 has_features = B_TRUE; 6673 6674 feat_name = strchr(nvpair_name(elem), '@') + 1; 6675 VERIFY0(zfeature_lookup_name(feat_name, &feat)); 6676 if (feat == SPA_FEATURE_ENCRYPTION) 6677 has_encryption = B_TRUE; 6678 if (feat == SPA_FEATURE_ALLOCATION_CLASSES) 6679 has_allocclass = B_TRUE; 6680 } 6681 } 6682 6683 /* verify encryption params, if they were provided */ 6684 if (dcp != NULL) { 6685 error = spa_create_check_encryption_params(dcp, has_encryption); 6686 if (error != 0) { 6687 spa_deactivate(spa); 6688 spa_remove(spa); 6689 spa_namespace_exit(FTAG); 6690 return (error); 6691 } 6692 } 6693 if (!has_allocclass && zfs_special_devs(nvroot, NULL)) { 6694 spa_deactivate(spa); 6695 spa_remove(spa); 6696 spa_namespace_exit(FTAG); 6697 return (ENOTSUP); 6698 } 6699 6700 if (has_features || nvlist_lookup_uint64(props, 6701 zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 6702 version = SPA_VERSION; 6703 } 6704 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 6705 6706 spa->spa_first_txg = txg; 6707 spa->spa_uberblock.ub_txg = txg - 1; 6708 spa->spa_uberblock.ub_version = version; 
6709 spa->spa_ubsync = spa->spa_uberblock; 6710 spa->spa_load_state = SPA_LOAD_CREATE; 6711 spa->spa_removing_phys.sr_state = DSS_NONE; 6712 spa->spa_removing_phys.sr_removing_vdev = -1; 6713 spa->spa_removing_phys.sr_prev_indirect_vdev = -1; 6714 spa->spa_indirect_vdevs_loaded = B_TRUE; 6715 6716 /* 6717 * Create "The Godfather" zio to hold all async IOs 6718 */ 6719 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 6720 KM_SLEEP); 6721 for (int i = 0; i < max_ncpus; i++) { 6722 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 6723 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 6724 ZIO_FLAG_GODFATHER); 6725 } 6726 6727 /* 6728 * Create the root vdev. 6729 */ 6730 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6731 6732 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 6733 6734 ASSERT(error != 0 || rvd != NULL); 6735 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 6736 6737 if (error == 0 && !zfs_allocatable_devs(nvroot)) 6738 error = SET_ERROR(EINVAL); 6739 6740 if (error == 0 && 6741 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 6742 (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 && 6743 (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { 6744 /* 6745 * instantiate the metaslab groups (this will dirty the vdevs) 6746 * we can no longer error exit past this point 6747 */ 6748 for (int c = 0; error == 0 && c < rvd->vdev_children; c++) { 6749 vdev_t *vd = rvd->vdev_child[c]; 6750 6751 vdev_metaslab_set_size(vd); 6752 vdev_expand(vd, txg); 6753 } 6754 } 6755 6756 spa_config_exit(spa, SCL_ALL, FTAG); 6757 6758 if (error != 0) { 6759 spa_unload(spa); 6760 spa_deactivate(spa); 6761 spa_remove(spa); 6762 spa_namespace_exit(FTAG); 6763 return (error); 6764 } 6765 6766 /* 6767 * Get the list of spares, if specified. 
6768 */ 6769 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 6770 &spares, &nspares) == 0) { 6771 spa->spa_spares.sav_config = fnvlist_alloc(); 6772 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 6773 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, 6774 nspares); 6775 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6776 spa_load_spares(spa); 6777 spa_config_exit(spa, SCL_ALL, FTAG); 6778 spa->spa_spares.sav_sync = B_TRUE; 6779 } 6780 6781 /* 6782 * Get the list of level 2 cache devices, if specified. 6783 */ 6784 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 6785 &l2cache, &nl2cache) == 0) { 6786 VERIFY0(nvlist_alloc(&spa->spa_l2cache.sav_config, 6787 NV_UNIQUE_NAME, KM_SLEEP)); 6788 fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 6789 ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, 6790 nl2cache); 6791 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6792 spa_load_l2cache(spa); 6793 spa_config_exit(spa, SCL_ALL, FTAG); 6794 spa->spa_l2cache.sav_sync = B_TRUE; 6795 } 6796 6797 spa->spa_is_initializing = B_TRUE; 6798 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, dcp, txg); 6799 spa->spa_is_initializing = B_FALSE; 6800 6801 /* 6802 * Create DDTs (dedup tables). 6803 */ 6804 ddt_create(spa); 6805 /* 6806 * Create BRT table and BRT table object. 6807 */ 6808 brt_create(spa); 6809 6810 spa_update_dspace(spa); 6811 6812 tx = dmu_tx_create_assigned(dp, txg); 6813 6814 /* 6815 * Create the pool's history object. 6816 */ 6817 if (version >= SPA_VERSION_ZPOOL_HISTORY && !spa->spa_history) 6818 spa_history_create_obj(spa, tx); 6819 6820 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE); 6821 spa_history_log_version(spa, "create", tx); 6822 6823 /* 6824 * Create the pool config object. 
6825 */ 6826 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 6827 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 6828 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 6829 6830 if (zap_add(spa->spa_meta_objset, 6831 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 6832 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 6833 cmn_err(CE_PANIC, "failed to add pool config"); 6834 } 6835 6836 if (zap_add(spa->spa_meta_objset, 6837 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 6838 sizeof (uint64_t), 1, &version, tx) != 0) { 6839 cmn_err(CE_PANIC, "failed to add pool version"); 6840 } 6841 6842 /* Newly created pools with the right version are always deflated. */ 6843 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 6844 spa->spa_deflate = TRUE; 6845 if (zap_add(spa->spa_meta_objset, 6846 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 6847 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 6848 cmn_err(CE_PANIC, "failed to add deflate"); 6849 } 6850 } 6851 6852 /* 6853 * Create the deferred-free bpobj. Turn off compression 6854 * because sync-to-convergence takes longer if the blocksize 6855 * keeps changing. 6856 */ 6857 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 6858 dmu_object_set_compress(spa->spa_meta_objset, obj, 6859 ZIO_COMPRESS_OFF, tx); 6860 if (zap_add(spa->spa_meta_objset, 6861 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 6862 sizeof (uint64_t), 1, &obj, tx) != 0) { 6863 cmn_err(CE_PANIC, "failed to add bpobj"); 6864 } 6865 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 6866 spa->spa_meta_objset, obj)); 6867 6868 /* 6869 * Generate some random noise for salted checksums to operate on. 6870 */ 6871 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 6872 sizeof (spa->spa_cksum_salt.zcs_bytes)); 6873 6874 /* 6875 * Set pool properties. 
6876 */ 6877 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 6878 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 6879 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 6880 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 6881 spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST); 6882 spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM); 6883 spa->spa_dedup_table_quota = 6884 zpool_prop_default_numeric(ZPOOL_PROP_DEDUP_TABLE_QUOTA); 6885 6886 if (props != NULL) { 6887 spa_configfile_set(spa, props, B_FALSE); 6888 spa_sync_props(props, tx); 6889 } 6890 6891 for (int i = 0; i < ndraid; i++) 6892 spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); 6893 6894 dmu_tx_commit(tx); 6895 6896 spa->spa_sync_on = B_TRUE; 6897 txg_sync_start(dp); 6898 mmp_thread_start(spa); 6899 txg_wait_synced(dp, txg); 6900 6901 spa_spawn_aux_threads(spa); 6902 6903 spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); 6904 6905 /* 6906 * Don't count references from objsets that are already closed 6907 * and are making their way through the eviction process. 6908 */ 6909 spa_evicting_os_wait(spa); 6910 spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); 6911 spa->spa_load_state = SPA_LOAD_NONE; 6912 6913 spa_import_os(spa); 6914 6915 spa_namespace_exit(FTAG); 6916 6917 return (0); 6918 } 6919 6920 /* 6921 * Import a non-root pool into the system. 6922 */ 6923 int 6924 spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 6925 { 6926 spa_t *spa; 6927 const char *altroot = NULL; 6928 spa_load_state_t state = SPA_LOAD_IMPORT; 6929 zpool_load_policy_t policy; 6930 spa_mode_t mode = spa_mode_global; 6931 uint64_t readonly = B_FALSE; 6932 int error; 6933 nvlist_t *nvroot; 6934 nvlist_t **spares, **l2cache; 6935 uint_t nspares, nl2cache; 6936 6937 /* 6938 * If a pool with this name exists, return failure. 
6939 */ 6940 spa_namespace_enter(FTAG); 6941 if (spa_lookup(pool) != NULL) { 6942 spa_namespace_exit(FTAG); 6943 return (SET_ERROR(EEXIST)); 6944 } 6945 6946 /* 6947 * Create and initialize the spa structure. 6948 */ 6949 (void) nvlist_lookup_string(props, 6950 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 6951 (void) nvlist_lookup_uint64(props, 6952 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 6953 if (readonly) 6954 mode = SPA_MODE_READ; 6955 spa = spa_add(pool, config, altroot); 6956 spa->spa_import_flags = flags; 6957 6958 /* 6959 * Verbatim import - Take a pool and insert it into the namespace 6960 * as if it had been loaded at boot. 6961 */ 6962 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 6963 if (props != NULL) 6964 spa_configfile_set(spa, props, B_FALSE); 6965 6966 spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); 6967 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); 6968 zfs_dbgmsg("spa_import: verbatim import of %s", pool); 6969 spa_namespace_exit(FTAG); 6970 return (0); 6971 } 6972 6973 spa_activate(spa, mode); 6974 6975 /* 6976 * Don't start async tasks until we know everything is healthy. 6977 */ 6978 spa_async_suspend(spa); 6979 6980 zpool_get_load_policy(config, &policy); 6981 if (policy.zlp_rewind & ZPOOL_DO_REWIND) 6982 state = SPA_LOAD_RECOVER; 6983 6984 spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT; 6985 6986 if (state != SPA_LOAD_RECOVER) { 6987 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 6988 zfs_dbgmsg("spa_import: importing %s", pool); 6989 } else { 6990 zfs_dbgmsg("spa_import: importing %s, max_txg=%lld " 6991 "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg); 6992 } 6993 error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind); 6994 6995 /* 6996 * Propagate anything learned while loading the pool and pass it 6997 * back to caller (i.e. rewind info, missing devices, etc). 
6998 */ 6999 fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info); 7000 7001 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 7002 /* 7003 * Toss any existing sparelist, as it doesn't have any validity 7004 * anymore, and conflicts with spa_has_spare(). 7005 */ 7006 if (spa->spa_spares.sav_config) { 7007 nvlist_free(spa->spa_spares.sav_config); 7008 spa->spa_spares.sav_config = NULL; 7009 spa_load_spares(spa); 7010 } 7011 if (spa->spa_l2cache.sav_config) { 7012 nvlist_free(spa->spa_l2cache.sav_config); 7013 spa->spa_l2cache.sav_config = NULL; 7014 spa_load_l2cache(spa); 7015 } 7016 7017 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 7018 spa_config_exit(spa, SCL_ALL, FTAG); 7019 7020 if (props != NULL) 7021 spa_configfile_set(spa, props, B_FALSE); 7022 7023 if (error != 0 || (props && spa_writeable(spa) && 7024 (error = spa_prop_set(spa, props)))) { 7025 spa_unload(spa); 7026 spa_deactivate(spa); 7027 spa_remove(spa); 7028 spa_namespace_exit(FTAG); 7029 return (error); 7030 } 7031 7032 spa_async_resume(spa); 7033 7034 /* 7035 * Override any spares and level 2 cache devices as specified by 7036 * the user, as these may have correct device names/devids, etc. 
7037 */ 7038 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 7039 &spares, &nspares) == 0) { 7040 if (spa->spa_spares.sav_config) 7041 fnvlist_remove(spa->spa_spares.sav_config, 7042 ZPOOL_CONFIG_SPARES); 7043 else 7044 spa->spa_spares.sav_config = fnvlist_alloc(); 7045 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 7046 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, 7047 nspares); 7048 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 7049 spa_load_spares(spa); 7050 spa_config_exit(spa, SCL_ALL, FTAG); 7051 spa->spa_spares.sav_sync = B_TRUE; 7052 spa->spa_spares.sav_label_sync = B_TRUE; 7053 } 7054 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 7055 &l2cache, &nl2cache) == 0) { 7056 if (spa->spa_l2cache.sav_config) 7057 fnvlist_remove(spa->spa_l2cache.sav_config, 7058 ZPOOL_CONFIG_L2CACHE); 7059 else 7060 spa->spa_l2cache.sav_config = fnvlist_alloc(); 7061 fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 7062 ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, 7063 nl2cache); 7064 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 7065 spa_load_l2cache(spa); 7066 spa_config_exit(spa, SCL_ALL, FTAG); 7067 spa->spa_l2cache.sav_sync = B_TRUE; 7068 spa->spa_l2cache.sav_label_sync = B_TRUE; 7069 } 7070 7071 /* 7072 * Check for any removed devices. 7073 */ 7074 if (spa->spa_autoreplace) { 7075 spa_aux_check_removed(&spa->spa_spares); 7076 spa_aux_check_removed(&spa->spa_l2cache); 7077 } 7078 7079 if (spa_writeable(spa)) { 7080 /* 7081 * Update the config cache to include the newly-imported pool. 7082 */ 7083 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 7084 } 7085 7086 /* 7087 * It's possible that the pool was expanded while it was exported. 7088 * We kick off an async task to handle this for us. 
7089 */ 7090 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 7091 7092 spa_history_log_version(spa, "import", NULL); 7093 7094 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); 7095 7096 spa_namespace_exit(FTAG); 7097 7098 zvol_create_minors(pool); 7099 7100 spa_import_os(spa); 7101 7102 return (0); 7103 } 7104 7105 nvlist_t * 7106 spa_tryimport(nvlist_t *tryconfig) 7107 { 7108 nvlist_t *config = NULL; 7109 const char *poolname, *cachefile; 7110 spa_t *spa; 7111 uint64_t state; 7112 int error; 7113 zpool_load_policy_t policy; 7114 7115 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 7116 return (NULL); 7117 7118 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 7119 return (NULL); 7120 7121 /* 7122 * Create and initialize the spa structure. 7123 */ 7124 char *name = kmem_alloc(MAXPATHLEN, KM_SLEEP); 7125 (void) snprintf(name, MAXPATHLEN, "%s-%llx-%s", 7126 TRYIMPORT_NAME, (u_longlong_t)(uintptr_t)curthread, poolname); 7127 7128 spa_namespace_enter(FTAG); 7129 spa = spa_add(name, tryconfig, NULL); 7130 spa_activate(spa, SPA_MODE_READ); 7131 kmem_free(name, MAXPATHLEN); 7132 7133 /* 7134 * Rewind pool if a max txg was provided. 7135 */ 7136 zpool_get_load_policy(spa->spa_config, &policy); 7137 if (policy.zlp_txg != UINT64_MAX) { 7138 spa->spa_load_max_txg = policy.zlp_txg; 7139 spa->spa_extreme_rewind = B_TRUE; 7140 zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld", 7141 poolname, (longlong_t)policy.zlp_txg); 7142 } else { 7143 zfs_dbgmsg("spa_tryimport: importing %s", poolname); 7144 } 7145 7146 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile) 7147 == 0) { 7148 zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile); 7149 spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; 7150 } else { 7151 spa->spa_config_source = SPA_CONFIG_SRC_SCAN; 7152 } 7153 7154 /* 7155 * spa_import() relies on a pool config fetched by spa_try_import() 7156 * for spare/cache devices. 
Import flags are not passed to 7157 * spa_tryimport(), which makes it return early due to a missing log 7158 * device and missing retrieving the cache device and spare eventually. 7159 * Passing ZFS_IMPORT_MISSING_LOG to spa_tryimport() makes it fetch 7160 * the correct configuration regardless of the missing log device. 7161 */ 7162 spa->spa_import_flags |= ZFS_IMPORT_MISSING_LOG; 7163 7164 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING); 7165 7166 /* 7167 * If 'tryconfig' was at least parsable, return the current config. 7168 */ 7169 if (spa->spa_root_vdev != NULL) { 7170 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 7171 fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, poolname); 7172 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state); 7173 fnvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 7174 spa->spa_uberblock.ub_timestamp); 7175 fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 7176 spa->spa_load_info); 7177 fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, 7178 spa->spa_errata); 7179 7180 /* 7181 * If the bootfs property exists on this pool then we 7182 * copy it out so that external consumers can tell which 7183 * pools are bootable. 7184 */ 7185 if ((!error || error == EEXIST) && spa->spa_bootfs) { 7186 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 7187 7188 /* 7189 * We have to play games with the name since the 7190 * pool was opened as TRYIMPORT_NAME. 
7191 */ 7192 if (dsl_dsobj_to_dsname(spa_name(spa), 7193 spa->spa_bootfs, tmpname) == 0) { 7194 char *cp; 7195 char *dsname; 7196 7197 dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 7198 7199 cp = strchr(tmpname, '/'); 7200 if (cp == NULL) { 7201 (void) strlcpy(dsname, tmpname, 7202 MAXPATHLEN); 7203 } else { 7204 (void) snprintf(dsname, MAXPATHLEN, 7205 "%s/%s", poolname, ++cp); 7206 } 7207 fnvlist_add_string(config, ZPOOL_CONFIG_BOOTFS, 7208 dsname); 7209 kmem_free(dsname, MAXPATHLEN); 7210 } 7211 kmem_free(tmpname, MAXPATHLEN); 7212 } 7213 7214 /* 7215 * Add the list of hot spares and level 2 cache devices. 7216 */ 7217 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 7218 spa_add_spares(spa, config); 7219 spa_add_l2cache(spa, config); 7220 spa_config_exit(spa, SCL_CONFIG, FTAG); 7221 } 7222 7223 spa_unload(spa); 7224 spa_deactivate(spa); 7225 spa_remove(spa); 7226 spa_namespace_exit(FTAG); 7227 7228 return (config); 7229 } 7230 7231 /* 7232 * Pool export/destroy 7233 * 7234 * The act of destroying or exporting a pool is very simple. We make sure there 7235 * is no more pending I/O and any references to the pool are gone. Then, we 7236 * update the pool state and sync all the labels to disk, removing the 7237 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 7238 * we don't sync the labels or remove the configuration cache. 
7239 */ 7240 static int 7241 spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, 7242 boolean_t force, boolean_t hardforce) 7243 { 7244 int error = 0; 7245 spa_t *spa; 7246 hrtime_t export_start = gethrtime(); 7247 7248 if (oldconfig) 7249 *oldconfig = NULL; 7250 7251 if (!(spa_mode_global & SPA_MODE_WRITE)) 7252 return (SET_ERROR(EROFS)); 7253 7254 spa_namespace_enter(FTAG); 7255 if ((spa = spa_lookup(pool)) == NULL) { 7256 spa_namespace_exit(FTAG); 7257 return (SET_ERROR(ENOENT)); 7258 } 7259 7260 if (spa->spa_is_exporting) { 7261 /* the pool is being exported by another thread */ 7262 spa_namespace_exit(FTAG); 7263 return (SET_ERROR(ZFS_ERR_EXPORT_IN_PROGRESS)); 7264 } 7265 spa->spa_is_exporting = B_TRUE; 7266 7267 /* 7268 * Put a hold on the pool, drop the namespace lock, stop async tasks 7269 * and see if we can export. 7270 */ 7271 spa_open_ref(spa, FTAG); 7272 spa_namespace_exit(FTAG); 7273 spa_async_suspend(spa); 7274 if (spa->spa_zvol_taskq) { 7275 zvol_remove_minors(spa, spa_name(spa), B_TRUE); 7276 taskq_wait(spa->spa_zvol_taskq); 7277 } 7278 spa_namespace_enter(FTAG); 7279 spa->spa_export_thread = curthread; 7280 spa_close(spa, FTAG); 7281 7282 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 7283 spa_namespace_exit(FTAG); 7284 goto export_spa; 7285 } 7286 7287 /* 7288 * The pool will be in core if it's openable, in which case we can 7289 * modify its state. Objsets may be open only because they're dirty, 7290 * so we have to force it to sync before checking spa_refcnt. 7291 */ 7292 if (spa->spa_sync_on) { 7293 txg_wait_synced(spa->spa_dsl_pool, 0); 7294 spa_evicting_os_wait(spa); 7295 } 7296 7297 /* 7298 * A pool cannot be exported or destroyed if there are active 7299 * references. If we are resetting a pool, allow references by 7300 * fault injection handlers. 
7301 */ 7302 if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0)) { 7303 error = SET_ERROR(EBUSY); 7304 goto fail; 7305 } 7306 7307 spa_namespace_exit(FTAG); 7308 /* 7309 * At this point we no longer hold the spa_namespace_lock and 7310 * there were no references on the spa. Future spa_lookups will 7311 * notice the spa->spa_export_thread and wait until we signal 7312 * that we are finshed. 7313 */ 7314 7315 if (spa->spa_sync_on) { 7316 vdev_t *rvd = spa->spa_root_vdev; 7317 /* 7318 * A pool cannot be exported if it has an active shared spare. 7319 * This is to prevent other pools stealing the active spare 7320 * from an exported pool. At user's own will, such pool can 7321 * be forcedly exported. 7322 */ 7323 if (!force && new_state == POOL_STATE_EXPORTED && 7324 spa_has_active_shared_spare(spa)) { 7325 error = SET_ERROR(EXDEV); 7326 spa_namespace_enter(FTAG); 7327 goto fail; 7328 } 7329 7330 /* 7331 * We're about to export or destroy this pool. Make sure 7332 * we stop all initialization and trim activity here before 7333 * we set the spa_final_txg. This will ensure that all 7334 * dirty data resulting from the initialization is 7335 * committed to disk before we unload the pool. 7336 */ 7337 vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE); 7338 vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE); 7339 vdev_autotrim_stop_all(spa); 7340 vdev_rebuild_stop_all(spa); 7341 l2arc_spa_rebuild_stop(spa); 7342 7343 /* 7344 * We want this to be reflected on every label, 7345 * so mark them all dirty. spa_unload() will do the 7346 * final sync that pushes these changes out. 
7347 */ 7348 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 7349 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 7350 spa->spa_state = new_state; 7351 vdev_config_dirty(rvd); 7352 spa_config_exit(spa, SCL_ALL, FTAG); 7353 } 7354 7355 if (spa_should_sync_time_logger_on_unload(spa)) 7356 spa_unload_sync_time_logger(spa); 7357 7358 /* 7359 * If the log space map feature is enabled and the pool is 7360 * getting exported (but not destroyed), we want to spend some 7361 * time flushing as many metaslabs as we can in an attempt to 7362 * destroy log space maps and save import time. This has to be 7363 * done before we set the spa_final_txg, otherwise 7364 * spa_sync() -> spa_flush_metaslabs() may dirty the final TXGs. 7365 * spa_should_flush_logs_on_unload() should be called after 7366 * spa_state has been set to the new_state. 7367 */ 7368 if (spa_should_flush_logs_on_unload(spa)) 7369 spa_unload_log_sm_flush_all(spa); 7370 7371 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 7372 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 7373 spa->spa_final_txg = spa_last_synced_txg(spa) + 7374 TXG_DEFER_SIZE + 1; 7375 spa_config_exit(spa, SCL_ALL, FTAG); 7376 } 7377 } 7378 7379 export_spa: 7380 spa_export_os(spa); 7381 7382 if (new_state == POOL_STATE_DESTROYED) 7383 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY); 7384 else if (new_state == POOL_STATE_EXPORTED) 7385 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_EXPORT); 7386 7387 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 7388 spa_unload(spa); 7389 spa_deactivate(spa); 7390 } 7391 7392 if (oldconfig && spa->spa_config) 7393 *oldconfig = fnvlist_dup(spa->spa_config); 7394 7395 if (new_state == POOL_STATE_EXPORTED) 7396 zio_handle_export_delay(spa, gethrtime() - export_start); 7397 7398 /* 7399 * Take the namespace lock for the actual spa_t removal 7400 */ 7401 spa_namespace_enter(FTAG); 7402 if (new_state != POOL_STATE_UNINITIALIZED) { 7403 if (!hardforce) 7404 spa_write_cachefile(spa, 
B_TRUE, B_TRUE, B_FALSE); 7405 spa_remove(spa); 7406 } else { 7407 /* 7408 * If spa_remove() is not called for this spa_t and 7409 * there is any possibility that it can be reused, 7410 * we make sure to reset the exporting flag. 7411 */ 7412 spa->spa_is_exporting = B_FALSE; 7413 spa->spa_export_thread = NULL; 7414 } 7415 7416 /* 7417 * Wake up any waiters in spa_lookup() 7418 */ 7419 spa_namespace_broadcast(); 7420 spa_namespace_exit(FTAG); 7421 return (0); 7422 7423 fail: 7424 spa->spa_is_exporting = B_FALSE; 7425 spa->spa_export_thread = NULL; 7426 7427 spa_async_resume(spa); 7428 /* 7429 * Wake up any waiters in spa_lookup() 7430 */ 7431 spa_namespace_broadcast(); 7432 spa_namespace_exit(FTAG); 7433 return (error); 7434 } 7435 7436 /* 7437 * Destroy a storage pool. 7438 */ 7439 int 7440 spa_destroy(const char *pool) 7441 { 7442 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 7443 B_FALSE, B_FALSE)); 7444 } 7445 7446 /* 7447 * Export a storage pool. 7448 */ 7449 int 7450 spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force, 7451 boolean_t hardforce) 7452 { 7453 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 7454 force, hardforce)); 7455 } 7456 7457 /* 7458 * Similar to spa_export(), this unloads the spa_t without actually removing it 7459 * from the namespace in any way. 
7460 */ 7461 int 7462 spa_reset(const char *pool) 7463 { 7464 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 7465 B_FALSE, B_FALSE)); 7466 } 7467 7468 /* 7469 * ========================================================================== 7470 * Device manipulation 7471 * ========================================================================== 7472 */ 7473 7474 /* 7475 * This is called as a synctask to increment the draid feature flag 7476 */ 7477 static void 7478 spa_draid_feature_incr(void *arg, dmu_tx_t *tx) 7479 { 7480 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 7481 int draid = (int)(uintptr_t)arg; 7482 7483 for (int c = 0; c < draid; c++) 7484 spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); 7485 } 7486 7487 /* 7488 * Add a device to a storage pool. 7489 */ 7490 int 7491 spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t check_ashift) 7492 { 7493 uint64_t txg, ndraid = 0; 7494 int error; 7495 vdev_t *rvd = spa->spa_root_vdev; 7496 vdev_t *vd, *tvd; 7497 nvlist_t **spares, **l2cache; 7498 uint_t nspares, nl2cache; 7499 7500 ASSERT(spa_writeable(spa)); 7501 7502 txg = spa_vdev_enter(spa); 7503 7504 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 7505 VDEV_ALLOC_ADD)) != 0) 7506 return (spa_vdev_exit(spa, NULL, txg, error)); 7507 7508 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 7509 7510 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 7511 &nspares) != 0) 7512 nspares = 0; 7513 7514 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 7515 &nl2cache) != 0) 7516 nl2cache = 0; 7517 7518 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 7519 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 7520 7521 if (vd->vdev_children != 0 && 7522 (error = vdev_create(vd, txg, B_FALSE)) != 0) { 7523 return (spa_vdev_exit(spa, vd, txg, error)); 7524 } 7525 7526 /* 7527 * The virtual dRAID spares must be added after vdev tree is created 7528 * and the vdev guids are generated. 
The guid of their associated 7529 * dRAID is stored in the config and used when opening the spare. 7530 */ 7531 if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid, 7532 rvd->vdev_children)) == 0) { 7533 if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot, 7534 ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) 7535 nspares = 0; 7536 } else { 7537 return (spa_vdev_exit(spa, vd, txg, error)); 7538 } 7539 7540 /* 7541 * We must validate the spares and l2cache devices after checking the 7542 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 7543 */ 7544 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 7545 return (spa_vdev_exit(spa, vd, txg, error)); 7546 7547 /* 7548 * If we are in the middle of a device removal, we can only add 7549 * devices which match the existing devices in the pool. 7550 * If we are in the middle of a removal, or have some indirect 7551 * vdevs, we can not add raidz or dRAID top levels. 7552 */ 7553 if (spa->spa_vdev_removal != NULL || 7554 spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { 7555 for (int c = 0; c < vd->vdev_children; c++) { 7556 tvd = vd->vdev_child[c]; 7557 if (spa->spa_vdev_removal != NULL && 7558 tvd->vdev_ashift != spa->spa_max_ashift) { 7559 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 7560 } 7561 /* Fail if top level vdev is raidz or a dRAID */ 7562 if (vdev_get_nparity(tvd) != 0) 7563 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 7564 7565 /* 7566 * Need the top level mirror to be 7567 * a mirror of leaf vdevs only 7568 */ 7569 if (tvd->vdev_ops == &vdev_mirror_ops) { 7570 for (uint64_t cid = 0; 7571 cid < tvd->vdev_children; cid++) { 7572 vdev_t *cvd = tvd->vdev_child[cid]; 7573 if (!cvd->vdev_ops->vdev_op_leaf) { 7574 return (spa_vdev_exit(spa, vd, 7575 txg, EINVAL)); 7576 } 7577 } 7578 } 7579 } 7580 } 7581 7582 if (check_ashift && spa->spa_max_ashift == spa->spa_min_ashift) { 7583 for (int c = 0; c < vd->vdev_children; c++) { 7584 tvd = vd->vdev_child[c]; 7585 if 
(tvd->vdev_ashift != spa->spa_max_ashift) { 7586 return (spa_vdev_exit(spa, vd, txg, 7587 ZFS_ERR_ASHIFT_MISMATCH)); 7588 } 7589 } 7590 } 7591 7592 for (int c = 0; c < vd->vdev_children; c++) { 7593 tvd = vd->vdev_child[c]; 7594 vdev_remove_child(vd, tvd); 7595 tvd->vdev_id = rvd->vdev_children; 7596 vdev_add_child(rvd, tvd); 7597 vdev_config_dirty(tvd); 7598 } 7599 7600 if (nspares != 0) { 7601 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 7602 ZPOOL_CONFIG_SPARES); 7603 spa_load_spares(spa); 7604 spa->spa_spares.sav_sync = B_TRUE; 7605 } 7606 7607 if (nl2cache != 0) { 7608 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 7609 ZPOOL_CONFIG_L2CACHE); 7610 spa_load_l2cache(spa); 7611 spa->spa_l2cache.sav_sync = B_TRUE; 7612 } 7613 7614 /* 7615 * We can't increment a feature while holding spa_vdev so we 7616 * have to do it in a synctask. 7617 */ 7618 if (ndraid != 0) { 7619 dmu_tx_t *tx; 7620 7621 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 7622 dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr, 7623 (void *)(uintptr_t)ndraid, tx); 7624 dmu_tx_commit(tx); 7625 } 7626 7627 /* 7628 * We have to be careful when adding new vdevs to an existing pool. 7629 * If other threads start allocating from these vdevs before we 7630 * sync the config cache, and we lose power, then upon reboot we may 7631 * fail to open the pool because there are DVAs that the config cache 7632 * can't translate. Therefore, we first add the vdevs without 7633 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 7634 * and then let spa_config_update() initialize the new metaslabs. 7635 * 7636 * spa_load() checks for added-but-not-initialized vdevs, so that 7637 * if we lose power at any point in this sequence, the remaining 7638 * steps will be completed the next time we load the pool. 
7639 */ 7640 (void) spa_vdev_exit(spa, vd, txg, 0); 7641 7642 spa_namespace_enter(FTAG); 7643 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 7644 spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD); 7645 spa_namespace_exit(FTAG); 7646 7647 return (0); 7648 } 7649 7650 /* 7651 * Given a vdev to be replaced and its parent, check for a possible 7652 * "double spare" condition if a vdev is to be replaced by a spare. When this 7653 * happens, you can get two spares assigned to one failed vdev. 7654 * 7655 * To trigger a double spare condition: 7656 * 7657 * 1. disk1 fails 7658 * 2. 1st spare is kicked in for disk1 and it resilvers 7659 * 3. Someone replaces disk1 with a new blank disk 7660 * 4. New blank disk starts resilvering 7661 * 5. While resilvering, new blank disk has IO errors and faults 7662 * 6. 2nd spare is kicked in for new blank disk 7663 * 7. At this point two spares are kicked in for the original disk1. 7664 * 7665 * It looks like this: 7666 * 7667 * NAME STATE READ WRITE CKSUM 7668 * tank2 DEGRADED 0 0 0 7669 * draid2:6d:10c:2s-0 DEGRADED 0 0 0 7670 * scsi-0QEMU_QEMU_HARDDISK_d1 ONLINE 0 0 0 7671 * scsi-0QEMU_QEMU_HARDDISK_d2 ONLINE 0 0 0 7672 * scsi-0QEMU_QEMU_HARDDISK_d3 ONLINE 0 0 0 7673 * scsi-0QEMU_QEMU_HARDDISK_d4 ONLINE 0 0 0 7674 * scsi-0QEMU_QEMU_HARDDISK_d5 ONLINE 0 0 0 7675 * scsi-0QEMU_QEMU_HARDDISK_d6 ONLINE 0 0 0 7676 * scsi-0QEMU_QEMU_HARDDISK_d7 ONLINE 0 0 0 7677 * scsi-0QEMU_QEMU_HARDDISK_d8 ONLINE 0 0 0 7678 * scsi-0QEMU_QEMU_HARDDISK_d9 ONLINE 0 0 0 7679 * spare-9 DEGRADED 0 0 0 7680 * replacing-0 DEGRADED 0 93 0 7681 * scsi-0QEMU_QEMU_HARDDISK_d10-part1/old UNAVAIL 0 0 0 7682 * spare-1 DEGRADED 0 0 0 7683 * scsi-0QEMU_QEMU_HARDDISK_d10 REMOVED 0 0 0 7684 * draid2-0-0 ONLINE 0 0 0 7685 * draid2-0-1 ONLINE 0 0 0 7686 * spares 7687 * draid2-0-0 INUSE currently in use 7688 * draid2-0-1 INUSE currently in use 7689 * 7690 * ARGS: 7691 * 7692 * newvd: New spare disk 7693 * pvd: Parent vdev_t the spare should attach to 7694 * 7695 * This 
function returns B_TRUE if adding the new vdev would create a double 7696 * spare condition, B_FALSE otherwise. 7697 */ 7698 static boolean_t 7699 spa_vdev_new_spare_would_cause_double_spares(vdev_t *newvd, vdev_t *pvd) 7700 { 7701 vdev_t *ppvd; 7702 7703 ppvd = pvd->vdev_parent; 7704 if (ppvd == NULL) 7705 return (B_FALSE); 7706 7707 /* 7708 * To determine if this configuration would cause a double spare, we 7709 * look at the vdev_op of the parent vdev, and of the parent's parent 7710 * vdev. We also look at vdev_isspare on the new disk. A double spare 7711 * condition looks like this: 7712 * 7713 * 1. parent of parent's op is a spare or draid spare 7714 * 2. parent's op is replacing 7715 * 3. new disk is a spare 7716 */ 7717 if ((ppvd->vdev_ops == &vdev_spare_ops) || 7718 (ppvd->vdev_ops == &vdev_draid_spare_ops)) 7719 if (pvd->vdev_ops == &vdev_replacing_ops) 7720 if (newvd->vdev_isspare) 7721 return (B_TRUE); 7722 7723 return (B_FALSE); 7724 } 7725 7726 /* 7727 * Attach a device to a vdev specified by its guid. The vdev type can be 7728 * a mirror, a raidz, or a leaf device that is also a top-level (e.g. a 7729 * single device). When the vdev is a single device, a mirror vdev will be 7730 * automatically inserted. 7731 * 7732 * If 'replacing' is specified, the new device is intended to replace the 7733 * existing device; in this case the two devices are made into their own 7734 * mirror using the 'replacing' vdev, which is functionally identical to 7735 * the mirror vdev (it actually reuses all the same ops) but has a few 7736 * extra rules: you can't attach to it after it's been created, and upon 7737 * completion of resilvering, the first disk (the one being replaced) 7738 * is automatically detached. 7739 * 7740 * If 'rebuild' is specified, then sequential reconstruction (a.ka. rebuild) 7741 * should be performed instead of traditional healing reconstruction. From 7742 * an administrators perspective these are both resilver operations. 
7743 */ 7744 int 7745 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, 7746 int rebuild) 7747 { 7748 uint64_t txg, dtl_max_txg; 7749 vdev_t *rvd = spa->spa_root_vdev; 7750 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 7751 vdev_ops_t *pvops; 7752 char *oldvdpath, *newvdpath; 7753 int newvd_isspare = B_FALSE; 7754 int error; 7755 7756 ASSERT(spa_writeable(spa)); 7757 7758 txg = spa_vdev_enter(spa); 7759 7760 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 7761 7762 ASSERT(spa_namespace_held()); 7763 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 7764 error = (spa_has_checkpoint(spa)) ? 7765 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 7766 return (spa_vdev_exit(spa, NULL, txg, error)); 7767 } 7768 7769 if (rebuild) { 7770 if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) 7771 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7772 7773 if (dsl_scan_resilvering(spa_get_dsl(spa)) || 7774 dsl_scan_resilver_scheduled(spa_get_dsl(spa))) { 7775 return (spa_vdev_exit(spa, NULL, txg, 7776 ZFS_ERR_RESILVER_IN_PROGRESS)); 7777 } 7778 } else { 7779 if (vdev_rebuild_active(rvd)) 7780 return (spa_vdev_exit(spa, NULL, txg, 7781 ZFS_ERR_REBUILD_IN_PROGRESS)); 7782 } 7783 7784 if (spa->spa_vdev_removal != NULL) { 7785 return (spa_vdev_exit(spa, NULL, txg, 7786 ZFS_ERR_DEVRM_IN_PROGRESS)); 7787 } 7788 7789 if (oldvd == NULL) 7790 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 7791 7792 boolean_t raidz = oldvd->vdev_ops == &vdev_raidz_ops; 7793 7794 if (raidz) { 7795 if (!spa_feature_is_enabled(spa, SPA_FEATURE_RAIDZ_EXPANSION)) 7796 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7797 7798 /* 7799 * Can't expand a raidz while prior expand is in progress. 
7800 */ 7801 if (spa->spa_raidz_expand != NULL) { 7802 return (spa_vdev_exit(spa, NULL, txg, 7803 ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS)); 7804 } 7805 } else if (!oldvd->vdev_ops->vdev_op_leaf) { 7806 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7807 } 7808 7809 if (raidz) 7810 pvd = oldvd; 7811 else 7812 pvd = oldvd->vdev_parent; 7813 7814 if (spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 7815 VDEV_ALLOC_ATTACH) != 0) 7816 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 7817 7818 if (newrootvd->vdev_children != 1) 7819 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 7820 7821 newvd = newrootvd->vdev_child[0]; 7822 7823 if (!newvd->vdev_ops->vdev_op_leaf) 7824 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 7825 7826 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 7827 return (spa_vdev_exit(spa, newrootvd, txg, error)); 7828 7829 /* 7830 * log, dedup and special vdevs should not be replaced by spares. 7831 */ 7832 if ((oldvd->vdev_top->vdev_alloc_bias != VDEV_BIAS_NONE || 7833 oldvd->vdev_top->vdev_islog) && newvd->vdev_isspare) { 7834 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7835 } 7836 7837 /* 7838 * A dRAID spare can only replace a child of its parent dRAID vdev. 7839 */ 7840 if (newvd->vdev_ops == &vdev_draid_spare_ops && 7841 oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) { 7842 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7843 } 7844 7845 if (rebuild) { 7846 /* 7847 * For rebuilds, the top vdev must support reconstruction 7848 * using only space maps. This means the only allowable 7849 * vdevs types are the root vdev, a mirror, or dRAID. 
7850 */ 7851 tvd = pvd; 7852 if (pvd->vdev_top != NULL) 7853 tvd = pvd->vdev_top; 7854 7855 if (tvd->vdev_ops != &vdev_mirror_ops && 7856 tvd->vdev_ops != &vdev_root_ops && 7857 tvd->vdev_ops != &vdev_draid_ops) { 7858 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7859 } 7860 } 7861 7862 if (!replacing) { 7863 /* 7864 * For attach, the only allowable parent is a mirror or 7865 * the root vdev. A raidz vdev can be attached to, but 7866 * you cannot attach to a raidz child. 7867 */ 7868 if (pvd->vdev_ops != &vdev_mirror_ops && 7869 pvd->vdev_ops != &vdev_root_ops && 7870 !raidz) 7871 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7872 7873 pvops = &vdev_mirror_ops; 7874 } else { 7875 /* 7876 * Active hot spares can only be replaced by inactive hot 7877 * spares. 7878 */ 7879 if (pvd->vdev_ops == &vdev_spare_ops && 7880 oldvd->vdev_isspare && 7881 !spa_has_spare(spa, newvd->vdev_guid)) 7882 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7883 7884 /* 7885 * If the source is a hot spare, and the parent isn't already a 7886 * spare, then we want to create a new hot spare. Otherwise, we 7887 * want to create a replacing vdev. The user is not allowed to 7888 * attach to a spared vdev child unless the 'isspare' state is 7889 * the same (spare replaces spare, non-spare replaces 7890 * non-spare). 
7891 */ 7892 if (pvd->vdev_ops == &vdev_replacing_ops && 7893 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 7894 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7895 } else if (pvd->vdev_ops == &vdev_spare_ops && 7896 newvd->vdev_isspare != oldvd->vdev_isspare) { 7897 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7898 } 7899 7900 if (spa_vdev_new_spare_would_cause_double_spares(newvd, pvd)) { 7901 vdev_dbgmsg(newvd, 7902 "disk would create double spares, ignore."); 7903 return (spa_vdev_exit(spa, newrootvd, txg, EEXIST)); 7904 } 7905 7906 if (newvd->vdev_isspare) 7907 pvops = &vdev_spare_ops; 7908 else 7909 pvops = &vdev_replacing_ops; 7910 } 7911 7912 /* 7913 * Make sure the new device is big enough. 7914 */ 7915 vdev_t *min_vdev = raidz ? oldvd->vdev_child[0] : oldvd; 7916 if (newvd->vdev_asize < vdev_get_min_asize(min_vdev)) 7917 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 7918 7919 /* 7920 * The new device cannot have a higher alignment requirement 7921 * than the top-level vdev. 7922 */ 7923 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) { 7924 return (spa_vdev_exit(spa, newrootvd, txg, 7925 ZFS_ERR_ASHIFT_MISMATCH)); 7926 } 7927 7928 /* 7929 * RAIDZ-expansion-specific checks. 
7930 */ 7931 if (raidz) { 7932 if (vdev_raidz_attach_check(newvd) != 0) 7933 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7934 7935 /* 7936 * Fail early if a child is not healthy or being replaced 7937 */ 7938 for (int i = 0; i < oldvd->vdev_children; i++) { 7939 if (vdev_is_dead(oldvd->vdev_child[i]) || 7940 !oldvd->vdev_child[i]->vdev_ops->vdev_op_leaf) { 7941 return (spa_vdev_exit(spa, newrootvd, txg, 7942 ENXIO)); 7943 } 7944 /* Also fail if reserved boot area is in-use */ 7945 if (vdev_check_boot_reserve(spa, oldvd->vdev_child[i]) 7946 != 0) { 7947 return (spa_vdev_exit(spa, newrootvd, txg, 7948 EADDRINUSE)); 7949 } 7950 } 7951 } 7952 7953 if (raidz) { 7954 /* 7955 * Note: oldvdpath is freed by spa_strfree(), but 7956 * kmem_asprintf() is freed by kmem_strfree(), so we have to 7957 * move it to a spa_strdup-ed string. 7958 */ 7959 char *tmp = kmem_asprintf("raidz%u-%u", 7960 (uint_t)vdev_get_nparity(oldvd), (uint_t)oldvd->vdev_id); 7961 oldvdpath = spa_strdup(tmp); 7962 kmem_strfree(tmp); 7963 } else { 7964 oldvdpath = spa_strdup(oldvd->vdev_path); 7965 } 7966 newvdpath = spa_strdup(newvd->vdev_path); 7967 7968 /* 7969 * If this is an in-place replacement, update oldvd's path and devid 7970 * to make it distinguishable from newvd, and unopenable from now on. 7971 */ 7972 if (strcmp(oldvdpath, newvdpath) == 0) { 7973 spa_strfree(oldvd->vdev_path); 7974 oldvd->vdev_path = kmem_alloc(strlen(newvdpath) + 5, 7975 KM_SLEEP); 7976 (void) sprintf(oldvd->vdev_path, "%s/old", 7977 newvdpath); 7978 if (oldvd->vdev_devid != NULL) { 7979 spa_strfree(oldvd->vdev_devid); 7980 oldvd->vdev_devid = NULL; 7981 } 7982 spa_strfree(oldvdpath); 7983 oldvdpath = spa_strdup(oldvd->vdev_path); 7984 } 7985 7986 /* 7987 * If the parent is not a mirror, or if we're replacing, insert the new 7988 * mirror/replacing/spare vdev above oldvd. 
7989 */ 7990 if (!raidz && pvd->vdev_ops != pvops) { 7991 pvd = vdev_add_parent(oldvd, pvops); 7992 ASSERT(pvd->vdev_ops == pvops); 7993 ASSERT(oldvd->vdev_parent == pvd); 7994 } 7995 7996 ASSERT(pvd->vdev_top->vdev_parent == rvd); 7997 7998 /* 7999 * Extract the new device from its root and add it to pvd. 8000 */ 8001 vdev_remove_child(newrootvd, newvd); 8002 newvd->vdev_id = pvd->vdev_children; 8003 newvd->vdev_crtxg = oldvd->vdev_crtxg; 8004 vdev_add_child(pvd, newvd); 8005 8006 /* 8007 * Reevaluate the parent vdev state. 8008 */ 8009 vdev_propagate_state(pvd); 8010 8011 tvd = newvd->vdev_top; 8012 ASSERT(pvd->vdev_top == tvd); 8013 ASSERT(tvd->vdev_parent == rvd); 8014 8015 vdev_config_dirty(tvd); 8016 8017 /* 8018 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 8019 * for any dmu_sync-ed blocks. It will propagate upward when 8020 * spa_vdev_exit() calls vdev_dtl_reassess(). 8021 */ 8022 dtl_max_txg = txg + TXG_CONCURRENT_STATES; 8023 8024 if (raidz) { 8025 /* 8026 * Wait for the youngest allocations and frees to sync, 8027 * and then wait for the deferral of those frees to finish. 
8028 */ 8029 spa_vdev_config_exit(spa, NULL, 8030 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 8031 8032 vdev_initialize_stop_all(tvd, VDEV_INITIALIZE_ACTIVE); 8033 vdev_trim_stop_all(tvd, VDEV_TRIM_ACTIVE); 8034 vdev_autotrim_stop_wait(tvd); 8035 8036 dtl_max_txg = spa_vdev_config_enter(spa); 8037 8038 tvd->vdev_rz_expanding = B_TRUE; 8039 8040 vdev_dirty_leaves(tvd, VDD_DTL, dtl_max_txg); 8041 vdev_config_dirty(tvd); 8042 8043 dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, 8044 dtl_max_txg); 8045 dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_raidz_attach_sync, 8046 newvd, tx); 8047 dmu_tx_commit(tx); 8048 } else { 8049 vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, 8050 dtl_max_txg - TXG_INITIAL); 8051 8052 if (newvd->vdev_isspare) { 8053 spa_spare_activate(newvd); 8054 spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); 8055 } 8056 8057 newvd_isspare = newvd->vdev_isspare; 8058 8059 /* 8060 * Mark newvd's DTL dirty in this txg. 8061 */ 8062 vdev_dirty(tvd, VDD_DTL, newvd, txg); 8063 8064 /* 8065 * Schedule the resilver or rebuild to restart in the future. 8066 * We do this to ensure that dmu_sync-ed blocks have been 8067 * stitched into the respective datasets. 
/*
 * Detach a device from a mirror or replacing vdev.
 *
 * If 'replace_done' is specified, only detach if the parent
 * is a replacing or a spare vdev.
 *
 * 'guid' names the leaf vdev to detach.  'pguid', when non-zero, is the
 * guid the caller expects the vdev's parent to have; a mismatch fails with
 * EBUSY (see the M(A,R(B,C)) discussion below).
 *
 * Returns the result of spa_vdev_exit().  The errors raised here are
 * ENODEV (no vdev with 'guid'), ENOTSUP (not a leaf, or a parent type that
 * does not support detach), EBUSY (parent mismatch, or the vdev holds the
 * only valid copy of some data), ZFS_ERR_CHECKPOINT_EXISTS and
 * ZFS_ERR_DISCARDING_CHECKPOINT.
 */
int
spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
{
	uint64_t txg;
	int error;
	vdev_t *rvd __maybe_unused = spa->spa_root_vdev;
	vdev_t *vd, *pvd, *cvd, *tvd;
	boolean_t unspare = B_FALSE;
	uint64_t unspare_guid = 0;
	char *vdpath;

	ASSERT(spa_writeable(spa));

	txg = spa_vdev_detach_enter(spa, guid);

	vd = spa_lookup_by_guid(spa, guid, B_FALSE);

	/*
	 * Besides being called directly from the userland through the
	 * ioctl interface, spa_vdev_detach() can be potentially called
	 * at the end of spa_vdev_resilver_done().
	 *
	 * In the regular case, when we have a checkpoint this shouldn't
	 * happen as we never empty the DTLs of a vdev during the scrub
	 * [see comment in dsl_scan_done()]. Thus spa_vdev_resilver_done()
	 * should never get here when we have a checkpoint.
	 *
	 * That said, even in a case when we checkpoint the pool exactly
	 * as spa_vdev_resilver_done() calls this function everything
	 * should be fine as the resilver will return right away.
	 */
	ASSERT(spa_namespace_held());
	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
		error = (spa_has_checkpoint(spa)) ?
		    ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
		return (spa_vdev_exit(spa, NULL, txg, error));
	}

	if (vd == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	/* Only leaf vdevs can be detached. */
	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	pvd = vd->vdev_parent;

	/*
	 * If the parent/child relationship is not as expected, don't do it.
	 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
	 * vdev that's replacing B with C.  The user's intent in replacing
	 * is to go from M(A,B) to M(A,C).  If the user decides to cancel
	 * the replace by detaching C, the expected behavior is to end up
	 * M(A,B).  But suppose that right after deciding to detach C,
	 * the replacement of B completes.  We would have M(A,C), and then
	 * ask to detach C, which would leave us with just A -- not what
	 * the user wanted.  To prevent this, we make sure that the
	 * parent/child relationship hasn't changed -- in this example,
	 * that C's parent is still the replacing vdev R.
	 */
	if (pvd->vdev_guid != pguid && pguid != 0)
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	/*
	 * Only 'replacing' or 'spare' vdevs can be replaced.
	 */
	if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
	    pvd->vdev_ops != &vdev_spare_ops)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
	    spa_version(spa) >= SPA_VERSION_SPARES);

	/*
	 * Only mirror, replacing, and spare vdevs support detach.
	 */
	if (pvd->vdev_ops != &vdev_replacing_ops &&
	    pvd->vdev_ops != &vdev_mirror_ops &&
	    pvd->vdev_ops != &vdev_spare_ops)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * If this device has the only valid copy of some data,
	 * we cannot safely detach it.
	 */
	if (vdev_dtl_required(vd))
		return (spa_vdev_exit(spa, NULL, txg, EBUSY));

	ASSERT(pvd->vdev_children >= 2);

	/*
	 * If we are detaching the second disk from a replacing vdev, then
	 * check to see if we changed the original vdev's path to have "/old"
	 * at the end in spa_vdev_attach().  If so, undo that change now.
	 */
	if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
	    vd->vdev_path != NULL) {
		size_t len = strlen(vd->vdev_path);

		for (int c = 0; c < pvd->vdev_children; c++) {
			cvd = pvd->vdev_child[c];

			if (cvd == vd || cvd->vdev_path == NULL)
				continue;

			/*
			 * A sibling qualifies when its path is exactly vd's
			 * path followed by "/old".
			 */
			if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
			    strcmp(cvd->vdev_path + len, "/old") == 0) {
				spa_strfree(cvd->vdev_path);
				cvd->vdev_path = spa_strdup(vd->vdev_path);
				break;
			}
		}
	}

	/*
	 * If we are detaching the original disk from a normal spare, then it
	 * implies that the spare should become a real disk, and be removed
	 * from the active spare list for the pool.  dRAID spares on the
	 * other hand are coupled to the pool and thus should never be removed
	 * from the spares list.
	 */
	if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) {
		vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1];

		if (last_cvd->vdev_isspare &&
		    last_cvd->vdev_ops != &vdev_draid_spare_ops) {
			unspare = B_TRUE;
		}
	}

	/*
	 * Erase the disk labels so the disk can be used for other things.
	 * This must be done after all other error cases are handled,
	 * but before we disembowel vd (so we can still do I/O to it).
	 * But if we can't do it, don't treat the error as fatal --
	 * it may be that the unwritability of the disk is the reason
	 * it's being detached!
	 */
	(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);

	/*
	 * Remove vd from its parent and compact the parent's children.
	 */
	vdev_remove_child(pvd, vd);
	vdev_compact_children(pvd);

	/*
	 * Remember one of the remaining children so we can get tvd below.
	 */
	cvd = pvd->vdev_child[pvd->vdev_children - 1];

	/*
	 * If we need to remove the remaining child from the list of hot spares,
	 * do it now, marking the vdev as no longer a spare in the process.
	 * We must do this before vdev_remove_parent(), because that can
	 * change the GUID if it creates a new toplevel GUID.  For a similar
	 * reason, we must remove the spare now, in the same txg as the detach;
	 * otherwise someone could attach a new sibling, change the GUID, and
	 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
	 */
	if (unspare) {
		ASSERT(cvd->vdev_isspare);
		spa_spare_remove(cvd);
		unspare_guid = cvd->vdev_guid;
		(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
		cvd->vdev_unspare = B_TRUE;
	}

	/*
	 * If the parent mirror/replacing vdev only has one child,
	 * the parent is no longer needed.  Remove it from the tree.
	 */
	if (pvd->vdev_children == 1) {
		if (pvd->vdev_ops == &vdev_spare_ops)
			cvd->vdev_unspare = B_FALSE;
		vdev_remove_parent(cvd);
	}

	/*
	 * We don't set tvd until now because the parent we just removed
	 * may have been the previous top-level vdev.
	 */
	tvd = cvd->vdev_top;
	ASSERT(tvd->vdev_parent == rvd);

	/*
	 * Reevaluate the parent vdev state.
	 */
	vdev_propagate_state(cvd);

	/*
	 * If the 'autoexpand' property is set on the pool then automatically
	 * try to expand the size of the pool. For example if the device we
	 * just detached was smaller than the others, it may be possible to
	 * add metaslabs (i.e. grow the pool).  We need to reopen the vdev
	 * first so that we can obtain the updated sizes of the leaf vdevs.
	 */
	if (spa->spa_autoexpand) {
		vdev_reopen(tvd);
		vdev_expand(tvd, txg);
	}

	vdev_config_dirty(tvd);

	/*
	 * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
	 * vd->vdev_detached is set and free vd's DTL object in syncing context.
	 * But first make sure we're not on any *other* txg's DTL list, to
	 * prevent vd from being accessed after it's freed.
	 */
	/* Copy the path now: vd is handed to spa_vdev_exit() below. */
	vdpath = spa_strdup(vd->vdev_path ? vd->vdev_path : "none");
	for (int t = 0; t < TXG_SIZE; t++)
		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
	vd->vdev_detached = B_TRUE;
	vdev_dirty(tvd, VDD_DTL, vd, txg);

	spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE);
	spa_notify_waiters(spa);

	/* hang on to the spa before we release the lock */
	spa_open_ref(spa, FTAG);

	error = spa_vdev_exit(spa, vd, txg, 0);

	spa_history_log_internal(spa, "detach", NULL,
	    "vdev=%s", vdpath);
	spa_strfree(vdpath);

	/*
	 * If this was the removal of the original device in a hot spare vdev,
	 * then we want to go through and remove the device from the hot spare
	 * list of every other pool.
	 */
	if (unspare) {
		spa_t *altspa = NULL;

		spa_namespace_enter(FTAG);
		while ((altspa = spa_next(altspa)) != NULL) {
			if (altspa->spa_state != POOL_STATE_ACTIVE ||
			    altspa == spa)
				continue;

			/*
			 * Take a reference on altspa, then drop the
			 * namespace lock around the spa_vdev_remove() call.
			 */
			spa_open_ref(altspa, FTAG);
			spa_namespace_exit(FTAG);
			(void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
			spa_namespace_enter(FTAG);
			spa_close(altspa, FTAG);
		}
		spa_namespace_exit(FTAG);

		/* search the rest of the vdevs for spares to remove */
		spa_vdev_resilver_done(spa);
	}

	/* all done with the spa; OK to release */
	spa_namespace_enter(FTAG);
	spa_close(spa, FTAG);
	spa_namespace_exit(FTAG);

	return (error);
}
vd->vdev_path : "none"); 8325 for (int t = 0; t < TXG_SIZE; t++) 8326 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 8327 vd->vdev_detached = B_TRUE; 8328 vdev_dirty(tvd, VDD_DTL, vd, txg); 8329 8330 spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE); 8331 spa_notify_waiters(spa); 8332 8333 /* hang on to the spa before we release the lock */ 8334 spa_open_ref(spa, FTAG); 8335 8336 error = spa_vdev_exit(spa, vd, txg, 0); 8337 8338 spa_history_log_internal(spa, "detach", NULL, 8339 "vdev=%s", vdpath); 8340 spa_strfree(vdpath); 8341 8342 /* 8343 * If this was the removal of the original device in a hot spare vdev, 8344 * then we want to go through and remove the device from the hot spare 8345 * list of every other pool. 8346 */ 8347 if (unspare) { 8348 spa_t *altspa = NULL; 8349 8350 spa_namespace_enter(FTAG); 8351 while ((altspa = spa_next(altspa)) != NULL) { 8352 if (altspa->spa_state != POOL_STATE_ACTIVE || 8353 altspa == spa) 8354 continue; 8355 8356 spa_open_ref(altspa, FTAG); 8357 spa_namespace_exit(FTAG); 8358 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 8359 spa_namespace_enter(FTAG); 8360 spa_close(altspa, FTAG); 8361 } 8362 spa_namespace_exit(FTAG); 8363 8364 /* search the rest of the vdevs for spares to remove */ 8365 spa_vdev_resilver_done(spa); 8366 } 8367 8368 /* all done with the spa; OK to release */ 8369 spa_namespace_enter(FTAG); 8370 spa_close(spa, FTAG); 8371 spa_namespace_exit(FTAG); 8372 8373 return (error); 8374 } 8375 8376 static int 8377 spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, 8378 list_t *vd_list) 8379 { 8380 ASSERT(spa_namespace_held()); 8381 8382 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 8383 8384 /* Look up vdev and ensure it's a leaf. 
*/ 8385 vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); 8386 if (vd == NULL || vd->vdev_detached) { 8387 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8388 return (SET_ERROR(ENODEV)); 8389 } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { 8390 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8391 return (SET_ERROR(EINVAL)); 8392 } else if (!vdev_writeable(vd)) { 8393 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8394 return (SET_ERROR(EROFS)); 8395 } 8396 mutex_enter(&vd->vdev_initialize_lock); 8397 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8398 8399 /* 8400 * When we activate an initialize action we check to see 8401 * if the vdev_initialize_thread is NULL. We do this instead 8402 * of using the vdev_initialize_state since there might be 8403 * a previous initialization process which has completed but 8404 * the thread is not exited. 8405 */ 8406 if (cmd_type == POOL_INITIALIZE_START && 8407 (vd->vdev_initialize_thread != NULL || 8408 vd->vdev_top->vdev_removing || vd->vdev_top->vdev_rz_expanding)) { 8409 mutex_exit(&vd->vdev_initialize_lock); 8410 return (SET_ERROR(EBUSY)); 8411 } else if (cmd_type == POOL_INITIALIZE_CANCEL && 8412 (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE && 8413 vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) { 8414 mutex_exit(&vd->vdev_initialize_lock); 8415 return (SET_ERROR(ESRCH)); 8416 } else if (cmd_type == POOL_INITIALIZE_SUSPEND && 8417 vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) { 8418 mutex_exit(&vd->vdev_initialize_lock); 8419 return (SET_ERROR(ESRCH)); 8420 } else if (cmd_type == POOL_INITIALIZE_UNINIT && 8421 vd->vdev_initialize_thread != NULL) { 8422 mutex_exit(&vd->vdev_initialize_lock); 8423 return (SET_ERROR(EBUSY)); 8424 } 8425 8426 switch (cmd_type) { 8427 case POOL_INITIALIZE_START: 8428 vdev_initialize(vd); 8429 break; 8430 case POOL_INITIALIZE_CANCEL: 8431 vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, vd_list); 8432 break; 8433 case 
POOL_INITIALIZE_SUSPEND: 8434 vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list); 8435 break; 8436 case POOL_INITIALIZE_UNINIT: 8437 vdev_uninitialize(vd); 8438 break; 8439 default: 8440 panic("invalid cmd_type %llu", (unsigned long long)cmd_type); 8441 } 8442 mutex_exit(&vd->vdev_initialize_lock); 8443 8444 return (0); 8445 } 8446 8447 int 8448 spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, 8449 nvlist_t *vdev_errlist) 8450 { 8451 int total_errors = 0; 8452 list_t vd_list; 8453 8454 list_create(&vd_list, sizeof (vdev_t), 8455 offsetof(vdev_t, vdev_initialize_node)); 8456 8457 /* 8458 * We hold the namespace lock through the whole function 8459 * to prevent any changes to the pool while we're starting or 8460 * stopping initialization. The config and state locks are held so that 8461 * we can properly assess the vdev state before we commit to 8462 * the initializing operation. 8463 */ 8464 spa_namespace_enter(FTAG); 8465 8466 for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); 8467 pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { 8468 uint64_t vdev_guid = fnvpair_value_uint64(pair); 8469 8470 int error = spa_vdev_initialize_impl(spa, vdev_guid, cmd_type, 8471 &vd_list); 8472 if (error != 0) { 8473 char guid_as_str[MAXNAMELEN]; 8474 8475 (void) snprintf(guid_as_str, sizeof (guid_as_str), 8476 "%llu", (unsigned long long)vdev_guid); 8477 fnvlist_add_int64(vdev_errlist, guid_as_str, error); 8478 total_errors++; 8479 } 8480 } 8481 8482 /* Wait for all initialize threads to stop. 
*/ 8483 vdev_initialize_stop_wait(spa, &vd_list); 8484 8485 /* Sync out the initializing state */ 8486 txg_wait_synced(spa->spa_dsl_pool, 0); 8487 spa_namespace_exit(FTAG); 8488 8489 list_destroy(&vd_list); 8490 8491 return (total_errors); 8492 } 8493 8494 static int 8495 spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, 8496 uint64_t rate, boolean_t partial, boolean_t secure, list_t *vd_list) 8497 { 8498 ASSERT(spa_namespace_held()); 8499 8500 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 8501 8502 /* Look up vdev and ensure it's a leaf. */ 8503 vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); 8504 if (vd == NULL || vd->vdev_detached) { 8505 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8506 return (SET_ERROR(ENODEV)); 8507 } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { 8508 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8509 return (SET_ERROR(EINVAL)); 8510 } else if (!vdev_writeable(vd)) { 8511 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8512 return (SET_ERROR(EROFS)); 8513 } else if (!vd->vdev_has_trim) { 8514 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8515 return (SET_ERROR(EOPNOTSUPP)); 8516 } else if (secure && !vd->vdev_has_securetrim) { 8517 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8518 return (SET_ERROR(EOPNOTSUPP)); 8519 } 8520 mutex_enter(&vd->vdev_trim_lock); 8521 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8522 8523 /* 8524 * When we activate a TRIM action we check to see if the 8525 * vdev_trim_thread is NULL. We do this instead of using the 8526 * vdev_trim_state since there might be a previous TRIM process 8527 * which has completed but the thread is not exited. 
8528 */ 8529 if (cmd_type == POOL_TRIM_START && 8530 (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing || 8531 vd->vdev_top->vdev_rz_expanding)) { 8532 mutex_exit(&vd->vdev_trim_lock); 8533 return (SET_ERROR(EBUSY)); 8534 } else if (cmd_type == POOL_TRIM_CANCEL && 8535 (vd->vdev_trim_state != VDEV_TRIM_ACTIVE && 8536 vd->vdev_trim_state != VDEV_TRIM_SUSPENDED)) { 8537 mutex_exit(&vd->vdev_trim_lock); 8538 return (SET_ERROR(ESRCH)); 8539 } else if (cmd_type == POOL_TRIM_SUSPEND && 8540 vd->vdev_trim_state != VDEV_TRIM_ACTIVE) { 8541 mutex_exit(&vd->vdev_trim_lock); 8542 return (SET_ERROR(ESRCH)); 8543 } 8544 8545 switch (cmd_type) { 8546 case POOL_TRIM_START: 8547 vdev_trim(vd, rate, partial, secure); 8548 break; 8549 case POOL_TRIM_CANCEL: 8550 vdev_trim_stop(vd, VDEV_TRIM_CANCELED, vd_list); 8551 break; 8552 case POOL_TRIM_SUSPEND: 8553 vdev_trim_stop(vd, VDEV_TRIM_SUSPENDED, vd_list); 8554 break; 8555 default: 8556 panic("invalid cmd_type %llu", (unsigned long long)cmd_type); 8557 } 8558 mutex_exit(&vd->vdev_trim_lock); 8559 8560 return (0); 8561 } 8562 8563 /* 8564 * Initiates a manual TRIM for the requested vdevs. This kicks off individual 8565 * TRIM threads for each child vdev. These threads pass over all of the free 8566 * space in the vdev's metaslabs and issues TRIM commands for that space. 8567 */ 8568 int 8569 spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate, 8570 boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist) 8571 { 8572 int total_errors = 0; 8573 list_t vd_list; 8574 8575 list_create(&vd_list, sizeof (vdev_t), 8576 offsetof(vdev_t, vdev_trim_node)); 8577 8578 /* 8579 * We hold the namespace lock through the whole function 8580 * to prevent any changes to the pool while we're starting or 8581 * stopping TRIM. The config and state locks are held so that 8582 * we can properly assess the vdev state before we commit to 8583 * the TRIM operation. 
8584 */ 8585 spa_namespace_enter(FTAG); 8586 8587 for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); 8588 pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { 8589 uint64_t vdev_guid = fnvpair_value_uint64(pair); 8590 8591 int error = spa_vdev_trim_impl(spa, vdev_guid, cmd_type, 8592 rate, partial, secure, &vd_list); 8593 if (error != 0) { 8594 char guid_as_str[MAXNAMELEN]; 8595 8596 (void) snprintf(guid_as_str, sizeof (guid_as_str), 8597 "%llu", (unsigned long long)vdev_guid); 8598 fnvlist_add_int64(vdev_errlist, guid_as_str, error); 8599 total_errors++; 8600 } 8601 } 8602 8603 /* Wait for all TRIM threads to stop. */ 8604 vdev_trim_stop_wait(spa, &vd_list); 8605 8606 /* Sync out the TRIM state */ 8607 txg_wait_synced(spa->spa_dsl_pool, 0); 8608 spa_namespace_exit(FTAG); 8609 8610 list_destroy(&vd_list); 8611 8612 return (total_errors); 8613 } 8614 8615 /* 8616 * Split a set of devices from their mirrors, and create a new pool from them. 8617 */ 8618 int 8619 spa_vdev_split_mirror(spa_t *spa, const char *newname, nvlist_t *config, 8620 nvlist_t *props, boolean_t exp) 8621 { 8622 int error = 0; 8623 uint64_t txg, *glist; 8624 spa_t *newspa; 8625 uint_t c, children, lastlog; 8626 nvlist_t **child, *nvl, *tmp; 8627 dmu_tx_t *tx; 8628 const char *altroot = NULL; 8629 vdev_t *rvd, **vml = NULL; /* vdev modify list */ 8630 boolean_t activate_slog; 8631 8632 ASSERT(spa_writeable(spa)); 8633 8634 txg = spa_vdev_enter(spa); 8635 8636 ASSERT(spa_namespace_held()); 8637 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 8638 error = (spa_has_checkpoint(spa)) ? 
8639 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 8640 return (spa_vdev_exit(spa, NULL, txg, error)); 8641 } 8642 8643 /* clear the log and flush everything up to now */ 8644 activate_slog = spa_passivate_log(spa); 8645 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 8646 error = spa_reset_logs(spa); 8647 txg = spa_vdev_config_enter(spa); 8648 8649 if (activate_slog) 8650 spa_activate_log(spa); 8651 8652 if (error != 0) 8653 return (spa_vdev_exit(spa, NULL, txg, error)); 8654 8655 /* check new spa name before going any further */ 8656 if (spa_lookup(newname) != NULL) 8657 return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 8658 8659 /* 8660 * scan through all the children to ensure they're all mirrors 8661 */ 8662 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 8663 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 8664 &children) != 0) 8665 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 8666 8667 /* first, check to ensure we've got the right child count */ 8668 rvd = spa->spa_root_vdev; 8669 lastlog = 0; 8670 for (c = 0; c < rvd->vdev_children; c++) { 8671 vdev_t *vd = rvd->vdev_child[c]; 8672 8673 /* don't count the holes & logs as children */ 8674 if (vd->vdev_islog || (vd->vdev_ops != &vdev_indirect_ops && 8675 !vdev_is_concrete(vd))) { 8676 if (lastlog == 0) 8677 lastlog = c; 8678 continue; 8679 } 8680 8681 lastlog = 0; 8682 } 8683 if (children != (lastlog != 0 ? 
lastlog : rvd->vdev_children)) 8684 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 8685 8686 /* next, ensure no spare or cache devices are part of the split */ 8687 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 8688 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 8689 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 8690 8691 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 8692 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 8693 8694 /* then, loop over each vdev and validate it */ 8695 for (c = 0; c < children; c++) { 8696 uint64_t is_hole = 0; 8697 8698 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 8699 &is_hole); 8700 8701 if (is_hole != 0) { 8702 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 8703 spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 8704 continue; 8705 } else { 8706 error = SET_ERROR(EINVAL); 8707 break; 8708 } 8709 } 8710 8711 /* deal with indirect vdevs */ 8712 if (spa->spa_root_vdev->vdev_child[c]->vdev_ops == 8713 &vdev_indirect_ops) 8714 continue; 8715 8716 /* which disk is going to be split? 
*/ 8717 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 8718 &glist[c]) != 0) { 8719 error = SET_ERROR(EINVAL); 8720 break; 8721 } 8722 8723 /* look it up in the spa */ 8724 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 8725 if (vml[c] == NULL) { 8726 error = SET_ERROR(ENODEV); 8727 break; 8728 } 8729 8730 /* make sure there's nothing stopping the split */ 8731 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 8732 vml[c]->vdev_islog || 8733 !vdev_is_concrete(vml[c]) || 8734 vml[c]->vdev_isspare || 8735 vml[c]->vdev_isl2cache || 8736 !vdev_writeable(vml[c]) || 8737 vml[c]->vdev_children != 0 || 8738 vml[c]->vdev_state != VDEV_STATE_HEALTHY || 8739 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 8740 error = SET_ERROR(EINVAL); 8741 break; 8742 } 8743 8744 if (vdev_dtl_required(vml[c]) || 8745 vdev_resilver_needed(vml[c], NULL, NULL)) { 8746 error = SET_ERROR(EBUSY); 8747 break; 8748 } 8749 8750 /* we need certain info from the top level */ 8751 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 8752 vml[c]->vdev_top->vdev_ms_array); 8753 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 8754 vml[c]->vdev_top->vdev_ms_shift); 8755 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 8756 vml[c]->vdev_top->vdev_asize); 8757 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 8758 vml[c]->vdev_top->vdev_ashift); 8759 8760 /* transfer per-vdev ZAPs */ 8761 ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); 8762 VERIFY0(nvlist_add_uint64(child[c], 8763 ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); 8764 8765 ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); 8766 VERIFY0(nvlist_add_uint64(child[c], 8767 ZPOOL_CONFIG_VDEV_TOP_ZAP, 8768 vml[c]->vdev_parent->vdev_top_zap)); 8769 } 8770 8771 if (error != 0) { 8772 kmem_free(vml, children * sizeof (vdev_t *)); 8773 kmem_free(glist, children * sizeof (uint64_t)); 8774 return (spa_vdev_exit(spa, NULL, txg, error)); 8775 } 8776 8777 /* stop writers from using the disks */ 8778 for (c = 0; c < children; 
c++) { 8779 if (vml[c] != NULL) 8780 vml[c]->vdev_offline = B_TRUE; 8781 } 8782 vdev_reopen(spa->spa_root_vdev); 8783 8784 /* 8785 * Temporarily record the splitting vdevs in the spa config. This 8786 * will disappear once the config is regenerated. 8787 */ 8788 nvl = fnvlist_alloc(); 8789 fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, glist, children); 8790 kmem_free(glist, children * sizeof (uint64_t)); 8791 8792 mutex_enter(&spa->spa_props_lock); 8793 fnvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, nvl); 8794 mutex_exit(&spa->spa_props_lock); 8795 spa->spa_config_splitting = nvl; 8796 vdev_config_dirty(spa->spa_root_vdev); 8797 8798 /* configure and create the new pool */ 8799 fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname); 8800 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 8801 exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE); 8802 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)); 8803 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg); 8804 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 8805 spa_generate_guid(NULL)); 8806 VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 8807 (void) nvlist_lookup_string(props, 8808 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 8809 8810 /* add the new pool to the namespace */ 8811 newspa = spa_add(newname, config, altroot); 8812 newspa->spa_avz_action = AVZ_ACTION_REBUILD; 8813 newspa->spa_config_txg = spa->spa_config_txg; 8814 spa_set_log_state(newspa, SPA_LOG_CLEAR); 8815 8816 /* release the spa config lock, retaining the namespace lock */ 8817 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 8818 8819 if (zio_injection_enabled) 8820 zio_handle_panic_injection(spa, FTAG, 1); 8821 8822 spa_activate(newspa, spa_mode_global); 8823 spa_async_suspend(newspa); 8824 8825 /* 8826 * Temporarily stop the initializing and TRIM activity. 
We set the 8827 * state to ACTIVE so that we know to resume initializing or TRIM 8828 * once the split has completed. 8829 */ 8830 list_t vd_initialize_list; 8831 list_create(&vd_initialize_list, sizeof (vdev_t), 8832 offsetof(vdev_t, vdev_initialize_node)); 8833 8834 list_t vd_trim_list; 8835 list_create(&vd_trim_list, sizeof (vdev_t), 8836 offsetof(vdev_t, vdev_trim_node)); 8837 8838 for (c = 0; c < children; c++) { 8839 if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { 8840 mutex_enter(&vml[c]->vdev_initialize_lock); 8841 vdev_initialize_stop(vml[c], 8842 VDEV_INITIALIZE_ACTIVE, &vd_initialize_list); 8843 mutex_exit(&vml[c]->vdev_initialize_lock); 8844 8845 mutex_enter(&vml[c]->vdev_trim_lock); 8846 vdev_trim_stop(vml[c], VDEV_TRIM_ACTIVE, &vd_trim_list); 8847 mutex_exit(&vml[c]->vdev_trim_lock); 8848 } 8849 } 8850 8851 vdev_initialize_stop_wait(spa, &vd_initialize_list); 8852 vdev_trim_stop_wait(spa, &vd_trim_list); 8853 8854 list_destroy(&vd_initialize_list); 8855 list_destroy(&vd_trim_list); 8856 8857 newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT; 8858 newspa->spa_is_splitting = B_TRUE; 8859 8860 /* create the new pool from the disks of the original pool */ 8861 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE); 8862 if (error) 8863 goto out; 8864 8865 /* if that worked, generate a real config for the new pool */ 8866 if (newspa->spa_root_vdev != NULL) { 8867 newspa->spa_config_splitting = fnvlist_alloc(); 8868 fnvlist_add_uint64(newspa->spa_config_splitting, 8869 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)); 8870 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 8871 B_TRUE)); 8872 } 8873 8874 /* set the props */ 8875 if (props != NULL) { 8876 spa_configfile_set(newspa, props, B_FALSE); 8877 error = spa_prop_set(newspa, props); 8878 if (error) 8879 goto out; 8880 } 8881 8882 /* flush everything */ 8883 txg = spa_vdev_config_enter(newspa); 8884 vdev_config_dirty(newspa->spa_root_vdev); 8885 (void) 
spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 8886 8887 if (zio_injection_enabled) 8888 zio_handle_panic_injection(spa, FTAG, 2); 8889 8890 spa_async_resume(newspa); 8891 8892 /* finally, update the original pool's config */ 8893 txg = spa_vdev_config_enter(spa); 8894 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 8895 error = dmu_tx_assign(tx, DMU_TX_WAIT); 8896 if (error != 0) 8897 dmu_tx_abort(tx); 8898 for (c = 0; c < children; c++) { 8899 if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { 8900 vdev_t *tvd = vml[c]->vdev_top; 8901 8902 /* 8903 * Need to be sure the detachable VDEV is not 8904 * on any *other* txg's DTL list to prevent it 8905 * from being accessed after it's freed. 8906 */ 8907 for (int t = 0; t < TXG_SIZE; t++) { 8908 (void) txg_list_remove_this( 8909 &tvd->vdev_dtl_list, vml[c], t); 8910 } 8911 8912 vdev_split(vml[c]); 8913 if (error == 0) 8914 spa_history_log_internal(spa, "detach", tx, 8915 "vdev=%s", vml[c]->vdev_path); 8916 8917 vdev_free(vml[c]); 8918 } 8919 } 8920 spa->spa_avz_action = AVZ_ACTION_REBUILD; 8921 vdev_config_dirty(spa->spa_root_vdev); 8922 spa->spa_config_splitting = NULL; 8923 nvlist_free(nvl); 8924 if (error == 0) 8925 dmu_tx_commit(tx); 8926 (void) spa_vdev_exit(spa, NULL, txg, 0); 8927 8928 if (zio_injection_enabled) 8929 zio_handle_panic_injection(spa, FTAG, 3); 8930 8931 /* split is complete; log a history record */ 8932 spa_history_log_internal(newspa, "split", NULL, 8933 "from pool %s", spa_name(spa)); 8934 8935 newspa->spa_is_splitting = B_FALSE; 8936 kmem_free(vml, children * sizeof (vdev_t *)); 8937 8938 /* if we're not going to mount the filesystems in userland, export */ 8939 if (exp) 8940 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 8941 B_FALSE, B_FALSE); 8942 8943 return (error); 8944 8945 out: 8946 spa_unload(newspa); 8947 spa_deactivate(newspa); 8948 spa_remove(newspa); 8949 8950 txg = spa_vdev_config_enter(spa); 8951 8952 /* re-online all offlined disks */ 8953 for 
(c = 0; c < children; c++) { 8954 if (vml[c] != NULL) 8955 vml[c]->vdev_offline = B_FALSE; 8956 } 8957 8958 /* restart initializing or trimming disks as necessary */ 8959 spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); 8960 spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); 8961 spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); 8962 8963 vdev_reopen(spa->spa_root_vdev); 8964 8965 nvlist_free(spa->spa_config_splitting); 8966 spa->spa_config_splitting = NULL; 8967 (void) spa_vdev_exit(spa, NULL, txg, error); 8968 8969 kmem_free(vml, children * sizeof (vdev_t *)); 8970 return (error); 8971 } 8972 8973 /* 8974 * Find any device that's done replacing, or a vdev marked 'unspare' that's 8975 * currently spared, so we can detach it. 8976 */ 8977 static vdev_t * 8978 spa_vdev_resilver_done_hunt(vdev_t *vd) 8979 { 8980 vdev_t *newvd, *oldvd; 8981 8982 for (int c = 0; c < vd->vdev_children; c++) { 8983 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 8984 if (oldvd != NULL) 8985 return (oldvd); 8986 } 8987 8988 /* 8989 * Check for a completed replacement. We always consider the first 8990 * vdev in the list to be the oldest vdev, and the last one to be 8991 * the newest (see spa_vdev_attach() for how that works). In 8992 * the case where the newest vdev is faulted, we will not automatically 8993 * remove it after a resilver completes. This is OK as it will require 8994 * user intervention to determine which disk the admin wishes to keep. 8995 */ 8996 if (vd->vdev_ops == &vdev_replacing_ops) { 8997 ASSERT(vd->vdev_children > 1); 8998 8999 newvd = vd->vdev_child[vd->vdev_children - 1]; 9000 oldvd = vd->vdev_child[0]; 9001 9002 if (vdev_dtl_empty(newvd, DTL_MISSING) && 9003 vdev_dtl_empty(newvd, DTL_OUTAGE) && 9004 !vdev_dtl_required(oldvd)) 9005 return (oldvd); 9006 } 9007 9008 /* 9009 * Check for a completed resilver with the 'unspare' flag set. 9010 * Also potentially update faulted state. 
9011 */ 9012 if (vd->vdev_ops == &vdev_spare_ops) { 9013 vdev_t *first = vd->vdev_child[0]; 9014 vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 9015 9016 if (last->vdev_unspare) { 9017 oldvd = first; 9018 newvd = last; 9019 } else if (first->vdev_unspare) { 9020 oldvd = last; 9021 newvd = first; 9022 } else { 9023 oldvd = NULL; 9024 } 9025 9026 if (oldvd != NULL && 9027 vdev_dtl_empty(newvd, DTL_MISSING) && 9028 vdev_dtl_empty(newvd, DTL_OUTAGE) && 9029 !vdev_dtl_required(oldvd)) 9030 return (oldvd); 9031 9032 vdev_propagate_state(vd); 9033 9034 /* 9035 * If there are more than two spares attached to a disk, 9036 * and those spares are not required, then we want to 9037 * attempt to free them up now so that they can be used 9038 * by other pools. Once we're back down to a single 9039 * disk+spare, we stop removing them. 9040 */ 9041 if (vd->vdev_children > 2) { 9042 newvd = vd->vdev_child[1]; 9043 9044 if (newvd->vdev_isspare && last->vdev_isspare && 9045 vdev_dtl_empty(last, DTL_MISSING) && 9046 vdev_dtl_empty(last, DTL_OUTAGE) && 9047 !vdev_dtl_required(newvd)) 9048 return (newvd); 9049 } 9050 } 9051 9052 return (NULL); 9053 } 9054 9055 static void 9056 spa_vdev_resilver_done(spa_t *spa) 9057 { 9058 vdev_t *vd, *pvd, *ppvd; 9059 uint64_t guid, sguid, pguid, ppguid; 9060 9061 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 9062 9063 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 9064 pvd = vd->vdev_parent; 9065 ppvd = pvd->vdev_parent; 9066 guid = vd->vdev_guid; 9067 pguid = pvd->vdev_guid; 9068 ppguid = ppvd->vdev_guid; 9069 sguid = 0; 9070 /* 9071 * If we have just finished replacing a hot spared device, then 9072 * we need to detach the parent's first child (the original hot 9073 * spare) as well. 
9074 */ 9075 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 9076 ppvd->vdev_children == 2) { 9077 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 9078 sguid = ppvd->vdev_child[1]->vdev_guid; 9079 } 9080 ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); 9081 9082 spa_config_exit(spa, SCL_ALL, FTAG); 9083 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 9084 return; 9085 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 9086 return; 9087 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 9088 } 9089 9090 spa_config_exit(spa, SCL_ALL, FTAG); 9091 9092 /* 9093 * If a detach was not performed above replace waiters will not have 9094 * been notified. In which case we must do so now. 9095 */ 9096 spa_notify_waiters(spa); 9097 } 9098 9099 /* 9100 * Update the stored path or FRU for this vdev. 9101 */ 9102 static int 9103 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 9104 boolean_t ispath) 9105 { 9106 vdev_t *vd; 9107 boolean_t sync = B_FALSE; 9108 9109 ASSERT(spa_writeable(spa)); 9110 9111 spa_vdev_state_enter(spa, SCL_ALL); 9112 9113 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 9114 return (spa_vdev_state_exit(spa, NULL, ENOENT)); 9115 9116 if (!vd->vdev_ops->vdev_op_leaf) 9117 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 9118 9119 if (ispath) { 9120 if (strcmp(value, vd->vdev_path) != 0) { 9121 spa_strfree(vd->vdev_path); 9122 vd->vdev_path = spa_strdup(value); 9123 sync = B_TRUE; 9124 } 9125 } else { 9126 if (vd->vdev_fru == NULL) { 9127 vd->vdev_fru = spa_strdup(value); 9128 sync = B_TRUE; 9129 } else if (strcmp(value, vd->vdev_fru) != 0) { 9130 spa_strfree(vd->vdev_fru); 9131 vd->vdev_fru = spa_strdup(value); 9132 sync = B_TRUE; 9133 } 9134 } 9135 9136 return (spa_vdev_state_exit(spa, sync ? 
vd : NULL, 0)); 9137 } 9138 9139 int 9140 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 9141 { 9142 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 9143 } 9144 9145 int 9146 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 9147 { 9148 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 9149 } 9150 9151 /* 9152 * ========================================================================== 9153 * SPA Scanning 9154 * ========================================================================== 9155 */ 9156 int 9157 spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd) 9158 { 9159 ASSERT0(spa_config_held(spa, SCL_ALL, RW_WRITER)); 9160 9161 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 9162 return (SET_ERROR(EBUSY)); 9163 9164 return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd)); 9165 } 9166 9167 int 9168 spa_scan_stop(spa_t *spa) 9169 { 9170 ASSERT0(spa_config_held(spa, SCL_ALL, RW_WRITER)); 9171 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 9172 return (SET_ERROR(EBUSY)); 9173 9174 return (dsl_scan_cancel(spa->spa_dsl_pool)); 9175 } 9176 9177 int 9178 spa_scan(spa_t *spa, pool_scan_func_t func) 9179 { 9180 return (spa_scan_range(spa, func, 0, 0)); 9181 } 9182 9183 int 9184 spa_scan_range(spa_t *spa, pool_scan_func_t func, uint64_t txgstart, 9185 uint64_t txgend) 9186 { 9187 ASSERT0(spa_config_held(spa, SCL_ALL, RW_WRITER)); 9188 9189 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 9190 return (SET_ERROR(ENOTSUP)); 9191 9192 if (func == POOL_SCAN_RESILVER && 9193 !spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) 9194 return (SET_ERROR(ENOTSUP)); 9195 9196 if (func != POOL_SCAN_SCRUB && (txgstart != 0 || txgend != 0)) 9197 return (SET_ERROR(ENOTSUP)); 9198 9199 /* 9200 * If a resilver was requested, but there is no DTL on a 9201 * writeable leaf device, we have nothing to do. 
9202 */ 9203 if (func == POOL_SCAN_RESILVER && 9204 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 9205 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 9206 return (0); 9207 } 9208 9209 if (func == POOL_SCAN_ERRORSCRUB && 9210 !spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) 9211 return (SET_ERROR(ENOTSUP)); 9212 9213 return (dsl_scan(spa->spa_dsl_pool, func, txgstart, txgend)); 9214 } 9215 9216 /* 9217 * ========================================================================== 9218 * SPA async task processing 9219 * ========================================================================== 9220 */ 9221 9222 static void 9223 spa_async_remove(spa_t *spa, vdev_t *vd, boolean_t by_kernel) 9224 { 9225 if (vd->vdev_remove_wanted) { 9226 vd->vdev_remove_wanted = B_FALSE; 9227 vd->vdev_delayed_close = B_FALSE; 9228 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 9229 9230 /* 9231 * We want to clear the stats, but we don't want to do a full 9232 * vdev_clear() as that will cause us to throw away 9233 * degraded/faulted state as well as attempt to reopen the 9234 * device, all of which is a waste. 9235 */ 9236 vd->vdev_stat.vs_read_errors = 0; 9237 vd->vdev_stat.vs_write_errors = 0; 9238 vd->vdev_stat.vs_checksum_errors = 0; 9239 9240 vdev_state_dirty(vd->vdev_top); 9241 9242 /* Tell userspace that the vdev is gone. */ 9243 zfs_post_remove(spa, vd, by_kernel); 9244 } 9245 9246 for (int c = 0; c < vd->vdev_children; c++) 9247 spa_async_remove(spa, vd->vdev_child[c], by_kernel); 9248 } 9249 9250 static void 9251 spa_async_fault_vdev(vdev_t *vd, boolean_t *suspend) 9252 { 9253 if (vd->vdev_fault_wanted) { 9254 vdev_state_t newstate = VDEV_STATE_FAULTED; 9255 vd->vdev_fault_wanted = B_FALSE; 9256 9257 /* 9258 * If this device has the only valid copy of the data, then 9259 * back off and simply mark the vdev as degraded instead. 
9260 */ 9261 if (!vd->vdev_top->vdev_islog && vd->vdev_aux == NULL && 9262 vdev_dtl_required(vd)) { 9263 newstate = VDEV_STATE_DEGRADED; 9264 /* A required disk is missing so suspend the pool */ 9265 *suspend = B_TRUE; 9266 } 9267 vdev_set_state(vd, B_TRUE, newstate, VDEV_AUX_ERR_EXCEEDED); 9268 } 9269 for (int c = 0; c < vd->vdev_children; c++) 9270 spa_async_fault_vdev(vd->vdev_child[c], suspend); 9271 } 9272 9273 static void 9274 spa_async_autoexpand(spa_t *spa, vdev_t *vd) 9275 { 9276 if (!spa->spa_autoexpand) 9277 return; 9278 9279 for (int c = 0; c < vd->vdev_children; c++) { 9280 vdev_t *cvd = vd->vdev_child[c]; 9281 spa_async_autoexpand(spa, cvd); 9282 } 9283 9284 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 9285 return; 9286 9287 spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_AUTOEXPAND); 9288 } 9289 9290 static __attribute__((noreturn)) void 9291 spa_async_thread(void *arg) 9292 { 9293 spa_t *spa = (spa_t *)arg; 9294 dsl_pool_t *dp = spa->spa_dsl_pool; 9295 int tasks; 9296 9297 ASSERT(spa->spa_sync_on); 9298 9299 mutex_enter(&spa->spa_async_lock); 9300 tasks = spa->spa_async_tasks; 9301 spa->spa_async_tasks = 0; 9302 mutex_exit(&spa->spa_async_lock); 9303 9304 /* 9305 * See if the config needs to be updated. 
9306 */ 9307 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 9308 uint64_t old_space, new_space; 9309 9310 spa_namespace_enter(FTAG); 9311 old_space = metaslab_class_get_space(spa_normal_class(spa)); 9312 old_space += metaslab_class_get_space(spa_special_class(spa)); 9313 old_space += metaslab_class_get_space(spa_dedup_class(spa)); 9314 old_space += metaslab_class_get_space( 9315 spa_embedded_log_class(spa)); 9316 old_space += metaslab_class_get_space( 9317 spa_special_embedded_log_class(spa)); 9318 9319 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 9320 9321 new_space = metaslab_class_get_space(spa_normal_class(spa)); 9322 new_space += metaslab_class_get_space(spa_special_class(spa)); 9323 new_space += metaslab_class_get_space(spa_dedup_class(spa)); 9324 new_space += metaslab_class_get_space( 9325 spa_embedded_log_class(spa)); 9326 new_space += metaslab_class_get_space( 9327 spa_special_embedded_log_class(spa)); 9328 spa_namespace_exit(FTAG); 9329 9330 /* 9331 * If the pool grew as a result of the config update, 9332 * then log an internal history event. 9333 */ 9334 if (new_space != old_space) { 9335 spa_history_log_internal(spa, "vdev online", NULL, 9336 "pool '%s' size: %llu(+%llu)", 9337 spa_name(spa), (u_longlong_t)new_space, 9338 (u_longlong_t)(new_space - old_space)); 9339 } 9340 } 9341 9342 /* 9343 * See if any devices need to be marked REMOVED. 
9344 */ 9345 if (tasks & (SPA_ASYNC_REMOVE | SPA_ASYNC_REMOVE_BY_USER)) { 9346 boolean_t by_kernel = B_TRUE; 9347 if (tasks & SPA_ASYNC_REMOVE_BY_USER) 9348 by_kernel = B_FALSE; 9349 spa_vdev_state_enter(spa, SCL_NONE); 9350 spa_async_remove(spa, spa->spa_root_vdev, by_kernel); 9351 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 9352 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i], 9353 by_kernel); 9354 for (int i = 0; i < spa->spa_spares.sav_count; i++) 9355 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i], 9356 by_kernel); 9357 (void) spa_vdev_state_exit(spa, NULL, 0); 9358 } 9359 9360 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 9361 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 9362 spa_async_autoexpand(spa, spa->spa_root_vdev); 9363 spa_config_exit(spa, SCL_CONFIG, FTAG); 9364 } 9365 9366 /* 9367 * See if any devices need to be marked faulted. 9368 */ 9369 if (tasks & SPA_ASYNC_FAULT_VDEV) { 9370 spa_vdev_state_enter(spa, SCL_NONE); 9371 boolean_t suspend = B_FALSE; 9372 spa_async_fault_vdev(spa->spa_root_vdev, &suspend); 9373 (void) spa_vdev_state_exit(spa, NULL, 0); 9374 if (suspend) 9375 zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR); 9376 } 9377 9378 /* 9379 * If any devices are done replacing, detach them. 9380 */ 9381 if (tasks & SPA_ASYNC_RESILVER_DONE || 9382 tasks & SPA_ASYNC_REBUILD_DONE || 9383 tasks & SPA_ASYNC_DETACH_SPARE) { 9384 spa_vdev_resilver_done(spa); 9385 } 9386 9387 /* 9388 * Kick off a resilver. 
9389 */ 9390 if (tasks & SPA_ASYNC_RESILVER && 9391 !vdev_rebuild_active(spa->spa_root_vdev) && 9392 (!dsl_scan_resilvering(dp) || 9393 !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))) 9394 dsl_scan_restart_resilver(dp, 0); 9395 9396 if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { 9397 spa_namespace_enter(FTAG); 9398 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 9399 vdev_initialize_restart(spa->spa_root_vdev); 9400 spa_config_exit(spa, SCL_CONFIG, FTAG); 9401 spa_namespace_exit(FTAG); 9402 } 9403 9404 if (tasks & SPA_ASYNC_TRIM_RESTART) { 9405 spa_namespace_enter(FTAG); 9406 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 9407 vdev_trim_restart(spa->spa_root_vdev); 9408 spa_config_exit(spa, SCL_CONFIG, FTAG); 9409 spa_namespace_exit(FTAG); 9410 } 9411 9412 if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) { 9413 spa_namespace_enter(FTAG); 9414 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 9415 vdev_autotrim_restart(spa); 9416 spa_config_exit(spa, SCL_CONFIG, FTAG); 9417 spa_namespace_exit(FTAG); 9418 } 9419 9420 /* 9421 * Kick off L2 cache whole device TRIM. 9422 */ 9423 if (tasks & SPA_ASYNC_L2CACHE_TRIM) { 9424 spa_namespace_enter(FTAG); 9425 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 9426 vdev_trim_l2arc(spa); 9427 spa_config_exit(spa, SCL_CONFIG, FTAG); 9428 spa_namespace_exit(FTAG); 9429 } 9430 9431 /* 9432 * Kick off L2 cache rebuilding. 9433 */ 9434 if (tasks & SPA_ASYNC_L2CACHE_REBUILD) { 9435 spa_namespace_enter(FTAG); 9436 spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER); 9437 l2arc_spa_rebuild_start(spa); 9438 spa_config_exit(spa, SCL_L2ARC, FTAG); 9439 spa_namespace_exit(FTAG); 9440 } 9441 9442 /* 9443 * Let the world know that we're done. 
9444 */ 9445 mutex_enter(&spa->spa_async_lock); 9446 spa->spa_async_thread = NULL; 9447 cv_broadcast(&spa->spa_async_cv); 9448 mutex_exit(&spa->spa_async_lock); 9449 thread_exit(); 9450 } 9451 9452 void 9453 spa_async_suspend(spa_t *spa) 9454 { 9455 mutex_enter(&spa->spa_async_lock); 9456 spa->spa_async_suspended++; 9457 while (spa->spa_async_thread != NULL) 9458 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 9459 mutex_exit(&spa->spa_async_lock); 9460 9461 spa_vdev_remove_suspend(spa); 9462 9463 zthr_t *condense_thread = spa->spa_condense_zthr; 9464 if (condense_thread != NULL) 9465 zthr_cancel(condense_thread); 9466 9467 zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr; 9468 if (raidz_expand_thread != NULL) 9469 zthr_cancel(raidz_expand_thread); 9470 9471 zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; 9472 if (discard_thread != NULL) 9473 zthr_cancel(discard_thread); 9474 9475 zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; 9476 if (ll_delete_thread != NULL) 9477 zthr_cancel(ll_delete_thread); 9478 9479 zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; 9480 if (ll_condense_thread != NULL) 9481 zthr_cancel(ll_condense_thread); 9482 } 9483 9484 void 9485 spa_async_resume(spa_t *spa) 9486 { 9487 mutex_enter(&spa->spa_async_lock); 9488 ASSERT(spa->spa_async_suspended != 0); 9489 spa->spa_async_suspended--; 9490 mutex_exit(&spa->spa_async_lock); 9491 spa_restart_removal(spa); 9492 9493 zthr_t *condense_thread = spa->spa_condense_zthr; 9494 if (condense_thread != NULL) 9495 zthr_resume(condense_thread); 9496 9497 zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr; 9498 if (raidz_expand_thread != NULL) 9499 zthr_resume(raidz_expand_thread); 9500 9501 zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; 9502 if (discard_thread != NULL) 9503 zthr_resume(discard_thread); 9504 9505 zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; 9506 if (ll_delete_thread != NULL) 9507 zthr_resume(ll_delete_thread); 9508 
9509 zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; 9510 if (ll_condense_thread != NULL) 9511 zthr_resume(ll_condense_thread); 9512 } 9513 9514 static boolean_t 9515 spa_async_tasks_pending(spa_t *spa) 9516 { 9517 uint_t non_config_tasks; 9518 uint_t config_task; 9519 boolean_t config_task_suspended; 9520 9521 non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE; 9522 config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; 9523 if (spa->spa_ccw_fail_time == 0) { 9524 config_task_suspended = B_FALSE; 9525 } else { 9526 config_task_suspended = 9527 (gethrtime() - spa->spa_ccw_fail_time) < 9528 ((hrtime_t)zfs_ccw_retry_interval * NANOSEC); 9529 } 9530 9531 return (non_config_tasks || (config_task && !config_task_suspended)); 9532 } 9533 9534 static void 9535 spa_async_dispatch(spa_t *spa) 9536 { 9537 mutex_enter(&spa->spa_async_lock); 9538 if (spa_async_tasks_pending(spa) && 9539 !spa->spa_async_suspended && 9540 spa->spa_async_thread == NULL) 9541 spa->spa_async_thread = thread_create(NULL, 0, 9542 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 9543 mutex_exit(&spa->spa_async_lock); 9544 } 9545 9546 void 9547 spa_async_request(spa_t *spa, int task) 9548 { 9549 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 9550 mutex_enter(&spa->spa_async_lock); 9551 spa->spa_async_tasks |= task; 9552 mutex_exit(&spa->spa_async_lock); 9553 } 9554 9555 int 9556 spa_async_tasks(spa_t *spa) 9557 { 9558 return (spa->spa_async_tasks); 9559 } 9560 9561 /* 9562 * ========================================================================== 9563 * SPA syncing routines 9564 * ========================================================================== 9565 */ 9566 9567 9568 static int 9569 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 9570 dmu_tx_t *tx) 9571 { 9572 bpobj_t *bpo = arg; 9573 bpobj_enqueue(bpo, bp, bp_freed, tx); 9574 return (0); 9575 } 9576 9577 int 9578 bpobj_enqueue_alloc_cb(void *arg, const blkptr_t 
*bp, dmu_tx_t *tx) 9579 { 9580 return (bpobj_enqueue_cb(arg, bp, B_FALSE, tx)); 9581 } 9582 9583 int 9584 bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 9585 { 9586 return (bpobj_enqueue_cb(arg, bp, B_TRUE, tx)); 9587 } 9588 9589 static int 9590 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 9591 { 9592 zio_t *pio = arg; 9593 9594 zio_nowait(zio_free_sync(pio, pio->io_spa, dmu_tx_get_txg(tx), bp, 9595 pio->io_flags)); 9596 return (0); 9597 } 9598 9599 static int 9600 bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 9601 dmu_tx_t *tx) 9602 { 9603 ASSERT(!bp_freed); 9604 return (spa_free_sync_cb(arg, bp, tx)); 9605 } 9606 9607 /* 9608 * Note: this simple function is not inlined to make it easier to dtrace the 9609 * amount of time spent syncing frees. 9610 */ 9611 static void 9612 spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) 9613 { 9614 zio_t *zio = zio_root(spa, NULL, NULL, 0); 9615 bplist_iterate(bpl, spa_free_sync_cb, zio, tx); 9616 VERIFY0(zio_wait(zio)); 9617 } 9618 9619 /* 9620 * Note: this simple function is not inlined to make it easier to dtrace the 9621 * amount of time spent syncing deferred frees. 9622 */ 9623 static void 9624 spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) 9625 { 9626 if (spa_sync_pass(spa) != 1) 9627 return; 9628 9629 /* 9630 * Note: 9631 * If the log space map feature is active, we stop deferring 9632 * frees to the next TXG and therefore running this function 9633 * would be considered a no-op as spa_deferred_bpobj should 9634 * not have any entries. 9635 * 9636 * That said we run this function anyway (instead of returning 9637 * immediately) for the edge-case scenario where we just 9638 * activated the log space map feature in this TXG but we have 9639 * deferred frees from the previous TXG. 
*/
	/*
	 * Issue a free for every block in the deferred bpobj as a child of
	 * one root zio, then wait for all of them; any failure is fatal.
	 */
	zio_t *zio = zio_root(spa, NULL, NULL, 0);
	VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
	    bpobj_spa_free_sync_cb, zio, tx), ==, 0);
	VERIFY0(zio_wait(zio));
}

/*
 * Write the nvlist nv into MOS object obj.
 *
 * The list is packed with XDR encoding into a buffer rounded up to a
 * multiple of SPA_CONFIG_BLOCKSIZE (the padding is zeroed), written at
 * offset 0 of the object, and the unpadded packed size is stored in the
 * object's bonus buffer.  All failures are fatal (VERIFY0).
 */
static void
spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
{
	char *packed = NULL;
	size_t bufsize;
	size_t nvsize = 0;
	dmu_buf_t *db;

	VERIFY0(nvlist_size(nv, &nvsize, NV_ENCODE_XDR));

	/*
	 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
	 * information.  This avoids the dmu_buf_will_dirty() path and
	 * saves us a pre-read to get data we don't actually care about.
	 */
	bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
	packed = vmem_alloc(bufsize, KM_SLEEP);

	/* Pack into our own buffer, then zero the tail beyond nvsize. */
	VERIFY0(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
	    KM_SLEEP));
	memset(packed + nvsize, 0, bufsize - nvsize);

	dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx,
	    DMU_READ_NO_PREFETCH);

	vmem_free(packed, bufsize);

	/* Record the real (unpadded) packed length in the bonus buffer. */
	VERIFY0(dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	*(uint64_t *)db->db_data = nvsize;
	dmu_buf_rele(db, FTAG);
}

/*
 * Sync the list of auxiliary devices described by sav to the MOS.
 *
 * Does nothing unless sav->sav_sync is set.  On first use the backing
 * packed-nvlist object is allocated and linked into the pool directory
 * under the name given by entry.  The object is then rewritten with an
 * nvlist whose only member is an nvlist array named config, holding one
 * generated vdev config per aux device (an empty array when there are
 * none).  sav_sync is cleared on the way out.
 */
static void
spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
    const char *config, const char *entry)
{
	nvlist_t *nvroot;
	nvlist_t **list;
	int i;

	if (!sav->sav_sync)
		return;

	/*
	 * Update the MOS nvlist describing the list of available devices.
	 * spa_validate_aux() will have already made sure this nvlist is
	 * valid and the vdevs are labeled appropriately.
	 */
	if (sav->sav_object == 0) {
		sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
		    sizeof (uint64_t), tx);
		VERIFY(zap_update(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
		    &sav->sav_object, tx) == 0);
	}

	nvroot = fnvlist_alloc();
	if (sav->sav_count == 0) {
		fnvlist_add_nvlist_array(nvroot, config,
		    (const nvlist_t * const *)NULL, 0);
	} else {
		/*
		 * Build a temporary array of per-device configs, add it to
		 * nvroot, then free each element and the array itself.
		 */
		list = kmem_alloc(sav->sav_count*sizeof (void *), KM_SLEEP);
		for (i = 0; i < sav->sav_count; i++)
			list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
			    B_FALSE, VDEV_CONFIG_L2CACHE);
		fnvlist_add_nvlist_array(nvroot, config,
		    (const nvlist_t * const *)list, sav->sav_count);
		for (i = 0; i < sav->sav_count; i++)
			nvlist_free(list[i]);
		kmem_free(list, sav->sav_count * sizeof (void *));
	}

	spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
	nvlist_free(nvroot);

	sav->sav_sync = B_FALSE;
}

/*
 * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t.
 * The all-vdev ZAP must be empty.
*/
static void
spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx)
{
	spa_t *spa = vd->vdev_spa;

	/* The root ZAP is only recorded when the AVZ_V2 feature is active. */
	if (vd->vdev_root_zap != 0 &&
	    spa_feature_is_active(spa, SPA_FEATURE_AVZ_V2)) {
		VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
		    vd->vdev_root_zap, tx));
	}
	if (vd->vdev_top_zap != 0) {
		VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
		    vd->vdev_top_zap, tx));
	}
	if (vd->vdev_leaf_zap != 0) {
		VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
		    vd->vdev_leaf_zap, tx));
	}
	/* Recurse over the whole subtree rooted at vd. */
	for (uint64_t i = 0; i < vd->vdev_children; i++) {
		spa_avz_build(vd->vdev_child[i], avz, tx);
	}
}

/*
 * Sync the pool configuration object to the MOS.
 *
 * Returns immediately when no vdev config is dirty and no all-vdev ZAP
 * (AVZ) action is pending.  Otherwise, under SCL_STATE as reader, it
 * carries out the pending AVZ action (rebuild or destroy), makes sure an
 * AVZ exists, creates any missing per-vdev ZAPs, generates a fresh config
 * nvlist, remembers it in spa_config_syncing (freeing the previous one)
 * and writes it to spa_config_object.
 */
static void
spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
{
	nvlist_t *config;

	/*
	 * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS,
	 * its config may not be dirty but we still need to build per-vdev ZAPs.
	 * Similarly, if the pool is being assembled (e.g. after a split), we
	 * need to rebuild the AVZ although the config may not be dirty.
	 */
	if (list_is_empty(&spa->spa_config_dirty_list) &&
	    spa->spa_avz_action == AVZ_ACTION_NONE)
		return;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

	/* Rebuild and destroy both require an existing AVZ. */
	ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE ||
	    spa->spa_avz_action == AVZ_ACTION_INITIALIZE ||
	    spa->spa_all_vdev_zaps != 0);

	if (spa->spa_avz_action == AVZ_ACTION_REBUILD) {
		/* Make and build the new AVZ */
		uint64_t new_avz = zap_create(spa->spa_meta_objset,
		    DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
		spa_avz_build(spa->spa_root_vdev, new_avz, tx);

		/* Diff old AVZ with new one */
		zap_cursor_t zc;
		zap_attribute_t *za = zap_attribute_alloc();

		for (zap_cursor_init(&zc, spa->spa_meta_objset,
		    spa->spa_all_vdev_zaps);
		    zap_cursor_retrieve(&zc, za) == 0;
		    zap_cursor_advance(&zc)) {
			uint64_t vdzap = za->za_first_integer;
			if (zap_lookup_int(spa->spa_meta_objset, new_avz,
			    vdzap) == ENOENT) {
				/*
				 * ZAP is listed in old AVZ but not in new one;
				 * destroy it
				 */
				VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap,
				    tx));
			}
		}

		zap_cursor_fini(&zc);
		zap_attribute_free(za);

		/* Destroy the old AVZ */
		VERIFY0(zap_destroy(spa->spa_meta_objset,
		    spa->spa_all_vdev_zaps, tx));

		/* Replace the old AVZ in the dir obj with the new one */
		VERIFY0(zap_update(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP,
		    sizeof (new_avz), 1, &new_avz, tx));

		spa->spa_all_vdev_zaps = new_avz;
	} else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) {
		zap_cursor_t zc;
		zap_attribute_t *za = zap_attribute_alloc();

		/* Walk through the AVZ and destroy all listed ZAPs */
		for (zap_cursor_init(&zc, spa->spa_meta_objset,
		    spa->spa_all_vdev_zaps);
		    zap_cursor_retrieve(&zc, za) == 0;
		    zap_cursor_advance(&zc)) {
			uint64_t zap = za->za_first_integer;
			VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx));
		}

		zap_cursor_fini(&zc);
		zap_attribute_free(za);

		/* Destroy and unlink the AVZ itself */
		VERIFY0(zap_destroy(spa->spa_meta_objset,
		    spa->spa_all_vdev_zaps, tx));
		VERIFY0(zap_remove(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx));
		spa->spa_all_vdev_zaps = 0;
	}

	/*
	 * Whether it never existed or was just destroyed above, make sure
	 * an AVZ is linked into the pool directory before continuing.
	 */
	if (spa->spa_all_vdev_zaps == 0) {
		spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset,
		    DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_VDEV_ZAP_MAP, tx);
	}
	spa->spa_avz_action = AVZ_ACTION_NONE;

	/* Create ZAPs for vdevs that don't have them. */
	vdev_construct_zaps(spa->spa_root_vdev, tx);

	config = spa_config_generate(spa, spa->spa_root_vdev,
	    dmu_tx_get_txg(tx), B_FALSE);

	/*
	 * If we're upgrading the spa version then make sure that
	 * the config object gets updated with the correct version.
	 */
	if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version)
		fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
		    spa->spa_uberblock.ub_version);

	spa_config_exit(spa, SCL_STATE, FTAG);

	/* The new config replaces whatever was previously being synced. */
	nvlist_free(spa->spa_config_syncing);
	spa->spa_config_syncing = config;

	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
}

/*
 * Set the pool's on-disk version.  arg points to a uint64_t holding the
 * new version, which must be supported and not lower than the current
 * one.  The version is stored in the in-core uberblock, the root vdev's
 * config is dirtied and the change is recorded in the pool history.
 */
static void
spa_sync_version(void *arg, dmu_tx_t *tx)
{
	uint64_t *versionp = arg;
	uint64_t version = *versionp;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;

	/*
	 * Setting the version is special cased when first creating the pool.
*/
	ASSERT(tx->tx_txg != TXG_INITIAL);

	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
	ASSERT(version >= spa_version(spa));

	spa->spa_uberblock.ub_version = version;
	vdev_config_dirty(spa->spa_root_vdev);
	spa_history_log_internal(spa, "set", tx, "version=%lld",
	    (longlong_t)version);
}

/*
 * Set zpool properties.
 *
 * arg is an nvlist of property name/value pairs.  Each pair is applied
 * while holding spa_props_lock:
 *  - version, altroot, readonly and cachefile are not stored here;
 *  - comment and compatibility update the in-core strings and (except at
 *    pool creation) dirty the vdev config and request a config update;
 *  - feature@... names enable the corresponding feature;
 *  - user properties and all remaining native properties are written to
 *    the pool-props ZAP (created on first use), and selected numeric ones
 *    are mirrored into the spa_t.
 */
static void
spa_sync_props(void *arg, dmu_tx_t *tx)
{
	nvlist_t *nvp = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	objset_t *mos = spa->spa_meta_objset;
	nvpair_t *elem = NULL;

	mutex_enter(&spa->spa_props_lock);

	while ((elem = nvlist_next_nvpair(nvp, elem))) {
		uint64_t intval;
		const char *strval, *fname;
		zpool_prop_t prop;
		const char *propname;
		const char *elemname = nvpair_name(elem);
		zprop_type_t proptype;
		spa_feature_t fid;

		switch (prop = zpool_name_to_prop(elemname)) {
		case ZPOOL_PROP_VERSION:
			intval = fnvpair_value_uint64(elem);
			/*
			 * The version is synced separately before other
			 * properties and should be correct by now.
			 */
			ASSERT3U(spa_version(spa), >=, intval);
			break;

		case ZPOOL_PROP_ALTROOT:
			/*
			 * 'altroot' is a non-persistent property. It should
			 * have been set temporarily at creation or import time.
			 */
			ASSERT(spa->spa_root != NULL);
			break;

		case ZPOOL_PROP_READONLY:
		case ZPOOL_PROP_CACHEFILE:
			/*
			 * 'readonly' and 'cachefile' are also non-persistent
			 * properties.
			 */
			break;
		case ZPOOL_PROP_COMMENT:
			strval = fnvpair_value_string(elem);
			if (spa->spa_comment != NULL)
				spa_strfree(spa->spa_comment);
			spa->spa_comment = spa_strdup(strval);
			/*
			 * We need to dirty the configuration on all the vdevs
			 * so that their labels get updated.  We also need to
			 * update the cache file to keep it in sync with the
			 * MOS version. It's unnecessary to do this for pool
			 * creation since the vdev's configuration has already
			 * been dirtied.
			 */
			if (tx->tx_txg != TXG_INITIAL) {
				vdev_config_dirty(spa->spa_root_vdev);
				spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
			}
			spa_history_log_internal(spa, "set", tx,
			    "%s=%s", elemname, strval);
			break;
		case ZPOOL_PROP_COMPATIBILITY:
			strval = fnvpair_value_string(elem);
			if (spa->spa_compatibility != NULL)
				spa_strfree(spa->spa_compatibility);
			spa->spa_compatibility = spa_strdup(strval);
			/*
			 * Dirty the configuration on vdevs as above.
			 */
			if (tx->tx_txg != TXG_INITIAL) {
				vdev_config_dirty(spa->spa_root_vdev);
				spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
			}

			spa_history_log_internal(spa, "set", tx,
			    "%s=%s", nvpair_name(elem), strval);
			break;

		case ZPOOL_PROP_INVAL:
			/*
			 * Not a native property name: either a feature to
			 * enable, or a user property, which falls through to
			 * the generic handling below.
			 */
			if (zpool_prop_feature(elemname)) {
				/* The feature name follows the '@'. */
				fname = strchr(elemname, '@') + 1;
				VERIFY0(zfeature_lookup_name(fname, &fid));

				spa_feature_enable(spa, fid, tx);
				spa_history_log_internal(spa, "set", tx,
				    "%s=enabled", elemname);
				break;
			} else if (!zfs_prop_user(elemname)) {
				ASSERT(zpool_prop_feature(elemname));
				break;
			}
			zfs_fallthrough;
		default:
			/*
			 * Set pool property values in the poolprops mos object.
			 */
			if (spa->spa_pool_props_object == 0) {
				spa->spa_pool_props_object =
				    zap_create_link(mos, DMU_OT_POOL_PROPS,
				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
				    tx);
			}

			/* normalize the property name */
			if (prop == ZPOOL_PROP_INVAL) {
				propname = elemname;
				proptype = PROP_TYPE_STRING;
			} else {
				propname = zpool_prop_to_name(prop);
				proptype = zpool_prop_get_type(prop);
			}

			if (nvpair_type(elem) == DATA_TYPE_STRING) {
				ASSERT(proptype == PROP_TYPE_STRING);
				strval = fnvpair_value_string(elem);
				if (strlen(strval) == 0) {
					/* remove the property if value == "" */
					(void) zap_remove(mos,
					    spa->spa_pool_props_object,
					    propname, tx);
				} else {
					VERIFY0(zap_update(mos,
					    spa->spa_pool_props_object,
					    propname, 1, strlen(strval) + 1,
					    strval, tx));
				}
				spa_history_log_internal(spa, "set", tx,
				    "%s=%s", elemname, strval);
			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
				intval = fnvpair_value_uint64(elem);

				/* Index values must map to a known string. */
				if (proptype == PROP_TYPE_INDEX) {
					const char *unused;
					VERIFY0(zpool_prop_index_to_string(
					    prop, intval, &unused));
				}
				VERIFY0(zap_update(mos,
				    spa->spa_pool_props_object, propname,
				    8, 1, &intval, tx));
				spa_history_log_internal(spa, "set", tx,
				    "%s=%lld", elemname,
				    (longlong_t)intval);

				/* Mirror selected values into the spa_t. */
				switch (prop) {
				case ZPOOL_PROP_DELEGATION:
					spa->spa_delegation = intval;
					break;
				case ZPOOL_PROP_BOOTFS:
					spa->spa_bootfs = intval;
					break;
				case ZPOOL_PROP_FAILUREMODE:
					spa->spa_failmode = intval;
					break;
				case ZPOOL_PROP_AUTOTRIM:
					spa->spa_autotrim = intval;
					spa_async_request(spa,
					    SPA_ASYNC_AUTOTRIM_RESTART);
					break;
				case ZPOOL_PROP_AUTOEXPAND:
					spa->spa_autoexpand = intval;
					if (tx->tx_txg != TXG_INITIAL)
						spa_async_request(spa,
						    SPA_ASYNC_AUTOEXPAND);
					break;
				case ZPOOL_PROP_MULTIHOST:
					spa->spa_multihost = intval;
					break;
				case ZPOOL_PROP_DEDUP_TABLE_QUOTA:
					spa->spa_dedup_table_quota = intval;
					break;
				default:
					break;
				}
			} else {
				ASSERT(0); /* not allowed */
			}
		}

	}

	mutex_exit(&spa->spa_props_lock);
}

/*
 * Perform one-time upgrade on-disk changes.  spa_version() does not
 * reflect the new version this txg, so there must be no changes this
 * txg to anything that the upgrade code depends on after it executes.
 * Therefore this must be called after dsl_pool_sync() does the sync
 * tasks.
 *
 * Only runs in sync pass 1.  Each step compares the last-synced version
 * (spa_ubsync) with the version being synced (spa_uberblock) and runs
 * when the upgrade crosses the relevant version threshold.
 */
static void
spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
{
	if (spa_sync_pass(spa) != 1)
		return;

	dsl_pool_t *dp = spa->spa_dsl_pool;
	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);

	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
		dsl_pool_create_origin(dp, tx);

		/* Keeping the origin open increases spa_minref */
		spa->spa_minref += 3;
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
		dsl_pool_upgrade_clones(dp, tx);
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
		dsl_pool_upgrade_dir_clones(dp, tx);

		/* Keeping the freedir open increases spa_minref */
		spa->spa_minref += 3;
	}

	if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
		spa_feature_create_zap_objects(spa, tx);
	}

	/*
	 * LZ4_COMPRESS feature's behaviour was changed to activate_on_enable
	 * when possibility to use lz4 compression for metadata was added
	 * Old pools that have this feature enabled must be upgraded to have
	 * this feature active
	 */
	if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
		boolean_t lz4_en = spa_feature_is_enabled(spa,
		    SPA_FEATURE_LZ4_COMPRESS);
		boolean_t lz4_ac = spa_feature_is_active(spa,
		    SPA_FEATURE_LZ4_COMPRESS);

		if (lz4_en && !lz4_ac)
			spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
	}

	/*
	 * If we haven't written the salt, do so now.  Note that the
	 * feature may not be activated yet, but that's fine since
	 * the presence of this ZAP entry is backwards compatible.
	 */
	if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_CHECKSUM_SALT) == ENOENT) {
		VERIFY0(zap_add(spa->spa_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1,
		    sizeof (spa->spa_cksum_salt.zcs_bytes),
		    spa->spa_cksum_salt.zcs_bytes, tx));
	}

	rrw_exit(&dp->dp_config_rwlock, FTAG);
}

/*
 * Assertion-only consistency checks on a top-level vdev's indirect and
 * obsolete-space-map state, run at the start of syncing.  The function
 * modifies nothing.
 */
static void
vdev_indirect_state_sync_verify(vdev_t *vd)
{
	vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping;
	vdev_indirect_births_t *vib __maybe_unused = vd->vdev_indirect_births;

	/* An indirect vdev must have both a mapping and a births object. */
	if (vd->vdev_ops == &vdev_indirect_ops) {
		ASSERT(vim != NULL);
		ASSERT(vib != NULL);
	}

	uint64_t obsolete_sm_object = 0;
	ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
	if (obsolete_sm_object != 0) {
		ASSERT(vd->vdev_obsolete_sm != NULL);
		ASSERT(vd->vdev_removing ||
		    vd->vdev_ops == &vdev_indirect_ops);
		ASSERT(vdev_indirect_mapping_num_entries(vim) > 0);
		ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0);
		ASSERT3U(obsolete_sm_object, ==,
		    space_map_object(vd->vdev_obsolete_sm));
		ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=,
		    space_map_allocated(vd->vdev_obsolete_sm));
	}
	ASSERT(vd->vdev_obsolete_segments != NULL);

	/*
	 * Since frees / remaps to an indirect vdev can only
	 * happen in syncing context, the obsolete segments
	 * tree must be empty when we start syncing.
	 */
	ASSERT0(zfs_range_tree_space(vd->vdev_obsolete_segments));
}

/*
 * Set the top-level vdev's max queue depth. Evaluate each top-level's
 * async write queue depth in case it changed. The max queue depth will
 * not change in the middle of syncing out this txg.
 */
static void
spa_sync_adjust_vdev_max_queue_depth(spa_t *spa)
{
	ASSERT(spa_writeable(spa));

	/* Rebalance the normal, special and dedup classes, in that order. */
	metaslab_class_balance(spa_normal_class(spa), B_TRUE);
	metaslab_class_balance(spa_special_class(spa), B_TRUE);
	metaslab_class_balance(spa_dedup_class(spa), B_TRUE);
}

/*
 * Verify the indirect state of each top-level vdev in order and start
 * condensing the first one that vdev_indirect_should_condense() accepts.
 * The scan stops there, so at most one condense is started per call and
 * later vdevs are not verified in that case.
 */
static void
spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx)
{
	ASSERT(spa_writeable(spa));

	vdev_t *rvd = spa->spa_root_vdev;
	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *vd = rvd->vdev_child[c];
		vdev_indirect_state_sync_verify(vd);

		if (vdev_indirect_should_condense(vd)) {
			spa_condense_indirect_start_sync(vd, tx);
			break;
		}
	}
}

/*
 * Run sync passes for tx's txg until the MOS is no longer dirty.
 *
 * Every pass increments spa_sync_pass and syncs the config object, the
 * aux device lists, the error log, the DSL pool, this txg's frees, BRT,
 * DDT, scan/error-scrub state, device removal, upgrades, metaslab flushes
 * and all dirty vdevs.  The loop exits early when pass 1 turns out to
 * have changed nothing (a no-op txg).
 */
static void
spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
{
	objset_t *mos = spa->spa_meta_objset;
	dsl_pool_t *dp = spa->spa_dsl_pool;
	uint64_t txg = tx->tx_txg;
	bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];

	do {
		int pass = ++spa->spa_sync_pass;

		spa_sync_config_object(spa, tx);
		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
		spa_errlog_sync(spa, txg);
		dsl_pool_sync(dp, txg);

		if (pass < zfs_sync_pass_deferred_free ||
		    spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
			/*
			 * If the log space map feature is active we don't
			 * care about deferred frees and the deferred bpobj
			 * as the log space map should effectively have the
			 * same results (i.e. appending only to one object).
			 */
			spa_sync_frees(spa, free_bpl, tx);
		} else {
			/*
			 * We can not defer frees in pass 1, because
			 * we sync the deferred frees later in pass 1.
			 */
			ASSERT3U(pass, >, 1);
			bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb,
			    &spa->spa_deferred_bpobj, tx);
		}

		brt_sync(spa, txg);
		ddt_sync(spa, txg);
		dsl_scan_sync(dp, tx);
		dsl_errorscrub_sync(dp, tx);
		svr_sync(spa, tx);
		spa_sync_upgrades(spa, tx);

		spa_flush_metaslabs(spa, tx);

		/* Sync every vdev that was dirtied in this txg. */
		vdev_t *vd = NULL;
		while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
		    != NULL)
			vdev_sync(vd, txg);

		if (pass == 1) {
			/*
			 * dsl_pool_sync() -> dp_sync_tasks may have dirtied
			 * the config. If that happens, this txg should not
			 * be a no-op. So we must sync the config to the MOS
			 * before checking for no-op.
			 *
			 * Note that when the config is dirty, it will
			 * be written to the MOS (i.e. the MOS will be
			 * dirtied) every time we call spa_sync_config_object()
			 * in this txg.  Therefore we can't call this after
			 * dsl_pool_sync() every pass, because it would
			 * prevent us from converging, since we'd dirty
			 * the MOS every pass.
			 *
			 * Sync tasks can only be processed in pass 1, so
			 * there's no need to do this in later passes.
			 */
			spa_sync_config_object(spa, tx);
		}

		/*
		 * Note: We need to check if the MOS is dirty because we could
		 * have marked the MOS dirty without updating the uberblock
		 * (e.g. if we have sync tasks but no dirty user data). We need
		 * to check the uberblock's rootbp because it is updated if we
		 * have synced out dirty data (though in this case the MOS will
		 * most likely also be dirty due to second order effects, we
		 * don't want to rely on that here).
		 */
		if (pass == 1 &&
		    BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp) < txg &&
		    !dmu_objset_is_dirty(mos, txg)) {
			/*
			 * Nothing changed on the first pass, therefore this
			 * TXG is a no-op. Avoid syncing deferred frees, so
			 * that we can keep this TXG as a no-op.
			 */
			ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
			ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
			ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
			ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg));
			break;
		}

		spa_sync_deferred_frees(spa, tx);
	} while (dmu_objset_is_dirty(mos, txg));
}

/*
 * Rewrite the vdev configuration (which includes the uberblock) to
 * commit the transaction group.
 *
 * If there are no dirty vdevs, we sync the uberblock to a few random
 * top-level vdevs that are known to be visible in the config cache
 * (see spa_vdev_add() for a complete description).  If there *are* dirty
 * vdevs, sync the uberblock to all vdevs.
 */
static void
spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx)
{
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t txg = tx->tx_txg;

	/* Retry until the labels are written successfully. */
	for (;;) {
		int error = 0;

		/*
		 * We hold SCL_STATE to prevent vdev open/close/etc.
		 * while we're attempting to write the vdev labels.
*/
		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

		if (list_is_empty(&spa->spa_config_dirty_list)) {
			vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
			int svdcount = 0;
			int children = rvd->vdev_children;
			int c0 = random_in_range(children);

			/*
			 * Starting at a random child, collect up to
			 * SPA_SYNC_MIN_VDEVS eligible top-level vdevs.
			 */
			for (int c = 0; c < children; c++) {
				vdev_t *vd =
				    rvd->vdev_child[(c0 + c) % children];

				/* Stop when revisiting the first vdev */
				if (c > 0 && svd[0] == vd)
					break;

				/*
				 * Skip vdevs without a metaslab array, log
				 * devices and non-concrete vdevs.
				 */
				if (vd->vdev_ms_array == 0 ||
				    vd->vdev_islog ||
				    !vdev_is_concrete(vd))
					continue;

				svd[svdcount++] = vd;
				if (svdcount == SPA_SYNC_MIN_VDEVS)
					break;
			}
			error = vdev_config_sync(svd, svdcount, txg);
		} else {
			error = vdev_config_sync(rvd->vdev_child,
			    rvd->vdev_children, txg);
		}

		if (error == 0)
			spa->spa_last_synced_guid = rvd->vdev_guid;

		spa_config_exit(spa, SCL_STATE, FTAG);

		if (error == 0)
			break;
		/* The write failed: suspend the pool, wait for resume, retry. */
		zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR);
		zio_resume_wait(spa);
	}
}

/*
 * Sync the specified transaction group.  New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
 *
 * The pool must be writeable.  SCL_CONFIG is held as reader for almost
 * the whole function (it is briefly dropped and retaken as writer when
 * aux vdev state changes must be converted), and the deadman task is
 * armed for the duration of the sync work.
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	vdev_t *vd = NULL;

	VERIFY(spa_writeable(spa));

	/*
	 * Wait for i/os issued in open context that need to complete
	 * before this txg syncs.
	 */
	(void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]);
	spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL);

	/*
	 * Now that there can be no more cloning in this transaction group,
	 * but we are still before issuing frees, we can process pending BRT
	 * updates.
	 */
	brt_pending_apply(spa, txg);

	spa_sync_time_logger(spa, txg);

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	/*
	 * If there are any pending vdev state changes, convert them
	 * into config changes that go out with this transaction group.
	 */
	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
		/* Avoid holding the write lock unless actually necessary */
		if (vd->vdev_aux == NULL) {
			vdev_state_clean(vd);
			vdev_config_dirty(vd);
			continue;
		}
		/*
		 * We need the write lock here because, for aux vdevs,
		 * calling vdev_config_dirty() modifies sav_config.
		 * This is ugly and will become unnecessary when we
		 * eliminate the aux vdev wart by integrating all vdevs
		 * into the root vdev tree.
		 */
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
		/* As writer, drain the rest of the list in one go. */
		while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
			vdev_state_clean(vd);
			vdev_config_dirty(vd);
		}
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
	}
	spa_config_exit(spa, SCL_STATE, FTAG);

	dsl_pool_t *dp = spa->spa_dsl_pool;
	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);

	spa->spa_sync_starttime = gethrtime();

	/* Re-arm the deadman: cancel the old task and schedule a new one. */
	taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
	spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
	    spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
	    NSEC_TO_TICK(spa->spa_deadman_synctime));

	/*
	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
	 * set spa_deflate if we have no raid-z vdevs.
	 */
	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
		vdev_t *rvd = spa->spa_root_vdev;

		int i;
		for (i = 0; i < rvd->vdev_children; i++) {
			vd = rvd->vdev_child[i];
			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
				break;
		}
		/* The loop ran to completion: no child broke the condition. */
		if (i == rvd->vdev_children) {
			spa->spa_deflate = TRUE;
			VERIFY0(zap_add(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
		}
	}

	spa_sync_adjust_vdev_max_queue_depth(spa);

	spa_sync_condense_indirect(spa, tx);

	spa_sync_iterate_to_convergence(spa, tx);

#ifdef ZFS_DEBUG
	if (!list_is_empty(&spa->spa_config_dirty_list)) {
		/*
		 * Make sure that the number of ZAPs for all the vdevs matches
		 * the number of ZAPs in the per-vdev ZAP list. This only gets
		 * called if the config is dirty; otherwise there may be
		 * outstanding AVZ operations that weren't completed in
		 * spa_sync_config_object.
		 */
		uint64_t all_vdev_zap_entry_count;
		ASSERT0(zap_count(spa->spa_meta_objset,
		    spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
		ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
		    all_vdev_zap_entry_count);
	}
#endif

	if (spa->spa_vdev_removal != NULL) {
		ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]);
	}

	/* Write the labels and uberblock, then commit the tx. */
	spa_sync_rewrite_vdev_config(spa, tx);
	dmu_tx_commit(tx);

	/* Sync work is finished: disarm the deadman. */
	taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
	spa->spa_deadman_tqid = 0;

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	dsl_pool_sync_done(dp, txg);

	/*
	 * Update usable space statistics.
	 */
	while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
	    != NULL)
		vdev_sync_done(vd, txg);

	metaslab_class_evict_old(spa->spa_normal_class, txg);
	metaslab_class_evict_old(spa->spa_log_class, txg);
	/* Embedded log classes have only one metaslab per vdev. */
	metaslab_class_evict_old(spa->spa_special_class, txg);
	metaslab_class_evict_old(spa->spa_dedup_class, txg);

	spa_sync_close_syncing_log_sm(spa);

	spa_update_dspace(spa);

	if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON)
		vdev_autotrim_kick(spa);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));

	/* Hold here, polling, for as long as zfs_pause_spa_sync is set. */
	while (zfs_pause_spa_sync)
		delay(1);

	spa->spa_sync_pass = 0;

	/*
	 * Update the last synced uberblock here. We want to do this at
	 * the end of spa_sync() so that consumers of spa_last_synced_txg()
	 * will be guaranteed that all the processing associated with
	 * that txg has been completed.
	 */
	spa->spa_ubsync = spa->spa_uberblock;
	spa_config_exit(spa, SCL_CONFIG, FTAG);

	spa_handle_ignored_writes(spa);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}

/*
 * Sync all pools.  We don't want to hold the namespace lock across these
 * operations, so we take a reference on the spa_t and drop the lock during the
 * sync.
10586 */ 10587 void 10588 spa_sync_allpools(void) 10589 { 10590 spa_t *spa = NULL; 10591 spa_namespace_enter(FTAG); 10592 while ((spa = spa_next(spa)) != NULL) { 10593 if (spa_state(spa) != POOL_STATE_ACTIVE || 10594 !spa_writeable(spa) || spa_suspended(spa)) 10595 continue; 10596 spa_open_ref(spa, FTAG); 10597 spa_namespace_exit(FTAG); 10598 txg_wait_synced(spa_get_dsl(spa), 0); 10599 spa_namespace_enter(FTAG); 10600 spa_close(spa, FTAG); 10601 } 10602 spa_namespace_exit(FTAG); 10603 } 10604 10605 taskq_t * 10606 spa_sync_tq_create(spa_t *spa, const char *name) 10607 { 10608 kthread_t **kthreads; 10609 10610 ASSERT0P(spa->spa_sync_tq); 10611 ASSERT3S(spa->spa_alloc_count, <=, boot_ncpus); 10612 10613 /* 10614 * - do not allow more allocators than cpus. 10615 * - there may be more cpus than allocators. 10616 * - do not allow more sync taskq threads than allocators or cpus. 10617 */ 10618 int nthreads = spa->spa_alloc_count; 10619 spa->spa_syncthreads = kmem_zalloc(sizeof (spa_syncthread_info_t) * 10620 nthreads, KM_SLEEP); 10621 10622 spa->spa_sync_tq = taskq_create_synced(name, nthreads, minclsyspri, 10623 nthreads, INT_MAX, TASKQ_PREPOPULATE, &kthreads); 10624 VERIFY(spa->spa_sync_tq != NULL); 10625 VERIFY(kthreads != NULL); 10626 10627 spa_syncthread_info_t *ti = spa->spa_syncthreads; 10628 for (int i = 0; i < nthreads; i++, ti++) { 10629 ti->sti_thread = kthreads[i]; 10630 ti->sti_allocator = i; 10631 } 10632 10633 kmem_free(kthreads, sizeof (*kthreads) * nthreads); 10634 return (spa->spa_sync_tq); 10635 } 10636 10637 void 10638 spa_sync_tq_destroy(spa_t *spa) 10639 { 10640 ASSERT(spa->spa_sync_tq != NULL); 10641 10642 taskq_wait(spa->spa_sync_tq); 10643 taskq_destroy(spa->spa_sync_tq); 10644 kmem_free(spa->spa_syncthreads, 10645 sizeof (spa_syncthread_info_t) * spa->spa_alloc_count); 10646 spa->spa_sync_tq = NULL; 10647 } 10648 10649 uint_t 10650 spa_acq_allocator(spa_t *spa) 10651 { 10652 int i; 10653 10654 if (spa->spa_alloc_count == 1) 10655 return (0); 
10656 10657 mutex_enter(&spa->spa_allocs_use->sau_lock); 10658 uint_t r = spa->spa_allocs_use->sau_rotor; 10659 do { 10660 if (++r == spa->spa_alloc_count) 10661 r = 0; 10662 } while (spa->spa_allocs_use->sau_inuse[r]); 10663 spa->spa_allocs_use->sau_inuse[r] = B_TRUE; 10664 spa->spa_allocs_use->sau_rotor = r; 10665 mutex_exit(&spa->spa_allocs_use->sau_lock); 10666 10667 spa_syncthread_info_t *ti = spa->spa_syncthreads; 10668 for (i = 0; i < spa->spa_alloc_count; i++, ti++) { 10669 if (ti->sti_thread == curthread) { 10670 ti->sti_allocator = r; 10671 break; 10672 } 10673 } 10674 ASSERT3S(i, <, spa->spa_alloc_count); 10675 return (r); 10676 } 10677 10678 void 10679 spa_rel_allocator(spa_t *spa, uint_t allocator) 10680 { 10681 if (spa->spa_alloc_count > 1) 10682 spa->spa_allocs_use->sau_inuse[allocator] = B_FALSE; 10683 } 10684 10685 void 10686 spa_select_allocator(zio_t *zio) 10687 { 10688 zbookmark_phys_t *bm = &zio->io_bookmark; 10689 spa_t *spa = zio->io_spa; 10690 10691 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 10692 10693 /* 10694 * A gang block (for example) may have inherited its parent's 10695 * allocator, in which case there is nothing further to do here. 10696 */ 10697 if (ZIO_HAS_ALLOCATOR(zio)) 10698 return; 10699 10700 ASSERT(spa != NULL); 10701 ASSERT(bm != NULL); 10702 10703 /* 10704 * First try to use an allocator assigned to the syncthread, and set 10705 * the corresponding write issue taskq for the allocator. 10706 * Note, we must have an open pool to do this. 
10707 */ 10708 if (spa->spa_sync_tq != NULL) { 10709 spa_syncthread_info_t *ti = spa->spa_syncthreads; 10710 for (int i = 0; i < spa->spa_alloc_count; i++, ti++) { 10711 if (ti->sti_thread == curthread) { 10712 zio->io_allocator = ti->sti_allocator; 10713 return; 10714 } 10715 } 10716 } 10717 10718 /* 10719 * We want to try to use as many allocators as possible to help improve 10720 * performance, but we also want logically adjacent IOs to be physically 10721 * adjacent to improve sequential read performance. We chunk each object 10722 * into 2^20 block regions, and then hash based on the objset, object, 10723 * level, and region to accomplish both of these goals. 10724 */ 10725 uint64_t hv = cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level, 10726 bm->zb_blkid >> 20); 10727 10728 zio->io_allocator = (uint_t)hv % spa->spa_alloc_count; 10729 } 10730 10731 /* 10732 * ========================================================================== 10733 * Miscellaneous routines 10734 * ========================================================================== 10735 */ 10736 10737 /* 10738 * Remove all pools in the system. 10739 */ 10740 void 10741 spa_evict_all(void) 10742 { 10743 spa_t *spa; 10744 10745 /* 10746 * Remove all cached state. All pools should be closed now, 10747 * so every spa in the AVL tree should be unreferenced. 10748 */ 10749 spa_namespace_enter(FTAG); 10750 while ((spa = spa_next(NULL)) != NULL) { 10751 /* 10752 * Stop async tasks. The async thread may need to detach 10753 * a device that's been replaced, which requires grabbing 10754 * spa_namespace_lock, so we must drop it here. 
10755 */ 10756 spa_open_ref(spa, FTAG); 10757 spa_namespace_exit(FTAG); 10758 spa_async_suspend(spa); 10759 spa_namespace_enter(FTAG); 10760 spa_close(spa, FTAG); 10761 10762 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 10763 spa_unload(spa); 10764 spa_deactivate(spa); 10765 } 10766 spa_remove(spa); 10767 } 10768 spa_namespace_exit(FTAG); 10769 } 10770 10771 vdev_t * 10772 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 10773 { 10774 vdev_t *vd; 10775 int i; 10776 10777 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 10778 return (vd); 10779 10780 if (aux) { 10781 for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 10782 vd = spa->spa_l2cache.sav_vdevs[i]; 10783 if (vd->vdev_guid == guid) 10784 return (vd); 10785 } 10786 10787 for (i = 0; i < spa->spa_spares.sav_count; i++) { 10788 vd = spa->spa_spares.sav_vdevs[i]; 10789 if (vd->vdev_guid == guid) 10790 return (vd); 10791 } 10792 } 10793 10794 return (NULL); 10795 } 10796 10797 void 10798 spa_upgrade(spa_t *spa, uint64_t version) 10799 { 10800 ASSERT(spa_writeable(spa)); 10801 10802 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 10803 10804 /* 10805 * This should only be called for a non-faulted pool, and since a 10806 * future version would result in an unopenable pool, this shouldn't be 10807 * possible. 
10808 */ 10809 ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); 10810 ASSERT3U(version, >=, spa->spa_uberblock.ub_version); 10811 10812 spa->spa_uberblock.ub_version = version; 10813 vdev_config_dirty(spa->spa_root_vdev); 10814 10815 spa_config_exit(spa, SCL_ALL, FTAG); 10816 10817 txg_wait_synced(spa_get_dsl(spa), 0); 10818 } 10819 10820 static boolean_t 10821 spa_has_aux_vdev(spa_t *spa, uint64_t guid, spa_aux_vdev_t *sav) 10822 { 10823 (void) spa; 10824 int i; 10825 uint64_t vdev_guid; 10826 10827 for (i = 0; i < sav->sav_count; i++) 10828 if (sav->sav_vdevs[i]->vdev_guid == guid) 10829 return (B_TRUE); 10830 10831 for (i = 0; i < sav->sav_npending; i++) { 10832 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 10833 &vdev_guid) == 0 && vdev_guid == guid) 10834 return (B_TRUE); 10835 } 10836 10837 return (B_FALSE); 10838 } 10839 10840 boolean_t 10841 spa_has_l2cache(spa_t *spa, uint64_t guid) 10842 { 10843 return (spa_has_aux_vdev(spa, guid, &spa->spa_l2cache)); 10844 } 10845 10846 boolean_t 10847 spa_has_spare(spa_t *spa, uint64_t guid) 10848 { 10849 return (spa_has_aux_vdev(spa, guid, &spa->spa_spares)); 10850 } 10851 10852 /* 10853 * Check if a pool has an active shared spare device. 
10854 * Note: reference count of an active spare is 2, as a spare and as a replace 10855 */ 10856 static boolean_t 10857 spa_has_active_shared_spare(spa_t *spa) 10858 { 10859 int i, refcnt; 10860 uint64_t pool; 10861 spa_aux_vdev_t *sav = &spa->spa_spares; 10862 10863 for (i = 0; i < sav->sav_count; i++) { 10864 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 10865 &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 10866 refcnt > 2) 10867 return (B_TRUE); 10868 } 10869 10870 return (B_FALSE); 10871 } 10872 10873 uint64_t 10874 spa_total_metaslabs(spa_t *spa) 10875 { 10876 vdev_t *rvd = spa->spa_root_vdev; 10877 10878 uint64_t m = 0; 10879 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 10880 vdev_t *vd = rvd->vdev_child[c]; 10881 if (!vdev_is_concrete(vd)) 10882 continue; 10883 m += vd->vdev_ms_count; 10884 } 10885 return (m); 10886 } 10887 10888 /* 10889 * Notify any waiting threads that some activity has switched from being in- 10890 * progress to not-in-progress so that the thread can wake up and determine 10891 * whether it is finished waiting. 10892 */ 10893 void 10894 spa_notify_waiters(spa_t *spa) 10895 { 10896 /* 10897 * Acquiring spa_activities_lock here prevents the cv_broadcast from 10898 * happening between the waiting thread's check and cv_wait. 10899 */ 10900 mutex_enter(&spa->spa_activities_lock); 10901 cv_broadcast(&spa->spa_activities_cv); 10902 mutex_exit(&spa->spa_activities_lock); 10903 } 10904 10905 /* 10906 * Notify any waiting threads that the pool is exporting, and then block until 10907 * they are finished using the spa_t. 
 */
void
spa_wake_waiters(spa_t *spa)
{
	mutex_enter(&spa->spa_activities_lock);
	/* Waiters test this flag in their wait loop and bail out when set. */
	spa->spa_waiters_cancel = B_TRUE;
	cv_broadcast(&spa->spa_activities_cv);
	/* Each departing waiter signals spa_waiters_cv after decrementing. */
	while (spa->spa_waiters != 0)
		cv_wait(&spa->spa_waiters_cv, &spa->spa_activities_lock);
	/* Everyone has left; re-arm so later waits are not cancelled. */
	spa->spa_waiters_cancel = B_FALSE;
	mutex_exit(&spa->spa_activities_lock);
}

/*
 * Whether the vdev or any of its descendants are being initialized/trimmed.
 *
 * 'activity' must be ZPOOL_WAIT_INITIALIZE or ZPOOL_WAIT_TRIM.  The caller
 * holds the config lock (SCL_CONFIG | SCL_STATE) as reader together with
 * spa_activities_lock; the latter is briefly dropped here (see below) and
 * is held again on return.
 */
static boolean_t
spa_vdev_activity_in_progress_impl(vdev_t *vd, zpool_wait_activity_t activity)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER));
	ASSERT(MUTEX_HELD(&spa->spa_activities_lock));
	ASSERT(activity == ZPOOL_WAIT_INITIALIZE ||
	    activity == ZPOOL_WAIT_TRIM);

	/* The per-vdev lock guarding the state field examined below. */
	kmutex_t *lock = activity == ZPOOL_WAIT_INITIALIZE ?
	    &vd->vdev_initialize_lock : &vd->vdev_trim_lock;

	/*
	 * Drop spa_activities_lock so that the per-vdev lock is acquired
	 * first, then retake it before the state is examined.
	 */
	mutex_exit(&spa->spa_activities_lock);
	mutex_enter(lock);
	mutex_enter(&spa->spa_activities_lock);

	boolean_t in_progress = (activity == ZPOOL_WAIT_INITIALIZE) ?
	    (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) :
	    (vd->vdev_trim_state == VDEV_TRIM_ACTIVE);
	mutex_exit(lock);

	if (in_progress)
		return (B_TRUE);

	/* Not active here; recurse into the children, stopping on a hit. */
	for (int i = 0; i < vd->vdev_children; i++) {
		if (spa_vdev_activity_in_progress_impl(vd->vdev_child[i],
		    activity))
			return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * If use_guid is true, this checks whether the vdev specified by guid is
 * being initialized/trimmed. Otherwise, it checks whether any vdev in the pool
 * is being initialized/trimmed. The result is stored in *in_progress.
 *
 * The caller must hold spa_activities_lock, which is temporarily dropped
 * while the config lock (SCL_CONFIG | SCL_STATE, as reader) is acquired.
 * The config lock is taken and released by this function itself.
 *
 * Returns EINVAL (leaving *in_progress untouched) if use_guid is true and
 * guid does not identify a leaf vdev in the root vdev tree, and 0 otherwise.
 */
static int
spa_vdev_activity_in_progress(spa_t *spa, boolean_t use_guid, uint64_t guid,
    zpool_wait_activity_t activity, boolean_t *in_progress)
{
	/*
	 * Drop spa_activities_lock so that the config lock is acquired
	 * first, then retake it.
	 */
	mutex_exit(&spa->spa_activities_lock);
	spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
	mutex_enter(&spa->spa_activities_lock);

	vdev_t *vd;
	if (use_guid) {
		/* Aux devices are not searched; only leaf vdevs qualify. */
		vd = spa_lookup_by_guid(spa, guid, B_FALSE);
		if (vd == NULL || !vd->vdev_ops->vdev_op_leaf) {
			spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
			return (EINVAL);
		}
	} else {
		/* No guid given: check the whole tree from the root. */
		vd = spa->spa_root_vdev;
	}

	*in_progress = spa_vdev_activity_in_progress_impl(vd, activity);

	spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
	return (0);
}

/*
 * Locking for waiting threads
 * ---------------------------
 *
 * Waiting threads need a way to check whether a given activity is in progress,
 * and then, if it is, wait for it to complete. Each activity will have some
 * in-memory representation of the relevant on-disk state which can be used to
 * determine whether or not the activity is in progress. The in-memory state and
 * the locking used to protect it will be different for each activity, and may
 * not be suitable for use with a cvar (e.g., some state is protected by the
 * config lock). To allow waiting threads to wait without any races, another
 * lock, spa_activities_lock, is used.
 *
 * When the state is checked, both the activity-specific lock (if there is one)
 * and spa_activities_lock are held. In some cases, the activity-specific lock
 * is acquired explicitly (e.g. the config lock). In others, the locking is
 * internal to some check (e.g. bpobj_is_empty).
 * After checking, the waiting
 * thread releases the activity-specific lock and, if the activity is in
 * progress, then cv_waits using spa_activities_lock.
 *
 * The waiting thread is woken when another thread, one completing some
 * activity, updates the state of the activity and then calls
 * spa_notify_waiters, which will cv_broadcast. This 'completing' thread only
 * needs to hold its activity-specific lock when updating the state, and this
 * lock can (but doesn't have to) be dropped before calling spa_notify_waiters.
 *
 * Because spa_notify_waiters acquires spa_activities_lock before broadcasting,
 * and because it is held when the waiting thread checks the state of the
 * activity, it can never be the case that the completing thread both updates
 * the activity state and cv_broadcasts in between the waiting thread's check
 * and cv_wait. Thus, a waiting thread can never miss a wakeup.
 *
 * In order to prevent deadlock, when the waiting thread does its check, in some
 * cases it will temporarily drop spa_activities_lock in order to acquire the
 * activity-specific lock. The order in which spa_activities_lock and the
 * activity-specific lock are acquired in the waiting thread is determined by
 * the order in which they are acquired in the completing thread; if the
 * completing thread calls spa_notify_waiters with the activity-specific lock
 * held, then the waiting thread must also acquire the activity-specific lock
 * first.
 */

/*
 * Determine whether 'activity' is currently in progress in the pool and
 * store the answer in *in_progress.
 *
 * The caller must hold spa_activities_lock.  Some cases temporarily drop
 * and retake it in order to acquire the config lock first; it is held again
 * on return.  'use_tag' and 'tag' are only consulted for
 * ZPOOL_WAIT_INITIALIZE and ZPOOL_WAIT_TRIM, where they are passed to
 * spa_vdev_activity_in_progress() as its use_guid/guid arguments.
 *
 * Returns 0, or the error from spa_vdev_activity_in_progress().  Panics on
 * an unrecognized activity value.
 */
static int
spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity,
    boolean_t use_tag, uint64_t tag, boolean_t *in_progress)
{
	int error = 0;

	ASSERT(MUTEX_HELD(&spa->spa_activities_lock));

	switch (activity) {
	case ZPOOL_WAIT_CKPT_DISCARD:
		/*
		 * The checkpoint feature is active but the checkpoint
		 * entry is absent from the pool directory.
		 */
		*in_progress =
		    (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT) &&
		    zap_contains(spa_meta_objset(spa),
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT) ==
		    ENOENT);
		break;
	case ZPOOL_WAIT_FREE:
		/* Any of three sources of outstanding frees counts. */
		*in_progress = ((spa_version(spa) >= SPA_VERSION_DEADLISTS &&
		    !bpobj_is_empty(&spa->spa_dsl_pool->dp_free_bpobj)) ||
		    spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY) ||
		    spa_livelist_delete_check(spa));
		break;
	case ZPOOL_WAIT_INITIALIZE:
	case ZPOOL_WAIT_TRIM:
		error = spa_vdev_activity_in_progress(spa, use_tag, tag,
		    activity, in_progress);
		break;
	case ZPOOL_WAIT_REPLACE:
		/*
		 * Drop spa_activities_lock so that the config lock is
		 * acquired first, then retake it.
		 */
		mutex_exit(&spa->spa_activities_lock);
		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
		mutex_enter(&spa->spa_activities_lock);

		*in_progress = vdev_replace_in_progress(spa->spa_root_vdev);
		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
		break;
	case ZPOOL_WAIT_REMOVE:
		*in_progress = (spa->spa_removing_phys.sr_state ==
		    DSS_SCANNING);
		break;
	case ZPOOL_WAIT_RESILVER:
		/*
		 * An active rebuild counts as a resilver; otherwise fall
		 * through to the scan state check shared with scrub.
		 */
		*in_progress = vdev_rebuild_active(spa->spa_root_vdev);
		if (*in_progress)
			break;
		zfs_fallthrough;
	case ZPOOL_WAIT_SCRUB:
	{
		boolean_t scanning, paused, is_scrub;
		dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;

		is_scrub = (scn->scn_phys.scn_func == POOL_SCAN_SCRUB);
		scanning = (scn->scn_phys.scn_state == DSS_SCANNING);
		paused = dsl_scan_is_paused_scrub(scn);
		/*
		 * The running, unpaused scan must be of the kind being
		 * waited for: a scrub for ZPOOL_WAIT_SCRUB, a non-scrub
		 * scan when arriving here from ZPOOL_WAIT_RESILVER.
		 */
		*in_progress = (scanning && !paused &&
		    is_scrub == (activity == ZPOOL_WAIT_SCRUB));
		break;
	}
	case ZPOOL_WAIT_RAIDZ_EXPAND:
	{
		vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
		*in_progress = (vre != NULL && vre->vre_state == DSS_SCANNING);
		break;
	}
	default:
		panic("unrecognized value for activity %d", activity);
	}

	return (error);
}

/*
 * Common implementation of spa_wait() and spa_wait_tag(): block until
 * 'activity' is no longer in progress in the named pool.
 *
 * Returns EINVAL if a tag is supplied for an activity other than initialize
 * or trim, or if 'activity' is out of range; the error from spa_open() if
 * the pool cannot be opened; the error from spa_activity_in_progress();
 * EINTR if cv_wait_sig() returns 0; and 0 otherwise, including when the wait
 * ends because spa_waiters_cancel is set.  *waited is written only once the
 * pool has been opened, and is B_TRUE if the thread blocked at least once.
 */
static int
spa_wait_common(const char *pool, zpool_wait_activity_t activity,
    boolean_t use_tag, uint64_t tag, boolean_t *waited)
{
	/*
	 * The tag is used to distinguish between instances of an activity.
	 * 'initialize' and 'trim' are the only activities that we use this for.
	 * The other activities can only have a single instance in progress in a
	 * pool at one time, making the tag unnecessary.
	 *
	 * There can be multiple devices being replaced at once, but since they
	 * all finish once resilvering finishes, we don't bother keeping track
	 * of them individually, we just wait for them all to finish.
	 */
	if (use_tag && activity != ZPOOL_WAIT_INITIALIZE &&
	    activity != ZPOOL_WAIT_TRIM)
		return (EINVAL);

	if (activity < 0 || activity >= ZPOOL_WAIT_NUM_ACTIVITIES)
		return (EINVAL);

	spa_t *spa;
	int error = spa_open(pool, &spa, FTAG);
	if (error != 0)
		return (error);

	/*
	 * Increment the spa's waiter count so that we can call spa_close and
	 * still ensure that the spa_t doesn't get freed before this thread is
	 * finished with it when the pool is exported. We want to call spa_close
	 * before we start waiting because otherwise the additional ref would
	 * prevent the pool from being exported or destroyed throughout the
	 * potentially long wait.
	 */
	mutex_enter(&spa->spa_activities_lock);
	spa->spa_waiters++;
	spa_close(spa, FTAG);

	*waited = B_FALSE;
	for (;;) {
		boolean_t in_progress;
		error = spa_activity_in_progress(spa, activity, use_tag, tag,
		    &in_progress);

		/* in_progress is only examined when there was no error. */
		if (error || !in_progress || spa->spa_waiters_cancel)
			break;

		*waited = B_TRUE;

		if (cv_wait_sig(&spa->spa_activities_cv,
		    &spa->spa_activities_lock) == 0) {
			error = EINTR;
			break;
		}
	}

	/* Let spa_wake_waiters() know that this waiter has left. */
	spa->spa_waiters--;
	cv_signal(&spa->spa_waiters_cv);
	mutex_exit(&spa->spa_activities_lock);

	return (error);
}

/*
 * Wait for a particular instance of the specified activity to complete, where
 * the instance is identified by 'tag'
 */
int
spa_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag,
    boolean_t *waited)
{
	return (spa_wait_common(pool, activity, B_TRUE, tag, waited));
}

/*
 * Wait for all instances of the specified activity to complete
 */
int
spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited)
{

	return (spa_wait_common(pool, activity, B_FALSE, 0, waited));
}

/*
 * Build a sysevent for the event 'name'.  In the kernel the payload comes
 * from zfs_event_create(); NULL is returned if that produces no resource.
 * In userland this always returns NULL and the arguments are unused.
 */
sysevent_t *
spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
{
	sysevent_t *ev = NULL;
#ifdef _KERNEL
	nvlist_t *resource;

	resource = zfs_event_create(spa, vd, FM_SYSEVENT_CLASS, name, hist_nvl);
	if (resource) {
		ev = kmem_alloc(sizeof (sysevent_t), KM_SLEEP);
		ev->resource = resource;
	}
#else
	(void) spa, (void) vd, (void) hist_nvl, (void) name;
#endif
	return (ev);
}

/*
 * Post an event built by spa_event_create() and free the sysevent_t
 * wrapper.  A NULL 'ev' is ignored; in userland this does nothing.
 */
void
spa_event_post(sysevent_t *ev)
{
#ifdef _KERNEL
	if (ev) {
		/* The resource nvlist is handed to zfs_zevent_post(). */
		zfs_zevent_post(ev->resource, NULL, zfs_zevent_post_cb);
		kmem_free(ev, sizeof (*ev));
	}
#else
	(void) ev;
#endif
}

/*
 * Post a zevent corresponding to the given sysevent.  The 'name' must be one
 * of the event definitions in sys/sysevent/eventdefs.h.  The payload will be
 * filled in from the spa and (optionally) the vdev.  This doesn't do anything
 * in the userland libzpool, as we don't want consumers to misinterpret ztest
 * or zdb as real changes.
 */
void
spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
{
	spa_event_post(spa_event_create(spa, vd, hist_nvl, name));
}

/* state manipulation functions */
EXPORT_SYMBOL(spa_open);
EXPORT_SYMBOL(spa_open_rewind);
EXPORT_SYMBOL(spa_get_stats);
EXPORT_SYMBOL(spa_create);
EXPORT_SYMBOL(spa_import);
EXPORT_SYMBOL(spa_tryimport);
EXPORT_SYMBOL(spa_destroy);
EXPORT_SYMBOL(spa_export);
EXPORT_SYMBOL(spa_reset);
EXPORT_SYMBOL(spa_async_request);
EXPORT_SYMBOL(spa_async_suspend);
EXPORT_SYMBOL(spa_async_resume);
EXPORT_SYMBOL(spa_inject_addref);
EXPORT_SYMBOL(spa_inject_delref);
EXPORT_SYMBOL(spa_scan_stat_init);
EXPORT_SYMBOL(spa_scan_get_stats);

/* device manipulation */
EXPORT_SYMBOL(spa_vdev_add);
EXPORT_SYMBOL(spa_vdev_attach);
EXPORT_SYMBOL(spa_vdev_detach);
EXPORT_SYMBOL(spa_vdev_setpath);
EXPORT_SYMBOL(spa_vdev_setfru);
EXPORT_SYMBOL(spa_vdev_split_mirror);

/* spare state (which is global across all pools) */
EXPORT_SYMBOL(spa_spare_add);
EXPORT_SYMBOL(spa_spare_remove);
EXPORT_SYMBOL(spa_spare_exists);
EXPORT_SYMBOL(spa_spare_activate);

/* L2ARC state (which is global across all pools) */
EXPORT_SYMBOL(spa_l2cache_add);
EXPORT_SYMBOL(spa_l2cache_remove);
EXPORT_SYMBOL(spa_l2cache_exists);
EXPORT_SYMBOL(spa_l2cache_activate);
EXPORT_SYMBOL(spa_l2cache_drop);

/* scanning */
EXPORT_SYMBOL(spa_scan);
EXPORT_SYMBOL(spa_scan_range);
EXPORT_SYMBOL(spa_scan_stop);

/* spa syncing */
EXPORT_SYMBOL(spa_sync); /* only for DMU use */
EXPORT_SYMBOL(spa_sync_allpools);

/* properties */
EXPORT_SYMBOL(spa_prop_set);
EXPORT_SYMBOL(spa_prop_get);
EXPORT_SYMBOL(spa_prop_clear_bootfs);

/* asynchronous event notification */
EXPORT_SYMBOL(spa_event_notify);

/*
 * Module parameters registered by this file.  The final string of each
 * entry is its description.
 */

/* metaslab preload */
ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_pct, UINT, ZMOD_RW,
	"Percentage of CPUs to run a metaslab preload taskq");

/* pool load / import verification */
ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, UINT, ZMOD_RW,
	"log2 fraction of arc that can be used by inflight I/Os when "
	"verifying pool during import");

ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_metadata, INT, ZMOD_RW,
	"Set to traverse metadata on pool import");

ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW,
	"Set to traverse data on pool import");

ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW,
	"Print vdev tree to zfs_dbgmsg during pool import");

/* zio taskq sizing */
ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RW,
	"Percentage of CPUs to run an IO worker thread");

ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RW,
	"Number of threads per IO worker taskqueue");

ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, U64, ZMOD_RW,
	"Allow importing pool with up to this number of missing top-level "
	"vdevs (in read-only mode)");

/*
 * Livelist condense pause/cancel/new_alloc parameters.
 * NOTE(review): these look like test hooks and counters rather than
 * ordinary tunables; confirm against their definitions.
 */
ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT,
	ZMOD_RW, "Set the livelist condense zthr to pause");

ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT,
	ZMOD_RW, "Set the livelist condense synctask to pause");

ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_cancel,
	INT, ZMOD_RW,
	"Whether livelist condensing was canceled in the synctask");

ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel,
	INT, ZMOD_RW,
	"Whether livelist condensing was canceled in the zthr function");

ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT,
	ZMOD_RW,
	"Whether extra ALLOC blkptrs were added to a livelist entry while it "
	"was being condensed");

/* TXG timestamp recording */
ZFS_MODULE_PARAM(zfs_spa, spa_, note_txg_time, UINT, ZMOD_RW,
	"How frequently TXG timestamps are stored internally (in seconds)");

ZFS_MODULE_PARAM(zfs_spa, spa_, flush_txg_time, UINT, ZMOD_RW,
	"How frequently the TXG timestamps database should be flushed "
	"to disk (in seconds)");

/*
 * The per-type IO queue configuration parameters are registered with
 * explicit set/get callbacks, and only in kernel builds.
 */
#ifdef _KERNEL
ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read,
	spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RW,
	"Configure IO queues for read IO");
ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write,
	spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RW,
	"Configure IO queues for write IO");
ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_free,
	spa_taskq_free_param_set, spa_taskq_free_param_get, ZMOD_RW,
	"Configure IO queues for free IO");
#endif

ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_write_tpq, UINT, ZMOD_RW,
	"Number of CPUs per write issue taskq");