1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or https://opensource.org/licenses/CDDL-1.0. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 23 /* 24 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 25 * Copyright (c) 2011, 2024 by Delphix. All rights reserved. 26 * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved. 27 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 28 * Copyright 2013 Saso Kiselkov. All rights reserved. 29 * Copyright (c) 2014 Integros [integros.com] 30 * Copyright 2016 Toomas Soome <tsoome@me.com> 31 * Copyright (c) 2016 Actifio, Inc. All rights reserved. 32 * Copyright 2018 Joyent, Inc. 33 * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. 34 * Copyright 2017 Joyent, Inc. 35 * Copyright (c) 2017, Intel Corporation. 36 * Copyright (c) 2021, Colm Buckley <colm@tuatha.org> 37 * Copyright (c) 2023 Hewlett Packard Enterprise Development LP. 38 * Copyright (c) 2023, 2024, Klara Inc. 39 */ 40 41 /* 42 * SPA: Storage Pool Allocator 43 * 44 * This file contains all the routines used when modifying on-disk SPA state. 45 * This includes opening, importing, destroying, exporting a pool, and syncing a 46 * pool. 
47 */ 48 49 #include <sys/zfs_context.h> 50 #include <sys/fm/fs/zfs.h> 51 #include <sys/spa_impl.h> 52 #include <sys/zio.h> 53 #include <sys/zio_checksum.h> 54 #include <sys/dmu.h> 55 #include <sys/dmu_tx.h> 56 #include <sys/zap.h> 57 #include <sys/zil.h> 58 #include <sys/brt.h> 59 #include <sys/ddt.h> 60 #include <sys/vdev_impl.h> 61 #include <sys/vdev_removal.h> 62 #include <sys/vdev_indirect_mapping.h> 63 #include <sys/vdev_indirect_births.h> 64 #include <sys/vdev_initialize.h> 65 #include <sys/vdev_rebuild.h> 66 #include <sys/vdev_trim.h> 67 #include <sys/vdev_disk.h> 68 #include <sys/vdev_raidz.h> 69 #include <sys/vdev_draid.h> 70 #include <sys/metaslab.h> 71 #include <sys/metaslab_impl.h> 72 #include <sys/mmp.h> 73 #include <sys/uberblock_impl.h> 74 #include <sys/txg.h> 75 #include <sys/avl.h> 76 #include <sys/bpobj.h> 77 #include <sys/dmu_traverse.h> 78 #include <sys/dmu_objset.h> 79 #include <sys/unique.h> 80 #include <sys/dsl_pool.h> 81 #include <sys/dsl_dataset.h> 82 #include <sys/dsl_dir.h> 83 #include <sys/dsl_prop.h> 84 #include <sys/dsl_synctask.h> 85 #include <sys/fs/zfs.h> 86 #include <sys/arc.h> 87 #include <sys/callb.h> 88 #include <sys/systeminfo.h> 89 #include <sys/zfs_ioctl.h> 90 #include <sys/dsl_scan.h> 91 #include <sys/zfeature.h> 92 #include <sys/dsl_destroy.h> 93 #include <sys/zvol.h> 94 95 #ifdef _KERNEL 96 #include <sys/fm/protocol.h> 97 #include <sys/fm/util.h> 98 #include <sys/callb.h> 99 #include <sys/zone.h> 100 #include <sys/vmsystm.h> 101 #endif /* _KERNEL */ 102 103 #include "zfs_crrd.h" 104 #include "zfs_prop.h" 105 #include "zfs_comutil.h" 106 #include <cityhash.h> 107 108 /* 109 * spa_thread() existed on Illumos as a parent thread for the various worker 110 * threads that actually run the pool, as a way to both reference the entire 111 * pool work as a single object, and to share properties like scheduling 112 * options. It has not yet been adapted to Linux or FreeBSD. This define is 113 * used to mark related parts of the code to make things easier for the reader, 114 * and to compile this code out. It can be removed when someone implements it, 115 * moves it to some Illumos-specific place, or removes it entirely. 116 */ 117 #undef HAVE_SPA_THREAD 118 119 /* 120 * The "System Duty Cycle" scheduling class is an Illumos feature to help 121 * prevent CPU-intensive kernel threads from affecting latency on interactive 122 * threads. It doesn't exist on Linux or FreeBSD, so the supporting code is 123 * gated behind a define. On Illumos SDC depends on spa_thread(), but 124 * spa_thread() also has other uses, so this is a separate define. 125 */ 126 #undef HAVE_SYSDC 127 128 /* 129 * The interval, in seconds, at which failed configuration cache file writes 130 * should be retried. 131 */ 132 int zfs_ccw_retry_interval = 300; 133 134 typedef enum zti_modes { 135 ZTI_MODE_FIXED, /* value is # of threads (min 1) */ 136 ZTI_MODE_SCALE, /* Taskqs scale with CPUs. 
*/ 137 ZTI_MODE_SYNC, /* sync thread assigned */ 138 ZTI_MODE_NULL, /* don't create a taskq */ 139 ZTI_NMODES 140 } zti_modes_t; 141 142 #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } 143 #define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 } 144 #define ZTI_SCALE { ZTI_MODE_SCALE, 0, 1 } 145 #define ZTI_SYNC { ZTI_MODE_SYNC, 0, 1 } 146 #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } 147 148 #define ZTI_N(n) ZTI_P(n, 1) 149 #define ZTI_ONE ZTI_N(1) 150 151 typedef struct zio_taskq_info { 152 zti_modes_t zti_mode; 153 uint_t zti_value; 154 uint_t zti_count; 155 } zio_taskq_info_t; 156 157 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { 158 "iss", "iss_h", "int", "int_h" 159 }; 160 161 /* 162 * This table defines the taskq settings for each ZFS I/O type. When 163 * initializing a pool, we use this table to create an appropriately sized 164 * taskq. Some operations are low volume and therefore have a small, static 165 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE 166 * macros. Other operations process a large amount of data; the ZTI_SCALE 167 * macro causes us to create a taskq oriented for throughput. Some operations 168 * are so high frequency and short-lived that the taskq itself can become a 169 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an 170 * additional degree of parallelism specified by the number of threads per- 171 * taskq and the number of taskqs; when dispatching an event in this case, the 172 * particular taskq is chosen at random. ZTI_SCALE uses a number of taskqs 173 * that scales with the number of CPUs. 174 * 175 * The different taskq priorities are to handle the different contexts (issue 176 * and interrupt) and then to reserve threads for high priority I/Os that 177 * need to be handled with minimum delay. Illumos taskq has unfair TQ_FRONT 178 * implementation, so separate high priority threads are used there. 179 */ 180 static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { 181 /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ 182 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ 183 { ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */ 184 #ifdef illumos 185 { ZTI_SYNC, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */ 186 #else 187 { ZTI_SYNC, ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* WRITE */ 188 #endif 189 { ZTI_SCALE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ 190 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ 191 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FLUSH */ 192 { ZTI_N(4), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* TRIM */ 193 }; 194 195 static void spa_sync_version(void *arg, dmu_tx_t *tx); 196 static void spa_sync_props(void *arg, dmu_tx_t *tx); 197 static boolean_t spa_has_active_shared_spare(spa_t *spa); 198 static int spa_load_impl(spa_t *spa, spa_import_type_t type, 199 const char **ereport); 200 static void spa_vdev_resilver_done(spa_t *spa); 201 202 /* 203 * Percentage of all CPUs that can be used by the metaslab preload taskq. 
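 *
 * As a rough illustration only: spa_activate() below passes this value to
 * taskq_create() together with TASKQ_THREADS_CPU_PCT when it builds the
 * "z_metaslab" taskq, so with the default of 50 a 16-CPU system ends up with
 * roughly 16 * 50 / 100 = 8 preload threads.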
204 */ 205 static uint_t metaslab_preload_pct = 50; 206 207 static uint_t zio_taskq_batch_pct = 80; /* 1 thread per cpu in pset */ 208 static uint_t zio_taskq_batch_tpq; /* threads per taskq */ 209 210 #ifdef HAVE_SYSDC 211 static const boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ 212 static const uint_t zio_taskq_basedc = 80; /* base duty cycle */ 213 #endif 214 215 #ifdef HAVE_SPA_THREAD 216 static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */ 217 #endif 218 219 static uint_t zio_taskq_write_tpq = 16; 220 221 /* 222 * Report any spa_load_verify errors found, but do not fail spa_load. 223 * This is used by zdb to analyze non-idle pools. 224 */ 225 boolean_t spa_load_verify_dryrun = B_FALSE; 226 227 /* 228 * Allow read spacemaps in case of readonly import (spa_mode == SPA_MODE_READ). 229 * This is used by zdb for spacemaps verification. 230 */ 231 boolean_t spa_mode_readable_spacemaps = B_FALSE; 232 233 /* 234 * This (illegal) pool name is used when temporarily importing a spa_t in order 235 * to get the vdev stats associated with the imported devices. 236 */ 237 #define TRYIMPORT_NAME "$import" 238 239 /* 240 * For debugging purposes: print out vdev tree during pool import. 241 */ 242 static int spa_load_print_vdev_tree = B_FALSE; 243 244 /* 245 * A non-zero value for zfs_max_missing_tvds means that we allow importing 246 * pools with missing top-level vdevs. This is strictly intended for advanced 247 * pool recovery cases since missing data is almost inevitable. Pools with 248 * missing devices can only be imported read-only for safety reasons, and their 249 * fail-mode will be automatically set to "continue". 250 * 251 * With 1 missing vdev we should be able to import the pool and mount all 252 * datasets. User data that was not modified after the missing device has been 253 * added should be recoverable. This means that snapshots created prior to the 254 * addition of that device should be completely intact. 255 * 256 * With 2 missing vdevs, some datasets may fail to mount since there are 257 * dataset statistics that are stored as regular metadata. Some data might be 258 * recoverable if those vdevs were added recently. 259 * 260 * With 3 or more missing vdevs, the pool is severely damaged and MOS entries 261 * may be missing entirely. Chances of data recovery are very low. Note that 262 * there are also risks of performing an inadvertent rewind as we might be 263 * missing all the vdevs with the latest uberblocks. 264 */ 265 uint64_t zfs_max_missing_tvds = 0; 266 267 /* 268 * The parameters below are similar to zfs_max_missing_tvds but are only 269 * intended for a preliminary open of the pool with an untrusted config which 270 * might be incomplete or out-dated. 271 * 272 * We are more tolerant for pools opened from a cachefile since we could have 273 * an out-dated cachefile where a device removal was not registered. 274 * We could have set the limit arbitrarily high but in the case where devices 275 * are really missing we would want to return the proper error codes; we chose 276 * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available 277 * and we get a chance to retrieve the trusted config. 278 */ 279 uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1; 280 281 /* 282 * In the case where config was assembled by scanning device paths (/dev/dsks 283 * by default) we are less tolerant since all the existing devices should have 284 * been detected and we want spa_load to return the right error codes. 
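 * (For comparison, SPA_DVAS_PER_BP is 3, so the cachefile limit above works
 * out to at most two missing top-level vdevs during that preliminary,
 * untrusted-config open.)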
285 */ 286 uint64_t zfs_max_missing_tvds_scan = 0; 287 288 /* 289 * Debugging aid that pauses spa_sync() towards the end. 290 */ 291 static const boolean_t zfs_pause_spa_sync = B_FALSE; 292 293 /* 294 * Variables to indicate the livelist condense zthr func should wait at certain 295 * points for the livelist to be removed - used to test condense/destroy races 296 */ 297 static int zfs_livelist_condense_zthr_pause = 0; 298 static int zfs_livelist_condense_sync_pause = 0; 299 300 /* 301 * Variables to track whether or not condense cancellation has been 302 * triggered in testing. 303 */ 304 static int zfs_livelist_condense_sync_cancel = 0; 305 static int zfs_livelist_condense_zthr_cancel = 0; 306 307 /* 308 * Variable to track whether or not extra ALLOC blkptrs were added to a 309 * livelist entry while it was being condensed (caused by the way we track 310 * remapped blkptrs in dbuf_remap_impl) 311 */ 312 static int zfs_livelist_condense_new_alloc = 0; 313 314 /* 315 * Time variable to decide how often the txg should be added into the 316 * database (in seconds). 317 * The smallest available resolution is in minutes, which means an update occurs 318 * each time we reach `spa_note_txg_time` and the txg has changed. We provide 319 * a 256-slot ring buffer for minute-level resolution. The number is limited by 320 * the size of the structure we use and the maximum amount of bytes we can write 321 * into ZAP. Setting `spa_note_txg_time` to 10 minutes results in approximately 322 * 144 records per day. Given the 256 slots, this provides roughly 1.5 days of 323 * high-resolution data. 324 * 325 * The user can decrease `spa_note_txg_time` to increase resolution within 326 * a day, at the cost of retaining fewer days of data. Alternatively, increasing 327 * the interval allows storing data over a longer period, but with lower 328 * frequency. 329 * 330 * This parameter does not affect the daily or monthly databases, as those only 331 * store one record per day and per month, respectively. 332 */ 333 static uint_t spa_note_txg_time = 10 * 60; 334 335 /* 336 * How often flush txg database to a disk (in seconds). 337 * We flush data every time we write to it, making it the most reliable option. 338 * Since this happens every 10 minutes, it shouldn't introduce any noticeable 339 * overhead for the system. In case of failure, we will always have an 340 * up-to-date version of the database. 341 * 342 * The user can adjust the flush interval to a lower value, but it probably 343 * doesn't make sense to flush more often than the database is updated. 344 * The user can also increase the interval if they're concerned about the 345 * performance of writing the entire database to disk. 346 */ 347 static uint_t spa_flush_txg_time = 10 * 60; 348 349 /* 350 * ========================================================================== 351 * SPA properties routines 352 * ========================================================================== 353 */ 354 355 /* 356 * Add a (source=src, propname=propval) list to an nvlist. 
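 *
 * For illustration, the result is a nested nvlist keyed by the property
 * name; the values below are made up:
 *
 *	"capacity" -> {
 *		ZPROP_SOURCE -> ZPROP_SRC_NONE,
 *		ZPROP_VALUE  -> 25
 *	}
 *
 * ZPROP_VALUE holds either the string or the integer form, depending on
 * which one the caller supplied.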
357 */ 358 static void 359 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, const char *strval, 360 uint64_t intval, zprop_source_t src) 361 { 362 const char *propname = zpool_prop_to_name(prop); 363 nvlist_t *propval; 364 365 propval = fnvlist_alloc(); 366 fnvlist_add_uint64(propval, ZPROP_SOURCE, src); 367 368 if (strval != NULL) 369 fnvlist_add_string(propval, ZPROP_VALUE, strval); 370 else 371 fnvlist_add_uint64(propval, ZPROP_VALUE, intval); 372 373 fnvlist_add_nvlist(nvl, propname, propval); 374 nvlist_free(propval); 375 } 376 377 static int 378 spa_prop_add(spa_t *spa, const char *propname, nvlist_t *outnvl) 379 { 380 zpool_prop_t prop = zpool_name_to_prop(propname); 381 zprop_source_t src = ZPROP_SRC_NONE; 382 uint64_t intval; 383 int err; 384 385 /* 386 * NB: Not all properties lookups via this API require 387 * the spa props lock, so they must explicitly grab it here. 388 */ 389 switch (prop) { 390 case ZPOOL_PROP_DEDUPCACHED: 391 err = ddt_get_pool_dedup_cached(spa, &intval); 392 if (err != 0) 393 return (SET_ERROR(err)); 394 break; 395 default: 396 return (SET_ERROR(EINVAL)); 397 } 398 399 spa_prop_add_list(outnvl, prop, NULL, intval, src); 400 401 return (0); 402 } 403 404 int 405 spa_prop_get_nvlist(spa_t *spa, char **props, unsigned int n_props, 406 nvlist_t *outnvl) 407 { 408 int err = 0; 409 410 if (props == NULL) 411 return (0); 412 413 for (unsigned int i = 0; i < n_props && err == 0; i++) { 414 err = spa_prop_add(spa, props[i], outnvl); 415 } 416 417 return (err); 418 } 419 420 /* 421 * Add a user property (source=src, propname=propval) to an nvlist. 422 */ 423 static void 424 spa_prop_add_user(nvlist_t *nvl, const char *propname, char *strval, 425 zprop_source_t src) 426 { 427 nvlist_t *propval; 428 429 VERIFY0(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP)); 430 VERIFY0(nvlist_add_uint64(propval, ZPROP_SOURCE, src)); 431 VERIFY0(nvlist_add_string(propval, ZPROP_VALUE, strval)); 432 VERIFY0(nvlist_add_nvlist(nvl, propname, propval)); 433 nvlist_free(propval); 434 } 435 436 /* 437 * Get property values from the spa configuration. 
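 *
 * A minimal sketch of how the spa_prop_get_nvlist() entry point above might
 * be driven by a caller such as the pool get-props ioctl; the property name
 * string is assumed here and error handling is omitted:
 *
 *	char *props[] = { "dedupcached" };
 *	nvlist_t *outnvl = fnvlist_alloc();
 *	int err = spa_prop_get_nvlist(spa, props, 1, outnvl);
 *	...
 *	nvlist_free(outnvl);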
438 */ 439 static void 440 spa_prop_get_config(spa_t *spa, nvlist_t *nv) 441 { 442 vdev_t *rvd = spa->spa_root_vdev; 443 dsl_pool_t *pool = spa->spa_dsl_pool; 444 uint64_t size, alloc, cap, version; 445 const zprop_source_t src = ZPROP_SRC_NONE; 446 spa_config_dirent_t *dp; 447 metaslab_class_t *mc = spa_normal_class(spa); 448 449 ASSERT(MUTEX_HELD(&spa->spa_props_lock)); 450 451 if (rvd != NULL) { 452 alloc = metaslab_class_get_alloc(mc); 453 alloc += metaslab_class_get_alloc(spa_special_class(spa)); 454 alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); 455 alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa)); 456 alloc += metaslab_class_get_alloc( 457 spa_special_embedded_log_class(spa)); 458 459 size = metaslab_class_get_space(mc); 460 size += metaslab_class_get_space(spa_special_class(spa)); 461 size += metaslab_class_get_space(spa_dedup_class(spa)); 462 size += metaslab_class_get_space(spa_embedded_log_class(spa)); 463 size += metaslab_class_get_space( 464 spa_special_embedded_log_class(spa)); 465 466 spa_prop_add_list(nv, ZPOOL_PROP_NAME, spa_name(spa), 0, src); 467 spa_prop_add_list(nv, ZPOOL_PROP_SIZE, NULL, size, src); 468 spa_prop_add_list(nv, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); 469 spa_prop_add_list(nv, ZPOOL_PROP_FREE, NULL, 470 size - alloc, src); 471 spa_prop_add_list(nv, ZPOOL_PROP_CHECKPOINT, NULL, 472 spa->spa_checkpoint_info.sci_dspace, src); 473 474 spa_prop_add_list(nv, ZPOOL_PROP_FRAGMENTATION, NULL, 475 metaslab_class_fragmentation(mc), src); 476 spa_prop_add_list(nv, ZPOOL_PROP_EXPANDSZ, NULL, 477 metaslab_class_expandable_space(mc), src); 478 spa_prop_add_list(nv, ZPOOL_PROP_READONLY, NULL, 479 (spa_mode(spa) == SPA_MODE_READ), src); 480 481 cap = (size == 0) ? 0 : (alloc * 100 / size); 482 spa_prop_add_list(nv, ZPOOL_PROP_CAPACITY, NULL, cap, src); 483 484 spa_prop_add_list(nv, ZPOOL_PROP_DEDUPRATIO, NULL, 485 ddt_get_pool_dedup_ratio(spa), src); 486 spa_prop_add_list(nv, ZPOOL_PROP_BCLONEUSED, NULL, 487 brt_get_used(spa), src); 488 spa_prop_add_list(nv, ZPOOL_PROP_BCLONESAVED, NULL, 489 brt_get_saved(spa), src); 490 spa_prop_add_list(nv, ZPOOL_PROP_BCLONERATIO, NULL, 491 brt_get_ratio(spa), src); 492 493 spa_prop_add_list(nv, ZPOOL_PROP_DEDUP_TABLE_SIZE, NULL, 494 ddt_get_ddt_dsize(spa), src); 495 spa_prop_add_list(nv, ZPOOL_PROP_HEALTH, NULL, 496 rvd->vdev_state, src); 497 spa_prop_add_list(nv, ZPOOL_PROP_LAST_SCRUBBED_TXG, NULL, 498 spa_get_last_scrubbed_txg(spa), src); 499 500 version = spa_version(spa); 501 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) { 502 spa_prop_add_list(nv, ZPOOL_PROP_VERSION, NULL, 503 version, ZPROP_SRC_DEFAULT); 504 } else { 505 spa_prop_add_list(nv, ZPOOL_PROP_VERSION, NULL, 506 version, ZPROP_SRC_LOCAL); 507 } 508 spa_prop_add_list(nv, ZPOOL_PROP_LOAD_GUID, 509 NULL, spa_load_guid(spa), src); 510 } 511 512 if (pool != NULL) { 513 /* 514 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, 515 * when opening pools before this version freedir will be NULL. 
516 */ 517 if (pool->dp_free_dir != NULL) { 518 spa_prop_add_list(nv, ZPOOL_PROP_FREEING, NULL, 519 dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes, 520 src); 521 } else { 522 spa_prop_add_list(nv, ZPOOL_PROP_FREEING, 523 NULL, 0, src); 524 } 525 526 if (pool->dp_leak_dir != NULL) { 527 spa_prop_add_list(nv, ZPOOL_PROP_LEAKED, NULL, 528 dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes, 529 src); 530 } else { 531 spa_prop_add_list(nv, ZPOOL_PROP_LEAKED, 532 NULL, 0, src); 533 } 534 } 535 536 spa_prop_add_list(nv, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); 537 538 if (spa->spa_comment != NULL) { 539 spa_prop_add_list(nv, ZPOOL_PROP_COMMENT, spa->spa_comment, 540 0, ZPROP_SRC_LOCAL); 541 } 542 543 if (spa->spa_compatibility != NULL) { 544 spa_prop_add_list(nv, ZPOOL_PROP_COMPATIBILITY, 545 spa->spa_compatibility, 0, ZPROP_SRC_LOCAL); 546 } 547 548 if (spa->spa_root != NULL) 549 spa_prop_add_list(nv, ZPOOL_PROP_ALTROOT, spa->spa_root, 550 0, ZPROP_SRC_LOCAL); 551 552 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { 553 spa_prop_add_list(nv, ZPOOL_PROP_MAXBLOCKSIZE, NULL, 554 MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE); 555 } else { 556 spa_prop_add_list(nv, ZPOOL_PROP_MAXBLOCKSIZE, NULL, 557 SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); 558 } 559 560 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { 561 spa_prop_add_list(nv, ZPOOL_PROP_MAXDNODESIZE, NULL, 562 DNODE_MAX_SIZE, ZPROP_SRC_NONE); 563 } else { 564 spa_prop_add_list(nv, ZPOOL_PROP_MAXDNODESIZE, NULL, 565 DNODE_MIN_SIZE, ZPROP_SRC_NONE); 566 } 567 568 if ((dp = list_head(&spa->spa_config_list)) != NULL) { 569 if (dp->scd_path == NULL) { 570 spa_prop_add_list(nv, ZPOOL_PROP_CACHEFILE, 571 "none", 0, ZPROP_SRC_LOCAL); 572 } else if (strcmp(dp->scd_path, spa_config_path) != 0) { 573 spa_prop_add_list(nv, ZPOOL_PROP_CACHEFILE, 574 dp->scd_path, 0, ZPROP_SRC_LOCAL); 575 } 576 } 577 } 578 579 /* 580 * Get zpool property values. 581 */ 582 int 583 spa_prop_get(spa_t *spa, nvlist_t *nv) 584 { 585 objset_t *mos = spa->spa_meta_objset; 586 zap_cursor_t zc; 587 zap_attribute_t *za; 588 dsl_pool_t *dp; 589 int err = 0; 590 591 dp = spa_get_dsl(spa); 592 dsl_pool_config_enter(dp, FTAG); 593 za = zap_attribute_alloc(); 594 mutex_enter(&spa->spa_props_lock); 595 596 /* 597 * Get properties from the spa config. 598 */ 599 spa_prop_get_config(spa, nv); 600 601 /* If no pool property object, no more prop to get. */ 602 if (mos == NULL || spa->spa_pool_props_object == 0) 603 goto out; 604 605 /* 606 * Get properties from the MOS pool property object. 
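 *
 * Two ZAP encodings are handled in the loop below: integer properties are
 * stored as a single uint64 (za_integer_length == 8), while string and user
 * properties are stored as byte arrays (za_integer_length == 1). The bootfs
 * property is further special-cased so that the stored dataset object number
 * is reported back as a dataset name.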
607 */ 608 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 609 (err = zap_cursor_retrieve(&zc, za)) == 0; 610 zap_cursor_advance(&zc)) { 611 uint64_t intval = 0; 612 char *strval = NULL; 613 zprop_source_t src = ZPROP_SRC_DEFAULT; 614 zpool_prop_t prop; 615 616 if ((prop = zpool_name_to_prop(za->za_name)) == 617 ZPOOL_PROP_INVAL && !zfs_prop_user(za->za_name)) 618 continue; 619 620 switch (za->za_integer_length) { 621 case 8: 622 /* integer property */ 623 if (za->za_first_integer != 624 zpool_prop_default_numeric(prop)) 625 src = ZPROP_SRC_LOCAL; 626 627 if (prop == ZPOOL_PROP_BOOTFS) { 628 dsl_dataset_t *ds = NULL; 629 630 err = dsl_dataset_hold_obj(dp, 631 za->za_first_integer, FTAG, &ds); 632 if (err != 0) 633 break; 634 635 strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, 636 KM_SLEEP); 637 dsl_dataset_name(ds, strval); 638 dsl_dataset_rele(ds, FTAG); 639 } else { 640 strval = NULL; 641 intval = za->za_first_integer; 642 } 643 644 spa_prop_add_list(nv, prop, strval, intval, src); 645 646 if (strval != NULL) 647 kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN); 648 649 break; 650 651 case 1: 652 /* string property */ 653 strval = kmem_alloc(za->za_num_integers, KM_SLEEP); 654 err = zap_lookup(mos, spa->spa_pool_props_object, 655 za->za_name, 1, za->za_num_integers, strval); 656 if (err) { 657 kmem_free(strval, za->za_num_integers); 658 break; 659 } 660 if (prop != ZPOOL_PROP_INVAL) { 661 spa_prop_add_list(nv, prop, strval, 0, src); 662 } else { 663 src = ZPROP_SRC_LOCAL; 664 spa_prop_add_user(nv, za->za_name, strval, 665 src); 666 } 667 kmem_free(strval, za->za_num_integers); 668 break; 669 670 default: 671 break; 672 } 673 } 674 zap_cursor_fini(&zc); 675 out: 676 mutex_exit(&spa->spa_props_lock); 677 dsl_pool_config_exit(dp, FTAG); 678 zap_attribute_free(za); 679 680 if (err && err != ENOENT) 681 return (err); 682 683 return (0); 684 } 685 686 /* 687 * Validate the given pool properties nvlist and modify the list 688 * for the property values to be set. 689 */ 690 static int 691 spa_prop_validate(spa_t *spa, nvlist_t *props) 692 { 693 nvpair_t *elem; 694 int error = 0, reset_bootfs = 0; 695 uint64_t objnum = 0; 696 boolean_t has_feature = B_FALSE; 697 698 elem = NULL; 699 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 700 uint64_t intval; 701 const char *strval, *slash, *check, *fname; 702 const char *propname = nvpair_name(elem); 703 zpool_prop_t prop = zpool_name_to_prop(propname); 704 705 switch (prop) { 706 case ZPOOL_PROP_INVAL: 707 /* 708 * Sanitize the input. 
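 * For example (names here are purely illustrative): a user property such as
 * "com.example:owner" lands in this case and is only checked for name and
 * value length, while a feature property such as "feature@async_destroy"
 * must be a uint64 with value 0 (the only accepted value, i.e. enable the
 * feature) and must name a feature known to zfeature_lookup_name().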
709 */ 710 if (zfs_prop_user(propname)) { 711 if (strlen(propname) >= ZAP_MAXNAMELEN) { 712 error = SET_ERROR(ENAMETOOLONG); 713 break; 714 } 715 716 if (strlen(fnvpair_value_string(elem)) >= 717 ZAP_MAXVALUELEN) { 718 error = SET_ERROR(E2BIG); 719 break; 720 } 721 } else if (zpool_prop_feature(propname)) { 722 if (nvpair_type(elem) != DATA_TYPE_UINT64) { 723 error = SET_ERROR(EINVAL); 724 break; 725 } 726 727 if (nvpair_value_uint64(elem, &intval) != 0) { 728 error = SET_ERROR(EINVAL); 729 break; 730 } 731 732 if (intval != 0) { 733 error = SET_ERROR(EINVAL); 734 break; 735 } 736 737 fname = strchr(propname, '@') + 1; 738 if (zfeature_lookup_name(fname, NULL) != 0) { 739 error = SET_ERROR(EINVAL); 740 break; 741 } 742 743 has_feature = B_TRUE; 744 } else { 745 error = SET_ERROR(EINVAL); 746 break; 747 } 748 break; 749 750 case ZPOOL_PROP_VERSION: 751 error = nvpair_value_uint64(elem, &intval); 752 if (!error && 753 (intval < spa_version(spa) || 754 intval > SPA_VERSION_BEFORE_FEATURES || 755 has_feature)) 756 error = SET_ERROR(EINVAL); 757 break; 758 759 case ZPOOL_PROP_DEDUP_TABLE_QUOTA: 760 error = nvpair_value_uint64(elem, &intval); 761 break; 762 763 case ZPOOL_PROP_DELEGATION: 764 case ZPOOL_PROP_AUTOREPLACE: 765 case ZPOOL_PROP_LISTSNAPS: 766 case ZPOOL_PROP_AUTOEXPAND: 767 case ZPOOL_PROP_AUTOTRIM: 768 error = nvpair_value_uint64(elem, &intval); 769 if (!error && intval > 1) 770 error = SET_ERROR(EINVAL); 771 break; 772 773 case ZPOOL_PROP_MULTIHOST: 774 error = nvpair_value_uint64(elem, &intval); 775 if (!error && intval > 1) 776 error = SET_ERROR(EINVAL); 777 778 if (!error) { 779 uint32_t hostid = zone_get_hostid(NULL); 780 if (hostid) 781 spa->spa_hostid = hostid; 782 else 783 error = SET_ERROR(ENOTSUP); 784 } 785 786 break; 787 788 case ZPOOL_PROP_BOOTFS: 789 /* 790 * If the pool version is less than SPA_VERSION_BOOTFS, 791 * or the pool is still being created (version == 0), 792 * the bootfs property cannot be set. 793 */ 794 if (spa_version(spa) < SPA_VERSION_BOOTFS) { 795 error = SET_ERROR(ENOTSUP); 796 break; 797 } 798 799 /* 800 * Make sure the vdev config is bootable 801 */ 802 if (!vdev_is_bootable(spa->spa_root_vdev)) { 803 error = SET_ERROR(ENOTSUP); 804 break; 805 } 806 807 reset_bootfs = 1; 808 809 error = nvpair_value_string(elem, &strval); 810 811 if (!error) { 812 objset_t *os; 813 814 if (strval == NULL || strval[0] == '\0') { 815 objnum = zpool_prop_default_numeric( 816 ZPOOL_PROP_BOOTFS); 817 break; 818 } 819 820 error = dmu_objset_hold(strval, FTAG, &os); 821 if (error != 0) 822 break; 823 824 /* Must be ZPL. */ 825 if (dmu_objset_type(os) != DMU_OST_ZFS) { 826 error = SET_ERROR(ENOTSUP); 827 } else { 828 objnum = dmu_objset_id(os); 829 } 830 dmu_objset_rele(os, FTAG); 831 } 832 break; 833 834 case ZPOOL_PROP_FAILUREMODE: 835 error = nvpair_value_uint64(elem, &intval); 836 if (!error && intval > ZIO_FAILURE_MODE_PANIC) 837 error = SET_ERROR(EINVAL); 838 839 /* 840 * This is a special case which only occurs when 841 * the pool has completely failed. This allows 842 * the user to change the in-core failmode property 843 * without syncing it out to disk (I/Os might 844 * currently be blocked). We do this by returning 845 * EIO to the caller (spa_prop_set) to trick it 846 * into thinking we encountered a property validation 847 * error. 
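 *
 * For example, "zpool set failmode=continue" on a pool that is currently
 * suspended updates spa_failmode in core below, and the EIO makes
 * spa_prop_set() return before it starts a sync task that would block for
 * as long as the pool stays suspended.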
848 */ 849 if (!error && spa_suspended(spa)) { 850 spa->spa_failmode = intval; 851 error = SET_ERROR(EIO); 852 } 853 break; 854 855 case ZPOOL_PROP_CACHEFILE: 856 if ((error = nvpair_value_string(elem, &strval)) != 0) 857 break; 858 859 if (strval[0] == '\0') 860 break; 861 862 if (strcmp(strval, "none") == 0) 863 break; 864 865 if (strval[0] != '/') { 866 error = SET_ERROR(EINVAL); 867 break; 868 } 869 870 slash = strrchr(strval, '/'); 871 ASSERT(slash != NULL); 872 873 if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 874 strcmp(slash, "/..") == 0) 875 error = SET_ERROR(EINVAL); 876 break; 877 878 case ZPOOL_PROP_COMMENT: 879 if ((error = nvpair_value_string(elem, &strval)) != 0) 880 break; 881 for (check = strval; *check != '\0'; check++) { 882 if (!isprint(*check)) { 883 error = SET_ERROR(EINVAL); 884 break; 885 } 886 } 887 if (strlen(strval) > ZPROP_MAX_COMMENT) 888 error = SET_ERROR(E2BIG); 889 break; 890 891 default: 892 break; 893 } 894 895 if (error) 896 break; 897 } 898 899 (void) nvlist_remove_all(props, 900 zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO)); 901 902 if (!error && reset_bootfs) { 903 error = nvlist_remove(props, 904 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 905 906 if (!error) { 907 error = nvlist_add_uint64(props, 908 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 909 } 910 } 911 912 return (error); 913 } 914 915 void 916 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) 917 { 918 const char *cachefile; 919 spa_config_dirent_t *dp; 920 921 if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), 922 &cachefile) != 0) 923 return; 924 925 dp = kmem_alloc(sizeof (spa_config_dirent_t), 926 KM_SLEEP); 927 928 if (cachefile[0] == '\0') 929 dp->scd_path = spa_strdup(spa_config_path); 930 else if (strcmp(cachefile, "none") == 0) 931 dp->scd_path = NULL; 932 else 933 dp->scd_path = spa_strdup(cachefile); 934 935 list_insert_head(&spa->spa_config_list, dp); 936 if (need_sync) 937 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 938 } 939 940 int 941 spa_prop_set(spa_t *spa, nvlist_t *nvp) 942 { 943 int error; 944 nvpair_t *elem = NULL; 945 boolean_t need_sync = B_FALSE; 946 947 if ((error = spa_prop_validate(spa, nvp)) != 0) 948 return (error); 949 950 while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { 951 zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); 952 953 if (prop == ZPOOL_PROP_CACHEFILE || 954 prop == ZPOOL_PROP_ALTROOT || 955 prop == ZPOOL_PROP_READONLY) 956 continue; 957 958 if (prop == ZPOOL_PROP_INVAL && 959 zfs_prop_user(nvpair_name(elem))) { 960 need_sync = B_TRUE; 961 break; 962 } 963 964 if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) { 965 uint64_t ver = 0; 966 967 if (prop == ZPOOL_PROP_VERSION) { 968 VERIFY0(nvpair_value_uint64(elem, &ver)); 969 } else { 970 ASSERT(zpool_prop_feature(nvpair_name(elem))); 971 ver = SPA_VERSION_FEATURES; 972 need_sync = B_TRUE; 973 } 974 975 /* Save time if the version is already set. */ 976 if (ver == spa_version(spa)) 977 continue; 978 979 /* 980 * In addition to the pool directory object, we might 981 * create the pool properties object, the features for 982 * read object, the features for write object, or the 983 * feature descriptions object. 
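 * (That potential fan-out is also why the dsl_sync_task() call below passes
 * 6 as the number of blocks the space check should assume will be dirtied.)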
984 */ 985 error = dsl_sync_task(spa->spa_name, NULL, 986 spa_sync_version, &ver, 987 6, ZFS_SPACE_CHECK_RESERVED); 988 if (error) 989 return (error); 990 continue; 991 } 992 993 need_sync = B_TRUE; 994 break; 995 } 996 997 if (need_sync) { 998 return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, 999 nvp, 6, ZFS_SPACE_CHECK_RESERVED)); 1000 } 1001 1002 return (0); 1003 } 1004 1005 /* 1006 * If the bootfs property value is dsobj, clear it. 1007 */ 1008 void 1009 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 1010 { 1011 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 1012 VERIFY(zap_remove(spa->spa_meta_objset, 1013 spa->spa_pool_props_object, 1014 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 1015 spa->spa_bootfs = 0; 1016 } 1017 } 1018 1019 static int 1020 spa_change_guid_check(void *arg, dmu_tx_t *tx) 1021 { 1022 uint64_t *newguid __maybe_unused = arg; 1023 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 1024 vdev_t *rvd = spa->spa_root_vdev; 1025 uint64_t vdev_state; 1026 1027 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 1028 int error = (spa_has_checkpoint(spa)) ? 1029 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 1030 return (SET_ERROR(error)); 1031 } 1032 1033 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 1034 vdev_state = rvd->vdev_state; 1035 spa_config_exit(spa, SCL_STATE, FTAG); 1036 1037 if (vdev_state != VDEV_STATE_HEALTHY) 1038 return (SET_ERROR(ENXIO)); 1039 1040 ASSERT3U(spa_guid(spa), !=, *newguid); 1041 1042 return (0); 1043 } 1044 1045 static void 1046 spa_change_guid_sync(void *arg, dmu_tx_t *tx) 1047 { 1048 uint64_t *newguid = arg; 1049 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 1050 uint64_t oldguid; 1051 vdev_t *rvd = spa->spa_root_vdev; 1052 1053 oldguid = spa_guid(spa); 1054 1055 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 1056 rvd->vdev_guid = *newguid; 1057 rvd->vdev_guid_sum += (*newguid - oldguid); 1058 vdev_config_dirty(rvd); 1059 spa_config_exit(spa, SCL_STATE, FTAG); 1060 1061 spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", 1062 (u_longlong_t)oldguid, (u_longlong_t)*newguid); 1063 } 1064 1065 /* 1066 * Change the GUID for the pool. This is done so that we can later 1067 * re-import a pool built from a clone of our own vdevs. We will modify 1068 * the root vdev's guid, our own pool guid, and then mark all of our 1069 * vdevs dirty. Note that we must make sure that all our vdevs are 1070 * online when we do this, or else any vdevs that weren't present 1071 * would be orphaned from our pool. We are also going to issue a 1072 * sysevent to update any watchers. 1073 * 1074 * The GUID of the pool will be changed to the value pointed to by guidp. 1075 * The GUID may not be set to the reserverd value of 0. 1076 * The new GUID will be generated if guidp is NULL. 
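 *
 * A minimal usage sketch, roughly what the "zpool reguid" ioctl path does;
 * error handling is omitted and my_guid is just an illustrative variable:
 *
 *	error = spa_change_guid(spa, NULL);        a new GUID is generated
 *	error = spa_change_guid(spa, &my_guid);    my_guid must be non-zero
 *	                                           and not already in use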
1077 */ 1078 int 1079 spa_change_guid(spa_t *spa, const uint64_t *guidp) 1080 { 1081 uint64_t guid; 1082 int error; 1083 1084 mutex_enter(&spa->spa_vdev_top_lock); 1085 mutex_enter(&spa_namespace_lock); 1086 1087 if (guidp != NULL) { 1088 guid = *guidp; 1089 if (guid == 0) { 1090 error = SET_ERROR(EINVAL); 1091 goto out; 1092 } 1093 1094 if (spa_guid_exists(guid, 0)) { 1095 error = SET_ERROR(EEXIST); 1096 goto out; 1097 } 1098 } else { 1099 guid = spa_generate_guid(NULL); 1100 } 1101 1102 error = dsl_sync_task(spa->spa_name, spa_change_guid_check, 1103 spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); 1104 1105 if (error == 0) { 1106 /* 1107 * Clear the kobj flag from all the vdevs to allow 1108 * vdev_cache_process_kobj_evt() to post events to all the 1109 * vdevs since GUID is updated. 1110 */ 1111 vdev_clear_kobj_evt(spa->spa_root_vdev); 1112 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 1113 vdev_clear_kobj_evt(spa->spa_l2cache.sav_vdevs[i]); 1114 1115 spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); 1116 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); 1117 } 1118 1119 out: 1120 mutex_exit(&spa_namespace_lock); 1121 mutex_exit(&spa->spa_vdev_top_lock); 1122 1123 return (error); 1124 } 1125 1126 /* 1127 * ========================================================================== 1128 * SPA state manipulation (open/create/destroy/import/export) 1129 * ========================================================================== 1130 */ 1131 1132 static int 1133 spa_error_entry_compare(const void *a, const void *b) 1134 { 1135 const spa_error_entry_t *sa = (const spa_error_entry_t *)a; 1136 const spa_error_entry_t *sb = (const spa_error_entry_t *)b; 1137 int ret; 1138 1139 ret = memcmp(&sa->se_bookmark, &sb->se_bookmark, 1140 sizeof (zbookmark_phys_t)); 1141 1142 return (TREE_ISIGN(ret)); 1143 } 1144 1145 /* 1146 * Utility function which retrieves copies of the current logs and 1147 * re-initializes them in the process. 1148 */ 1149 void 1150 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 1151 { 1152 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 1153 1154 memcpy(last, &spa->spa_errlist_last, sizeof (avl_tree_t)); 1155 memcpy(scrub, &spa->spa_errlist_scrub, sizeof (avl_tree_t)); 1156 1157 avl_create(&spa->spa_errlist_scrub, 1158 spa_error_entry_compare, sizeof (spa_error_entry_t), 1159 offsetof(spa_error_entry_t, se_avl)); 1160 avl_create(&spa->spa_errlist_last, 1161 spa_error_entry_compare, sizeof (spa_error_entry_t), 1162 offsetof(spa_error_entry_t, se_avl)); 1163 } 1164 1165 static void 1166 spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) 1167 { 1168 const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; 1169 enum zti_modes mode = ztip->zti_mode; 1170 uint_t value = ztip->zti_value; 1171 uint_t count = ztip->zti_count; 1172 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1173 uint_t cpus, flags = TASKQ_DYNAMIC; 1174 1175 switch (mode) { 1176 case ZTI_MODE_FIXED: 1177 ASSERT3U(value, >, 0); 1178 break; 1179 1180 case ZTI_MODE_SYNC: 1181 1182 /* 1183 * Create one wr_iss taskq for every 'zio_taskq_write_tpq' CPUs, 1184 * not to exceed the number of spa allocators, and align to it. 
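 *
 * Worked example with the defaults above (zio_taskq_batch_pct = 80,
 * zio_taskq_write_tpq = 16) on a 64-CPU system, assuming 4 allocators:
 *
 *	cpus  = 64 * 80 / 100              = 51
 *	count = 51 / 16                    = 3
 *	count = MIN(3, 4 allocators)       = 3, aligned down to 2
 *	                                     (4 % 3 != 0, 4 % 2 == 0)
 *	value = (80 + 2/2) / 2             = 40
 *
 * i.e. two "z_wr_iss" taskqs, each sized to 40% of the CPUs via
 * TASKQ_THREADS_CPU_PCT.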
1185 */ 1186 cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100); 1187 count = MAX(1, cpus / MAX(1, zio_taskq_write_tpq)); 1188 count = MAX(count, (zio_taskq_batch_pct + 99) / 100); 1189 count = MIN(count, spa->spa_alloc_count); 1190 while (spa->spa_alloc_count % count != 0 && 1191 spa->spa_alloc_count < count * 2) 1192 count--; 1193 1194 /* 1195 * zio_taskq_batch_pct is unbounded and may exceed 100%, but no 1196 * single taskq may have more threads than 100% of online cpus. 1197 */ 1198 value = (zio_taskq_batch_pct + count / 2) / count; 1199 value = MIN(value, 100); 1200 flags |= TASKQ_THREADS_CPU_PCT; 1201 break; 1202 1203 case ZTI_MODE_SCALE: 1204 flags |= TASKQ_THREADS_CPU_PCT; 1205 /* 1206 * We want more taskqs to reduce lock contention, but we want 1207 * less for better request ordering and CPU utilization. 1208 */ 1209 cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100); 1210 if (zio_taskq_batch_tpq > 0) { 1211 count = MAX(1, (cpus + zio_taskq_batch_tpq / 2) / 1212 zio_taskq_batch_tpq); 1213 } else { 1214 /* 1215 * Prefer 6 threads per taskq, but no more taskqs 1216 * than threads in them on large systems. For 80%: 1217 * 1218 * taskq taskq total 1219 * cpus taskqs percent threads threads 1220 * ------- ------- ------- ------- ------- 1221 * 1 1 80% 1 1 1222 * 2 1 80% 1 1 1223 * 4 1 80% 3 3 1224 * 8 2 40% 3 6 1225 * 16 3 27% 4 12 1226 * 32 5 16% 5 25 1227 * 64 7 11% 7 49 1228 * 128 10 8% 10 100 1229 * 256 14 6% 15 210 1230 */ 1231 count = 1 + cpus / 6; 1232 while (count * count > cpus) 1233 count--; 1234 } 1235 /* Limit each taskq within 100% to not trigger assertion. */ 1236 count = MAX(count, (zio_taskq_batch_pct + 99) / 100); 1237 value = (zio_taskq_batch_pct + count / 2) / count; 1238 break; 1239 1240 case ZTI_MODE_NULL: 1241 tqs->stqs_count = 0; 1242 tqs->stqs_taskq = NULL; 1243 return; 1244 1245 default: 1246 panic("unrecognized mode for %s_%s taskq (%u:%u) in " 1247 "spa_taskqs_init()", 1248 zio_type_name[t], zio_taskq_types[q], mode, value); 1249 break; 1250 } 1251 1252 ASSERT3U(count, >, 0); 1253 tqs->stqs_count = count; 1254 tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); 1255 1256 for (uint_t i = 0; i < count; i++) { 1257 taskq_t *tq; 1258 char name[32]; 1259 1260 if (count > 1) 1261 (void) snprintf(name, sizeof (name), "%s_%s_%u", 1262 zio_type_name[t], zio_taskq_types[q], i); 1263 else 1264 (void) snprintf(name, sizeof (name), "%s_%s", 1265 zio_type_name[t], zio_taskq_types[q]); 1266 1267 #ifdef HAVE_SYSDC 1268 if (zio_taskq_sysdc && spa->spa_proc != &p0) { 1269 (void) zio_taskq_basedc; 1270 tq = taskq_create_sysdc(name, value, 50, INT_MAX, 1271 spa->spa_proc, zio_taskq_basedc, flags); 1272 } else { 1273 #endif 1274 /* 1275 * The write issue taskq can be extremely CPU 1276 * intensive. Run it at slightly less important 1277 * priority than the other taskqs. 1278 */ 1279 const pri_t pri = (t == ZIO_TYPE_WRITE && 1280 q == ZIO_TASKQ_ISSUE) ? 
1281 wtqclsyspri : maxclsyspri; 1282 tq = taskq_create_proc(name, value, pri, 50, 1283 INT_MAX, spa->spa_proc, flags); 1284 #ifdef HAVE_SYSDC 1285 } 1286 #endif 1287 1288 tqs->stqs_taskq[i] = tq; 1289 } 1290 } 1291 1292 static void 1293 spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) 1294 { 1295 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1296 1297 if (tqs->stqs_taskq == NULL) { 1298 ASSERT0(tqs->stqs_count); 1299 return; 1300 } 1301 1302 for (uint_t i = 0; i < tqs->stqs_count; i++) { 1303 ASSERT3P(tqs->stqs_taskq[i], !=, NULL); 1304 taskq_destroy(tqs->stqs_taskq[i]); 1305 } 1306 1307 kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *)); 1308 tqs->stqs_taskq = NULL; 1309 } 1310 1311 #ifdef _KERNEL 1312 /* 1313 * The READ and WRITE rows of zio_taskqs are configurable at module load time 1314 * by setting zio_taskq_read or zio_taskq_write. 1315 * 1316 * Example (the defaults for READ and WRITE) 1317 * zio_taskq_read='fixed,1,8 null scale null' 1318 * zio_taskq_write='sync null scale null' 1319 * 1320 * Each sets the entire row at a time. 1321 * 1322 * 'fixed' is parameterised: fixed,Q,T where Q is number of taskqs, T is number 1323 * of threads per taskq. 1324 * 1325 * 'null' can only be set on the high-priority queues (queue selection for 1326 * high-priority queues will fall back to the regular queue if the high-pri 1327 * is NULL. 1328 */ 1329 static const char *const modes[ZTI_NMODES] = { 1330 "fixed", "scale", "sync", "null" 1331 }; 1332 1333 /* Parse the incoming config string. Modifies cfg */ 1334 static int 1335 spa_taskq_param_set(zio_type_t t, char *cfg) 1336 { 1337 int err = 0; 1338 1339 zio_taskq_info_t row[ZIO_TASKQ_TYPES] = {{0}}; 1340 1341 char *next = cfg, *tok, *c; 1342 1343 /* 1344 * Parse out each element from the string and fill `row`. The entire 1345 * row has to be set at once, so any errors are flagged by just 1346 * breaking out of this loop early. 1347 */ 1348 uint_t q; 1349 for (q = 0; q < ZIO_TASKQ_TYPES; q++) { 1350 /* `next` is the start of the config */ 1351 if (next == NULL) 1352 break; 1353 1354 /* Eat up leading space */ 1355 while (isspace(*next)) 1356 next++; 1357 if (*next == '\0') 1358 break; 1359 1360 /* Mode ends at space or end of string */ 1361 tok = next; 1362 next = strchr(tok, ' '); 1363 if (next != NULL) *next++ = '\0'; 1364 1365 /* Parameters start after a comma */ 1366 c = strchr(tok, ','); 1367 if (c != NULL) *c++ = '\0'; 1368 1369 /* Match mode string */ 1370 uint_t mode; 1371 for (mode = 0; mode < ZTI_NMODES; mode++) 1372 if (strcmp(tok, modes[mode]) == 0) 1373 break; 1374 if (mode == ZTI_NMODES) 1375 break; 1376 1377 /* Invalid canary */ 1378 row[q].zti_mode = ZTI_NMODES; 1379 1380 /* Per-mode setup */ 1381 switch (mode) { 1382 1383 /* 1384 * FIXED is parameterised: number of queues, and number of 1385 * threads per queue. 1386 */ 1387 case ZTI_MODE_FIXED: { 1388 /* No parameters? 
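 * (Illustrative values: for a row element like "fixed,2,16", c points at
 * "2,16" here; the code below parses nq = 2 taskqs and ntpq = 16 threads
 * per taskq, producing ZTI_P(16, 2).)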
*/ 1389 if (c == NULL || *c == '\0') 1390 break; 1391 1392 /* Find next parameter */ 1393 tok = c; 1394 c = strchr(tok, ','); 1395 if (c == NULL) 1396 break; 1397 1398 /* Take digits and convert */ 1399 unsigned long long nq; 1400 if (!(isdigit(*tok))) 1401 break; 1402 err = ddi_strtoull(tok, &tok, 10, &nq); 1403 /* Must succeed and also end at the next param sep */ 1404 if (err != 0 || tok != c) 1405 break; 1406 1407 /* Move past the comma */ 1408 tok++; 1409 /* Need another number */ 1410 if (!(isdigit(*tok))) 1411 break; 1412 /* Remember start to make sure we moved */ 1413 c = tok; 1414 1415 /* Take digits */ 1416 unsigned long long ntpq; 1417 err = ddi_strtoull(tok, &tok, 10, &ntpq); 1418 /* Must succeed, and moved forward */ 1419 if (err != 0 || tok == c || *tok != '\0') 1420 break; 1421 1422 /* 1423 * sanity; zero queues/threads make no sense, and 1424 * 16K is almost certainly more than anyone will ever 1425 * need and avoids silly numbers like UINT32_MAX 1426 */ 1427 if (nq == 0 || nq >= 16384 || 1428 ntpq == 0 || ntpq >= 16384) 1429 break; 1430 1431 const zio_taskq_info_t zti = ZTI_P(ntpq, nq); 1432 row[q] = zti; 1433 break; 1434 } 1435 1436 case ZTI_MODE_SCALE: { 1437 const zio_taskq_info_t zti = ZTI_SCALE; 1438 row[q] = zti; 1439 break; 1440 } 1441 1442 case ZTI_MODE_SYNC: { 1443 const zio_taskq_info_t zti = ZTI_SYNC; 1444 row[q] = zti; 1445 break; 1446 } 1447 1448 case ZTI_MODE_NULL: { 1449 /* 1450 * Can only null the high-priority queues; the general- 1451 * purpose ones have to exist. 1452 */ 1453 if (q != ZIO_TASKQ_ISSUE_HIGH && 1454 q != ZIO_TASKQ_INTERRUPT_HIGH) 1455 break; 1456 1457 const zio_taskq_info_t zti = ZTI_NULL; 1458 row[q] = zti; 1459 break; 1460 } 1461 1462 default: 1463 break; 1464 } 1465 1466 /* Ensure we set a mode */ 1467 if (row[q].zti_mode == ZTI_NMODES) 1468 break; 1469 } 1470 1471 /* Didn't get a full row, fail */ 1472 if (q < ZIO_TASKQ_TYPES) 1473 return (SET_ERROR(EINVAL)); 1474 1475 /* Eat trailing space */ 1476 if (next != NULL) 1477 while (isspace(*next)) 1478 next++; 1479 1480 /* If there's anything left over then fail */ 1481 if (next != NULL && *next != '\0') 1482 return (SET_ERROR(EINVAL)); 1483 1484 /* Success! 
Copy it into the real config */ 1485 for (q = 0; q < ZIO_TASKQ_TYPES; q++) 1486 zio_taskqs[t][q] = row[q]; 1487 1488 return (0); 1489 } 1490 1491 static int 1492 spa_taskq_param_get(zio_type_t t, char *buf, boolean_t add_newline) 1493 { 1494 int pos = 0; 1495 1496 /* Build paramater string from live config */ 1497 const char *sep = ""; 1498 for (uint_t q = 0; q < ZIO_TASKQ_TYPES; q++) { 1499 const zio_taskq_info_t *zti = &zio_taskqs[t][q]; 1500 if (zti->zti_mode == ZTI_MODE_FIXED) 1501 pos += sprintf(&buf[pos], "%s%s,%u,%u", sep, 1502 modes[zti->zti_mode], zti->zti_count, 1503 zti->zti_value); 1504 else 1505 pos += sprintf(&buf[pos], "%s%s", sep, 1506 modes[zti->zti_mode]); 1507 sep = " "; 1508 } 1509 1510 if (add_newline) 1511 buf[pos++] = '\n'; 1512 buf[pos] = '\0'; 1513 1514 return (pos); 1515 } 1516 1517 #ifdef __linux__ 1518 static int 1519 spa_taskq_read_param_set(const char *val, zfs_kernel_param_t *kp) 1520 { 1521 char *cfg = kmem_strdup(val); 1522 int err = spa_taskq_param_set(ZIO_TYPE_READ, cfg); 1523 kmem_free(cfg, strlen(val)+1); 1524 return (-err); 1525 } 1526 static int 1527 spa_taskq_read_param_get(char *buf, zfs_kernel_param_t *kp) 1528 { 1529 return (spa_taskq_param_get(ZIO_TYPE_READ, buf, TRUE)); 1530 } 1531 1532 static int 1533 spa_taskq_write_param_set(const char *val, zfs_kernel_param_t *kp) 1534 { 1535 char *cfg = kmem_strdup(val); 1536 int err = spa_taskq_param_set(ZIO_TYPE_WRITE, cfg); 1537 kmem_free(cfg, strlen(val)+1); 1538 return (-err); 1539 } 1540 static int 1541 spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp) 1542 { 1543 return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf, TRUE)); 1544 } 1545 #else 1546 /* 1547 * On FreeBSD load-time parameters can be set up before malloc() is available, 1548 * so we have to do all the parsing work on the stack. 1549 */ 1550 #define SPA_TASKQ_PARAM_MAX (128) 1551 1552 static int 1553 spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS) 1554 { 1555 char buf[SPA_TASKQ_PARAM_MAX]; 1556 int err; 1557 1558 (void) spa_taskq_param_get(ZIO_TYPE_READ, buf, FALSE); 1559 err = sysctl_handle_string(oidp, buf, sizeof (buf), req); 1560 if (err || req->newptr == NULL) 1561 return (err); 1562 return (spa_taskq_param_set(ZIO_TYPE_READ, buf)); 1563 } 1564 1565 static int 1566 spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS) 1567 { 1568 char buf[SPA_TASKQ_PARAM_MAX]; 1569 int err; 1570 1571 (void) spa_taskq_param_get(ZIO_TYPE_WRITE, buf, FALSE); 1572 err = sysctl_handle_string(oidp, buf, sizeof (buf), req); 1573 if (err || req->newptr == NULL) 1574 return (err); 1575 return (spa_taskq_param_set(ZIO_TYPE_WRITE, buf)); 1576 } 1577 #endif 1578 #endif /* _KERNEL */ 1579 1580 /* 1581 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. 1582 * Note that a type may have multiple discrete taskqs to avoid lock contention 1583 * on the taskq itself. 1584 */ 1585 void 1586 spa_taskq_dispatch(spa_t *spa, zio_type_t t, zio_taskq_type_t q, 1587 task_func_t *func, zio_t *zio, boolean_t cutinline) 1588 { 1589 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1590 taskq_t *tq; 1591 1592 ASSERT3P(tqs->stqs_taskq, !=, NULL); 1593 ASSERT3U(tqs->stqs_count, !=, 0); 1594 1595 /* 1596 * NB: We are assuming that the zio can only be dispatched 1597 * to a single taskq at a time. It would be a grievous error 1598 * to dispatch the zio to another taskq at the same time. 
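 * The dispatch below reuses the taskq entry embedded in the zio (io_tqent),
 * which is why the same zio must never be queued on two taskqs at once; the
 * taskq_empty_ent() assertion checks exactly that.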
1599 */ 1600 ASSERT(zio); 1601 ASSERT(taskq_empty_ent(&zio->io_tqent)); 1602 1603 if (tqs->stqs_count == 1) { 1604 tq = tqs->stqs_taskq[0]; 1605 } else if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) && 1606 ZIO_HAS_ALLOCATOR(zio)) { 1607 tq = tqs->stqs_taskq[zio->io_allocator % tqs->stqs_count]; 1608 } else { 1609 tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; 1610 } 1611 1612 taskq_dispatch_ent(tq, func, zio, cutinline ? TQ_FRONT : 0, 1613 &zio->io_tqent); 1614 } 1615 1616 static void 1617 spa_create_zio_taskqs(spa_t *spa) 1618 { 1619 for (int t = 0; t < ZIO_TYPES; t++) { 1620 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 1621 spa_taskqs_init(spa, t, q); 1622 } 1623 } 1624 } 1625 1626 #if defined(_KERNEL) && defined(HAVE_SPA_THREAD) 1627 static void 1628 spa_thread(void *arg) 1629 { 1630 psetid_t zio_taskq_psrset_bind = PS_NONE; 1631 callb_cpr_t cprinfo; 1632 1633 spa_t *spa = arg; 1634 user_t *pu = PTOU(curproc); 1635 1636 CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, 1637 spa->spa_name); 1638 1639 ASSERT(curproc != &p0); 1640 (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), 1641 "zpool-%s", spa->spa_name); 1642 (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); 1643 1644 /* bind this thread to the requested psrset */ 1645 if (zio_taskq_psrset_bind != PS_NONE) { 1646 pool_lock(); 1647 mutex_enter(&cpu_lock); 1648 mutex_enter(&pidlock); 1649 mutex_enter(&curproc->p_lock); 1650 1651 if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, 1652 0, NULL, NULL) == 0) { 1653 curthread->t_bind_pset = zio_taskq_psrset_bind; 1654 } else { 1655 cmn_err(CE_WARN, 1656 "Couldn't bind process for zfs pool \"%s\" to " 1657 "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); 1658 } 1659 1660 mutex_exit(&curproc->p_lock); 1661 mutex_exit(&pidlock); 1662 mutex_exit(&cpu_lock); 1663 pool_unlock(); 1664 } 1665 1666 #ifdef HAVE_SYSDC 1667 if (zio_taskq_sysdc) { 1668 sysdc_thread_enter(curthread, 100, 0); 1669 } 1670 #endif 1671 1672 spa->spa_proc = curproc; 1673 spa->spa_did = curthread->t_did; 1674 1675 spa_create_zio_taskqs(spa); 1676 1677 mutex_enter(&spa->spa_proc_lock); 1678 ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); 1679 1680 spa->spa_proc_state = SPA_PROC_ACTIVE; 1681 cv_broadcast(&spa->spa_proc_cv); 1682 1683 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1684 while (spa->spa_proc_state == SPA_PROC_ACTIVE) 1685 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1686 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); 1687 1688 ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); 1689 spa->spa_proc_state = SPA_PROC_GONE; 1690 spa->spa_proc = &p0; 1691 cv_broadcast(&spa->spa_proc_cv); 1692 CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ 1693 1694 mutex_enter(&curproc->p_lock); 1695 lwp_exit(); 1696 } 1697 #endif 1698 1699 extern metaslab_ops_t *metaslab_allocator(spa_t *spa); 1700 1701 /* 1702 * Activate an uninitialized pool. 
1703 */ 1704 static void 1705 spa_activate(spa_t *spa, spa_mode_t mode) 1706 { 1707 metaslab_ops_t *msp = metaslab_allocator(spa); 1708 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 1709 1710 spa->spa_state = POOL_STATE_ACTIVE; 1711 spa->spa_final_txg = UINT64_MAX; 1712 spa->spa_mode = mode; 1713 spa->spa_read_spacemaps = spa_mode_readable_spacemaps; 1714 1715 spa->spa_normal_class = metaslab_class_create(spa, "normal", 1716 msp, B_FALSE); 1717 spa->spa_log_class = metaslab_class_create(spa, "log", msp, B_TRUE); 1718 spa->spa_embedded_log_class = metaslab_class_create(spa, 1719 "embedded_log", msp, B_TRUE); 1720 spa->spa_special_class = metaslab_class_create(spa, "special", 1721 msp, B_FALSE); 1722 spa->spa_special_embedded_log_class = metaslab_class_create(spa, 1723 "special_embedded_log", msp, B_TRUE); 1724 spa->spa_dedup_class = metaslab_class_create(spa, "dedup", 1725 msp, B_FALSE); 1726 1727 /* Try to create a covering process */ 1728 mutex_enter(&spa->spa_proc_lock); 1729 ASSERT(spa->spa_proc_state == SPA_PROC_NONE); 1730 ASSERT(spa->spa_proc == &p0); 1731 spa->spa_did = 0; 1732 1733 #ifdef HAVE_SPA_THREAD 1734 /* Only create a process if we're going to be around a while. */ 1735 if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { 1736 if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, 1737 NULL, 0) == 0) { 1738 spa->spa_proc_state = SPA_PROC_CREATED; 1739 while (spa->spa_proc_state == SPA_PROC_CREATED) { 1740 cv_wait(&spa->spa_proc_cv, 1741 &spa->spa_proc_lock); 1742 } 1743 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1744 ASSERT(spa->spa_proc != &p0); 1745 ASSERT(spa->spa_did != 0); 1746 } else { 1747 #ifdef _KERNEL 1748 cmn_err(CE_WARN, 1749 "Couldn't create process for zfs pool \"%s\"\n", 1750 spa->spa_name); 1751 #endif 1752 } 1753 } 1754 #endif /* HAVE_SPA_THREAD */ 1755 mutex_exit(&spa->spa_proc_lock); 1756 1757 /* If we didn't create a process, we need to create our taskqs. */ 1758 if (spa->spa_proc == &p0) { 1759 spa_create_zio_taskqs(spa); 1760 } 1761 1762 for (size_t i = 0; i < TXG_SIZE; i++) { 1763 spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, 1764 ZIO_FLAG_CANFAIL); 1765 } 1766 1767 list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), 1768 offsetof(vdev_t, vdev_config_dirty_node)); 1769 list_create(&spa->spa_evicting_os_list, sizeof (objset_t), 1770 offsetof(objset_t, os_evicting_node)); 1771 list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), 1772 offsetof(vdev_t, vdev_state_dirty_node)); 1773 1774 txg_list_create(&spa->spa_vdev_txg_list, spa, 1775 offsetof(struct vdev, vdev_txg_node)); 1776 1777 avl_create(&spa->spa_errlist_scrub, 1778 spa_error_entry_compare, sizeof (spa_error_entry_t), 1779 offsetof(spa_error_entry_t, se_avl)); 1780 avl_create(&spa->spa_errlist_last, 1781 spa_error_entry_compare, sizeof (spa_error_entry_t), 1782 offsetof(spa_error_entry_t, se_avl)); 1783 avl_create(&spa->spa_errlist_healed, 1784 spa_error_entry_compare, sizeof (spa_error_entry_t), 1785 offsetof(spa_error_entry_t, se_avl)); 1786 1787 spa_activate_os(spa); 1788 1789 spa_keystore_init(&spa->spa_keystore); 1790 1791 /* 1792 * This taskq is used to perform zvol-minor-related tasks 1793 * asynchronously. This has several advantages, including easy 1794 * resolution of various deadlocks. 1795 * 1796 * The taskq must be single threaded to ensure tasks are always 1797 * processed in the order in which they were dispatched. 1798 * 1799 * A taskq per pool allows one to keep the pools independent. 
1800 * This way if one pool is suspended, it will not impact another. 1801 * 1802 * The preferred location to dispatch a zvol minor task is a sync 1803 * task. In this context, there is easy access to the spa_t and minimal 1804 * error handling is required because the sync task must succeed. 1805 */ 1806 spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri, 1807 1, INT_MAX, 0); 1808 1809 /* 1810 * The taskq to preload metaslabs. 1811 */ 1812 spa->spa_metaslab_taskq = taskq_create("z_metaslab", 1813 metaslab_preload_pct, maxclsyspri, 1, INT_MAX, 1814 TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); 1815 1816 /* 1817 * Taskq dedicated to prefetcher threads: this is used to prevent the 1818 * pool traverse code from monopolizing the global (and limited) 1819 * system_taskq by inappropriately scheduling long running tasks on it. 1820 */ 1821 spa->spa_prefetch_taskq = taskq_create("z_prefetch", 100, 1822 defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); 1823 1824 /* 1825 * The taskq to upgrade datasets in this pool. Currently used by 1826 * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA. 1827 */ 1828 spa->spa_upgrade_taskq = taskq_create("z_upgrade", 100, 1829 defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); 1830 } 1831 1832 /* 1833 * Opposite of spa_activate(). 1834 */ 1835 static void 1836 spa_deactivate(spa_t *spa) 1837 { 1838 ASSERT(spa->spa_sync_on == B_FALSE); 1839 ASSERT0P(spa->spa_dsl_pool); 1840 ASSERT0P(spa->spa_root_vdev); 1841 ASSERT0P(spa->spa_async_zio_root); 1842 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 1843 1844 spa_evicting_os_wait(spa); 1845 1846 if (spa->spa_zvol_taskq) { 1847 taskq_destroy(spa->spa_zvol_taskq); 1848 spa->spa_zvol_taskq = NULL; 1849 } 1850 1851 if (spa->spa_metaslab_taskq) { 1852 taskq_destroy(spa->spa_metaslab_taskq); 1853 spa->spa_metaslab_taskq = NULL; 1854 } 1855 1856 if (spa->spa_prefetch_taskq) { 1857 taskq_destroy(spa->spa_prefetch_taskq); 1858 spa->spa_prefetch_taskq = NULL; 1859 } 1860 1861 if (spa->spa_upgrade_taskq) { 1862 taskq_destroy(spa->spa_upgrade_taskq); 1863 spa->spa_upgrade_taskq = NULL; 1864 } 1865 1866 txg_list_destroy(&spa->spa_vdev_txg_list); 1867 1868 list_destroy(&spa->spa_config_dirty_list); 1869 list_destroy(&spa->spa_evicting_os_list); 1870 list_destroy(&spa->spa_state_dirty_list); 1871 1872 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); 1873 1874 for (int t = 0; t < ZIO_TYPES; t++) { 1875 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 1876 spa_taskqs_fini(spa, t, q); 1877 } 1878 } 1879 1880 for (size_t i = 0; i < TXG_SIZE; i++) { 1881 ASSERT3P(spa->spa_txg_zio[i], !=, NULL); 1882 VERIFY0(zio_wait(spa->spa_txg_zio[i])); 1883 spa->spa_txg_zio[i] = NULL; 1884 } 1885 1886 metaslab_class_destroy(spa->spa_normal_class); 1887 spa->spa_normal_class = NULL; 1888 1889 metaslab_class_destroy(spa->spa_log_class); 1890 spa->spa_log_class = NULL; 1891 1892 metaslab_class_destroy(spa->spa_embedded_log_class); 1893 spa->spa_embedded_log_class = NULL; 1894 1895 metaslab_class_destroy(spa->spa_special_class); 1896 spa->spa_special_class = NULL; 1897 1898 metaslab_class_destroy(spa->spa_special_embedded_log_class); 1899 spa->spa_special_embedded_log_class = NULL; 1900 1901 metaslab_class_destroy(spa->spa_dedup_class); 1902 spa->spa_dedup_class = NULL; 1903 1904 /* 1905 * If this was part of an import or the open otherwise failed, we may 1906 * still have errors left in the queues. Empty them just in case. 
1907 */ 1908 spa_errlog_drain(spa); 1909 avl_destroy(&spa->spa_errlist_scrub); 1910 avl_destroy(&spa->spa_errlist_last); 1911 avl_destroy(&spa->spa_errlist_healed); 1912 1913 spa_keystore_fini(&spa->spa_keystore); 1914 1915 spa->spa_state = POOL_STATE_UNINITIALIZED; 1916 1917 mutex_enter(&spa->spa_proc_lock); 1918 if (spa->spa_proc_state != SPA_PROC_NONE) { 1919 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1920 spa->spa_proc_state = SPA_PROC_DEACTIVATE; 1921 cv_broadcast(&spa->spa_proc_cv); 1922 while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { 1923 ASSERT(spa->spa_proc != &p0); 1924 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1925 } 1926 ASSERT(spa->spa_proc_state == SPA_PROC_GONE); 1927 spa->spa_proc_state = SPA_PROC_NONE; 1928 } 1929 ASSERT(spa->spa_proc == &p0); 1930 mutex_exit(&spa->spa_proc_lock); 1931 1932 /* 1933 * We want to make sure spa_thread() has actually exited the ZFS 1934 * module, so that the module can't be unloaded out from underneath 1935 * it. 1936 */ 1937 if (spa->spa_did != 0) { 1938 thread_join(spa->spa_did); 1939 spa->spa_did = 0; 1940 } 1941 1942 spa_deactivate_os(spa); 1943 1944 } 1945 1946 /* 1947 * Verify a pool configuration, and construct the vdev tree appropriately. This 1948 * will create all the necessary vdevs in the appropriate layout, with each vdev 1949 * in the CLOSED state. This will prep the pool before open/creation/import. 1950 * All vdev validation is done by the vdev_alloc() routine. 1951 */ 1952 int 1953 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 1954 uint_t id, int atype) 1955 { 1956 nvlist_t **child; 1957 uint_t children; 1958 int error; 1959 1960 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 1961 return (error); 1962 1963 if ((*vdp)->vdev_ops->vdev_op_leaf) 1964 return (0); 1965 1966 error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 1967 &child, &children); 1968 1969 if (error == ENOENT) 1970 return (0); 1971 1972 if (error) { 1973 vdev_free(*vdp); 1974 *vdp = NULL; 1975 return (SET_ERROR(EINVAL)); 1976 } 1977 1978 for (int c = 0; c < children; c++) { 1979 vdev_t *vd; 1980 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 1981 atype)) != 0) { 1982 vdev_free(*vdp); 1983 *vdp = NULL; 1984 return (error); 1985 } 1986 } 1987 1988 ASSERT(*vdp != NULL); 1989 1990 return (0); 1991 } 1992 1993 static boolean_t 1994 spa_should_flush_logs_on_unload(spa_t *spa) 1995 { 1996 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) 1997 return (B_FALSE); 1998 1999 if (!spa_writeable(spa)) 2000 return (B_FALSE); 2001 2002 if (!spa->spa_sync_on) 2003 return (B_FALSE); 2004 2005 if (spa_state(spa) != POOL_STATE_EXPORTED) 2006 return (B_FALSE); 2007 2008 if (zfs_keep_log_spacemaps_at_export) 2009 return (B_FALSE); 2010 2011 return (B_TRUE); 2012 } 2013 2014 /* 2015 * Opens a transaction that will set the flag that will instruct 2016 * spa_sync to attempt to flush all the metaslabs for that txg. 
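 * The flag in question is spa_log_flushall_txg; after recording the
 * assigned txg there, the routine waits for that txg to be synced
 * before returning.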
2017 */ 2018 static void 2019 spa_unload_log_sm_flush_all(spa_t *spa) 2020 { 2021 dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 2022 VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT | DMU_TX_SUSPEND)); 2023 2024 ASSERT0(spa->spa_log_flushall_txg); 2025 spa->spa_log_flushall_txg = dmu_tx_get_txg(tx); 2026 2027 dmu_tx_commit(tx); 2028 txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg); 2029 } 2030 2031 static void 2032 spa_unload_log_sm_metadata(spa_t *spa) 2033 { 2034 void *cookie = NULL; 2035 spa_log_sm_t *sls; 2036 log_summary_entry_t *e; 2037 2038 while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg, 2039 &cookie)) != NULL) { 2040 VERIFY0(sls->sls_mscount); 2041 kmem_free(sls, sizeof (spa_log_sm_t)); 2042 } 2043 2044 while ((e = list_remove_head(&spa->spa_log_summary)) != NULL) { 2045 VERIFY0(e->lse_mscount); 2046 kmem_free(e, sizeof (log_summary_entry_t)); 2047 } 2048 2049 spa->spa_unflushed_stats.sus_nblocks = 0; 2050 spa->spa_unflushed_stats.sus_memused = 0; 2051 spa->spa_unflushed_stats.sus_blocklimit = 0; 2052 } 2053 2054 static void 2055 spa_destroy_aux_threads(spa_t *spa) 2056 { 2057 if (spa->spa_condense_zthr != NULL) { 2058 zthr_destroy(spa->spa_condense_zthr); 2059 spa->spa_condense_zthr = NULL; 2060 } 2061 if (spa->spa_checkpoint_discard_zthr != NULL) { 2062 zthr_destroy(spa->spa_checkpoint_discard_zthr); 2063 spa->spa_checkpoint_discard_zthr = NULL; 2064 } 2065 if (spa->spa_livelist_delete_zthr != NULL) { 2066 zthr_destroy(spa->spa_livelist_delete_zthr); 2067 spa->spa_livelist_delete_zthr = NULL; 2068 } 2069 if (spa->spa_livelist_condense_zthr != NULL) { 2070 zthr_destroy(spa->spa_livelist_condense_zthr); 2071 spa->spa_livelist_condense_zthr = NULL; 2072 } 2073 if (spa->spa_raidz_expand_zthr != NULL) { 2074 zthr_destroy(spa->spa_raidz_expand_zthr); 2075 spa->spa_raidz_expand_zthr = NULL; 2076 } 2077 } 2078 2079 static void 2080 spa_sync_time_logger(spa_t *spa, uint64_t txg) 2081 { 2082 uint64_t curtime; 2083 dmu_tx_t *tx; 2084 2085 if (!spa_writeable(spa)) { 2086 return; 2087 } 2088 curtime = gethrestime_sec(); 2089 if (curtime < spa->spa_last_noted_txg_time + spa_note_txg_time) { 2090 return; 2091 } 2092 2093 if (txg > spa->spa_last_noted_txg) { 2094 spa->spa_last_noted_txg_time = curtime; 2095 spa->spa_last_noted_txg = txg; 2096 2097 mutex_enter(&spa->spa_txg_log_time_lock); 2098 dbrrd_add(&spa->spa_txg_log_time, curtime, txg); 2099 mutex_exit(&spa->spa_txg_log_time_lock); 2100 } 2101 2102 if (curtime < spa->spa_last_flush_txg_time + spa_flush_txg_time) { 2103 return; 2104 } 2105 spa->spa_last_flush_txg_time = curtime; 2106 2107 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 2108 2109 VERIFY0(zap_update(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, 2110 DMU_POOL_TXG_LOG_TIME_MINUTES, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM, 2111 &spa->spa_txg_log_time.dbr_minutes, tx)); 2112 VERIFY0(zap_update(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, 2113 DMU_POOL_TXG_LOG_TIME_DAYS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM, 2114 &spa->spa_txg_log_time.dbr_days, tx)); 2115 VERIFY0(zap_update(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, 2116 DMU_POOL_TXG_LOG_TIME_MONTHS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM, 2117 &spa->spa_txg_log_time.dbr_months, tx)); 2118 dmu_tx_commit(tx); 2119 } 2120 2121 static void 2122 spa_unload_sync_time_logger(spa_t *spa) 2123 { 2124 uint64_t txg; 2125 dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 2126 VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT)); 2127 2128 txg = dmu_tx_get_txg(tx); 2129 spa->spa_last_noted_txg_time = 0; 2130 
spa->spa_last_flush_txg_time = 0; 2131 spa_sync_time_logger(spa, txg); 2132 2133 dmu_tx_commit(tx); 2134 } 2135 2136 static void 2137 spa_load_txg_log_time(spa_t *spa) 2138 { 2139 int error; 2140 2141 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2142 DMU_POOL_TXG_LOG_TIME_MINUTES, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM, 2143 &spa->spa_txg_log_time.dbr_minutes); 2144 if (error != 0 && error != ENOENT) { 2145 spa_load_note(spa, "unable to load a txg time database with " 2146 "minute resolution [error=%d]", error); 2147 } 2148 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2149 DMU_POOL_TXG_LOG_TIME_DAYS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM, 2150 &spa->spa_txg_log_time.dbr_days); 2151 if (error != 0 && error != ENOENT) { 2152 spa_load_note(spa, "unable to load a txg time database with " 2153 "day resolution [error=%d]", error); 2154 } 2155 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2156 DMU_POOL_TXG_LOG_TIME_MONTHS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM, 2157 &spa->spa_txg_log_time.dbr_months); 2158 if (error != 0 && error != ENOENT) { 2159 spa_load_note(spa, "unable to load a txg time database with " 2160 "month resolution [error=%d]", error); 2161 } 2162 } 2163 2164 static boolean_t 2165 spa_should_sync_time_logger_on_unload(spa_t *spa) 2166 { 2167 2168 if (!spa_writeable(spa)) 2169 return (B_FALSE); 2170 2171 if (!spa->spa_sync_on) 2172 return (B_FALSE); 2173 2174 if (spa_state(spa) != POOL_STATE_EXPORTED) 2175 return (B_FALSE); 2176 2177 if (spa->spa_last_noted_txg == 0) 2178 return (B_FALSE); 2179 2180 return (B_TRUE); 2181 } 2182 2183 2184 /* 2185 * Opposite of spa_load(). 2186 */ 2187 static void 2188 spa_unload(spa_t *spa) 2189 { 2190 ASSERT(MUTEX_HELD(&spa_namespace_lock) || 2191 spa->spa_export_thread == curthread); 2192 ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED); 2193 2194 spa_import_progress_remove(spa_guid(spa)); 2195 spa_load_note(spa, "UNLOADING"); 2196 2197 spa_wake_waiters(spa); 2198 2199 /* 2200 * If we have set the spa_final_txg, we have already performed the 2201 * tasks below in spa_export_common(). We should not redo it here since 2202 * we delay the final TXGs beyond what spa_final_txg is set at. 2203 */ 2204 if (spa->spa_final_txg == UINT64_MAX) { 2205 if (spa_should_sync_time_logger_on_unload(spa)) 2206 spa_unload_sync_time_logger(spa); 2207 2208 /* 2209 * If the log space map feature is enabled and the pool is 2210 * getting exported (but not destroyed), we want to spend some 2211 * time flushing as many metaslabs as we can in an attempt to 2212 * destroy log space maps and save import time. 2213 */ 2214 if (spa_should_flush_logs_on_unload(spa)) 2215 spa_unload_log_sm_flush_all(spa); 2216 2217 /* 2218 * Stop async tasks. 2219 */ 2220 spa_async_suspend(spa); 2221 2222 if (spa->spa_root_vdev) { 2223 vdev_t *root_vdev = spa->spa_root_vdev; 2224 vdev_initialize_stop_all(root_vdev, 2225 VDEV_INITIALIZE_ACTIVE); 2226 vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE); 2227 vdev_autotrim_stop_all(spa); 2228 vdev_rebuild_stop_all(spa); 2229 l2arc_spa_rebuild_stop(spa); 2230 } 2231 2232 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2233 spa->spa_final_txg = spa_last_synced_txg(spa) + 2234 TXG_DEFER_SIZE + 1; 2235 spa_config_exit(spa, SCL_ALL, FTAG); 2236 } 2237 2238 /* 2239 * Stop syncing. 
2240 */ 2241 if (spa->spa_sync_on) { 2242 txg_sync_stop(spa->spa_dsl_pool); 2243 spa->spa_sync_on = B_FALSE; 2244 } 2245 2246 /* 2247 * This ensures that there is no async metaslab prefetching 2248 * while we attempt to unload the spa. 2249 */ 2250 taskq_wait(spa->spa_metaslab_taskq); 2251 2252 if (spa->spa_mmp.mmp_thread) 2253 mmp_thread_stop(spa); 2254 2255 /* 2256 * Wait for any outstanding async I/O to complete. 2257 */ 2258 if (spa->spa_async_zio_root != NULL) { 2259 for (int i = 0; i < max_ncpus; i++) 2260 (void) zio_wait(spa->spa_async_zio_root[i]); 2261 kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *)); 2262 spa->spa_async_zio_root = NULL; 2263 } 2264 2265 if (spa->spa_vdev_removal != NULL) { 2266 spa_vdev_removal_destroy(spa->spa_vdev_removal); 2267 spa->spa_vdev_removal = NULL; 2268 } 2269 2270 spa_destroy_aux_threads(spa); 2271 2272 spa_condense_fini(spa); 2273 2274 bpobj_close(&spa->spa_deferred_bpobj); 2275 2276 spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); 2277 2278 /* 2279 * Close all vdevs. 2280 */ 2281 if (spa->spa_root_vdev) 2282 vdev_free(spa->spa_root_vdev); 2283 ASSERT0P(spa->spa_root_vdev); 2284 2285 /* 2286 * Close the dsl pool. 2287 */ 2288 if (spa->spa_dsl_pool) { 2289 dsl_pool_close(spa->spa_dsl_pool); 2290 spa->spa_dsl_pool = NULL; 2291 spa->spa_meta_objset = NULL; 2292 } 2293 2294 ddt_unload(spa); 2295 brt_unload(spa); 2296 spa_unload_log_sm_metadata(spa); 2297 2298 /* 2299 * Drop and purge level 2 cache 2300 */ 2301 spa_l2cache_drop(spa); 2302 2303 if (spa->spa_spares.sav_vdevs) { 2304 for (int i = 0; i < spa->spa_spares.sav_count; i++) 2305 vdev_free(spa->spa_spares.sav_vdevs[i]); 2306 kmem_free(spa->spa_spares.sav_vdevs, 2307 spa->spa_spares.sav_count * sizeof (void *)); 2308 spa->spa_spares.sav_vdevs = NULL; 2309 } 2310 if (spa->spa_spares.sav_config) { 2311 nvlist_free(spa->spa_spares.sav_config); 2312 spa->spa_spares.sav_config = NULL; 2313 } 2314 spa->spa_spares.sav_count = 0; 2315 2316 if (spa->spa_l2cache.sav_vdevs) { 2317 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { 2318 vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); 2319 vdev_free(spa->spa_l2cache.sav_vdevs[i]); 2320 } 2321 kmem_free(spa->spa_l2cache.sav_vdevs, 2322 spa->spa_l2cache.sav_count * sizeof (void *)); 2323 spa->spa_l2cache.sav_vdevs = NULL; 2324 } 2325 if (spa->spa_l2cache.sav_config) { 2326 nvlist_free(spa->spa_l2cache.sav_config); 2327 spa->spa_l2cache.sav_config = NULL; 2328 } 2329 spa->spa_l2cache.sav_count = 0; 2330 2331 spa->spa_async_suspended = 0; 2332 2333 spa->spa_indirect_vdevs_loaded = B_FALSE; 2334 2335 if (spa->spa_comment != NULL) { 2336 spa_strfree(spa->spa_comment); 2337 spa->spa_comment = NULL; 2338 } 2339 if (spa->spa_compatibility != NULL) { 2340 spa_strfree(spa->spa_compatibility); 2341 spa->spa_compatibility = NULL; 2342 } 2343 2344 spa->spa_raidz_expand = NULL; 2345 spa->spa_checkpoint_txg = 0; 2346 2347 spa_config_exit(spa, SCL_ALL, spa); 2348 } 2349 2350 /* 2351 * Load (or re-load) the current list of vdevs describing the active spares for 2352 * this pool. When this is called, we have some form of basic information in 2353 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and 2354 * then re-generate a more complete list including status information. 
2355 */
2356 void
2357 spa_load_spares(spa_t *spa)
2358 {
2359 nvlist_t **spares;
2360 uint_t nspares;
2361 int i;
2362 vdev_t *vd, *tvd;
2363
2364 #ifndef _KERNEL
2365 /*
2366 * zdb opens both the current state of the pool and the
2367 * checkpointed state (if present), with a different spa_t.
2368 *
2369 * As spare vdevs are shared among open pools, we skip loading
2370 * them when we load the checkpointed state of the pool.
2371 */
2372 if (!spa_writeable(spa))
2373 return;
2374 #endif
2375
2376 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
2377
2378 /*
2379 * First, close and free any existing spare vdevs.
2380 */
2381 if (spa->spa_spares.sav_vdevs) {
2382 for (i = 0; i < spa->spa_spares.sav_count; i++) {
2383 vd = spa->spa_spares.sav_vdevs[i];
2384
2385 /* Undo the call to spa_activate() below */
2386 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
2387 B_FALSE)) != NULL && tvd->vdev_isspare)
2388 spa_spare_remove(tvd);
2389 vdev_close(vd);
2390 vdev_free(vd);
2391 }
2392
2393 kmem_free(spa->spa_spares.sav_vdevs,
2394 spa->spa_spares.sav_count * sizeof (void *));
2395 }
2396
2397 if (spa->spa_spares.sav_config == NULL)
2398 nspares = 0;
2399 else
2400 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
2401 ZPOOL_CONFIG_SPARES, &spares, &nspares));
2402
2403 spa->spa_spares.sav_count = (int)nspares;
2404 spa->spa_spares.sav_vdevs = NULL;
2405
2406 if (nspares == 0)
2407 return;
2408
2409 /*
2410 * Construct the array of vdevs, opening them to get status in the
2411 * process. For each spare, there are potentially two different vdev_t
2412 * structures associated with it: one in the list of spares (used only
2413 * for basic validation purposes) and one in the active vdev
2414 * configuration (if it's spared in). During this phase we open and
2415 * validate each vdev on the spare list. If the vdev also exists in the
2416 * active configuration, then we also mark this vdev as an active spare.
2417 */
2418 spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *),
2419 KM_SLEEP);
2420 for (i = 0; i < spa->spa_spares.sav_count; i++) {
2421 VERIFY0(spa_config_parse(spa, &vd, spares[i], NULL, 0,
2422 VDEV_ALLOC_SPARE));
2423 ASSERT(vd != NULL);
2424
2425 spa->spa_spares.sav_vdevs[i] = vd;
2426
2427 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
2428 B_FALSE)) != NULL) {
2429 if (!tvd->vdev_isspare)
2430 spa_spare_add(tvd);
2431
2432 /*
2433 * We only mark the spare active if we were successfully
2434 * able to load the vdev. Otherwise, importing a pool
2435 * with a bad active spare would result in strange
2436 * behavior, because multiple pools would think the spare
2437 * is actively in use.
2438 *
2439 * There is a vulnerability here to an equally bizarre
2440 * circumstance, where a dead active spare is later
2441 * brought back to life (onlined or otherwise). Given
2442 * the rarity of this scenario, and the extra complexity
2443 * it adds, we ignore the possibility.
2444 */
2445 if (!vdev_is_dead(tvd))
2446 spa_spare_activate(tvd);
2447 }
2448
2449 vd->vdev_top = vd;
2450 vd->vdev_aux = &spa->spa_spares;
2451
2452 if (vdev_open(vd) != 0)
2453 continue;
2454
2455 if (vdev_validate_aux(vd) == 0)
2456 spa_spare_add(vd);
2457 }
2458
2459 /*
2460 * Recompute the stashed list of spares, with status information
2461 * this time.
2462 */ 2463 fnvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES); 2464 2465 spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), 2466 KM_SLEEP); 2467 for (i = 0; i < spa->spa_spares.sav_count; i++) 2468 spares[i] = vdev_config_generate(spa, 2469 spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); 2470 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 2471 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, 2472 spa->spa_spares.sav_count); 2473 for (i = 0; i < spa->spa_spares.sav_count; i++) 2474 nvlist_free(spares[i]); 2475 kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); 2476 } 2477 2478 /* 2479 * Load (or re-load) the current list of vdevs describing the active l2cache for 2480 * this pool. When this is called, we have some form of basic information in 2481 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and 2482 * then re-generate a more complete list including status information. 2483 * Devices which are already active have their details maintained, and are 2484 * not re-opened. 2485 */ 2486 void 2487 spa_load_l2cache(spa_t *spa) 2488 { 2489 nvlist_t **l2cache = NULL; 2490 uint_t nl2cache; 2491 int i, j, oldnvdevs; 2492 uint64_t guid; 2493 vdev_t *vd, **oldvdevs, **newvdevs; 2494 spa_aux_vdev_t *sav = &spa->spa_l2cache; 2495 2496 #ifndef _KERNEL 2497 /* 2498 * zdb opens both the current state of the pool and the 2499 * checkpointed state (if present), with a different spa_t. 2500 * 2501 * As L2 caches are part of the ARC which is shared among open 2502 * pools, we skip loading them when we load the checkpointed 2503 * state of the pool. 2504 */ 2505 if (!spa_writeable(spa)) 2506 return; 2507 #endif 2508 2509 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 2510 2511 oldvdevs = sav->sav_vdevs; 2512 oldnvdevs = sav->sav_count; 2513 sav->sav_vdevs = NULL; 2514 sav->sav_count = 0; 2515 2516 if (sav->sav_config == NULL) { 2517 nl2cache = 0; 2518 newvdevs = NULL; 2519 goto out; 2520 } 2521 2522 VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, 2523 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); 2524 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 2525 2526 /* 2527 * Process new nvlist of vdevs. 2528 */ 2529 for (i = 0; i < nl2cache; i++) { 2530 guid = fnvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID); 2531 2532 newvdevs[i] = NULL; 2533 for (j = 0; j < oldnvdevs; j++) { 2534 vd = oldvdevs[j]; 2535 if (vd != NULL && guid == vd->vdev_guid) { 2536 /* 2537 * Retain previous vdev for add/remove ops. 2538 */ 2539 newvdevs[i] = vd; 2540 oldvdevs[j] = NULL; 2541 break; 2542 } 2543 } 2544 2545 if (newvdevs[i] == NULL) { 2546 /* 2547 * Create new vdev 2548 */ 2549 VERIFY0(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 2550 VDEV_ALLOC_L2CACHE)); 2551 ASSERT(vd != NULL); 2552 newvdevs[i] = vd; 2553 2554 /* 2555 * Commit this vdev as an l2cache device, 2556 * even if it fails to open. 2557 */ 2558 spa_l2cache_add(vd); 2559 2560 vd->vdev_top = vd; 2561 vd->vdev_aux = sav; 2562 2563 spa_l2cache_activate(vd); 2564 2565 if (vdev_open(vd) != 0) 2566 continue; 2567 2568 (void) vdev_validate_aux(vd); 2569 2570 if (!vdev_is_dead(vd)) 2571 l2arc_add_vdev(spa, vd); 2572 2573 /* 2574 * Upon cache device addition to a pool or pool 2575 * creation with a cache device or if the header 2576 * of the device is invalid we issue an async 2577 * TRIM command for the whole device which will 2578 * execute if l2arc_trim_ahead > 0. 
2579 */ 2580 spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); 2581 } 2582 } 2583 2584 sav->sav_vdevs = newvdevs; 2585 sav->sav_count = (int)nl2cache; 2586 2587 /* 2588 * Recompute the stashed list of l2cache devices, with status 2589 * information this time. 2590 */ 2591 fnvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE); 2592 2593 if (sav->sav_count > 0) 2594 l2cache = kmem_alloc(sav->sav_count * sizeof (void *), 2595 KM_SLEEP); 2596 for (i = 0; i < sav->sav_count; i++) 2597 l2cache[i] = vdev_config_generate(spa, 2598 sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); 2599 fnvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 2600 (const nvlist_t * const *)l2cache, sav->sav_count); 2601 2602 out: 2603 /* 2604 * Purge vdevs that were dropped 2605 */ 2606 if (oldvdevs) { 2607 for (i = 0; i < oldnvdevs; i++) { 2608 uint64_t pool; 2609 2610 vd = oldvdevs[i]; 2611 if (vd != NULL) { 2612 ASSERT(vd->vdev_isl2cache); 2613 2614 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 2615 pool != 0ULL && l2arc_vdev_present(vd)) 2616 l2arc_remove_vdev(vd); 2617 vdev_clear_stats(vd); 2618 vdev_free(vd); 2619 } 2620 } 2621 2622 kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 2623 } 2624 2625 for (i = 0; i < sav->sav_count; i++) 2626 nvlist_free(l2cache[i]); 2627 if (sav->sav_count) 2628 kmem_free(l2cache, sav->sav_count * sizeof (void *)); 2629 } 2630 2631 static int 2632 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 2633 { 2634 dmu_buf_t *db; 2635 char *packed = NULL; 2636 size_t nvsize = 0; 2637 int error; 2638 *value = NULL; 2639 2640 error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db); 2641 if (error) 2642 return (error); 2643 2644 nvsize = *(uint64_t *)db->db_data; 2645 dmu_buf_rele(db, FTAG); 2646 2647 packed = vmem_alloc(nvsize, KM_SLEEP); 2648 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, 2649 DMU_READ_PREFETCH); 2650 if (error == 0) 2651 error = nvlist_unpack(packed, nvsize, value, 0); 2652 vmem_free(packed, nvsize); 2653 2654 return (error); 2655 } 2656 2657 /* 2658 * Concrete top-level vdevs that are not missing and are not logs. At every 2659 * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds. 2660 */ 2661 static uint64_t 2662 spa_healthy_core_tvds(spa_t *spa) 2663 { 2664 vdev_t *rvd = spa->spa_root_vdev; 2665 uint64_t tvds = 0; 2666 2667 for (uint64_t i = 0; i < rvd->vdev_children; i++) { 2668 vdev_t *vd = rvd->vdev_child[i]; 2669 if (vd->vdev_islog) 2670 continue; 2671 if (vdev_is_concrete(vd) && !vdev_is_dead(vd)) 2672 tvds++; 2673 } 2674 2675 return (tvds); 2676 } 2677 2678 /* 2679 * Checks to see if the given vdev could not be opened, in which case we post a 2680 * sysevent to notify the autoreplace code that the device has been removed. 2681 */ 2682 static void 2683 spa_check_removed(vdev_t *vd) 2684 { 2685 for (uint64_t c = 0; c < vd->vdev_children; c++) 2686 spa_check_removed(vd->vdev_child[c]); 2687 2688 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && 2689 vdev_is_concrete(vd)) { 2690 zfs_post_autoreplace(vd->vdev_spa, vd); 2691 spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK); 2692 } 2693 } 2694 2695 static int 2696 spa_check_for_missing_logs(spa_t *spa) 2697 { 2698 vdev_t *rvd = spa->spa_root_vdev; 2699 2700 /* 2701 * If we're doing a normal import, then build up any additional 2702 * diagnostic information about missing log devices. 2703 * We'll pass this up to the user for further processing. 
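 * (If ZFS_IMPORT_MISSING_LOG was requested instead, we simply note the
 * missing devices, clear the log state, and continue the import with the
 * ZIL dropped.)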
2704 */ 2705 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { 2706 nvlist_t **child, *nv; 2707 uint64_t idx = 0; 2708 2709 child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t *), 2710 KM_SLEEP); 2711 nv = fnvlist_alloc(); 2712 2713 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 2714 vdev_t *tvd = rvd->vdev_child[c]; 2715 2716 /* 2717 * We consider a device as missing only if it failed 2718 * to open (i.e. offline or faulted is not considered 2719 * as missing). 2720 */ 2721 if (tvd->vdev_islog && 2722 tvd->vdev_state == VDEV_STATE_CANT_OPEN) { 2723 child[idx++] = vdev_config_generate(spa, tvd, 2724 B_FALSE, VDEV_CONFIG_MISSING); 2725 } 2726 } 2727 2728 if (idx > 0) { 2729 fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 2730 (const nvlist_t * const *)child, idx); 2731 fnvlist_add_nvlist(spa->spa_load_info, 2732 ZPOOL_CONFIG_MISSING_DEVICES, nv); 2733 2734 for (uint64_t i = 0; i < idx; i++) 2735 nvlist_free(child[i]); 2736 } 2737 nvlist_free(nv); 2738 kmem_free(child, rvd->vdev_children * sizeof (char **)); 2739 2740 if (idx > 0) { 2741 spa_load_failed(spa, "some log devices are missing"); 2742 vdev_dbgmsg_print_tree(rvd, 2); 2743 return (SET_ERROR(ENXIO)); 2744 } 2745 } else { 2746 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 2747 vdev_t *tvd = rvd->vdev_child[c]; 2748 2749 if (tvd->vdev_islog && 2750 tvd->vdev_state == VDEV_STATE_CANT_OPEN) { 2751 spa_set_log_state(spa, SPA_LOG_CLEAR); 2752 spa_load_note(spa, "some log devices are " 2753 "missing, ZIL is dropped."); 2754 vdev_dbgmsg_print_tree(rvd, 2); 2755 break; 2756 } 2757 } 2758 } 2759 2760 return (0); 2761 } 2762 2763 /* 2764 * Check for missing log devices 2765 */ 2766 static boolean_t 2767 spa_check_logs(spa_t *spa) 2768 { 2769 boolean_t rv = B_FALSE; 2770 dsl_pool_t *dp = spa_get_dsl(spa); 2771 2772 switch (spa->spa_log_state) { 2773 default: 2774 break; 2775 case SPA_LOG_MISSING: 2776 /* need to recheck in case slog has been restored */ 2777 case SPA_LOG_UNKNOWN: 2778 rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 2779 zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0); 2780 if (rv) 2781 spa_set_log_state(spa, SPA_LOG_MISSING); 2782 break; 2783 } 2784 return (rv); 2785 } 2786 2787 /* 2788 * Passivate any log vdevs (note, does not apply to embedded log metaslabs). 2789 */ 2790 static boolean_t 2791 spa_passivate_log(spa_t *spa) 2792 { 2793 vdev_t *rvd = spa->spa_root_vdev; 2794 boolean_t slog_found = B_FALSE; 2795 2796 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 2797 2798 for (int c = 0; c < rvd->vdev_children; c++) { 2799 vdev_t *tvd = rvd->vdev_child[c]; 2800 2801 if (tvd->vdev_islog) { 2802 ASSERT0P(tvd->vdev_log_mg); 2803 metaslab_group_passivate(tvd->vdev_mg); 2804 slog_found = B_TRUE; 2805 } 2806 } 2807 2808 return (slog_found); 2809 } 2810 2811 /* 2812 * Activate any log vdevs (note, does not apply to embedded log metaslabs). 
2813 */ 2814 static void 2815 spa_activate_log(spa_t *spa) 2816 { 2817 vdev_t *rvd = spa->spa_root_vdev; 2818 2819 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 2820 2821 for (int c = 0; c < rvd->vdev_children; c++) { 2822 vdev_t *tvd = rvd->vdev_child[c]; 2823 2824 if (tvd->vdev_islog) { 2825 ASSERT0P(tvd->vdev_log_mg); 2826 metaslab_group_activate(tvd->vdev_mg); 2827 } 2828 } 2829 } 2830 2831 int 2832 spa_reset_logs(spa_t *spa) 2833 { 2834 int error; 2835 2836 error = dmu_objset_find(spa_name(spa), zil_reset, 2837 NULL, DS_FIND_CHILDREN); 2838 if (error == 0) { 2839 /* 2840 * We successfully offlined the log device, sync out the 2841 * current txg so that the "stubby" block can be removed 2842 * by zil_sync(). 2843 */ 2844 txg_wait_synced(spa->spa_dsl_pool, 0); 2845 } 2846 return (error); 2847 } 2848 2849 static void 2850 spa_aux_check_removed(spa_aux_vdev_t *sav) 2851 { 2852 for (int i = 0; i < sav->sav_count; i++) 2853 spa_check_removed(sav->sav_vdevs[i]); 2854 } 2855 2856 void 2857 spa_claim_notify(zio_t *zio) 2858 { 2859 spa_t *spa = zio->io_spa; 2860 2861 if (zio->io_error) 2862 return; 2863 2864 mutex_enter(&spa->spa_props_lock); /* any mutex will do */ 2865 if (spa->spa_claim_max_txg < BP_GET_BIRTH(zio->io_bp)) 2866 spa->spa_claim_max_txg = BP_GET_BIRTH(zio->io_bp); 2867 mutex_exit(&spa->spa_props_lock); 2868 } 2869 2870 typedef struct spa_load_error { 2871 boolean_t sle_verify_data; 2872 uint64_t sle_meta_count; 2873 uint64_t sle_data_count; 2874 } spa_load_error_t; 2875 2876 static void 2877 spa_load_verify_done(zio_t *zio) 2878 { 2879 blkptr_t *bp = zio->io_bp; 2880 spa_load_error_t *sle = zio->io_private; 2881 dmu_object_type_t type = BP_GET_TYPE(bp); 2882 int error = zio->io_error; 2883 spa_t *spa = zio->io_spa; 2884 2885 abd_free(zio->io_abd); 2886 if (error) { 2887 if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && 2888 type != DMU_OT_INTENT_LOG) 2889 atomic_inc_64(&sle->sle_meta_count); 2890 else 2891 atomic_inc_64(&sle->sle_data_count); 2892 } 2893 2894 mutex_enter(&spa->spa_scrub_lock); 2895 spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp); 2896 cv_broadcast(&spa->spa_scrub_io_cv); 2897 mutex_exit(&spa->spa_scrub_lock); 2898 } 2899 2900 /* 2901 * Maximum number of inflight bytes is the log2 fraction of the arc size. 2902 * By default, we set it to 1/16th of the arc. 2903 */ 2904 static uint_t spa_load_verify_shift = 4; 2905 static int spa_load_verify_metadata = B_TRUE; 2906 static int spa_load_verify_data = B_TRUE; 2907 2908 static int 2909 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 2910 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 2911 { 2912 zio_t *rio = arg; 2913 spa_load_error_t *sle = rio->io_private; 2914 2915 (void) zilog, (void) dnp; 2916 2917 /* 2918 * Note: normally this routine will not be called if 2919 * spa_load_verify_metadata is not set. However, it may be useful 2920 * to manually set the flag after the traversal has begun. 2921 */ 2922 if (!spa_load_verify_metadata) 2923 return (0); 2924 2925 /* 2926 * Sanity check the block pointer in order to detect obvious damage 2927 * before using the contents in subsequent checks or in zio_read(). 2928 * When damaged consider it to be a metadata error since we cannot 2929 * trust the BP_GET_TYPE and BP_GET_LEVEL values. 
2930 */
2931 if (zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
2932 atomic_inc_64(&sle->sle_meta_count);
2933 return (0);
2934 }
2935
2936 if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) ||
2937 BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp))
2938 return (0);
2939
2940 if (!BP_IS_METADATA(bp) &&
2941 (!spa_load_verify_data || !sle->sle_verify_data))
2942 return (0);
2943
2944 uint64_t maxinflight_bytes =
2945 arc_target_bytes() >> spa_load_verify_shift;
2946 size_t size = BP_GET_PSIZE(bp);
2947
2948 mutex_enter(&spa->spa_scrub_lock);
2949 while (spa->spa_load_verify_bytes >= maxinflight_bytes)
2950 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
2951 spa->spa_load_verify_bytes += size;
2952 mutex_exit(&spa->spa_scrub_lock);
2953
2954 zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size,
2955 spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
2956 ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
2957 ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
2958 return (0);
2959 }
2960
2961 static int
2962 verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
2963 {
2964 (void) dp, (void) arg;
2965
2966 if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN)
2967 return (SET_ERROR(ENAMETOOLONG));
2968
2969 return (0);
2970 }
2971
2972 static int
2973 spa_load_verify(spa_t *spa)
2974 {
2975 zio_t *rio;
2976 spa_load_error_t sle = { 0 };
2977 zpool_load_policy_t policy;
2978 boolean_t verify_ok = B_FALSE;
2979 int error = 0;
2980
2981 zpool_get_load_policy(spa->spa_config, &policy);
2982
2983 if (policy.zlp_rewind & ZPOOL_NEVER_REWIND ||
2984 policy.zlp_maxmeta == UINT64_MAX)
2985 return (0);
2986
2987 dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
2988 error = dmu_objset_find_dp(spa->spa_dsl_pool,
2989 spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL,
2990 DS_FIND_CHILDREN);
2991 dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
2992 if (error != 0)
2993 return (error);
2994
2995 /*
2996 * Verify data only if we are rewinding or the error limit was set.
2997 * Otherwise nothing except dbgmsg cares about it, so don't waste time.
2998 */
2999 sle.sle_verify_data = (policy.zlp_rewind & ZPOOL_REWIND_MASK) ||
3000 (policy.zlp_maxdata < UINT64_MAX);
3001
3002 rio = zio_root(spa, NULL, &sle,
3003 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
3004
3005 if (spa_load_verify_metadata) {
3006 if (spa->spa_extreme_rewind) {
3007 spa_load_note(spa, "performing a complete scan of the "
3008 "pool since extreme rewind is on. 
This may take " 3009 "a very long time.\n (spa_load_verify_data=%u, " 3010 "spa_load_verify_metadata=%u)", 3011 spa_load_verify_data, spa_load_verify_metadata); 3012 } 3013 3014 error = traverse_pool(spa, spa->spa_verify_min_txg, 3015 TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | 3016 TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio); 3017 } 3018 3019 (void) zio_wait(rio); 3020 ASSERT0(spa->spa_load_verify_bytes); 3021 3022 spa->spa_load_meta_errors = sle.sle_meta_count; 3023 spa->spa_load_data_errors = sle.sle_data_count; 3024 3025 if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) { 3026 spa_load_note(spa, "spa_load_verify found %llu metadata errors " 3027 "and %llu data errors", (u_longlong_t)sle.sle_meta_count, 3028 (u_longlong_t)sle.sle_data_count); 3029 } 3030 3031 if (spa_load_verify_dryrun || 3032 (!error && sle.sle_meta_count <= policy.zlp_maxmeta && 3033 sle.sle_data_count <= policy.zlp_maxdata)) { 3034 int64_t loss = 0; 3035 3036 verify_ok = B_TRUE; 3037 spa->spa_load_txg = spa->spa_uberblock.ub_txg; 3038 spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; 3039 3040 loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; 3041 fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_TIME, 3042 spa->spa_load_txg_ts); 3043 fnvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME, 3044 loss); 3045 fnvlist_add_uint64(spa->spa_load_info, 3046 ZPOOL_CONFIG_LOAD_META_ERRORS, sle.sle_meta_count); 3047 fnvlist_add_uint64(spa->spa_load_info, 3048 ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count); 3049 } else { 3050 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; 3051 } 3052 3053 if (spa_load_verify_dryrun) 3054 return (0); 3055 3056 if (error) { 3057 if (error != ENXIO && error != EIO) 3058 error = SET_ERROR(EIO); 3059 return (error); 3060 } 3061 3062 return (verify_ok ? 0 : EIO); 3063 } 3064 3065 /* 3066 * Find a value in the pool props object. 3067 */ 3068 static void 3069 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 3070 { 3071 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 3072 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 3073 } 3074 3075 /* 3076 * Find a value in the pool directory object. 
3077 */ 3078 static int 3079 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent) 3080 { 3081 int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 3082 name, sizeof (uint64_t), 1, val); 3083 3084 if (error != 0 && (error != ENOENT || log_enoent)) { 3085 spa_load_failed(spa, "couldn't get '%s' value in MOS directory " 3086 "[error=%d]", name, error); 3087 } 3088 3089 return (error); 3090 } 3091 3092 static int 3093 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 3094 { 3095 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 3096 return (SET_ERROR(err)); 3097 } 3098 3099 boolean_t 3100 spa_livelist_delete_check(spa_t *spa) 3101 { 3102 return (spa->spa_livelists_to_delete != 0); 3103 } 3104 3105 static boolean_t 3106 spa_livelist_delete_cb_check(void *arg, zthr_t *z) 3107 { 3108 (void) z; 3109 spa_t *spa = arg; 3110 return (spa_livelist_delete_check(spa)); 3111 } 3112 3113 static int 3114 delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 3115 { 3116 spa_t *spa = arg; 3117 zio_free(spa, tx->tx_txg, bp); 3118 dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, 3119 -bp_get_dsize_sync(spa, bp), 3120 -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); 3121 return (0); 3122 } 3123 3124 static int 3125 dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp) 3126 { 3127 int err; 3128 zap_cursor_t zc; 3129 zap_attribute_t *za = zap_attribute_alloc(); 3130 zap_cursor_init(&zc, os, zap_obj); 3131 err = zap_cursor_retrieve(&zc, za); 3132 zap_cursor_fini(&zc); 3133 if (err == 0) 3134 *llp = za->za_first_integer; 3135 zap_attribute_free(za); 3136 return (err); 3137 } 3138 3139 /* 3140 * Components of livelist deletion that must be performed in syncing 3141 * context: freeing block pointers and updating the pool-wide data 3142 * structures to indicate how much work is left to do 3143 */ 3144 typedef struct sublist_delete_arg { 3145 spa_t *spa; 3146 dsl_deadlist_t *ll; 3147 uint64_t key; 3148 bplist_t *to_free; 3149 } sublist_delete_arg_t; 3150 3151 static void 3152 sublist_delete_sync(void *arg, dmu_tx_t *tx) 3153 { 3154 sublist_delete_arg_t *sda = arg; 3155 spa_t *spa = sda->spa; 3156 dsl_deadlist_t *ll = sda->ll; 3157 uint64_t key = sda->key; 3158 bplist_t *to_free = sda->to_free; 3159 3160 bplist_iterate(to_free, delete_blkptr_cb, spa, tx); 3161 dsl_deadlist_remove_entry(ll, key, tx); 3162 } 3163 3164 typedef struct livelist_delete_arg { 3165 spa_t *spa; 3166 uint64_t ll_obj; 3167 uint64_t zap_obj; 3168 } livelist_delete_arg_t; 3169 3170 static void 3171 livelist_delete_sync(void *arg, dmu_tx_t *tx) 3172 { 3173 livelist_delete_arg_t *lda = arg; 3174 spa_t *spa = lda->spa; 3175 uint64_t ll_obj = lda->ll_obj; 3176 uint64_t zap_obj = lda->zap_obj; 3177 objset_t *mos = spa->spa_meta_objset; 3178 uint64_t count; 3179 3180 /* free the livelist and decrement the feature count */ 3181 VERIFY0(zap_remove_int(mos, zap_obj, ll_obj, tx)); 3182 dsl_deadlist_free(mos, ll_obj, tx); 3183 spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx); 3184 VERIFY0(zap_count(mos, zap_obj, &count)); 3185 if (count == 0) { 3186 /* no more livelists to delete */ 3187 VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, 3188 DMU_POOL_DELETED_CLONES, tx)); 3189 VERIFY0(zap_destroy(mos, zap_obj, tx)); 3190 spa->spa_livelists_to_delete = 0; 3191 spa_notify_waiters(spa); 3192 } 3193 } 3194 3195 /* 3196 * Load in the value for the livelist to be removed and open it. 
Then, 3197 * load its first sublist and determine which block pointers should actually 3198 * be freed. Then, call a synctask which performs the actual frees and updates 3199 * the pool-wide livelist data. 3200 */ 3201 static void 3202 spa_livelist_delete_cb(void *arg, zthr_t *z) 3203 { 3204 spa_t *spa = arg; 3205 uint64_t ll_obj = 0, count; 3206 objset_t *mos = spa->spa_meta_objset; 3207 uint64_t zap_obj = spa->spa_livelists_to_delete; 3208 /* 3209 * Determine the next livelist to delete. This function should only 3210 * be called if there is at least one deleted clone. 3211 */ 3212 VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj)); 3213 VERIFY0(zap_count(mos, ll_obj, &count)); 3214 if (count > 0) { 3215 dsl_deadlist_t *ll; 3216 dsl_deadlist_entry_t *dle; 3217 bplist_t to_free; 3218 ll = kmem_zalloc(sizeof (dsl_deadlist_t), KM_SLEEP); 3219 VERIFY0(dsl_deadlist_open(ll, mos, ll_obj)); 3220 dle = dsl_deadlist_first(ll); 3221 ASSERT3P(dle, !=, NULL); 3222 bplist_create(&to_free); 3223 int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free, 3224 z, NULL); 3225 if (err == 0) { 3226 sublist_delete_arg_t sync_arg = { 3227 .spa = spa, 3228 .ll = ll, 3229 .key = dle->dle_mintxg, 3230 .to_free = &to_free 3231 }; 3232 zfs_dbgmsg("deleting sublist (id %llu) from" 3233 " livelist %llu, %lld remaining", 3234 (u_longlong_t)dle->dle_bpobj.bpo_object, 3235 (u_longlong_t)ll_obj, (longlong_t)count - 1); 3236 VERIFY0(dsl_sync_task(spa_name(spa), NULL, 3237 sublist_delete_sync, &sync_arg, 0, 3238 ZFS_SPACE_CHECK_DESTROY)); 3239 } else { 3240 VERIFY3U(err, ==, EINTR); 3241 } 3242 bplist_clear(&to_free); 3243 bplist_destroy(&to_free); 3244 dsl_deadlist_close(ll); 3245 kmem_free(ll, sizeof (dsl_deadlist_t)); 3246 } else { 3247 livelist_delete_arg_t sync_arg = { 3248 .spa = spa, 3249 .ll_obj = ll_obj, 3250 .zap_obj = zap_obj 3251 }; 3252 zfs_dbgmsg("deletion of livelist %llu completed", 3253 (u_longlong_t)ll_obj); 3254 VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync, 3255 &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY)); 3256 } 3257 } 3258 3259 static void 3260 spa_start_livelist_destroy_thread(spa_t *spa) 3261 { 3262 ASSERT0P(spa->spa_livelist_delete_zthr); 3263 spa->spa_livelist_delete_zthr = 3264 zthr_create("z_livelist_destroy", 3265 spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa, 3266 minclsyspri); 3267 } 3268 3269 typedef struct livelist_new_arg { 3270 bplist_t *allocs; 3271 bplist_t *frees; 3272 } livelist_new_arg_t; 3273 3274 static int 3275 livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 3276 dmu_tx_t *tx) 3277 { 3278 ASSERT0P(tx); 3279 livelist_new_arg_t *lna = arg; 3280 if (bp_freed) { 3281 bplist_append(lna->frees, bp); 3282 } else { 3283 bplist_append(lna->allocs, bp); 3284 zfs_livelist_condense_new_alloc++; 3285 } 3286 return (0); 3287 } 3288 3289 typedef struct livelist_condense_arg { 3290 spa_t *spa; 3291 bplist_t to_keep; 3292 uint64_t first_size; 3293 uint64_t next_size; 3294 } livelist_condense_arg_t; 3295 3296 static void 3297 spa_livelist_condense_sync(void *arg, dmu_tx_t *tx) 3298 { 3299 livelist_condense_arg_t *lca = arg; 3300 spa_t *spa = lca->spa; 3301 bplist_t new_frees; 3302 dsl_dataset_t *ds = spa->spa_to_condense.ds; 3303 3304 /* Have we been cancelled? 
*/ 3305 if (spa->spa_to_condense.cancelled) { 3306 zfs_livelist_condense_sync_cancel++; 3307 goto out; 3308 } 3309 3310 dsl_deadlist_entry_t *first = spa->spa_to_condense.first; 3311 dsl_deadlist_entry_t *next = spa->spa_to_condense.next; 3312 dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist; 3313 3314 /* 3315 * It's possible that the livelist was changed while the zthr was 3316 * running. Therefore, we need to check for new blkptrs in the two 3317 * entries being condensed and continue to track them in the livelist. 3318 * Because of the way we handle remapped blkptrs (see dbuf_remap_impl), 3319 * it's possible that the newly added blkptrs are FREEs or ALLOCs so 3320 * we need to sort them into two different bplists. 3321 */ 3322 uint64_t first_obj = first->dle_bpobj.bpo_object; 3323 uint64_t next_obj = next->dle_bpobj.bpo_object; 3324 uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs; 3325 uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs; 3326 3327 bplist_create(&new_frees); 3328 livelist_new_arg_t new_bps = { 3329 .allocs = &lca->to_keep, 3330 .frees = &new_frees, 3331 }; 3332 3333 if (cur_first_size > lca->first_size) { 3334 VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj, 3335 livelist_track_new_cb, &new_bps, lca->first_size)); 3336 } 3337 if (cur_next_size > lca->next_size) { 3338 VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj, 3339 livelist_track_new_cb, &new_bps, lca->next_size)); 3340 } 3341 3342 dsl_deadlist_clear_entry(first, ll, tx); 3343 ASSERT(bpobj_is_empty(&first->dle_bpobj)); 3344 dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx); 3345 3346 bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx); 3347 bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx); 3348 bplist_destroy(&new_frees); 3349 3350 char dsname[ZFS_MAX_DATASET_NAME_LEN]; 3351 dsl_dataset_name(ds, dsname); 3352 zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu " 3353 "(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu " 3354 "(%llu blkptrs)", (u_longlong_t)tx->tx_txg, dsname, 3355 (u_longlong_t)ds->ds_object, (u_longlong_t)first_obj, 3356 (u_longlong_t)cur_first_size, (u_longlong_t)next_obj, 3357 (u_longlong_t)cur_next_size, 3358 (u_longlong_t)first->dle_bpobj.bpo_object, 3359 (u_longlong_t)first->dle_bpobj.bpo_phys->bpo_num_blkptrs); 3360 out: 3361 dmu_buf_rele(ds->ds_dbuf, spa); 3362 spa->spa_to_condense.ds = NULL; 3363 bplist_clear(&lca->to_keep); 3364 bplist_destroy(&lca->to_keep); 3365 kmem_free(lca, sizeof (livelist_condense_arg_t)); 3366 spa->spa_to_condense.syncing = B_FALSE; 3367 } 3368 3369 static void 3370 spa_livelist_condense_cb(void *arg, zthr_t *t) 3371 { 3372 while (zfs_livelist_condense_zthr_pause && 3373 !(zthr_has_waiters(t) || zthr_iscancelled(t))) 3374 delay(1); 3375 3376 spa_t *spa = arg; 3377 dsl_deadlist_entry_t *first = spa->spa_to_condense.first; 3378 dsl_deadlist_entry_t *next = spa->spa_to_condense.next; 3379 uint64_t first_size, next_size; 3380 3381 livelist_condense_arg_t *lca = 3382 kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP); 3383 bplist_create(&lca->to_keep); 3384 3385 /* 3386 * Process the livelists (matching FREEs and ALLOCs) in open context 3387 * so we have minimal work in syncing context to condense. 3388 * 3389 * We save bpobj sizes (first_size and next_size) to use later in 3390 * syncing context to determine if entries were added to these sublists 3391 * while in open context. 
This is possible because the clone is still
3392 * active and open for normal writes and we want to make sure the new,
3393 * unprocessed blockpointers are inserted into the livelist normally.
3394 *
3395 * Note that dsl_process_sub_livelist() both stores the size (number of
3396 * blockpointers) and iterates over the entries while the bpobj's lock is
3397 * held, so the sizes returned to us are consistent with what was actually
3398 * processed.
3399 */
3400 int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t,
3401 &first_size);
3402 if (err == 0)
3403 err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep,
3404 t, &next_size);
3405
3406 if (err == 0) {
3407 while (zfs_livelist_condense_sync_pause &&
3408 !(zthr_has_waiters(t) || zthr_iscancelled(t)))
3409 delay(1);
3410
3411 dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
3412 dmu_tx_mark_netfree(tx);
3413 dmu_tx_hold_space(tx, 1);
3414 err = dmu_tx_assign(tx, DMU_TX_NOWAIT | DMU_TX_NOTHROTTLE);
3415 if (err == 0) {
3416 /*
3417 * Prevent the condense zthr from restarting before
3418 * the synctask completes.
3419 */
3420 spa->spa_to_condense.syncing = B_TRUE;
3421 lca->spa = spa;
3422 lca->first_size = first_size;
3423 lca->next_size = next_size;
3424 dsl_sync_task_nowait(spa_get_dsl(spa),
3425 spa_livelist_condense_sync, lca, tx);
3426 dmu_tx_commit(tx);
3427 return;
3428 }
3429 }
3430 /*
3431 * Condensing cannot continue: either it was externally stopped or
3432 * we were unable to assign the tx because the pool has run out of
3433 * space. In the second case, we'll just end up trying to condense
3434 * again in a later txg.
3435 */
3436 ASSERT(err != 0);
3437 bplist_clear(&lca->to_keep);
3438 bplist_destroy(&lca->to_keep);
3439 kmem_free(lca, sizeof (livelist_condense_arg_t));
3440 dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa);
3441 spa->spa_to_condense.ds = NULL;
3442 if (err == EINTR)
3443 zfs_livelist_condense_zthr_cancel++;
3444 }
3445
3446 /*
3447 * Check that there is something to condense but that a condense is not
3448 * already in progress and that condensing has not been cancelled.
3449 */ 3450 static boolean_t 3451 spa_livelist_condense_cb_check(void *arg, zthr_t *z) 3452 { 3453 (void) z; 3454 spa_t *spa = arg; 3455 if ((spa->spa_to_condense.ds != NULL) && 3456 (spa->spa_to_condense.syncing == B_FALSE) && 3457 (spa->spa_to_condense.cancelled == B_FALSE)) { 3458 return (B_TRUE); 3459 } 3460 return (B_FALSE); 3461 } 3462 3463 static void 3464 spa_start_livelist_condensing_thread(spa_t *spa) 3465 { 3466 spa->spa_to_condense.ds = NULL; 3467 spa->spa_to_condense.first = NULL; 3468 spa->spa_to_condense.next = NULL; 3469 spa->spa_to_condense.syncing = B_FALSE; 3470 spa->spa_to_condense.cancelled = B_FALSE; 3471 3472 ASSERT0P(spa->spa_livelist_condense_zthr); 3473 spa->spa_livelist_condense_zthr = 3474 zthr_create("z_livelist_condense", 3475 spa_livelist_condense_cb_check, 3476 spa_livelist_condense_cb, spa, minclsyspri); 3477 } 3478 3479 static void 3480 spa_spawn_aux_threads(spa_t *spa) 3481 { 3482 ASSERT(spa_writeable(spa)); 3483 3484 spa_start_raidz_expansion_thread(spa); 3485 spa_start_indirect_condensing_thread(spa); 3486 spa_start_livelist_destroy_thread(spa); 3487 spa_start_livelist_condensing_thread(spa); 3488 3489 ASSERT0P(spa->spa_checkpoint_discard_zthr); 3490 spa->spa_checkpoint_discard_zthr = 3491 zthr_create("z_checkpoint_discard", 3492 spa_checkpoint_discard_thread_check, 3493 spa_checkpoint_discard_thread, spa, minclsyspri); 3494 } 3495 3496 /* 3497 * Fix up config after a partly-completed split. This is done with the 3498 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 3499 * pool have that entry in their config, but only the splitting one contains 3500 * a list of all the guids of the vdevs that are being split off. 3501 * 3502 * This function determines what to do with that list: either rejoin 3503 * all the disks to the pool, or complete the splitting process. To attempt 3504 * the rejoin, each disk that is offlined is marked online again, and 3505 * we do a reopen() call. If the vdev label for every disk that was 3506 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 3507 * then we call vdev_split() on each disk, and complete the split. 3508 * 3509 * Otherwise we leave the config alone, with all the vdevs in place in 3510 * the original pool. 3511 */ 3512 static void 3513 spa_try_repair(spa_t *spa, nvlist_t *config) 3514 { 3515 uint_t extracted; 3516 uint64_t *glist; 3517 uint_t i, gcount; 3518 nvlist_t *nvl; 3519 vdev_t **vd; 3520 boolean_t attempt_reopen; 3521 3522 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 3523 return; 3524 3525 /* check that the config is complete */ 3526 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 3527 &glist, &gcount) != 0) 3528 return; 3529 3530 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 3531 3532 /* attempt to online all the vdevs & validate */ 3533 attempt_reopen = B_TRUE; 3534 for (i = 0; i < gcount; i++) { 3535 if (glist[i] == 0) /* vdev is hole */ 3536 continue; 3537 3538 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 3539 if (vd[i] == NULL) { 3540 /* 3541 * Don't bother attempting to reopen the disks; 3542 * just do the split. 
3543 */ 3544 attempt_reopen = B_FALSE; 3545 } else { 3546 /* attempt to re-online it */ 3547 vd[i]->vdev_offline = B_FALSE; 3548 } 3549 } 3550 3551 if (attempt_reopen) { 3552 vdev_reopen(spa->spa_root_vdev); 3553 3554 /* check each device to see what state it's in */ 3555 for (extracted = 0, i = 0; i < gcount; i++) { 3556 if (vd[i] != NULL && 3557 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 3558 break; 3559 ++extracted; 3560 } 3561 } 3562 3563 /* 3564 * If every disk has been moved to the new pool, or if we never 3565 * even attempted to look at them, then we split them off for 3566 * good. 3567 */ 3568 if (!attempt_reopen || gcount == extracted) { 3569 for (i = 0; i < gcount; i++) 3570 if (vd[i] != NULL) 3571 vdev_split(vd[i]); 3572 vdev_reopen(spa->spa_root_vdev); 3573 } 3574 3575 kmem_free(vd, gcount * sizeof (vdev_t *)); 3576 } 3577 3578 static int 3579 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) 3580 { 3581 const char *ereport = FM_EREPORT_ZFS_POOL; 3582 int error; 3583 3584 spa->spa_load_state = state; 3585 (void) spa_import_progress_set_state(spa_guid(spa), 3586 spa_load_state(spa)); 3587 spa_import_progress_set_notes(spa, "spa_load()"); 3588 3589 gethrestime(&spa->spa_loaded_ts); 3590 error = spa_load_impl(spa, type, &ereport); 3591 3592 /* 3593 * Don't count references from objsets that are already closed 3594 * and are making their way through the eviction process. 3595 */ 3596 spa_evicting_os_wait(spa); 3597 spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); 3598 if (error) { 3599 if (error != EEXIST) { 3600 spa->spa_loaded_ts.tv_sec = 0; 3601 spa->spa_loaded_ts.tv_nsec = 0; 3602 } 3603 if (error != EBADF) { 3604 (void) zfs_ereport_post(ereport, spa, 3605 NULL, NULL, NULL, 0); 3606 } 3607 } 3608 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 3609 spa->spa_ena = 0; 3610 3611 (void) spa_import_progress_set_state(spa_guid(spa), 3612 spa_load_state(spa)); 3613 3614 return (error); 3615 } 3616 3617 #ifdef ZFS_DEBUG 3618 /* 3619 * Count the number of per-vdev ZAPs associated with all of the vdevs in the 3620 * vdev tree rooted in the given vd, and ensure that each ZAP is present in the 3621 * spa's per-vdev ZAP list. 3622 */ 3623 static uint64_t 3624 vdev_count_verify_zaps(vdev_t *vd) 3625 { 3626 spa_t *spa = vd->vdev_spa; 3627 uint64_t total = 0; 3628 3629 if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2) && 3630 vd->vdev_root_zap != 0) { 3631 total++; 3632 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 3633 spa->spa_all_vdev_zaps, vd->vdev_root_zap)); 3634 } 3635 if (vd->vdev_top_zap != 0) { 3636 total++; 3637 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 3638 spa->spa_all_vdev_zaps, vd->vdev_top_zap)); 3639 } 3640 if (vd->vdev_leaf_zap != 0) { 3641 total++; 3642 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 3643 spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); 3644 } 3645 3646 for (uint64_t i = 0; i < vd->vdev_children; i++) { 3647 total += vdev_count_verify_zaps(vd->vdev_child[i]); 3648 } 3649 3650 return (total); 3651 } 3652 #else 3653 #define vdev_count_verify_zaps(vd) ((void) sizeof (vd), 0) 3654 #endif 3655 3656 /* 3657 * Determine whether the activity check is required. 
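 * The check is skipped when ZFS_IMPORT_SKIP_MMP is set (e.g. by zdb),
 * when multihost is not enabled on the pool, when the uberblock matches
 * the results of an earlier tryimport, when the pool was last imported
 * by this host, or when it was cleanly exported.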
3658 */ 3659 static boolean_t 3660 spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label, 3661 nvlist_t *config) 3662 { 3663 uint64_t state = 0; 3664 uint64_t hostid = 0; 3665 uint64_t tryconfig_txg = 0; 3666 uint64_t tryconfig_timestamp = 0; 3667 uint16_t tryconfig_mmp_seq = 0; 3668 nvlist_t *nvinfo; 3669 3670 if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { 3671 nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); 3672 (void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG, 3673 &tryconfig_txg); 3674 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 3675 &tryconfig_timestamp); 3676 (void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ, 3677 &tryconfig_mmp_seq); 3678 } 3679 3680 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state); 3681 3682 /* 3683 * Disable the MMP activity check - This is used by zdb which 3684 * is intended to be used on potentially active pools. 3685 */ 3686 if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) 3687 return (B_FALSE); 3688 3689 /* 3690 * Skip the activity check when the MMP feature is disabled. 3691 */ 3692 if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0) 3693 return (B_FALSE); 3694 3695 /* 3696 * If the tryconfig_ values are nonzero, they are the results of an 3697 * earlier tryimport. If they all match the uberblock we just found, 3698 * then the pool has not changed and we return false so we do not test 3699 * a second time. 3700 */ 3701 if (tryconfig_txg && tryconfig_txg == ub->ub_txg && 3702 tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp && 3703 tryconfig_mmp_seq && tryconfig_mmp_seq == 3704 (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) 3705 return (B_FALSE); 3706 3707 /* 3708 * Allow the activity check to be skipped when importing the pool 3709 * on the same host which last imported it. Since the hostid from 3710 * configuration may be stale use the one read from the label. 3711 */ 3712 if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID)) 3713 hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID); 3714 3715 if (hostid == spa_get_hostid(spa)) 3716 return (B_FALSE); 3717 3718 /* 3719 * Skip the activity test when the pool was cleanly exported. 3720 */ 3721 if (state != POOL_STATE_ACTIVE) 3722 return (B_FALSE); 3723 3724 return (B_TRUE); 3725 } 3726 3727 /* 3728 * Nanoseconds the activity check must watch for changes on-disk. 3729 */ 3730 static uint64_t 3731 spa_activity_check_duration(spa_t *spa, uberblock_t *ub) 3732 { 3733 uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1); 3734 uint64_t multihost_interval = MSEC2NSEC( 3735 MMP_INTERVAL_OK(zfs_multihost_interval)); 3736 uint64_t import_delay = MAX(NANOSEC, import_intervals * 3737 multihost_interval); 3738 3739 /* 3740 * Local tunables determine a minimum duration except for the case 3741 * where we know when the remote host will suspend the pool if MMP 3742 * writes do not land. 3743 * 3744 * See Big Theory comment at the top of mmp.c for the reasoning behind 3745 * these cases and times. 
3746 */ 3747 3748 ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100); 3749 3750 if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && 3751 MMP_FAIL_INT(ub) > 0) { 3752 3753 /* MMP on remote host will suspend pool after failed writes */ 3754 import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) * 3755 MMP_IMPORT_SAFETY_FACTOR / 100; 3756 3757 zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp " 3758 "mmp_fails=%llu ub_mmp mmp_interval=%llu " 3759 "import_intervals=%llu", (u_longlong_t)import_delay, 3760 (u_longlong_t)MMP_FAIL_INT(ub), 3761 (u_longlong_t)MMP_INTERVAL(ub), 3762 (u_longlong_t)import_intervals); 3763 3764 } else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && 3765 MMP_FAIL_INT(ub) == 0) { 3766 3767 /* MMP on remote host will never suspend pool */ 3768 import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) + 3769 ub->ub_mmp_delay) * import_intervals); 3770 3771 zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp " 3772 "mmp_interval=%llu ub_mmp_delay=%llu " 3773 "import_intervals=%llu", (u_longlong_t)import_delay, 3774 (u_longlong_t)MMP_INTERVAL(ub), 3775 (u_longlong_t)ub->ub_mmp_delay, 3776 (u_longlong_t)import_intervals); 3777 3778 } else if (MMP_VALID(ub)) { 3779 /* 3780 * zfs-0.7 compatibility case 3781 */ 3782 3783 import_delay = MAX(import_delay, (multihost_interval + 3784 ub->ub_mmp_delay) * import_intervals); 3785 3786 zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu " 3787 "import_intervals=%llu leaves=%u", 3788 (u_longlong_t)import_delay, 3789 (u_longlong_t)ub->ub_mmp_delay, 3790 (u_longlong_t)import_intervals, 3791 vdev_count_leaves(spa)); 3792 } else { 3793 /* Using local tunings is the only reasonable option */ 3794 zfs_dbgmsg("pool last imported on non-MMP aware " 3795 "host using import_delay=%llu multihost_interval=%llu " 3796 "import_intervals=%llu", (u_longlong_t)import_delay, 3797 (u_longlong_t)multihost_interval, 3798 (u_longlong_t)import_intervals); 3799 } 3800 3801 return (import_delay); 3802 } 3803 3804 /* 3805 * Remote host activity check. 3806 * 3807 * error results: 3808 * 0 - no activity detected 3809 * EREMOTEIO - remote activity detected 3810 * EINTR - user canceled the operation 3811 */ 3812 static int 3813 spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config, 3814 boolean_t importing) 3815 { 3816 uint64_t txg = ub->ub_txg; 3817 uint64_t timestamp = ub->ub_timestamp; 3818 uint64_t mmp_config = ub->ub_mmp_config; 3819 uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0; 3820 uint64_t import_delay; 3821 hrtime_t import_expire, now; 3822 nvlist_t *mmp_label = NULL; 3823 vdev_t *rvd = spa->spa_root_vdev; 3824 kcondvar_t cv; 3825 kmutex_t mtx; 3826 int error = 0; 3827 3828 cv_init(&cv, NULL, CV_DEFAULT, NULL); 3829 mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL); 3830 mutex_enter(&mtx); 3831 3832 /* 3833 * If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed 3834 * during the earlier tryimport. If the txg recorded there is 0 then 3835 * the pool is known to be active on another host. 3836 * 3837 * Otherwise, the pool might be in use on another host. Check for 3838 * changes in the uberblocks on disk if necessary. 
3839 */ 3840 if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { 3841 nvlist_t *nvinfo = fnvlist_lookup_nvlist(config, 3842 ZPOOL_CONFIG_LOAD_INFO); 3843 3844 if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) && 3845 fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) { 3846 vdev_uberblock_load(rvd, ub, &mmp_label); 3847 error = SET_ERROR(EREMOTEIO); 3848 goto out; 3849 } 3850 } 3851 3852 import_delay = spa_activity_check_duration(spa, ub); 3853 3854 /* Add a small random factor in case of simultaneous imports (0-25%) */ 3855 import_delay += import_delay * random_in_range(250) / 1000; 3856 3857 import_expire = gethrtime() + import_delay; 3858 3859 if (importing) { 3860 spa_import_progress_set_notes(spa, "Checking MMP activity, " 3861 "waiting %llu ms", (u_longlong_t)NSEC2MSEC(import_delay)); 3862 } 3863 3864 int iterations = 0; 3865 while ((now = gethrtime()) < import_expire) { 3866 if (importing && iterations++ % 30 == 0) { 3867 spa_import_progress_set_notes(spa, "Checking MMP " 3868 "activity, %llu ms remaining", 3869 (u_longlong_t)NSEC2MSEC(import_expire - now)); 3870 } 3871 3872 if (importing) { 3873 (void) spa_import_progress_set_mmp_check(spa_guid(spa), 3874 NSEC2SEC(import_expire - gethrtime())); 3875 } 3876 3877 vdev_uberblock_load(rvd, ub, &mmp_label); 3878 3879 if (txg != ub->ub_txg || timestamp != ub->ub_timestamp || 3880 mmp_seq != (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) { 3881 zfs_dbgmsg("multihost activity detected " 3882 "txg %llu ub_txg %llu " 3883 "timestamp %llu ub_timestamp %llu " 3884 "mmp_config %#llx ub_mmp_config %#llx", 3885 (u_longlong_t)txg, (u_longlong_t)ub->ub_txg, 3886 (u_longlong_t)timestamp, 3887 (u_longlong_t)ub->ub_timestamp, 3888 (u_longlong_t)mmp_config, 3889 (u_longlong_t)ub->ub_mmp_config); 3890 3891 error = SET_ERROR(EREMOTEIO); 3892 break; 3893 } 3894 3895 if (mmp_label) { 3896 nvlist_free(mmp_label); 3897 mmp_label = NULL; 3898 } 3899 3900 error = cv_timedwait_sig(&cv, &mtx, ddi_get_lbolt() + hz); 3901 if (error != -1) { 3902 error = SET_ERROR(EINTR); 3903 break; 3904 } 3905 error = 0; 3906 } 3907 3908 out: 3909 mutex_exit(&mtx); 3910 mutex_destroy(&mtx); 3911 cv_destroy(&cv); 3912 3913 /* 3914 * If the pool is determined to be active store the status in the 3915 * spa->spa_load_info nvlist. If the remote hostname or hostid are 3916 * available from configuration read from disk store them as well. 3917 * This allows 'zpool import' to generate a more useful message. 
3918 * 3919 * ZPOOL_CONFIG_MMP_STATE - observed pool status (mandatory) 3920 * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool 3921 * ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool 3922 */ 3923 if (error == EREMOTEIO) { 3924 if (mmp_label) { 3925 if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) { 3926 const char *hostname = fnvlist_lookup_string( 3927 mmp_label, ZPOOL_CONFIG_HOSTNAME); 3928 fnvlist_add_string(spa->spa_load_info, 3929 ZPOOL_CONFIG_MMP_HOSTNAME, hostname); 3930 } 3931 3932 if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) { 3933 uint64_t hostid = fnvlist_lookup_uint64( 3934 mmp_label, ZPOOL_CONFIG_HOSTID); 3935 fnvlist_add_uint64(spa->spa_load_info, 3936 ZPOOL_CONFIG_MMP_HOSTID, hostid); 3937 } 3938 } 3939 3940 fnvlist_add_uint64(spa->spa_load_info, 3941 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE); 3942 fnvlist_add_uint64(spa->spa_load_info, 3943 ZPOOL_CONFIG_MMP_TXG, 0); 3944 3945 error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO); 3946 } 3947 3948 if (mmp_label) 3949 nvlist_free(mmp_label); 3950 3951 return (error); 3952 } 3953 3954 /* 3955 * Called from zfs_ioc_clear for a pool that was suspended 3956 * after failing mmp write checks. 3957 */ 3958 boolean_t 3959 spa_mmp_remote_host_activity(spa_t *spa) 3960 { 3961 ASSERT(spa_multihost(spa) && spa_suspended(spa)); 3962 3963 nvlist_t *best_label; 3964 uberblock_t best_ub; 3965 3966 /* 3967 * Locate the best uberblock on disk 3968 */ 3969 vdev_uberblock_load(spa->spa_root_vdev, &best_ub, &best_label); 3970 if (best_label) { 3971 /* 3972 * confirm that the best hostid matches our hostid 3973 */ 3974 if (nvlist_exists(best_label, ZPOOL_CONFIG_HOSTID) && 3975 spa_get_hostid(spa) != 3976 fnvlist_lookup_uint64(best_label, ZPOOL_CONFIG_HOSTID)) { 3977 nvlist_free(best_label); 3978 return (B_TRUE); 3979 } 3980 nvlist_free(best_label); 3981 } else { 3982 return (B_TRUE); 3983 } 3984 3985 if (!MMP_VALID(&best_ub) || 3986 !MMP_FAIL_INT_VALID(&best_ub) || 3987 MMP_FAIL_INT(&best_ub) == 0) { 3988 return (B_TRUE); 3989 } 3990 3991 if (best_ub.ub_txg != spa->spa_uberblock.ub_txg || 3992 best_ub.ub_timestamp != spa->spa_uberblock.ub_timestamp) { 3993 zfs_dbgmsg("txg mismatch detected during pool clear " 3994 "txg %llu ub_txg %llu timestamp %llu ub_timestamp %llu", 3995 (u_longlong_t)spa->spa_uberblock.ub_txg, 3996 (u_longlong_t)best_ub.ub_txg, 3997 (u_longlong_t)spa->spa_uberblock.ub_timestamp, 3998 (u_longlong_t)best_ub.ub_timestamp); 3999 return (B_TRUE); 4000 } 4001 4002 /* 4003 * Perform an activity check looking for any remote writer 4004 */ 4005 return (spa_activity_check(spa, &spa->spa_uberblock, spa->spa_config, 4006 B_FALSE) != 0); 4007 } 4008 4009 static int 4010 spa_verify_host(spa_t *spa, nvlist_t *mos_config) 4011 { 4012 uint64_t hostid; 4013 const char *hostname; 4014 uint64_t myhostid = 0; 4015 4016 if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config, 4017 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 4018 hostname = fnvlist_lookup_string(mos_config, 4019 ZPOOL_CONFIG_HOSTNAME); 4020 4021 myhostid = zone_get_hostid(NULL); 4022 4023 if (hostid != 0 && myhostid != 0 && hostid != myhostid) { 4024 cmn_err(CE_WARN, "pool '%s' could not be " 4025 "loaded as it was last accessed by " 4026 "another system (host: %s hostid: 0x%llx). 
" 4027 "See: https://openzfs.github.io/openzfs-docs/msg/" 4028 "ZFS-8000-EY", 4029 spa_name(spa), hostname, (u_longlong_t)hostid); 4030 spa_load_failed(spa, "hostid verification failed: pool " 4031 "last accessed by host: %s (hostid: 0x%llx)", 4032 hostname, (u_longlong_t)hostid); 4033 return (SET_ERROR(EBADF)); 4034 } 4035 } 4036 4037 return (0); 4038 } 4039 4040 static int 4041 spa_ld_parse_config(spa_t *spa, spa_import_type_t type) 4042 { 4043 int error = 0; 4044 nvlist_t *nvtree, *nvl, *config = spa->spa_config; 4045 int parse; 4046 vdev_t *rvd; 4047 uint64_t pool_guid; 4048 const char *comment; 4049 const char *compatibility; 4050 4051 /* 4052 * Versioning wasn't explicitly added to the label until later, so if 4053 * it's not present treat it as the initial version. 4054 */ 4055 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 4056 &spa->spa_ubsync.ub_version) != 0) 4057 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 4058 4059 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 4060 spa_load_failed(spa, "invalid config provided: '%s' missing", 4061 ZPOOL_CONFIG_POOL_GUID); 4062 return (SET_ERROR(EINVAL)); 4063 } 4064 4065 /* 4066 * If we are doing an import, ensure that the pool is not already 4067 * imported by checking if its pool guid already exists in the 4068 * spa namespace. 4069 * 4070 * The only case that we allow an already imported pool to be 4071 * imported again, is when the pool is checkpointed and we want to 4072 * look at its checkpointed state from userland tools like zdb. 4073 */ 4074 #ifdef _KERNEL 4075 if ((spa->spa_load_state == SPA_LOAD_IMPORT || 4076 spa->spa_load_state == SPA_LOAD_TRYIMPORT) && 4077 spa_guid_exists(pool_guid, 0)) { 4078 #else 4079 if ((spa->spa_load_state == SPA_LOAD_IMPORT || 4080 spa->spa_load_state == SPA_LOAD_TRYIMPORT) && 4081 spa_guid_exists(pool_guid, 0) && 4082 !spa_importing_readonly_checkpoint(spa)) { 4083 #endif 4084 spa_load_failed(spa, "a pool with guid %llu is already open", 4085 (u_longlong_t)pool_guid); 4086 return (SET_ERROR(EEXIST)); 4087 } 4088 4089 spa->spa_config_guid = pool_guid; 4090 4091 nvlist_free(spa->spa_load_info); 4092 spa->spa_load_info = fnvlist_alloc(); 4093 4094 ASSERT0P(spa->spa_comment); 4095 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 4096 spa->spa_comment = spa_strdup(comment); 4097 4098 ASSERT0P(spa->spa_compatibility); 4099 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMPATIBILITY, 4100 &compatibility) == 0) 4101 spa->spa_compatibility = spa_strdup(compatibility); 4102 4103 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 4104 &spa->spa_config_txg); 4105 4106 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0) 4107 spa->spa_config_splitting = fnvlist_dup(nvl); 4108 4109 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) { 4110 spa_load_failed(spa, "invalid config provided: '%s' missing", 4111 ZPOOL_CONFIG_VDEV_TREE); 4112 return (SET_ERROR(EINVAL)); 4113 } 4114 4115 /* 4116 * Create "The Godfather" zio to hold all async IOs 4117 */ 4118 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 4119 KM_SLEEP); 4120 for (int i = 0; i < max_ncpus; i++) { 4121 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 4122 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 4123 ZIO_FLAG_GODFATHER); 4124 } 4125 4126 /* 4127 * Parse the configuration into a vdev tree. We explicitly set the 4128 * value that will be returned by spa_version() since parsing the 4129 * configuration requires knowing the version number. 
4130 */ 4131 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4132 parse = (type == SPA_IMPORT_EXISTING ? 4133 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 4134 error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse); 4135 spa_config_exit(spa, SCL_ALL, FTAG); 4136 4137 if (error != 0) { 4138 spa_load_failed(spa, "unable to parse config [error=%d]", 4139 error); 4140 return (error); 4141 } 4142 4143 ASSERT(spa->spa_root_vdev == rvd); 4144 ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); 4145 ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); 4146 4147 if (type != SPA_IMPORT_ASSEMBLE) { 4148 ASSERT(spa_guid(spa) == pool_guid); 4149 } 4150 4151 return (0); 4152 } 4153 4154 /* 4155 * Recursively open all vdevs in the vdev tree. This function is called twice: 4156 * first with the untrusted config, then with the trusted config. 4157 */ 4158 static int 4159 spa_ld_open_vdevs(spa_t *spa) 4160 { 4161 int error = 0; 4162 4163 /* 4164 * spa_missing_tvds_allowed defines how many top-level vdevs can be 4165 * missing/unopenable for the root vdev to be still considered openable. 4166 */ 4167 if (spa->spa_trust_config) { 4168 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds; 4169 } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) { 4170 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile; 4171 } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) { 4172 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan; 4173 } else { 4174 spa->spa_missing_tvds_allowed = 0; 4175 } 4176 4177 spa->spa_missing_tvds_allowed = 4178 MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed); 4179 4180 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4181 error = vdev_open(spa->spa_root_vdev); 4182 spa_config_exit(spa, SCL_ALL, FTAG); 4183 4184 if (spa->spa_missing_tvds != 0) { 4185 spa_load_note(spa, "vdev tree has %lld missing top-level " 4186 "vdevs.", (u_longlong_t)spa->spa_missing_tvds); 4187 if (spa->spa_trust_config && (spa->spa_mode & SPA_MODE_WRITE)) { 4188 /* 4189 * Although theoretically we could allow users to open 4190 * incomplete pools in RW mode, we'd need to add a lot 4191 * of extra logic (e.g. adjust pool space to account 4192 * for missing vdevs). 4193 * This limitation also prevents users from accidentally 4194 * opening the pool in RW mode during data recovery and 4195 * damaging it further. 4196 */ 4197 spa_load_note(spa, "pools with missing top-level " 4198 "vdevs can only be opened in read-only mode."); 4199 error = SET_ERROR(ENXIO); 4200 } else { 4201 spa_load_note(spa, "current settings allow for maximum " 4202 "%lld missing top-level vdevs at this stage.", 4203 (u_longlong_t)spa->spa_missing_tvds_allowed); 4204 } 4205 } 4206 if (error != 0) { 4207 spa_load_failed(spa, "unable to open vdev tree [error=%d]", 4208 error); 4209 } 4210 if (spa->spa_missing_tvds != 0 || error != 0) 4211 vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2); 4212 4213 return (error); 4214 } 4215 4216 /* 4217 * We need to validate the vdev labels against the configuration that 4218 * we have in hand. This function is called twice: first with an untrusted 4219 * config, then with a trusted config. The validation is more strict when the 4220 * config is trusted. 
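 *
 * In outline, a normal load drives these helpers twice (a simplified sketch
 * of the call sequence, not literal code from this file):
 *
 *	spa_ld_parse_config()        build a tree from the caller's config
 *	spa_ld_open_vdevs()          open it
 *	spa_ld_validate_vdevs()      lenient pass against the untrusted config
 *	spa_ld_select_uberblock()    find the best uberblock
 *	spa_ld_trusted_config()      read the config stored in the MOS,
 *	    spa_ld_open_vdevs()      rebuild the tree from it and repeat the
 *	    spa_ld_validate_vdevs()  open/validate pass, this time strictly
 *
 * so a config that is merely plausible is enough to reach the MOS, but the
 * pool is only trusted once its own copy of the config agrees.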
4221 */ 4222 static int 4223 spa_ld_validate_vdevs(spa_t *spa) 4224 { 4225 int error = 0; 4226 vdev_t *rvd = spa->spa_root_vdev; 4227 4228 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4229 error = vdev_validate(rvd); 4230 spa_config_exit(spa, SCL_ALL, FTAG); 4231 4232 if (error != 0) { 4233 spa_load_failed(spa, "vdev_validate failed [error=%d]", error); 4234 return (error); 4235 } 4236 4237 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 4238 spa_load_failed(spa, "cannot open vdev tree after invalidating " 4239 "some vdevs"); 4240 vdev_dbgmsg_print_tree(rvd, 2); 4241 return (SET_ERROR(ENXIO)); 4242 } 4243 4244 return (0); 4245 } 4246 4247 static void 4248 spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub) 4249 { 4250 spa->spa_state = POOL_STATE_ACTIVE; 4251 spa->spa_ubsync = spa->spa_uberblock; 4252 spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 4253 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 4254 spa->spa_first_txg = spa->spa_last_ubsync_txg ? 4255 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 4256 spa->spa_claim_max_txg = spa->spa_first_txg; 4257 spa->spa_prev_software_version = ub->ub_software_version; 4258 } 4259 4260 static int 4261 spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) 4262 { 4263 vdev_t *rvd = spa->spa_root_vdev; 4264 nvlist_t *label; 4265 uberblock_t *ub = &spa->spa_uberblock; 4266 boolean_t activity_check = B_FALSE; 4267 4268 /* 4269 * If we are opening the checkpointed state of the pool by 4270 * rewinding to it, at this point we will have written the 4271 * checkpointed uberblock to the vdev labels, so searching 4272 * the labels will find the right uberblock. However, if 4273 * we are opening the checkpointed state read-only, we have 4274 * not modified the labels. Therefore, we must ignore the 4275 * labels and continue using the spa_uberblock that was set 4276 * by spa_ld_checkpoint_rewind. 4277 * 4278 * Note that it would be fine to ignore the labels when 4279 * rewinding (opening writeable) as well. However, if we 4280 * crash just after writing the labels, we will end up 4281 * searching the labels. Doing so in the common case means 4282 * that this code path gets exercised normally, rather than 4283 * just in the edge case. 4284 */ 4285 if (ub->ub_checkpoint_txg != 0 && 4286 spa_importing_readonly_checkpoint(spa)) { 4287 spa_ld_select_uberblock_done(spa, ub); 4288 return (0); 4289 } 4290 4291 /* 4292 * Find the best uberblock. 4293 */ 4294 vdev_uberblock_load(rvd, ub, &label); 4295 4296 /* 4297 * If we weren't able to find a single valid uberblock, return failure. 4298 */ 4299 if (ub->ub_txg == 0) { 4300 nvlist_free(label); 4301 spa_load_failed(spa, "no valid uberblock found"); 4302 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 4303 } 4304 4305 if (spa->spa_load_max_txg != UINT64_MAX) { 4306 (void) spa_import_progress_set_max_txg(spa_guid(spa), 4307 (u_longlong_t)spa->spa_load_max_txg); 4308 } 4309 spa_load_note(spa, "using uberblock with txg=%llu", 4310 (u_longlong_t)ub->ub_txg); 4311 if (ub->ub_raidz_reflow_info != 0) { 4312 spa_load_note(spa, "uberblock raidz_reflow_info: " 4313 "state=%u offset=%llu", 4314 (int)RRSS_GET_STATE(ub), 4315 (u_longlong_t)RRSS_GET_OFFSET(ub)); 4316 } 4317 4318 4319 /* 4320 * For pools which have the multihost property on determine if the 4321 * pool is truly inactive and can be safely imported. Prevent 4322 * hosts which don't have a hostid set from importing the pool. 
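 *
 * As a rough summary of what follows (spa_activity_check_required() above
 * holds the authoritative logic):
 *
 *	import flags include ZFS_IMPORT_SKIP_MMP   - no activity check (zdb)
 *	uberblock carries no MMP data              - no activity check
 *	tryimport already checked this uberblock   - no activity check
 *	label hostid matches this host             - no activity check
 *	pool state says it was cleanly exported    - no activity check
 *	otherwise                                  - watch the uberblocks for
 *	                                             spa_activity_check_duration()
 *	                                             before allowing the import
 *
 * and, independently, a multihost-enabled pool is never imported by a host
 * whose hostid is zero.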
4323 */ 4324 activity_check = spa_activity_check_required(spa, ub, label, 4325 spa->spa_config); 4326 if (activity_check) { 4327 if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay && 4328 spa_get_hostid(spa) == 0) { 4329 nvlist_free(label); 4330 fnvlist_add_uint64(spa->spa_load_info, 4331 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); 4332 return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); 4333 } 4334 4335 int error = 4336 spa_activity_check(spa, ub, spa->spa_config, B_TRUE); 4337 if (error) { 4338 nvlist_free(label); 4339 return (error); 4340 } 4341 4342 fnvlist_add_uint64(spa->spa_load_info, 4343 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE); 4344 fnvlist_add_uint64(spa->spa_load_info, 4345 ZPOOL_CONFIG_MMP_TXG, ub->ub_txg); 4346 fnvlist_add_uint16(spa->spa_load_info, 4347 ZPOOL_CONFIG_MMP_SEQ, 4348 (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)); 4349 } 4350 4351 /* 4352 * If the pool has an unsupported version we can't open it. 4353 */ 4354 if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 4355 nvlist_free(label); 4356 spa_load_failed(spa, "version %llu is not supported", 4357 (u_longlong_t)ub->ub_version); 4358 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 4359 } 4360 4361 if (ub->ub_version >= SPA_VERSION_FEATURES) { 4362 nvlist_t *features; 4363 4364 /* 4365 * If we weren't able to find what's necessary for reading the 4366 * MOS in the label, return failure. 4367 */ 4368 if (label == NULL) { 4369 spa_load_failed(spa, "label config unavailable"); 4370 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 4371 ENXIO)); 4372 } 4373 4374 if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ, 4375 &features) != 0) { 4376 nvlist_free(label); 4377 spa_load_failed(spa, "invalid label: '%s' missing", 4378 ZPOOL_CONFIG_FEATURES_FOR_READ); 4379 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 4380 ENXIO)); 4381 } 4382 4383 /* 4384 * Update our in-core representation with the definitive values 4385 * from the label. 4386 */ 4387 nvlist_free(spa->spa_label_features); 4388 spa->spa_label_features = fnvlist_dup(features); 4389 } 4390 4391 nvlist_free(label); 4392 4393 /* 4394 * Look through entries in the label nvlist's features_for_read. If 4395 * there is a feature listed there which we don't understand then we 4396 * cannot open a pool. 4397 */ 4398 if (ub->ub_version >= SPA_VERSION_FEATURES) { 4399 nvlist_t *unsup_feat; 4400 4401 unsup_feat = fnvlist_alloc(); 4402 4403 for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, 4404 NULL); nvp != NULL; 4405 nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { 4406 if (!zfeature_is_supported(nvpair_name(nvp))) { 4407 fnvlist_add_string(unsup_feat, 4408 nvpair_name(nvp), ""); 4409 } 4410 } 4411 4412 if (!nvlist_empty(unsup_feat)) { 4413 fnvlist_add_nvlist(spa->spa_load_info, 4414 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 4415 nvlist_free(unsup_feat); 4416 spa_load_failed(spa, "some features are unsupported"); 4417 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 4418 ENOTSUP)); 4419 } 4420 4421 nvlist_free(unsup_feat); 4422 } 4423 4424 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 4425 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4426 spa_try_repair(spa, spa->spa_config); 4427 spa_config_exit(spa, SCL_ALL, FTAG); 4428 nvlist_free(spa->spa_config_splitting); 4429 spa->spa_config_splitting = NULL; 4430 } 4431 4432 /* 4433 * Initialize internal SPA structures. 
4434 */ 4435 spa_ld_select_uberblock_done(spa, ub); 4436 4437 return (0); 4438 } 4439 4440 static int 4441 spa_ld_open_rootbp(spa_t *spa) 4442 { 4443 int error = 0; 4444 vdev_t *rvd = spa->spa_root_vdev; 4445 4446 error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 4447 if (error != 0) { 4448 spa_load_failed(spa, "unable to open rootbp in dsl_pool_init " 4449 "[error=%d]", error); 4450 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4451 } 4452 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 4453 4454 return (0); 4455 } 4456 4457 static int 4458 spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, 4459 boolean_t reloading) 4460 { 4461 vdev_t *mrvd, *rvd = spa->spa_root_vdev; 4462 nvlist_t *nv, *mos_config, *policy; 4463 int error = 0, copy_error; 4464 uint64_t healthy_tvds, healthy_tvds_mos; 4465 uint64_t mos_config_txg; 4466 4467 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE) 4468 != 0) 4469 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4470 4471 /* 4472 * If we're assembling a pool from a split, the config provided is 4473 * already trusted so there is nothing to do. 4474 */ 4475 if (type == SPA_IMPORT_ASSEMBLE) 4476 return (0); 4477 4478 healthy_tvds = spa_healthy_core_tvds(spa); 4479 4480 if (load_nvlist(spa, spa->spa_config_object, &mos_config) 4481 != 0) { 4482 spa_load_failed(spa, "unable to retrieve MOS config"); 4483 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4484 } 4485 4486 /* 4487 * If we are doing an open, pool owner wasn't verified yet, thus do 4488 * the verification here. 4489 */ 4490 if (spa->spa_load_state == SPA_LOAD_OPEN) { 4491 error = spa_verify_host(spa, mos_config); 4492 if (error != 0) { 4493 nvlist_free(mos_config); 4494 return (error); 4495 } 4496 } 4497 4498 nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE); 4499 4500 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4501 4502 /* 4503 * Build a new vdev tree from the trusted config 4504 */ 4505 error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD); 4506 if (error != 0) { 4507 nvlist_free(mos_config); 4508 spa_config_exit(spa, SCL_ALL, FTAG); 4509 spa_load_failed(spa, "spa_config_parse failed [error=%d]", 4510 error); 4511 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4512 } 4513 4514 /* 4515 * Vdev paths in the MOS may be obsolete. If the untrusted config was 4516 * obtained by scanning /dev/dsk, then it will have the right vdev 4517 * paths. We update the trusted MOS config with this information. 4518 * We first try to copy the paths with vdev_copy_path_strict, which 4519 * succeeds only when both configs have exactly the same vdev tree. 4520 * If that fails, we fall back to a more flexible method that has a 4521 * best effort policy. 
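 *
 * For example (device names invented for illustration): the MOS copy of a
 * mirror may still record
 *
 *	mirror-0
 *	    disk  guid 0xAAAA...  path /dev/sdc1
 *	    disk  guid 0xBBBB...  path /dev/sdd1
 *
 * while the scanned config found the same guids at
 * /dev/disk/by-id/ata-DISK0-part1 and /dev/disk/by-id/ata-DISK1-part1.
 * The strict copy transfers the scanned paths only when the two trees line
 * up one-to-one; the relaxed fallback transfers whatever it can match
 * (by vdev guid, roughly speaking) and leaves the remaining MOS paths alone.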
4522 */ 4523 copy_error = vdev_copy_path_strict(rvd, mrvd); 4524 if (copy_error != 0 || spa_load_print_vdev_tree) { 4525 spa_load_note(spa, "provided vdev tree:"); 4526 vdev_dbgmsg_print_tree(rvd, 2); 4527 spa_load_note(spa, "MOS vdev tree:"); 4528 vdev_dbgmsg_print_tree(mrvd, 2); 4529 } 4530 if (copy_error != 0) { 4531 spa_load_note(spa, "vdev_copy_path_strict failed, falling " 4532 "back to vdev_copy_path_relaxed"); 4533 vdev_copy_path_relaxed(rvd, mrvd); 4534 } 4535 4536 vdev_close(rvd); 4537 vdev_free(rvd); 4538 spa->spa_root_vdev = mrvd; 4539 rvd = mrvd; 4540 spa_config_exit(spa, SCL_ALL, FTAG); 4541 4542 /* 4543 * If 'zpool import' used a cached config, then the on-disk hostid and 4544 * hostname may be different to the cached config in ways that should 4545 * prevent import. Userspace can't discover this without a scan, but 4546 * we know, so we add these values to LOAD_INFO so the caller can know 4547 * the difference. 4548 * 4549 * Note that we have to do this before the config is regenerated, 4550 * because the new config will have the hostid and hostname for this 4551 * host, in readiness for import. 4552 */ 4553 if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTID)) 4554 fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_HOSTID, 4555 fnvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID)); 4556 if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTNAME)) 4557 fnvlist_add_string(spa->spa_load_info, ZPOOL_CONFIG_HOSTNAME, 4558 fnvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME)); 4559 4560 /* 4561 * We will use spa_config if we decide to reload the spa or if spa_load 4562 * fails and we rewind. We must thus regenerate the config using the 4563 * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to 4564 * pass settings on how to load the pool and is not stored in the MOS. 4565 * We copy it over to our new, trusted config. 4566 */ 4567 mos_config_txg = fnvlist_lookup_uint64(mos_config, 4568 ZPOOL_CONFIG_POOL_TXG); 4569 nvlist_free(mos_config); 4570 mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE); 4571 if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY, 4572 &policy) == 0) 4573 fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy); 4574 spa_config_set(spa, mos_config); 4575 spa->spa_config_source = SPA_CONFIG_SRC_MOS; 4576 4577 /* 4578 * Now that we got the config from the MOS, we should be more strict 4579 * in checking blkptrs and can make assumptions about the consistency 4580 * of the vdev tree. spa_trust_config must be set to true before opening 4581 * vdevs in order for them to be writeable. 4582 */ 4583 spa->spa_trust_config = B_TRUE; 4584 4585 /* 4586 * Open and validate the new vdev tree 4587 */ 4588 error = spa_ld_open_vdevs(spa); 4589 if (error != 0) 4590 return (error); 4591 4592 error = spa_ld_validate_vdevs(spa); 4593 if (error != 0) 4594 return (error); 4595 4596 if (copy_error != 0 || spa_load_print_vdev_tree) { 4597 spa_load_note(spa, "final vdev tree:"); 4598 vdev_dbgmsg_print_tree(rvd, 2); 4599 } 4600 4601 if (spa->spa_load_state != SPA_LOAD_TRYIMPORT && 4602 !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) { 4603 /* 4604 * Sanity check to make sure that we are indeed loading the 4605 * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds 4606 * in the config provided and they happened to be the only ones 4607 * to have the latest uberblock, we could involuntarily perform 4608 * an extreme rewind. 
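 *
 * A concrete (made-up) scenario: the MOS says the pool has 10 healthy
 * top-level vdevs, but the config the caller supplied only knew about 7 of
 * them. Assuming SPA_SYNC_MIN_VDEVS is 3 for the sake of the numbers, the
 * 3 vdevs we never examined could be exactly the ones holding the newest
 * uberblock, and
 *
 *	healthy_tvds_mos - healthy_tvds = 10 - 7 >= SPA_SYNC_MIN_VDEVS
 *
 * so we refuse to continue with this config: we either ask the caller to
 * reload using the MOS config (EAGAIN) or, if that was already attempted,
 * fail the load.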
4609 */ 4610 healthy_tvds_mos = spa_healthy_core_tvds(spa); 4611 if (healthy_tvds_mos - healthy_tvds >= 4612 SPA_SYNC_MIN_VDEVS) { 4613 spa_load_note(spa, "config provided misses too many " 4614 "top-level vdevs compared to MOS (%lld vs %lld). ", 4615 (u_longlong_t)healthy_tvds, 4616 (u_longlong_t)healthy_tvds_mos); 4617 spa_load_note(spa, "vdev tree:"); 4618 vdev_dbgmsg_print_tree(rvd, 2); 4619 if (reloading) { 4620 spa_load_failed(spa, "config was already " 4621 "provided from MOS. Aborting."); 4622 return (spa_vdev_err(rvd, 4623 VDEV_AUX_CORRUPT_DATA, EIO)); 4624 } 4625 spa_load_note(spa, "spa must be reloaded using MOS " 4626 "config"); 4627 return (SET_ERROR(EAGAIN)); 4628 } 4629 } 4630 4631 error = spa_check_for_missing_logs(spa); 4632 if (error != 0) 4633 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 4634 4635 if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) { 4636 spa_load_failed(spa, "uberblock guid sum doesn't match MOS " 4637 "guid sum (%llu != %llu)", 4638 (u_longlong_t)spa->spa_uberblock.ub_guid_sum, 4639 (u_longlong_t)rvd->vdev_guid_sum); 4640 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 4641 ENXIO)); 4642 } 4643 4644 return (0); 4645 } 4646 4647 static int 4648 spa_ld_open_indirect_vdev_metadata(spa_t *spa) 4649 { 4650 int error = 0; 4651 vdev_t *rvd = spa->spa_root_vdev; 4652 4653 /* 4654 * Everything that we read before spa_remove_init() must be stored 4655 * on concreted vdevs. Therefore we do this as early as possible. 4656 */ 4657 error = spa_remove_init(spa); 4658 if (error != 0) { 4659 spa_load_failed(spa, "spa_remove_init failed [error=%d]", 4660 error); 4661 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4662 } 4663 4664 /* 4665 * Retrieve information needed to condense indirect vdev mappings. 
4666 */ 4667 error = spa_condense_init(spa); 4668 if (error != 0) { 4669 spa_load_failed(spa, "spa_condense_init failed [error=%d]", 4670 error); 4671 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4672 } 4673 4674 return (0); 4675 } 4676 4677 static int 4678 spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep) 4679 { 4680 int error = 0; 4681 vdev_t *rvd = spa->spa_root_vdev; 4682 4683 if (spa_version(spa) >= SPA_VERSION_FEATURES) { 4684 boolean_t missing_feat_read = B_FALSE; 4685 nvlist_t *unsup_feat, *enabled_feat; 4686 4687 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 4688 &spa->spa_feat_for_read_obj, B_TRUE) != 0) { 4689 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4690 } 4691 4692 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 4693 &spa->spa_feat_for_write_obj, B_TRUE) != 0) { 4694 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4695 } 4696 4697 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 4698 &spa->spa_feat_desc_obj, B_TRUE) != 0) { 4699 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4700 } 4701 4702 enabled_feat = fnvlist_alloc(); 4703 unsup_feat = fnvlist_alloc(); 4704 4705 if (!spa_features_check(spa, B_FALSE, 4706 unsup_feat, enabled_feat)) 4707 missing_feat_read = B_TRUE; 4708 4709 if (spa_writeable(spa) || 4710 spa->spa_load_state == SPA_LOAD_TRYIMPORT) { 4711 if (!spa_features_check(spa, B_TRUE, 4712 unsup_feat, enabled_feat)) { 4713 *missing_feat_writep = B_TRUE; 4714 } 4715 } 4716 4717 fnvlist_add_nvlist(spa->spa_load_info, 4718 ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 4719 4720 if (!nvlist_empty(unsup_feat)) { 4721 fnvlist_add_nvlist(spa->spa_load_info, 4722 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 4723 } 4724 4725 fnvlist_free(enabled_feat); 4726 fnvlist_free(unsup_feat); 4727 4728 if (!missing_feat_read) { 4729 fnvlist_add_boolean(spa->spa_load_info, 4730 ZPOOL_CONFIG_CAN_RDONLY); 4731 } 4732 4733 /* 4734 * If the state is SPA_LOAD_TRYIMPORT, our objective is 4735 * twofold: to determine whether the pool is available for 4736 * import in read-write mode and (if it is not) whether the 4737 * pool is available for import in read-only mode. If the pool 4738 * is available for import in read-write mode, it is displayed 4739 * as available in userland; if it is not available for import 4740 * in read-only mode, it is displayed as unavailable in 4741 * userland. If the pool is available for import in read-only 4742 * mode but not read-write mode, it is displayed as unavailable 4743 * in userland with a special note that the pool is actually 4744 * available for open in read-only mode. 4745 * 4746 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 4747 * missing a feature for write, we must first determine whether 4748 * the pool can be opened read-only before returning to 4749 * userland in order to know whether to display the 4750 * abovementioned note. 4751 */ 4752 if (missing_feat_read || (*missing_feat_writep && 4753 spa_writeable(spa))) { 4754 spa_load_failed(spa, "pool uses unsupported features"); 4755 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 4756 ENOTSUP)); 4757 } 4758 4759 /* 4760 * Load refcounts for ZFS features from disk into an in-memory 4761 * cache during SPA initialization. 
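 *
 * The cache is simply the spa_feat_refcount_cache[] array indexed by
 * spa_feature_t, filled in by the loop below. A hypothetical direct reader
 * of the cache would look something like
 *
 *	uint64_t rc = spa->spa_feat_refcount_cache[SPA_FEATURE_LZ4_COMPRESS];
 *	boolean_t active = (rc != SPA_FEATURE_DISABLED && rc > 0);
 *
 * although in practice consumers go through helpers such as
 * spa_feature_is_active() rather than touching the array directly.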
4762 */ 4763 for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { 4764 uint64_t refcount; 4765 4766 error = feature_get_refcount_from_disk(spa, 4767 &spa_feature_table[i], &refcount); 4768 if (error == 0) { 4769 spa->spa_feat_refcount_cache[i] = refcount; 4770 } else if (error == ENOTSUP) { 4771 spa->spa_feat_refcount_cache[i] = 4772 SPA_FEATURE_DISABLED; 4773 } else { 4774 spa_load_failed(spa, "error getting refcount " 4775 "for feature %s [error=%d]", 4776 spa_feature_table[i].fi_guid, error); 4777 return (spa_vdev_err(rvd, 4778 VDEV_AUX_CORRUPT_DATA, EIO)); 4779 } 4780 } 4781 } 4782 4783 if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { 4784 if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, 4785 &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0) 4786 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4787 } 4788 4789 /* 4790 * Encryption was added before bookmark_v2, even though bookmark_v2 4791 * is now a dependency. If this pool has encryption enabled without 4792 * bookmark_v2, trigger an errata message. 4793 */ 4794 if (spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) && 4795 !spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) { 4796 spa->spa_errata = ZPOOL_ERRATA_ZOL_8308_ENCRYPTION; 4797 } 4798 4799 return (0); 4800 } 4801 4802 static int 4803 spa_ld_load_special_directories(spa_t *spa) 4804 { 4805 int error = 0; 4806 vdev_t *rvd = spa->spa_root_vdev; 4807 4808 spa->spa_is_initializing = B_TRUE; 4809 error = dsl_pool_open(spa->spa_dsl_pool); 4810 spa->spa_is_initializing = B_FALSE; 4811 if (error != 0) { 4812 spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error); 4813 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4814 } 4815 4816 return (0); 4817 } 4818 4819 static int 4820 spa_ld_get_props(spa_t *spa) 4821 { 4822 int error = 0; 4823 uint64_t obj; 4824 vdev_t *rvd = spa->spa_root_vdev; 4825 4826 /* Grab the checksum salt from the MOS. */ 4827 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 4828 DMU_POOL_CHECKSUM_SALT, 1, 4829 sizeof (spa->spa_cksum_salt.zcs_bytes), 4830 spa->spa_cksum_salt.zcs_bytes); 4831 if (error == ENOENT) { 4832 /* Generate a new salt for subsequent use */ 4833 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 4834 sizeof (spa->spa_cksum_salt.zcs_bytes)); 4835 } else if (error != 0) { 4836 spa_load_failed(spa, "unable to retrieve checksum salt from " 4837 "MOS [error=%d]", error); 4838 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4839 } 4840 4841 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0) 4842 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4843 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 4844 if (error != 0) { 4845 spa_load_failed(spa, "error opening deferred-frees bpobj " 4846 "[error=%d]", error); 4847 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4848 } 4849 4850 /* 4851 * Load the bit that tells us to use the new accounting function 4852 * (raid-z deflation). If we have an older pool, this will not 4853 * be present. 
4854 */ 4855 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE); 4856 if (error != 0 && error != ENOENT) 4857 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4858 4859 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 4860 &spa->spa_creation_version, B_FALSE); 4861 if (error != 0 && error != ENOENT) 4862 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4863 4864 /* Load time log */ 4865 spa_load_txg_log_time(spa); 4866 4867 /* 4868 * Load the persistent error log. If we have an older pool, this will 4869 * not be present. 4870 */ 4871 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last, 4872 B_FALSE); 4873 if (error != 0 && error != ENOENT) 4874 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4875 4876 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 4877 &spa->spa_errlog_scrub, B_FALSE); 4878 if (error != 0 && error != ENOENT) 4879 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4880 4881 /* Load the last scrubbed txg. */ 4882 error = spa_dir_prop(spa, DMU_POOL_LAST_SCRUBBED_TXG, 4883 &spa->spa_scrubbed_last_txg, B_FALSE); 4884 if (error != 0 && error != ENOENT) 4885 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4886 4887 /* 4888 * Load the livelist deletion field. If a livelist is queued for 4889 * deletion, indicate that in the spa 4890 */ 4891 error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES, 4892 &spa->spa_livelists_to_delete, B_FALSE); 4893 if (error != 0 && error != ENOENT) 4894 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4895 4896 /* 4897 * Load the history object. If we have an older pool, this 4898 * will not be present. 4899 */ 4900 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE); 4901 if (error != 0 && error != ENOENT) 4902 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4903 4904 /* 4905 * Load the per-vdev ZAP map. If we have an older pool, this will not 4906 * be present; in this case, defer its creation to a later time to 4907 * avoid dirtying the MOS this early / out of sync context. See 4908 * spa_sync_config_object. 4909 */ 4910 4911 /* The sentinel is only available in the MOS config. */ 4912 nvlist_t *mos_config; 4913 if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { 4914 spa_load_failed(spa, "unable to retrieve MOS config"); 4915 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4916 } 4917 4918 error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, 4919 &spa->spa_all_vdev_zaps, B_FALSE); 4920 4921 if (error == ENOENT) { 4922 VERIFY(!nvlist_exists(mos_config, 4923 ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 4924 spa->spa_avz_action = AVZ_ACTION_INITIALIZE; 4925 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 4926 } else if (error != 0) { 4927 nvlist_free(mos_config); 4928 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4929 } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { 4930 /* 4931 * An older version of ZFS overwrote the sentinel value, so 4932 * we have orphaned per-vdev ZAPs in the MOS. Defer their 4933 * destruction to later; see spa_sync_config_object. 4934 */ 4935 spa->spa_avz_action = AVZ_ACTION_DESTROY; 4936 /* 4937 * We're assuming that no vdevs have had their ZAPs created 4938 * before this. Better be sure of it. 
4939 */ 4940 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 4941 } 4942 nvlist_free(mos_config); 4943 4944 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 4945 4946 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object, 4947 B_FALSE); 4948 if (error && error != ENOENT) 4949 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4950 4951 if (error == 0) { 4952 uint64_t autoreplace = 0; 4953 4954 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 4955 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 4956 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 4957 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 4958 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 4959 spa_prop_find(spa, ZPOOL_PROP_DEDUP_TABLE_QUOTA, 4960 &spa->spa_dedup_table_quota); 4961 spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost); 4962 spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim); 4963 spa->spa_autoreplace = (autoreplace != 0); 4964 } 4965 4966 /* 4967 * If we are importing a pool with missing top-level vdevs, 4968 * we enforce that the pool doesn't panic or get suspended on 4969 * error since the likelihood of missing data is extremely high. 4970 */ 4971 if (spa->spa_missing_tvds > 0 && 4972 spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE && 4973 spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 4974 spa_load_note(spa, "forcing failmode to 'continue' " 4975 "as some top level vdevs are missing"); 4976 spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE; 4977 } 4978 4979 return (0); 4980 } 4981 4982 static int 4983 spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type) 4984 { 4985 int error = 0; 4986 vdev_t *rvd = spa->spa_root_vdev; 4987 4988 /* 4989 * If we're assembling the pool from the split-off vdevs of 4990 * an existing pool, we don't want to attach the spares & cache 4991 * devices. 4992 */ 4993 4994 /* 4995 * Load any hot spares for this pool. 4996 */ 4997 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object, 4998 B_FALSE); 4999 if (error != 0 && error != ENOENT) 5000 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 5001 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 5002 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 5003 if (load_nvlist(spa, spa->spa_spares.sav_object, 5004 &spa->spa_spares.sav_config) != 0) { 5005 spa_load_failed(spa, "error loading spares nvlist"); 5006 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 5007 } 5008 5009 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5010 spa_load_spares(spa); 5011 spa_config_exit(spa, SCL_ALL, FTAG); 5012 } else if (error == 0) { 5013 spa->spa_spares.sav_sync = B_TRUE; 5014 } 5015 5016 /* 5017 * Load any level 2 ARC devices for this pool. 
5018 */ 5019 error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 5020 &spa->spa_l2cache.sav_object, B_FALSE); 5021 if (error != 0 && error != ENOENT) 5022 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 5023 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 5024 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 5025 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 5026 &spa->spa_l2cache.sav_config) != 0) { 5027 spa_load_failed(spa, "error loading l2cache nvlist"); 5028 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 5029 } 5030 5031 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5032 spa_load_l2cache(spa); 5033 spa_config_exit(spa, SCL_ALL, FTAG); 5034 } else if (error == 0) { 5035 spa->spa_l2cache.sav_sync = B_TRUE; 5036 } 5037 5038 return (0); 5039 } 5040 5041 static int 5042 spa_ld_load_vdev_metadata(spa_t *spa) 5043 { 5044 int error = 0; 5045 vdev_t *rvd = spa->spa_root_vdev; 5046 5047 /* 5048 * If the 'multihost' property is set, then never allow a pool to 5049 * be imported when the system hostid is zero. The exception to 5050 * this rule is zdb which is always allowed to access pools. 5051 */ 5052 if (spa_multihost(spa) && spa_get_hostid(spa) == 0 && 5053 (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) { 5054 fnvlist_add_uint64(spa->spa_load_info, 5055 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); 5056 return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); 5057 } 5058 5059 /* 5060 * If the 'autoreplace' property is set, then post a resource notifying 5061 * the ZFS DE that it should not issue any faults for unopenable 5062 * devices. We also iterate over the vdevs, and post a sysevent for any 5063 * unopenable vdevs so that the normal autoreplace handler can take 5064 * over. 5065 */ 5066 if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 5067 spa_check_removed(spa->spa_root_vdev); 5068 /* 5069 * For the import case, this is done in spa_import(), because 5070 * at this point we're using the spare definitions from 5071 * the MOS config, not necessarily from the userland config. 5072 */ 5073 if (spa->spa_load_state != SPA_LOAD_IMPORT) { 5074 spa_aux_check_removed(&spa->spa_spares); 5075 spa_aux_check_removed(&spa->spa_l2cache); 5076 } 5077 } 5078 5079 /* 5080 * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc. 5081 */ 5082 error = vdev_load(rvd); 5083 if (error != 0) { 5084 spa_load_failed(spa, "vdev_load failed [error=%d]", error); 5085 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 5086 } 5087 5088 error = spa_ld_log_spacemaps(spa); 5089 if (error != 0) { 5090 spa_load_failed(spa, "spa_ld_log_spacemaps failed [error=%d]", 5091 error); 5092 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 5093 } 5094 5095 /* 5096 * Propagate the leaf DTLs we just loaded all the way up the vdev tree. 
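 *
 * (Propagation here means, roughly, that each interior vdev recomputes its
 * DTL from its children. For a 2-way mirror, for instance, a txg range is
 * considered missing at the mirror level only if it is missing on both
 * children, so intact redundancy at the leaves keeps the top-level DTL
 * clean.)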
5097 */ 5098 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5099 vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE); 5100 spa_config_exit(spa, SCL_ALL, FTAG); 5101 5102 return (0); 5103 } 5104 5105 static int 5106 spa_ld_load_dedup_tables(spa_t *spa) 5107 { 5108 int error = 0; 5109 vdev_t *rvd = spa->spa_root_vdev; 5110 5111 error = ddt_load(spa); 5112 if (error != 0) { 5113 spa_load_failed(spa, "ddt_load failed [error=%d]", error); 5114 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 5115 } 5116 5117 return (0); 5118 } 5119 5120 static int 5121 spa_ld_load_brt(spa_t *spa) 5122 { 5123 int error = 0; 5124 vdev_t *rvd = spa->spa_root_vdev; 5125 5126 error = brt_load(spa); 5127 if (error != 0) { 5128 spa_load_failed(spa, "brt_load failed [error=%d]", error); 5129 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 5130 } 5131 5132 return (0); 5133 } 5134 5135 static int 5136 spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, const char **ereport) 5137 { 5138 vdev_t *rvd = spa->spa_root_vdev; 5139 5140 if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) { 5141 boolean_t missing = spa_check_logs(spa); 5142 if (missing) { 5143 if (spa->spa_missing_tvds != 0) { 5144 spa_load_note(spa, "spa_check_logs failed " 5145 "so dropping the logs"); 5146 } else { 5147 *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 5148 spa_load_failed(spa, "spa_check_logs failed"); 5149 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, 5150 ENXIO)); 5151 } 5152 } 5153 } 5154 5155 return (0); 5156 } 5157 5158 static int 5159 spa_ld_verify_pool_data(spa_t *spa) 5160 { 5161 int error = 0; 5162 vdev_t *rvd = spa->spa_root_vdev; 5163 5164 /* 5165 * We've successfully opened the pool, verify that we're ready 5166 * to start pushing transactions. 5167 */ 5168 if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 5169 error = spa_load_verify(spa); 5170 if (error != 0) { 5171 spa_load_failed(spa, "spa_load_verify failed " 5172 "[error=%d]", error); 5173 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 5174 error)); 5175 } 5176 } 5177 5178 return (0); 5179 } 5180 5181 static void 5182 spa_ld_claim_log_blocks(spa_t *spa) 5183 { 5184 dmu_tx_t *tx; 5185 dsl_pool_t *dp = spa_get_dsl(spa); 5186 5187 /* 5188 * Claim log blocks that haven't been committed yet. 5189 * This must all happen in a single txg. 5190 * Note: spa_claim_max_txg is updated by spa_claim_notify(), 5191 * invoked from zil_claim_log_block()'s i/o done callback. 5192 * Price of rollback is that we abandon the log. 5193 */ 5194 spa->spa_claiming = B_TRUE; 5195 5196 tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); 5197 (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 5198 zil_claim, tx, DS_FIND_CHILDREN); 5199 dmu_tx_commit(tx); 5200 5201 spa->spa_claiming = B_FALSE; 5202 5203 spa_set_log_state(spa, SPA_LOG_GOOD); 5204 } 5205 5206 static void 5207 spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg, 5208 boolean_t update_config_cache) 5209 { 5210 vdev_t *rvd = spa->spa_root_vdev; 5211 int need_update = B_FALSE; 5212 5213 /* 5214 * If the config cache is stale, or we have uninitialized 5215 * metaslabs (see spa_vdev_add()), then update the config. 5216 * 5217 * If this is a verbatim import, trust the current 5218 * in-core spa_config and update the disk labels. 
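 *
 * Informally, the test below reduces to (a restatement, not literal code):
 *
 *	need_update = caller asked for it (update_config_cache)   OR
 *	              cached config txg != txg we actually loaded  OR
 *	              this is an import, recovery or verbatim import OR
 *	              some top-level vdev has no metaslab array yet
 *
 * and a positive result only requests SPA_ASYNC_CONFIG_UPDATE; the actual
 * rewrite of the cachefile and labels happens later, asynchronously.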
5219 */ 5220 if (update_config_cache || config_cache_txg != spa->spa_config_txg || 5221 spa->spa_load_state == SPA_LOAD_IMPORT || 5222 spa->spa_load_state == SPA_LOAD_RECOVER || 5223 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 5224 need_update = B_TRUE; 5225 5226 for (int c = 0; c < rvd->vdev_children; c++) 5227 if (rvd->vdev_child[c]->vdev_ms_array == 0) 5228 need_update = B_TRUE; 5229 5230 /* 5231 * Update the config cache asynchronously in case we're the 5232 * root pool, in which case the config cache isn't writable yet. 5233 */ 5234 if (need_update) 5235 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 5236 } 5237 5238 static void 5239 spa_ld_prepare_for_reload(spa_t *spa) 5240 { 5241 spa_mode_t mode = spa->spa_mode; 5242 int async_suspended = spa->spa_async_suspended; 5243 5244 spa_unload(spa); 5245 spa_deactivate(spa); 5246 spa_activate(spa, mode); 5247 5248 /* 5249 * We save the value of spa_async_suspended as it gets reset to 0 by 5250 * spa_unload(). We want to restore it back to the original value before 5251 * returning as we might be calling spa_async_resume() later. 5252 */ 5253 spa->spa_async_suspended = async_suspended; 5254 } 5255 5256 static int 5257 spa_ld_read_checkpoint_txg(spa_t *spa) 5258 { 5259 uberblock_t checkpoint; 5260 int error = 0; 5261 5262 ASSERT0(spa->spa_checkpoint_txg); 5263 ASSERT(MUTEX_HELD(&spa_namespace_lock) || 5264 spa->spa_load_thread == curthread); 5265 5266 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 5267 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), 5268 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); 5269 5270 if (error == ENOENT) 5271 return (0); 5272 5273 if (error != 0) 5274 return (error); 5275 5276 ASSERT3U(checkpoint.ub_txg, !=, 0); 5277 ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0); 5278 ASSERT3U(checkpoint.ub_timestamp, !=, 0); 5279 spa->spa_checkpoint_txg = checkpoint.ub_txg; 5280 spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; 5281 5282 return (0); 5283 } 5284 5285 static int 5286 spa_ld_mos_init(spa_t *spa, spa_import_type_t type) 5287 { 5288 int error = 0; 5289 5290 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5291 ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); 5292 5293 /* 5294 * Never trust the config that is provided unless we are assembling 5295 * a pool following a split. 5296 * This means don't trust blkptrs and the vdev tree in general. This 5297 * also effectively puts the spa in read-only mode since 5298 * spa_writeable() checks for spa_trust_config to be true. 5299 * We will later load a trusted config from the MOS. 5300 */ 5301 if (type != SPA_IMPORT_ASSEMBLE) 5302 spa->spa_trust_config = B_FALSE; 5303 5304 /* 5305 * Parse the config provided to create a vdev tree. 5306 */ 5307 error = spa_ld_parse_config(spa, type); 5308 if (error != 0) 5309 return (error); 5310 5311 spa_import_progress_add(spa); 5312 5313 /* 5314 * Now that we have the vdev tree, try to open each vdev. This involves 5315 * opening the underlying physical device, retrieving its geometry and 5316 * probing the vdev with a dummy I/O. The state of each vdev will be set 5317 * based on the success of those operations. After this we'll be ready 5318 * to read from the vdevs. 5319 */ 5320 error = spa_ld_open_vdevs(spa); 5321 if (error != 0) 5322 return (error); 5323 5324 /* 5325 * Read the label of each vdev and make sure that the GUIDs stored 5326 * there match the GUIDs in the config provided. 
5327 * If we're assembling a new pool that's been split off from an 5328 * existing pool, the labels haven't yet been updated so we skip 5329 * validation for now. 5330 */ 5331 if (type != SPA_IMPORT_ASSEMBLE) { 5332 error = spa_ld_validate_vdevs(spa); 5333 if (error != 0) 5334 return (error); 5335 } 5336 5337 /* 5338 * Read all vdev labels to find the best uberblock (i.e. latest, 5339 * unless spa_load_max_txg is set) and store it in spa_uberblock. We 5340 * get the list of features required to read blkptrs in the MOS from 5341 * the vdev label with the best uberblock and verify that our version 5342 * of zfs supports them all. 5343 */ 5344 error = spa_ld_select_uberblock(spa, type); 5345 if (error != 0) 5346 return (error); 5347 5348 /* 5349 * Pass that uberblock to the dsl_pool layer which will open the root 5350 * blkptr. This blkptr points to the latest version of the MOS and will 5351 * allow us to read its contents. 5352 */ 5353 error = spa_ld_open_rootbp(spa); 5354 if (error != 0) 5355 return (error); 5356 5357 return (0); 5358 } 5359 5360 static int 5361 spa_ld_checkpoint_rewind(spa_t *spa) 5362 { 5363 uberblock_t checkpoint; 5364 int error = 0; 5365 5366 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5367 ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 5368 5369 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 5370 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), 5371 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); 5372 5373 if (error != 0) { 5374 spa_load_failed(spa, "unable to retrieve checkpointed " 5375 "uberblock from the MOS config [error=%d]", error); 5376 5377 if (error == ENOENT) 5378 error = ZFS_ERR_NO_CHECKPOINT; 5379 5380 return (error); 5381 } 5382 5383 ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg); 5384 ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg); 5385 5386 /* 5387 * We need to update the txg and timestamp of the checkpointed 5388 * uberblock to be higher than the latest one. This ensures that 5389 * the checkpointed uberblock is selected if we were to close and 5390 * reopen the pool right after we've written it in the vdev labels. 5391 * (also see block comment in vdev_uberblock_compare) 5392 */ 5393 checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1; 5394 checkpoint.ub_timestamp = gethrestime_sec(); 5395 5396 /* 5397 * Set current uberblock to be the checkpointed uberblock. 5398 */ 5399 spa->spa_uberblock = checkpoint; 5400 5401 /* 5402 * If we are doing a normal rewind, then the pool is open for 5403 * writing and we sync the "updated" checkpointed uberblock to 5404 * disk. Once this is done, we've basically rewound the whole 5405 * pool and there is no way back. 5406 * 5407 * There are cases when we don't want to attempt and sync the 5408 * checkpointed uberblock to disk because we are opening a 5409 * pool as read-only. Specifically, verifying the checkpointed 5410 * state with zdb, and importing the checkpointed state to get 5411 * a "preview" of its content. 
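 *
 * For illustration, the read-only cases are typically reached by commands
 * along the lines of
 *
 *	zdb -k <pool>
 *	zpool import -o readonly=on --rewind-to-checkpoint <pool>
 *
 * neither of which writes the uberblock back, whereas a plain
 *
 *	zpool import --rewind-to-checkpoint <pool>
 *
 * makes the rewind permanent. (Shown for illustration only; see zdb(8) and
 * zpool-import(8) for the exact option syntax.)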
5412 */ 5413 if (spa_writeable(spa)) { 5414 vdev_t *rvd = spa->spa_root_vdev; 5415 5416 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5417 vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; 5418 int svdcount = 0; 5419 int children = rvd->vdev_children; 5420 int c0 = random_in_range(children); 5421 5422 for (int c = 0; c < children; c++) { 5423 vdev_t *vd = rvd->vdev_child[(c0 + c) % children]; 5424 5425 /* Stop when revisiting the first vdev */ 5426 if (c > 0 && svd[0] == vd) 5427 break; 5428 5429 if (vd->vdev_ms_array == 0 || vd->vdev_islog || 5430 !vdev_is_concrete(vd)) 5431 continue; 5432 5433 svd[svdcount++] = vd; 5434 if (svdcount == SPA_SYNC_MIN_VDEVS) 5435 break; 5436 } 5437 error = vdev_config_sync(svd, svdcount, spa->spa_first_txg); 5438 if (error == 0) 5439 spa->spa_last_synced_guid = rvd->vdev_guid; 5440 spa_config_exit(spa, SCL_ALL, FTAG); 5441 5442 if (error != 0) { 5443 spa_load_failed(spa, "failed to write checkpointed " 5444 "uberblock to the vdev labels [error=%d]", error); 5445 return (error); 5446 } 5447 } 5448 5449 return (0); 5450 } 5451 5452 static int 5453 spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type, 5454 boolean_t *update_config_cache) 5455 { 5456 int error; 5457 5458 /* 5459 * Parse the config for pool, open and validate vdevs, 5460 * select an uberblock, and use that uberblock to open 5461 * the MOS. 5462 */ 5463 error = spa_ld_mos_init(spa, type); 5464 if (error != 0) 5465 return (error); 5466 5467 /* 5468 * Retrieve the trusted config stored in the MOS and use it to create 5469 * a new, exact version of the vdev tree, then reopen all vdevs. 5470 */ 5471 error = spa_ld_trusted_config(spa, type, B_FALSE); 5472 if (error == EAGAIN) { 5473 if (update_config_cache != NULL) 5474 *update_config_cache = B_TRUE; 5475 5476 /* 5477 * Redo the loading process with the trusted config if it is 5478 * too different from the untrusted config. 5479 */ 5480 spa_ld_prepare_for_reload(spa); 5481 spa_load_note(spa, "RELOADING"); 5482 error = spa_ld_mos_init(spa, type); 5483 if (error != 0) 5484 return (error); 5485 5486 error = spa_ld_trusted_config(spa, type, B_TRUE); 5487 if (error != 0) 5488 return (error); 5489 5490 } else if (error != 0) { 5491 return (error); 5492 } 5493 5494 return (0); 5495 } 5496 5497 /* 5498 * Load an existing storage pool, using the config provided. This config 5499 * describes which vdevs are part of the pool and is later validated against 5500 * partial configs present in each vdev's label and an entire copy of the 5501 * config stored in the MOS. 5502 */ 5503 static int 5504 spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) 5505 { 5506 int error = 0; 5507 boolean_t missing_feat_write = B_FALSE; 5508 boolean_t checkpoint_rewind = 5509 (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 5510 boolean_t update_config_cache = B_FALSE; 5511 hrtime_t load_start = gethrtime(); 5512 5513 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5514 ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); 5515 5516 spa_load_note(spa, "LOADING"); 5517 5518 error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache); 5519 if (error != 0) 5520 return (error); 5521 5522 /* 5523 * If we are rewinding to the checkpoint then we need to repeat 5524 * everything we've done so far in this function but this time 5525 * selecting the checkpointed uberblock and using that to open 5526 * the MOS. 5527 */ 5528 if (checkpoint_rewind) { 5529 /* 5530 * If we are rewinding to the checkpoint update config cache 5531 * anyway. 
5532 */ 5533 update_config_cache = B_TRUE; 5534 5535 /* 5536 * Extract the checkpointed uberblock from the current MOS 5537 * and use this as the pool's uberblock from now on. If the 5538 * pool is imported as writeable we also write the checkpoint 5539 * uberblock to the labels, making the rewind permanent. 5540 */ 5541 error = spa_ld_checkpoint_rewind(spa); 5542 if (error != 0) 5543 return (error); 5544 5545 /* 5546 * Redo the loading process again with the 5547 * checkpointed uberblock. 5548 */ 5549 spa_ld_prepare_for_reload(spa); 5550 spa_load_note(spa, "LOADING checkpointed uberblock"); 5551 error = spa_ld_mos_with_trusted_config(spa, type, NULL); 5552 if (error != 0) 5553 return (error); 5554 } 5555 5556 /* 5557 * Drop the namespace lock for the rest of the function. 5558 */ 5559 spa->spa_load_thread = curthread; 5560 mutex_exit(&spa_namespace_lock); 5561 5562 /* 5563 * Retrieve the checkpoint txg if the pool has a checkpoint. 5564 */ 5565 spa_import_progress_set_notes(spa, "Loading checkpoint txg"); 5566 error = spa_ld_read_checkpoint_txg(spa); 5567 if (error != 0) 5568 goto fail; 5569 5570 /* 5571 * Retrieve the mapping of indirect vdevs. Those vdevs were removed 5572 * from the pool and their contents were re-mapped to other vdevs. Note 5573 * that everything that we read before this step must have been 5574 * rewritten on concrete vdevs after the last device removal was 5575 * initiated. Otherwise we could be reading from indirect vdevs before 5576 * we have loaded their mappings. 5577 */ 5578 spa_import_progress_set_notes(spa, "Loading indirect vdev metadata"); 5579 error = spa_ld_open_indirect_vdev_metadata(spa); 5580 if (error != 0) 5581 goto fail; 5582 5583 /* 5584 * Retrieve the full list of active features from the MOS and check if 5585 * they are all supported. 5586 */ 5587 spa_import_progress_set_notes(spa, "Checking feature flags"); 5588 error = spa_ld_check_features(spa, &missing_feat_write); 5589 if (error != 0) 5590 goto fail; 5591 5592 /* 5593 * Load several special directories from the MOS needed by the dsl_pool 5594 * layer. 5595 */ 5596 spa_import_progress_set_notes(spa, "Loading special MOS directories"); 5597 error = spa_ld_load_special_directories(spa); 5598 if (error != 0) 5599 goto fail; 5600 5601 /* 5602 * Retrieve pool properties from the MOS. 5603 */ 5604 spa_import_progress_set_notes(spa, "Loading properties"); 5605 error = spa_ld_get_props(spa); 5606 if (error != 0) 5607 goto fail; 5608 5609 /* 5610 * Retrieve the list of auxiliary devices - cache devices and spares - 5611 * and open them. 5612 */ 5613 spa_import_progress_set_notes(spa, "Loading AUX vdevs"); 5614 error = spa_ld_open_aux_vdevs(spa, type); 5615 if (error != 0) 5616 goto fail; 5617 5618 /* 5619 * Load the metadata for all vdevs. Also check if unopenable devices 5620 * should be autoreplaced. 5621 */ 5622 spa_import_progress_set_notes(spa, "Loading vdev metadata"); 5623 error = spa_ld_load_vdev_metadata(spa); 5624 if (error != 0) 5625 goto fail; 5626 5627 spa_import_progress_set_notes(spa, "Loading dedup tables"); 5628 error = spa_ld_load_dedup_tables(spa); 5629 if (error != 0) 5630 goto fail; 5631 5632 spa_import_progress_set_notes(spa, "Loading BRT"); 5633 error = spa_ld_load_brt(spa); 5634 if (error != 0) 5635 goto fail; 5636 5637 /* 5638 * Verify the logs now to make sure we don't have any unexpected errors 5639 * when we claim log blocks later. 
5640 */ 5641 spa_import_progress_set_notes(spa, "Verifying Log Devices"); 5642 error = spa_ld_verify_logs(spa, type, ereport); 5643 if (error != 0) 5644 goto fail; 5645 5646 if (missing_feat_write) { 5647 ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT); 5648 5649 /* 5650 * At this point, we know that we can open the pool in 5651 * read-only mode but not read-write mode. We now have enough 5652 * information and can return to userland. 5653 */ 5654 error = spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, 5655 ENOTSUP); 5656 goto fail; 5657 } 5658 5659 /* 5660 * Traverse the last txgs to make sure the pool was left off in a safe 5661 * state. When performing an extreme rewind, we verify the whole pool, 5662 * which can take a very long time. 5663 */ 5664 spa_import_progress_set_notes(spa, "Verifying pool data"); 5665 error = spa_ld_verify_pool_data(spa); 5666 if (error != 0) 5667 goto fail; 5668 5669 /* 5670 * Calculate the deflated space for the pool. This must be done before 5671 * we write anything to the pool because we'd need to update the space 5672 * accounting using the deflated sizes. 5673 */ 5674 spa_import_progress_set_notes(spa, "Calculating deflated space"); 5675 spa_update_dspace(spa); 5676 5677 /* 5678 * We have now retrieved all the information we needed to open the 5679 * pool. If we are importing the pool in read-write mode, a few 5680 * additional steps must be performed to finish the import. 5681 */ 5682 spa_import_progress_set_notes(spa, "Starting import"); 5683 if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER || 5684 spa->spa_load_max_txg == UINT64_MAX)) { 5685 uint64_t config_cache_txg = spa->spa_config_txg; 5686 5687 ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); 5688 5689 /* 5690 * Before we do any zio_write's, complete the raidz expansion 5691 * scratch space copying, if necessary. 5692 */ 5693 if (RRSS_GET_STATE(&spa->spa_uberblock) == RRSS_SCRATCH_VALID) 5694 vdev_raidz_reflow_copy_scratch(spa); 5695 5696 /* 5697 * In case of a checkpoint rewind, log the original txg 5698 * of the checkpointed uberblock. 5699 */ 5700 if (checkpoint_rewind) { 5701 spa_history_log_internal(spa, "checkpoint rewind", 5702 NULL, "rewound state to txg=%llu", 5703 (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg); 5704 } 5705 5706 spa_import_progress_set_notes(spa, "Claiming ZIL blocks"); 5707 /* 5708 * Traverse the ZIL and claim all blocks. 5709 */ 5710 spa_ld_claim_log_blocks(spa); 5711 5712 /* 5713 * Kick-off the syncing thread. 5714 */ 5715 spa->spa_sync_on = B_TRUE; 5716 txg_sync_start(spa->spa_dsl_pool); 5717 mmp_thread_start(spa); 5718 5719 /* 5720 * Wait for all claims to sync. We sync up to the highest 5721 * claimed log block birth time so that claimed log blocks 5722 * don't appear to be from the future. spa_claim_max_txg 5723 * will have been set for us by ZIL traversal operations 5724 * performed above. 5725 */ 5726 spa_import_progress_set_notes(spa, "Syncing ZIL claims"); 5727 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 5728 5729 /* 5730 * Check if we need to request an update of the config. On the 5731 * next sync, we would update the config stored in vdev labels 5732 * and the cachefile (by default /etc/zfs/zpool.cache). 5733 */ 5734 spa_import_progress_set_notes(spa, "Updating configs"); 5735 spa_ld_check_for_config_update(spa, config_cache_txg, 5736 update_config_cache); 5737 5738 /* 5739 * Check if a rebuild was in progress and if so resume it. 5740 * Then check all DTLs to see if anything needs resilvering. 
5741 * The resilver will be deferred if a rebuild was started. 5742 */ 5743 spa_import_progress_set_notes(spa, "Starting resilvers"); 5744 if (vdev_rebuild_active(spa->spa_root_vdev)) { 5745 vdev_rebuild_restart(spa); 5746 } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 5747 vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 5748 spa_async_request(spa, SPA_ASYNC_RESILVER); 5749 } 5750 5751 /* 5752 * Log the fact that we booted up (so that we can detect if 5753 * we rebooted in the middle of an operation). 5754 */ 5755 spa_history_log_version(spa, "open", NULL); 5756 5757 spa_import_progress_set_notes(spa, 5758 "Restarting device removals"); 5759 spa_restart_removal(spa); 5760 spa_spawn_aux_threads(spa); 5761 5762 /* 5763 * Delete any inconsistent datasets. 5764 * 5765 * Note: 5766 * Since we may be issuing deletes for clones here, 5767 * we make sure to do so after we've spawned all the 5768 * auxiliary threads above (from which the livelist 5769 * deletion zthr is part of). 5770 */ 5771 spa_import_progress_set_notes(spa, 5772 "Cleaning up inconsistent objsets"); 5773 (void) dmu_objset_find(spa_name(spa), 5774 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 5775 5776 /* 5777 * Clean up any stale temporary dataset userrefs. 5778 */ 5779 spa_import_progress_set_notes(spa, 5780 "Cleaning up temporary userrefs"); 5781 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 5782 5783 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5784 spa_import_progress_set_notes(spa, "Restarting initialize"); 5785 vdev_initialize_restart(spa->spa_root_vdev); 5786 spa_import_progress_set_notes(spa, "Restarting TRIM"); 5787 vdev_trim_restart(spa->spa_root_vdev); 5788 vdev_autotrim_restart(spa); 5789 spa_config_exit(spa, SCL_CONFIG, FTAG); 5790 spa_import_progress_set_notes(spa, "Finished importing"); 5791 } 5792 zio_handle_import_delay(spa, gethrtime() - load_start); 5793 5794 spa_import_progress_remove(spa_guid(spa)); 5795 spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); 5796 5797 spa_load_note(spa, "LOADED"); 5798 fail: 5799 mutex_enter(&spa_namespace_lock); 5800 spa->spa_load_thread = NULL; 5801 cv_broadcast(&spa_namespace_cv); 5802 5803 return (error); 5804 5805 } 5806 5807 static int 5808 spa_load_retry(spa_t *spa, spa_load_state_t state) 5809 { 5810 spa_mode_t mode = spa->spa_mode; 5811 5812 spa_unload(spa); 5813 spa_deactivate(spa); 5814 5815 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; 5816 5817 spa_activate(spa, mode); 5818 spa_async_suspend(spa); 5819 5820 spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu", 5821 (u_longlong_t)spa->spa_load_max_txg); 5822 5823 return (spa_load(spa, state, SPA_IMPORT_EXISTING)); 5824 } 5825 5826 /* 5827 * If spa_load() fails this function will try loading prior txg's. If 5828 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool 5829 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this 5830 * function will not rewind the pool and will return the same error as 5831 * spa_load(). 
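* Rewind candidates are bounded: by default no further back than
* TXG_DEFER_SIZE txgs before the last synced uberblock, or all the way
* back to TXG_INITIAL when ZPOOL_EXTREME_REWIND is requested.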
5832 */ 5833 static int 5834 spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request, 5835 int rewind_flags) 5836 { 5837 nvlist_t *loadinfo = NULL; 5838 nvlist_t *config = NULL; 5839 int load_error, rewind_error; 5840 uint64_t safe_rewind_txg; 5841 uint64_t min_txg; 5842 5843 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 5844 spa->spa_load_max_txg = spa->spa_load_txg; 5845 spa_set_log_state(spa, SPA_LOG_CLEAR); 5846 } else { 5847 spa->spa_load_max_txg = max_request; 5848 if (max_request != UINT64_MAX) 5849 spa->spa_extreme_rewind = B_TRUE; 5850 } 5851 5852 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING); 5853 if (load_error == 0) 5854 return (0); 5855 if (load_error == ZFS_ERR_NO_CHECKPOINT) { 5856 /* 5857 * When attempting checkpoint-rewind on a pool with no 5858 * checkpoint, we should not attempt to load uberblocks 5859 * from previous txgs when spa_load fails. 5860 */ 5861 ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 5862 spa_import_progress_remove(spa_guid(spa)); 5863 return (load_error); 5864 } 5865 5866 if (spa->spa_root_vdev != NULL) 5867 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 5868 5869 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 5870 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 5871 5872 if (rewind_flags & ZPOOL_NEVER_REWIND) { 5873 nvlist_free(config); 5874 spa_import_progress_remove(spa_guid(spa)); 5875 return (load_error); 5876 } 5877 5878 if (state == SPA_LOAD_RECOVER) { 5879 /* Price of rolling back is discarding txgs, including log */ 5880 spa_set_log_state(spa, SPA_LOG_CLEAR); 5881 } else { 5882 /* 5883 * If we aren't rolling back save the load info from our first 5884 * import attempt so that we can restore it after attempting 5885 * to rewind. 5886 */ 5887 loadinfo = spa->spa_load_info; 5888 spa->spa_load_info = fnvlist_alloc(); 5889 } 5890 5891 spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 5892 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 5893 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 5894 TXG_INITIAL : safe_rewind_txg; 5895 5896 /* 5897 * Continue as long as we're finding errors, we're still within 5898 * the acceptable rewind range, and we're still finding uberblocks 5899 */ 5900 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 5901 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 5902 if (spa->spa_load_max_txg < safe_rewind_txg) 5903 spa->spa_extreme_rewind = B_TRUE; 5904 rewind_error = spa_load_retry(spa, state); 5905 } 5906 5907 spa->spa_extreme_rewind = B_FALSE; 5908 spa->spa_load_max_txg = UINT64_MAX; 5909 5910 if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 5911 spa_config_set(spa, config); 5912 else 5913 nvlist_free(config); 5914 5915 if (state == SPA_LOAD_RECOVER) { 5916 ASSERT0P(loadinfo); 5917 spa_import_progress_remove(spa_guid(spa)); 5918 return (rewind_error); 5919 } else { 5920 /* Store the rewind info as part of the initial load info */ 5921 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, 5922 spa->spa_load_info); 5923 5924 /* Restore the initial load info */ 5925 fnvlist_free(spa->spa_load_info); 5926 spa->spa_load_info = loadinfo; 5927 5928 spa_import_progress_remove(spa_guid(spa)); 5929 return (load_error); 5930 } 5931 } 5932 5933 /* 5934 * Pool Open/Import 5935 * 5936 * The import case is identical to an open except that the configuration is sent 5937 * down from userland, instead of grabbed from the configuration cache. 
For the
5938 * case of an open, the pool configuration will exist in the
5939 * POOL_STATE_UNINITIALIZED state.
5940 *
5941 * The stats information (gen/count/ustats) is used to gather vdev statistics at
5942 * the same time as we open the pool, without having to keep around the spa_t in some
5943 * ambiguous state.
5944 */
5945 static int
5946 spa_open_common(const char *pool, spa_t **spapp, const void *tag,
5947 nvlist_t *nvpolicy, nvlist_t **config)
5948 {
5949 spa_t *spa;
5950 spa_load_state_t state = SPA_LOAD_OPEN;
5951 int error;
5952 int locked = B_FALSE;
5953 int firstopen = B_FALSE;
5954
5955 *spapp = NULL;
5956
5957 /*
5958 * As disgusting as this is, we need to support recursive calls to this
5959 * function because dsl_dir_open() is called during spa_load(), and ends
5960 * up calling spa_open() again. The real fix is to figure out how to
5961 * avoid dsl_dir_open() calling this in the first place.
5962 */
5963 if (MUTEX_NOT_HELD(&spa_namespace_lock)) {
5964 mutex_enter(&spa_namespace_lock);
5965 locked = B_TRUE;
5966 }
5967
5968 if ((spa = spa_lookup(pool)) == NULL) {
5969 if (locked)
5970 mutex_exit(&spa_namespace_lock);
5971 return (SET_ERROR(ENOENT));
5972 }
5973
5974 if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
5975 zpool_load_policy_t policy;
5976
5977 firstopen = B_TRUE;
5978
5979 zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config,
5980 &policy);
5981 if (policy.zlp_rewind & ZPOOL_DO_REWIND)
5982 state = SPA_LOAD_RECOVER;
5983
5984 spa_activate(spa, spa_mode_global);
5985
5986 if (state != SPA_LOAD_RECOVER)
5987 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
5988 spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
5989
5990 zfs_dbgmsg("spa_open_common: opening %s", pool);
5991 error = spa_load_best(spa, state, policy.zlp_txg,
5992 policy.zlp_rewind);
5993
5994 if (error == EBADF) {
5995 /*
5996 * If vdev_validate() returns failure (indicated by
5997 * EBADF), it means that one of the vdevs indicates
5998 * that the pool has been exported or destroyed. If
5999 * this is the case, the config cache is out of sync and
6000 * we should remove the pool from the namespace.
6001 */
6002 spa_unload(spa);
6003 spa_deactivate(spa);
6004 spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE);
6005 spa_remove(spa);
6006 if (locked)
6007 mutex_exit(&spa_namespace_lock);
6008 return (SET_ERROR(ENOENT));
6009 }
6010
6011 if (error) {
6012 /*
6013 * We can't open the pool, but we still have useful
6014 * information: the state of each vdev after the
6015 * attempted vdev_open(). Return this to the user.
6016 */
6017 if (config != NULL && spa->spa_config) {
6018 *config = fnvlist_dup(spa->spa_config);
6019 fnvlist_add_nvlist(*config,
6020 ZPOOL_CONFIG_LOAD_INFO,
6021 spa->spa_load_info);
6022 }
6023 spa_unload(spa);
6024 spa_deactivate(spa);
6025 spa->spa_last_open_failed = error;
6026 if (locked)
6027 mutex_exit(&spa_namespace_lock);
6028 *spapp = NULL;
6029 return (error);
6030 }
6031 }
6032
6033 spa_open_ref(spa, tag);
6034
6035 if (config != NULL)
6036 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
6037
6038 /*
6039 * If we've recovered the pool, pass back any information we
6040 * gathered while doing the load.
6041 */ 6042 if (state == SPA_LOAD_RECOVER && config != NULL) { 6043 fnvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 6044 spa->spa_load_info); 6045 } 6046 6047 if (locked) { 6048 spa->spa_last_open_failed = 0; 6049 spa->spa_last_ubsync_txg = 0; 6050 spa->spa_load_txg = 0; 6051 mutex_exit(&spa_namespace_lock); 6052 } 6053 6054 if (firstopen) 6055 zvol_create_minors(spa_name(spa)); 6056 6057 *spapp = spa; 6058 6059 return (0); 6060 } 6061 6062 int 6063 spa_open_rewind(const char *name, spa_t **spapp, const void *tag, 6064 nvlist_t *policy, nvlist_t **config) 6065 { 6066 return (spa_open_common(name, spapp, tag, policy, config)); 6067 } 6068 6069 int 6070 spa_open(const char *name, spa_t **spapp, const void *tag) 6071 { 6072 return (spa_open_common(name, spapp, tag, NULL, NULL)); 6073 } 6074 6075 /* 6076 * Lookup the given spa_t, incrementing the inject count in the process, 6077 * preventing it from being exported or destroyed. 6078 */ 6079 spa_t * 6080 spa_inject_addref(char *name) 6081 { 6082 spa_t *spa; 6083 6084 mutex_enter(&spa_namespace_lock); 6085 if ((spa = spa_lookup(name)) == NULL) { 6086 mutex_exit(&spa_namespace_lock); 6087 return (NULL); 6088 } 6089 spa->spa_inject_ref++; 6090 mutex_exit(&spa_namespace_lock); 6091 6092 return (spa); 6093 } 6094 6095 void 6096 spa_inject_delref(spa_t *spa) 6097 { 6098 mutex_enter(&spa_namespace_lock); 6099 spa->spa_inject_ref--; 6100 mutex_exit(&spa_namespace_lock); 6101 } 6102 6103 /* 6104 * Add spares device information to the nvlist. 6105 */ 6106 static void 6107 spa_add_spares(spa_t *spa, nvlist_t *config) 6108 { 6109 nvlist_t **spares; 6110 uint_t i, nspares; 6111 nvlist_t *nvroot; 6112 uint64_t guid; 6113 vdev_stat_t *vs; 6114 uint_t vsc; 6115 uint64_t pool; 6116 6117 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 6118 6119 if (spa->spa_spares.sav_count == 0) 6120 return; 6121 6122 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 6123 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 6124 ZPOOL_CONFIG_SPARES, &spares, &nspares)); 6125 if (nspares != 0) { 6126 fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 6127 (const nvlist_t * const *)spares, nspares); 6128 VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 6129 &spares, &nspares)); 6130 6131 /* 6132 * Go through and find any spares which have since been 6133 * repurposed as an active spare. If this is the case, update 6134 * their status appropriately. 6135 */ 6136 for (i = 0; i < nspares; i++) { 6137 guid = fnvlist_lookup_uint64(spares[i], 6138 ZPOOL_CONFIG_GUID); 6139 VERIFY0(nvlist_lookup_uint64_array(spares[i], 6140 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)); 6141 if (spa_spare_exists(guid, &pool, NULL) && 6142 pool != 0ULL) { 6143 vs->vs_state = VDEV_STATE_CANT_OPEN; 6144 vs->vs_aux = VDEV_AUX_SPARED; 6145 } else { 6146 vs->vs_state = 6147 spa->spa_spares.sav_vdevs[i]->vdev_state; 6148 } 6149 } 6150 } 6151 } 6152 6153 /* 6154 * Add l2cache device information to the nvlist, including vdev stats. 
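* The stats are refreshed from the in-core vdevs via vdev_get_stats(),
* so callers see current values rather than whatever was last stored
* in the sav_config nvlist.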
6155 */ 6156 static void 6157 spa_add_l2cache(spa_t *spa, nvlist_t *config) 6158 { 6159 nvlist_t **l2cache; 6160 uint_t i, j, nl2cache; 6161 nvlist_t *nvroot; 6162 uint64_t guid; 6163 vdev_t *vd; 6164 vdev_stat_t *vs; 6165 uint_t vsc; 6166 6167 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 6168 6169 if (spa->spa_l2cache.sav_count == 0) 6170 return; 6171 6172 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 6173 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 6174 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); 6175 if (nl2cache != 0) { 6176 fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 6177 (const nvlist_t * const *)l2cache, nl2cache); 6178 VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 6179 &l2cache, &nl2cache)); 6180 6181 /* 6182 * Update level 2 cache device stats. 6183 */ 6184 6185 for (i = 0; i < nl2cache; i++) { 6186 guid = fnvlist_lookup_uint64(l2cache[i], 6187 ZPOOL_CONFIG_GUID); 6188 6189 vd = NULL; 6190 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 6191 if (guid == 6192 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 6193 vd = spa->spa_l2cache.sav_vdevs[j]; 6194 break; 6195 } 6196 } 6197 ASSERT(vd != NULL); 6198 6199 VERIFY0(nvlist_lookup_uint64_array(l2cache[i], 6200 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)); 6201 vdev_get_stats(vd, vs); 6202 vdev_config_generate_stats(vd, l2cache[i]); 6203 6204 } 6205 } 6206 } 6207 6208 static void 6209 spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features) 6210 { 6211 zap_cursor_t zc; 6212 zap_attribute_t *za = zap_attribute_alloc(); 6213 6214 if (spa->spa_feat_for_read_obj != 0) { 6215 for (zap_cursor_init(&zc, spa->spa_meta_objset, 6216 spa->spa_feat_for_read_obj); 6217 zap_cursor_retrieve(&zc, za) == 0; 6218 zap_cursor_advance(&zc)) { 6219 ASSERT(za->za_integer_length == sizeof (uint64_t) && 6220 za->za_num_integers == 1); 6221 VERIFY0(nvlist_add_uint64(features, za->za_name, 6222 za->za_first_integer)); 6223 } 6224 zap_cursor_fini(&zc); 6225 } 6226 6227 if (spa->spa_feat_for_write_obj != 0) { 6228 for (zap_cursor_init(&zc, spa->spa_meta_objset, 6229 spa->spa_feat_for_write_obj); 6230 zap_cursor_retrieve(&zc, za) == 0; 6231 zap_cursor_advance(&zc)) { 6232 ASSERT(za->za_integer_length == sizeof (uint64_t) && 6233 za->za_num_integers == 1); 6234 VERIFY0(nvlist_add_uint64(features, za->za_name, 6235 za->za_first_integer)); 6236 } 6237 zap_cursor_fini(&zc); 6238 } 6239 zap_attribute_free(za); 6240 } 6241 6242 static void 6243 spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features) 6244 { 6245 int i; 6246 6247 for (i = 0; i < SPA_FEATURES; i++) { 6248 zfeature_info_t feature = spa_feature_table[i]; 6249 uint64_t refcount; 6250 6251 if (feature_get_refcount(spa, &feature, &refcount) != 0) 6252 continue; 6253 6254 VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount)); 6255 } 6256 } 6257 6258 /* 6259 * Store a list of pool features and their reference counts in the 6260 * config. 6261 * 6262 * The first time this is called on a spa, allocate a new nvlist, fetch 6263 * the pool features and reference counts from disk, then save the list 6264 * in the spa. In subsequent calls on the same spa use the saved nvlist 6265 * and refresh its values from the cached reference counts. This 6266 * ensures we don't block here on I/O on a suspended pool so 'zpool 6267 * clear' can resume the pool. 
6268 */ 6269 static void 6270 spa_add_feature_stats(spa_t *spa, nvlist_t *config) 6271 { 6272 nvlist_t *features; 6273 6274 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 6275 6276 mutex_enter(&spa->spa_feat_stats_lock); 6277 features = spa->spa_feat_stats; 6278 6279 if (features != NULL) { 6280 spa_feature_stats_from_cache(spa, features); 6281 } else { 6282 VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP)); 6283 spa->spa_feat_stats = features; 6284 spa_feature_stats_from_disk(spa, features); 6285 } 6286 6287 VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 6288 features)); 6289 6290 mutex_exit(&spa->spa_feat_stats_lock); 6291 } 6292 6293 int 6294 spa_get_stats(const char *name, nvlist_t **config, 6295 char *altroot, size_t buflen) 6296 { 6297 int error; 6298 spa_t *spa; 6299 6300 *config = NULL; 6301 error = spa_open_common(name, &spa, FTAG, NULL, config); 6302 6303 if (spa != NULL) { 6304 /* 6305 * This still leaves a window of inconsistency where the spares 6306 * or l2cache devices could change and the config would be 6307 * self-inconsistent. 6308 */ 6309 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6310 6311 if (*config != NULL) { 6312 uint64_t loadtimes[2]; 6313 6314 loadtimes[0] = spa->spa_loaded_ts.tv_sec; 6315 loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 6316 fnvlist_add_uint64_array(*config, 6317 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2); 6318 6319 fnvlist_add_uint64(*config, 6320 ZPOOL_CONFIG_ERRCOUNT, 6321 spa_approx_errlog_size(spa)); 6322 6323 if (spa_suspended(spa)) { 6324 fnvlist_add_uint64(*config, 6325 ZPOOL_CONFIG_SUSPENDED, 6326 spa->spa_failmode); 6327 fnvlist_add_uint64(*config, 6328 ZPOOL_CONFIG_SUSPENDED_REASON, 6329 spa->spa_suspended); 6330 } 6331 6332 spa_add_spares(spa, *config); 6333 spa_add_l2cache(spa, *config); 6334 spa_add_feature_stats(spa, *config); 6335 } 6336 } 6337 6338 /* 6339 * We want to get the alternate root even for faulted pools, so we cheat 6340 * and call spa_lookup() directly. 6341 */ 6342 if (altroot) { 6343 if (spa == NULL) { 6344 mutex_enter(&spa_namespace_lock); 6345 spa = spa_lookup(name); 6346 if (spa) 6347 spa_altroot(spa, altroot, buflen); 6348 else 6349 altroot[0] = '\0'; 6350 spa = NULL; 6351 mutex_exit(&spa_namespace_lock); 6352 } else { 6353 spa_altroot(spa, altroot, buflen); 6354 } 6355 } 6356 6357 if (spa != NULL) { 6358 spa_config_exit(spa, SCL_CONFIG, FTAG); 6359 spa_close(spa, FTAG); 6360 } 6361 6362 return (error); 6363 } 6364 6365 /* 6366 * Validate that the auxiliary device array is well formed. We must have an 6367 * array of nvlists, each which describes a valid leaf vdev. If this is an 6368 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 6369 * specified, as long as they are well-formed. 6370 */ 6371 static int 6372 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 6373 spa_aux_vdev_t *sav, const char *config, uint64_t version, 6374 vdev_labeltype_t label) 6375 { 6376 nvlist_t **dev; 6377 uint_t i, ndev; 6378 vdev_t *vd; 6379 int error; 6380 6381 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 6382 6383 /* 6384 * It's acceptable to have no devs specified. 6385 */ 6386 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 6387 return (0); 6388 6389 if (ndev == 0) 6390 return (SET_ERROR(EINVAL)); 6391 6392 /* 6393 * Make sure the pool is formatted with a version that supports this 6394 * device type. 
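* (SPA_VERSION_SPARES for hot spares, SPA_VERSION_L2CACHE for cache
* devices; see the two calls in spa_validate_aux() below.)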
6395 */ 6396 if (spa_version(spa) < version) 6397 return (SET_ERROR(ENOTSUP)); 6398 6399 /* 6400 * Set the pending device list so we correctly handle device in-use 6401 * checking. 6402 */ 6403 sav->sav_pending = dev; 6404 sav->sav_npending = ndev; 6405 6406 for (i = 0; i < ndev; i++) { 6407 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 6408 mode)) != 0) 6409 goto out; 6410 6411 if (!vd->vdev_ops->vdev_op_leaf) { 6412 vdev_free(vd); 6413 error = SET_ERROR(EINVAL); 6414 goto out; 6415 } 6416 6417 vd->vdev_top = vd; 6418 6419 if ((error = vdev_open(vd)) == 0 && 6420 (error = vdev_label_init(vd, crtxg, label)) == 0) { 6421 fnvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 6422 vd->vdev_guid); 6423 } 6424 6425 vdev_free(vd); 6426 6427 if (error && 6428 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 6429 goto out; 6430 else 6431 error = 0; 6432 } 6433 6434 out: 6435 sav->sav_pending = NULL; 6436 sav->sav_npending = 0; 6437 return (error); 6438 } 6439 6440 static int 6441 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 6442 { 6443 int error; 6444 6445 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 6446 6447 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 6448 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 6449 VDEV_LABEL_SPARE)) != 0) { 6450 return (error); 6451 } 6452 6453 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 6454 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 6455 VDEV_LABEL_L2CACHE)); 6456 } 6457 6458 static void 6459 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 6460 const char *config) 6461 { 6462 int i; 6463 6464 if (sav->sav_config != NULL) { 6465 nvlist_t **olddevs; 6466 uint_t oldndevs; 6467 nvlist_t **newdevs; 6468 6469 /* 6470 * Generate new dev list by concatenating with the 6471 * current dev list. 6472 */ 6473 VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, config, 6474 &olddevs, &oldndevs)); 6475 6476 newdevs = kmem_alloc(sizeof (void *) * 6477 (ndevs + oldndevs), KM_SLEEP); 6478 for (i = 0; i < oldndevs; i++) 6479 newdevs[i] = fnvlist_dup(olddevs[i]); 6480 for (i = 0; i < ndevs; i++) 6481 newdevs[i + oldndevs] = fnvlist_dup(devs[i]); 6482 6483 fnvlist_remove(sav->sav_config, config); 6484 6485 fnvlist_add_nvlist_array(sav->sav_config, config, 6486 (const nvlist_t * const *)newdevs, ndevs + oldndevs); 6487 for (i = 0; i < oldndevs + ndevs; i++) 6488 nvlist_free(newdevs[i]); 6489 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 6490 } else { 6491 /* 6492 * Generate a new dev list. 6493 */ 6494 sav->sav_config = fnvlist_alloc(); 6495 fnvlist_add_nvlist_array(sav->sav_config, config, 6496 (const nvlist_t * const *)devs, ndevs); 6497 } 6498 } 6499 6500 /* 6501 * Stop and drop level 2 ARC devices 6502 */ 6503 void 6504 spa_l2cache_drop(spa_t *spa) 6505 { 6506 vdev_t *vd; 6507 int i; 6508 spa_aux_vdev_t *sav = &spa->spa_l2cache; 6509 6510 for (i = 0; i < sav->sav_count; i++) { 6511 uint64_t pool; 6512 6513 vd = sav->sav_vdevs[i]; 6514 ASSERT(vd != NULL); 6515 6516 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 6517 pool != 0ULL && l2arc_vdev_present(vd)) 6518 l2arc_remove_vdev(vd); 6519 } 6520 } 6521 6522 /* 6523 * Verify encryption parameters for spa creation. If we are encrypting, we must 6524 * have the encryption feature flag enabled. 
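* Beyond that feature check, detailed validation of the supplied
* dsl_crypto_params_t is delegated to dmu_objset_create_crypt_check().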
6525 */ 6526 static int 6527 spa_create_check_encryption_params(dsl_crypto_params_t *dcp, 6528 boolean_t has_encryption) 6529 { 6530 if (dcp->cp_crypt != ZIO_CRYPT_OFF && 6531 dcp->cp_crypt != ZIO_CRYPT_INHERIT && 6532 !has_encryption) 6533 return (SET_ERROR(ENOTSUP)); 6534 6535 return (dmu_objset_create_crypt_check(NULL, dcp, NULL)); 6536 } 6537 6538 /* 6539 * Pool Creation 6540 */ 6541 int 6542 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 6543 nvlist_t *zplprops, dsl_crypto_params_t *dcp) 6544 { 6545 spa_t *spa; 6546 const char *altroot = NULL; 6547 vdev_t *rvd; 6548 dsl_pool_t *dp; 6549 dmu_tx_t *tx; 6550 int error = 0; 6551 uint64_t txg = TXG_INITIAL; 6552 nvlist_t **spares, **l2cache; 6553 uint_t nspares, nl2cache; 6554 uint64_t version, obj, ndraid = 0; 6555 boolean_t has_features; 6556 boolean_t has_encryption; 6557 boolean_t has_allocclass; 6558 spa_feature_t feat; 6559 const char *feat_name; 6560 const char *poolname; 6561 nvlist_t *nvl; 6562 6563 if (props == NULL || 6564 nvlist_lookup_string(props, 6565 zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0) 6566 poolname = (char *)pool; 6567 6568 /* 6569 * If this pool already exists, return failure. 6570 */ 6571 mutex_enter(&spa_namespace_lock); 6572 if (spa_lookup(poolname) != NULL) { 6573 mutex_exit(&spa_namespace_lock); 6574 return (SET_ERROR(EEXIST)); 6575 } 6576 6577 /* 6578 * Allocate a new spa_t structure. 6579 */ 6580 nvl = fnvlist_alloc(); 6581 fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool); 6582 (void) nvlist_lookup_string(props, 6583 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 6584 spa = spa_add(poolname, nvl, altroot); 6585 fnvlist_free(nvl); 6586 spa_activate(spa, spa_mode_global); 6587 6588 if (props && (error = spa_prop_validate(spa, props))) { 6589 spa_deactivate(spa); 6590 spa_remove(spa); 6591 mutex_exit(&spa_namespace_lock); 6592 return (error); 6593 } 6594 6595 /* 6596 * Temporary pool names should never be written to disk. 
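* ZFS_IMPORT_TEMP_NAME records that the in-core name ('poolname', from
* the tname property) is only an alias; 'pool' is the name stored in
* the config above and is what ends up on disk.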
6597 */ 6598 if (poolname != pool) 6599 spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME; 6600 6601 has_features = B_FALSE; 6602 has_encryption = B_FALSE; 6603 has_allocclass = B_FALSE; 6604 for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 6605 elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 6606 if (zpool_prop_feature(nvpair_name(elem))) { 6607 has_features = B_TRUE; 6608 6609 feat_name = strchr(nvpair_name(elem), '@') + 1; 6610 VERIFY0(zfeature_lookup_name(feat_name, &feat)); 6611 if (feat == SPA_FEATURE_ENCRYPTION) 6612 has_encryption = B_TRUE; 6613 if (feat == SPA_FEATURE_ALLOCATION_CLASSES) 6614 has_allocclass = B_TRUE; 6615 } 6616 } 6617 6618 /* verify encryption params, if they were provided */ 6619 if (dcp != NULL) { 6620 error = spa_create_check_encryption_params(dcp, has_encryption); 6621 if (error != 0) { 6622 spa_deactivate(spa); 6623 spa_remove(spa); 6624 mutex_exit(&spa_namespace_lock); 6625 return (error); 6626 } 6627 } 6628 if (!has_allocclass && zfs_special_devs(nvroot, NULL)) { 6629 spa_deactivate(spa); 6630 spa_remove(spa); 6631 mutex_exit(&spa_namespace_lock); 6632 return (ENOTSUP); 6633 } 6634 6635 if (has_features || nvlist_lookup_uint64(props, 6636 zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 6637 version = SPA_VERSION; 6638 } 6639 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 6640 6641 spa->spa_first_txg = txg; 6642 spa->spa_uberblock.ub_txg = txg - 1; 6643 spa->spa_uberblock.ub_version = version; 6644 spa->spa_ubsync = spa->spa_uberblock; 6645 spa->spa_load_state = SPA_LOAD_CREATE; 6646 spa->spa_removing_phys.sr_state = DSS_NONE; 6647 spa->spa_removing_phys.sr_removing_vdev = -1; 6648 spa->spa_removing_phys.sr_prev_indirect_vdev = -1; 6649 spa->spa_indirect_vdevs_loaded = B_TRUE; 6650 6651 /* 6652 * Create "The Godfather" zio to hold all async IOs 6653 */ 6654 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 6655 KM_SLEEP); 6656 for (int i = 0; i < max_ncpus; i++) { 6657 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 6658 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 6659 ZIO_FLAG_GODFATHER); 6660 } 6661 6662 /* 6663 * Create the root vdev. 6664 */ 6665 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6666 6667 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 6668 6669 ASSERT(error != 0 || rvd != NULL); 6670 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 6671 6672 if (error == 0 && !zfs_allocatable_devs(nvroot)) 6673 error = SET_ERROR(EINVAL); 6674 6675 if (error == 0 && 6676 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 6677 (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 && 6678 (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { 6679 /* 6680 * instantiate the metaslab groups (this will dirty the vdevs) 6681 * we can no longer error exit past this point 6682 */ 6683 for (int c = 0; error == 0 && c < rvd->vdev_children; c++) { 6684 vdev_t *vd = rvd->vdev_child[c]; 6685 6686 vdev_metaslab_set_size(vd); 6687 vdev_expand(vd, txg); 6688 } 6689 } 6690 6691 spa_config_exit(spa, SCL_ALL, FTAG); 6692 6693 if (error != 0) { 6694 spa_unload(spa); 6695 spa_deactivate(spa); 6696 spa_remove(spa); 6697 mutex_exit(&spa_namespace_lock); 6698 return (error); 6699 } 6700 6701 /* 6702 * Get the list of spares, if specified. 
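* Spares (and the l2cache devices below) are kept in their own
* sav_config nvlists and written out separately from the main vdev
* tree, driven by the sav_sync flag.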
6703 */ 6704 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 6705 &spares, &nspares) == 0) { 6706 spa->spa_spares.sav_config = fnvlist_alloc(); 6707 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 6708 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, 6709 nspares); 6710 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6711 spa_load_spares(spa); 6712 spa_config_exit(spa, SCL_ALL, FTAG); 6713 spa->spa_spares.sav_sync = B_TRUE; 6714 } 6715 6716 /* 6717 * Get the list of level 2 cache devices, if specified. 6718 */ 6719 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 6720 &l2cache, &nl2cache) == 0) { 6721 VERIFY0(nvlist_alloc(&spa->spa_l2cache.sav_config, 6722 NV_UNIQUE_NAME, KM_SLEEP)); 6723 fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 6724 ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, 6725 nl2cache); 6726 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6727 spa_load_l2cache(spa); 6728 spa_config_exit(spa, SCL_ALL, FTAG); 6729 spa->spa_l2cache.sav_sync = B_TRUE; 6730 } 6731 6732 spa->spa_is_initializing = B_TRUE; 6733 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, dcp, txg); 6734 spa->spa_is_initializing = B_FALSE; 6735 6736 /* 6737 * Create DDTs (dedup tables). 6738 */ 6739 ddt_create(spa); 6740 /* 6741 * Create BRT table and BRT table object. 6742 */ 6743 brt_create(spa); 6744 6745 spa_update_dspace(spa); 6746 6747 tx = dmu_tx_create_assigned(dp, txg); 6748 6749 /* 6750 * Create the pool's history object. 6751 */ 6752 if (version >= SPA_VERSION_ZPOOL_HISTORY && !spa->spa_history) 6753 spa_history_create_obj(spa, tx); 6754 6755 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE); 6756 spa_history_log_version(spa, "create", tx); 6757 6758 /* 6759 * Create the pool config object. 6760 */ 6761 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 6762 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 6763 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 6764 6765 if (zap_add(spa->spa_meta_objset, 6766 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 6767 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 6768 cmn_err(CE_PANIC, "failed to add pool config"); 6769 } 6770 6771 if (zap_add(spa->spa_meta_objset, 6772 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 6773 sizeof (uint64_t), 1, &version, tx) != 0) { 6774 cmn_err(CE_PANIC, "failed to add pool version"); 6775 } 6776 6777 /* Newly created pools with the right version are always deflated. */ 6778 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 6779 spa->spa_deflate = TRUE; 6780 if (zap_add(spa->spa_meta_objset, 6781 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 6782 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 6783 cmn_err(CE_PANIC, "failed to add deflate"); 6784 } 6785 } 6786 6787 /* 6788 * Create the deferred-free bpobj. Turn off compression 6789 * because sync-to-convergence takes longer if the blocksize 6790 * keeps changing. 6791 */ 6792 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 6793 dmu_object_set_compress(spa->spa_meta_objset, obj, 6794 ZIO_COMPRESS_OFF, tx); 6795 if (zap_add(spa->spa_meta_objset, 6796 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 6797 sizeof (uint64_t), 1, &obj, tx) != 0) { 6798 cmn_err(CE_PANIC, "failed to add bpobj"); 6799 } 6800 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 6801 spa->spa_meta_objset, obj)); 6802 6803 /* 6804 * Generate some random noise for salted checksums to operate on. 
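* The salt lives in spa_cksum_salt and is mixed into every checksum
* computed by salted algorithms such as skein and edonr.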
6805 */ 6806 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 6807 sizeof (spa->spa_cksum_salt.zcs_bytes)); 6808 6809 /* 6810 * Set pool properties. 6811 */ 6812 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 6813 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 6814 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 6815 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 6816 spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST); 6817 spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM); 6818 spa->spa_dedup_table_quota = 6819 zpool_prop_default_numeric(ZPOOL_PROP_DEDUP_TABLE_QUOTA); 6820 6821 if (props != NULL) { 6822 spa_configfile_set(spa, props, B_FALSE); 6823 spa_sync_props(props, tx); 6824 } 6825 6826 for (int i = 0; i < ndraid; i++) 6827 spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); 6828 6829 dmu_tx_commit(tx); 6830 6831 spa->spa_sync_on = B_TRUE; 6832 txg_sync_start(dp); 6833 mmp_thread_start(spa); 6834 txg_wait_synced(dp, txg); 6835 6836 spa_spawn_aux_threads(spa); 6837 6838 spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); 6839 6840 /* 6841 * Don't count references from objsets that are already closed 6842 * and are making their way through the eviction process. 6843 */ 6844 spa_evicting_os_wait(spa); 6845 spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); 6846 spa->spa_load_state = SPA_LOAD_NONE; 6847 6848 spa_import_os(spa); 6849 6850 mutex_exit(&spa_namespace_lock); 6851 6852 return (0); 6853 } 6854 6855 /* 6856 * Import a non-root pool into the system. 6857 */ 6858 int 6859 spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 6860 { 6861 spa_t *spa; 6862 const char *altroot = NULL; 6863 spa_load_state_t state = SPA_LOAD_IMPORT; 6864 zpool_load_policy_t policy; 6865 spa_mode_t mode = spa_mode_global; 6866 uint64_t readonly = B_FALSE; 6867 int error; 6868 nvlist_t *nvroot; 6869 nvlist_t **spares, **l2cache; 6870 uint_t nspares, nl2cache; 6871 6872 /* 6873 * If a pool with this name exists, return failure. 6874 */ 6875 mutex_enter(&spa_namespace_lock); 6876 if (spa_lookup(pool) != NULL) { 6877 mutex_exit(&spa_namespace_lock); 6878 return (SET_ERROR(EEXIST)); 6879 } 6880 6881 /* 6882 * Create and initialize the spa structure. 6883 */ 6884 (void) nvlist_lookup_string(props, 6885 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 6886 (void) nvlist_lookup_uint64(props, 6887 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 6888 if (readonly) 6889 mode = SPA_MODE_READ; 6890 spa = spa_add(pool, config, altroot); 6891 spa->spa_import_flags = flags; 6892 6893 /* 6894 * Verbatim import - Take a pool and insert it into the namespace 6895 * as if it had been loaded at boot. 6896 */ 6897 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 6898 if (props != NULL) 6899 spa_configfile_set(spa, props, B_FALSE); 6900 6901 spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); 6902 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); 6903 zfs_dbgmsg("spa_import: verbatim import of %s", pool); 6904 mutex_exit(&spa_namespace_lock); 6905 return (0); 6906 } 6907 6908 spa_activate(spa, mode); 6909 6910 /* 6911 * Don't start async tasks until we know everything is healthy. 
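* They are re-enabled by the spa_async_resume() call below, once the
* load has succeeded and any user-supplied properties have been set.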
6912 */ 6913 spa_async_suspend(spa); 6914 6915 zpool_get_load_policy(config, &policy); 6916 if (policy.zlp_rewind & ZPOOL_DO_REWIND) 6917 state = SPA_LOAD_RECOVER; 6918 6919 spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT; 6920 6921 if (state != SPA_LOAD_RECOVER) { 6922 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 6923 zfs_dbgmsg("spa_import: importing %s", pool); 6924 } else { 6925 zfs_dbgmsg("spa_import: importing %s, max_txg=%lld " 6926 "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg); 6927 } 6928 error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind); 6929 6930 /* 6931 * Propagate anything learned while loading the pool and pass it 6932 * back to caller (i.e. rewind info, missing devices, etc). 6933 */ 6934 fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info); 6935 6936 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6937 /* 6938 * Toss any existing sparelist, as it doesn't have any validity 6939 * anymore, and conflicts with spa_has_spare(). 6940 */ 6941 if (spa->spa_spares.sav_config) { 6942 nvlist_free(spa->spa_spares.sav_config); 6943 spa->spa_spares.sav_config = NULL; 6944 spa_load_spares(spa); 6945 } 6946 if (spa->spa_l2cache.sav_config) { 6947 nvlist_free(spa->spa_l2cache.sav_config); 6948 spa->spa_l2cache.sav_config = NULL; 6949 spa_load_l2cache(spa); 6950 } 6951 6952 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 6953 spa_config_exit(spa, SCL_ALL, FTAG); 6954 6955 if (props != NULL) 6956 spa_configfile_set(spa, props, B_FALSE); 6957 6958 if (error != 0 || (props && spa_writeable(spa) && 6959 (error = spa_prop_set(spa, props)))) { 6960 spa_unload(spa); 6961 spa_deactivate(spa); 6962 spa_remove(spa); 6963 mutex_exit(&spa_namespace_lock); 6964 return (error); 6965 } 6966 6967 spa_async_resume(spa); 6968 6969 /* 6970 * Override any spares and level 2 cache devices as specified by 6971 * the user, as these may have correct device names/devids, etc. 6972 */ 6973 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 6974 &spares, &nspares) == 0) { 6975 if (spa->spa_spares.sav_config) 6976 fnvlist_remove(spa->spa_spares.sav_config, 6977 ZPOOL_CONFIG_SPARES); 6978 else 6979 spa->spa_spares.sav_config = fnvlist_alloc(); 6980 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 6981 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, 6982 nspares); 6983 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6984 spa_load_spares(spa); 6985 spa_config_exit(spa, SCL_ALL, FTAG); 6986 spa->spa_spares.sav_sync = B_TRUE; 6987 spa->spa_spares.sav_label_sync = B_TRUE; 6988 } 6989 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 6990 &l2cache, &nl2cache) == 0) { 6991 if (spa->spa_l2cache.sav_config) 6992 fnvlist_remove(spa->spa_l2cache.sav_config, 6993 ZPOOL_CONFIG_L2CACHE); 6994 else 6995 spa->spa_l2cache.sav_config = fnvlist_alloc(); 6996 fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 6997 ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, 6998 nl2cache); 6999 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 7000 spa_load_l2cache(spa); 7001 spa_config_exit(spa, SCL_ALL, FTAG); 7002 spa->spa_l2cache.sav_sync = B_TRUE; 7003 spa->spa_l2cache.sav_label_sync = B_TRUE; 7004 } 7005 7006 /* 7007 * Check for any removed devices. 7008 */ 7009 if (spa->spa_autoreplace) { 7010 spa_aux_check_removed(&spa->spa_spares); 7011 spa_aux_check_removed(&spa->spa_l2cache); 7012 } 7013 7014 if (spa_writeable(spa)) { 7015 /* 7016 * Update the config cache to include the newly-imported pool. 
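* (This is what makes the newly-imported pool show up in the cachefile
* so it can be found again on the next boot.)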
7017 */
7018 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
7019 }
7020
7021 /*
7022 * It's possible that the pool was expanded while it was exported.
7023 * We kick off an async task to handle this for us.
7024 */
7025 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
7026
7027 spa_history_log_version(spa, "import", NULL);
7028
7029 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
7030
7031 mutex_exit(&spa_namespace_lock);
7032
7033 zvol_create_minors(pool);
7034
7035 spa_import_os(spa);
7036
7037 return (0);
7038 }
7039
7040 nvlist_t *
7041 spa_tryimport(nvlist_t *tryconfig)
7042 {
7043 nvlist_t *config = NULL;
7044 const char *poolname, *cachefile;
7045 spa_t *spa;
7046 uint64_t state;
7047 int error;
7048 zpool_load_policy_t policy;
7049
7050 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
7051 return (NULL);
7052
7053 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
7054 return (NULL);
7055
7056 /*
7057 * Create and initialize the spa structure.
7058 */
7059 char *name = kmem_alloc(MAXPATHLEN, KM_SLEEP);
7060 (void) snprintf(name, MAXPATHLEN, "%s-%llx-%s",
7061 TRYIMPORT_NAME, (u_longlong_t)(uintptr_t)curthread, poolname);
7062
7063 mutex_enter(&spa_namespace_lock);
7064 spa = spa_add(name, tryconfig, NULL);
7065 spa_activate(spa, SPA_MODE_READ);
7066 kmem_free(name, MAXPATHLEN);
7067
7068 /*
7069 * Rewind pool if a max txg was provided.
7070 */
7071 zpool_get_load_policy(spa->spa_config, &policy);
7072 if (policy.zlp_txg != UINT64_MAX) {
7073 spa->spa_load_max_txg = policy.zlp_txg;
7074 spa->spa_extreme_rewind = B_TRUE;
7075 zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld",
7076 poolname, (longlong_t)policy.zlp_txg);
7077 } else {
7078 zfs_dbgmsg("spa_tryimport: importing %s", poolname);
7079 }
7080
7081 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile)
7082 == 0) {
7083 zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile);
7084 spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
7085 } else {
7086 spa->spa_config_source = SPA_CONFIG_SRC_SCAN;
7087 }
7088
7089 /*
7090 * spa_import() relies on a pool config fetched by spa_tryimport()
7091 * for spare/cache devices. Import flags are not passed to
7092 * spa_tryimport(), which would make it return early on a missing log
7093 * device and never retrieve the cache and spare devices. Setting
7094 * ZFS_IMPORT_MISSING_LOG here makes spa_tryimport() fetch the complete
7095 * configuration regardless of any missing log device.
7096 */
7097 spa->spa_import_flags |= ZFS_IMPORT_MISSING_LOG;
7098
7099 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING);
7100
7101 /*
7102 * If 'tryconfig' was at least parsable, return the current config.
7103 */
7104 if (spa->spa_root_vdev != NULL) {
7105 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
7106 fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, poolname);
7107 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state);
7108 fnvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
7109 spa->spa_uberblock.ub_timestamp);
7110 fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
7111 spa->spa_load_info);
7112 fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA,
7113 spa->spa_errata);
7114
7115 /*
7116 * If the bootfs property exists on this pool then we
7117 * copy it out so that external consumers can tell which
7118 * pools are bootable.
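* The dataset name is translated back from the temporary
* TRYIMPORT_NAME-based pool name to the real pool name below.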
7119 */ 7120 if ((!error || error == EEXIST) && spa->spa_bootfs) { 7121 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 7122 7123 /* 7124 * We have to play games with the name since the 7125 * pool was opened as TRYIMPORT_NAME. 7126 */ 7127 if (dsl_dsobj_to_dsname(spa_name(spa), 7128 spa->spa_bootfs, tmpname) == 0) { 7129 char *cp; 7130 char *dsname; 7131 7132 dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 7133 7134 cp = strchr(tmpname, '/'); 7135 if (cp == NULL) { 7136 (void) strlcpy(dsname, tmpname, 7137 MAXPATHLEN); 7138 } else { 7139 (void) snprintf(dsname, MAXPATHLEN, 7140 "%s/%s", poolname, ++cp); 7141 } 7142 fnvlist_add_string(config, ZPOOL_CONFIG_BOOTFS, 7143 dsname); 7144 kmem_free(dsname, MAXPATHLEN); 7145 } 7146 kmem_free(tmpname, MAXPATHLEN); 7147 } 7148 7149 /* 7150 * Add the list of hot spares and level 2 cache devices. 7151 */ 7152 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 7153 spa_add_spares(spa, config); 7154 spa_add_l2cache(spa, config); 7155 spa_config_exit(spa, SCL_CONFIG, FTAG); 7156 } 7157 7158 spa_unload(spa); 7159 spa_deactivate(spa); 7160 spa_remove(spa); 7161 mutex_exit(&spa_namespace_lock); 7162 7163 return (config); 7164 } 7165 7166 /* 7167 * Pool export/destroy 7168 * 7169 * The act of destroying or exporting a pool is very simple. We make sure there 7170 * is no more pending I/O and any references to the pool are gone. Then, we 7171 * update the pool state and sync all the labels to disk, removing the 7172 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 7173 * we don't sync the labels or remove the configuration cache. 7174 */ 7175 static int 7176 spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, 7177 boolean_t force, boolean_t hardforce) 7178 { 7179 int error = 0; 7180 spa_t *spa; 7181 hrtime_t export_start = gethrtime(); 7182 7183 if (oldconfig) 7184 *oldconfig = NULL; 7185 7186 if (!(spa_mode_global & SPA_MODE_WRITE)) 7187 return (SET_ERROR(EROFS)); 7188 7189 mutex_enter(&spa_namespace_lock); 7190 if ((spa = spa_lookup(pool)) == NULL) { 7191 mutex_exit(&spa_namespace_lock); 7192 return (SET_ERROR(ENOENT)); 7193 } 7194 7195 if (spa->spa_is_exporting) { 7196 /* the pool is being exported by another thread */ 7197 mutex_exit(&spa_namespace_lock); 7198 return (SET_ERROR(ZFS_ERR_EXPORT_IN_PROGRESS)); 7199 } 7200 spa->spa_is_exporting = B_TRUE; 7201 7202 /* 7203 * Put a hold on the pool, drop the namespace lock, stop async tasks 7204 * and see if we can export. 7205 */ 7206 spa_open_ref(spa, FTAG); 7207 mutex_exit(&spa_namespace_lock); 7208 spa_async_suspend(spa); 7209 if (spa->spa_zvol_taskq) { 7210 zvol_remove_minors(spa, spa_name(spa), B_TRUE); 7211 taskq_wait(spa->spa_zvol_taskq); 7212 } 7213 mutex_enter(&spa_namespace_lock); 7214 spa->spa_export_thread = curthread; 7215 spa_close(spa, FTAG); 7216 7217 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 7218 mutex_exit(&spa_namespace_lock); 7219 goto export_spa; 7220 } 7221 7222 /* 7223 * The pool will be in core if it's openable, in which case we can 7224 * modify its state. Objsets may be open only because they're dirty, 7225 * so we have to force it to sync before checking spa_refcnt. 7226 */ 7227 if (spa->spa_sync_on) { 7228 txg_wait_synced(spa->spa_dsl_pool, 0); 7229 spa_evicting_os_wait(spa); 7230 } 7231 7232 /* 7233 * A pool cannot be exported or destroyed if there are active 7234 * references. If we are resetting a pool, allow references by 7235 * fault injection handlers. 
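* (Fault injection handlers take their reference via
* spa_inject_addref(); a reset leaves those handlers in place.)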
7236 */
7237 if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0 && new_state != POOL_STATE_UNINITIALIZED)) {
7238 error = SET_ERROR(EBUSY);
7239 goto fail;
7240 }
7241
7242 mutex_exit(&spa_namespace_lock);
7243 /*
7244 * At this point we no longer hold the spa_namespace_lock and
7245 * there are no references on the spa. Future spa_lookups will
7246 * notice the spa->spa_export_thread and wait until we signal
7247 * that we are finished.
7248 */
7249
7250 if (spa->spa_sync_on) {
7251 vdev_t *rvd = spa->spa_root_vdev;
7252 /*
7253 * A pool cannot be exported if it has an active shared spare.
7254 * This is to prevent other pools from stealing the active spare
7255 * from an exported pool. The user can still force the export
7256 * if they really want to.
7257 */
7258 if (!force && new_state == POOL_STATE_EXPORTED &&
7259 spa_has_active_shared_spare(spa)) {
7260 error = SET_ERROR(EXDEV);
7261 mutex_enter(&spa_namespace_lock);
7262 goto fail;
7263 }
7264
7265 /*
7266 * We're about to export or destroy this pool. Make sure
7267 * we stop all initialization and trim activity here before
7268 * we set the spa_final_txg. This will ensure that all
7269 * dirty data resulting from the initialization is
7270 * committed to disk before we unload the pool.
7271 */
7272 vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE);
7273 vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE);
7274 vdev_autotrim_stop_all(spa);
7275 vdev_rebuild_stop_all(spa);
7276 l2arc_spa_rebuild_stop(spa);
7277
7278 /*
7279 * We want this to be reflected on every label,
7280 * so mark them all dirty. spa_unload() will do the
7281 * final sync that pushes these changes out.
7282 */
7283 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
7284 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
7285 spa->spa_state = new_state;
7286 vdev_config_dirty(rvd);
7287 spa_config_exit(spa, SCL_ALL, FTAG);
7288 }
7289
7290 if (spa_should_sync_time_logger_on_unload(spa))
7291 spa_unload_sync_time_logger(spa);
7292
7293 /*
7294 * If the log space map feature is enabled and the pool is
7295 * getting exported (but not destroyed), we want to spend some
7296 * time flushing as many metaslabs as we can in an attempt to
7297 * destroy log space maps and save import time. This has to be
7298 * done before we set the spa_final_txg, otherwise
7299 * spa_sync() -> spa_flush_metaslabs() may dirty the final TXGs.
7300 * spa_should_flush_logs_on_unload() should be called after
7301 * spa_state has been set to the new_state.
7302 */ 7303 if (spa_should_flush_logs_on_unload(spa)) 7304 spa_unload_log_sm_flush_all(spa); 7305 7306 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 7307 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 7308 spa->spa_final_txg = spa_last_synced_txg(spa) + 7309 TXG_DEFER_SIZE + 1; 7310 spa_config_exit(spa, SCL_ALL, FTAG); 7311 } 7312 } 7313 7314 export_spa: 7315 spa_export_os(spa); 7316 7317 if (new_state == POOL_STATE_DESTROYED) 7318 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY); 7319 else if (new_state == POOL_STATE_EXPORTED) 7320 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_EXPORT); 7321 7322 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 7323 spa_unload(spa); 7324 spa_deactivate(spa); 7325 } 7326 7327 if (oldconfig && spa->spa_config) 7328 *oldconfig = fnvlist_dup(spa->spa_config); 7329 7330 if (new_state == POOL_STATE_EXPORTED) 7331 zio_handle_export_delay(spa, gethrtime() - export_start); 7332 7333 /* 7334 * Take the namespace lock for the actual spa_t removal 7335 */ 7336 mutex_enter(&spa_namespace_lock); 7337 if (new_state != POOL_STATE_UNINITIALIZED) { 7338 if (!hardforce) 7339 spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE); 7340 spa_remove(spa); 7341 } else { 7342 /* 7343 * If spa_remove() is not called for this spa_t and 7344 * there is any possibility that it can be reused, 7345 * we make sure to reset the exporting flag. 7346 */ 7347 spa->spa_is_exporting = B_FALSE; 7348 spa->spa_export_thread = NULL; 7349 } 7350 7351 /* 7352 * Wake up any waiters in spa_lookup() 7353 */ 7354 cv_broadcast(&spa_namespace_cv); 7355 mutex_exit(&spa_namespace_lock); 7356 return (0); 7357 7358 fail: 7359 spa->spa_is_exporting = B_FALSE; 7360 spa->spa_export_thread = NULL; 7361 7362 spa_async_resume(spa); 7363 /* 7364 * Wake up any waiters in spa_lookup() 7365 */ 7366 cv_broadcast(&spa_namespace_cv); 7367 mutex_exit(&spa_namespace_lock); 7368 return (error); 7369 } 7370 7371 /* 7372 * Destroy a storage pool. 7373 */ 7374 int 7375 spa_destroy(const char *pool) 7376 { 7377 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 7378 B_FALSE, B_FALSE)); 7379 } 7380 7381 /* 7382 * Export a storage pool. 7383 */ 7384 int 7385 spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force, 7386 boolean_t hardforce) 7387 { 7388 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 7389 force, hardforce)); 7390 } 7391 7392 /* 7393 * Similar to spa_export(), this unloads the spa_t without actually removing it 7394 * from the namespace in any way. 7395 */ 7396 int 7397 spa_reset(const char *pool) 7398 { 7399 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 7400 B_FALSE, B_FALSE)); 7401 } 7402 7403 /* 7404 * ========================================================================== 7405 * Device manipulation 7406 * ========================================================================== 7407 */ 7408 7409 /* 7410 * This is called as a synctask to increment the draid feature flag 7411 */ 7412 static void 7413 spa_draid_feature_incr(void *arg, dmu_tx_t *tx) 7414 { 7415 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 7416 int draid = (int)(uintptr_t)arg; 7417 7418 for (int c = 0; c < draid; c++) 7419 spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); 7420 } 7421 7422 /* 7423 * Add a device to a storage pool. 
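* The new top-level vdevs described by nvroot are grafted onto the
* existing root vdev; any spares or l2cache devices in nvroot are added
* to their respective aux lists. With check_ashift set, new top-level
* vdevs must match the pool's ashift when the existing vdevs all share
* a single ashift.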
7424 */ 7425 int 7426 spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t check_ashift) 7427 { 7428 uint64_t txg, ndraid = 0; 7429 int error; 7430 vdev_t *rvd = spa->spa_root_vdev; 7431 vdev_t *vd, *tvd; 7432 nvlist_t **spares, **l2cache; 7433 uint_t nspares, nl2cache; 7434 7435 ASSERT(spa_writeable(spa)); 7436 7437 txg = spa_vdev_enter(spa); 7438 7439 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 7440 VDEV_ALLOC_ADD)) != 0) 7441 return (spa_vdev_exit(spa, NULL, txg, error)); 7442 7443 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 7444 7445 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 7446 &nspares) != 0) 7447 nspares = 0; 7448 7449 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 7450 &nl2cache) != 0) 7451 nl2cache = 0; 7452 7453 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 7454 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 7455 7456 if (vd->vdev_children != 0 && 7457 (error = vdev_create(vd, txg, B_FALSE)) != 0) { 7458 return (spa_vdev_exit(spa, vd, txg, error)); 7459 } 7460 7461 /* 7462 * The virtual dRAID spares must be added after vdev tree is created 7463 * and the vdev guids are generated. The guid of their associated 7464 * dRAID is stored in the config and used when opening the spare. 7465 */ 7466 if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid, 7467 rvd->vdev_children)) == 0) { 7468 if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot, 7469 ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) 7470 nspares = 0; 7471 } else { 7472 return (spa_vdev_exit(spa, vd, txg, error)); 7473 } 7474 7475 /* 7476 * We must validate the spares and l2cache devices after checking the 7477 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 7478 */ 7479 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 7480 return (spa_vdev_exit(spa, vd, txg, error)); 7481 7482 /* 7483 * If we are in the middle of a device removal, we can only add 7484 * devices which match the existing devices in the pool. 7485 * If we are in the middle of a removal, or have some indirect 7486 * vdevs, we can not add raidz or dRAID top levels. 
7487 */ 7488 if (spa->spa_vdev_removal != NULL || 7489 spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { 7490 for (int c = 0; c < vd->vdev_children; c++) { 7491 tvd = vd->vdev_child[c]; 7492 if (spa->spa_vdev_removal != NULL && 7493 tvd->vdev_ashift != spa->spa_max_ashift) { 7494 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 7495 } 7496 /* Fail if top level vdev is raidz or a dRAID */ 7497 if (vdev_get_nparity(tvd) != 0) 7498 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 7499 7500 /* 7501 * Need the top level mirror to be 7502 * a mirror of leaf vdevs only 7503 */ 7504 if (tvd->vdev_ops == &vdev_mirror_ops) { 7505 for (uint64_t cid = 0; 7506 cid < tvd->vdev_children; cid++) { 7507 vdev_t *cvd = tvd->vdev_child[cid]; 7508 if (!cvd->vdev_ops->vdev_op_leaf) { 7509 return (spa_vdev_exit(spa, vd, 7510 txg, EINVAL)); 7511 } 7512 } 7513 } 7514 } 7515 } 7516 7517 if (check_ashift && spa->spa_max_ashift == spa->spa_min_ashift) { 7518 for (int c = 0; c < vd->vdev_children; c++) { 7519 tvd = vd->vdev_child[c]; 7520 if (tvd->vdev_ashift != spa->spa_max_ashift) { 7521 return (spa_vdev_exit(spa, vd, txg, 7522 ZFS_ERR_ASHIFT_MISMATCH)); 7523 } 7524 } 7525 } 7526 7527 for (int c = 0; c < vd->vdev_children; c++) { 7528 tvd = vd->vdev_child[c]; 7529 vdev_remove_child(vd, tvd); 7530 tvd->vdev_id = rvd->vdev_children; 7531 vdev_add_child(rvd, tvd); 7532 vdev_config_dirty(tvd); 7533 } 7534 7535 if (nspares != 0) { 7536 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 7537 ZPOOL_CONFIG_SPARES); 7538 spa_load_spares(spa); 7539 spa->spa_spares.sav_sync = B_TRUE; 7540 } 7541 7542 if (nl2cache != 0) { 7543 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 7544 ZPOOL_CONFIG_L2CACHE); 7545 spa_load_l2cache(spa); 7546 spa->spa_l2cache.sav_sync = B_TRUE; 7547 } 7548 7549 /* 7550 * We can't increment a feature while holding spa_vdev so we 7551 * have to do it in a synctask. 7552 */ 7553 if (ndraid != 0) { 7554 dmu_tx_t *tx; 7555 7556 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 7557 dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr, 7558 (void *)(uintptr_t)ndraid, tx); 7559 dmu_tx_commit(tx); 7560 } 7561 7562 /* 7563 * We have to be careful when adding new vdevs to an existing pool. 7564 * If other threads start allocating from these vdevs before we 7565 * sync the config cache, and we lose power, then upon reboot we may 7566 * fail to open the pool because there are DVAs that the config cache 7567 * can't translate. Therefore, we first add the vdevs without 7568 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 7569 * and then let spa_config_update() initialize the new metaslabs. 7570 * 7571 * spa_load() checks for added-but-not-initialized vdevs, so that 7572 * if we lose power at any point in this sequence, the remaining 7573 * steps will be completed the next time we load the pool. 7574 */ 7575 (void) spa_vdev_exit(spa, vd, txg, 0); 7576 7577 mutex_enter(&spa_namespace_lock); 7578 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 7579 spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD); 7580 mutex_exit(&spa_namespace_lock); 7581 7582 return (0); 7583 } 7584 7585 /* 7586 * Given a vdev to be replaced and its parent, check for a possible 7587 * "double spare" condition if a vdev is to be replaced by a spare. When this 7588 * happens, you can get two spares assigned to one failed vdev. 7589 * 7590 * To trigger a double spare condition: 7591 * 7592 * 1. disk1 fails 7593 * 2. 1st spare is kicked in for disk1 and it resilvers 7594 * 3. 
Someone replaces disk1 with a new blank disk
7595  * 4. New blank disk starts resilvering
7596  * 5. While resilvering, new blank disk has IO errors and faults
7597  * 6. 2nd spare is kicked in for new blank disk
7598  * 7. At this point two spares are kicked in for the original disk1.
7599  *
7600  * It looks like this:
7601  *
7602  *   NAME                                             STATE     READ WRITE CKSUM
7603  *   tank2                                            DEGRADED     0     0     0
7604  *     draid2:6d:10c:2s-0                             DEGRADED     0     0     0
7605  *       scsi-0QEMU_QEMU_HARDDISK_d1                  ONLINE       0     0     0
7606  *       scsi-0QEMU_QEMU_HARDDISK_d2                  ONLINE       0     0     0
7607  *       scsi-0QEMU_QEMU_HARDDISK_d3                  ONLINE       0     0     0
7608  *       scsi-0QEMU_QEMU_HARDDISK_d4                  ONLINE       0     0     0
7609  *       scsi-0QEMU_QEMU_HARDDISK_d5                  ONLINE       0     0     0
7610  *       scsi-0QEMU_QEMU_HARDDISK_d6                  ONLINE       0     0     0
7611  *       scsi-0QEMU_QEMU_HARDDISK_d7                  ONLINE       0     0     0
7612  *       scsi-0QEMU_QEMU_HARDDISK_d8                  ONLINE       0     0     0
7613  *       scsi-0QEMU_QEMU_HARDDISK_d9                  ONLINE       0     0     0
7614  *       spare-9                                      DEGRADED     0     0     0
7615  *         replacing-0                                DEGRADED     0    93     0
7616  *           scsi-0QEMU_QEMU_HARDDISK_d10-part1/old   UNAVAIL      0     0     0
7617  *           spare-1                                  DEGRADED     0     0     0
7618  *             scsi-0QEMU_QEMU_HARDDISK_d10           REMOVED      0     0     0
7619  *             draid2-0-0                             ONLINE       0     0     0
7620  *         draid2-0-1                                 ONLINE       0     0     0
7621  *   spares
7622  *     draid2-0-0                                      INUSE     currently in use
7623  *     draid2-0-1                                      INUSE     currently in use
7624  *
7625  * ARGS:
7626  *
7627  * newvd: New spare disk
7628  * pvd:   Parent vdev_t the spare should attach to
7629  *
7630  * This function returns B_TRUE if adding the new vdev would create a double
7631  * spare condition, B_FALSE otherwise.
7632  */
7633 static boolean_t
7634 spa_vdev_new_spare_would_cause_double_spares(vdev_t *newvd, vdev_t *pvd)
7635 {
7636 	vdev_t *ppvd;
7637 
7638 	ppvd = pvd->vdev_parent;
7639 	if (ppvd == NULL)
7640 		return (B_FALSE);
7641 
7642 	/*
7643 	 * To determine if this configuration would cause a double spare, we
7644 	 * look at the vdev_ops of the parent vdev, and of the parent's parent
7645 	 * vdev. We also look at vdev_isspare on the new disk. A double spare
7646 	 * condition looks like this:
7647 	 *
7648 	 * 1. parent of parent's op is a spare or draid spare
7649 	 * 2. parent's op is replacing
7650 	 * 3. new disk is a spare
7651 	 */
7652 	if ((ppvd->vdev_ops == &vdev_spare_ops) ||
7653 	    (ppvd->vdev_ops == &vdev_draid_spare_ops))
7654 		if (pvd->vdev_ops == &vdev_replacing_ops)
7655 			if (newvd->vdev_isspare)
7656 				return (B_TRUE);
7657 
7658 	return (B_FALSE);
7659 }
7660 
7661 /*
7662  * Attach a device to a vdev specified by its guid. The vdev type can be
7663  * a mirror, a raidz, or a leaf device that is also a top-level (e.g. a
7664  * single device). When the vdev is a single device, a mirror vdev will be
7665  * automatically inserted.
7666  *
7667  * If 'replacing' is specified, the new device is intended to replace the
7668  * existing device; in this case the two devices are made into their own
7669  * mirror using the 'replacing' vdev, which is functionally identical to
7670  * the mirror vdev (it actually reuses all the same ops) but has a few
7671  * extra rules: you can't attach to it after it's been created, and upon
7672  * completion of resilvering, the first disk (the one being replaced)
7673  * is automatically detached.
7674  *
7675  * If 'rebuild' is specified, then sequential reconstruction (a.k.a. rebuild)
7676  * should be performed instead of traditional healing reconstruction. From
7677  * an administrator's perspective, these are both resilver operations.
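 *
 * For example (illustrative only): from userland, 'zpool attach' reaches
 * this code with 'replacing' unset, while 'zpool replace' sets it.
 * Attaching disk B to a single-disk top-level A produces mirror(A,B);
 * replacing A with B produces replacing(A,B) until the resilver
 * completes, at which point A is detached and only B remains.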
7678 */ 7679 int 7680 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, 7681 int rebuild) 7682 { 7683 uint64_t txg, dtl_max_txg; 7684 vdev_t *rvd = spa->spa_root_vdev; 7685 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 7686 vdev_ops_t *pvops; 7687 char *oldvdpath, *newvdpath; 7688 int newvd_isspare = B_FALSE; 7689 int error; 7690 7691 ASSERT(spa_writeable(spa)); 7692 7693 txg = spa_vdev_enter(spa); 7694 7695 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 7696 7697 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 7698 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 7699 error = (spa_has_checkpoint(spa)) ? 7700 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 7701 return (spa_vdev_exit(spa, NULL, txg, error)); 7702 } 7703 7704 if (rebuild) { 7705 if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) 7706 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7707 7708 if (dsl_scan_resilvering(spa_get_dsl(spa)) || 7709 dsl_scan_resilver_scheduled(spa_get_dsl(spa))) { 7710 return (spa_vdev_exit(spa, NULL, txg, 7711 ZFS_ERR_RESILVER_IN_PROGRESS)); 7712 } 7713 } else { 7714 if (vdev_rebuild_active(rvd)) 7715 return (spa_vdev_exit(spa, NULL, txg, 7716 ZFS_ERR_REBUILD_IN_PROGRESS)); 7717 } 7718 7719 if (spa->spa_vdev_removal != NULL) { 7720 return (spa_vdev_exit(spa, NULL, txg, 7721 ZFS_ERR_DEVRM_IN_PROGRESS)); 7722 } 7723 7724 if (oldvd == NULL) 7725 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 7726 7727 boolean_t raidz = oldvd->vdev_ops == &vdev_raidz_ops; 7728 7729 if (raidz) { 7730 if (!spa_feature_is_enabled(spa, SPA_FEATURE_RAIDZ_EXPANSION)) 7731 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7732 7733 /* 7734 * Can't expand a raidz while prior expand is in progress. 7735 */ 7736 if (spa->spa_raidz_expand != NULL) { 7737 return (spa_vdev_exit(spa, NULL, txg, 7738 ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS)); 7739 } 7740 } else if (!oldvd->vdev_ops->vdev_op_leaf) { 7741 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7742 } 7743 7744 if (raidz) 7745 pvd = oldvd; 7746 else 7747 pvd = oldvd->vdev_parent; 7748 7749 if (spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 7750 VDEV_ALLOC_ATTACH) != 0) 7751 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 7752 7753 if (newrootvd->vdev_children != 1) 7754 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 7755 7756 newvd = newrootvd->vdev_child[0]; 7757 7758 if (!newvd->vdev_ops->vdev_op_leaf) 7759 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 7760 7761 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 7762 return (spa_vdev_exit(spa, newrootvd, txg, error)); 7763 7764 /* 7765 * log, dedup and special vdevs should not be replaced by spares. 7766 */ 7767 if ((oldvd->vdev_top->vdev_alloc_bias != VDEV_BIAS_NONE || 7768 oldvd->vdev_top->vdev_islog) && newvd->vdev_isspare) { 7769 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7770 } 7771 7772 /* 7773 * A dRAID spare can only replace a child of its parent dRAID vdev. 7774 */ 7775 if (newvd->vdev_ops == &vdev_draid_spare_ops && 7776 oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) { 7777 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7778 } 7779 7780 if (rebuild) { 7781 /* 7782 * For rebuilds, the top vdev must support reconstruction 7783 * using only space maps. This means the only allowable 7784 * vdevs types are the root vdev, a mirror, or dRAID. 
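 * (That is, the new child can be rebuilt by copying the allocated ranges
 * recorded in the metaslab space maps without traversing the block
 * pointer tree, which is why raidz top-level vdevs are rejected below.)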
7785 */ 7786 tvd = pvd; 7787 if (pvd->vdev_top != NULL) 7788 tvd = pvd->vdev_top; 7789 7790 if (tvd->vdev_ops != &vdev_mirror_ops && 7791 tvd->vdev_ops != &vdev_root_ops && 7792 tvd->vdev_ops != &vdev_draid_ops) { 7793 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7794 } 7795 } 7796 7797 if (!replacing) { 7798 /* 7799 * For attach, the only allowable parent is a mirror or 7800 * the root vdev. A raidz vdev can be attached to, but 7801 * you cannot attach to a raidz child. 7802 */ 7803 if (pvd->vdev_ops != &vdev_mirror_ops && 7804 pvd->vdev_ops != &vdev_root_ops && 7805 !raidz) 7806 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7807 7808 pvops = &vdev_mirror_ops; 7809 } else { 7810 /* 7811 * Active hot spares can only be replaced by inactive hot 7812 * spares. 7813 */ 7814 if (pvd->vdev_ops == &vdev_spare_ops && 7815 oldvd->vdev_isspare && 7816 !spa_has_spare(spa, newvd->vdev_guid)) 7817 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7818 7819 /* 7820 * If the source is a hot spare, and the parent isn't already a 7821 * spare, then we want to create a new hot spare. Otherwise, we 7822 * want to create a replacing vdev. The user is not allowed to 7823 * attach to a spared vdev child unless the 'isspare' state is 7824 * the same (spare replaces spare, non-spare replaces 7825 * non-spare). 7826 */ 7827 if (pvd->vdev_ops == &vdev_replacing_ops && 7828 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 7829 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7830 } else if (pvd->vdev_ops == &vdev_spare_ops && 7831 newvd->vdev_isspare != oldvd->vdev_isspare) { 7832 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7833 } 7834 7835 if (spa_vdev_new_spare_would_cause_double_spares(newvd, pvd)) { 7836 vdev_dbgmsg(newvd, 7837 "disk would create double spares, ignore."); 7838 return (spa_vdev_exit(spa, newrootvd, txg, EEXIST)); 7839 } 7840 7841 if (newvd->vdev_isspare) 7842 pvops = &vdev_spare_ops; 7843 else 7844 pvops = &vdev_replacing_ops; 7845 } 7846 7847 /* 7848 * Make sure the new device is big enough. 7849 */ 7850 vdev_t *min_vdev = raidz ? oldvd->vdev_child[0] : oldvd; 7851 if (newvd->vdev_asize < vdev_get_min_asize(min_vdev)) 7852 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 7853 7854 /* 7855 * The new device cannot have a higher alignment requirement 7856 * than the top-level vdev. 7857 */ 7858 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) { 7859 return (spa_vdev_exit(spa, newrootvd, txg, 7860 ZFS_ERR_ASHIFT_MISMATCH)); 7861 } 7862 7863 /* 7864 * RAIDZ-expansion-specific checks. 7865 */ 7866 if (raidz) { 7867 if (vdev_raidz_attach_check(newvd) != 0) 7868 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7869 7870 /* 7871 * Fail early if a child is not healthy or being replaced 7872 */ 7873 for (int i = 0; i < oldvd->vdev_children; i++) { 7874 if (vdev_is_dead(oldvd->vdev_child[i]) || 7875 !oldvd->vdev_child[i]->vdev_ops->vdev_op_leaf) { 7876 return (spa_vdev_exit(spa, newrootvd, txg, 7877 ENXIO)); 7878 } 7879 /* Also fail if reserved boot area is in-use */ 7880 if (vdev_check_boot_reserve(spa, oldvd->vdev_child[i]) 7881 != 0) { 7882 return (spa_vdev_exit(spa, newrootvd, txg, 7883 EADDRINUSE)); 7884 } 7885 } 7886 } 7887 7888 if (raidz) { 7889 /* 7890 * Note: oldvdpath is freed by spa_strfree(), but 7891 * kmem_asprintf() is freed by kmem_strfree(), so we have to 7892 * move it to a spa_strdup-ed string. 
7893 */ 7894 char *tmp = kmem_asprintf("raidz%u-%u", 7895 (uint_t)vdev_get_nparity(oldvd), (uint_t)oldvd->vdev_id); 7896 oldvdpath = spa_strdup(tmp); 7897 kmem_strfree(tmp); 7898 } else { 7899 oldvdpath = spa_strdup(oldvd->vdev_path); 7900 } 7901 newvdpath = spa_strdup(newvd->vdev_path); 7902 7903 /* 7904 * If this is an in-place replacement, update oldvd's path and devid 7905 * to make it distinguishable from newvd, and unopenable from now on. 7906 */ 7907 if (strcmp(oldvdpath, newvdpath) == 0) { 7908 spa_strfree(oldvd->vdev_path); 7909 oldvd->vdev_path = kmem_alloc(strlen(newvdpath) + 5, 7910 KM_SLEEP); 7911 (void) sprintf(oldvd->vdev_path, "%s/old", 7912 newvdpath); 7913 if (oldvd->vdev_devid != NULL) { 7914 spa_strfree(oldvd->vdev_devid); 7915 oldvd->vdev_devid = NULL; 7916 } 7917 spa_strfree(oldvdpath); 7918 oldvdpath = spa_strdup(oldvd->vdev_path); 7919 } 7920 7921 /* 7922 * If the parent is not a mirror, or if we're replacing, insert the new 7923 * mirror/replacing/spare vdev above oldvd. 7924 */ 7925 if (!raidz && pvd->vdev_ops != pvops) { 7926 pvd = vdev_add_parent(oldvd, pvops); 7927 ASSERT(pvd->vdev_ops == pvops); 7928 ASSERT(oldvd->vdev_parent == pvd); 7929 } 7930 7931 ASSERT(pvd->vdev_top->vdev_parent == rvd); 7932 7933 /* 7934 * Extract the new device from its root and add it to pvd. 7935 */ 7936 vdev_remove_child(newrootvd, newvd); 7937 newvd->vdev_id = pvd->vdev_children; 7938 newvd->vdev_crtxg = oldvd->vdev_crtxg; 7939 vdev_add_child(pvd, newvd); 7940 7941 /* 7942 * Reevaluate the parent vdev state. 7943 */ 7944 vdev_propagate_state(pvd); 7945 7946 tvd = newvd->vdev_top; 7947 ASSERT(pvd->vdev_top == tvd); 7948 ASSERT(tvd->vdev_parent == rvd); 7949 7950 vdev_config_dirty(tvd); 7951 7952 /* 7953 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 7954 * for any dmu_sync-ed blocks. It will propagate upward when 7955 * spa_vdev_exit() calls vdev_dtl_reassess(). 7956 */ 7957 dtl_max_txg = txg + TXG_CONCURRENT_STATES; 7958 7959 if (raidz) { 7960 /* 7961 * Wait for the youngest allocations and frees to sync, 7962 * and then wait for the deferral of those frees to finish. 7963 */ 7964 spa_vdev_config_exit(spa, NULL, 7965 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 7966 7967 vdev_initialize_stop_all(tvd, VDEV_INITIALIZE_ACTIVE); 7968 vdev_trim_stop_all(tvd, VDEV_TRIM_ACTIVE); 7969 vdev_autotrim_stop_wait(tvd); 7970 7971 dtl_max_txg = spa_vdev_config_enter(spa); 7972 7973 tvd->vdev_rz_expanding = B_TRUE; 7974 7975 vdev_dirty_leaves(tvd, VDD_DTL, dtl_max_txg); 7976 vdev_config_dirty(tvd); 7977 7978 dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, 7979 dtl_max_txg); 7980 dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_raidz_attach_sync, 7981 newvd, tx); 7982 dmu_tx_commit(tx); 7983 } else { 7984 vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, 7985 dtl_max_txg - TXG_INITIAL); 7986 7987 if (newvd->vdev_isspare) { 7988 spa_spare_activate(newvd); 7989 spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); 7990 } 7991 7992 newvd_isspare = newvd->vdev_isspare; 7993 7994 /* 7995 * Mark newvd's DTL dirty in this txg. 7996 */ 7997 vdev_dirty(tvd, VDD_DTL, newvd, txg); 7998 7999 /* 8000 * Schedule the resilver or rebuild to restart in the future. 8001 * We do this to ensure that dmu_sync-ed blocks have been 8002 * stitched into the respective datasets. 
8003 		 */
8004 		if (rebuild) {
8005 			newvd->vdev_rebuild_txg = txg;
8006 
8007 			vdev_rebuild(tvd);
8008 		} else {
8009 			newvd->vdev_resilver_txg = txg;
8010 
8011 			if (dsl_scan_resilvering(spa_get_dsl(spa)) &&
8012 			    spa_feature_is_enabled(spa,
8013 			    SPA_FEATURE_RESILVER_DEFER)) {
8014 				vdev_defer_resilver(newvd);
8015 			} else {
8016 				dsl_scan_restart_resilver(spa->spa_dsl_pool,
8017 				    dtl_max_txg);
8018 			}
8019 		}
8020 	}
8021 
8022 	if (spa->spa_bootfs)
8023 		spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH);
8024 
8025 	spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH);
8026 
8027 	/*
8028 	 * Commit the config
8029 	 */
8030 	(void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
8031 
8032 	spa_history_log_internal(spa, "vdev attach", NULL,
8033 	    "%s vdev=%s %s vdev=%s",
8034 	    replacing && newvd_isspare ? "spare in" :
8035 	    replacing ? "replace" : "attach", newvdpath,
8036 	    replacing ? "for" : "to", oldvdpath);
8037 
8038 	spa_strfree(oldvdpath);
8039 	spa_strfree(newvdpath);
8040 
8041 	return (0);
8042 }
8043 
8044 /*
8045  * Detach a device from a mirror or replacing vdev.
8046  *
8047  * If 'replace_done' is specified, only detach if the parent
8048  * is a replacing or a spare vdev.
8049  */
8050 int
8051 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
8052 {
8053 	uint64_t txg;
8054 	int error;
8055 	vdev_t *rvd __maybe_unused = spa->spa_root_vdev;
8056 	vdev_t *vd, *pvd, *cvd, *tvd;
8057 	boolean_t unspare = B_FALSE;
8058 	uint64_t unspare_guid = 0;
8059 	char *vdpath;
8060 
8061 	ASSERT(spa_writeable(spa));
8062 
8063 	txg = spa_vdev_detach_enter(spa, guid);
8064 
8065 	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
8066 
8067 	/*
8068 	 * Besides being called directly from userland through the
8069 	 * ioctl interface, spa_vdev_detach() can potentially be called
8070 	 * at the end of spa_vdev_resilver_done().
8071 	 *
8072 	 * In the regular case, when we have a checkpoint this shouldn't
8073 	 * happen as we never empty the DTLs of a vdev during the scrub
8074 	 * [see comment in dsl_scan_done()]. Thus spa_vdev_resilver_done()
8075 	 * should never get here when we have a checkpoint.
8076 	 *
8077 	 * That said, even if the pool is checkpointed at the exact moment
8078 	 * spa_vdev_resilver_done() calls this function, everything should
8079 	 * be fine, as the resilver will return right away.
8080 	 */
8081 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
8082 	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
8083 		error = (spa_has_checkpoint(spa)) ?
8084 		    ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
8085 		return (spa_vdev_exit(spa, NULL, txg, error));
8086 	}
8087 
8088 	if (vd == NULL)
8089 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
8090 
8091 	if (!vd->vdev_ops->vdev_op_leaf)
8092 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
8093 
8094 	pvd = vd->vdev_parent;
8095 
8096 	/*
8097 	 * If the parent/child relationship is not as expected, don't do it.
8098 	 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
8099 	 * vdev that's replacing B with C. The user's intent in replacing
8100 	 * is to go from M(A,B) to M(A,C). If the user decides to cancel
8101 	 * the replace by detaching C, the expected behavior is to end up
8102 	 * M(A,B). But suppose that right after deciding to detach C,
8103 	 * the replacement of B completes. We would have M(A,C), and then
8104 	 * ask to detach C, which would leave us with just A -- not what
8105 	 * the user wanted.
To prevent this, we make sure that the 8106 * parent/child relationship hasn't changed -- in this example, 8107 * that C's parent is still the replacing vdev R. 8108 */ 8109 if (pvd->vdev_guid != pguid && pguid != 0) 8110 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 8111 8112 /* 8113 * Only 'replacing' or 'spare' vdevs can be replaced. 8114 */ 8115 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 8116 pvd->vdev_ops != &vdev_spare_ops) 8117 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 8118 8119 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 8120 spa_version(spa) >= SPA_VERSION_SPARES); 8121 8122 /* 8123 * Only mirror, replacing, and spare vdevs support detach. 8124 */ 8125 if (pvd->vdev_ops != &vdev_replacing_ops && 8126 pvd->vdev_ops != &vdev_mirror_ops && 8127 pvd->vdev_ops != &vdev_spare_ops) 8128 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 8129 8130 /* 8131 * If this device has the only valid copy of some data, 8132 * we cannot safely detach it. 8133 */ 8134 if (vdev_dtl_required(vd)) 8135 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 8136 8137 ASSERT(pvd->vdev_children >= 2); 8138 8139 /* 8140 * If we are detaching the second disk from a replacing vdev, then 8141 * check to see if we changed the original vdev's path to have "/old" 8142 * at the end in spa_vdev_attach(). If so, undo that change now. 8143 */ 8144 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 8145 vd->vdev_path != NULL) { 8146 size_t len = strlen(vd->vdev_path); 8147 8148 for (int c = 0; c < pvd->vdev_children; c++) { 8149 cvd = pvd->vdev_child[c]; 8150 8151 if (cvd == vd || cvd->vdev_path == NULL) 8152 continue; 8153 8154 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 8155 strcmp(cvd->vdev_path + len, "/old") == 0) { 8156 spa_strfree(cvd->vdev_path); 8157 cvd->vdev_path = spa_strdup(vd->vdev_path); 8158 break; 8159 } 8160 } 8161 } 8162 8163 /* 8164 * If we are detaching the original disk from a normal spare, then it 8165 * implies that the spare should become a real disk, and be removed 8166 * from the active spare list for the pool. dRAID spares on the 8167 * other hand are coupled to the pool and thus should never be removed 8168 * from the spares list. 8169 */ 8170 if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) { 8171 vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1]; 8172 8173 if (last_cvd->vdev_isspare && 8174 last_cvd->vdev_ops != &vdev_draid_spare_ops) { 8175 unspare = B_TRUE; 8176 } 8177 } 8178 8179 /* 8180 * Erase the disk labels so the disk can be used for other things. 8181 * This must be done after all other error cases are handled, 8182 * but before we disembowel vd (so we can still do I/O to it). 8183 * But if we can't do it, don't treat the error as fatal -- 8184 * it may be that the unwritability of the disk is the reason 8185 * it's being detached! 8186 */ 8187 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 8188 8189 /* 8190 * Remove vd from its parent and compact the parent's children. 8191 */ 8192 vdev_remove_child(pvd, vd); 8193 vdev_compact_children(pvd); 8194 8195 /* 8196 * Remember one of the remaining children so we can get tvd below. 8197 */ 8198 cvd = pvd->vdev_child[pvd->vdev_children - 1]; 8199 8200 /* 8201 * If we need to remove the remaining child from the list of hot spares, 8202 * do it now, marking the vdev as no longer a spare in the process. 8203 * We must do this before vdev_remove_parent(), because that can 8204 * change the GUID if it creates a new toplevel GUID. 
For a similar 8205 * reason, we must remove the spare now, in the same txg as the detach; 8206 * otherwise someone could attach a new sibling, change the GUID, and 8207 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 8208 */ 8209 if (unspare) { 8210 ASSERT(cvd->vdev_isspare); 8211 spa_spare_remove(cvd); 8212 unspare_guid = cvd->vdev_guid; 8213 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 8214 cvd->vdev_unspare = B_TRUE; 8215 } 8216 8217 /* 8218 * If the parent mirror/replacing vdev only has one child, 8219 * the parent is no longer needed. Remove it from the tree. 8220 */ 8221 if (pvd->vdev_children == 1) { 8222 if (pvd->vdev_ops == &vdev_spare_ops) 8223 cvd->vdev_unspare = B_FALSE; 8224 vdev_remove_parent(cvd); 8225 } 8226 8227 /* 8228 * We don't set tvd until now because the parent we just removed 8229 * may have been the previous top-level vdev. 8230 */ 8231 tvd = cvd->vdev_top; 8232 ASSERT(tvd->vdev_parent == rvd); 8233 8234 /* 8235 * Reevaluate the parent vdev state. 8236 */ 8237 vdev_propagate_state(cvd); 8238 8239 /* 8240 * If the 'autoexpand' property is set on the pool then automatically 8241 * try to expand the size of the pool. For example if the device we 8242 * just detached was smaller than the others, it may be possible to 8243 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 8244 * first so that we can obtain the updated sizes of the leaf vdevs. 8245 */ 8246 if (spa->spa_autoexpand) { 8247 vdev_reopen(tvd); 8248 vdev_expand(tvd, txg); 8249 } 8250 8251 vdev_config_dirty(tvd); 8252 8253 /* 8254 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 8255 * vd->vdev_detached is set and free vd's DTL object in syncing context. 8256 * But first make sure we're not on any *other* txg's DTL list, to 8257 * prevent vd from being accessed after it's freed. 8258 */ 8259 vdpath = spa_strdup(vd->vdev_path ? vd->vdev_path : "none"); 8260 for (int t = 0; t < TXG_SIZE; t++) 8261 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 8262 vd->vdev_detached = B_TRUE; 8263 vdev_dirty(tvd, VDD_DTL, vd, txg); 8264 8265 spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE); 8266 spa_notify_waiters(spa); 8267 8268 /* hang on to the spa before we release the lock */ 8269 spa_open_ref(spa, FTAG); 8270 8271 error = spa_vdev_exit(spa, vd, txg, 0); 8272 8273 spa_history_log_internal(spa, "detach", NULL, 8274 "vdev=%s", vdpath); 8275 spa_strfree(vdpath); 8276 8277 /* 8278 * If this was the removal of the original device in a hot spare vdev, 8279 * then we want to go through and remove the device from the hot spare 8280 * list of every other pool. 
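 * (Hot spares may be shared between pools, which is why the loop below
 * walks every other active pool via spa_next() and removes the spare
 * from each of them as well.)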
8281 */ 8282 if (unspare) { 8283 spa_t *altspa = NULL; 8284 8285 mutex_enter(&spa_namespace_lock); 8286 while ((altspa = spa_next(altspa)) != NULL) { 8287 if (altspa->spa_state != POOL_STATE_ACTIVE || 8288 altspa == spa) 8289 continue; 8290 8291 spa_open_ref(altspa, FTAG); 8292 mutex_exit(&spa_namespace_lock); 8293 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 8294 mutex_enter(&spa_namespace_lock); 8295 spa_close(altspa, FTAG); 8296 } 8297 mutex_exit(&spa_namespace_lock); 8298 8299 /* search the rest of the vdevs for spares to remove */ 8300 spa_vdev_resilver_done(spa); 8301 } 8302 8303 /* all done with the spa; OK to release */ 8304 mutex_enter(&spa_namespace_lock); 8305 spa_close(spa, FTAG); 8306 mutex_exit(&spa_namespace_lock); 8307 8308 return (error); 8309 } 8310 8311 static int 8312 spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, 8313 list_t *vd_list) 8314 { 8315 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 8316 8317 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 8318 8319 /* Look up vdev and ensure it's a leaf. */ 8320 vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); 8321 if (vd == NULL || vd->vdev_detached) { 8322 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8323 return (SET_ERROR(ENODEV)); 8324 } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { 8325 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8326 return (SET_ERROR(EINVAL)); 8327 } else if (!vdev_writeable(vd)) { 8328 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8329 return (SET_ERROR(EROFS)); 8330 } 8331 mutex_enter(&vd->vdev_initialize_lock); 8332 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8333 8334 /* 8335 * When we activate an initialize action we check to see 8336 * if the vdev_initialize_thread is NULL. We do this instead 8337 * of using the vdev_initialize_state since there might be 8338 * a previous initialization process which has completed but 8339 * the thread is not exited. 
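 * (The checks below also refuse to start initializing while the
 * top-level vdev is being removed or is undergoing raidz expansion.)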
8340 */ 8341 if (cmd_type == POOL_INITIALIZE_START && 8342 (vd->vdev_initialize_thread != NULL || 8343 vd->vdev_top->vdev_removing || vd->vdev_top->vdev_rz_expanding)) { 8344 mutex_exit(&vd->vdev_initialize_lock); 8345 return (SET_ERROR(EBUSY)); 8346 } else if (cmd_type == POOL_INITIALIZE_CANCEL && 8347 (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE && 8348 vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) { 8349 mutex_exit(&vd->vdev_initialize_lock); 8350 return (SET_ERROR(ESRCH)); 8351 } else if (cmd_type == POOL_INITIALIZE_SUSPEND && 8352 vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) { 8353 mutex_exit(&vd->vdev_initialize_lock); 8354 return (SET_ERROR(ESRCH)); 8355 } else if (cmd_type == POOL_INITIALIZE_UNINIT && 8356 vd->vdev_initialize_thread != NULL) { 8357 mutex_exit(&vd->vdev_initialize_lock); 8358 return (SET_ERROR(EBUSY)); 8359 } 8360 8361 switch (cmd_type) { 8362 case POOL_INITIALIZE_START: 8363 vdev_initialize(vd); 8364 break; 8365 case POOL_INITIALIZE_CANCEL: 8366 vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, vd_list); 8367 break; 8368 case POOL_INITIALIZE_SUSPEND: 8369 vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list); 8370 break; 8371 case POOL_INITIALIZE_UNINIT: 8372 vdev_uninitialize(vd); 8373 break; 8374 default: 8375 panic("invalid cmd_type %llu", (unsigned long long)cmd_type); 8376 } 8377 mutex_exit(&vd->vdev_initialize_lock); 8378 8379 return (0); 8380 } 8381 8382 int 8383 spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, 8384 nvlist_t *vdev_errlist) 8385 { 8386 int total_errors = 0; 8387 list_t vd_list; 8388 8389 list_create(&vd_list, sizeof (vdev_t), 8390 offsetof(vdev_t, vdev_initialize_node)); 8391 8392 /* 8393 * We hold the namespace lock through the whole function 8394 * to prevent any changes to the pool while we're starting or 8395 * stopping initialization. The config and state locks are held so that 8396 * we can properly assess the vdev state before we commit to 8397 * the initializing operation. 8398 */ 8399 mutex_enter(&spa_namespace_lock); 8400 8401 for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); 8402 pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { 8403 uint64_t vdev_guid = fnvpair_value_uint64(pair); 8404 8405 int error = spa_vdev_initialize_impl(spa, vdev_guid, cmd_type, 8406 &vd_list); 8407 if (error != 0) { 8408 char guid_as_str[MAXNAMELEN]; 8409 8410 (void) snprintf(guid_as_str, sizeof (guid_as_str), 8411 "%llu", (unsigned long long)vdev_guid); 8412 fnvlist_add_int64(vdev_errlist, guid_as_str, error); 8413 total_errors++; 8414 } 8415 } 8416 8417 /* Wait for all initialize threads to stop. */ 8418 vdev_initialize_stop_wait(spa, &vd_list); 8419 8420 /* Sync out the initializing state */ 8421 txg_wait_synced(spa->spa_dsl_pool, 0); 8422 mutex_exit(&spa_namespace_lock); 8423 8424 list_destroy(&vd_list); 8425 8426 return (total_errors); 8427 } 8428 8429 static int 8430 spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, 8431 uint64_t rate, boolean_t partial, boolean_t secure, list_t *vd_list) 8432 { 8433 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 8434 8435 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 8436 8437 /* Look up vdev and ensure it's a leaf. 
*/
8438 	vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
8439 	if (vd == NULL || vd->vdev_detached) {
8440 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
8441 		return (SET_ERROR(ENODEV));
8442 	} else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
8443 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
8444 		return (SET_ERROR(EINVAL));
8445 	} else if (!vdev_writeable(vd)) {
8446 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
8447 		return (SET_ERROR(EROFS));
8448 	} else if (!vd->vdev_has_trim) {
8449 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
8450 		return (SET_ERROR(EOPNOTSUPP));
8451 	} else if (secure && !vd->vdev_has_securetrim) {
8452 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
8453 		return (SET_ERROR(EOPNOTSUPP));
8454 	}
8455 	mutex_enter(&vd->vdev_trim_lock);
8456 	spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
8457 
8458 	/*
8459 	 * When we activate a TRIM action we check to see if the
8460 	 * vdev_trim_thread is NULL. We do this instead of using the
8461 	 * vdev_trim_state since there might be a previous TRIM process
8462 	 * which has completed but the thread is not exited.
8463 	 */
8464 	if (cmd_type == POOL_TRIM_START &&
8465 	    (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing ||
8466 	    vd->vdev_top->vdev_rz_expanding)) {
8467 		mutex_exit(&vd->vdev_trim_lock);
8468 		return (SET_ERROR(EBUSY));
8469 	} else if (cmd_type == POOL_TRIM_CANCEL &&
8470 	    (vd->vdev_trim_state != VDEV_TRIM_ACTIVE &&
8471 	    vd->vdev_trim_state != VDEV_TRIM_SUSPENDED)) {
8472 		mutex_exit(&vd->vdev_trim_lock);
8473 		return (SET_ERROR(ESRCH));
8474 	} else if (cmd_type == POOL_TRIM_SUSPEND &&
8475 	    vd->vdev_trim_state != VDEV_TRIM_ACTIVE) {
8476 		mutex_exit(&vd->vdev_trim_lock);
8477 		return (SET_ERROR(ESRCH));
8478 	}
8479 
8480 	switch (cmd_type) {
8481 	case POOL_TRIM_START:
8482 		vdev_trim(vd, rate, partial, secure);
8483 		break;
8484 	case POOL_TRIM_CANCEL:
8485 		vdev_trim_stop(vd, VDEV_TRIM_CANCELED, vd_list);
8486 		break;
8487 	case POOL_TRIM_SUSPEND:
8488 		vdev_trim_stop(vd, VDEV_TRIM_SUSPENDED, vd_list);
8489 		break;
8490 	default:
8491 		panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
8492 	}
8493 	mutex_exit(&vd->vdev_trim_lock);
8494 
8495 	return (0);
8496 }
8497 
8498 /*
8499  * Initiates a manual TRIM for the requested vdevs. This kicks off individual
8500  * TRIM threads for each child vdev. These threads pass over all of the free
8501  * space in the vdev's metaslabs and issue TRIM commands for that space.
8502  */
8503 int
8504 spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate,
8505     boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist)
8506 {
8507 	int total_errors = 0;
8508 	list_t vd_list;
8509 
8510 	list_create(&vd_list, sizeof (vdev_t),
8511 	    offsetof(vdev_t, vdev_trim_node));
8512 
8513 	/*
8514 	 * We hold the namespace lock through the whole function
8515 	 * to prevent any changes to the pool while we're starting or
8516 	 * stopping TRIM. The config and state locks are held so that
8517 	 * we can properly assess the vdev state before we commit to
8518 	 * the TRIM operation.
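 * Per-vdev failures are reported through vdev_errlist, keyed by vdev
 * guid, and the return value is the number of vdevs that failed.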
8519 */ 8520 mutex_enter(&spa_namespace_lock); 8521 8522 for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); 8523 pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { 8524 uint64_t vdev_guid = fnvpair_value_uint64(pair); 8525 8526 int error = spa_vdev_trim_impl(spa, vdev_guid, cmd_type, 8527 rate, partial, secure, &vd_list); 8528 if (error != 0) { 8529 char guid_as_str[MAXNAMELEN]; 8530 8531 (void) snprintf(guid_as_str, sizeof (guid_as_str), 8532 "%llu", (unsigned long long)vdev_guid); 8533 fnvlist_add_int64(vdev_errlist, guid_as_str, error); 8534 total_errors++; 8535 } 8536 } 8537 8538 /* Wait for all TRIM threads to stop. */ 8539 vdev_trim_stop_wait(spa, &vd_list); 8540 8541 /* Sync out the TRIM state */ 8542 txg_wait_synced(spa->spa_dsl_pool, 0); 8543 mutex_exit(&spa_namespace_lock); 8544 8545 list_destroy(&vd_list); 8546 8547 return (total_errors); 8548 } 8549 8550 /* 8551 * Split a set of devices from their mirrors, and create a new pool from them. 8552 */ 8553 int 8554 spa_vdev_split_mirror(spa_t *spa, const char *newname, nvlist_t *config, 8555 nvlist_t *props, boolean_t exp) 8556 { 8557 int error = 0; 8558 uint64_t txg, *glist; 8559 spa_t *newspa; 8560 uint_t c, children, lastlog; 8561 nvlist_t **child, *nvl, *tmp; 8562 dmu_tx_t *tx; 8563 const char *altroot = NULL; 8564 vdev_t *rvd, **vml = NULL; /* vdev modify list */ 8565 boolean_t activate_slog; 8566 8567 ASSERT(spa_writeable(spa)); 8568 8569 txg = spa_vdev_enter(spa); 8570 8571 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 8572 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 8573 error = (spa_has_checkpoint(spa)) ? 8574 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 8575 return (spa_vdev_exit(spa, NULL, txg, error)); 8576 } 8577 8578 /* clear the log and flush everything up to now */ 8579 activate_slog = spa_passivate_log(spa); 8580 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 8581 error = spa_reset_logs(spa); 8582 txg = spa_vdev_config_enter(spa); 8583 8584 if (activate_slog) 8585 spa_activate_log(spa); 8586 8587 if (error != 0) 8588 return (spa_vdev_exit(spa, NULL, txg, error)); 8589 8590 /* check new spa name before going any further */ 8591 if (spa_lookup(newname) != NULL) 8592 return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 8593 8594 /* 8595 * scan through all the children to ensure they're all mirrors 8596 */ 8597 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 8598 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 8599 &children) != 0) 8600 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 8601 8602 /* first, check to ensure we've got the right child count */ 8603 rvd = spa->spa_root_vdev; 8604 lastlog = 0; 8605 for (c = 0; c < rvd->vdev_children; c++) { 8606 vdev_t *vd = rvd->vdev_child[c]; 8607 8608 /* don't count the holes & logs as children */ 8609 if (vd->vdev_islog || (vd->vdev_ops != &vdev_indirect_ops && 8610 !vdev_is_concrete(vd))) { 8611 if (lastlog == 0) 8612 lastlog = c; 8613 continue; 8614 } 8615 8616 lastlog = 0; 8617 } 8618 if (children != (lastlog != 0 ? 
lastlog : rvd->vdev_children)) 8619 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 8620 8621 /* next, ensure no spare or cache devices are part of the split */ 8622 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 8623 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 8624 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 8625 8626 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 8627 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 8628 8629 /* then, loop over each vdev and validate it */ 8630 for (c = 0; c < children; c++) { 8631 uint64_t is_hole = 0; 8632 8633 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 8634 &is_hole); 8635 8636 if (is_hole != 0) { 8637 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 8638 spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 8639 continue; 8640 } else { 8641 error = SET_ERROR(EINVAL); 8642 break; 8643 } 8644 } 8645 8646 /* deal with indirect vdevs */ 8647 if (spa->spa_root_vdev->vdev_child[c]->vdev_ops == 8648 &vdev_indirect_ops) 8649 continue; 8650 8651 /* which disk is going to be split? */ 8652 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 8653 &glist[c]) != 0) { 8654 error = SET_ERROR(EINVAL); 8655 break; 8656 } 8657 8658 /* look it up in the spa */ 8659 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 8660 if (vml[c] == NULL) { 8661 error = SET_ERROR(ENODEV); 8662 break; 8663 } 8664 8665 /* make sure there's nothing stopping the split */ 8666 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 8667 vml[c]->vdev_islog || 8668 !vdev_is_concrete(vml[c]) || 8669 vml[c]->vdev_isspare || 8670 vml[c]->vdev_isl2cache || 8671 !vdev_writeable(vml[c]) || 8672 vml[c]->vdev_children != 0 || 8673 vml[c]->vdev_state != VDEV_STATE_HEALTHY || 8674 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 8675 error = SET_ERROR(EINVAL); 8676 break; 8677 } 8678 8679 if (vdev_dtl_required(vml[c]) || 8680 vdev_resilver_needed(vml[c], NULL, NULL)) { 8681 error = SET_ERROR(EBUSY); 8682 break; 8683 } 8684 8685 /* we need certain info from the top level */ 8686 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 8687 vml[c]->vdev_top->vdev_ms_array); 8688 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 8689 vml[c]->vdev_top->vdev_ms_shift); 8690 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 8691 vml[c]->vdev_top->vdev_asize); 8692 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 8693 vml[c]->vdev_top->vdev_ashift); 8694 8695 /* transfer per-vdev ZAPs */ 8696 ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); 8697 VERIFY0(nvlist_add_uint64(child[c], 8698 ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); 8699 8700 ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); 8701 VERIFY0(nvlist_add_uint64(child[c], 8702 ZPOOL_CONFIG_VDEV_TOP_ZAP, 8703 vml[c]->vdev_parent->vdev_top_zap)); 8704 } 8705 8706 if (error != 0) { 8707 kmem_free(vml, children * sizeof (vdev_t *)); 8708 kmem_free(glist, children * sizeof (uint64_t)); 8709 return (spa_vdev_exit(spa, NULL, txg, error)); 8710 } 8711 8712 /* stop writers from using the disks */ 8713 for (c = 0; c < children; c++) { 8714 if (vml[c] != NULL) 8715 vml[c]->vdev_offline = B_TRUE; 8716 } 8717 vdev_reopen(spa->spa_root_vdev); 8718 8719 /* 8720 * Temporarily record the splitting vdevs in the spa config. This 8721 * will disappear once the config is regenerated. 
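 * (The guid list is stored as ZPOOL_CONFIG_SPLIT_LIST inside the
 * ZPOOL_CONFIG_SPLIT nvlist, so a partially completed split can still
 * be recognized if we fail before it finishes.)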
8722 */ 8723 nvl = fnvlist_alloc(); 8724 fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, glist, children); 8725 kmem_free(glist, children * sizeof (uint64_t)); 8726 8727 mutex_enter(&spa->spa_props_lock); 8728 fnvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, nvl); 8729 mutex_exit(&spa->spa_props_lock); 8730 spa->spa_config_splitting = nvl; 8731 vdev_config_dirty(spa->spa_root_vdev); 8732 8733 /* configure and create the new pool */ 8734 fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname); 8735 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 8736 exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE); 8737 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)); 8738 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg); 8739 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 8740 spa_generate_guid(NULL)); 8741 VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 8742 (void) nvlist_lookup_string(props, 8743 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 8744 8745 /* add the new pool to the namespace */ 8746 newspa = spa_add(newname, config, altroot); 8747 newspa->spa_avz_action = AVZ_ACTION_REBUILD; 8748 newspa->spa_config_txg = spa->spa_config_txg; 8749 spa_set_log_state(newspa, SPA_LOG_CLEAR); 8750 8751 /* release the spa config lock, retaining the namespace lock */ 8752 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 8753 8754 if (zio_injection_enabled) 8755 zio_handle_panic_injection(spa, FTAG, 1); 8756 8757 spa_activate(newspa, spa_mode_global); 8758 spa_async_suspend(newspa); 8759 8760 /* 8761 * Temporarily stop the initializing and TRIM activity. We set the 8762 * state to ACTIVE so that we know to resume initializing or TRIM 8763 * once the split has completed. 8764 */ 8765 list_t vd_initialize_list; 8766 list_create(&vd_initialize_list, sizeof (vdev_t), 8767 offsetof(vdev_t, vdev_initialize_node)); 8768 8769 list_t vd_trim_list; 8770 list_create(&vd_trim_list, sizeof (vdev_t), 8771 offsetof(vdev_t, vdev_trim_node)); 8772 8773 for (c = 0; c < children; c++) { 8774 if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { 8775 mutex_enter(&vml[c]->vdev_initialize_lock); 8776 vdev_initialize_stop(vml[c], 8777 VDEV_INITIALIZE_ACTIVE, &vd_initialize_list); 8778 mutex_exit(&vml[c]->vdev_initialize_lock); 8779 8780 mutex_enter(&vml[c]->vdev_trim_lock); 8781 vdev_trim_stop(vml[c], VDEV_TRIM_ACTIVE, &vd_trim_list); 8782 mutex_exit(&vml[c]->vdev_trim_lock); 8783 } 8784 } 8785 8786 vdev_initialize_stop_wait(spa, &vd_initialize_list); 8787 vdev_trim_stop_wait(spa, &vd_trim_list); 8788 8789 list_destroy(&vd_initialize_list); 8790 list_destroy(&vd_trim_list); 8791 8792 newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT; 8793 newspa->spa_is_splitting = B_TRUE; 8794 8795 /* create the new pool from the disks of the original pool */ 8796 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE); 8797 if (error) 8798 goto out; 8799 8800 /* if that worked, generate a real config for the new pool */ 8801 if (newspa->spa_root_vdev != NULL) { 8802 newspa->spa_config_splitting = fnvlist_alloc(); 8803 fnvlist_add_uint64(newspa->spa_config_splitting, 8804 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)); 8805 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 8806 B_TRUE)); 8807 } 8808 8809 /* set the props */ 8810 if (props != NULL) { 8811 spa_configfile_set(newspa, props, B_FALSE); 8812 error = spa_prop_set(newspa, props); 8813 if (error) 8814 goto out; 8815 } 8816 8817 /* flush everything */ 8818 txg = 
spa_vdev_config_enter(newspa); 8819 vdev_config_dirty(newspa->spa_root_vdev); 8820 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 8821 8822 if (zio_injection_enabled) 8823 zio_handle_panic_injection(spa, FTAG, 2); 8824 8825 spa_async_resume(newspa); 8826 8827 /* finally, update the original pool's config */ 8828 txg = spa_vdev_config_enter(spa); 8829 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 8830 error = dmu_tx_assign(tx, DMU_TX_WAIT); 8831 if (error != 0) 8832 dmu_tx_abort(tx); 8833 for (c = 0; c < children; c++) { 8834 if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { 8835 vdev_t *tvd = vml[c]->vdev_top; 8836 8837 /* 8838 * Need to be sure the detachable VDEV is not 8839 * on any *other* txg's DTL list to prevent it 8840 * from being accessed after it's freed. 8841 */ 8842 for (int t = 0; t < TXG_SIZE; t++) { 8843 (void) txg_list_remove_this( 8844 &tvd->vdev_dtl_list, vml[c], t); 8845 } 8846 8847 vdev_split(vml[c]); 8848 if (error == 0) 8849 spa_history_log_internal(spa, "detach", tx, 8850 "vdev=%s", vml[c]->vdev_path); 8851 8852 vdev_free(vml[c]); 8853 } 8854 } 8855 spa->spa_avz_action = AVZ_ACTION_REBUILD; 8856 vdev_config_dirty(spa->spa_root_vdev); 8857 spa->spa_config_splitting = NULL; 8858 nvlist_free(nvl); 8859 if (error == 0) 8860 dmu_tx_commit(tx); 8861 (void) spa_vdev_exit(spa, NULL, txg, 0); 8862 8863 if (zio_injection_enabled) 8864 zio_handle_panic_injection(spa, FTAG, 3); 8865 8866 /* split is complete; log a history record */ 8867 spa_history_log_internal(newspa, "split", NULL, 8868 "from pool %s", spa_name(spa)); 8869 8870 newspa->spa_is_splitting = B_FALSE; 8871 kmem_free(vml, children * sizeof (vdev_t *)); 8872 8873 /* if we're not going to mount the filesystems in userland, export */ 8874 if (exp) 8875 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 8876 B_FALSE, B_FALSE); 8877 8878 return (error); 8879 8880 out: 8881 spa_unload(newspa); 8882 spa_deactivate(newspa); 8883 spa_remove(newspa); 8884 8885 txg = spa_vdev_config_enter(spa); 8886 8887 /* re-online all offlined disks */ 8888 for (c = 0; c < children; c++) { 8889 if (vml[c] != NULL) 8890 vml[c]->vdev_offline = B_FALSE; 8891 } 8892 8893 /* restart initializing or trimming disks as necessary */ 8894 spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); 8895 spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); 8896 spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); 8897 8898 vdev_reopen(spa->spa_root_vdev); 8899 8900 nvlist_free(spa->spa_config_splitting); 8901 spa->spa_config_splitting = NULL; 8902 (void) spa_vdev_exit(spa, NULL, txg, error); 8903 8904 kmem_free(vml, children * sizeof (vdev_t *)); 8905 return (error); 8906 } 8907 8908 /* 8909 * Find any device that's done replacing, or a vdev marked 'unspare' that's 8910 * currently spared, so we can detach it. 8911 */ 8912 static vdev_t * 8913 spa_vdev_resilver_done_hunt(vdev_t *vd) 8914 { 8915 vdev_t *newvd, *oldvd; 8916 8917 for (int c = 0; c < vd->vdev_children; c++) { 8918 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 8919 if (oldvd != NULL) 8920 return (oldvd); 8921 } 8922 8923 /* 8924 * Check for a completed replacement. We always consider the first 8925 * vdev in the list to be the oldest vdev, and the last one to be 8926 * the newest (see spa_vdev_attach() for how that works). In 8927 * the case where the newest vdev is faulted, we will not automatically 8928 * remove it after a resilver completes. This is OK as it will require 8929 * user intervention to determine which disk the admin wishes to keep. 
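 * (Concretely, the checks below treat a replacement as complete once the
 * newest child's DTL_MISSING and DTL_OUTAGE are empty and the oldest
 * child is no longer required to satisfy any DTL.)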
8930 */ 8931 if (vd->vdev_ops == &vdev_replacing_ops) { 8932 ASSERT(vd->vdev_children > 1); 8933 8934 newvd = vd->vdev_child[vd->vdev_children - 1]; 8935 oldvd = vd->vdev_child[0]; 8936 8937 if (vdev_dtl_empty(newvd, DTL_MISSING) && 8938 vdev_dtl_empty(newvd, DTL_OUTAGE) && 8939 !vdev_dtl_required(oldvd)) 8940 return (oldvd); 8941 } 8942 8943 /* 8944 * Check for a completed resilver with the 'unspare' flag set. 8945 * Also potentially update faulted state. 8946 */ 8947 if (vd->vdev_ops == &vdev_spare_ops) { 8948 vdev_t *first = vd->vdev_child[0]; 8949 vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 8950 8951 if (last->vdev_unspare) { 8952 oldvd = first; 8953 newvd = last; 8954 } else if (first->vdev_unspare) { 8955 oldvd = last; 8956 newvd = first; 8957 } else { 8958 oldvd = NULL; 8959 } 8960 8961 if (oldvd != NULL && 8962 vdev_dtl_empty(newvd, DTL_MISSING) && 8963 vdev_dtl_empty(newvd, DTL_OUTAGE) && 8964 !vdev_dtl_required(oldvd)) 8965 return (oldvd); 8966 8967 vdev_propagate_state(vd); 8968 8969 /* 8970 * If there are more than two spares attached to a disk, 8971 * and those spares are not required, then we want to 8972 * attempt to free them up now so that they can be used 8973 * by other pools. Once we're back down to a single 8974 * disk+spare, we stop removing them. 8975 */ 8976 if (vd->vdev_children > 2) { 8977 newvd = vd->vdev_child[1]; 8978 8979 if (newvd->vdev_isspare && last->vdev_isspare && 8980 vdev_dtl_empty(last, DTL_MISSING) && 8981 vdev_dtl_empty(last, DTL_OUTAGE) && 8982 !vdev_dtl_required(newvd)) 8983 return (newvd); 8984 } 8985 } 8986 8987 return (NULL); 8988 } 8989 8990 static void 8991 spa_vdev_resilver_done(spa_t *spa) 8992 { 8993 vdev_t *vd, *pvd, *ppvd; 8994 uint64_t guid, sguid, pguid, ppguid; 8995 8996 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 8997 8998 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 8999 pvd = vd->vdev_parent; 9000 ppvd = pvd->vdev_parent; 9001 guid = vd->vdev_guid; 9002 pguid = pvd->vdev_guid; 9003 ppguid = ppvd->vdev_guid; 9004 sguid = 0; 9005 /* 9006 * If we have just finished replacing a hot spared device, then 9007 * we need to detach the parent's first child (the original hot 9008 * spare) as well. 9009 */ 9010 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 9011 ppvd->vdev_children == 2) { 9012 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 9013 sguid = ppvd->vdev_child[1]->vdev_guid; 9014 } 9015 ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); 9016 9017 spa_config_exit(spa, SCL_ALL, FTAG); 9018 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 9019 return; 9020 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 9021 return; 9022 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 9023 } 9024 9025 spa_config_exit(spa, SCL_ALL, FTAG); 9026 9027 /* 9028 * If a detach was not performed above replace waiters will not have 9029 * been notified. In which case we must do so now. 9030 */ 9031 spa_notify_waiters(spa); 9032 } 9033 9034 /* 9035 * Update the stored path or FRU for this vdev. 
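 * The change is only synced out (by handing the vdev back to
 * spa_vdev_state_exit()) when the new value differs from the stored one.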
9036 */ 9037 static int 9038 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 9039 boolean_t ispath) 9040 { 9041 vdev_t *vd; 9042 boolean_t sync = B_FALSE; 9043 9044 ASSERT(spa_writeable(spa)); 9045 9046 spa_vdev_state_enter(spa, SCL_ALL); 9047 9048 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 9049 return (spa_vdev_state_exit(spa, NULL, ENOENT)); 9050 9051 if (!vd->vdev_ops->vdev_op_leaf) 9052 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 9053 9054 if (ispath) { 9055 if (strcmp(value, vd->vdev_path) != 0) { 9056 spa_strfree(vd->vdev_path); 9057 vd->vdev_path = spa_strdup(value); 9058 sync = B_TRUE; 9059 } 9060 } else { 9061 if (vd->vdev_fru == NULL) { 9062 vd->vdev_fru = spa_strdup(value); 9063 sync = B_TRUE; 9064 } else if (strcmp(value, vd->vdev_fru) != 0) { 9065 spa_strfree(vd->vdev_fru); 9066 vd->vdev_fru = spa_strdup(value); 9067 sync = B_TRUE; 9068 } 9069 } 9070 9071 return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); 9072 } 9073 9074 int 9075 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 9076 { 9077 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 9078 } 9079 9080 int 9081 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 9082 { 9083 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 9084 } 9085 9086 /* 9087 * ========================================================================== 9088 * SPA Scanning 9089 * ========================================================================== 9090 */ 9091 int 9092 spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd) 9093 { 9094 ASSERT0(spa_config_held(spa, SCL_ALL, RW_WRITER)); 9095 9096 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 9097 return (SET_ERROR(EBUSY)); 9098 9099 return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd)); 9100 } 9101 9102 int 9103 spa_scan_stop(spa_t *spa) 9104 { 9105 ASSERT0(spa_config_held(spa, SCL_ALL, RW_WRITER)); 9106 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 9107 return (SET_ERROR(EBUSY)); 9108 9109 return (dsl_scan_cancel(spa->spa_dsl_pool)); 9110 } 9111 9112 int 9113 spa_scan(spa_t *spa, pool_scan_func_t func) 9114 { 9115 return (spa_scan_range(spa, func, 0, 0)); 9116 } 9117 9118 int 9119 spa_scan_range(spa_t *spa, pool_scan_func_t func, uint64_t txgstart, 9120 uint64_t txgend) 9121 { 9122 ASSERT0(spa_config_held(spa, SCL_ALL, RW_WRITER)); 9123 9124 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 9125 return (SET_ERROR(ENOTSUP)); 9126 9127 if (func == POOL_SCAN_RESILVER && 9128 !spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) 9129 return (SET_ERROR(ENOTSUP)); 9130 9131 if (func != POOL_SCAN_SCRUB && (txgstart != 0 || txgend != 0)) 9132 return (SET_ERROR(ENOTSUP)); 9133 9134 /* 9135 * If a resilver was requested, but there is no DTL on a 9136 * writeable leaf device, we have nothing to do. 
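 * (We still request SPA_ASYNC_RESILVER_DONE so that any completed
 * replacements are detached and waiters are notified.)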
9137 */ 9138 if (func == POOL_SCAN_RESILVER && 9139 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 9140 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 9141 return (0); 9142 } 9143 9144 if (func == POOL_SCAN_ERRORSCRUB && 9145 !spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) 9146 return (SET_ERROR(ENOTSUP)); 9147 9148 return (dsl_scan(spa->spa_dsl_pool, func, txgstart, txgend)); 9149 } 9150 9151 /* 9152 * ========================================================================== 9153 * SPA async task processing 9154 * ========================================================================== 9155 */ 9156 9157 static void 9158 spa_async_remove(spa_t *spa, vdev_t *vd, boolean_t by_kernel) 9159 { 9160 if (vd->vdev_remove_wanted) { 9161 vd->vdev_remove_wanted = B_FALSE; 9162 vd->vdev_delayed_close = B_FALSE; 9163 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 9164 9165 /* 9166 * We want to clear the stats, but we don't want to do a full 9167 * vdev_clear() as that will cause us to throw away 9168 * degraded/faulted state as well as attempt to reopen the 9169 * device, all of which is a waste. 9170 */ 9171 vd->vdev_stat.vs_read_errors = 0; 9172 vd->vdev_stat.vs_write_errors = 0; 9173 vd->vdev_stat.vs_checksum_errors = 0; 9174 9175 vdev_state_dirty(vd->vdev_top); 9176 9177 /* Tell userspace that the vdev is gone. */ 9178 zfs_post_remove(spa, vd, by_kernel); 9179 } 9180 9181 for (int c = 0; c < vd->vdev_children; c++) 9182 spa_async_remove(spa, vd->vdev_child[c], by_kernel); 9183 } 9184 9185 static void 9186 spa_async_fault_vdev(vdev_t *vd, boolean_t *suspend) 9187 { 9188 if (vd->vdev_fault_wanted) { 9189 vdev_state_t newstate = VDEV_STATE_FAULTED; 9190 vd->vdev_fault_wanted = B_FALSE; 9191 9192 /* 9193 * If this device has the only valid copy of the data, then 9194 * back off and simply mark the vdev as degraded instead. 9195 */ 9196 if (!vd->vdev_top->vdev_islog && vd->vdev_aux == NULL && 9197 vdev_dtl_required(vd)) { 9198 newstate = VDEV_STATE_DEGRADED; 9199 /* A required disk is missing so suspend the pool */ 9200 *suspend = B_TRUE; 9201 } 9202 vdev_set_state(vd, B_TRUE, newstate, VDEV_AUX_ERR_EXCEEDED); 9203 } 9204 for (int c = 0; c < vd->vdev_children; c++) 9205 spa_async_fault_vdev(vd->vdev_child[c], suspend); 9206 } 9207 9208 static void 9209 spa_async_autoexpand(spa_t *spa, vdev_t *vd) 9210 { 9211 if (!spa->spa_autoexpand) 9212 return; 9213 9214 for (int c = 0; c < vd->vdev_children; c++) { 9215 vdev_t *cvd = vd->vdev_child[c]; 9216 spa_async_autoexpand(spa, cvd); 9217 } 9218 9219 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 9220 return; 9221 9222 spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_AUTOEXPAND); 9223 } 9224 9225 static __attribute__((noreturn)) void 9226 spa_async_thread(void *arg) 9227 { 9228 spa_t *spa = (spa_t *)arg; 9229 dsl_pool_t *dp = spa->spa_dsl_pool; 9230 int tasks; 9231 9232 ASSERT(spa->spa_sync_on); 9233 9234 mutex_enter(&spa->spa_async_lock); 9235 tasks = spa->spa_async_tasks; 9236 spa->spa_async_tasks = 0; 9237 mutex_exit(&spa->spa_async_lock); 9238 9239 /* 9240 * See if the config needs to be updated. 
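 * If the update grows the pool (for example after a vdev expansion),
 * the size change is logged as an internal history event below.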
9241 */ 9242 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 9243 uint64_t old_space, new_space; 9244 9245 mutex_enter(&spa_namespace_lock); 9246 old_space = metaslab_class_get_space(spa_normal_class(spa)); 9247 old_space += metaslab_class_get_space(spa_special_class(spa)); 9248 old_space += metaslab_class_get_space(spa_dedup_class(spa)); 9249 old_space += metaslab_class_get_space( 9250 spa_embedded_log_class(spa)); 9251 old_space += metaslab_class_get_space( 9252 spa_special_embedded_log_class(spa)); 9253 9254 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 9255 9256 new_space = metaslab_class_get_space(spa_normal_class(spa)); 9257 new_space += metaslab_class_get_space(spa_special_class(spa)); 9258 new_space += metaslab_class_get_space(spa_dedup_class(spa)); 9259 new_space += metaslab_class_get_space( 9260 spa_embedded_log_class(spa)); 9261 new_space += metaslab_class_get_space( 9262 spa_special_embedded_log_class(spa)); 9263 mutex_exit(&spa_namespace_lock); 9264 9265 /* 9266 * If the pool grew as a result of the config update, 9267 * then log an internal history event. 9268 */ 9269 if (new_space != old_space) { 9270 spa_history_log_internal(spa, "vdev online", NULL, 9271 "pool '%s' size: %llu(+%llu)", 9272 spa_name(spa), (u_longlong_t)new_space, 9273 (u_longlong_t)(new_space - old_space)); 9274 } 9275 } 9276 9277 /* 9278 * See if any devices need to be marked REMOVED. 9279 */ 9280 if (tasks & (SPA_ASYNC_REMOVE | SPA_ASYNC_REMOVE_BY_USER)) { 9281 boolean_t by_kernel = B_TRUE; 9282 if (tasks & SPA_ASYNC_REMOVE_BY_USER) 9283 by_kernel = B_FALSE; 9284 spa_vdev_state_enter(spa, SCL_NONE); 9285 spa_async_remove(spa, spa->spa_root_vdev, by_kernel); 9286 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 9287 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i], 9288 by_kernel); 9289 for (int i = 0; i < spa->spa_spares.sav_count; i++) 9290 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i], 9291 by_kernel); 9292 (void) spa_vdev_state_exit(spa, NULL, 0); 9293 } 9294 9295 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 9296 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 9297 spa_async_autoexpand(spa, spa->spa_root_vdev); 9298 spa_config_exit(spa, SCL_CONFIG, FTAG); 9299 } 9300 9301 /* 9302 * See if any devices need to be marked faulted. 9303 */ 9304 if (tasks & SPA_ASYNC_FAULT_VDEV) { 9305 spa_vdev_state_enter(spa, SCL_NONE); 9306 boolean_t suspend = B_FALSE; 9307 spa_async_fault_vdev(spa->spa_root_vdev, &suspend); 9308 (void) spa_vdev_state_exit(spa, NULL, 0); 9309 if (suspend) 9310 zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR); 9311 } 9312 9313 /* 9314 * If any devices are done replacing, detach them. 9315 */ 9316 if (tasks & SPA_ASYNC_RESILVER_DONE || 9317 tasks & SPA_ASYNC_REBUILD_DONE || 9318 tasks & SPA_ASYNC_DETACH_SPARE) { 9319 spa_vdev_resilver_done(spa); 9320 } 9321 9322 /* 9323 * Kick off a resilver. 
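 * (Skipped while a sequential rebuild is active, or while a resilver is
 * already running and the RESILVER_DEFER feature is enabled.)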
9324 */ 9325 if (tasks & SPA_ASYNC_RESILVER && 9326 !vdev_rebuild_active(spa->spa_root_vdev) && 9327 (!dsl_scan_resilvering(dp) || 9328 !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))) 9329 dsl_scan_restart_resilver(dp, 0); 9330 9331 if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { 9332 mutex_enter(&spa_namespace_lock); 9333 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 9334 vdev_initialize_restart(spa->spa_root_vdev); 9335 spa_config_exit(spa, SCL_CONFIG, FTAG); 9336 mutex_exit(&spa_namespace_lock); 9337 } 9338 9339 if (tasks & SPA_ASYNC_TRIM_RESTART) { 9340 mutex_enter(&spa_namespace_lock); 9341 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 9342 vdev_trim_restart(spa->spa_root_vdev); 9343 spa_config_exit(spa, SCL_CONFIG, FTAG); 9344 mutex_exit(&spa_namespace_lock); 9345 } 9346 9347 if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) { 9348 mutex_enter(&spa_namespace_lock); 9349 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 9350 vdev_autotrim_restart(spa); 9351 spa_config_exit(spa, SCL_CONFIG, FTAG); 9352 mutex_exit(&spa_namespace_lock); 9353 } 9354 9355 /* 9356 * Kick off L2 cache whole device TRIM. 9357 */ 9358 if (tasks & SPA_ASYNC_L2CACHE_TRIM) { 9359 mutex_enter(&spa_namespace_lock); 9360 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 9361 vdev_trim_l2arc(spa); 9362 spa_config_exit(spa, SCL_CONFIG, FTAG); 9363 mutex_exit(&spa_namespace_lock); 9364 } 9365 9366 /* 9367 * Kick off L2 cache rebuilding. 9368 */ 9369 if (tasks & SPA_ASYNC_L2CACHE_REBUILD) { 9370 mutex_enter(&spa_namespace_lock); 9371 spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER); 9372 l2arc_spa_rebuild_start(spa); 9373 spa_config_exit(spa, SCL_L2ARC, FTAG); 9374 mutex_exit(&spa_namespace_lock); 9375 } 9376 9377 /* 9378 * Let the world know that we're done. 
9379 */ 9380 mutex_enter(&spa->spa_async_lock); 9381 spa->spa_async_thread = NULL; 9382 cv_broadcast(&spa->spa_async_cv); 9383 mutex_exit(&spa->spa_async_lock); 9384 thread_exit(); 9385 } 9386 9387 void 9388 spa_async_suspend(spa_t *spa) 9389 { 9390 mutex_enter(&spa->spa_async_lock); 9391 spa->spa_async_suspended++; 9392 while (spa->spa_async_thread != NULL) 9393 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 9394 mutex_exit(&spa->spa_async_lock); 9395 9396 spa_vdev_remove_suspend(spa); 9397 9398 zthr_t *condense_thread = spa->spa_condense_zthr; 9399 if (condense_thread != NULL) 9400 zthr_cancel(condense_thread); 9401 9402 zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr; 9403 if (raidz_expand_thread != NULL) 9404 zthr_cancel(raidz_expand_thread); 9405 9406 zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; 9407 if (discard_thread != NULL) 9408 zthr_cancel(discard_thread); 9409 9410 zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; 9411 if (ll_delete_thread != NULL) 9412 zthr_cancel(ll_delete_thread); 9413 9414 zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; 9415 if (ll_condense_thread != NULL) 9416 zthr_cancel(ll_condense_thread); 9417 } 9418 9419 void 9420 spa_async_resume(spa_t *spa) 9421 { 9422 mutex_enter(&spa->spa_async_lock); 9423 ASSERT(spa->spa_async_suspended != 0); 9424 spa->spa_async_suspended--; 9425 mutex_exit(&spa->spa_async_lock); 9426 spa_restart_removal(spa); 9427 9428 zthr_t *condense_thread = spa->spa_condense_zthr; 9429 if (condense_thread != NULL) 9430 zthr_resume(condense_thread); 9431 9432 zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr; 9433 if (raidz_expand_thread != NULL) 9434 zthr_resume(raidz_expand_thread); 9435 9436 zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; 9437 if (discard_thread != NULL) 9438 zthr_resume(discard_thread); 9439 9440 zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; 9441 if (ll_delete_thread != NULL) 9442 zthr_resume(ll_delete_thread); 9443 9444 zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; 9445 if (ll_condense_thread != NULL) 9446 zthr_resume(ll_condense_thread); 9447 } 9448 9449 static boolean_t 9450 spa_async_tasks_pending(spa_t *spa) 9451 { 9452 uint_t non_config_tasks; 9453 uint_t config_task; 9454 boolean_t config_task_suspended; 9455 9456 non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE; 9457 config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; 9458 if (spa->spa_ccw_fail_time == 0) { 9459 config_task_suspended = B_FALSE; 9460 } else { 9461 config_task_suspended = 9462 (gethrtime() - spa->spa_ccw_fail_time) < 9463 ((hrtime_t)zfs_ccw_retry_interval * NANOSEC); 9464 } 9465 9466 return (non_config_tasks || (config_task && !config_task_suspended)); 9467 } 9468 9469 static void 9470 spa_async_dispatch(spa_t *spa) 9471 { 9472 mutex_enter(&spa->spa_async_lock); 9473 if (spa_async_tasks_pending(spa) && 9474 !spa->spa_async_suspended && 9475 spa->spa_async_thread == NULL) 9476 spa->spa_async_thread = thread_create(NULL, 0, 9477 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 9478 mutex_exit(&spa->spa_async_lock); 9479 } 9480 9481 void 9482 spa_async_request(spa_t *spa, int task) 9483 { 9484 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 9485 mutex_enter(&spa->spa_async_lock); 9486 spa->spa_async_tasks |= task; 9487 mutex_exit(&spa->spa_async_lock); 9488 } 9489 9490 int 9491 spa_async_tasks(spa_t *spa) 9492 { 9493 return (spa->spa_async_tasks); 9494 } 9495 9496 /* 9497 * 
========================================================================== 9498 * SPA syncing routines 9499 * ========================================================================== 9500 */ 9501 9502 9503 static int 9504 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 9505 dmu_tx_t *tx) 9506 { 9507 bpobj_t *bpo = arg; 9508 bpobj_enqueue(bpo, bp, bp_freed, tx); 9509 return (0); 9510 } 9511 9512 int 9513 bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 9514 { 9515 return (bpobj_enqueue_cb(arg, bp, B_FALSE, tx)); 9516 } 9517 9518 int 9519 bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 9520 { 9521 return (bpobj_enqueue_cb(arg, bp, B_TRUE, tx)); 9522 } 9523 9524 static int 9525 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 9526 { 9527 zio_t *pio = arg; 9528 9529 zio_nowait(zio_free_sync(pio, pio->io_spa, dmu_tx_get_txg(tx), bp, 9530 pio->io_flags)); 9531 return (0); 9532 } 9533 9534 static int 9535 bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 9536 dmu_tx_t *tx) 9537 { 9538 ASSERT(!bp_freed); 9539 return (spa_free_sync_cb(arg, bp, tx)); 9540 } 9541 9542 /* 9543 * Note: this simple function is not inlined to make it easier to dtrace the 9544 * amount of time spent syncing frees. 9545 */ 9546 static void 9547 spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) 9548 { 9549 zio_t *zio = zio_root(spa, NULL, NULL, 0); 9550 bplist_iterate(bpl, spa_free_sync_cb, zio, tx); 9551 VERIFY0(zio_wait(zio)); 9552 } 9553 9554 /* 9555 * Note: this simple function is not inlined to make it easier to dtrace the 9556 * amount of time spent syncing deferred frees. 9557 */ 9558 static void 9559 spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) 9560 { 9561 if (spa_sync_pass(spa) != 1) 9562 return; 9563 9564 /* 9565 * Note: 9566 * If the log space map feature is active, we stop deferring 9567 * frees to the next TXG and therefore running this function 9568 * would be considered a no-op as spa_deferred_bpobj should 9569 * not have any entries. 9570 * 9571 * That said we run this function anyway (instead of returning 9572 * immediately) for the edge-case scenario where we just 9573 * activated the log space map feature in this TXG but we have 9574 * deferred frees from the previous TXG. 9575 */ 9576 zio_t *zio = zio_root(spa, NULL, NULL, 0); 9577 VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, 9578 bpobj_spa_free_sync_cb, zio, tx), ==, 0); 9579 VERIFY0(zio_wait(zio)); 9580 } 9581 9582 static void 9583 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 9584 { 9585 char *packed = NULL; 9586 size_t bufsize; 9587 size_t nvsize = 0; 9588 dmu_buf_t *db; 9589 9590 VERIFY0(nvlist_size(nv, &nvsize, NV_ENCODE_XDR)); 9591 9592 /* 9593 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 9594 * information. This avoids the dmu_buf_will_dirty() path and 9595 * saves us a pre-read to get data we don't actually care about. 
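 * The packed XDR nvlist is zero-padded out to the SPA_CONFIG_BLOCKSIZE
 * boundary before being written, and the actual packed size is recorded
 * in the object's bonus buffer so that readers know how much of the
 * buffer to unpack.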
9596 */ 9597 bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); 9598 packed = vmem_alloc(bufsize, KM_SLEEP); 9599 9600 VERIFY0(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 9601 KM_SLEEP)); 9602 memset(packed + nvsize, 0, bufsize - nvsize); 9603 9604 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 9605 9606 vmem_free(packed, bufsize); 9607 9608 VERIFY0(dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 9609 dmu_buf_will_dirty(db, tx); 9610 *(uint64_t *)db->db_data = nvsize; 9611 dmu_buf_rele(db, FTAG); 9612 } 9613 9614 static void 9615 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 9616 const char *config, const char *entry) 9617 { 9618 nvlist_t *nvroot; 9619 nvlist_t **list; 9620 int i; 9621 9622 if (!sav->sav_sync) 9623 return; 9624 9625 /* 9626 * Update the MOS nvlist describing the list of available devices. 9627 * spa_validate_aux() will have already made sure this nvlist is 9628 * valid and the vdevs are labeled appropriately. 9629 */ 9630 if (sav->sav_object == 0) { 9631 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 9632 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 9633 sizeof (uint64_t), tx); 9634 VERIFY(zap_update(spa->spa_meta_objset, 9635 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 9636 &sav->sav_object, tx) == 0); 9637 } 9638 9639 nvroot = fnvlist_alloc(); 9640 if (sav->sav_count == 0) { 9641 fnvlist_add_nvlist_array(nvroot, config, 9642 (const nvlist_t * const *)NULL, 0); 9643 } else { 9644 list = kmem_alloc(sav->sav_count*sizeof (void *), KM_SLEEP); 9645 for (i = 0; i < sav->sav_count; i++) 9646 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 9647 B_FALSE, VDEV_CONFIG_L2CACHE); 9648 fnvlist_add_nvlist_array(nvroot, config, 9649 (const nvlist_t * const *)list, sav->sav_count); 9650 for (i = 0; i < sav->sav_count; i++) 9651 nvlist_free(list[i]); 9652 kmem_free(list, sav->sav_count * sizeof (void *)); 9653 } 9654 9655 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 9656 nvlist_free(nvroot); 9657 9658 sav->sav_sync = B_FALSE; 9659 } 9660 9661 /* 9662 * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t. 9663 * The all-vdev ZAP must be empty. 9664 */ 9665 static void 9666 spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) 9667 { 9668 spa_t *spa = vd->vdev_spa; 9669 9670 if (vd->vdev_root_zap != 0 && 9671 spa_feature_is_active(spa, SPA_FEATURE_AVZ_V2)) { 9672 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 9673 vd->vdev_root_zap, tx)); 9674 } 9675 if (vd->vdev_top_zap != 0) { 9676 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 9677 vd->vdev_top_zap, tx)); 9678 } 9679 if (vd->vdev_leaf_zap != 0) { 9680 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 9681 vd->vdev_leaf_zap, tx)); 9682 } 9683 for (uint64_t i = 0; i < vd->vdev_children; i++) { 9684 spa_avz_build(vd->vdev_child[i], avz, tx); 9685 } 9686 } 9687 9688 static void 9689 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 9690 { 9691 nvlist_t *config; 9692 9693 /* 9694 * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS, 9695 * its config may not be dirty but we still need to build per-vdev ZAPs. 9696 * Similarly, if the pool is being assembled (e.g. after a split), we 9697 * need to rebuild the AVZ although the config may not be dirty. 
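 * spa_avz_action selects what happens below: AVZ_ACTION_REBUILD rebuilds
 * the all-vdev ZAP from the vdev tree, AVZ_ACTION_DESTROY tears it (and
 * all ZAPs it lists) down, and otherwise a missing AVZ is simply created
 * empty before new per-vdev ZAPs are constructed.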
9698 */ 9699 if (list_is_empty(&spa->spa_config_dirty_list) && 9700 spa->spa_avz_action == AVZ_ACTION_NONE) 9701 return; 9702 9703 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 9704 9705 ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE || 9706 spa->spa_avz_action == AVZ_ACTION_INITIALIZE || 9707 spa->spa_all_vdev_zaps != 0); 9708 9709 if (spa->spa_avz_action == AVZ_ACTION_REBUILD) { 9710 /* Make and build the new AVZ */ 9711 uint64_t new_avz = zap_create(spa->spa_meta_objset, 9712 DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); 9713 spa_avz_build(spa->spa_root_vdev, new_avz, tx); 9714 9715 /* Diff old AVZ with new one */ 9716 zap_cursor_t zc; 9717 zap_attribute_t *za = zap_attribute_alloc(); 9718 9719 for (zap_cursor_init(&zc, spa->spa_meta_objset, 9720 spa->spa_all_vdev_zaps); 9721 zap_cursor_retrieve(&zc, za) == 0; 9722 zap_cursor_advance(&zc)) { 9723 uint64_t vdzap = za->za_first_integer; 9724 if (zap_lookup_int(spa->spa_meta_objset, new_avz, 9725 vdzap) == ENOENT) { 9726 /* 9727 * ZAP is listed in old AVZ but not in new one; 9728 * destroy it 9729 */ 9730 VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap, 9731 tx)); 9732 } 9733 } 9734 9735 zap_cursor_fini(&zc); 9736 zap_attribute_free(za); 9737 9738 /* Destroy the old AVZ */ 9739 VERIFY0(zap_destroy(spa->spa_meta_objset, 9740 spa->spa_all_vdev_zaps, tx)); 9741 9742 /* Replace the old AVZ in the dir obj with the new one */ 9743 VERIFY0(zap_update(spa->spa_meta_objset, 9744 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, 9745 sizeof (new_avz), 1, &new_avz, tx)); 9746 9747 spa->spa_all_vdev_zaps = new_avz; 9748 } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) { 9749 zap_cursor_t zc; 9750 zap_attribute_t *za = zap_attribute_alloc(); 9751 9752 /* Walk through the AVZ and destroy all listed ZAPs */ 9753 for (zap_cursor_init(&zc, spa->spa_meta_objset, 9754 spa->spa_all_vdev_zaps); 9755 zap_cursor_retrieve(&zc, za) == 0; 9756 zap_cursor_advance(&zc)) { 9757 uint64_t zap = za->za_first_integer; 9758 VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx)); 9759 } 9760 9761 zap_cursor_fini(&zc); 9762 zap_attribute_free(za); 9763 9764 /* Destroy and unlink the AVZ itself */ 9765 VERIFY0(zap_destroy(spa->spa_meta_objset, 9766 spa->spa_all_vdev_zaps, tx)); 9767 VERIFY0(zap_remove(spa->spa_meta_objset, 9768 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx)); 9769 spa->spa_all_vdev_zaps = 0; 9770 } 9771 9772 if (spa->spa_all_vdev_zaps == 0) { 9773 spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset, 9774 DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, 9775 DMU_POOL_VDEV_ZAP_MAP, tx); 9776 } 9777 spa->spa_avz_action = AVZ_ACTION_NONE; 9778 9779 /* Create ZAPs for vdevs that don't have them. */ 9780 vdev_construct_zaps(spa->spa_root_vdev, tx); 9781 9782 config = spa_config_generate(spa, spa->spa_root_vdev, 9783 dmu_tx_get_txg(tx), B_FALSE); 9784 9785 /* 9786 * If we're upgrading the spa version then make sure that 9787 * the config object gets updated with the correct version. 
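 * (spa_ubsync holds the last synced uberblock, so a newer in-core
 * ub_version means the upgrade is being written out in this txg.)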
9788 */ 9789 if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) 9790 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 9791 spa->spa_uberblock.ub_version); 9792 9793 spa_config_exit(spa, SCL_STATE, FTAG); 9794 9795 nvlist_free(spa->spa_config_syncing); 9796 spa->spa_config_syncing = config; 9797 9798 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 9799 } 9800 9801 static void 9802 spa_sync_version(void *arg, dmu_tx_t *tx) 9803 { 9804 uint64_t *versionp = arg; 9805 uint64_t version = *versionp; 9806 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 9807 9808 /* 9809 * Setting the version is special cased when first creating the pool. 9810 */ 9811 ASSERT(tx->tx_txg != TXG_INITIAL); 9812 9813 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 9814 ASSERT(version >= spa_version(spa)); 9815 9816 spa->spa_uberblock.ub_version = version; 9817 vdev_config_dirty(spa->spa_root_vdev); 9818 spa_history_log_internal(spa, "set", tx, "version=%lld", 9819 (longlong_t)version); 9820 } 9821 9822 /* 9823 * Set zpool properties. 9824 */ 9825 static void 9826 spa_sync_props(void *arg, dmu_tx_t *tx) 9827 { 9828 nvlist_t *nvp = arg; 9829 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 9830 objset_t *mos = spa->spa_meta_objset; 9831 nvpair_t *elem = NULL; 9832 9833 mutex_enter(&spa->spa_props_lock); 9834 9835 while ((elem = nvlist_next_nvpair(nvp, elem))) { 9836 uint64_t intval; 9837 const char *strval, *fname; 9838 zpool_prop_t prop; 9839 const char *propname; 9840 const char *elemname = nvpair_name(elem); 9841 zprop_type_t proptype; 9842 spa_feature_t fid; 9843 9844 switch (prop = zpool_name_to_prop(elemname)) { 9845 case ZPOOL_PROP_VERSION: 9846 intval = fnvpair_value_uint64(elem); 9847 /* 9848 * The version is synced separately before other 9849 * properties and should be correct by now. 9850 */ 9851 ASSERT3U(spa_version(spa), >=, intval); 9852 break; 9853 9854 case ZPOOL_PROP_ALTROOT: 9855 /* 9856 * 'altroot' is a non-persistent property. It should 9857 * have been set temporarily at creation or import time. 9858 */ 9859 ASSERT(spa->spa_root != NULL); 9860 break; 9861 9862 case ZPOOL_PROP_READONLY: 9863 case ZPOOL_PROP_CACHEFILE: 9864 /* 9865 * 'readonly' and 'cachefile' are also non-persistent 9866 * properties. 9867 */ 9868 break; 9869 case ZPOOL_PROP_COMMENT: 9870 strval = fnvpair_value_string(elem); 9871 if (spa->spa_comment != NULL) 9872 spa_strfree(spa->spa_comment); 9873 spa->spa_comment = spa_strdup(strval); 9874 /* 9875 * We need to dirty the configuration on all the vdevs 9876 * so that their labels get updated. We also need to 9877 * update the cache file to keep it in sync with the 9878 * MOS version. It's unnecessary to do this for pool 9879 * creation since the vdev's configuration has already 9880 * been dirtied. 9881 */ 9882 if (tx->tx_txg != TXG_INITIAL) { 9883 vdev_config_dirty(spa->spa_root_vdev); 9884 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 9885 } 9886 spa_history_log_internal(spa, "set", tx, 9887 "%s=%s", elemname, strval); 9888 break; 9889 case ZPOOL_PROP_COMPATIBILITY: 9890 strval = fnvpair_value_string(elem); 9891 if (spa->spa_compatibility != NULL) 9892 spa_strfree(spa->spa_compatibility); 9893 spa->spa_compatibility = spa_strdup(strval); 9894 /* 9895 * Dirty the configuration on vdevs as above. 
9896 */ 9897 if (tx->tx_txg != TXG_INITIAL) { 9898 vdev_config_dirty(spa->spa_root_vdev); 9899 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 9900 } 9901 9902 spa_history_log_internal(spa, "set", tx, 9903 "%s=%s", nvpair_name(elem), strval); 9904 break; 9905 9906 case ZPOOL_PROP_INVAL: 9907 if (zpool_prop_feature(elemname)) { 9908 fname = strchr(elemname, '@') + 1; 9909 VERIFY0(zfeature_lookup_name(fname, &fid)); 9910 9911 spa_feature_enable(spa, fid, tx); 9912 spa_history_log_internal(spa, "set", tx, 9913 "%s=enabled", elemname); 9914 break; 9915 } else if (!zfs_prop_user(elemname)) { 9916 ASSERT(zpool_prop_feature(elemname)); 9917 break; 9918 } 9919 zfs_fallthrough; 9920 default: 9921 /* 9922 * Set pool property values in the poolprops mos object. 9923 */ 9924 if (spa->spa_pool_props_object == 0) { 9925 spa->spa_pool_props_object = 9926 zap_create_link(mos, DMU_OT_POOL_PROPS, 9927 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 9928 tx); 9929 } 9930 9931 /* normalize the property name */ 9932 if (prop == ZPOOL_PROP_INVAL) { 9933 propname = elemname; 9934 proptype = PROP_TYPE_STRING; 9935 } else { 9936 propname = zpool_prop_to_name(prop); 9937 proptype = zpool_prop_get_type(prop); 9938 } 9939 9940 if (nvpair_type(elem) == DATA_TYPE_STRING) { 9941 ASSERT(proptype == PROP_TYPE_STRING); 9942 strval = fnvpair_value_string(elem); 9943 if (strlen(strval) == 0) { 9944 /* remove the property if value == "" */ 9945 (void) zap_remove(mos, 9946 spa->spa_pool_props_object, 9947 propname, tx); 9948 } else { 9949 VERIFY0(zap_update(mos, 9950 spa->spa_pool_props_object, 9951 propname, 1, strlen(strval) + 1, 9952 strval, tx)); 9953 } 9954 spa_history_log_internal(spa, "set", tx, 9955 "%s=%s", elemname, strval); 9956 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 9957 intval = fnvpair_value_uint64(elem); 9958 9959 if (proptype == PROP_TYPE_INDEX) { 9960 const char *unused; 9961 VERIFY0(zpool_prop_index_to_string( 9962 prop, intval, &unused)); 9963 } 9964 VERIFY0(zap_update(mos, 9965 spa->spa_pool_props_object, propname, 9966 8, 1, &intval, tx)); 9967 spa_history_log_internal(spa, "set", tx, 9968 "%s=%lld", elemname, 9969 (longlong_t)intval); 9970 9971 switch (prop) { 9972 case ZPOOL_PROP_DELEGATION: 9973 spa->spa_delegation = intval; 9974 break; 9975 case ZPOOL_PROP_BOOTFS: 9976 spa->spa_bootfs = intval; 9977 break; 9978 case ZPOOL_PROP_FAILUREMODE: 9979 spa->spa_failmode = intval; 9980 break; 9981 case ZPOOL_PROP_AUTOTRIM: 9982 spa->spa_autotrim = intval; 9983 spa_async_request(spa, 9984 SPA_ASYNC_AUTOTRIM_RESTART); 9985 break; 9986 case ZPOOL_PROP_AUTOEXPAND: 9987 spa->spa_autoexpand = intval; 9988 if (tx->tx_txg != TXG_INITIAL) 9989 spa_async_request(spa, 9990 SPA_ASYNC_AUTOEXPAND); 9991 break; 9992 case ZPOOL_PROP_MULTIHOST: 9993 spa->spa_multihost = intval; 9994 break; 9995 case ZPOOL_PROP_DEDUP_TABLE_QUOTA: 9996 spa->spa_dedup_table_quota = intval; 9997 break; 9998 default: 9999 break; 10000 } 10001 } else { 10002 ASSERT(0); /* not allowed */ 10003 } 10004 } 10005 10006 } 10007 10008 mutex_exit(&spa->spa_props_lock); 10009 } 10010 10011 /* 10012 * Perform one-time upgrade on-disk changes. spa_version() does not 10013 * reflect the new version this txg, so there must be no changes this 10014 * txg to anything that the upgrade code depends on after it executes. 10015 * Therefore this must be called after dsl_pool_sync() does the sync 10016 * tasks. 
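 * Each upgrade below is applied only in the txg that crosses the relevant
 * version boundary, detected by comparing spa_ubsync.ub_version (what is
 * already on disk) with spa_uberblock.ub_version (what this txg is
 * writing).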
10017 */ 10018 static void 10019 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) 10020 { 10021 if (spa_sync_pass(spa) != 1) 10022 return; 10023 10024 dsl_pool_t *dp = spa->spa_dsl_pool; 10025 rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); 10026 10027 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 10028 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 10029 dsl_pool_create_origin(dp, tx); 10030 10031 /* Keeping the origin open increases spa_minref */ 10032 spa->spa_minref += 3; 10033 } 10034 10035 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 10036 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 10037 dsl_pool_upgrade_clones(dp, tx); 10038 } 10039 10040 if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && 10041 spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { 10042 dsl_pool_upgrade_dir_clones(dp, tx); 10043 10044 /* Keeping the freedir open increases spa_minref */ 10045 spa->spa_minref += 3; 10046 } 10047 10048 if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && 10049 spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 10050 spa_feature_create_zap_objects(spa, tx); 10051 } 10052 10053 /* 10054 * LZ4_COMPRESS feature's behaviour was changed to activate_on_enable 10055 * when possibility to use lz4 compression for metadata was added 10056 * Old pools that have this feature enabled must be upgraded to have 10057 * this feature active 10058 */ 10059 if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 10060 boolean_t lz4_en = spa_feature_is_enabled(spa, 10061 SPA_FEATURE_LZ4_COMPRESS); 10062 boolean_t lz4_ac = spa_feature_is_active(spa, 10063 SPA_FEATURE_LZ4_COMPRESS); 10064 10065 if (lz4_en && !lz4_ac) 10066 spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); 10067 } 10068 10069 /* 10070 * If we haven't written the salt, do so now. Note that the 10071 * feature may not be activated yet, but that's fine since 10072 * the presence of this ZAP entry is backwards compatible. 10073 */ 10074 if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 10075 DMU_POOL_CHECKSUM_SALT) == ENOENT) { 10076 VERIFY0(zap_add(spa->spa_meta_objset, 10077 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, 10078 sizeof (spa->spa_cksum_salt.zcs_bytes), 10079 spa->spa_cksum_salt.zcs_bytes, tx)); 10080 } 10081 10082 rrw_exit(&dp->dp_config_rwlock, FTAG); 10083 } 10084 10085 static void 10086 vdev_indirect_state_sync_verify(vdev_t *vd) 10087 { 10088 vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping; 10089 vdev_indirect_births_t *vib __maybe_unused = vd->vdev_indirect_births; 10090 10091 if (vd->vdev_ops == &vdev_indirect_ops) { 10092 ASSERT(vim != NULL); 10093 ASSERT(vib != NULL); 10094 } 10095 10096 uint64_t obsolete_sm_object = 0; 10097 ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); 10098 if (obsolete_sm_object != 0) { 10099 ASSERT(vd->vdev_obsolete_sm != NULL); 10100 ASSERT(vd->vdev_removing || 10101 vd->vdev_ops == &vdev_indirect_ops); 10102 ASSERT(vdev_indirect_mapping_num_entries(vim) > 0); 10103 ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0); 10104 ASSERT3U(obsolete_sm_object, ==, 10105 space_map_object(vd->vdev_obsolete_sm)); 10106 ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=, 10107 space_map_allocated(vd->vdev_obsolete_sm)); 10108 } 10109 ASSERT(vd->vdev_obsolete_segments != NULL); 10110 10111 /* 10112 * Since frees / remaps to an indirect vdev can only 10113 * happen in syncing context, the obsolete segments 10114 * tree must be empty when we start syncing. 
10115 */ 10116 ASSERT0(zfs_range_tree_space(vd->vdev_obsolete_segments)); 10117 } 10118 10119 /* 10120 * Set the top-level vdev's max queue depth. Evaluate each top-level's 10121 * async write queue depth in case it changed. The max queue depth will 10122 * not change in the middle of syncing out this txg. 10123 */ 10124 static void 10125 spa_sync_adjust_vdev_max_queue_depth(spa_t *spa) 10126 { 10127 ASSERT(spa_writeable(spa)); 10128 10129 metaslab_class_balance(spa_normal_class(spa), B_TRUE); 10130 metaslab_class_balance(spa_special_class(spa), B_TRUE); 10131 metaslab_class_balance(spa_dedup_class(spa), B_TRUE); 10132 } 10133 10134 static void 10135 spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx) 10136 { 10137 ASSERT(spa_writeable(spa)); 10138 10139 vdev_t *rvd = spa->spa_root_vdev; 10140 for (int c = 0; c < rvd->vdev_children; c++) { 10141 vdev_t *vd = rvd->vdev_child[c]; 10142 vdev_indirect_state_sync_verify(vd); 10143 10144 if (vdev_indirect_should_condense(vd)) { 10145 spa_condense_indirect_start_sync(vd, tx); 10146 break; 10147 } 10148 } 10149 } 10150 10151 static void 10152 spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) 10153 { 10154 objset_t *mos = spa->spa_meta_objset; 10155 dsl_pool_t *dp = spa->spa_dsl_pool; 10156 uint64_t txg = tx->tx_txg; 10157 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; 10158 10159 do { 10160 int pass = ++spa->spa_sync_pass; 10161 10162 spa_sync_config_object(spa, tx); 10163 spa_sync_aux_dev(spa, &spa->spa_spares, tx, 10164 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 10165 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 10166 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 10167 spa_errlog_sync(spa, txg); 10168 dsl_pool_sync(dp, txg); 10169 10170 if (pass < zfs_sync_pass_deferred_free || 10171 spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { 10172 /* 10173 * If the log space map feature is active we don't 10174 * care about deferred frees and the deferred bpobj 10175 * as the log space map should effectively have the 10176 * same results (i.e. appending only to one object). 10177 */ 10178 spa_sync_frees(spa, free_bpl, tx); 10179 } else { 10180 /* 10181 * We can not defer frees in pass 1, because 10182 * we sync the deferred frees later in pass 1. 10183 */ 10184 ASSERT3U(pass, >, 1); 10185 bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb, 10186 &spa->spa_deferred_bpobj, tx); 10187 } 10188 10189 brt_sync(spa, txg); 10190 ddt_sync(spa, txg); 10191 dsl_scan_sync(dp, tx); 10192 dsl_errorscrub_sync(dp, tx); 10193 svr_sync(spa, tx); 10194 spa_sync_upgrades(spa, tx); 10195 10196 spa_flush_metaslabs(spa, tx); 10197 10198 vdev_t *vd = NULL; 10199 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 10200 != NULL) 10201 vdev_sync(vd, txg); 10202 10203 if (pass == 1) { 10204 /* 10205 * dsl_pool_sync() -> dp_sync_tasks may have dirtied 10206 * the config. If that happens, this txg should not 10207 * be a no-op. So we must sync the config to the MOS 10208 * before checking for no-op. 10209 * 10210 * Note that when the config is dirty, it will 10211 * be written to the MOS (i.e. the MOS will be 10212 * dirtied) every time we call spa_sync_config_object() 10213 * in this txg. Therefore we can't call this after 10214 * dsl_pool_sync() every pass, because it would 10215 * prevent us from converging, since we'd dirty 10216 * the MOS every pass. 10217 * 10218 * Sync tasks can only be processed in pass 1, so 10219 * there's no need to do this in later passes. 
10220 */ 10221 spa_sync_config_object(spa, tx); 10222 } 10223 10224 /* 10225 * Note: We need to check if the MOS is dirty because we could 10226 * have marked the MOS dirty without updating the uberblock 10227 * (e.g. if we have sync tasks but no dirty user data). We need 10228 * to check the uberblock's rootbp because it is updated if we 10229 * have synced out dirty data (though in this case the MOS will 10230 * most likely also be dirty due to second order effects, we 10231 * don't want to rely on that here). 10232 */ 10233 if (pass == 1 && 10234 BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp) < txg && 10235 !dmu_objset_is_dirty(mos, txg)) { 10236 /* 10237 * Nothing changed on the first pass, therefore this 10238 * TXG is a no-op. Avoid syncing deferred frees, so 10239 * that we can keep this TXG as a no-op. 10240 */ 10241 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 10242 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 10243 ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); 10244 ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg)); 10245 break; 10246 } 10247 10248 spa_sync_deferred_frees(spa, tx); 10249 } while (dmu_objset_is_dirty(mos, txg)); 10250 } 10251 10252 /* 10253 * Rewrite the vdev configuration (which includes the uberblock) to 10254 * commit the transaction group. 10255 * 10256 * If there are no dirty vdevs, we sync the uberblock to a few random 10257 * top-level vdevs that are known to be visible in the config cache 10258 * (see spa_vdev_add() for a complete description). If there *are* dirty 10259 * vdevs, sync the uberblock to all vdevs. 10260 */ 10261 static void 10262 spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx) 10263 { 10264 vdev_t *rvd = spa->spa_root_vdev; 10265 uint64_t txg = tx->tx_txg; 10266 10267 for (;;) { 10268 int error = 0; 10269 10270 /* 10271 * We hold SCL_STATE to prevent vdev open/close/etc. 10272 * while we're attempting to write the vdev labels. 10273 */ 10274 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 10275 10276 if (list_is_empty(&spa->spa_config_dirty_list)) { 10277 vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; 10278 int svdcount = 0; 10279 int children = rvd->vdev_children; 10280 int c0 = random_in_range(children); 10281 10282 for (int c = 0; c < children; c++) { 10283 vdev_t *vd = 10284 rvd->vdev_child[(c0 + c) % children]; 10285 10286 /* Stop when revisiting the first vdev */ 10287 if (c > 0 && svd[0] == vd) 10288 break; 10289 10290 if (vd->vdev_ms_array == 0 || 10291 vd->vdev_islog || 10292 !vdev_is_concrete(vd)) 10293 continue; 10294 10295 svd[svdcount++] = vd; 10296 if (svdcount == SPA_SYNC_MIN_VDEVS) 10297 break; 10298 } 10299 error = vdev_config_sync(svd, svdcount, txg); 10300 } else { 10301 error = vdev_config_sync(rvd->vdev_child, 10302 rvd->vdev_children, txg); 10303 } 10304 10305 if (error == 0) 10306 spa->spa_last_synced_guid = rvd->vdev_guid; 10307 10308 spa_config_exit(spa, SCL_STATE, FTAG); 10309 10310 if (error == 0) 10311 break; 10312 zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR); 10313 zio_resume_wait(spa); 10314 } 10315 } 10316 10317 /* 10318 * Sync the specified transaction group. New blocks may be dirtied as 10319 * part of the process, so we iterate until it converges. 10320 */ 10321 void 10322 spa_sync(spa_t *spa, uint64_t txg) 10323 { 10324 vdev_t *vd = NULL; 10325 10326 VERIFY(spa_writeable(spa)); 10327 10328 /* 10329 * Wait for i/os issued in open context that need to complete 10330 * before this txg syncs. 
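 * Those i/os were attached to spa_txg_zio[txg & TXG_MASK]; a fresh root
 * zio is installed immediately below so that later open-context i/o for
 * this txg slot has something to attach to.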
10331 */ 10332 (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]); 10333 spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, 10334 ZIO_FLAG_CANFAIL); 10335 10336 /* 10337 * Now that there can be no more cloning in this transaction group, 10338 * but we are still before issuing frees, we can process pending BRT 10339 * updates. 10340 */ 10341 brt_pending_apply(spa, txg); 10342 10343 spa_sync_time_logger(spa, txg); 10344 10345 /* 10346 * Lock out configuration changes. 10347 */ 10348 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 10349 10350 spa->spa_syncing_txg = txg; 10351 spa->spa_sync_pass = 0; 10352 10353 /* 10354 * If there are any pending vdev state changes, convert them 10355 * into config changes that go out with this transaction group. 10356 */ 10357 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 10358 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 10359 /* Avoid holding the write lock unless actually necessary */ 10360 if (vd->vdev_aux == NULL) { 10361 vdev_state_clean(vd); 10362 vdev_config_dirty(vd); 10363 continue; 10364 } 10365 /* 10366 * We need the write lock here because, for aux vdevs, 10367 * calling vdev_config_dirty() modifies sav_config. 10368 * This is ugly and will become unnecessary when we 10369 * eliminate the aux vdev wart by integrating all vdevs 10370 * into the root vdev tree. 10371 */ 10372 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 10373 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 10374 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 10375 vdev_state_clean(vd); 10376 vdev_config_dirty(vd); 10377 } 10378 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 10379 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 10380 } 10381 spa_config_exit(spa, SCL_STATE, FTAG); 10382 10383 dsl_pool_t *dp = spa->spa_dsl_pool; 10384 dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); 10385 10386 spa->spa_sync_starttime = gethrtime(); 10387 10388 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); 10389 spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, 10390 spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + 10391 NSEC_TO_TICK(spa->spa_deadman_synctime)); 10392 10393 /* 10394 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 10395 * set spa_deflate if we have no raid-z vdevs. 10396 */ 10397 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 10398 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 10399 vdev_t *rvd = spa->spa_root_vdev; 10400 10401 int i; 10402 for (i = 0; i < rvd->vdev_children; i++) { 10403 vd = rvd->vdev_child[i]; 10404 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 10405 break; 10406 } 10407 if (i == rvd->vdev_children) { 10408 spa->spa_deflate = TRUE; 10409 VERIFY0(zap_add(spa->spa_meta_objset, 10410 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 10411 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 10412 } 10413 } 10414 10415 spa_sync_adjust_vdev_max_queue_depth(spa); 10416 10417 spa_sync_condense_indirect(spa, tx); 10418 10419 spa_sync_iterate_to_convergence(spa, tx); 10420 10421 #ifdef ZFS_DEBUG 10422 if (!list_is_empty(&spa->spa_config_dirty_list)) { 10423 /* 10424 * Make sure that the number of ZAPs for all the vdevs matches 10425 * the number of ZAPs in the per-vdev ZAP list. This only gets 10426 * called if the config is dirty; otherwise there may be 10427 * outstanding AVZ operations that weren't completed in 10428 * spa_sync_config_object. 
10429 */ 10430 uint64_t all_vdev_zap_entry_count; 10431 ASSERT0(zap_count(spa->spa_meta_objset, 10432 spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); 10433 ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, 10434 all_vdev_zap_entry_count); 10435 } 10436 #endif 10437 10438 if (spa->spa_vdev_removal != NULL) { 10439 ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]); 10440 } 10441 10442 spa_sync_rewrite_vdev_config(spa, tx); 10443 dmu_tx_commit(tx); 10444 10445 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); 10446 spa->spa_deadman_tqid = 0; 10447 10448 /* 10449 * Clear the dirty config list. 10450 */ 10451 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 10452 vdev_config_clean(vd); 10453 10454 /* 10455 * Now that the new config has synced transactionally, 10456 * let it become visible to the config cache. 10457 */ 10458 if (spa->spa_config_syncing != NULL) { 10459 spa_config_set(spa, spa->spa_config_syncing); 10460 spa->spa_config_txg = txg; 10461 spa->spa_config_syncing = NULL; 10462 } 10463 10464 dsl_pool_sync_done(dp, txg); 10465 10466 /* 10467 * Update usable space statistics. 10468 */ 10469 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 10470 != NULL) 10471 vdev_sync_done(vd, txg); 10472 10473 metaslab_class_evict_old(spa->spa_normal_class, txg); 10474 metaslab_class_evict_old(spa->spa_log_class, txg); 10475 /* Embedded log classes have only one metaslab per vdev. */ 10476 metaslab_class_evict_old(spa->spa_special_class, txg); 10477 metaslab_class_evict_old(spa->spa_dedup_class, txg); 10478 10479 spa_sync_close_syncing_log_sm(spa); 10480 10481 spa_update_dspace(spa); 10482 10483 if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) 10484 vdev_autotrim_kick(spa); 10485 10486 /* 10487 * It had better be the case that we didn't dirty anything 10488 * since vdev_config_sync(). 10489 */ 10490 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 10491 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 10492 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 10493 10494 while (zfs_pause_spa_sync) 10495 delay(1); 10496 10497 spa->spa_sync_pass = 0; 10498 10499 /* 10500 * Update the last synced uberblock here. We want to do this at 10501 * the end of spa_sync() so that consumers of spa_last_synced_txg() 10502 * will be guaranteed that all the processing associated with 10503 * that txg has been completed. 10504 */ 10505 spa->spa_ubsync = spa->spa_uberblock; 10506 spa_config_exit(spa, SCL_CONFIG, FTAG); 10507 10508 spa_handle_ignored_writes(spa); 10509 10510 /* 10511 * If any async tasks have been requested, kick them off. 10512 */ 10513 spa_async_dispatch(spa); 10514 } 10515 10516 /* 10517 * Sync all pools. We don't want to hold the namespace lock across these 10518 * operations, so we take a reference on the spa_t and drop the lock during the 10519 * sync. 
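 * The spa_open_ref()/spa_close() pair keeps each spa_t from being freed
 * while the namespace lock is dropped around txg_wait_synced().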
10520 */ 10521 void 10522 spa_sync_allpools(void) 10523 { 10524 spa_t *spa = NULL; 10525 mutex_enter(&spa_namespace_lock); 10526 while ((spa = spa_next(spa)) != NULL) { 10527 if (spa_state(spa) != POOL_STATE_ACTIVE || 10528 !spa_writeable(spa) || spa_suspended(spa)) 10529 continue; 10530 spa_open_ref(spa, FTAG); 10531 mutex_exit(&spa_namespace_lock); 10532 txg_wait_synced(spa_get_dsl(spa), 0); 10533 mutex_enter(&spa_namespace_lock); 10534 spa_close(spa, FTAG); 10535 } 10536 mutex_exit(&spa_namespace_lock); 10537 } 10538 10539 taskq_t * 10540 spa_sync_tq_create(spa_t *spa, const char *name) 10541 { 10542 kthread_t **kthreads; 10543 10544 ASSERT0P(spa->spa_sync_tq); 10545 ASSERT3S(spa->spa_alloc_count, <=, boot_ncpus); 10546 10547 /* 10548 * - do not allow more allocators than cpus. 10549 * - there may be more cpus than allocators. 10550 * - do not allow more sync taskq threads than allocators or cpus. 10551 */ 10552 int nthreads = spa->spa_alloc_count; 10553 spa->spa_syncthreads = kmem_zalloc(sizeof (spa_syncthread_info_t) * 10554 nthreads, KM_SLEEP); 10555 10556 spa->spa_sync_tq = taskq_create_synced(name, nthreads, minclsyspri, 10557 nthreads, INT_MAX, TASKQ_PREPOPULATE, &kthreads); 10558 VERIFY(spa->spa_sync_tq != NULL); 10559 VERIFY(kthreads != NULL); 10560 10561 spa_syncthread_info_t *ti = spa->spa_syncthreads; 10562 for (int i = 0; i < nthreads; i++, ti++) { 10563 ti->sti_thread = kthreads[i]; 10564 ti->sti_allocator = i; 10565 } 10566 10567 kmem_free(kthreads, sizeof (*kthreads) * nthreads); 10568 return (spa->spa_sync_tq); 10569 } 10570 10571 void 10572 spa_sync_tq_destroy(spa_t *spa) 10573 { 10574 ASSERT(spa->spa_sync_tq != NULL); 10575 10576 taskq_wait(spa->spa_sync_tq); 10577 taskq_destroy(spa->spa_sync_tq); 10578 kmem_free(spa->spa_syncthreads, 10579 sizeof (spa_syncthread_info_t) * spa->spa_alloc_count); 10580 spa->spa_sync_tq = NULL; 10581 } 10582 10583 uint_t 10584 spa_acq_allocator(spa_t *spa) 10585 { 10586 int i; 10587 10588 if (spa->spa_alloc_count == 1) 10589 return (0); 10590 10591 mutex_enter(&spa->spa_allocs_use->sau_lock); 10592 uint_t r = spa->spa_allocs_use->sau_rotor; 10593 do { 10594 if (++r == spa->spa_alloc_count) 10595 r = 0; 10596 } while (spa->spa_allocs_use->sau_inuse[r]); 10597 spa->spa_allocs_use->sau_inuse[r] = B_TRUE; 10598 spa->spa_allocs_use->sau_rotor = r; 10599 mutex_exit(&spa->spa_allocs_use->sau_lock); 10600 10601 spa_syncthread_info_t *ti = spa->spa_syncthreads; 10602 for (i = 0; i < spa->spa_alloc_count; i++, ti++) { 10603 if (ti->sti_thread == curthread) { 10604 ti->sti_allocator = r; 10605 break; 10606 } 10607 } 10608 ASSERT3S(i, <, spa->spa_alloc_count); 10609 return (r); 10610 } 10611 10612 void 10613 spa_rel_allocator(spa_t *spa, uint_t allocator) 10614 { 10615 if (spa->spa_alloc_count > 1) 10616 spa->spa_allocs_use->sau_inuse[allocator] = B_FALSE; 10617 } 10618 10619 void 10620 spa_select_allocator(zio_t *zio) 10621 { 10622 zbookmark_phys_t *bm = &zio->io_bookmark; 10623 spa_t *spa = zio->io_spa; 10624 10625 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 10626 10627 /* 10628 * A gang block (for example) may have inherited its parent's 10629 * allocator, in which case there is nothing further to do here. 10630 */ 10631 if (ZIO_HAS_ALLOCATOR(zio)) 10632 return; 10633 10634 ASSERT(spa != NULL); 10635 ASSERT(bm != NULL); 10636 10637 /* 10638 * First try to use an allocator assigned to the syncthread, and set 10639 * the corresponding write issue taskq for the allocator. 10640 * Note, we must have an open pool to do this. 
10641 */ 10642 if (spa->spa_sync_tq != NULL) { 10643 spa_syncthread_info_t *ti = spa->spa_syncthreads; 10644 for (int i = 0; i < spa->spa_alloc_count; i++, ti++) { 10645 if (ti->sti_thread == curthread) { 10646 zio->io_allocator = ti->sti_allocator; 10647 return; 10648 } 10649 } 10650 } 10651 10652 /* 10653 * We want to try to use as many allocators as possible to help improve 10654 * performance, but we also want logically adjacent IOs to be physically 10655 * adjacent to improve sequential read performance. We chunk each object 10656 * into 2^20 block regions, and then hash based on the objset, object, 10657 * level, and region to accomplish both of these goals. 10658 */ 10659 uint64_t hv = cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level, 10660 bm->zb_blkid >> 20); 10661 10662 zio->io_allocator = (uint_t)hv % spa->spa_alloc_count; 10663 } 10664 10665 /* 10666 * ========================================================================== 10667 * Miscellaneous routines 10668 * ========================================================================== 10669 */ 10670 10671 /* 10672 * Remove all pools in the system. 10673 */ 10674 void 10675 spa_evict_all(void) 10676 { 10677 spa_t *spa; 10678 10679 /* 10680 * Remove all cached state. All pools should be closed now, 10681 * so every spa in the AVL tree should be unreferenced. 10682 */ 10683 mutex_enter(&spa_namespace_lock); 10684 while ((spa = spa_next(NULL)) != NULL) { 10685 /* 10686 * Stop async tasks. The async thread may need to detach 10687 * a device that's been replaced, which requires grabbing 10688 * spa_namespace_lock, so we must drop it here. 10689 */ 10690 spa_open_ref(spa, FTAG); 10691 mutex_exit(&spa_namespace_lock); 10692 spa_async_suspend(spa); 10693 mutex_enter(&spa_namespace_lock); 10694 spa_close(spa, FTAG); 10695 10696 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 10697 spa_unload(spa); 10698 spa_deactivate(spa); 10699 } 10700 spa_remove(spa); 10701 } 10702 mutex_exit(&spa_namespace_lock); 10703 } 10704 10705 vdev_t * 10706 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 10707 { 10708 vdev_t *vd; 10709 int i; 10710 10711 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 10712 return (vd); 10713 10714 if (aux) { 10715 for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 10716 vd = spa->spa_l2cache.sav_vdevs[i]; 10717 if (vd->vdev_guid == guid) 10718 return (vd); 10719 } 10720 10721 for (i = 0; i < spa->spa_spares.sav_count; i++) { 10722 vd = spa->spa_spares.sav_vdevs[i]; 10723 if (vd->vdev_guid == guid) 10724 return (vd); 10725 } 10726 } 10727 10728 return (NULL); 10729 } 10730 10731 void 10732 spa_upgrade(spa_t *spa, uint64_t version) 10733 { 10734 ASSERT(spa_writeable(spa)); 10735 10736 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 10737 10738 /* 10739 * This should only be called for a non-faulted pool, and since a 10740 * future version would result in an unopenable pool, this shouldn't be 10741 * possible. 
10742 */ 10743 ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); 10744 ASSERT3U(version, >=, spa->spa_uberblock.ub_version); 10745 10746 spa->spa_uberblock.ub_version = version; 10747 vdev_config_dirty(spa->spa_root_vdev); 10748 10749 spa_config_exit(spa, SCL_ALL, FTAG); 10750 10751 txg_wait_synced(spa_get_dsl(spa), 0); 10752 } 10753 10754 static boolean_t 10755 spa_has_aux_vdev(spa_t *spa, uint64_t guid, spa_aux_vdev_t *sav) 10756 { 10757 (void) spa; 10758 int i; 10759 uint64_t vdev_guid; 10760 10761 for (i = 0; i < sav->sav_count; i++) 10762 if (sav->sav_vdevs[i]->vdev_guid == guid) 10763 return (B_TRUE); 10764 10765 for (i = 0; i < sav->sav_npending; i++) { 10766 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 10767 &vdev_guid) == 0 && vdev_guid == guid) 10768 return (B_TRUE); 10769 } 10770 10771 return (B_FALSE); 10772 } 10773 10774 boolean_t 10775 spa_has_l2cache(spa_t *spa, uint64_t guid) 10776 { 10777 return (spa_has_aux_vdev(spa, guid, &spa->spa_l2cache)); 10778 } 10779 10780 boolean_t 10781 spa_has_spare(spa_t *spa, uint64_t guid) 10782 { 10783 return (spa_has_aux_vdev(spa, guid, &spa->spa_spares)); 10784 } 10785 10786 /* 10787 * Check if a pool has an active shared spare device. 10788 * Note: reference count of an active spare is 2, as a spare and as a replace 10789 */ 10790 static boolean_t 10791 spa_has_active_shared_spare(spa_t *spa) 10792 { 10793 int i, refcnt; 10794 uint64_t pool; 10795 spa_aux_vdev_t *sav = &spa->spa_spares; 10796 10797 for (i = 0; i < sav->sav_count; i++) { 10798 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 10799 &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 10800 refcnt > 2) 10801 return (B_TRUE); 10802 } 10803 10804 return (B_FALSE); 10805 } 10806 10807 uint64_t 10808 spa_total_metaslabs(spa_t *spa) 10809 { 10810 vdev_t *rvd = spa->spa_root_vdev; 10811 10812 uint64_t m = 0; 10813 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 10814 vdev_t *vd = rvd->vdev_child[c]; 10815 if (!vdev_is_concrete(vd)) 10816 continue; 10817 m += vd->vdev_ms_count; 10818 } 10819 return (m); 10820 } 10821 10822 /* 10823 * Notify any waiting threads that some activity has switched from being in- 10824 * progress to not-in-progress so that the thread can wake up and determine 10825 * whether it is finished waiting. 10826 */ 10827 void 10828 spa_notify_waiters(spa_t *spa) 10829 { 10830 /* 10831 * Acquiring spa_activities_lock here prevents the cv_broadcast from 10832 * happening between the waiting thread's check and cv_wait. 10833 */ 10834 mutex_enter(&spa->spa_activities_lock); 10835 cv_broadcast(&spa->spa_activities_cv); 10836 mutex_exit(&spa->spa_activities_lock); 10837 } 10838 10839 /* 10840 * Notify any waiting threads that the pool is exporting, and then block until 10841 * they are finished using the spa_t. 10842 */ 10843 void 10844 spa_wake_waiters(spa_t *spa) 10845 { 10846 mutex_enter(&spa->spa_activities_lock); 10847 spa->spa_waiters_cancel = B_TRUE; 10848 cv_broadcast(&spa->spa_activities_cv); 10849 while (spa->spa_waiters != 0) 10850 cv_wait(&spa->spa_waiters_cv, &spa->spa_activities_lock); 10851 spa->spa_waiters_cancel = B_FALSE; 10852 mutex_exit(&spa->spa_activities_lock); 10853 } 10854 10855 /* Whether the vdev or any of its descendants are being initialized/trimmed. 
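 * The check temporarily drops spa_activities_lock to take the per-vdev
 * initialize/trim lock, following the lock ordering described below.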
*/ 10856 static boolean_t 10857 spa_vdev_activity_in_progress_impl(vdev_t *vd, zpool_wait_activity_t activity) 10858 { 10859 spa_t *spa = vd->vdev_spa; 10860 10861 ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER)); 10862 ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); 10863 ASSERT(activity == ZPOOL_WAIT_INITIALIZE || 10864 activity == ZPOOL_WAIT_TRIM); 10865 10866 kmutex_t *lock = activity == ZPOOL_WAIT_INITIALIZE ? 10867 &vd->vdev_initialize_lock : &vd->vdev_trim_lock; 10868 10869 mutex_exit(&spa->spa_activities_lock); 10870 mutex_enter(lock); 10871 mutex_enter(&spa->spa_activities_lock); 10872 10873 boolean_t in_progress = (activity == ZPOOL_WAIT_INITIALIZE) ? 10874 (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) : 10875 (vd->vdev_trim_state == VDEV_TRIM_ACTIVE); 10876 mutex_exit(lock); 10877 10878 if (in_progress) 10879 return (B_TRUE); 10880 10881 for (int i = 0; i < vd->vdev_children; i++) { 10882 if (spa_vdev_activity_in_progress_impl(vd->vdev_child[i], 10883 activity)) 10884 return (B_TRUE); 10885 } 10886 10887 return (B_FALSE); 10888 } 10889 10890 /* 10891 * If use_guid is true, this checks whether the vdev specified by guid is 10892 * being initialized/trimmed. Otherwise, it checks whether any vdev in the pool 10893 * is being initialized/trimmed. The caller must hold the config lock and 10894 * spa_activities_lock. 10895 */ 10896 static int 10897 spa_vdev_activity_in_progress(spa_t *spa, boolean_t use_guid, uint64_t guid, 10898 zpool_wait_activity_t activity, boolean_t *in_progress) 10899 { 10900 mutex_exit(&spa->spa_activities_lock); 10901 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 10902 mutex_enter(&spa->spa_activities_lock); 10903 10904 vdev_t *vd; 10905 if (use_guid) { 10906 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 10907 if (vd == NULL || !vd->vdev_ops->vdev_op_leaf) { 10908 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 10909 return (EINVAL); 10910 } 10911 } else { 10912 vd = spa->spa_root_vdev; 10913 } 10914 10915 *in_progress = spa_vdev_activity_in_progress_impl(vd, activity); 10916 10917 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 10918 return (0); 10919 } 10920 10921 /* 10922 * Locking for waiting threads 10923 * --------------------------- 10924 * 10925 * Waiting threads need a way to check whether a given activity is in progress, 10926 * and then, if it is, wait for it to complete. Each activity will have some 10927 * in-memory representation of the relevant on-disk state which can be used to 10928 * determine whether or not the activity is in progress. The in-memory state and 10929 * the locking used to protect it will be different for each activity, and may 10930 * not be suitable for use with a cvar (e.g., some state is protected by the 10931 * config lock). To allow waiting threads to wait without any races, another 10932 * lock, spa_activities_lock, is used. 10933 * 10934 * When the state is checked, both the activity-specific lock (if there is one) 10935 * and spa_activities_lock are held. In some cases, the activity-specific lock 10936 * is acquired explicitly (e.g. the config lock). In others, the locking is 10937 * internal to some check (e.g. bpobj_is_empty). After checking, the waiting 10938 * thread releases the activity-specific lock and, if the activity is in 10939 * progress, then cv_waits using spa_activities_lock. 
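 *
 * A waiter therefore ends up following this pattern (sketch only;
 * spa_wait_common() below is the real implementation, with
 * activity_in_progress() standing in for spa_activity_in_progress()):
 *
 *	mutex_enter(&spa->spa_activities_lock);
 *	while (activity_in_progress(spa) && !spa->spa_waiters_cancel)
 *		cv_wait_sig(&spa->spa_activities_cv,
 *		    &spa->spa_activities_lock);
 *	mutex_exit(&spa->spa_activities_lock);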
10940 * 10941 * The waiting thread is woken when another thread, one completing some 10942 * activity, updates the state of the activity and then calls 10943 * spa_notify_waiters, which will cv_broadcast. This 'completing' thread only 10944 * needs to hold its activity-specific lock when updating the state, and this 10945 * lock can (but doesn't have to) be dropped before calling spa_notify_waiters. 10946 * 10947 * Because spa_notify_waiters acquires spa_activities_lock before broadcasting, 10948 * and because it is held when the waiting thread checks the state of the 10949 * activity, it can never be the case that the completing thread both updates 10950 * the activity state and cv_broadcasts in between the waiting thread's check 10951 * and cv_wait. Thus, a waiting thread can never miss a wakeup. 10952 * 10953 * In order to prevent deadlock, when the waiting thread does its check, in some 10954 * cases it will temporarily drop spa_activities_lock in order to acquire the 10955 * activity-specific lock. The order in which spa_activities_lock and the 10956 * activity specific lock are acquired in the waiting thread is determined by 10957 * the order in which they are acquired in the completing thread; if the 10958 * completing thread calls spa_notify_waiters with the activity-specific lock 10959 * held, then the waiting thread must also acquire the activity-specific lock 10960 * first. 10961 */ 10962 10963 static int 10964 spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, 10965 boolean_t use_tag, uint64_t tag, boolean_t *in_progress) 10966 { 10967 int error = 0; 10968 10969 ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); 10970 10971 switch (activity) { 10972 case ZPOOL_WAIT_CKPT_DISCARD: 10973 *in_progress = 10974 (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT) && 10975 zap_contains(spa_meta_objset(spa), 10976 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT) == 10977 ENOENT); 10978 break; 10979 case ZPOOL_WAIT_FREE: 10980 *in_progress = ((spa_version(spa) >= SPA_VERSION_DEADLISTS && 10981 !bpobj_is_empty(&spa->spa_dsl_pool->dp_free_bpobj)) || 10982 spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY) || 10983 spa_livelist_delete_check(spa)); 10984 break; 10985 case ZPOOL_WAIT_INITIALIZE: 10986 case ZPOOL_WAIT_TRIM: 10987 error = spa_vdev_activity_in_progress(spa, use_tag, tag, 10988 activity, in_progress); 10989 break; 10990 case ZPOOL_WAIT_REPLACE: 10991 mutex_exit(&spa->spa_activities_lock); 10992 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 10993 mutex_enter(&spa->spa_activities_lock); 10994 10995 *in_progress = vdev_replace_in_progress(spa->spa_root_vdev); 10996 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 10997 break; 10998 case ZPOOL_WAIT_REMOVE: 10999 *in_progress = (spa->spa_removing_phys.sr_state == 11000 DSS_SCANNING); 11001 break; 11002 case ZPOOL_WAIT_RESILVER: 11003 *in_progress = vdev_rebuild_active(spa->spa_root_vdev); 11004 if (*in_progress) 11005 break; 11006 zfs_fallthrough; 11007 case ZPOOL_WAIT_SCRUB: 11008 { 11009 boolean_t scanning, paused, is_scrub; 11010 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; 11011 11012 is_scrub = (scn->scn_phys.scn_func == POOL_SCAN_SCRUB); 11013 scanning = (scn->scn_phys.scn_state == DSS_SCANNING); 11014 paused = dsl_scan_is_paused_scrub(scn); 11015 *in_progress = (scanning && !paused && 11016 is_scrub == (activity == ZPOOL_WAIT_SCRUB)); 11017 break; 11018 } 11019 case ZPOOL_WAIT_RAIDZ_EXPAND: 11020 { 11021 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 11022 *in_progress = (vre 
!= NULL && vre->vre_state == DSS_SCANNING); 11023 break; 11024 } 11025 default: 11026 panic("unrecognized value for activity %d", activity); 11027 } 11028 11029 return (error); 11030 } 11031 11032 static int 11033 spa_wait_common(const char *pool, zpool_wait_activity_t activity, 11034 boolean_t use_tag, uint64_t tag, boolean_t *waited) 11035 { 11036 /* 11037 * The tag is used to distinguish between instances of an activity. 11038 * 'initialize' and 'trim' are the only activities that we use this for. 11039 * The other activities can only have a single instance in progress in a 11040 * pool at one time, making the tag unnecessary. 11041 * 11042 * There can be multiple devices being replaced at once, but since they 11043 * all finish once resilvering finishes, we don't bother keeping track 11044 * of them individually, we just wait for them all to finish. 11045 */ 11046 if (use_tag && activity != ZPOOL_WAIT_INITIALIZE && 11047 activity != ZPOOL_WAIT_TRIM) 11048 return (EINVAL); 11049 11050 if (activity < 0 || activity >= ZPOOL_WAIT_NUM_ACTIVITIES) 11051 return (EINVAL); 11052 11053 spa_t *spa; 11054 int error = spa_open(pool, &spa, FTAG); 11055 if (error != 0) 11056 return (error); 11057 11058 /* 11059 * Increment the spa's waiter count so that we can call spa_close and 11060 * still ensure that the spa_t doesn't get freed before this thread is 11061 * finished with it when the pool is exported. We want to call spa_close 11062 * before we start waiting because otherwise the additional ref would 11063 * prevent the pool from being exported or destroyed throughout the 11064 * potentially long wait. 11065 */ 11066 mutex_enter(&spa->spa_activities_lock); 11067 spa->spa_waiters++; 11068 spa_close(spa, FTAG); 11069 11070 *waited = B_FALSE; 11071 for (;;) { 11072 boolean_t in_progress; 11073 error = spa_activity_in_progress(spa, activity, use_tag, tag, 11074 &in_progress); 11075 11076 if (error || !in_progress || spa->spa_waiters_cancel) 11077 break; 11078 11079 *waited = B_TRUE; 11080 11081 if (cv_wait_sig(&spa->spa_activities_cv, 11082 &spa->spa_activities_lock) == 0) { 11083 error = EINTR; 11084 break; 11085 } 11086 } 11087 11088 spa->spa_waiters--; 11089 cv_signal(&spa->spa_waiters_cv); 11090 mutex_exit(&spa->spa_activities_lock); 11091 11092 return (error); 11093 } 11094 11095 /* 11096 * Wait for a particular instance of the specified activity to complete, where 11097 * the instance is identified by 'tag'. 11098 */ 11099 int 11100 spa_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag, 11101 boolean_t *waited) 11102 { 11103 return (spa_wait_common(pool, activity, B_TRUE, tag, waited)); 11104 } 11105 11106 /* 11107 * Wait for all instances of the specified activity to complete. 11108 */ 11109 int 11110 spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited) 11111 { 11112 11113 return (spa_wait_common(pool, activity, B_FALSE, 0, waited)); 11114 } 11115 11116 sysevent_t * 11117 spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) 11118 { 11119 sysevent_t *ev = NULL; 11120 #ifdef _KERNEL 11121 nvlist_t *resource; 11122 11123 resource = zfs_event_create(spa, vd, FM_SYSEVENT_CLASS, name, hist_nvl); 11124 if (resource) { 11125 ev = kmem_alloc(sizeof (sysevent_t), KM_SLEEP); 11126 ev->resource = resource; 11127 } 11128 #else 11129 (void) spa, (void) vd, (void) hist_nvl, (void) name; 11130 #endif 11131 return (ev); 11132 } 11133 11134 void 11135 spa_event_post(sysevent_t *ev) 11136 { 11137 #ifdef _KERNEL 11138 if (ev) { 11139
zfs_zevent_post(ev->resource, NULL, zfs_zevent_post_cb); 11140 kmem_free(ev, sizeof (*ev)); 11141 } 11142 #else 11143 (void) ev; 11144 #endif 11145 } 11146 11147 /* 11148 * Post a zevent corresponding to the given sysevent. The 'name' must be one 11149 * of the event definitions in sys/sysevent/eventdefs.h. The payload will be 11150 * filled in from the spa and (optionally) the vdev. This doesn't do anything 11151 * in the userland libzpool, as we don't want consumers to misinterpret ztest 11152 * or zdb as real changes. 11153 */ 11154 void 11155 spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) 11156 { 11157 spa_event_post(spa_event_create(spa, vd, hist_nvl, name)); 11158 } 11159 11160 /* state manipulation functions */ 11161 EXPORT_SYMBOL(spa_open); 11162 EXPORT_SYMBOL(spa_open_rewind); 11163 EXPORT_SYMBOL(spa_get_stats); 11164 EXPORT_SYMBOL(spa_create); 11165 EXPORT_SYMBOL(spa_import); 11166 EXPORT_SYMBOL(spa_tryimport); 11167 EXPORT_SYMBOL(spa_destroy); 11168 EXPORT_SYMBOL(spa_export); 11169 EXPORT_SYMBOL(spa_reset); 11170 EXPORT_SYMBOL(spa_async_request); 11171 EXPORT_SYMBOL(spa_async_suspend); 11172 EXPORT_SYMBOL(spa_async_resume); 11173 EXPORT_SYMBOL(spa_inject_addref); 11174 EXPORT_SYMBOL(spa_inject_delref); 11175 EXPORT_SYMBOL(spa_scan_stat_init); 11176 EXPORT_SYMBOL(spa_scan_get_stats); 11177 11178 /* device manipulation */ 11179 EXPORT_SYMBOL(spa_vdev_add); 11180 EXPORT_SYMBOL(spa_vdev_attach); 11181 EXPORT_SYMBOL(spa_vdev_detach); 11182 EXPORT_SYMBOL(spa_vdev_setpath); 11183 EXPORT_SYMBOL(spa_vdev_setfru); 11184 EXPORT_SYMBOL(spa_vdev_split_mirror); 11185 11186 /* spare state (which is global across all pools) */ 11187 EXPORT_SYMBOL(spa_spare_add); 11188 EXPORT_SYMBOL(spa_spare_remove); 11189 EXPORT_SYMBOL(spa_spare_exists); 11190 EXPORT_SYMBOL(spa_spare_activate); 11191 11192 /* L2ARC state (which is global across all pools) */ 11193 EXPORT_SYMBOL(spa_l2cache_add); 11194 EXPORT_SYMBOL(spa_l2cache_remove); 11195 EXPORT_SYMBOL(spa_l2cache_exists); 11196 EXPORT_SYMBOL(spa_l2cache_activate); 11197 EXPORT_SYMBOL(spa_l2cache_drop); 11198 11199 /* scanning */ 11200 EXPORT_SYMBOL(spa_scan); 11201 EXPORT_SYMBOL(spa_scan_range); 11202 EXPORT_SYMBOL(spa_scan_stop); 11203 11204 /* spa syncing */ 11205 EXPORT_SYMBOL(spa_sync); /* only for DMU use */ 11206 EXPORT_SYMBOL(spa_sync_allpools); 11207 11208 /* properties */ 11209 EXPORT_SYMBOL(spa_prop_set); 11210 EXPORT_SYMBOL(spa_prop_get); 11211 EXPORT_SYMBOL(spa_prop_clear_bootfs); 11212 11213 /* asynchronous event notification */ 11214 EXPORT_SYMBOL(spa_event_notify); 11215 11216 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_pct, UINT, ZMOD_RW, 11217 "Percentage of CPUs to run a metaslab preload taskq"); 11218 11219 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, UINT, ZMOD_RW, 11220 "log2 fraction of arc that can be used by inflight I/Os when " 11221 "verifying pool during import"); 11222 11223 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_metadata, INT, ZMOD_RW, 11224 "Set to traverse metadata on pool import"); 11225 11226 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW, 11227 "Set to traverse data on pool import"); 11228 11229 ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW, 11230 "Print vdev tree to zfs_dbgmsg during pool import"); 11231 11232 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RW, 11233 "Percentage of CPUs to run an IO worker thread"); 11234 11235 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RW, 11236 "Number of threads per IO worker
taskqueue"); 11237 11238 ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, U64, ZMOD_RW, 11239 "Allow importing pool with up to this number of missing top-level " 11240 "vdevs (in read-only mode)"); 11241 11242 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT, 11243 ZMOD_RW, "Set the livelist condense zthr to pause"); 11244 11245 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT, 11246 ZMOD_RW, "Set the livelist condense synctask to pause"); 11247 11248 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_cancel, 11249 INT, ZMOD_RW, 11250 "Whether livelist condensing was canceled in the synctask"); 11251 11252 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel, 11253 INT, ZMOD_RW, 11254 "Whether livelist condensing was canceled in the zthr function"); 11255 11256 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, 11257 ZMOD_RW, 11258 "Whether extra ALLOC blkptrs were added to a livelist entry while it " 11259 "was being condensed"); 11260 11261 ZFS_MODULE_PARAM(zfs_spa, spa_, note_txg_time, UINT, ZMOD_RW, 11262 "How frequently TXG timestamps are stored internally (in seconds)"); 11263 11264 ZFS_MODULE_PARAM(zfs_spa, spa_, flush_txg_time, UINT, ZMOD_RW, 11265 "How frequently the TXG timestamps database should be flushed " 11266 "to disk (in seconds)"); 11267 11268 #ifdef _KERNEL 11269 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read, 11270 spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RW, 11271 "Configure IO queues for read IO"); 11272 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write, 11273 spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RW, 11274 "Configure IO queues for write IO"); 11275 #endif 11276 11277 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_write_tpq, UINT, ZMOD_RW, 11278 "Number of CPUs per write issue taskq"); 11279