1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or https://opensource.org/licenses/CDDL-1.0. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 23 /* 24 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 25 * Copyright (c) 2011, 2024 by Delphix. All rights reserved. 26 * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved. 27 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 28 * Copyright 2013 Saso Kiselkov. All rights reserved. 29 * Copyright (c) 2014 Integros [integros.com] 30 * Copyright 2016 Toomas Soome <tsoome@me.com> 31 * Copyright (c) 2016 Actifio, Inc. All rights reserved. 32 * Copyright 2018 Joyent, Inc. 33 * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. 34 * Copyright 2017 Joyent, Inc. 35 * Copyright (c) 2017, Intel Corporation. 36 * Copyright (c) 2021, Colm Buckley <colm@tuatha.org> 37 * Copyright (c) 2023 Hewlett Packard Enterprise Development LP. 38 * Copyright (c) 2023, 2024, Klara Inc. 39 */ 40 41 /* 42 * SPA: Storage Pool Allocator 43 * 44 * This file contains all the routines used when modifying on-disk SPA state. 45 * This includes opening, importing, destroying, exporting a pool, and syncing a 46 * pool. 
47 */ 48 49 #include <sys/zfs_context.h> 50 #include <sys/fm/fs/zfs.h> 51 #include <sys/spa_impl.h> 52 #include <sys/zio.h> 53 #include <sys/zio_checksum.h> 54 #include <sys/dmu.h> 55 #include <sys/dmu_tx.h> 56 #include <sys/zap.h> 57 #include <sys/zil.h> 58 #include <sys/brt.h> 59 #include <sys/ddt.h> 60 #include <sys/vdev_impl.h> 61 #include <sys/vdev_removal.h> 62 #include <sys/vdev_indirect_mapping.h> 63 #include <sys/vdev_indirect_births.h> 64 #include <sys/vdev_initialize.h> 65 #include <sys/vdev_rebuild.h> 66 #include <sys/vdev_trim.h> 67 #include <sys/vdev_disk.h> 68 #include <sys/vdev_raidz.h> 69 #include <sys/vdev_draid.h> 70 #include <sys/metaslab.h> 71 #include <sys/metaslab_impl.h> 72 #include <sys/mmp.h> 73 #include <sys/uberblock_impl.h> 74 #include <sys/txg.h> 75 #include <sys/avl.h> 76 #include <sys/bpobj.h> 77 #include <sys/dmu_traverse.h> 78 #include <sys/dmu_objset.h> 79 #include <sys/unique.h> 80 #include <sys/dsl_pool.h> 81 #include <sys/dsl_dataset.h> 82 #include <sys/dsl_dir.h> 83 #include <sys/dsl_prop.h> 84 #include <sys/dsl_synctask.h> 85 #include <sys/fs/zfs.h> 86 #include <sys/arc.h> 87 #include <sys/callb.h> 88 #include <sys/systeminfo.h> 89 #include <sys/zfs_ioctl.h> 90 #include <sys/dsl_scan.h> 91 #include <sys/zfeature.h> 92 #include <sys/dsl_destroy.h> 93 #include <sys/zvol.h> 94 95 #ifdef _KERNEL 96 #include <sys/fm/protocol.h> 97 #include <sys/fm/util.h> 98 #include <sys/callb.h> 99 #include <sys/zone.h> 100 #include <sys/vmsystm.h> 101 #endif /* _KERNEL */ 102 103 #include "zfs_prop.h" 104 #include "zfs_comutil.h" 105 #include <cityhash.h> 106 107 /* 108 * spa_thread() existed on Illumos as a parent thread for the various worker 109 * threads that actually run the pool, as a way to both reference the entire 110 * pool work as a single object, and to share properties like scheduling 111 * options. It has not yet been adapted to Linux or FreeBSD. This define is 112 * used to mark related parts of the code to make things easier for the reader, 113 * and to compile this code out. It can be removed when someone implements it, 114 * moves it to some Illumos-specific place, or removes it entirely. 115 */ 116 #undef HAVE_SPA_THREAD 117 118 /* 119 * The "System Duty Cycle" scheduling class is an Illumos feature to help 120 * prevent CPU-intensive kernel threads from affecting latency on interactive 121 * threads. It doesn't exist on Linux or FreeBSD, so the supporting code is 122 * gated behind a define. On Illumos SDC depends on spa_thread(), but 123 * spa_thread() also has other uses, so this is a separate define. 124 */ 125 #undef HAVE_SYSDC 126 127 /* 128 * The interval, in seconds, at which failed configuration cache file writes 129 * should be retried. 130 */ 131 int zfs_ccw_retry_interval = 300; 132 133 typedef enum zti_modes { 134 ZTI_MODE_FIXED, /* value is # of threads (min 1) */ 135 ZTI_MODE_SCALE, /* Taskqs scale with CPUs. 
 */
	ZTI_MODE_SYNC,			/* sync thread assigned */
	ZTI_MODE_NULL,			/* don't create a taskq */
	ZTI_NMODES
} zti_modes_t;

#define	ZTI_P(n, q)	{ ZTI_MODE_FIXED, (n), (q) }
#define	ZTI_PCT(n)	{ ZTI_MODE_ONLINE_PERCENT, (n), 1 }
#define	ZTI_SCALE	{ ZTI_MODE_SCALE, 0, 1 }
#define	ZTI_SYNC	{ ZTI_MODE_SYNC, 0, 1 }
#define	ZTI_NULL	{ ZTI_MODE_NULL, 0, 0 }

#define	ZTI_N(n)	ZTI_P(n, 1)
#define	ZTI_ONE		ZTI_N(1)

typedef struct zio_taskq_info {
	zti_modes_t zti_mode;
	uint_t zti_value;
	uint_t zti_count;
} zio_taskq_info_t;

static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"iss", "iss_h", "int", "int_h"
};

/*
 * This table defines the taskq settings for each ZFS I/O type. When
 * initializing a pool, we use this table to create an appropriately sized
 * taskq. Some operations are low volume and therefore have a small, static
 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
 * macros. Other operations process a large amount of data; the ZTI_SCALE
 * macro causes us to create a taskq oriented for throughput. Some operations
 * are so high frequency and short-lived that the taskq itself can become a
 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
 * additional degree of parallelism specified by the number of threads per
 * taskq and the number of taskqs; when dispatching an event in this case, the
 * particular taskq is chosen at random. ZTI_SCALE uses a number of taskqs
 * that scales with the number of CPUs.
 *
 * The different taskq priorities are to handle the different contexts (issue
 * and interrupt) and then to reserve threads for high priority I/Os that
 * need to be handled with minimum delay. Illumos taskq has an unfair TQ_FRONT
 * implementation, so separate high-priority threads are used there.
 */
static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* NULL */
	{ ZTI_N(8),	ZTI_NULL,	ZTI_SCALE,	ZTI_NULL }, /* READ */
#ifdef illumos
	{ ZTI_SYNC,	ZTI_N(5),	ZTI_SCALE,	ZTI_N(5) }, /* WRITE */
#else
	{ ZTI_SYNC,	ZTI_NULL,	ZTI_SCALE,	ZTI_NULL }, /* WRITE */
#endif
	{ ZTI_SCALE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FREE */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* CLAIM */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FLUSH */
	{ ZTI_N(4),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* TRIM */
};

static void spa_sync_version(void *arg, dmu_tx_t *tx);
static void spa_sync_props(void *arg, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, spa_import_type_t type,
    const char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);

/*
 * Percentage of all CPUs that can be used by the metaslab preload taskq.
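 * (This is the thread-count percentage passed to the "z_metaslab" taskq
 * created in spa_activate() with the TASKQ_THREADS_CPU_PCT flag.)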
203 */ 204 static uint_t metaslab_preload_pct = 50; 205 206 static uint_t zio_taskq_batch_pct = 80; /* 1 thread per cpu in pset */ 207 static uint_t zio_taskq_batch_tpq; /* threads per taskq */ 208 209 #ifdef HAVE_SYSDC 210 static const boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ 211 static const uint_t zio_taskq_basedc = 80; /* base duty cycle */ 212 #endif 213 214 #ifdef HAVE_SPA_THREAD 215 static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */ 216 #endif 217 218 static uint_t zio_taskq_write_tpq = 16; 219 220 /* 221 * Report any spa_load_verify errors found, but do not fail spa_load. 222 * This is used by zdb to analyze non-idle pools. 223 */ 224 boolean_t spa_load_verify_dryrun = B_FALSE; 225 226 /* 227 * Allow read spacemaps in case of readonly import (spa_mode == SPA_MODE_READ). 228 * This is used by zdb for spacemaps verification. 229 */ 230 boolean_t spa_mode_readable_spacemaps = B_FALSE; 231 232 /* 233 * This (illegal) pool name is used when temporarily importing a spa_t in order 234 * to get the vdev stats associated with the imported devices. 235 */ 236 #define TRYIMPORT_NAME "$import" 237 238 /* 239 * For debugging purposes: print out vdev tree during pool import. 240 */ 241 static int spa_load_print_vdev_tree = B_FALSE; 242 243 /* 244 * A non-zero value for zfs_max_missing_tvds means that we allow importing 245 * pools with missing top-level vdevs. This is strictly intended for advanced 246 * pool recovery cases since missing data is almost inevitable. Pools with 247 * missing devices can only be imported read-only for safety reasons, and their 248 * fail-mode will be automatically set to "continue". 249 * 250 * With 1 missing vdev we should be able to import the pool and mount all 251 * datasets. User data that was not modified after the missing device has been 252 * added should be recoverable. This means that snapshots created prior to the 253 * addition of that device should be completely intact. 254 * 255 * With 2 missing vdevs, some datasets may fail to mount since there are 256 * dataset statistics that are stored as regular metadata. Some data might be 257 * recoverable if those vdevs were added recently. 258 * 259 * With 3 or more missing vdevs, the pool is severely damaged and MOS entries 260 * may be missing entirely. Chances of data recovery are very low. Note that 261 * there are also risks of performing an inadvertent rewind as we might be 262 * missing all the vdevs with the latest uberblocks. 263 */ 264 uint64_t zfs_max_missing_tvds = 0; 265 266 /* 267 * The parameters below are similar to zfs_max_missing_tvds but are only 268 * intended for a preliminary open of the pool with an untrusted config which 269 * might be incomplete or out-dated. 270 * 271 * We are more tolerant for pools opened from a cachefile since we could have 272 * an out-dated cachefile where a device removal was not registered. 273 * We could have set the limit arbitrarily high but in the case where devices 274 * are really missing we would want to return the proper error codes; we chose 275 * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available 276 * and we get a chance to retrieve the trusted config. 277 */ 278 uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1; 279 280 /* 281 * In the case where config was assembled by scanning device paths (/dev/dsks 282 * by default) we are less tolerant since all the existing devices should have 283 * been detected and we want spa_load to return the right error codes. 
284 */ 285 uint64_t zfs_max_missing_tvds_scan = 0; 286 287 /* 288 * Debugging aid that pauses spa_sync() towards the end. 289 */ 290 static const boolean_t zfs_pause_spa_sync = B_FALSE; 291 292 /* 293 * Variables to indicate the livelist condense zthr func should wait at certain 294 * points for the livelist to be removed - used to test condense/destroy races 295 */ 296 static int zfs_livelist_condense_zthr_pause = 0; 297 static int zfs_livelist_condense_sync_pause = 0; 298 299 /* 300 * Variables to track whether or not condense cancellation has been 301 * triggered in testing. 302 */ 303 static int zfs_livelist_condense_sync_cancel = 0; 304 static int zfs_livelist_condense_zthr_cancel = 0; 305 306 /* 307 * Variable to track whether or not extra ALLOC blkptrs were added to a 308 * livelist entry while it was being condensed (caused by the way we track 309 * remapped blkptrs in dbuf_remap_impl) 310 */ 311 static int zfs_livelist_condense_new_alloc = 0; 312 313 /* 314 * ========================================================================== 315 * SPA properties routines 316 * ========================================================================== 317 */ 318 319 /* 320 * Add a (source=src, propname=propval) list to an nvlist. 321 */ 322 static void 323 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, const char *strval, 324 uint64_t intval, zprop_source_t src) 325 { 326 const char *propname = zpool_prop_to_name(prop); 327 nvlist_t *propval; 328 329 propval = fnvlist_alloc(); 330 fnvlist_add_uint64(propval, ZPROP_SOURCE, src); 331 332 if (strval != NULL) 333 fnvlist_add_string(propval, ZPROP_VALUE, strval); 334 else 335 fnvlist_add_uint64(propval, ZPROP_VALUE, intval); 336 337 fnvlist_add_nvlist(nvl, propname, propval); 338 nvlist_free(propval); 339 } 340 341 static int 342 spa_prop_add(spa_t *spa, const char *propname, nvlist_t *outnvl) 343 { 344 zpool_prop_t prop = zpool_name_to_prop(propname); 345 zprop_source_t src = ZPROP_SRC_NONE; 346 uint64_t intval; 347 int err; 348 349 /* 350 * NB: Not all properties lookups via this API require 351 * the spa props lock, so they must explicitly grab it here. 352 */ 353 switch (prop) { 354 case ZPOOL_PROP_DEDUPCACHED: 355 err = ddt_get_pool_dedup_cached(spa, &intval); 356 if (err != 0) 357 return (SET_ERROR(err)); 358 break; 359 default: 360 return (SET_ERROR(EINVAL)); 361 } 362 363 spa_prop_add_list(outnvl, prop, NULL, intval, src); 364 365 return (0); 366 } 367 368 int 369 spa_prop_get_nvlist(spa_t *spa, char **props, unsigned int n_props, 370 nvlist_t *outnvl) 371 { 372 int err = 0; 373 374 if (props == NULL) 375 return (0); 376 377 for (unsigned int i = 0; i < n_props && err == 0; i++) { 378 err = spa_prop_add(spa, props[i], outnvl); 379 } 380 381 return (err); 382 } 383 384 /* 385 * Add a user property (source=src, propname=propval) to an nvlist. 386 */ 387 static void 388 spa_prop_add_user(nvlist_t *nvl, const char *propname, char *strval, 389 zprop_source_t src) 390 { 391 nvlist_t *propval; 392 393 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 394 VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); 395 VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); 396 VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); 397 nvlist_free(propval); 398 } 399 400 /* 401 * Get property values from the spa configuration. 
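 * The caller must hold spa_props_lock; each value is appended to the
 * caller-supplied nvlist via spa_prop_add_list().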
402 */ 403 static void 404 spa_prop_get_config(spa_t *spa, nvlist_t *nv) 405 { 406 vdev_t *rvd = spa->spa_root_vdev; 407 dsl_pool_t *pool = spa->spa_dsl_pool; 408 uint64_t size, alloc, cap, version; 409 const zprop_source_t src = ZPROP_SRC_NONE; 410 spa_config_dirent_t *dp; 411 metaslab_class_t *mc = spa_normal_class(spa); 412 413 ASSERT(MUTEX_HELD(&spa->spa_props_lock)); 414 415 if (rvd != NULL) { 416 alloc = metaslab_class_get_alloc(mc); 417 alloc += metaslab_class_get_alloc(spa_special_class(spa)); 418 alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); 419 alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa)); 420 421 size = metaslab_class_get_space(mc); 422 size += metaslab_class_get_space(spa_special_class(spa)); 423 size += metaslab_class_get_space(spa_dedup_class(spa)); 424 size += metaslab_class_get_space(spa_embedded_log_class(spa)); 425 426 spa_prop_add_list(nv, ZPOOL_PROP_NAME, spa_name(spa), 0, src); 427 spa_prop_add_list(nv, ZPOOL_PROP_SIZE, NULL, size, src); 428 spa_prop_add_list(nv, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); 429 spa_prop_add_list(nv, ZPOOL_PROP_FREE, NULL, 430 size - alloc, src); 431 spa_prop_add_list(nv, ZPOOL_PROP_CHECKPOINT, NULL, 432 spa->spa_checkpoint_info.sci_dspace, src); 433 434 spa_prop_add_list(nv, ZPOOL_PROP_FRAGMENTATION, NULL, 435 metaslab_class_fragmentation(mc), src); 436 spa_prop_add_list(nv, ZPOOL_PROP_EXPANDSZ, NULL, 437 metaslab_class_expandable_space(mc), src); 438 spa_prop_add_list(nv, ZPOOL_PROP_READONLY, NULL, 439 (spa_mode(spa) == SPA_MODE_READ), src); 440 441 cap = (size == 0) ? 0 : (alloc * 100 / size); 442 spa_prop_add_list(nv, ZPOOL_PROP_CAPACITY, NULL, cap, src); 443 444 spa_prop_add_list(nv, ZPOOL_PROP_DEDUPRATIO, NULL, 445 ddt_get_pool_dedup_ratio(spa), src); 446 spa_prop_add_list(nv, ZPOOL_PROP_BCLONEUSED, NULL, 447 brt_get_used(spa), src); 448 spa_prop_add_list(nv, ZPOOL_PROP_BCLONESAVED, NULL, 449 brt_get_saved(spa), src); 450 spa_prop_add_list(nv, ZPOOL_PROP_BCLONERATIO, NULL, 451 brt_get_ratio(spa), src); 452 453 spa_prop_add_list(nv, ZPOOL_PROP_DEDUP_TABLE_SIZE, NULL, 454 ddt_get_ddt_dsize(spa), src); 455 spa_prop_add_list(nv, ZPOOL_PROP_HEALTH, NULL, 456 rvd->vdev_state, src); 457 spa_prop_add_list(nv, ZPOOL_PROP_LAST_SCRUBBED_TXG, NULL, 458 spa_get_last_scrubbed_txg(spa), src); 459 460 version = spa_version(spa); 461 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) { 462 spa_prop_add_list(nv, ZPOOL_PROP_VERSION, NULL, 463 version, ZPROP_SRC_DEFAULT); 464 } else { 465 spa_prop_add_list(nv, ZPOOL_PROP_VERSION, NULL, 466 version, ZPROP_SRC_LOCAL); 467 } 468 spa_prop_add_list(nv, ZPOOL_PROP_LOAD_GUID, 469 NULL, spa_load_guid(spa), src); 470 } 471 472 if (pool != NULL) { 473 /* 474 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, 475 * when opening pools before this version freedir will be NULL. 
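		 * In that case 0 is reported for the "freeing" property, and
		 * "leaked" is handled the same way below.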
476 */ 477 if (pool->dp_free_dir != NULL) { 478 spa_prop_add_list(nv, ZPOOL_PROP_FREEING, NULL, 479 dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes, 480 src); 481 } else { 482 spa_prop_add_list(nv, ZPOOL_PROP_FREEING, 483 NULL, 0, src); 484 } 485 486 if (pool->dp_leak_dir != NULL) { 487 spa_prop_add_list(nv, ZPOOL_PROP_LEAKED, NULL, 488 dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes, 489 src); 490 } else { 491 spa_prop_add_list(nv, ZPOOL_PROP_LEAKED, 492 NULL, 0, src); 493 } 494 } 495 496 spa_prop_add_list(nv, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); 497 498 if (spa->spa_comment != NULL) { 499 spa_prop_add_list(nv, ZPOOL_PROP_COMMENT, spa->spa_comment, 500 0, ZPROP_SRC_LOCAL); 501 } 502 503 if (spa->spa_compatibility != NULL) { 504 spa_prop_add_list(nv, ZPOOL_PROP_COMPATIBILITY, 505 spa->spa_compatibility, 0, ZPROP_SRC_LOCAL); 506 } 507 508 if (spa->spa_root != NULL) 509 spa_prop_add_list(nv, ZPOOL_PROP_ALTROOT, spa->spa_root, 510 0, ZPROP_SRC_LOCAL); 511 512 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { 513 spa_prop_add_list(nv, ZPOOL_PROP_MAXBLOCKSIZE, NULL, 514 MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE); 515 } else { 516 spa_prop_add_list(nv, ZPOOL_PROP_MAXBLOCKSIZE, NULL, 517 SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); 518 } 519 520 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { 521 spa_prop_add_list(nv, ZPOOL_PROP_MAXDNODESIZE, NULL, 522 DNODE_MAX_SIZE, ZPROP_SRC_NONE); 523 } else { 524 spa_prop_add_list(nv, ZPOOL_PROP_MAXDNODESIZE, NULL, 525 DNODE_MIN_SIZE, ZPROP_SRC_NONE); 526 } 527 528 if ((dp = list_head(&spa->spa_config_list)) != NULL) { 529 if (dp->scd_path == NULL) { 530 spa_prop_add_list(nv, ZPOOL_PROP_CACHEFILE, 531 "none", 0, ZPROP_SRC_LOCAL); 532 } else if (strcmp(dp->scd_path, spa_config_path) != 0) { 533 spa_prop_add_list(nv, ZPOOL_PROP_CACHEFILE, 534 dp->scd_path, 0, ZPROP_SRC_LOCAL); 535 } 536 } 537 } 538 539 /* 540 * Get zpool property values. 541 */ 542 int 543 spa_prop_get(spa_t *spa, nvlist_t *nv) 544 { 545 objset_t *mos = spa->spa_meta_objset; 546 zap_cursor_t zc; 547 zap_attribute_t *za; 548 dsl_pool_t *dp; 549 int err = 0; 550 551 dp = spa_get_dsl(spa); 552 dsl_pool_config_enter(dp, FTAG); 553 za = zap_attribute_alloc(); 554 mutex_enter(&spa->spa_props_lock); 555 556 /* 557 * Get properties from the spa config. 558 */ 559 spa_prop_get_config(spa, nv); 560 561 /* If no pool property object, no more prop to get. */ 562 if (mos == NULL || spa->spa_pool_props_object == 0) 563 goto out; 564 565 /* 566 * Get properties from the MOS pool property object. 
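	 * Each entry is either a native pool property or a user property;
	 * an integer-valued bootfs is translated into a dataset name first.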
567 */ 568 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 569 (err = zap_cursor_retrieve(&zc, za)) == 0; 570 zap_cursor_advance(&zc)) { 571 uint64_t intval = 0; 572 char *strval = NULL; 573 zprop_source_t src = ZPROP_SRC_DEFAULT; 574 zpool_prop_t prop; 575 576 if ((prop = zpool_name_to_prop(za->za_name)) == 577 ZPOOL_PROP_INVAL && !zfs_prop_user(za->za_name)) 578 continue; 579 580 switch (za->za_integer_length) { 581 case 8: 582 /* integer property */ 583 if (za->za_first_integer != 584 zpool_prop_default_numeric(prop)) 585 src = ZPROP_SRC_LOCAL; 586 587 if (prop == ZPOOL_PROP_BOOTFS) { 588 dsl_dataset_t *ds = NULL; 589 590 err = dsl_dataset_hold_obj(dp, 591 za->za_first_integer, FTAG, &ds); 592 if (err != 0) 593 break; 594 595 strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, 596 KM_SLEEP); 597 dsl_dataset_name(ds, strval); 598 dsl_dataset_rele(ds, FTAG); 599 } else { 600 strval = NULL; 601 intval = za->za_first_integer; 602 } 603 604 spa_prop_add_list(nv, prop, strval, intval, src); 605 606 if (strval != NULL) 607 kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN); 608 609 break; 610 611 case 1: 612 /* string property */ 613 strval = kmem_alloc(za->za_num_integers, KM_SLEEP); 614 err = zap_lookup(mos, spa->spa_pool_props_object, 615 za->za_name, 1, za->za_num_integers, strval); 616 if (err) { 617 kmem_free(strval, za->za_num_integers); 618 break; 619 } 620 if (prop != ZPOOL_PROP_INVAL) { 621 spa_prop_add_list(nv, prop, strval, 0, src); 622 } else { 623 src = ZPROP_SRC_LOCAL; 624 spa_prop_add_user(nv, za->za_name, strval, 625 src); 626 } 627 kmem_free(strval, za->za_num_integers); 628 break; 629 630 default: 631 break; 632 } 633 } 634 zap_cursor_fini(&zc); 635 out: 636 mutex_exit(&spa->spa_props_lock); 637 dsl_pool_config_exit(dp, FTAG); 638 zap_attribute_free(za); 639 640 if (err && err != ENOENT) 641 return (err); 642 643 return (0); 644 } 645 646 /* 647 * Validate the given pool properties nvlist and modify the list 648 * for the property values to be set. 649 */ 650 static int 651 spa_prop_validate(spa_t *spa, nvlist_t *props) 652 { 653 nvpair_t *elem; 654 int error = 0, reset_bootfs = 0; 655 uint64_t objnum = 0; 656 boolean_t has_feature = B_FALSE; 657 658 elem = NULL; 659 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 660 uint64_t intval; 661 const char *strval, *slash, *check, *fname; 662 const char *propname = nvpair_name(elem); 663 zpool_prop_t prop = zpool_name_to_prop(propname); 664 665 switch (prop) { 666 case ZPOOL_PROP_INVAL: 667 /* 668 * Sanitize the input. 
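			 * User property names and values must fit within the
			 * ZAP limits; feature@ properties must be a uint64
			 * with value 0 naming a known feature.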
669 */ 670 if (zfs_prop_user(propname)) { 671 if (strlen(propname) >= ZAP_MAXNAMELEN) { 672 error = SET_ERROR(ENAMETOOLONG); 673 break; 674 } 675 676 if (strlen(fnvpair_value_string(elem)) >= 677 ZAP_MAXVALUELEN) { 678 error = SET_ERROR(E2BIG); 679 break; 680 } 681 } else if (zpool_prop_feature(propname)) { 682 if (nvpair_type(elem) != DATA_TYPE_UINT64) { 683 error = SET_ERROR(EINVAL); 684 break; 685 } 686 687 if (nvpair_value_uint64(elem, &intval) != 0) { 688 error = SET_ERROR(EINVAL); 689 break; 690 } 691 692 if (intval != 0) { 693 error = SET_ERROR(EINVAL); 694 break; 695 } 696 697 fname = strchr(propname, '@') + 1; 698 if (zfeature_lookup_name(fname, NULL) != 0) { 699 error = SET_ERROR(EINVAL); 700 break; 701 } 702 703 has_feature = B_TRUE; 704 } else { 705 error = SET_ERROR(EINVAL); 706 break; 707 } 708 break; 709 710 case ZPOOL_PROP_VERSION: 711 error = nvpair_value_uint64(elem, &intval); 712 if (!error && 713 (intval < spa_version(spa) || 714 intval > SPA_VERSION_BEFORE_FEATURES || 715 has_feature)) 716 error = SET_ERROR(EINVAL); 717 break; 718 719 case ZPOOL_PROP_DEDUP_TABLE_QUOTA: 720 error = nvpair_value_uint64(elem, &intval); 721 break; 722 723 case ZPOOL_PROP_DELEGATION: 724 case ZPOOL_PROP_AUTOREPLACE: 725 case ZPOOL_PROP_LISTSNAPS: 726 case ZPOOL_PROP_AUTOEXPAND: 727 case ZPOOL_PROP_AUTOTRIM: 728 error = nvpair_value_uint64(elem, &intval); 729 if (!error && intval > 1) 730 error = SET_ERROR(EINVAL); 731 break; 732 733 case ZPOOL_PROP_MULTIHOST: 734 error = nvpair_value_uint64(elem, &intval); 735 if (!error && intval > 1) 736 error = SET_ERROR(EINVAL); 737 738 if (!error) { 739 uint32_t hostid = zone_get_hostid(NULL); 740 if (hostid) 741 spa->spa_hostid = hostid; 742 else 743 error = SET_ERROR(ENOTSUP); 744 } 745 746 break; 747 748 case ZPOOL_PROP_BOOTFS: 749 /* 750 * If the pool version is less than SPA_VERSION_BOOTFS, 751 * or the pool is still being created (version == 0), 752 * the bootfs property cannot be set. 753 */ 754 if (spa_version(spa) < SPA_VERSION_BOOTFS) { 755 error = SET_ERROR(ENOTSUP); 756 break; 757 } 758 759 /* 760 * Make sure the vdev config is bootable 761 */ 762 if (!vdev_is_bootable(spa->spa_root_vdev)) { 763 error = SET_ERROR(ENOTSUP); 764 break; 765 } 766 767 reset_bootfs = 1; 768 769 error = nvpair_value_string(elem, &strval); 770 771 if (!error) { 772 objset_t *os; 773 774 if (strval == NULL || strval[0] == '\0') { 775 objnum = zpool_prop_default_numeric( 776 ZPOOL_PROP_BOOTFS); 777 break; 778 } 779 780 error = dmu_objset_hold(strval, FTAG, &os); 781 if (error != 0) 782 break; 783 784 /* Must be ZPL. */ 785 if (dmu_objset_type(os) != DMU_OST_ZFS) { 786 error = SET_ERROR(ENOTSUP); 787 } else { 788 objnum = dmu_objset_id(os); 789 } 790 dmu_objset_rele(os, FTAG); 791 } 792 break; 793 794 case ZPOOL_PROP_FAILUREMODE: 795 error = nvpair_value_uint64(elem, &intval); 796 if (!error && intval > ZIO_FAILURE_MODE_PANIC) 797 error = SET_ERROR(EINVAL); 798 799 /* 800 * This is a special case which only occurs when 801 * the pool has completely failed. This allows 802 * the user to change the in-core failmode property 803 * without syncing it out to disk (I/Os might 804 * currently be blocked). We do this by returning 805 * EIO to the caller (spa_prop_set) to trick it 806 * into thinking we encountered a property validation 807 * error. 
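			 * The caller then skips the sync task, while the new
			 * failmode still takes effect immediately on the
			 * suspended pool.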
808 */ 809 if (!error && spa_suspended(spa)) { 810 spa->spa_failmode = intval; 811 error = SET_ERROR(EIO); 812 } 813 break; 814 815 case ZPOOL_PROP_CACHEFILE: 816 if ((error = nvpair_value_string(elem, &strval)) != 0) 817 break; 818 819 if (strval[0] == '\0') 820 break; 821 822 if (strcmp(strval, "none") == 0) 823 break; 824 825 if (strval[0] != '/') { 826 error = SET_ERROR(EINVAL); 827 break; 828 } 829 830 slash = strrchr(strval, '/'); 831 ASSERT(slash != NULL); 832 833 if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 834 strcmp(slash, "/..") == 0) 835 error = SET_ERROR(EINVAL); 836 break; 837 838 case ZPOOL_PROP_COMMENT: 839 if ((error = nvpair_value_string(elem, &strval)) != 0) 840 break; 841 for (check = strval; *check != '\0'; check++) { 842 if (!isprint(*check)) { 843 error = SET_ERROR(EINVAL); 844 break; 845 } 846 } 847 if (strlen(strval) > ZPROP_MAX_COMMENT) 848 error = SET_ERROR(E2BIG); 849 break; 850 851 default: 852 break; 853 } 854 855 if (error) 856 break; 857 } 858 859 (void) nvlist_remove_all(props, 860 zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO)); 861 862 if (!error && reset_bootfs) { 863 error = nvlist_remove(props, 864 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 865 866 if (!error) { 867 error = nvlist_add_uint64(props, 868 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 869 } 870 } 871 872 return (error); 873 } 874 875 void 876 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) 877 { 878 const char *cachefile; 879 spa_config_dirent_t *dp; 880 881 if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), 882 &cachefile) != 0) 883 return; 884 885 dp = kmem_alloc(sizeof (spa_config_dirent_t), 886 KM_SLEEP); 887 888 if (cachefile[0] == '\0') 889 dp->scd_path = spa_strdup(spa_config_path); 890 else if (strcmp(cachefile, "none") == 0) 891 dp->scd_path = NULL; 892 else 893 dp->scd_path = spa_strdup(cachefile); 894 895 list_insert_head(&spa->spa_config_list, dp); 896 if (need_sync) 897 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 898 } 899 900 int 901 spa_prop_set(spa_t *spa, nvlist_t *nvp) 902 { 903 int error; 904 nvpair_t *elem = NULL; 905 boolean_t need_sync = B_FALSE; 906 907 if ((error = spa_prop_validate(spa, nvp)) != 0) 908 return (error); 909 910 while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { 911 zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); 912 913 if (prop == ZPOOL_PROP_CACHEFILE || 914 prop == ZPOOL_PROP_ALTROOT || 915 prop == ZPOOL_PROP_READONLY) 916 continue; 917 918 if (prop == ZPOOL_PROP_INVAL && 919 zfs_prop_user(nvpair_name(elem))) { 920 need_sync = B_TRUE; 921 break; 922 } 923 924 if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) { 925 uint64_t ver = 0; 926 927 if (prop == ZPOOL_PROP_VERSION) { 928 VERIFY(nvpair_value_uint64(elem, &ver) == 0); 929 } else { 930 ASSERT(zpool_prop_feature(nvpair_name(elem))); 931 ver = SPA_VERSION_FEATURES; 932 need_sync = B_TRUE; 933 } 934 935 /* Save time if the version is already set. */ 936 if (ver == spa_version(spa)) 937 continue; 938 939 /* 940 * In addition to the pool directory object, we might 941 * create the pool properties object, the features for 942 * read object, the features for write object, or the 943 * feature descriptions object. 
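			 * The 6 passed to dsl_sync_task() below is the
			 * estimated number of blocks these updates may dirty,
			 * used for the space check.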
			 */
			error = dsl_sync_task(spa->spa_name, NULL,
			    spa_sync_version, &ver,
			    6, ZFS_SPACE_CHECK_RESERVED);
			if (error)
				return (error);
			continue;
		}

		need_sync = B_TRUE;
		break;
	}

	if (need_sync) {
		return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
		    nvp, 6, ZFS_SPACE_CHECK_RESERVED));
	}

	return (0);
}

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}

static int
spa_change_guid_check(void *arg, dmu_tx_t *tx)
{
	uint64_t *newguid __maybe_unused = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t vdev_state;

	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
		int error = (spa_has_checkpoint(spa)) ?
		    ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
		return (SET_ERROR(error));
	}

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	vdev_state = rvd->vdev_state;
	spa_config_exit(spa, SCL_STATE, FTAG);

	if (vdev_state != VDEV_STATE_HEALTHY)
		return (SET_ERROR(ENXIO));

	ASSERT3U(spa_guid(spa), !=, *newguid);

	return (0);
}

static void
spa_change_guid_sync(void *arg, dmu_tx_t *tx)
{
	uint64_t *newguid = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	uint64_t oldguid;
	vdev_t *rvd = spa->spa_root_vdev;

	oldguid = spa_guid(spa);

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	rvd->vdev_guid = *newguid;
	rvd->vdev_guid_sum += (*newguid - oldguid);
	vdev_config_dirty(rvd);
	spa_config_exit(spa, SCL_STATE, FTAG);

	spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
	    (u_longlong_t)oldguid, (u_longlong_t)*newguid);
}

/*
 * Change the GUID for the pool. This is done so that we can later
 * re-import a pool built from a clone of our own vdevs. We will modify
 * the root vdev's guid, our own pool guid, and then mark all of our
 * vdevs dirty. Note that we must make sure that all our vdevs are
 * online when we do this, or else any vdevs that weren't present
 * would be orphaned from our pool. We are also going to issue a
 * sysevent to update any watchers.
 *
 * The GUID of the pool will be changed to the value pointed to by guidp.
 * The GUID may not be set to the reserved value of 0.
 * The new GUID will be generated if guidp is NULL.
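 *
 * We hold spa_vdev_top_lock and spa_namespace_lock across the sync task so
 * the GUID cannot change concurrently with a top-level vdev operation or
 * another reguid.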
1037 */ 1038 int 1039 spa_change_guid(spa_t *spa, const uint64_t *guidp) 1040 { 1041 uint64_t guid; 1042 int error; 1043 1044 mutex_enter(&spa->spa_vdev_top_lock); 1045 mutex_enter(&spa_namespace_lock); 1046 1047 if (guidp != NULL) { 1048 guid = *guidp; 1049 if (guid == 0) { 1050 error = SET_ERROR(EINVAL); 1051 goto out; 1052 } 1053 1054 if (spa_guid_exists(guid, 0)) { 1055 error = SET_ERROR(EEXIST); 1056 goto out; 1057 } 1058 } else { 1059 guid = spa_generate_guid(NULL); 1060 } 1061 1062 error = dsl_sync_task(spa->spa_name, spa_change_guid_check, 1063 spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); 1064 1065 if (error == 0) { 1066 /* 1067 * Clear the kobj flag from all the vdevs to allow 1068 * vdev_cache_process_kobj_evt() to post events to all the 1069 * vdevs since GUID is updated. 1070 */ 1071 vdev_clear_kobj_evt(spa->spa_root_vdev); 1072 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 1073 vdev_clear_kobj_evt(spa->spa_l2cache.sav_vdevs[i]); 1074 1075 spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); 1076 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); 1077 } 1078 1079 out: 1080 mutex_exit(&spa_namespace_lock); 1081 mutex_exit(&spa->spa_vdev_top_lock); 1082 1083 return (error); 1084 } 1085 1086 /* 1087 * ========================================================================== 1088 * SPA state manipulation (open/create/destroy/import/export) 1089 * ========================================================================== 1090 */ 1091 1092 static int 1093 spa_error_entry_compare(const void *a, const void *b) 1094 { 1095 const spa_error_entry_t *sa = (const spa_error_entry_t *)a; 1096 const spa_error_entry_t *sb = (const spa_error_entry_t *)b; 1097 int ret; 1098 1099 ret = memcmp(&sa->se_bookmark, &sb->se_bookmark, 1100 sizeof (zbookmark_phys_t)); 1101 1102 return (TREE_ISIGN(ret)); 1103 } 1104 1105 /* 1106 * Utility function which retrieves copies of the current logs and 1107 * re-initializes them in the process. 1108 */ 1109 void 1110 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 1111 { 1112 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 1113 1114 memcpy(last, &spa->spa_errlist_last, sizeof (avl_tree_t)); 1115 memcpy(scrub, &spa->spa_errlist_scrub, sizeof (avl_tree_t)); 1116 1117 avl_create(&spa->spa_errlist_scrub, 1118 spa_error_entry_compare, sizeof (spa_error_entry_t), 1119 offsetof(spa_error_entry_t, se_avl)); 1120 avl_create(&spa->spa_errlist_last, 1121 spa_error_entry_compare, sizeof (spa_error_entry_t), 1122 offsetof(spa_error_entry_t, se_avl)); 1123 } 1124 1125 static void 1126 spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) 1127 { 1128 const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; 1129 enum zti_modes mode = ztip->zti_mode; 1130 uint_t value = ztip->zti_value; 1131 uint_t count = ztip->zti_count; 1132 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1133 uint_t cpus, flags = TASKQ_DYNAMIC; 1134 1135 switch (mode) { 1136 case ZTI_MODE_FIXED: 1137 ASSERT3U(value, >, 0); 1138 break; 1139 1140 case ZTI_MODE_SYNC: 1141 1142 /* 1143 * Create one wr_iss taskq for every 'zio_taskq_write_tpq' CPUs, 1144 * not to exceed the number of spa allocators, and align to it. 
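		 * "Align" means we prefer a count that divides
		 * spa_alloc_count evenly, so allocators map uniformly onto
		 * taskqs. For example, with the defaults in this file
		 * (zio_taskq_batch_pct = 80, zio_taskq_write_tpq = 16), a
		 * 32-CPU system gets cpus = 25 and count = 1, i.e. a single
		 * sync taskq sized at 80% of the online CPUs.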
		 */
		cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100);
		count = MAX(1, cpus / MAX(1, zio_taskq_write_tpq));
		count = MAX(count, (zio_taskq_batch_pct + 99) / 100);
		count = MIN(count, spa->spa_alloc_count);
		while (spa->spa_alloc_count % count != 0 &&
		    spa->spa_alloc_count < count * 2)
			count--;

		/*
		 * zio_taskq_batch_pct is unbounded and may exceed 100%, but no
		 * single taskq may have more threads than 100% of online cpus.
		 */
		value = (zio_taskq_batch_pct + count / 2) / count;
		value = MIN(value, 100);
		flags |= TASKQ_THREADS_CPU_PCT;
		break;

	case ZTI_MODE_SCALE:
		flags |= TASKQ_THREADS_CPU_PCT;
		/*
		 * We want more taskqs to reduce lock contention, but we want
		 * fewer for better request ordering and CPU utilization.
		 */
		cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100);
		if (zio_taskq_batch_tpq > 0) {
			count = MAX(1, (cpus + zio_taskq_batch_tpq / 2) /
			    zio_taskq_batch_tpq);
		} else {
			/*
			 * Prefer 6 threads per taskq, but no more taskqs
			 * than threads in them on large systems. For 80%:
			 *
			 *                        taskq     taskq     total
			 *   cpus     taskqs    percent   threads   threads
			 *  ------   -------    -------   -------   -------
			 *      1         1        80%         1         1
			 *      2         1        80%         1         1
			 *      4         1        80%         3         3
			 *      8         2        40%         3         6
			 *     16         3        27%         4        12
			 *     32         5        16%         5        25
			 *     64         7        11%         7        49
			 *    128        10         8%        10       100
			 *    256        14         6%        15       210
			 */
			count = 1 + cpus / 6;
			while (count * count > cpus)
				count--;
		}
		/* Limit each taskq within 100% to not trigger assertion. */
		count = MAX(count, (zio_taskq_batch_pct + 99) / 100);
		value = (zio_taskq_batch_pct + count / 2) / count;
		break;

	case ZTI_MODE_NULL:
		tqs->stqs_count = 0;
		tqs->stqs_taskq = NULL;
		return;

	default:
		panic("unrecognized mode for %s_%s taskq (%u:%u) in "
		    "spa_taskqs_init()",
		    zio_type_name[t], zio_taskq_types[q], mode, value);
		break;
	}

	ASSERT3U(count, >, 0);
	tqs->stqs_count = count;
	tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);

	for (uint_t i = 0; i < count; i++) {
		taskq_t *tq;
		char name[32];

		if (count > 1)
			(void) snprintf(name, sizeof (name), "%s_%s_%u",
			    zio_type_name[t], zio_taskq_types[q], i);
		else
			(void) snprintf(name, sizeof (name), "%s_%s",
			    zio_type_name[t], zio_taskq_types[q]);

#ifdef HAVE_SYSDC
		if (zio_taskq_sysdc && spa->spa_proc != &p0) {
			(void) zio_taskq_basedc;
			tq = taskq_create_sysdc(name, value, 50, INT_MAX,
			    spa->spa_proc, zio_taskq_basedc, flags);
		} else {
#endif
			/*
			 * The write issue taskq can be extremely CPU
			 * intensive. Run it at a slightly lower priority
			 * than the other taskqs.
			 */
			const pri_t pri = (t == ZIO_TYPE_WRITE &&
			    q == ZIO_TASKQ_ISSUE) ?
			    wtqclsyspri : maxclsyspri;
			tq = taskq_create_proc(name, value, pri, 50,
			    INT_MAX, spa->spa_proc, flags);
#ifdef HAVE_SYSDC
		}
#endif

		tqs->stqs_taskq[i] = tq;
	}
}

static void
spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];

	if (tqs->stqs_taskq == NULL) {
		ASSERT3U(tqs->stqs_count, ==, 0);
		return;
	}

	for (uint_t i = 0; i < tqs->stqs_count; i++) {
		ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
		taskq_destroy(tqs->stqs_taskq[i]);
	}

	kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
	tqs->stqs_taskq = NULL;
}

#ifdef _KERNEL
/*
 * The READ and WRITE rows of zio_taskqs are configurable at module load time
 * by setting zio_taskq_read or zio_taskq_write.
 *
 * Example (the defaults for READ and WRITE)
 *   zio_taskq_read='fixed,1,8 null scale null'
 *   zio_taskq_write='sync null scale null'
 *
 * Each sets the entire row at a time.
 *
 * 'fixed' is parameterised: fixed,Q,T where Q is number of taskqs, T is number
 * of threads per taskq.
 *
 * 'null' can only be set on the high-priority queues (queue selection for
 * high-priority queues will fall back to the regular queue if the high-pri
 * is NULL).
 */
static const char *const modes[ZTI_NMODES] = {
	"fixed", "scale", "sync", "null"
};

/* Parse the incoming config string. Modifies cfg */
static int
spa_taskq_param_set(zio_type_t t, char *cfg)
{
	int err = 0;

	zio_taskq_info_t row[ZIO_TASKQ_TYPES] = {{0}};

	char *next = cfg, *tok, *c;

	/*
	 * Parse out each element from the string and fill `row`. The entire
	 * row has to be set at once, so any errors are flagged by just
	 * breaking out of this loop early.
	 */
	uint_t q;
	for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
		/* `next` is the start of the config */
		if (next == NULL)
			break;

		/* Eat up leading space */
		while (isspace(*next))
			next++;
		if (*next == '\0')
			break;

		/* Mode ends at space or end of string */
		tok = next;
		next = strchr(tok, ' ');
		if (next != NULL) *next++ = '\0';

		/* Parameters start after a comma */
		c = strchr(tok, ',');
		if (c != NULL) *c++ = '\0';

		/* Match mode string */
		uint_t mode;
		for (mode = 0; mode < ZTI_NMODES; mode++)
			if (strcmp(tok, modes[mode]) == 0)
				break;
		if (mode == ZTI_NMODES)
			break;

		/* Invalid canary */
		row[q].zti_mode = ZTI_NMODES;

		/* Per-mode setup */
		switch (mode) {

		/*
		 * FIXED is parameterised: number of queues, and number of
		 * threads per queue.
		 */
		case ZTI_MODE_FIXED: {
			/* No parameters?
*/ 1349 if (c == NULL || *c == '\0') 1350 break; 1351 1352 /* Find next parameter */ 1353 tok = c; 1354 c = strchr(tok, ','); 1355 if (c == NULL) 1356 break; 1357 1358 /* Take digits and convert */ 1359 unsigned long long nq; 1360 if (!(isdigit(*tok))) 1361 break; 1362 err = ddi_strtoull(tok, &tok, 10, &nq); 1363 /* Must succeed and also end at the next param sep */ 1364 if (err != 0 || tok != c) 1365 break; 1366 1367 /* Move past the comma */ 1368 tok++; 1369 /* Need another number */ 1370 if (!(isdigit(*tok))) 1371 break; 1372 /* Remember start to make sure we moved */ 1373 c = tok; 1374 1375 /* Take digits */ 1376 unsigned long long ntpq; 1377 err = ddi_strtoull(tok, &tok, 10, &ntpq); 1378 /* Must succeed, and moved forward */ 1379 if (err != 0 || tok == c || *tok != '\0') 1380 break; 1381 1382 /* 1383 * sanity; zero queues/threads make no sense, and 1384 * 16K is almost certainly more than anyone will ever 1385 * need and avoids silly numbers like UINT32_MAX 1386 */ 1387 if (nq == 0 || nq >= 16384 || 1388 ntpq == 0 || ntpq >= 16384) 1389 break; 1390 1391 const zio_taskq_info_t zti = ZTI_P(ntpq, nq); 1392 row[q] = zti; 1393 break; 1394 } 1395 1396 case ZTI_MODE_SCALE: { 1397 const zio_taskq_info_t zti = ZTI_SCALE; 1398 row[q] = zti; 1399 break; 1400 } 1401 1402 case ZTI_MODE_SYNC: { 1403 const zio_taskq_info_t zti = ZTI_SYNC; 1404 row[q] = zti; 1405 break; 1406 } 1407 1408 case ZTI_MODE_NULL: { 1409 /* 1410 * Can only null the high-priority queues; the general- 1411 * purpose ones have to exist. 1412 */ 1413 if (q != ZIO_TASKQ_ISSUE_HIGH && 1414 q != ZIO_TASKQ_INTERRUPT_HIGH) 1415 break; 1416 1417 const zio_taskq_info_t zti = ZTI_NULL; 1418 row[q] = zti; 1419 break; 1420 } 1421 1422 default: 1423 break; 1424 } 1425 1426 /* Ensure we set a mode */ 1427 if (row[q].zti_mode == ZTI_NMODES) 1428 break; 1429 } 1430 1431 /* Didn't get a full row, fail */ 1432 if (q < ZIO_TASKQ_TYPES) 1433 return (SET_ERROR(EINVAL)); 1434 1435 /* Eat trailing space */ 1436 if (next != NULL) 1437 while (isspace(*next)) 1438 next++; 1439 1440 /* If there's anything left over then fail */ 1441 if (next != NULL && *next != '\0') 1442 return (SET_ERROR(EINVAL)); 1443 1444 /* Success! 
Copy it into the real config */
	for (q = 0; q < ZIO_TASKQ_TYPES; q++)
		zio_taskqs[t][q] = row[q];

	return (0);
}

static int
spa_taskq_param_get(zio_type_t t, char *buf, boolean_t add_newline)
{
	int pos = 0;

	/* Build parameter string from live config */
	const char *sep = "";
	for (uint_t q = 0; q < ZIO_TASKQ_TYPES; q++) {
		const zio_taskq_info_t *zti = &zio_taskqs[t][q];
		if (zti->zti_mode == ZTI_MODE_FIXED)
			pos += sprintf(&buf[pos], "%s%s,%u,%u", sep,
			    modes[zti->zti_mode], zti->zti_count,
			    zti->zti_value);
		else
			pos += sprintf(&buf[pos], "%s%s", sep,
			    modes[zti->zti_mode]);
		sep = " ";
	}

	if (add_newline)
		buf[pos++] = '\n';
	buf[pos] = '\0';

	return (pos);
}

#ifdef __linux__
static int
spa_taskq_read_param_set(const char *val, zfs_kernel_param_t *kp)
{
	char *cfg = kmem_strdup(val);
	int err = spa_taskq_param_set(ZIO_TYPE_READ, cfg);
	kmem_free(cfg, strlen(val)+1);
	return (-err);
}
static int
spa_taskq_read_param_get(char *buf, zfs_kernel_param_t *kp)
{
	return (spa_taskq_param_get(ZIO_TYPE_READ, buf, TRUE));
}

static int
spa_taskq_write_param_set(const char *val, zfs_kernel_param_t *kp)
{
	char *cfg = kmem_strdup(val);
	int err = spa_taskq_param_set(ZIO_TYPE_WRITE, cfg);
	kmem_free(cfg, strlen(val)+1);
	return (-err);
}
static int
spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp)
{
	return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf, TRUE));
}
#else
/*
 * On FreeBSD load-time parameters can be set up before malloc() is available,
 * so we have to do all the parsing work on the stack.
 */
#define	SPA_TASKQ_PARAM_MAX	(128)

static int
spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS)
{
	char buf[SPA_TASKQ_PARAM_MAX];
	int err;

	(void) spa_taskq_param_get(ZIO_TYPE_READ, buf, FALSE);
	err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
	if (err || req->newptr == NULL)
		return (err);
	return (spa_taskq_param_set(ZIO_TYPE_READ, buf));
}

static int
spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS)
{
	char buf[SPA_TASKQ_PARAM_MAX];
	int err;

	(void) spa_taskq_param_get(ZIO_TYPE_WRITE, buf, FALSE);
	err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
	if (err || req->newptr == NULL)
		return (err);
	return (spa_taskq_param_set(ZIO_TYPE_WRITE, buf));
}
#endif
#endif /* _KERNEL */

/*
 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
 * Note that a type may have multiple discrete taskqs to avoid lock contention
 * on the taskq itself.
 */
void
spa_taskq_dispatch(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
    task_func_t *func, zio_t *zio, boolean_t cutinline)
{
	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
	taskq_t *tq;

	ASSERT3P(tqs->stqs_taskq, !=, NULL);
	ASSERT3U(tqs->stqs_count, !=, 0);

	/*
	 * NB: We are assuming that the zio can only be dispatched
	 * to a single taskq at a time. It would be a grievous error
	 * to dispatch the zio to another taskq at the same time.
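	 * This is because the taskq entry (io_tqent) is embedded in the zio
	 * itself, which is why taskq_empty_ent() is asserted below.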
1559 */ 1560 ASSERT(zio); 1561 ASSERT(taskq_empty_ent(&zio->io_tqent)); 1562 1563 if (tqs->stqs_count == 1) { 1564 tq = tqs->stqs_taskq[0]; 1565 } else if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) && 1566 ZIO_HAS_ALLOCATOR(zio)) { 1567 tq = tqs->stqs_taskq[zio->io_allocator % tqs->stqs_count]; 1568 } else { 1569 tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; 1570 } 1571 1572 taskq_dispatch_ent(tq, func, zio, cutinline ? TQ_FRONT : 0, 1573 &zio->io_tqent); 1574 } 1575 1576 static void 1577 spa_create_zio_taskqs(spa_t *spa) 1578 { 1579 for (int t = 0; t < ZIO_TYPES; t++) { 1580 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 1581 spa_taskqs_init(spa, t, q); 1582 } 1583 } 1584 } 1585 1586 #if defined(_KERNEL) && defined(HAVE_SPA_THREAD) 1587 static void 1588 spa_thread(void *arg) 1589 { 1590 psetid_t zio_taskq_psrset_bind = PS_NONE; 1591 callb_cpr_t cprinfo; 1592 1593 spa_t *spa = arg; 1594 user_t *pu = PTOU(curproc); 1595 1596 CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, 1597 spa->spa_name); 1598 1599 ASSERT(curproc != &p0); 1600 (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), 1601 "zpool-%s", spa->spa_name); 1602 (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); 1603 1604 /* bind this thread to the requested psrset */ 1605 if (zio_taskq_psrset_bind != PS_NONE) { 1606 pool_lock(); 1607 mutex_enter(&cpu_lock); 1608 mutex_enter(&pidlock); 1609 mutex_enter(&curproc->p_lock); 1610 1611 if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, 1612 0, NULL, NULL) == 0) { 1613 curthread->t_bind_pset = zio_taskq_psrset_bind; 1614 } else { 1615 cmn_err(CE_WARN, 1616 "Couldn't bind process for zfs pool \"%s\" to " 1617 "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); 1618 } 1619 1620 mutex_exit(&curproc->p_lock); 1621 mutex_exit(&pidlock); 1622 mutex_exit(&cpu_lock); 1623 pool_unlock(); 1624 } 1625 1626 #ifdef HAVE_SYSDC 1627 if (zio_taskq_sysdc) { 1628 sysdc_thread_enter(curthread, 100, 0); 1629 } 1630 #endif 1631 1632 spa->spa_proc = curproc; 1633 spa->spa_did = curthread->t_did; 1634 1635 spa_create_zio_taskqs(spa); 1636 1637 mutex_enter(&spa->spa_proc_lock); 1638 ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); 1639 1640 spa->spa_proc_state = SPA_PROC_ACTIVE; 1641 cv_broadcast(&spa->spa_proc_cv); 1642 1643 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1644 while (spa->spa_proc_state == SPA_PROC_ACTIVE) 1645 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1646 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); 1647 1648 ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); 1649 spa->spa_proc_state = SPA_PROC_GONE; 1650 spa->spa_proc = &p0; 1651 cv_broadcast(&spa->spa_proc_cv); 1652 CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ 1653 1654 mutex_enter(&curproc->p_lock); 1655 lwp_exit(); 1656 } 1657 #endif 1658 1659 extern metaslab_ops_t *metaslab_allocator(spa_t *spa); 1660 1661 /* 1662 * Activate an uninitialized pool. 
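 * This sets up the metaslab classes, the per-pool taskqs, the dirty and
 * error lists, and the keystore; spa_deactivate() tears all of this down.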
1663 */ 1664 static void 1665 spa_activate(spa_t *spa, spa_mode_t mode) 1666 { 1667 metaslab_ops_t *msp = metaslab_allocator(spa); 1668 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 1669 1670 spa->spa_state = POOL_STATE_ACTIVE; 1671 spa->spa_final_txg = UINT64_MAX; 1672 spa->spa_mode = mode; 1673 spa->spa_read_spacemaps = spa_mode_readable_spacemaps; 1674 1675 spa->spa_normal_class = metaslab_class_create(spa, "normal", 1676 msp, B_FALSE); 1677 spa->spa_log_class = metaslab_class_create(spa, "log", msp, B_TRUE); 1678 spa->spa_embedded_log_class = metaslab_class_create(spa, 1679 "embedded_log", msp, B_TRUE); 1680 spa->spa_special_class = metaslab_class_create(spa, "special", 1681 msp, B_FALSE); 1682 spa->spa_dedup_class = metaslab_class_create(spa, "dedup", 1683 msp, B_FALSE); 1684 1685 /* Try to create a covering process */ 1686 mutex_enter(&spa->spa_proc_lock); 1687 ASSERT(spa->spa_proc_state == SPA_PROC_NONE); 1688 ASSERT(spa->spa_proc == &p0); 1689 spa->spa_did = 0; 1690 1691 #ifdef HAVE_SPA_THREAD 1692 /* Only create a process if we're going to be around a while. */ 1693 if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { 1694 if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, 1695 NULL, 0) == 0) { 1696 spa->spa_proc_state = SPA_PROC_CREATED; 1697 while (spa->spa_proc_state == SPA_PROC_CREATED) { 1698 cv_wait(&spa->spa_proc_cv, 1699 &spa->spa_proc_lock); 1700 } 1701 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1702 ASSERT(spa->spa_proc != &p0); 1703 ASSERT(spa->spa_did != 0); 1704 } else { 1705 #ifdef _KERNEL 1706 cmn_err(CE_WARN, 1707 "Couldn't create process for zfs pool \"%s\"\n", 1708 spa->spa_name); 1709 #endif 1710 } 1711 } 1712 #endif /* HAVE_SPA_THREAD */ 1713 mutex_exit(&spa->spa_proc_lock); 1714 1715 /* If we didn't create a process, we need to create our taskqs. */ 1716 if (spa->spa_proc == &p0) { 1717 spa_create_zio_taskqs(spa); 1718 } 1719 1720 for (size_t i = 0; i < TXG_SIZE; i++) { 1721 spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, 1722 ZIO_FLAG_CANFAIL); 1723 } 1724 1725 list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), 1726 offsetof(vdev_t, vdev_config_dirty_node)); 1727 list_create(&spa->spa_evicting_os_list, sizeof (objset_t), 1728 offsetof(objset_t, os_evicting_node)); 1729 list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), 1730 offsetof(vdev_t, vdev_state_dirty_node)); 1731 1732 txg_list_create(&spa->spa_vdev_txg_list, spa, 1733 offsetof(struct vdev, vdev_txg_node)); 1734 1735 avl_create(&spa->spa_errlist_scrub, 1736 spa_error_entry_compare, sizeof (spa_error_entry_t), 1737 offsetof(spa_error_entry_t, se_avl)); 1738 avl_create(&spa->spa_errlist_last, 1739 spa_error_entry_compare, sizeof (spa_error_entry_t), 1740 offsetof(spa_error_entry_t, se_avl)); 1741 avl_create(&spa->spa_errlist_healed, 1742 spa_error_entry_compare, sizeof (spa_error_entry_t), 1743 offsetof(spa_error_entry_t, se_avl)); 1744 1745 spa_activate_os(spa); 1746 1747 spa_keystore_init(&spa->spa_keystore); 1748 1749 /* 1750 * This taskq is used to perform zvol-minor-related tasks 1751 * asynchronously. This has several advantages, including easy 1752 * resolution of various deadlocks. 1753 * 1754 * The taskq must be single threaded to ensure tasks are always 1755 * processed in the order in which they were dispatched. 1756 * 1757 * A taskq per pool allows one to keep the pools independent. 1758 * This way if one pool is suspended, it will not impact another. 1759 * 1760 * The preferred location to dispatch a zvol minor task is a sync 1761 * task. 
In this context, there is easy access to the spa_t and minimal 1762 * error handling is required because the sync task must succeed. 1763 */ 1764 spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri, 1765 1, INT_MAX, 0); 1766 1767 /* 1768 * The taskq to preload metaslabs. 1769 */ 1770 spa->spa_metaslab_taskq = taskq_create("z_metaslab", 1771 metaslab_preload_pct, maxclsyspri, 1, INT_MAX, 1772 TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); 1773 1774 /* 1775 * Taskq dedicated to prefetcher threads: this is used to prevent the 1776 * pool traverse code from monopolizing the global (and limited) 1777 * system_taskq by inappropriately scheduling long running tasks on it. 1778 */ 1779 spa->spa_prefetch_taskq = taskq_create("z_prefetch", 100, 1780 defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); 1781 1782 /* 1783 * The taskq to upgrade datasets in this pool. Currently used by 1784 * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA. 1785 */ 1786 spa->spa_upgrade_taskq = taskq_create("z_upgrade", 100, 1787 defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); 1788 } 1789 1790 /* 1791 * Opposite of spa_activate(). 1792 */ 1793 static void 1794 spa_deactivate(spa_t *spa) 1795 { 1796 ASSERT(spa->spa_sync_on == B_FALSE); 1797 ASSERT(spa->spa_dsl_pool == NULL); 1798 ASSERT(spa->spa_root_vdev == NULL); 1799 ASSERT(spa->spa_async_zio_root == NULL); 1800 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 1801 1802 spa_evicting_os_wait(spa); 1803 1804 if (spa->spa_zvol_taskq) { 1805 taskq_destroy(spa->spa_zvol_taskq); 1806 spa->spa_zvol_taskq = NULL; 1807 } 1808 1809 if (spa->spa_metaslab_taskq) { 1810 taskq_destroy(spa->spa_metaslab_taskq); 1811 spa->spa_metaslab_taskq = NULL; 1812 } 1813 1814 if (spa->spa_prefetch_taskq) { 1815 taskq_destroy(spa->spa_prefetch_taskq); 1816 spa->spa_prefetch_taskq = NULL; 1817 } 1818 1819 if (spa->spa_upgrade_taskq) { 1820 taskq_destroy(spa->spa_upgrade_taskq); 1821 spa->spa_upgrade_taskq = NULL; 1822 } 1823 1824 txg_list_destroy(&spa->spa_vdev_txg_list); 1825 1826 list_destroy(&spa->spa_config_dirty_list); 1827 list_destroy(&spa->spa_evicting_os_list); 1828 list_destroy(&spa->spa_state_dirty_list); 1829 1830 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); 1831 1832 for (int t = 0; t < ZIO_TYPES; t++) { 1833 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 1834 spa_taskqs_fini(spa, t, q); 1835 } 1836 } 1837 1838 for (size_t i = 0; i < TXG_SIZE; i++) { 1839 ASSERT3P(spa->spa_txg_zio[i], !=, NULL); 1840 VERIFY0(zio_wait(spa->spa_txg_zio[i])); 1841 spa->spa_txg_zio[i] = NULL; 1842 } 1843 1844 metaslab_class_destroy(spa->spa_normal_class); 1845 spa->spa_normal_class = NULL; 1846 1847 metaslab_class_destroy(spa->spa_log_class); 1848 spa->spa_log_class = NULL; 1849 1850 metaslab_class_destroy(spa->spa_embedded_log_class); 1851 spa->spa_embedded_log_class = NULL; 1852 1853 metaslab_class_destroy(spa->spa_special_class); 1854 spa->spa_special_class = NULL; 1855 1856 metaslab_class_destroy(spa->spa_dedup_class); 1857 spa->spa_dedup_class = NULL; 1858 1859 /* 1860 * If this was part of an import or the open otherwise failed, we may 1861 * still have errors left in the queues. Empty them just in case. 
1862 */ 1863 spa_errlog_drain(spa); 1864 avl_destroy(&spa->spa_errlist_scrub); 1865 avl_destroy(&spa->spa_errlist_last); 1866 avl_destroy(&spa->spa_errlist_healed); 1867 1868 spa_keystore_fini(&spa->spa_keystore); 1869 1870 spa->spa_state = POOL_STATE_UNINITIALIZED; 1871 1872 mutex_enter(&spa->spa_proc_lock); 1873 if (spa->spa_proc_state != SPA_PROC_NONE) { 1874 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1875 spa->spa_proc_state = SPA_PROC_DEACTIVATE; 1876 cv_broadcast(&spa->spa_proc_cv); 1877 while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { 1878 ASSERT(spa->spa_proc != &p0); 1879 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1880 } 1881 ASSERT(spa->spa_proc_state == SPA_PROC_GONE); 1882 spa->spa_proc_state = SPA_PROC_NONE; 1883 } 1884 ASSERT(spa->spa_proc == &p0); 1885 mutex_exit(&spa->spa_proc_lock); 1886 1887 /* 1888 * We want to make sure spa_thread() has actually exited the ZFS 1889 * module, so that the module can't be unloaded out from underneath 1890 * it. 1891 */ 1892 if (spa->spa_did != 0) { 1893 thread_join(spa->spa_did); 1894 spa->spa_did = 0; 1895 } 1896 1897 spa_deactivate_os(spa); 1898 1899 } 1900 1901 /* 1902 * Verify a pool configuration, and construct the vdev tree appropriately. This 1903 * will create all the necessary vdevs in the appropriate layout, with each vdev 1904 * in the CLOSED state. This will prep the pool before open/creation/import. 1905 * All vdev validation is done by the vdev_alloc() routine. 1906 */ 1907 int 1908 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 1909 uint_t id, int atype) 1910 { 1911 nvlist_t **child; 1912 uint_t children; 1913 int error; 1914 1915 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 1916 return (error); 1917 1918 if ((*vdp)->vdev_ops->vdev_op_leaf) 1919 return (0); 1920 1921 error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 1922 &child, &children); 1923 1924 if (error == ENOENT) 1925 return (0); 1926 1927 if (error) { 1928 vdev_free(*vdp); 1929 *vdp = NULL; 1930 return (SET_ERROR(EINVAL)); 1931 } 1932 1933 for (int c = 0; c < children; c++) { 1934 vdev_t *vd; 1935 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 1936 atype)) != 0) { 1937 vdev_free(*vdp); 1938 *vdp = NULL; 1939 return (error); 1940 } 1941 } 1942 1943 ASSERT(*vdp != NULL); 1944 1945 return (0); 1946 } 1947 1948 static boolean_t 1949 spa_should_flush_logs_on_unload(spa_t *spa) 1950 { 1951 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) 1952 return (B_FALSE); 1953 1954 if (!spa_writeable(spa)) 1955 return (B_FALSE); 1956 1957 if (!spa->spa_sync_on) 1958 return (B_FALSE); 1959 1960 if (spa_state(spa) != POOL_STATE_EXPORTED) 1961 return (B_FALSE); 1962 1963 if (zfs_keep_log_spacemaps_at_export) 1964 return (B_FALSE); 1965 1966 return (B_TRUE); 1967 } 1968 1969 /* 1970 * Opens a transaction that will set the flag that will instruct 1971 * spa_sync to attempt to flush all the metaslabs for that txg. 
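 * It then waits for that txg to be synced, so the flushing has actually
 * happened before the unload proceeds.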
1972 */ 1973 static void 1974 spa_unload_log_sm_flush_all(spa_t *spa) 1975 { 1976 dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 1977 VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT | DMU_TX_SUSPEND)); 1978 1979 ASSERT3U(spa->spa_log_flushall_txg, ==, 0); 1980 spa->spa_log_flushall_txg = dmu_tx_get_txg(tx); 1981 1982 dmu_tx_commit(tx); 1983 txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg); 1984 } 1985 1986 static void 1987 spa_unload_log_sm_metadata(spa_t *spa) 1988 { 1989 void *cookie = NULL; 1990 spa_log_sm_t *sls; 1991 log_summary_entry_t *e; 1992 1993 while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg, 1994 &cookie)) != NULL) { 1995 VERIFY0(sls->sls_mscount); 1996 kmem_free(sls, sizeof (spa_log_sm_t)); 1997 } 1998 1999 while ((e = list_remove_head(&spa->spa_log_summary)) != NULL) { 2000 VERIFY0(e->lse_mscount); 2001 kmem_free(e, sizeof (log_summary_entry_t)); 2002 } 2003 2004 spa->spa_unflushed_stats.sus_nblocks = 0; 2005 spa->spa_unflushed_stats.sus_memused = 0; 2006 spa->spa_unflushed_stats.sus_blocklimit = 0; 2007 } 2008 2009 static void 2010 spa_destroy_aux_threads(spa_t *spa) 2011 { 2012 if (spa->spa_condense_zthr != NULL) { 2013 zthr_destroy(spa->spa_condense_zthr); 2014 spa->spa_condense_zthr = NULL; 2015 } 2016 if (spa->spa_checkpoint_discard_zthr != NULL) { 2017 zthr_destroy(spa->spa_checkpoint_discard_zthr); 2018 spa->spa_checkpoint_discard_zthr = NULL; 2019 } 2020 if (spa->spa_livelist_delete_zthr != NULL) { 2021 zthr_destroy(spa->spa_livelist_delete_zthr); 2022 spa->spa_livelist_delete_zthr = NULL; 2023 } 2024 if (spa->spa_livelist_condense_zthr != NULL) { 2025 zthr_destroy(spa->spa_livelist_condense_zthr); 2026 spa->spa_livelist_condense_zthr = NULL; 2027 } 2028 if (spa->spa_raidz_expand_zthr != NULL) { 2029 zthr_destroy(spa->spa_raidz_expand_zthr); 2030 spa->spa_raidz_expand_zthr = NULL; 2031 } 2032 } 2033 2034 /* 2035 * Opposite of spa_load(). 2036 */ 2037 static void 2038 spa_unload(spa_t *spa) 2039 { 2040 ASSERT(MUTEX_HELD(&spa_namespace_lock) || 2041 spa->spa_export_thread == curthread); 2042 ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED); 2043 2044 spa_import_progress_remove(spa_guid(spa)); 2045 spa_load_note(spa, "UNLOADING"); 2046 2047 spa_wake_waiters(spa); 2048 2049 /* 2050 * If we have set the spa_final_txg, we have already performed the 2051 * tasks below in spa_export_common(). We should not redo it here since 2052 * we delay the final TXGs beyond what spa_final_txg is set at. 2053 */ 2054 if (spa->spa_final_txg == UINT64_MAX) { 2055 /* 2056 * If the log space map feature is enabled and the pool is 2057 * getting exported (but not destroyed), we want to spend some 2058 * time flushing as many metaslabs as we can in an attempt to 2059 * destroy log space maps and save import time. 2060 */ 2061 if (spa_should_flush_logs_on_unload(spa)) 2062 spa_unload_log_sm_flush_all(spa); 2063 2064 /* 2065 * Stop async tasks. 2066 */ 2067 spa_async_suspend(spa); 2068 2069 if (spa->spa_root_vdev) { 2070 vdev_t *root_vdev = spa->spa_root_vdev; 2071 vdev_initialize_stop_all(root_vdev, 2072 VDEV_INITIALIZE_ACTIVE); 2073 vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE); 2074 vdev_autotrim_stop_all(spa); 2075 vdev_rebuild_stop_all(spa); 2076 l2arc_spa_rebuild_stop(spa); 2077 } 2078 2079 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2080 spa->spa_final_txg = spa_last_synced_txg(spa) + 2081 TXG_DEFER_SIZE + 1; 2082 spa_config_exit(spa, SCL_ALL, FTAG); 2083 } 2084 2085 /* 2086 * Stop syncing. 
2087 */ 2088 if (spa->spa_sync_on) { 2089 txg_sync_stop(spa->spa_dsl_pool); 2090 spa->spa_sync_on = B_FALSE; 2091 } 2092 2093 /* 2094 * This ensures that there is no async metaslab prefetching 2095 * while we attempt to unload the spa. 2096 */ 2097 taskq_wait(spa->spa_metaslab_taskq); 2098 2099 if (spa->spa_mmp.mmp_thread) 2100 mmp_thread_stop(spa); 2101 2102 /* 2103 * Wait for any outstanding async I/O to complete. 2104 */ 2105 if (spa->spa_async_zio_root != NULL) { 2106 for (int i = 0; i < max_ncpus; i++) 2107 (void) zio_wait(spa->spa_async_zio_root[i]); 2108 kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *)); 2109 spa->spa_async_zio_root = NULL; 2110 } 2111 2112 if (spa->spa_vdev_removal != NULL) { 2113 spa_vdev_removal_destroy(spa->spa_vdev_removal); 2114 spa->spa_vdev_removal = NULL; 2115 } 2116 2117 spa_destroy_aux_threads(spa); 2118 2119 spa_condense_fini(spa); 2120 2121 bpobj_close(&spa->spa_deferred_bpobj); 2122 2123 spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); 2124 2125 /* 2126 * Close all vdevs. 2127 */ 2128 if (spa->spa_root_vdev) 2129 vdev_free(spa->spa_root_vdev); 2130 ASSERT(spa->spa_root_vdev == NULL); 2131 2132 /* 2133 * Close the dsl pool. 2134 */ 2135 if (spa->spa_dsl_pool) { 2136 dsl_pool_close(spa->spa_dsl_pool); 2137 spa->spa_dsl_pool = NULL; 2138 spa->spa_meta_objset = NULL; 2139 } 2140 2141 ddt_unload(spa); 2142 brt_unload(spa); 2143 spa_unload_log_sm_metadata(spa); 2144 2145 /* 2146 * Drop and purge level 2 cache 2147 */ 2148 spa_l2cache_drop(spa); 2149 2150 if (spa->spa_spares.sav_vdevs) { 2151 for (int i = 0; i < spa->spa_spares.sav_count; i++) 2152 vdev_free(spa->spa_spares.sav_vdevs[i]); 2153 kmem_free(spa->spa_spares.sav_vdevs, 2154 spa->spa_spares.sav_count * sizeof (void *)); 2155 spa->spa_spares.sav_vdevs = NULL; 2156 } 2157 if (spa->spa_spares.sav_config) { 2158 nvlist_free(spa->spa_spares.sav_config); 2159 spa->spa_spares.sav_config = NULL; 2160 } 2161 spa->spa_spares.sav_count = 0; 2162 2163 if (spa->spa_l2cache.sav_vdevs) { 2164 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { 2165 vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); 2166 vdev_free(spa->spa_l2cache.sav_vdevs[i]); 2167 } 2168 kmem_free(spa->spa_l2cache.sav_vdevs, 2169 spa->spa_l2cache.sav_count * sizeof (void *)); 2170 spa->spa_l2cache.sav_vdevs = NULL; 2171 } 2172 if (spa->spa_l2cache.sav_config) { 2173 nvlist_free(spa->spa_l2cache.sav_config); 2174 spa->spa_l2cache.sav_config = NULL; 2175 } 2176 spa->spa_l2cache.sav_count = 0; 2177 2178 spa->spa_async_suspended = 0; 2179 2180 spa->spa_indirect_vdevs_loaded = B_FALSE; 2181 2182 if (spa->spa_comment != NULL) { 2183 spa_strfree(spa->spa_comment); 2184 spa->spa_comment = NULL; 2185 } 2186 if (spa->spa_compatibility != NULL) { 2187 spa_strfree(spa->spa_compatibility); 2188 spa->spa_compatibility = NULL; 2189 } 2190 2191 spa->spa_raidz_expand = NULL; 2192 spa->spa_checkpoint_txg = 0; 2193 2194 spa_config_exit(spa, SCL_ALL, spa); 2195 } 2196 2197 /* 2198 * Load (or re-load) the current list of vdevs describing the active spares for 2199 * this pool. When this is called, we have some form of basic information in 2200 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and 2201 * then re-generate a more complete list including status information. 
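 *
 * For illustration only (the field names here are examples; the real
 * entries come from the pool config): 'spa_spares.sav_config' holds an
 * nvlist array under ZPOOL_CONFIG_SPARES, each element being an ordinary
 * vdev config, roughly
 *
 *	spares[i] = { type="disk", path=..., guid=... }
 *
 * Each element is handed to spa_config_parse() with VDEV_ALLOC_SPARE and
 * the resulting vdev_t is stored in spa_spares.sav_vdevs[i].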
2202 */
2203 void
2204 spa_load_spares(spa_t *spa)
2205 {
2206 nvlist_t **spares;
2207 uint_t nspares;
2208 int i;
2209 vdev_t *vd, *tvd;
2210
2211 #ifndef _KERNEL
2212 /*
2213 * zdb opens both the current state of the pool and the
2214 * checkpointed state (if present), with a different spa_t.
2215 *
2216 * As spare vdevs are shared among open pools, we skip loading
2217 * them when we load the checkpointed state of the pool.
2218 */
2219 if (!spa_writeable(spa))
2220 return;
2221 #endif
2222
2223 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
2224
2225 /*
2226 * First, close and free any existing spare vdevs.
2227 */
2228 if (spa->spa_spares.sav_vdevs) {
2229 for (i = 0; i < spa->spa_spares.sav_count; i++) {
2230 vd = spa->spa_spares.sav_vdevs[i];
2231
2232 /* Undo the call to spa_activate() below */
2233 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
2234 B_FALSE)) != NULL && tvd->vdev_isspare)
2235 spa_spare_remove(tvd);
2236 vdev_close(vd);
2237 vdev_free(vd);
2238 }
2239
2240 kmem_free(spa->spa_spares.sav_vdevs,
2241 spa->spa_spares.sav_count * sizeof (void *));
2242 }
2243
2244 if (spa->spa_spares.sav_config == NULL)
2245 nspares = 0;
2246 else
2247 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
2248 ZPOOL_CONFIG_SPARES, &spares, &nspares));
2249
2250 spa->spa_spares.sav_count = (int)nspares;
2251 spa->spa_spares.sav_vdevs = NULL;
2252
2253 if (nspares == 0)
2254 return;
2255
2256 /*
2257 * Construct the array of vdevs, opening them to get status in the
2258 * process. For each spare, there are potentially two different vdev_t
2259 * structures associated with it: one in the list of spares (used only
2260 * for basic validation purposes) and one in the active vdev
2261 * configuration (if it's spared in). During this phase we open and
2262 * validate each vdev on the spare list. If the vdev also exists in the
2263 * active configuration, then we mark this vdev as an active spare.
2264 */
2265 spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *),
2266 KM_SLEEP);
2267 for (i = 0; i < spa->spa_spares.sav_count; i++) {
2268 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
2269 VDEV_ALLOC_SPARE) == 0);
2270 ASSERT(vd != NULL);
2271
2272 spa->spa_spares.sav_vdevs[i] = vd;
2273
2274 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
2275 B_FALSE)) != NULL) {
2276 if (!tvd->vdev_isspare)
2277 spa_spare_add(tvd);
2278
2279 /*
2280 * We only mark the spare active if we were successfully
2281 * able to load the vdev. Otherwise, importing a pool
2282 * with a bad active spare would result in strange
2283 * behavior, because multiple pools would think the spare
2284 * is actively in use.
2285 *
2286 * There is a vulnerability here to an equally bizarre
2287 * circumstance, where a dead active spare is later
2288 * brought back to life (onlined or otherwise). Given
2289 * the rarity of this scenario, and the extra complexity
2290 * it adds, we ignore the possibility.
2291 */
2292 if (!vdev_is_dead(tvd))
2293 spa_spare_activate(tvd);
2294 }
2295
2296 vd->vdev_top = vd;
2297 vd->vdev_aux = &spa->spa_spares;
2298
2299 if (vdev_open(vd) != 0)
2300 continue;
2301
2302 if (vdev_validate_aux(vd) == 0)
2303 spa_spare_add(vd);
2304 }
2305
2306 /*
2307 * Recompute the stashed list of spares, with status information
2308 * this time.
2309 */ 2310 fnvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES); 2311 2312 spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), 2313 KM_SLEEP); 2314 for (i = 0; i < spa->spa_spares.sav_count; i++) 2315 spares[i] = vdev_config_generate(spa, 2316 spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); 2317 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 2318 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, 2319 spa->spa_spares.sav_count); 2320 for (i = 0; i < spa->spa_spares.sav_count; i++) 2321 nvlist_free(spares[i]); 2322 kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); 2323 } 2324 2325 /* 2326 * Load (or re-load) the current list of vdevs describing the active l2cache for 2327 * this pool. When this is called, we have some form of basic information in 2328 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and 2329 * then re-generate a more complete list including status information. 2330 * Devices which are already active have their details maintained, and are 2331 * not re-opened. 2332 */ 2333 void 2334 spa_load_l2cache(spa_t *spa) 2335 { 2336 nvlist_t **l2cache = NULL; 2337 uint_t nl2cache; 2338 int i, j, oldnvdevs; 2339 uint64_t guid; 2340 vdev_t *vd, **oldvdevs, **newvdevs; 2341 spa_aux_vdev_t *sav = &spa->spa_l2cache; 2342 2343 #ifndef _KERNEL 2344 /* 2345 * zdb opens both the current state of the pool and the 2346 * checkpointed state (if present), with a different spa_t. 2347 * 2348 * As L2 caches are part of the ARC which is shared among open 2349 * pools, we skip loading them when we load the checkpointed 2350 * state of the pool. 2351 */ 2352 if (!spa_writeable(spa)) 2353 return; 2354 #endif 2355 2356 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 2357 2358 oldvdevs = sav->sav_vdevs; 2359 oldnvdevs = sav->sav_count; 2360 sav->sav_vdevs = NULL; 2361 sav->sav_count = 0; 2362 2363 if (sav->sav_config == NULL) { 2364 nl2cache = 0; 2365 newvdevs = NULL; 2366 goto out; 2367 } 2368 2369 VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, 2370 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); 2371 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 2372 2373 /* 2374 * Process new nvlist of vdevs. 2375 */ 2376 for (i = 0; i < nl2cache; i++) { 2377 guid = fnvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID); 2378 2379 newvdevs[i] = NULL; 2380 for (j = 0; j < oldnvdevs; j++) { 2381 vd = oldvdevs[j]; 2382 if (vd != NULL && guid == vd->vdev_guid) { 2383 /* 2384 * Retain previous vdev for add/remove ops. 2385 */ 2386 newvdevs[i] = vd; 2387 oldvdevs[j] = NULL; 2388 break; 2389 } 2390 } 2391 2392 if (newvdevs[i] == NULL) { 2393 /* 2394 * Create new vdev 2395 */ 2396 VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 2397 VDEV_ALLOC_L2CACHE) == 0); 2398 ASSERT(vd != NULL); 2399 newvdevs[i] = vd; 2400 2401 /* 2402 * Commit this vdev as an l2cache device, 2403 * even if it fails to open. 2404 */ 2405 spa_l2cache_add(vd); 2406 2407 vd->vdev_top = vd; 2408 vd->vdev_aux = sav; 2409 2410 spa_l2cache_activate(vd); 2411 2412 if (vdev_open(vd) != 0) 2413 continue; 2414 2415 (void) vdev_validate_aux(vd); 2416 2417 if (!vdev_is_dead(vd)) 2418 l2arc_add_vdev(spa, vd); 2419 2420 /* 2421 * Upon cache device addition to a pool or pool 2422 * creation with a cache device or if the header 2423 * of the device is invalid we issue an async 2424 * TRIM command for the whole device which will 2425 * execute if l2arc_trim_ahead > 0. 
2426 */ 2427 spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); 2428 } 2429 } 2430 2431 sav->sav_vdevs = newvdevs; 2432 sav->sav_count = (int)nl2cache; 2433 2434 /* 2435 * Recompute the stashed list of l2cache devices, with status 2436 * information this time. 2437 */ 2438 fnvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE); 2439 2440 if (sav->sav_count > 0) 2441 l2cache = kmem_alloc(sav->sav_count * sizeof (void *), 2442 KM_SLEEP); 2443 for (i = 0; i < sav->sav_count; i++) 2444 l2cache[i] = vdev_config_generate(spa, 2445 sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); 2446 fnvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 2447 (const nvlist_t * const *)l2cache, sav->sav_count); 2448 2449 out: 2450 /* 2451 * Purge vdevs that were dropped 2452 */ 2453 if (oldvdevs) { 2454 for (i = 0; i < oldnvdevs; i++) { 2455 uint64_t pool; 2456 2457 vd = oldvdevs[i]; 2458 if (vd != NULL) { 2459 ASSERT(vd->vdev_isl2cache); 2460 2461 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 2462 pool != 0ULL && l2arc_vdev_present(vd)) 2463 l2arc_remove_vdev(vd); 2464 vdev_clear_stats(vd); 2465 vdev_free(vd); 2466 } 2467 } 2468 2469 kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 2470 } 2471 2472 for (i = 0; i < sav->sav_count; i++) 2473 nvlist_free(l2cache[i]); 2474 if (sav->sav_count) 2475 kmem_free(l2cache, sav->sav_count * sizeof (void *)); 2476 } 2477 2478 static int 2479 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 2480 { 2481 dmu_buf_t *db; 2482 char *packed = NULL; 2483 size_t nvsize = 0; 2484 int error; 2485 *value = NULL; 2486 2487 error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db); 2488 if (error) 2489 return (error); 2490 2491 nvsize = *(uint64_t *)db->db_data; 2492 dmu_buf_rele(db, FTAG); 2493 2494 packed = vmem_alloc(nvsize, KM_SLEEP); 2495 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, 2496 DMU_READ_PREFETCH); 2497 if (error == 0) 2498 error = nvlist_unpack(packed, nvsize, value, 0); 2499 vmem_free(packed, nvsize); 2500 2501 return (error); 2502 } 2503 2504 /* 2505 * Concrete top-level vdevs that are not missing and are not logs. At every 2506 * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds. 2507 */ 2508 static uint64_t 2509 spa_healthy_core_tvds(spa_t *spa) 2510 { 2511 vdev_t *rvd = spa->spa_root_vdev; 2512 uint64_t tvds = 0; 2513 2514 for (uint64_t i = 0; i < rvd->vdev_children; i++) { 2515 vdev_t *vd = rvd->vdev_child[i]; 2516 if (vd->vdev_islog) 2517 continue; 2518 if (vdev_is_concrete(vd) && !vdev_is_dead(vd)) 2519 tvds++; 2520 } 2521 2522 return (tvds); 2523 } 2524 2525 /* 2526 * Checks to see if the given vdev could not be opened, in which case we post a 2527 * sysevent to notify the autoreplace code that the device has been removed. 2528 */ 2529 static void 2530 spa_check_removed(vdev_t *vd) 2531 { 2532 for (uint64_t c = 0; c < vd->vdev_children; c++) 2533 spa_check_removed(vd->vdev_child[c]); 2534 2535 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && 2536 vdev_is_concrete(vd)) { 2537 zfs_post_autoreplace(vd->vdev_spa, vd); 2538 spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK); 2539 } 2540 } 2541 2542 static int 2543 spa_check_for_missing_logs(spa_t *spa) 2544 { 2545 vdev_t *rvd = spa->spa_root_vdev; 2546 2547 /* 2548 * If we're doing a normal import, then build up any additional 2549 * diagnostic information about missing log devices. 2550 * We'll pass this up to the user for further processing. 
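 *
 * Sketch of the result (illustrative): each log top-level vdev that
 * failed to open is serialized with vdev_config_generate(...,
 * VDEV_CONFIG_MISSING), the array is attached to spa_load_info under
 * ZPOOL_CONFIG_MISSING_DEVICES, and the load then fails with ENXIO so
 * userland can report exactly which devices are absent.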
2551 */ 2552 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { 2553 nvlist_t **child, *nv; 2554 uint64_t idx = 0; 2555 2556 child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t *), 2557 KM_SLEEP); 2558 nv = fnvlist_alloc(); 2559 2560 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 2561 vdev_t *tvd = rvd->vdev_child[c]; 2562 2563 /* 2564 * We consider a device as missing only if it failed 2565 * to open (i.e. offline or faulted is not considered 2566 * as missing). 2567 */ 2568 if (tvd->vdev_islog && 2569 tvd->vdev_state == VDEV_STATE_CANT_OPEN) { 2570 child[idx++] = vdev_config_generate(spa, tvd, 2571 B_FALSE, VDEV_CONFIG_MISSING); 2572 } 2573 } 2574 2575 if (idx > 0) { 2576 fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 2577 (const nvlist_t * const *)child, idx); 2578 fnvlist_add_nvlist(spa->spa_load_info, 2579 ZPOOL_CONFIG_MISSING_DEVICES, nv); 2580 2581 for (uint64_t i = 0; i < idx; i++) 2582 nvlist_free(child[i]); 2583 } 2584 nvlist_free(nv); 2585 kmem_free(child, rvd->vdev_children * sizeof (char **)); 2586 2587 if (idx > 0) { 2588 spa_load_failed(spa, "some log devices are missing"); 2589 vdev_dbgmsg_print_tree(rvd, 2); 2590 return (SET_ERROR(ENXIO)); 2591 } 2592 } else { 2593 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 2594 vdev_t *tvd = rvd->vdev_child[c]; 2595 2596 if (tvd->vdev_islog && 2597 tvd->vdev_state == VDEV_STATE_CANT_OPEN) { 2598 spa_set_log_state(spa, SPA_LOG_CLEAR); 2599 spa_load_note(spa, "some log devices are " 2600 "missing, ZIL is dropped."); 2601 vdev_dbgmsg_print_tree(rvd, 2); 2602 break; 2603 } 2604 } 2605 } 2606 2607 return (0); 2608 } 2609 2610 /* 2611 * Check for missing log devices 2612 */ 2613 static boolean_t 2614 spa_check_logs(spa_t *spa) 2615 { 2616 boolean_t rv = B_FALSE; 2617 dsl_pool_t *dp = spa_get_dsl(spa); 2618 2619 switch (spa->spa_log_state) { 2620 default: 2621 break; 2622 case SPA_LOG_MISSING: 2623 /* need to recheck in case slog has been restored */ 2624 case SPA_LOG_UNKNOWN: 2625 rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 2626 zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0); 2627 if (rv) 2628 spa_set_log_state(spa, SPA_LOG_MISSING); 2629 break; 2630 } 2631 return (rv); 2632 } 2633 2634 /* 2635 * Passivate any log vdevs (note, does not apply to embedded log metaslabs). 2636 */ 2637 static boolean_t 2638 spa_passivate_log(spa_t *spa) 2639 { 2640 vdev_t *rvd = spa->spa_root_vdev; 2641 boolean_t slog_found = B_FALSE; 2642 2643 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 2644 2645 for (int c = 0; c < rvd->vdev_children; c++) { 2646 vdev_t *tvd = rvd->vdev_child[c]; 2647 2648 if (tvd->vdev_islog) { 2649 ASSERT3P(tvd->vdev_log_mg, ==, NULL); 2650 metaslab_group_passivate(tvd->vdev_mg); 2651 slog_found = B_TRUE; 2652 } 2653 } 2654 2655 return (slog_found); 2656 } 2657 2658 /* 2659 * Activate any log vdevs (note, does not apply to embedded log metaslabs). 
2660 */ 2661 static void 2662 spa_activate_log(spa_t *spa) 2663 { 2664 vdev_t *rvd = spa->spa_root_vdev; 2665 2666 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 2667 2668 for (int c = 0; c < rvd->vdev_children; c++) { 2669 vdev_t *tvd = rvd->vdev_child[c]; 2670 2671 if (tvd->vdev_islog) { 2672 ASSERT3P(tvd->vdev_log_mg, ==, NULL); 2673 metaslab_group_activate(tvd->vdev_mg); 2674 } 2675 } 2676 } 2677 2678 int 2679 spa_reset_logs(spa_t *spa) 2680 { 2681 int error; 2682 2683 error = dmu_objset_find(spa_name(spa), zil_reset, 2684 NULL, DS_FIND_CHILDREN); 2685 if (error == 0) { 2686 /* 2687 * We successfully offlined the log device, sync out the 2688 * current txg so that the "stubby" block can be removed 2689 * by zil_sync(). 2690 */ 2691 txg_wait_synced(spa->spa_dsl_pool, 0); 2692 } 2693 return (error); 2694 } 2695 2696 static void 2697 spa_aux_check_removed(spa_aux_vdev_t *sav) 2698 { 2699 for (int i = 0; i < sav->sav_count; i++) 2700 spa_check_removed(sav->sav_vdevs[i]); 2701 } 2702 2703 void 2704 spa_claim_notify(zio_t *zio) 2705 { 2706 spa_t *spa = zio->io_spa; 2707 2708 if (zio->io_error) 2709 return; 2710 2711 mutex_enter(&spa->spa_props_lock); /* any mutex will do */ 2712 if (spa->spa_claim_max_txg < BP_GET_LOGICAL_BIRTH(zio->io_bp)) 2713 spa->spa_claim_max_txg = BP_GET_LOGICAL_BIRTH(zio->io_bp); 2714 mutex_exit(&spa->spa_props_lock); 2715 } 2716 2717 typedef struct spa_load_error { 2718 boolean_t sle_verify_data; 2719 uint64_t sle_meta_count; 2720 uint64_t sle_data_count; 2721 } spa_load_error_t; 2722 2723 static void 2724 spa_load_verify_done(zio_t *zio) 2725 { 2726 blkptr_t *bp = zio->io_bp; 2727 spa_load_error_t *sle = zio->io_private; 2728 dmu_object_type_t type = BP_GET_TYPE(bp); 2729 int error = zio->io_error; 2730 spa_t *spa = zio->io_spa; 2731 2732 abd_free(zio->io_abd); 2733 if (error) { 2734 if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && 2735 type != DMU_OT_INTENT_LOG) 2736 atomic_inc_64(&sle->sle_meta_count); 2737 else 2738 atomic_inc_64(&sle->sle_data_count); 2739 } 2740 2741 mutex_enter(&spa->spa_scrub_lock); 2742 spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp); 2743 cv_broadcast(&spa->spa_scrub_io_cv); 2744 mutex_exit(&spa->spa_scrub_lock); 2745 } 2746 2747 /* 2748 * Maximum number of inflight bytes is the log2 fraction of the arc size. 2749 * By default, we set it to 1/16th of the arc. 2750 */ 2751 static uint_t spa_load_verify_shift = 4; 2752 static int spa_load_verify_metadata = B_TRUE; 2753 static int spa_load_verify_data = B_TRUE; 2754 2755 static int 2756 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 2757 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 2758 { 2759 zio_t *rio = arg; 2760 spa_load_error_t *sle = rio->io_private; 2761 2762 (void) zilog, (void) dnp; 2763 2764 /* 2765 * Note: normally this routine will not be called if 2766 * spa_load_verify_metadata is not set. However, it may be useful 2767 * to manually set the flag after the traversal has begun. 2768 */ 2769 if (!spa_load_verify_metadata) 2770 return (0); 2771 2772 /* 2773 * Sanity check the block pointer in order to detect obvious damage 2774 * before using the contents in subsequent checks or in zio_read(). 2775 * When damaged consider it to be a metadata error since we cannot 2776 * trust the BP_GET_TYPE and BP_GET_LEVEL values. 
2777 */ 2778 if (zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { 2779 atomic_inc_64(&sle->sle_meta_count); 2780 return (0); 2781 } 2782 2783 if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || 2784 BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp)) 2785 return (0); 2786 2787 if (!BP_IS_METADATA(bp) && 2788 (!spa_load_verify_data || !sle->sle_verify_data)) 2789 return (0); 2790 2791 uint64_t maxinflight_bytes = 2792 arc_target_bytes() >> spa_load_verify_shift; 2793 size_t size = BP_GET_PSIZE(bp); 2794 2795 mutex_enter(&spa->spa_scrub_lock); 2796 while (spa->spa_load_verify_bytes >= maxinflight_bytes) 2797 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2798 spa->spa_load_verify_bytes += size; 2799 mutex_exit(&spa->spa_scrub_lock); 2800 2801 zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size, 2802 spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, 2803 ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | 2804 ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); 2805 return (0); 2806 } 2807 2808 static int 2809 verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) 2810 { 2811 (void) dp, (void) arg; 2812 2813 if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN) 2814 return (SET_ERROR(ENAMETOOLONG)); 2815 2816 return (0); 2817 } 2818 2819 static int 2820 spa_load_verify(spa_t *spa) 2821 { 2822 zio_t *rio; 2823 spa_load_error_t sle = { 0 }; 2824 zpool_load_policy_t policy; 2825 boolean_t verify_ok = B_FALSE; 2826 int error = 0; 2827 2828 zpool_get_load_policy(spa->spa_config, &policy); 2829 2830 if (policy.zlp_rewind & ZPOOL_NEVER_REWIND || 2831 policy.zlp_maxmeta == UINT64_MAX) 2832 return (0); 2833 2834 dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); 2835 error = dmu_objset_find_dp(spa->spa_dsl_pool, 2836 spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL, 2837 DS_FIND_CHILDREN); 2838 dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); 2839 if (error != 0) 2840 return (error); 2841 2842 /* 2843 * Verify data only if we are rewinding or error limit was set. 2844 * Otherwise nothing except dbgmsg care about it to waste time. 2845 */ 2846 sle.sle_verify_data = (policy.zlp_rewind & ZPOOL_REWIND_MASK) || 2847 (policy.zlp_maxdata < UINT64_MAX); 2848 2849 rio = zio_root(spa, NULL, &sle, 2850 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 2851 2852 if (spa_load_verify_metadata) { 2853 if (spa->spa_extreme_rewind) { 2854 spa_load_note(spa, "performing a complete scan of the " 2855 "pool since extreme rewind is on. 
This may take " 2856 "a very long time.\n (spa_load_verify_data=%u, " 2857 "spa_load_verify_metadata=%u)", 2858 spa_load_verify_data, spa_load_verify_metadata); 2859 } 2860 2861 error = traverse_pool(spa, spa->spa_verify_min_txg, 2862 TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | 2863 TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio); 2864 } 2865 2866 (void) zio_wait(rio); 2867 ASSERT0(spa->spa_load_verify_bytes); 2868 2869 spa->spa_load_meta_errors = sle.sle_meta_count; 2870 spa->spa_load_data_errors = sle.sle_data_count; 2871 2872 if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) { 2873 spa_load_note(spa, "spa_load_verify found %llu metadata errors " 2874 "and %llu data errors", (u_longlong_t)sle.sle_meta_count, 2875 (u_longlong_t)sle.sle_data_count); 2876 } 2877 2878 if (spa_load_verify_dryrun || 2879 (!error && sle.sle_meta_count <= policy.zlp_maxmeta && 2880 sle.sle_data_count <= policy.zlp_maxdata)) { 2881 int64_t loss = 0; 2882 2883 verify_ok = B_TRUE; 2884 spa->spa_load_txg = spa->spa_uberblock.ub_txg; 2885 spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; 2886 2887 loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; 2888 fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_TIME, 2889 spa->spa_load_txg_ts); 2890 fnvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME, 2891 loss); 2892 fnvlist_add_uint64(spa->spa_load_info, 2893 ZPOOL_CONFIG_LOAD_META_ERRORS, sle.sle_meta_count); 2894 fnvlist_add_uint64(spa->spa_load_info, 2895 ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count); 2896 } else { 2897 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; 2898 } 2899 2900 if (spa_load_verify_dryrun) 2901 return (0); 2902 2903 if (error) { 2904 if (error != ENXIO && error != EIO) 2905 error = SET_ERROR(EIO); 2906 return (error); 2907 } 2908 2909 return (verify_ok ? 0 : EIO); 2910 } 2911 2912 /* 2913 * Find a value in the pool props object. 2914 */ 2915 static void 2916 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 2917 { 2918 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 2919 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 2920 } 2921 2922 /* 2923 * Find a value in the pool directory object. 
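 *
 * Typical (illustrative) call during load, assuming one of the usual MOS
 * directory keys:
 *
 *	uint64_t obj;
 *	error = spa_dir_prop(spa, DMU_POOL_CONFIG, &obj, B_TRUE);
 *
 * A missing key returns ENOENT; 'log_enoent' controls whether that case
 * is logged through spa_load_failed() or silently passed back.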
2924 */ 2925 static int 2926 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent) 2927 { 2928 int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2929 name, sizeof (uint64_t), 1, val); 2930 2931 if (error != 0 && (error != ENOENT || log_enoent)) { 2932 spa_load_failed(spa, "couldn't get '%s' value in MOS directory " 2933 "[error=%d]", name, error); 2934 } 2935 2936 return (error); 2937 } 2938 2939 static int 2940 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 2941 { 2942 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 2943 return (SET_ERROR(err)); 2944 } 2945 2946 boolean_t 2947 spa_livelist_delete_check(spa_t *spa) 2948 { 2949 return (spa->spa_livelists_to_delete != 0); 2950 } 2951 2952 static boolean_t 2953 spa_livelist_delete_cb_check(void *arg, zthr_t *z) 2954 { 2955 (void) z; 2956 spa_t *spa = arg; 2957 return (spa_livelist_delete_check(spa)); 2958 } 2959 2960 static int 2961 delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 2962 { 2963 spa_t *spa = arg; 2964 zio_free(spa, tx->tx_txg, bp); 2965 dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, 2966 -bp_get_dsize_sync(spa, bp), 2967 -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); 2968 return (0); 2969 } 2970 2971 static int 2972 dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp) 2973 { 2974 int err; 2975 zap_cursor_t zc; 2976 zap_attribute_t *za = zap_attribute_alloc(); 2977 zap_cursor_init(&zc, os, zap_obj); 2978 err = zap_cursor_retrieve(&zc, za); 2979 zap_cursor_fini(&zc); 2980 if (err == 0) 2981 *llp = za->za_first_integer; 2982 zap_attribute_free(za); 2983 return (err); 2984 } 2985 2986 /* 2987 * Components of livelist deletion that must be performed in syncing 2988 * context: freeing block pointers and updating the pool-wide data 2989 * structures to indicate how much work is left to do 2990 */ 2991 typedef struct sublist_delete_arg { 2992 spa_t *spa; 2993 dsl_deadlist_t *ll; 2994 uint64_t key; 2995 bplist_t *to_free; 2996 } sublist_delete_arg_t; 2997 2998 static void 2999 sublist_delete_sync(void *arg, dmu_tx_t *tx) 3000 { 3001 sublist_delete_arg_t *sda = arg; 3002 spa_t *spa = sda->spa; 3003 dsl_deadlist_t *ll = sda->ll; 3004 uint64_t key = sda->key; 3005 bplist_t *to_free = sda->to_free; 3006 3007 bplist_iterate(to_free, delete_blkptr_cb, spa, tx); 3008 dsl_deadlist_remove_entry(ll, key, tx); 3009 } 3010 3011 typedef struct livelist_delete_arg { 3012 spa_t *spa; 3013 uint64_t ll_obj; 3014 uint64_t zap_obj; 3015 } livelist_delete_arg_t; 3016 3017 static void 3018 livelist_delete_sync(void *arg, dmu_tx_t *tx) 3019 { 3020 livelist_delete_arg_t *lda = arg; 3021 spa_t *spa = lda->spa; 3022 uint64_t ll_obj = lda->ll_obj; 3023 uint64_t zap_obj = lda->zap_obj; 3024 objset_t *mos = spa->spa_meta_objset; 3025 uint64_t count; 3026 3027 /* free the livelist and decrement the feature count */ 3028 VERIFY0(zap_remove_int(mos, zap_obj, ll_obj, tx)); 3029 dsl_deadlist_free(mos, ll_obj, tx); 3030 spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx); 3031 VERIFY0(zap_count(mos, zap_obj, &count)); 3032 if (count == 0) { 3033 /* no more livelists to delete */ 3034 VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, 3035 DMU_POOL_DELETED_CLONES, tx)); 3036 VERIFY0(zap_destroy(mos, zap_obj, tx)); 3037 spa->spa_livelists_to_delete = 0; 3038 spa_notify_waiters(spa); 3039 } 3040 } 3041 3042 /* 3043 * Load in the value for the livelist to be removed and open it. 
Then, 3044 * load its first sublist and determine which block pointers should actually 3045 * be freed. Then, call a synctask which performs the actual frees and updates 3046 * the pool-wide livelist data. 3047 */ 3048 static void 3049 spa_livelist_delete_cb(void *arg, zthr_t *z) 3050 { 3051 spa_t *spa = arg; 3052 uint64_t ll_obj = 0, count; 3053 objset_t *mos = spa->spa_meta_objset; 3054 uint64_t zap_obj = spa->spa_livelists_to_delete; 3055 /* 3056 * Determine the next livelist to delete. This function should only 3057 * be called if there is at least one deleted clone. 3058 */ 3059 VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj)); 3060 VERIFY0(zap_count(mos, ll_obj, &count)); 3061 if (count > 0) { 3062 dsl_deadlist_t *ll; 3063 dsl_deadlist_entry_t *dle; 3064 bplist_t to_free; 3065 ll = kmem_zalloc(sizeof (dsl_deadlist_t), KM_SLEEP); 3066 VERIFY0(dsl_deadlist_open(ll, mos, ll_obj)); 3067 dle = dsl_deadlist_first(ll); 3068 ASSERT3P(dle, !=, NULL); 3069 bplist_create(&to_free); 3070 int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free, 3071 z, NULL); 3072 if (err == 0) { 3073 sublist_delete_arg_t sync_arg = { 3074 .spa = spa, 3075 .ll = ll, 3076 .key = dle->dle_mintxg, 3077 .to_free = &to_free 3078 }; 3079 zfs_dbgmsg("deleting sublist (id %llu) from" 3080 " livelist %llu, %lld remaining", 3081 (u_longlong_t)dle->dle_bpobj.bpo_object, 3082 (u_longlong_t)ll_obj, (longlong_t)count - 1); 3083 VERIFY0(dsl_sync_task(spa_name(spa), NULL, 3084 sublist_delete_sync, &sync_arg, 0, 3085 ZFS_SPACE_CHECK_DESTROY)); 3086 } else { 3087 VERIFY3U(err, ==, EINTR); 3088 } 3089 bplist_clear(&to_free); 3090 bplist_destroy(&to_free); 3091 dsl_deadlist_close(ll); 3092 kmem_free(ll, sizeof (dsl_deadlist_t)); 3093 } else { 3094 livelist_delete_arg_t sync_arg = { 3095 .spa = spa, 3096 .ll_obj = ll_obj, 3097 .zap_obj = zap_obj 3098 }; 3099 zfs_dbgmsg("deletion of livelist %llu completed", 3100 (u_longlong_t)ll_obj); 3101 VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync, 3102 &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY)); 3103 } 3104 } 3105 3106 static void 3107 spa_start_livelist_destroy_thread(spa_t *spa) 3108 { 3109 ASSERT3P(spa->spa_livelist_delete_zthr, ==, NULL); 3110 spa->spa_livelist_delete_zthr = 3111 zthr_create("z_livelist_destroy", 3112 spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa, 3113 minclsyspri); 3114 } 3115 3116 typedef struct livelist_new_arg { 3117 bplist_t *allocs; 3118 bplist_t *frees; 3119 } livelist_new_arg_t; 3120 3121 static int 3122 livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 3123 dmu_tx_t *tx) 3124 { 3125 ASSERT(tx == NULL); 3126 livelist_new_arg_t *lna = arg; 3127 if (bp_freed) { 3128 bplist_append(lna->frees, bp); 3129 } else { 3130 bplist_append(lna->allocs, bp); 3131 zfs_livelist_condense_new_alloc++; 3132 } 3133 return (0); 3134 } 3135 3136 typedef struct livelist_condense_arg { 3137 spa_t *spa; 3138 bplist_t to_keep; 3139 uint64_t first_size; 3140 uint64_t next_size; 3141 } livelist_condense_arg_t; 3142 3143 static void 3144 spa_livelist_condense_sync(void *arg, dmu_tx_t *tx) 3145 { 3146 livelist_condense_arg_t *lca = arg; 3147 spa_t *spa = lca->spa; 3148 bplist_t new_frees; 3149 dsl_dataset_t *ds = spa->spa_to_condense.ds; 3150 3151 /* Have we been cancelled? 
*/ 3152 if (spa->spa_to_condense.cancelled) { 3153 zfs_livelist_condense_sync_cancel++; 3154 goto out; 3155 } 3156 3157 dsl_deadlist_entry_t *first = spa->spa_to_condense.first; 3158 dsl_deadlist_entry_t *next = spa->spa_to_condense.next; 3159 dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist; 3160 3161 /* 3162 * It's possible that the livelist was changed while the zthr was 3163 * running. Therefore, we need to check for new blkptrs in the two 3164 * entries being condensed and continue to track them in the livelist. 3165 * Because of the way we handle remapped blkptrs (see dbuf_remap_impl), 3166 * it's possible that the newly added blkptrs are FREEs or ALLOCs so 3167 * we need to sort them into two different bplists. 3168 */ 3169 uint64_t first_obj = first->dle_bpobj.bpo_object; 3170 uint64_t next_obj = next->dle_bpobj.bpo_object; 3171 uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs; 3172 uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs; 3173 3174 bplist_create(&new_frees); 3175 livelist_new_arg_t new_bps = { 3176 .allocs = &lca->to_keep, 3177 .frees = &new_frees, 3178 }; 3179 3180 if (cur_first_size > lca->first_size) { 3181 VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj, 3182 livelist_track_new_cb, &new_bps, lca->first_size)); 3183 } 3184 if (cur_next_size > lca->next_size) { 3185 VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj, 3186 livelist_track_new_cb, &new_bps, lca->next_size)); 3187 } 3188 3189 dsl_deadlist_clear_entry(first, ll, tx); 3190 ASSERT(bpobj_is_empty(&first->dle_bpobj)); 3191 dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx); 3192 3193 bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx); 3194 bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx); 3195 bplist_destroy(&new_frees); 3196 3197 char dsname[ZFS_MAX_DATASET_NAME_LEN]; 3198 dsl_dataset_name(ds, dsname); 3199 zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu " 3200 "(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu " 3201 "(%llu blkptrs)", (u_longlong_t)tx->tx_txg, dsname, 3202 (u_longlong_t)ds->ds_object, (u_longlong_t)first_obj, 3203 (u_longlong_t)cur_first_size, (u_longlong_t)next_obj, 3204 (u_longlong_t)cur_next_size, 3205 (u_longlong_t)first->dle_bpobj.bpo_object, 3206 (u_longlong_t)first->dle_bpobj.bpo_phys->bpo_num_blkptrs); 3207 out: 3208 dmu_buf_rele(ds->ds_dbuf, spa); 3209 spa->spa_to_condense.ds = NULL; 3210 bplist_clear(&lca->to_keep); 3211 bplist_destroy(&lca->to_keep); 3212 kmem_free(lca, sizeof (livelist_condense_arg_t)); 3213 spa->spa_to_condense.syncing = B_FALSE; 3214 } 3215 3216 static void 3217 spa_livelist_condense_cb(void *arg, zthr_t *t) 3218 { 3219 while (zfs_livelist_condense_zthr_pause && 3220 !(zthr_has_waiters(t) || zthr_iscancelled(t))) 3221 delay(1); 3222 3223 spa_t *spa = arg; 3224 dsl_deadlist_entry_t *first = spa->spa_to_condense.first; 3225 dsl_deadlist_entry_t *next = spa->spa_to_condense.next; 3226 uint64_t first_size, next_size; 3227 3228 livelist_condense_arg_t *lca = 3229 kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP); 3230 bplist_create(&lca->to_keep); 3231 3232 /* 3233 * Process the livelists (matching FREEs and ALLOCs) in open context 3234 * so we have minimal work in syncing context to condense. 3235 * 3236 * We save bpobj sizes (first_size and next_size) to use later in 3237 * syncing context to determine if entries were added to these sublists 3238 * while in open context. 
This is possible because the clone is still
3239 * active and open for normal writes and we want to make sure the new,
3240 * unprocessed blockpointers are inserted into the livelist normally.
3241 *
3242 * Note that dsl_process_sub_livelist() both stores the size (number of
3243 * blockpointers) and iterates over them while the bpobj's lock is held,
3244 * so the sizes returned to us are consistent with what was actually
3245 * processed.
3246 */
3247 int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t,
3248 &first_size);
3249 if (err == 0)
3250 err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep,
3251 t, &next_size);
3252
3253 if (err == 0) {
3254 while (zfs_livelist_condense_sync_pause &&
3255 !(zthr_has_waiters(t) || zthr_iscancelled(t)))
3256 delay(1);
3257
3258 dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
3259 dmu_tx_mark_netfree(tx);
3260 dmu_tx_hold_space(tx, 1);
3261 err = dmu_tx_assign(tx, DMU_TX_NOWAIT | DMU_TX_NOTHROTTLE);
3262 if (err == 0) {
3263 /*
3264 * Prevent the condense zthr from restarting before
3265 * the synctask completes.
3266 */
3267 spa->spa_to_condense.syncing = B_TRUE;
3268 lca->spa = spa;
3269 lca->first_size = first_size;
3270 lca->next_size = next_size;
3271 dsl_sync_task_nowait(spa_get_dsl(spa),
3272 spa_livelist_condense_sync, lca, tx);
3273 dmu_tx_commit(tx);
3274 return;
3275 }
3276 }
3277 /*
3278 * Condensing cannot continue: either it was externally stopped or
3279 * we were unable to assign to a tx because the pool has run out of
3280 * space. In the second case, we'll just end up trying to condense
3281 * again in a later txg.
3282 */
3283 ASSERT(err != 0);
3284 bplist_clear(&lca->to_keep);
3285 bplist_destroy(&lca->to_keep);
3286 kmem_free(lca, sizeof (livelist_condense_arg_t));
3287 dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa);
3288 spa->spa_to_condense.ds = NULL;
3289 if (err == EINTR)
3290 zfs_livelist_condense_zthr_cancel++;
3291 }
3292
3293 /*
3294 * Check that there is something to condense but that a condense is not
3295 * already in progress and that condensing has not been cancelled.
3296 */ 3297 static boolean_t 3298 spa_livelist_condense_cb_check(void *arg, zthr_t *z) 3299 { 3300 (void) z; 3301 spa_t *spa = arg; 3302 if ((spa->spa_to_condense.ds != NULL) && 3303 (spa->spa_to_condense.syncing == B_FALSE) && 3304 (spa->spa_to_condense.cancelled == B_FALSE)) { 3305 return (B_TRUE); 3306 } 3307 return (B_FALSE); 3308 } 3309 3310 static void 3311 spa_start_livelist_condensing_thread(spa_t *spa) 3312 { 3313 spa->spa_to_condense.ds = NULL; 3314 spa->spa_to_condense.first = NULL; 3315 spa->spa_to_condense.next = NULL; 3316 spa->spa_to_condense.syncing = B_FALSE; 3317 spa->spa_to_condense.cancelled = B_FALSE; 3318 3319 ASSERT3P(spa->spa_livelist_condense_zthr, ==, NULL); 3320 spa->spa_livelist_condense_zthr = 3321 zthr_create("z_livelist_condense", 3322 spa_livelist_condense_cb_check, 3323 spa_livelist_condense_cb, spa, minclsyspri); 3324 } 3325 3326 static void 3327 spa_spawn_aux_threads(spa_t *spa) 3328 { 3329 ASSERT(spa_writeable(spa)); 3330 3331 spa_start_raidz_expansion_thread(spa); 3332 spa_start_indirect_condensing_thread(spa); 3333 spa_start_livelist_destroy_thread(spa); 3334 spa_start_livelist_condensing_thread(spa); 3335 3336 ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL); 3337 spa->spa_checkpoint_discard_zthr = 3338 zthr_create("z_checkpoint_discard", 3339 spa_checkpoint_discard_thread_check, 3340 spa_checkpoint_discard_thread, spa, minclsyspri); 3341 } 3342 3343 /* 3344 * Fix up config after a partly-completed split. This is done with the 3345 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 3346 * pool have that entry in their config, but only the splitting one contains 3347 * a list of all the guids of the vdevs that are being split off. 3348 * 3349 * This function determines what to do with that list: either rejoin 3350 * all the disks to the pool, or complete the splitting process. To attempt 3351 * the rejoin, each disk that is offlined is marked online again, and 3352 * we do a reopen() call. If the vdev label for every disk that was 3353 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 3354 * then we call vdev_split() on each disk, and complete the split. 3355 * 3356 * Otherwise we leave the config alone, with all the vdevs in place in 3357 * the original pool. 3358 */ 3359 static void 3360 spa_try_repair(spa_t *spa, nvlist_t *config) 3361 { 3362 uint_t extracted; 3363 uint64_t *glist; 3364 uint_t i, gcount; 3365 nvlist_t *nvl; 3366 vdev_t **vd; 3367 boolean_t attempt_reopen; 3368 3369 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 3370 return; 3371 3372 /* check that the config is complete */ 3373 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 3374 &glist, &gcount) != 0) 3375 return; 3376 3377 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 3378 3379 /* attempt to online all the vdevs & validate */ 3380 attempt_reopen = B_TRUE; 3381 for (i = 0; i < gcount; i++) { 3382 if (glist[i] == 0) /* vdev is hole */ 3383 continue; 3384 3385 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 3386 if (vd[i] == NULL) { 3387 /* 3388 * Don't bother attempting to reopen the disks; 3389 * just do the split. 
3390 */ 3391 attempt_reopen = B_FALSE; 3392 } else { 3393 /* attempt to re-online it */ 3394 vd[i]->vdev_offline = B_FALSE; 3395 } 3396 } 3397 3398 if (attempt_reopen) { 3399 vdev_reopen(spa->spa_root_vdev); 3400 3401 /* check each device to see what state it's in */ 3402 for (extracted = 0, i = 0; i < gcount; i++) { 3403 if (vd[i] != NULL && 3404 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 3405 break; 3406 ++extracted; 3407 } 3408 } 3409 3410 /* 3411 * If every disk has been moved to the new pool, or if we never 3412 * even attempted to look at them, then we split them off for 3413 * good. 3414 */ 3415 if (!attempt_reopen || gcount == extracted) { 3416 for (i = 0; i < gcount; i++) 3417 if (vd[i] != NULL) 3418 vdev_split(vd[i]); 3419 vdev_reopen(spa->spa_root_vdev); 3420 } 3421 3422 kmem_free(vd, gcount * sizeof (vdev_t *)); 3423 } 3424 3425 static int 3426 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) 3427 { 3428 const char *ereport = FM_EREPORT_ZFS_POOL; 3429 int error; 3430 3431 spa->spa_load_state = state; 3432 (void) spa_import_progress_set_state(spa_guid(spa), 3433 spa_load_state(spa)); 3434 spa_import_progress_set_notes(spa, "spa_load()"); 3435 3436 gethrestime(&spa->spa_loaded_ts); 3437 error = spa_load_impl(spa, type, &ereport); 3438 3439 /* 3440 * Don't count references from objsets that are already closed 3441 * and are making their way through the eviction process. 3442 */ 3443 spa_evicting_os_wait(spa); 3444 spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); 3445 if (error) { 3446 if (error != EEXIST) { 3447 spa->spa_loaded_ts.tv_sec = 0; 3448 spa->spa_loaded_ts.tv_nsec = 0; 3449 } 3450 if (error != EBADF) { 3451 (void) zfs_ereport_post(ereport, spa, 3452 NULL, NULL, NULL, 0); 3453 } 3454 } 3455 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 3456 spa->spa_ena = 0; 3457 3458 (void) spa_import_progress_set_state(spa_guid(spa), 3459 spa_load_state(spa)); 3460 3461 return (error); 3462 } 3463 3464 #ifdef ZFS_DEBUG 3465 /* 3466 * Count the number of per-vdev ZAPs associated with all of the vdevs in the 3467 * vdev tree rooted in the given vd, and ensure that each ZAP is present in the 3468 * spa's per-vdev ZAP list. 3469 */ 3470 static uint64_t 3471 vdev_count_verify_zaps(vdev_t *vd) 3472 { 3473 spa_t *spa = vd->vdev_spa; 3474 uint64_t total = 0; 3475 3476 if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2) && 3477 vd->vdev_root_zap != 0) { 3478 total++; 3479 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 3480 spa->spa_all_vdev_zaps, vd->vdev_root_zap)); 3481 } 3482 if (vd->vdev_top_zap != 0) { 3483 total++; 3484 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 3485 spa->spa_all_vdev_zaps, vd->vdev_top_zap)); 3486 } 3487 if (vd->vdev_leaf_zap != 0) { 3488 total++; 3489 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 3490 spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); 3491 } 3492 3493 for (uint64_t i = 0; i < vd->vdev_children; i++) { 3494 total += vdev_count_verify_zaps(vd->vdev_child[i]); 3495 } 3496 3497 return (total); 3498 } 3499 #else 3500 #define vdev_count_verify_zaps(vd) ((void) sizeof (vd), 0) 3501 #endif 3502 3503 /* 3504 * Determine whether the activity check is required. 
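 *
 * In short (see the individual checks below): the check is skipped for
 * zdb-style imports (ZFS_IMPORT_SKIP_MMP), when multihost is effectively
 * off (ub_mmp_delay == 0), when an earlier tryimport already observed an
 * unchanged uberblock, when the label hostid matches this host, and when
 * the pool was cleanly exported.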
3505 */ 3506 static boolean_t 3507 spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label, 3508 nvlist_t *config) 3509 { 3510 uint64_t state = 0; 3511 uint64_t hostid = 0; 3512 uint64_t tryconfig_txg = 0; 3513 uint64_t tryconfig_timestamp = 0; 3514 uint16_t tryconfig_mmp_seq = 0; 3515 nvlist_t *nvinfo; 3516 3517 if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { 3518 nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); 3519 (void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG, 3520 &tryconfig_txg); 3521 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 3522 &tryconfig_timestamp); 3523 (void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ, 3524 &tryconfig_mmp_seq); 3525 } 3526 3527 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state); 3528 3529 /* 3530 * Disable the MMP activity check - This is used by zdb which 3531 * is intended to be used on potentially active pools. 3532 */ 3533 if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) 3534 return (B_FALSE); 3535 3536 /* 3537 * Skip the activity check when the MMP feature is disabled. 3538 */ 3539 if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0) 3540 return (B_FALSE); 3541 3542 /* 3543 * If the tryconfig_ values are nonzero, they are the results of an 3544 * earlier tryimport. If they all match the uberblock we just found, 3545 * then the pool has not changed and we return false so we do not test 3546 * a second time. 3547 */ 3548 if (tryconfig_txg && tryconfig_txg == ub->ub_txg && 3549 tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp && 3550 tryconfig_mmp_seq && tryconfig_mmp_seq == 3551 (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) 3552 return (B_FALSE); 3553 3554 /* 3555 * Allow the activity check to be skipped when importing the pool 3556 * on the same host which last imported it. Since the hostid from 3557 * configuration may be stale use the one read from the label. 3558 */ 3559 if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID)) 3560 hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID); 3561 3562 if (hostid == spa_get_hostid(spa)) 3563 return (B_FALSE); 3564 3565 /* 3566 * Skip the activity test when the pool was cleanly exported. 3567 */ 3568 if (state != POOL_STATE_ACTIVE) 3569 return (B_FALSE); 3570 3571 return (B_TRUE); 3572 } 3573 3574 /* 3575 * Nanoseconds the activity check must watch for changes on-disk. 3576 */ 3577 static uint64_t 3578 spa_activity_check_duration(spa_t *spa, uberblock_t *ub) 3579 { 3580 uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1); 3581 uint64_t multihost_interval = MSEC2NSEC( 3582 MMP_INTERVAL_OK(zfs_multihost_interval)); 3583 uint64_t import_delay = MAX(NANOSEC, import_intervals * 3584 multihost_interval); 3585 3586 /* 3587 * Local tunables determine a minimum duration except for the case 3588 * where we know when the remote host will suspend the pool if MMP 3589 * writes do not land. 3590 * 3591 * See Big Theory comment at the top of mmp.c for the reasoning behind 3592 * these cases and times. 
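 *
 * Worked example (the numbers, including the safety factor, are
 * hypothetical): if the remote uberblock advertises MMP_FAIL_INT(ub) = 10
 * and MMP_INTERVAL(ub) = 1000 ms, and MMP_IMPORT_SAFETY_FACTOR were
 * 200 (percent), the first case below yields
 *
 *	import_delay = 10 * 1000 ms * 200 / 100 = 20 seconds
 *
 * whereas a pool advertising fail_intervals == 0 instead waits at least
 * (MMP_INTERVAL(ub) + ub_mmp_delay) * import_intervals.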
3593 */ 3594 3595 ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100); 3596 3597 if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && 3598 MMP_FAIL_INT(ub) > 0) { 3599 3600 /* MMP on remote host will suspend pool after failed writes */ 3601 import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) * 3602 MMP_IMPORT_SAFETY_FACTOR / 100; 3603 3604 zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp " 3605 "mmp_fails=%llu ub_mmp mmp_interval=%llu " 3606 "import_intervals=%llu", (u_longlong_t)import_delay, 3607 (u_longlong_t)MMP_FAIL_INT(ub), 3608 (u_longlong_t)MMP_INTERVAL(ub), 3609 (u_longlong_t)import_intervals); 3610 3611 } else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && 3612 MMP_FAIL_INT(ub) == 0) { 3613 3614 /* MMP on remote host will never suspend pool */ 3615 import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) + 3616 ub->ub_mmp_delay) * import_intervals); 3617 3618 zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp " 3619 "mmp_interval=%llu ub_mmp_delay=%llu " 3620 "import_intervals=%llu", (u_longlong_t)import_delay, 3621 (u_longlong_t)MMP_INTERVAL(ub), 3622 (u_longlong_t)ub->ub_mmp_delay, 3623 (u_longlong_t)import_intervals); 3624 3625 } else if (MMP_VALID(ub)) { 3626 /* 3627 * zfs-0.7 compatibility case 3628 */ 3629 3630 import_delay = MAX(import_delay, (multihost_interval + 3631 ub->ub_mmp_delay) * import_intervals); 3632 3633 zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu " 3634 "import_intervals=%llu leaves=%u", 3635 (u_longlong_t)import_delay, 3636 (u_longlong_t)ub->ub_mmp_delay, 3637 (u_longlong_t)import_intervals, 3638 vdev_count_leaves(spa)); 3639 } else { 3640 /* Using local tunings is the only reasonable option */ 3641 zfs_dbgmsg("pool last imported on non-MMP aware " 3642 "host using import_delay=%llu multihost_interval=%llu " 3643 "import_intervals=%llu", (u_longlong_t)import_delay, 3644 (u_longlong_t)multihost_interval, 3645 (u_longlong_t)import_intervals); 3646 } 3647 3648 return (import_delay); 3649 } 3650 3651 /* 3652 * Remote host activity check. 3653 * 3654 * error results: 3655 * 0 - no activity detected 3656 * EREMOTEIO - remote activity detected 3657 * EINTR - user canceled the operation 3658 */ 3659 static int 3660 spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config, 3661 boolean_t importing) 3662 { 3663 uint64_t txg = ub->ub_txg; 3664 uint64_t timestamp = ub->ub_timestamp; 3665 uint64_t mmp_config = ub->ub_mmp_config; 3666 uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0; 3667 uint64_t import_delay; 3668 hrtime_t import_expire, now; 3669 nvlist_t *mmp_label = NULL; 3670 vdev_t *rvd = spa->spa_root_vdev; 3671 kcondvar_t cv; 3672 kmutex_t mtx; 3673 int error = 0; 3674 3675 cv_init(&cv, NULL, CV_DEFAULT, NULL); 3676 mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL); 3677 mutex_enter(&mtx); 3678 3679 /* 3680 * If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed 3681 * during the earlier tryimport. If the txg recorded there is 0 then 3682 * the pool is known to be active on another host. 3683 * 3684 * Otherwise, the pool might be in use on another host. Check for 3685 * changes in the uberblocks on disk if necessary. 
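 *
 * In outline: the loop below re-reads the best uberblock roughly once a
 * second until import_expire and fails with EREMOTEIO as soon as the txg,
 * timestamp or MMP sequence number changes; a signal received while
 * waiting is reported as EINTR.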
3686 */ 3687 if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { 3688 nvlist_t *nvinfo = fnvlist_lookup_nvlist(config, 3689 ZPOOL_CONFIG_LOAD_INFO); 3690 3691 if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) && 3692 fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) { 3693 vdev_uberblock_load(rvd, ub, &mmp_label); 3694 error = SET_ERROR(EREMOTEIO); 3695 goto out; 3696 } 3697 } 3698 3699 import_delay = spa_activity_check_duration(spa, ub); 3700 3701 /* Add a small random factor in case of simultaneous imports (0-25%) */ 3702 import_delay += import_delay * random_in_range(250) / 1000; 3703 3704 import_expire = gethrtime() + import_delay; 3705 3706 if (importing) { 3707 spa_import_progress_set_notes(spa, "Checking MMP activity, " 3708 "waiting %llu ms", (u_longlong_t)NSEC2MSEC(import_delay)); 3709 } 3710 3711 int iterations = 0; 3712 while ((now = gethrtime()) < import_expire) { 3713 if (importing && iterations++ % 30 == 0) { 3714 spa_import_progress_set_notes(spa, "Checking MMP " 3715 "activity, %llu ms remaining", 3716 (u_longlong_t)NSEC2MSEC(import_expire - now)); 3717 } 3718 3719 if (importing) { 3720 (void) spa_import_progress_set_mmp_check(spa_guid(spa), 3721 NSEC2SEC(import_expire - gethrtime())); 3722 } 3723 3724 vdev_uberblock_load(rvd, ub, &mmp_label); 3725 3726 if (txg != ub->ub_txg || timestamp != ub->ub_timestamp || 3727 mmp_seq != (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) { 3728 zfs_dbgmsg("multihost activity detected " 3729 "txg %llu ub_txg %llu " 3730 "timestamp %llu ub_timestamp %llu " 3731 "mmp_config %#llx ub_mmp_config %#llx", 3732 (u_longlong_t)txg, (u_longlong_t)ub->ub_txg, 3733 (u_longlong_t)timestamp, 3734 (u_longlong_t)ub->ub_timestamp, 3735 (u_longlong_t)mmp_config, 3736 (u_longlong_t)ub->ub_mmp_config); 3737 3738 error = SET_ERROR(EREMOTEIO); 3739 break; 3740 } 3741 3742 if (mmp_label) { 3743 nvlist_free(mmp_label); 3744 mmp_label = NULL; 3745 } 3746 3747 error = cv_timedwait_sig(&cv, &mtx, ddi_get_lbolt() + hz); 3748 if (error != -1) { 3749 error = SET_ERROR(EINTR); 3750 break; 3751 } 3752 error = 0; 3753 } 3754 3755 out: 3756 mutex_exit(&mtx); 3757 mutex_destroy(&mtx); 3758 cv_destroy(&cv); 3759 3760 /* 3761 * If the pool is determined to be active store the status in the 3762 * spa->spa_load_info nvlist. If the remote hostname or hostid are 3763 * available from configuration read from disk store them as well. 3764 * This allows 'zpool import' to generate a more useful message. 
3765 * 3766 * ZPOOL_CONFIG_MMP_STATE - observed pool status (mandatory) 3767 * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool 3768 * ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool 3769 */ 3770 if (error == EREMOTEIO) { 3771 const char *hostname = "<unknown>"; 3772 uint64_t hostid = 0; 3773 3774 if (mmp_label) { 3775 if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) { 3776 hostname = fnvlist_lookup_string(mmp_label, 3777 ZPOOL_CONFIG_HOSTNAME); 3778 fnvlist_add_string(spa->spa_load_info, 3779 ZPOOL_CONFIG_MMP_HOSTNAME, hostname); 3780 } 3781 3782 if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) { 3783 hostid = fnvlist_lookup_uint64(mmp_label, 3784 ZPOOL_CONFIG_HOSTID); 3785 fnvlist_add_uint64(spa->spa_load_info, 3786 ZPOOL_CONFIG_MMP_HOSTID, hostid); 3787 } 3788 } 3789 3790 fnvlist_add_uint64(spa->spa_load_info, 3791 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE); 3792 fnvlist_add_uint64(spa->spa_load_info, 3793 ZPOOL_CONFIG_MMP_TXG, 0); 3794 3795 error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO); 3796 } 3797 3798 if (mmp_label) 3799 nvlist_free(mmp_label); 3800 3801 return (error); 3802 } 3803 3804 /* 3805 * Called from zfs_ioc_clear for a pool that was suspended 3806 * after failing mmp write checks. 3807 */ 3808 boolean_t 3809 spa_mmp_remote_host_activity(spa_t *spa) 3810 { 3811 ASSERT(spa_multihost(spa) && spa_suspended(spa)); 3812 3813 nvlist_t *best_label; 3814 uberblock_t best_ub; 3815 3816 /* 3817 * Locate the best uberblock on disk 3818 */ 3819 vdev_uberblock_load(spa->spa_root_vdev, &best_ub, &best_label); 3820 if (best_label) { 3821 /* 3822 * confirm that the best hostid matches our hostid 3823 */ 3824 if (nvlist_exists(best_label, ZPOOL_CONFIG_HOSTID) && 3825 spa_get_hostid(spa) != 3826 fnvlist_lookup_uint64(best_label, ZPOOL_CONFIG_HOSTID)) { 3827 nvlist_free(best_label); 3828 return (B_TRUE); 3829 } 3830 nvlist_free(best_label); 3831 } else { 3832 return (B_TRUE); 3833 } 3834 3835 if (!MMP_VALID(&best_ub) || 3836 !MMP_FAIL_INT_VALID(&best_ub) || 3837 MMP_FAIL_INT(&best_ub) == 0) { 3838 return (B_TRUE); 3839 } 3840 3841 if (best_ub.ub_txg != spa->spa_uberblock.ub_txg || 3842 best_ub.ub_timestamp != spa->spa_uberblock.ub_timestamp) { 3843 zfs_dbgmsg("txg mismatch detected during pool clear " 3844 "txg %llu ub_txg %llu timestamp %llu ub_timestamp %llu", 3845 (u_longlong_t)spa->spa_uberblock.ub_txg, 3846 (u_longlong_t)best_ub.ub_txg, 3847 (u_longlong_t)spa->spa_uberblock.ub_timestamp, 3848 (u_longlong_t)best_ub.ub_timestamp); 3849 return (B_TRUE); 3850 } 3851 3852 /* 3853 * Perform an activity check looking for any remote writer 3854 */ 3855 return (spa_activity_check(spa, &spa->spa_uberblock, spa->spa_config, 3856 B_FALSE) != 0); 3857 } 3858 3859 static int 3860 spa_verify_host(spa_t *spa, nvlist_t *mos_config) 3861 { 3862 uint64_t hostid; 3863 const char *hostname; 3864 uint64_t myhostid = 0; 3865 3866 if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config, 3867 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 3868 hostname = fnvlist_lookup_string(mos_config, 3869 ZPOOL_CONFIG_HOSTNAME); 3870 3871 myhostid = zone_get_hostid(NULL); 3872 3873 if (hostid != 0 && myhostid != 0 && hostid != myhostid) { 3874 cmn_err(CE_WARN, "pool '%s' could not be " 3875 "loaded as it was last accessed by " 3876 "another system (host: %s hostid: 0x%llx). 
" 3877 "See: https://openzfs.github.io/openzfs-docs/msg/" 3878 "ZFS-8000-EY", 3879 spa_name(spa), hostname, (u_longlong_t)hostid); 3880 spa_load_failed(spa, "hostid verification failed: pool " 3881 "last accessed by host: %s (hostid: 0x%llx)", 3882 hostname, (u_longlong_t)hostid); 3883 return (SET_ERROR(EBADF)); 3884 } 3885 } 3886 3887 return (0); 3888 } 3889 3890 static int 3891 spa_ld_parse_config(spa_t *spa, spa_import_type_t type) 3892 { 3893 int error = 0; 3894 nvlist_t *nvtree, *nvl, *config = spa->spa_config; 3895 int parse; 3896 vdev_t *rvd; 3897 uint64_t pool_guid; 3898 const char *comment; 3899 const char *compatibility; 3900 3901 /* 3902 * Versioning wasn't explicitly added to the label until later, so if 3903 * it's not present treat it as the initial version. 3904 */ 3905 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 3906 &spa->spa_ubsync.ub_version) != 0) 3907 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 3908 3909 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 3910 spa_load_failed(spa, "invalid config provided: '%s' missing", 3911 ZPOOL_CONFIG_POOL_GUID); 3912 return (SET_ERROR(EINVAL)); 3913 } 3914 3915 /* 3916 * If we are doing an import, ensure that the pool is not already 3917 * imported by checking if its pool guid already exists in the 3918 * spa namespace. 3919 * 3920 * The only case that we allow an already imported pool to be 3921 * imported again, is when the pool is checkpointed and we want to 3922 * look at its checkpointed state from userland tools like zdb. 3923 */ 3924 #ifdef _KERNEL 3925 if ((spa->spa_load_state == SPA_LOAD_IMPORT || 3926 spa->spa_load_state == SPA_LOAD_TRYIMPORT) && 3927 spa_guid_exists(pool_guid, 0)) { 3928 #else 3929 if ((spa->spa_load_state == SPA_LOAD_IMPORT || 3930 spa->spa_load_state == SPA_LOAD_TRYIMPORT) && 3931 spa_guid_exists(pool_guid, 0) && 3932 !spa_importing_readonly_checkpoint(spa)) { 3933 #endif 3934 spa_load_failed(spa, "a pool with guid %llu is already open", 3935 (u_longlong_t)pool_guid); 3936 return (SET_ERROR(EEXIST)); 3937 } 3938 3939 spa->spa_config_guid = pool_guid; 3940 3941 nvlist_free(spa->spa_load_info); 3942 spa->spa_load_info = fnvlist_alloc(); 3943 3944 ASSERT(spa->spa_comment == NULL); 3945 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 3946 spa->spa_comment = spa_strdup(comment); 3947 3948 ASSERT(spa->spa_compatibility == NULL); 3949 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMPATIBILITY, 3950 &compatibility) == 0) 3951 spa->spa_compatibility = spa_strdup(compatibility); 3952 3953 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 3954 &spa->spa_config_txg); 3955 3956 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0) 3957 spa->spa_config_splitting = fnvlist_dup(nvl); 3958 3959 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) { 3960 spa_load_failed(spa, "invalid config provided: '%s' missing", 3961 ZPOOL_CONFIG_VDEV_TREE); 3962 return (SET_ERROR(EINVAL)); 3963 } 3964 3965 /* 3966 * Create "The Godfather" zio to hold all async IOs 3967 */ 3968 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 3969 KM_SLEEP); 3970 for (int i = 0; i < max_ncpus; i++) { 3971 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 3972 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 3973 ZIO_FLAG_GODFATHER); 3974 } 3975 3976 /* 3977 * Parse the configuration into a vdev tree. 
We explicitly set the 3978 * value that will be returned by spa_version() since parsing the 3979 * configuration requires knowing the version number. 3980 */ 3981 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3982 parse = (type == SPA_IMPORT_EXISTING ? 3983 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 3984 error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse); 3985 spa_config_exit(spa, SCL_ALL, FTAG); 3986 3987 if (error != 0) { 3988 spa_load_failed(spa, "unable to parse config [error=%d]", 3989 error); 3990 return (error); 3991 } 3992 3993 ASSERT(spa->spa_root_vdev == rvd); 3994 ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); 3995 ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); 3996 3997 if (type != SPA_IMPORT_ASSEMBLE) { 3998 ASSERT(spa_guid(spa) == pool_guid); 3999 } 4000 4001 return (0); 4002 } 4003 4004 /* 4005 * Recursively open all vdevs in the vdev tree. This function is called twice: 4006 * first with the untrusted config, then with the trusted config. 4007 */ 4008 static int 4009 spa_ld_open_vdevs(spa_t *spa) 4010 { 4011 int error = 0; 4012 4013 /* 4014 * spa_missing_tvds_allowed defines how many top-level vdevs can be 4015 * missing/unopenable for the root vdev to be still considered openable. 4016 */ 4017 if (spa->spa_trust_config) { 4018 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds; 4019 } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) { 4020 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile; 4021 } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) { 4022 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan; 4023 } else { 4024 spa->spa_missing_tvds_allowed = 0; 4025 } 4026 4027 spa->spa_missing_tvds_allowed = 4028 MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed); 4029 4030 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4031 error = vdev_open(spa->spa_root_vdev); 4032 spa_config_exit(spa, SCL_ALL, FTAG); 4033 4034 if (spa->spa_missing_tvds != 0) { 4035 spa_load_note(spa, "vdev tree has %lld missing top-level " 4036 "vdevs.", (u_longlong_t)spa->spa_missing_tvds); 4037 if (spa->spa_trust_config && (spa->spa_mode & SPA_MODE_WRITE)) { 4038 /* 4039 * Although theoretically we could allow users to open 4040 * incomplete pools in RW mode, we'd need to add a lot 4041 * of extra logic (e.g. adjust pool space to account 4042 * for missing vdevs). 4043 * This limitation also prevents users from accidentally 4044 * opening the pool in RW mode during data recovery and 4045 * damaging it further. 4046 */ 4047 spa_load_note(spa, "pools with missing top-level " 4048 "vdevs can only be opened in read-only mode."); 4049 error = SET_ERROR(ENXIO); 4050 } else { 4051 spa_load_note(spa, "current settings allow for maximum " 4052 "%lld missing top-level vdevs at this stage.", 4053 (u_longlong_t)spa->spa_missing_tvds_allowed); 4054 } 4055 } 4056 if (error != 0) { 4057 spa_load_failed(spa, "unable to open vdev tree [error=%d]", 4058 error); 4059 } 4060 if (spa->spa_missing_tvds != 0 || error != 0) 4061 vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2); 4062 4063 return (error); 4064 } 4065 4066 /* 4067 * We need to validate the vdev labels against the configuration that 4068 * we have in hand. This function is called twice: first with an untrusted 4069 * config, then with a trusted config. The validation is more strict when the 4070 * config is trusted. 
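 * If enough children fail validation that the root vdev can no longer
 * be opened, the load is aborted with ENXIO below.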
4071 */ 4072 static int 4073 spa_ld_validate_vdevs(spa_t *spa) 4074 { 4075 int error = 0; 4076 vdev_t *rvd = spa->spa_root_vdev; 4077 4078 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4079 error = vdev_validate(rvd); 4080 spa_config_exit(spa, SCL_ALL, FTAG); 4081 4082 if (error != 0) { 4083 spa_load_failed(spa, "vdev_validate failed [error=%d]", error); 4084 return (error); 4085 } 4086 4087 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 4088 spa_load_failed(spa, "cannot open vdev tree after invalidating " 4089 "some vdevs"); 4090 vdev_dbgmsg_print_tree(rvd, 2); 4091 return (SET_ERROR(ENXIO)); 4092 } 4093 4094 return (0); 4095 } 4096 4097 static void 4098 spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub) 4099 { 4100 spa->spa_state = POOL_STATE_ACTIVE; 4101 spa->spa_ubsync = spa->spa_uberblock; 4102 spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 4103 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 4104 spa->spa_first_txg = spa->spa_last_ubsync_txg ? 4105 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 4106 spa->spa_claim_max_txg = spa->spa_first_txg; 4107 spa->spa_prev_software_version = ub->ub_software_version; 4108 } 4109 4110 static int 4111 spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) 4112 { 4113 vdev_t *rvd = spa->spa_root_vdev; 4114 nvlist_t *label; 4115 uberblock_t *ub = &spa->spa_uberblock; 4116 boolean_t activity_check = B_FALSE; 4117 4118 /* 4119 * If we are opening the checkpointed state of the pool by 4120 * rewinding to it, at this point we will have written the 4121 * checkpointed uberblock to the vdev labels, so searching 4122 * the labels will find the right uberblock. However, if 4123 * we are opening the checkpointed state read-only, we have 4124 * not modified the labels. Therefore, we must ignore the 4125 * labels and continue using the spa_uberblock that was set 4126 * by spa_ld_checkpoint_rewind. 4127 * 4128 * Note that it would be fine to ignore the labels when 4129 * rewinding (opening writeable) as well. However, if we 4130 * crash just after writing the labels, we will end up 4131 * searching the labels. Doing so in the common case means 4132 * that this code path gets exercised normally, rather than 4133 * just in the edge case. 4134 */ 4135 if (ub->ub_checkpoint_txg != 0 && 4136 spa_importing_readonly_checkpoint(spa)) { 4137 spa_ld_select_uberblock_done(spa, ub); 4138 return (0); 4139 } 4140 4141 /* 4142 * Find the best uberblock. 4143 */ 4144 vdev_uberblock_load(rvd, ub, &label); 4145 4146 /* 4147 * If we weren't able to find a single valid uberblock, return failure. 4148 */ 4149 if (ub->ub_txg == 0) { 4150 nvlist_free(label); 4151 spa_load_failed(spa, "no valid uberblock found"); 4152 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 4153 } 4154 4155 if (spa->spa_load_max_txg != UINT64_MAX) { 4156 (void) spa_import_progress_set_max_txg(spa_guid(spa), 4157 (u_longlong_t)spa->spa_load_max_txg); 4158 } 4159 spa_load_note(spa, "using uberblock with txg=%llu", 4160 (u_longlong_t)ub->ub_txg); 4161 if (ub->ub_raidz_reflow_info != 0) { 4162 spa_load_note(spa, "uberblock raidz_reflow_info: " 4163 "state=%u offset=%llu", 4164 (int)RRSS_GET_STATE(ub), 4165 (u_longlong_t)RRSS_GET_OFFSET(ub)); 4166 } 4167 4168 4169 /* 4170 * For pools which have the multihost property on determine if the 4171 * pool is truly inactive and can be safely imported. Prevent 4172 * hosts which don't have a hostid set from importing the pool. 
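 * The result of the check (MMP state, txg and sequence number) is
 * recorded in spa_load_info for use by userland tools.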
4173 */ 4174 activity_check = spa_activity_check_required(spa, ub, label, 4175 spa->spa_config); 4176 if (activity_check) { 4177 if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay && 4178 spa_get_hostid(spa) == 0) { 4179 nvlist_free(label); 4180 fnvlist_add_uint64(spa->spa_load_info, 4181 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); 4182 return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); 4183 } 4184 4185 int error = 4186 spa_activity_check(spa, ub, spa->spa_config, B_TRUE); 4187 if (error) { 4188 nvlist_free(label); 4189 return (error); 4190 } 4191 4192 fnvlist_add_uint64(spa->spa_load_info, 4193 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE); 4194 fnvlist_add_uint64(spa->spa_load_info, 4195 ZPOOL_CONFIG_MMP_TXG, ub->ub_txg); 4196 fnvlist_add_uint16(spa->spa_load_info, 4197 ZPOOL_CONFIG_MMP_SEQ, 4198 (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)); 4199 } 4200 4201 /* 4202 * If the pool has an unsupported version we can't open it. 4203 */ 4204 if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 4205 nvlist_free(label); 4206 spa_load_failed(spa, "version %llu is not supported", 4207 (u_longlong_t)ub->ub_version); 4208 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 4209 } 4210 4211 if (ub->ub_version >= SPA_VERSION_FEATURES) { 4212 nvlist_t *features; 4213 4214 /* 4215 * If we weren't able to find what's necessary for reading the 4216 * MOS in the label, return failure. 4217 */ 4218 if (label == NULL) { 4219 spa_load_failed(spa, "label config unavailable"); 4220 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 4221 ENXIO)); 4222 } 4223 4224 if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ, 4225 &features) != 0) { 4226 nvlist_free(label); 4227 spa_load_failed(spa, "invalid label: '%s' missing", 4228 ZPOOL_CONFIG_FEATURES_FOR_READ); 4229 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 4230 ENXIO)); 4231 } 4232 4233 /* 4234 * Update our in-core representation with the definitive values 4235 * from the label. 4236 */ 4237 nvlist_free(spa->spa_label_features); 4238 spa->spa_label_features = fnvlist_dup(features); 4239 } 4240 4241 nvlist_free(label); 4242 4243 /* 4244 * Look through entries in the label nvlist's features_for_read. If 4245 * there is a feature listed there which we don't understand then we 4246 * cannot open a pool. 4247 */ 4248 if (ub->ub_version >= SPA_VERSION_FEATURES) { 4249 nvlist_t *unsup_feat; 4250 4251 unsup_feat = fnvlist_alloc(); 4252 4253 for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, 4254 NULL); nvp != NULL; 4255 nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { 4256 if (!zfeature_is_supported(nvpair_name(nvp))) { 4257 fnvlist_add_string(unsup_feat, 4258 nvpair_name(nvp), ""); 4259 } 4260 } 4261 4262 if (!nvlist_empty(unsup_feat)) { 4263 fnvlist_add_nvlist(spa->spa_load_info, 4264 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 4265 nvlist_free(unsup_feat); 4266 spa_load_failed(spa, "some features are unsupported"); 4267 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 4268 ENOTSUP)); 4269 } 4270 4271 nvlist_free(unsup_feat); 4272 } 4273 4274 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 4275 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4276 spa_try_repair(spa, spa->spa_config); 4277 spa_config_exit(spa, SCL_ALL, FTAG); 4278 nvlist_free(spa->spa_config_splitting); 4279 spa->spa_config_splitting = NULL; 4280 } 4281 4282 /* 4283 * Initialize internal SPA structures. 
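 * This records the selected uberblock and derives the initial txg
 * bookkeeping (first txg, claim window and verification window) from
 * it.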
4284 */ 4285 spa_ld_select_uberblock_done(spa, ub); 4286 4287 return (0); 4288 } 4289 4290 static int 4291 spa_ld_open_rootbp(spa_t *spa) 4292 { 4293 int error = 0; 4294 vdev_t *rvd = spa->spa_root_vdev; 4295 4296 error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 4297 if (error != 0) { 4298 spa_load_failed(spa, "unable to open rootbp in dsl_pool_init " 4299 "[error=%d]", error); 4300 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4301 } 4302 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 4303 4304 return (0); 4305 } 4306 4307 static int 4308 spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, 4309 boolean_t reloading) 4310 { 4311 vdev_t *mrvd, *rvd = spa->spa_root_vdev; 4312 nvlist_t *nv, *mos_config, *policy; 4313 int error = 0, copy_error; 4314 uint64_t healthy_tvds, healthy_tvds_mos; 4315 uint64_t mos_config_txg; 4316 4317 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE) 4318 != 0) 4319 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4320 4321 /* 4322 * If we're assembling a pool from a split, the config provided is 4323 * already trusted so there is nothing to do. 4324 */ 4325 if (type == SPA_IMPORT_ASSEMBLE) 4326 return (0); 4327 4328 healthy_tvds = spa_healthy_core_tvds(spa); 4329 4330 if (load_nvlist(spa, spa->spa_config_object, &mos_config) 4331 != 0) { 4332 spa_load_failed(spa, "unable to retrieve MOS config"); 4333 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4334 } 4335 4336 /* 4337 * If we are doing an open, pool owner wasn't verified yet, thus do 4338 * the verification here. 4339 */ 4340 if (spa->spa_load_state == SPA_LOAD_OPEN) { 4341 error = spa_verify_host(spa, mos_config); 4342 if (error != 0) { 4343 nvlist_free(mos_config); 4344 return (error); 4345 } 4346 } 4347 4348 nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE); 4349 4350 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4351 4352 /* 4353 * Build a new vdev tree from the trusted config 4354 */ 4355 error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD); 4356 if (error != 0) { 4357 nvlist_free(mos_config); 4358 spa_config_exit(spa, SCL_ALL, FTAG); 4359 spa_load_failed(spa, "spa_config_parse failed [error=%d]", 4360 error); 4361 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4362 } 4363 4364 /* 4365 * Vdev paths in the MOS may be obsolete. If the untrusted config was 4366 * obtained by scanning /dev/dsk, then it will have the right vdev 4367 * paths. We update the trusted MOS config with this information. 4368 * We first try to copy the paths with vdev_copy_path_strict, which 4369 * succeeds only when both configs have exactly the same vdev tree. 4370 * If that fails, we fall back to a more flexible method that has a 4371 * best effort policy. 
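 * When the strict copy fails (or spa_load_print_vdev_tree is set),
 * both the provided and the MOS vdev trees are dumped to the debug
 * log to make the mismatch easier to diagnose.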
4372 */ 4373 copy_error = vdev_copy_path_strict(rvd, mrvd); 4374 if (copy_error != 0 || spa_load_print_vdev_tree) { 4375 spa_load_note(spa, "provided vdev tree:"); 4376 vdev_dbgmsg_print_tree(rvd, 2); 4377 spa_load_note(spa, "MOS vdev tree:"); 4378 vdev_dbgmsg_print_tree(mrvd, 2); 4379 } 4380 if (copy_error != 0) { 4381 spa_load_note(spa, "vdev_copy_path_strict failed, falling " 4382 "back to vdev_copy_path_relaxed"); 4383 vdev_copy_path_relaxed(rvd, mrvd); 4384 } 4385 4386 vdev_close(rvd); 4387 vdev_free(rvd); 4388 spa->spa_root_vdev = mrvd; 4389 rvd = mrvd; 4390 spa_config_exit(spa, SCL_ALL, FTAG); 4391 4392 /* 4393 * If 'zpool import' used a cached config, then the on-disk hostid and 4394 * hostname may be different to the cached config in ways that should 4395 * prevent import. Userspace can't discover this without a scan, but 4396 * we know, so we add these values to LOAD_INFO so the caller can know 4397 * the difference. 4398 * 4399 * Note that we have to do this before the config is regenerated, 4400 * because the new config will have the hostid and hostname for this 4401 * host, in readiness for import. 4402 */ 4403 if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTID)) 4404 fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_HOSTID, 4405 fnvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID)); 4406 if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTNAME)) 4407 fnvlist_add_string(spa->spa_load_info, ZPOOL_CONFIG_HOSTNAME, 4408 fnvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME)); 4409 4410 /* 4411 * We will use spa_config if we decide to reload the spa or if spa_load 4412 * fails and we rewind. We must thus regenerate the config using the 4413 * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to 4414 * pass settings on how to load the pool and is not stored in the MOS. 4415 * We copy it over to our new, trusted config. 4416 */ 4417 mos_config_txg = fnvlist_lookup_uint64(mos_config, 4418 ZPOOL_CONFIG_POOL_TXG); 4419 nvlist_free(mos_config); 4420 mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE); 4421 if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY, 4422 &policy) == 0) 4423 fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy); 4424 spa_config_set(spa, mos_config); 4425 spa->spa_config_source = SPA_CONFIG_SRC_MOS; 4426 4427 /* 4428 * Now that we got the config from the MOS, we should be more strict 4429 * in checking blkptrs and can make assumptions about the consistency 4430 * of the vdev tree. spa_trust_config must be set to true before opening 4431 * vdevs in order for them to be writeable. 4432 */ 4433 spa->spa_trust_config = B_TRUE; 4434 4435 /* 4436 * Open and validate the new vdev tree 4437 */ 4438 error = spa_ld_open_vdevs(spa); 4439 if (error != 0) 4440 return (error); 4441 4442 error = spa_ld_validate_vdevs(spa); 4443 if (error != 0) 4444 return (error); 4445 4446 if (copy_error != 0 || spa_load_print_vdev_tree) { 4447 spa_load_note(spa, "final vdev tree:"); 4448 vdev_dbgmsg_print_tree(rvd, 2); 4449 } 4450 4451 if (spa->spa_load_state != SPA_LOAD_TRYIMPORT && 4452 !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) { 4453 /* 4454 * Sanity check to make sure that we are indeed loading the 4455 * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds 4456 * in the config provided and they happened to be the only ones 4457 * to have the latest uberblock, we could involuntarily perform 4458 * an extreme rewind. 
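 * To detect this, the number of healthy top-level vdevs seen with the
 * provided config is compared against the count obtained from the MOS
 * config; if the difference reaches SPA_SYNC_MIN_VDEVS we reload using
 * the MOS config (or fail if we already did).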
4459 */ 4460 healthy_tvds_mos = spa_healthy_core_tvds(spa); 4461 if (healthy_tvds_mos - healthy_tvds >= 4462 SPA_SYNC_MIN_VDEVS) { 4463 spa_load_note(spa, "config provided misses too many " 4464 "top-level vdevs compared to MOS (%lld vs %lld). ", 4465 (u_longlong_t)healthy_tvds, 4466 (u_longlong_t)healthy_tvds_mos); 4467 spa_load_note(spa, "vdev tree:"); 4468 vdev_dbgmsg_print_tree(rvd, 2); 4469 if (reloading) { 4470 spa_load_failed(spa, "config was already " 4471 "provided from MOS. Aborting."); 4472 return (spa_vdev_err(rvd, 4473 VDEV_AUX_CORRUPT_DATA, EIO)); 4474 } 4475 spa_load_note(spa, "spa must be reloaded using MOS " 4476 "config"); 4477 return (SET_ERROR(EAGAIN)); 4478 } 4479 } 4480 4481 error = spa_check_for_missing_logs(spa); 4482 if (error != 0) 4483 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 4484 4485 if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) { 4486 spa_load_failed(spa, "uberblock guid sum doesn't match MOS " 4487 "guid sum (%llu != %llu)", 4488 (u_longlong_t)spa->spa_uberblock.ub_guid_sum, 4489 (u_longlong_t)rvd->vdev_guid_sum); 4490 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 4491 ENXIO)); 4492 } 4493 4494 return (0); 4495 } 4496 4497 static int 4498 spa_ld_open_indirect_vdev_metadata(spa_t *spa) 4499 { 4500 int error = 0; 4501 vdev_t *rvd = spa->spa_root_vdev; 4502 4503 /* 4504 * Everything that we read before spa_remove_init() must be stored 4505 * on concrete vdevs. Therefore we do this as early as possible. 4506 */ 4507 error = spa_remove_init(spa); 4508 if (error != 0) { 4509 spa_load_failed(spa, "spa_remove_init failed [error=%d]", 4510 error); 4511 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4512 } 4513 4514 /* 4515 * Retrieve information needed to condense indirect vdev mappings.
4516 */ 4517 error = spa_condense_init(spa); 4518 if (error != 0) { 4519 spa_load_failed(spa, "spa_condense_init failed [error=%d]", 4520 error); 4521 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4522 } 4523 4524 return (0); 4525 } 4526 4527 static int 4528 spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep) 4529 { 4530 int error = 0; 4531 vdev_t *rvd = spa->spa_root_vdev; 4532 4533 if (spa_version(spa) >= SPA_VERSION_FEATURES) { 4534 boolean_t missing_feat_read = B_FALSE; 4535 nvlist_t *unsup_feat, *enabled_feat; 4536 4537 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 4538 &spa->spa_feat_for_read_obj, B_TRUE) != 0) { 4539 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4540 } 4541 4542 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 4543 &spa->spa_feat_for_write_obj, B_TRUE) != 0) { 4544 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4545 } 4546 4547 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 4548 &spa->spa_feat_desc_obj, B_TRUE) != 0) { 4549 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4550 } 4551 4552 enabled_feat = fnvlist_alloc(); 4553 unsup_feat = fnvlist_alloc(); 4554 4555 if (!spa_features_check(spa, B_FALSE, 4556 unsup_feat, enabled_feat)) 4557 missing_feat_read = B_TRUE; 4558 4559 if (spa_writeable(spa) || 4560 spa->spa_load_state == SPA_LOAD_TRYIMPORT) { 4561 if (!spa_features_check(spa, B_TRUE, 4562 unsup_feat, enabled_feat)) { 4563 *missing_feat_writep = B_TRUE; 4564 } 4565 } 4566 4567 fnvlist_add_nvlist(spa->spa_load_info, 4568 ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 4569 4570 if (!nvlist_empty(unsup_feat)) { 4571 fnvlist_add_nvlist(spa->spa_load_info, 4572 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 4573 } 4574 4575 fnvlist_free(enabled_feat); 4576 fnvlist_free(unsup_feat); 4577 4578 if (!missing_feat_read) { 4579 fnvlist_add_boolean(spa->spa_load_info, 4580 ZPOOL_CONFIG_CAN_RDONLY); 4581 } 4582 4583 /* 4584 * If the state is SPA_LOAD_TRYIMPORT, our objective is 4585 * twofold: to determine whether the pool is available for 4586 * import in read-write mode and (if it is not) whether the 4587 * pool is available for import in read-only mode. If the pool 4588 * is available for import in read-write mode, it is displayed 4589 * as available in userland; if it is not available for import 4590 * in read-only mode, it is displayed as unavailable in 4591 * userland. If the pool is available for import in read-only 4592 * mode but not read-write mode, it is displayed as unavailable 4593 * in userland with a special note that the pool is actually 4594 * available for open in read-only mode. 4595 * 4596 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 4597 * missing a feature for write, we must first determine whether 4598 * the pool can be opened read-only before returning to 4599 * userland in order to know whether to display the 4600 * abovementioned note. 4601 */ 4602 if (missing_feat_read || (*missing_feat_writep && 4603 spa_writeable(spa))) { 4604 spa_load_failed(spa, "pool uses unsupported features"); 4605 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 4606 ENOTSUP)); 4607 } 4608 4609 /* 4610 * Load refcounts for ZFS features from disk into an in-memory 4611 * cache during SPA initialization. 
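 * Features with no refcount on disk (ENOTSUP) are cached as
 * SPA_FEATURE_DISABLED; any other lookup error aborts the load as
 * corrupt data.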
4612 */ 4613 for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { 4614 uint64_t refcount; 4615 4616 error = feature_get_refcount_from_disk(spa, 4617 &spa_feature_table[i], &refcount); 4618 if (error == 0) { 4619 spa->spa_feat_refcount_cache[i] = refcount; 4620 } else if (error == ENOTSUP) { 4621 spa->spa_feat_refcount_cache[i] = 4622 SPA_FEATURE_DISABLED; 4623 } else { 4624 spa_load_failed(spa, "error getting refcount " 4625 "for feature %s [error=%d]", 4626 spa_feature_table[i].fi_guid, error); 4627 return (spa_vdev_err(rvd, 4628 VDEV_AUX_CORRUPT_DATA, EIO)); 4629 } 4630 } 4631 } 4632 4633 if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { 4634 if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, 4635 &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0) 4636 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4637 } 4638 4639 /* 4640 * Encryption was added before bookmark_v2, even though bookmark_v2 4641 * is now a dependency. If this pool has encryption enabled without 4642 * bookmark_v2, trigger an errata message. 4643 */ 4644 if (spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) && 4645 !spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) { 4646 spa->spa_errata = ZPOOL_ERRATA_ZOL_8308_ENCRYPTION; 4647 } 4648 4649 return (0); 4650 } 4651 4652 static int 4653 spa_ld_load_special_directories(spa_t *spa) 4654 { 4655 int error = 0; 4656 vdev_t *rvd = spa->spa_root_vdev; 4657 4658 spa->spa_is_initializing = B_TRUE; 4659 error = dsl_pool_open(spa->spa_dsl_pool); 4660 spa->spa_is_initializing = B_FALSE; 4661 if (error != 0) { 4662 spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error); 4663 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4664 } 4665 4666 return (0); 4667 } 4668 4669 static int 4670 spa_ld_get_props(spa_t *spa) 4671 { 4672 int error = 0; 4673 uint64_t obj; 4674 vdev_t *rvd = spa->spa_root_vdev; 4675 4676 /* Grab the checksum salt from the MOS. */ 4677 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 4678 DMU_POOL_CHECKSUM_SALT, 1, 4679 sizeof (spa->spa_cksum_salt.zcs_bytes), 4680 spa->spa_cksum_salt.zcs_bytes); 4681 if (error == ENOENT) { 4682 /* Generate a new salt for subsequent use */ 4683 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 4684 sizeof (spa->spa_cksum_salt.zcs_bytes)); 4685 } else if (error != 0) { 4686 spa_load_failed(spa, "unable to retrieve checksum salt from " 4687 "MOS [error=%d]", error); 4688 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4689 } 4690 4691 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0) 4692 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4693 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 4694 if (error != 0) { 4695 spa_load_failed(spa, "error opening deferred-frees bpobj " 4696 "[error=%d]", error); 4697 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4698 } 4699 4700 /* 4701 * Load the bit that tells us to use the new accounting function 4702 * (raid-z deflation). If we have an older pool, this will not 4703 * be present. 4704 */ 4705 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE); 4706 if (error != 0 && error != ENOENT) 4707 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4708 4709 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 4710 &spa->spa_creation_version, B_FALSE); 4711 if (error != 0 && error != ENOENT) 4712 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4713 4714 /* 4715 * Load the persistent error log. If we have an older pool, this will 4716 * not be present. 
4717 */ 4718 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last, 4719 B_FALSE); 4720 if (error != 0 && error != ENOENT) 4721 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4722 4723 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 4724 &spa->spa_errlog_scrub, B_FALSE); 4725 if (error != 0 && error != ENOENT) 4726 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4727 4728 /* Load the last scrubbed txg. */ 4729 error = spa_dir_prop(spa, DMU_POOL_LAST_SCRUBBED_TXG, 4730 &spa->spa_scrubbed_last_txg, B_FALSE); 4731 if (error != 0 && error != ENOENT) 4732 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4733 4734 /* 4735 * Load the livelist deletion field. If a livelist is queued for 4736 * deletion, indicate that in the spa 4737 */ 4738 error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES, 4739 &spa->spa_livelists_to_delete, B_FALSE); 4740 if (error != 0 && error != ENOENT) 4741 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4742 4743 /* 4744 * Load the history object. If we have an older pool, this 4745 * will not be present. 4746 */ 4747 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE); 4748 if (error != 0 && error != ENOENT) 4749 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4750 4751 /* 4752 * Load the per-vdev ZAP map. If we have an older pool, this will not 4753 * be present; in this case, defer its creation to a later time to 4754 * avoid dirtying the MOS this early / out of sync context. See 4755 * spa_sync_config_object. 4756 */ 4757 4758 /* The sentinel is only available in the MOS config. */ 4759 nvlist_t *mos_config; 4760 if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { 4761 spa_load_failed(spa, "unable to retrieve MOS config"); 4762 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4763 } 4764 4765 error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, 4766 &spa->spa_all_vdev_zaps, B_FALSE); 4767 4768 if (error == ENOENT) { 4769 VERIFY(!nvlist_exists(mos_config, 4770 ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 4771 spa->spa_avz_action = AVZ_ACTION_INITIALIZE; 4772 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 4773 } else if (error != 0) { 4774 nvlist_free(mos_config); 4775 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4776 } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { 4777 /* 4778 * An older version of ZFS overwrote the sentinel value, so 4779 * we have orphaned per-vdev ZAPs in the MOS. Defer their 4780 * destruction to later; see spa_sync_config_object. 4781 */ 4782 spa->spa_avz_action = AVZ_ACTION_DESTROY; 4783 /* 4784 * We're assuming that no vdevs have had their ZAPs created 4785 * before this. Better be sure of it. 
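 * vdev_count_verify_zaps() returns the number of per-vdev ZAPs it
 * finds, so the assertion below only holds if that count is zero.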
4786 */ 4787 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 4788 } 4789 nvlist_free(mos_config); 4790 4791 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 4792 4793 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object, 4794 B_FALSE); 4795 if (error && error != ENOENT) 4796 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4797 4798 if (error == 0) { 4799 uint64_t autoreplace = 0; 4800 4801 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 4802 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 4803 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 4804 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 4805 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 4806 spa_prop_find(spa, ZPOOL_PROP_DEDUP_TABLE_QUOTA, 4807 &spa->spa_dedup_table_quota); 4808 spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost); 4809 spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim); 4810 spa->spa_autoreplace = (autoreplace != 0); 4811 } 4812 4813 /* 4814 * If we are importing a pool with missing top-level vdevs, 4815 * we enforce that the pool doesn't panic or get suspended on 4816 * error since the likelihood of missing data is extremely high. 4817 */ 4818 if (spa->spa_missing_tvds > 0 && 4819 spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE && 4820 spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 4821 spa_load_note(spa, "forcing failmode to 'continue' " 4822 "as some top level vdevs are missing"); 4823 spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE; 4824 } 4825 4826 return (0); 4827 } 4828 4829 static int 4830 spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type) 4831 { 4832 int error = 0; 4833 vdev_t *rvd = spa->spa_root_vdev; 4834 4835 /* 4836 * If we're assembling the pool from the split-off vdevs of 4837 * an existing pool, we don't want to attach the spares & cache 4838 * devices. 4839 */ 4840 4841 /* 4842 * Load any hot spares for this pool. 4843 */ 4844 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object, 4845 B_FALSE); 4846 if (error != 0 && error != ENOENT) 4847 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4848 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 4849 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 4850 if (load_nvlist(spa, spa->spa_spares.sav_object, 4851 &spa->spa_spares.sav_config) != 0) { 4852 spa_load_failed(spa, "error loading spares nvlist"); 4853 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4854 } 4855 4856 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4857 spa_load_spares(spa); 4858 spa_config_exit(spa, SCL_ALL, FTAG); 4859 } else if (error == 0) { 4860 spa->spa_spares.sav_sync = B_TRUE; 4861 } 4862 4863 /* 4864 * Load any level 2 ARC devices for this pool. 
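 * As with the spares above, the cache device config is only attached
 * when we are not assembling from a split; otherwise it is simply
 * marked for a later sync.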
4865 */ 4866 error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 4867 &spa->spa_l2cache.sav_object, B_FALSE); 4868 if (error != 0 && error != ENOENT) 4869 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4870 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 4871 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 4872 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 4873 &spa->spa_l2cache.sav_config) != 0) { 4874 spa_load_failed(spa, "error loading l2cache nvlist"); 4875 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4876 } 4877 4878 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4879 spa_load_l2cache(spa); 4880 spa_config_exit(spa, SCL_ALL, FTAG); 4881 } else if (error == 0) { 4882 spa->spa_l2cache.sav_sync = B_TRUE; 4883 } 4884 4885 return (0); 4886 } 4887 4888 static int 4889 spa_ld_load_vdev_metadata(spa_t *spa) 4890 { 4891 int error = 0; 4892 vdev_t *rvd = spa->spa_root_vdev; 4893 4894 /* 4895 * If the 'multihost' property is set, then never allow a pool to 4896 * be imported when the system hostid is zero. The exception to 4897 * this rule is zdb which is always allowed to access pools. 4898 */ 4899 if (spa_multihost(spa) && spa_get_hostid(spa) == 0 && 4900 (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) { 4901 fnvlist_add_uint64(spa->spa_load_info, 4902 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); 4903 return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); 4904 } 4905 4906 /* 4907 * If the 'autoreplace' property is set, then post a resource notifying 4908 * the ZFS DE that it should not issue any faults for unopenable 4909 * devices. We also iterate over the vdevs, and post a sysevent for any 4910 * unopenable vdevs so that the normal autoreplace handler can take 4911 * over. 4912 */ 4913 if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 4914 spa_check_removed(spa->spa_root_vdev); 4915 /* 4916 * For the import case, this is done in spa_import(), because 4917 * at this point we're using the spare definitions from 4918 * the MOS config, not necessarily from the userland config. 4919 */ 4920 if (spa->spa_load_state != SPA_LOAD_IMPORT) { 4921 spa_aux_check_removed(&spa->spa_spares); 4922 spa_aux_check_removed(&spa->spa_l2cache); 4923 } 4924 } 4925 4926 /* 4927 * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc. 4928 */ 4929 error = vdev_load(rvd); 4930 if (error != 0) { 4931 spa_load_failed(spa, "vdev_load failed [error=%d]", error); 4932 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4933 } 4934 4935 error = spa_ld_log_spacemaps(spa); 4936 if (error != 0) { 4937 spa_load_failed(spa, "spa_ld_log_spacemaps failed [error=%d]", 4938 error); 4939 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4940 } 4941 4942 /* 4943 * Propagate the leaf DTLs we just loaded all the way up the vdev tree. 
4944 */ 4945 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4946 vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE); 4947 spa_config_exit(spa, SCL_ALL, FTAG); 4948 4949 return (0); 4950 } 4951 4952 static int 4953 spa_ld_load_dedup_tables(spa_t *spa) 4954 { 4955 int error = 0; 4956 vdev_t *rvd = spa->spa_root_vdev; 4957 4958 error = ddt_load(spa); 4959 if (error != 0) { 4960 spa_load_failed(spa, "ddt_load failed [error=%d]", error); 4961 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4962 } 4963 4964 return (0); 4965 } 4966 4967 static int 4968 spa_ld_load_brt(spa_t *spa) 4969 { 4970 int error = 0; 4971 vdev_t *rvd = spa->spa_root_vdev; 4972 4973 error = brt_load(spa); 4974 if (error != 0) { 4975 spa_load_failed(spa, "brt_load failed [error=%d]", error); 4976 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4977 } 4978 4979 return (0); 4980 } 4981 4982 static int 4983 spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, const char **ereport) 4984 { 4985 vdev_t *rvd = spa->spa_root_vdev; 4986 4987 if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) { 4988 boolean_t missing = spa_check_logs(spa); 4989 if (missing) { 4990 if (spa->spa_missing_tvds != 0) { 4991 spa_load_note(spa, "spa_check_logs failed " 4992 "so dropping the logs"); 4993 } else { 4994 *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 4995 spa_load_failed(spa, "spa_check_logs failed"); 4996 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, 4997 ENXIO)); 4998 } 4999 } 5000 } 5001 5002 return (0); 5003 } 5004 5005 static int 5006 spa_ld_verify_pool_data(spa_t *spa) 5007 { 5008 int error = 0; 5009 vdev_t *rvd = spa->spa_root_vdev; 5010 5011 /* 5012 * We've successfully opened the pool, verify that we're ready 5013 * to start pushing transactions. 5014 */ 5015 if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 5016 error = spa_load_verify(spa); 5017 if (error != 0) { 5018 spa_load_failed(spa, "spa_load_verify failed " 5019 "[error=%d]", error); 5020 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 5021 error)); 5022 } 5023 } 5024 5025 return (0); 5026 } 5027 5028 static void 5029 spa_ld_claim_log_blocks(spa_t *spa) 5030 { 5031 dmu_tx_t *tx; 5032 dsl_pool_t *dp = spa_get_dsl(spa); 5033 5034 /* 5035 * Claim log blocks that haven't been committed yet. 5036 * This must all happen in a single txg. 5037 * Note: spa_claim_max_txg is updated by spa_claim_notify(), 5038 * invoked from zil_claim_log_block()'s i/o done callback. 5039 * Price of rollback is that we abandon the log. 5040 */ 5041 spa->spa_claiming = B_TRUE; 5042 5043 tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); 5044 (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 5045 zil_claim, tx, DS_FIND_CHILDREN); 5046 dmu_tx_commit(tx); 5047 5048 spa->spa_claiming = B_FALSE; 5049 5050 spa_set_log_state(spa, SPA_LOG_GOOD); 5051 } 5052 5053 static void 5054 spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg, 5055 boolean_t update_config_cache) 5056 { 5057 vdev_t *rvd = spa->spa_root_vdev; 5058 int need_update = B_FALSE; 5059 5060 /* 5061 * If the config cache is stale, or we have uninitialized 5062 * metaslabs (see spa_vdev_add()), then update the config. 5063 * 5064 * If this is a verbatim import, trust the current 5065 * in-core spa_config and update the disk labels. 
5066 */ 5067 if (update_config_cache || config_cache_txg != spa->spa_config_txg || 5068 spa->spa_load_state == SPA_LOAD_IMPORT || 5069 spa->spa_load_state == SPA_LOAD_RECOVER || 5070 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 5071 need_update = B_TRUE; 5072 5073 for (int c = 0; c < rvd->vdev_children; c++) 5074 if (rvd->vdev_child[c]->vdev_ms_array == 0) 5075 need_update = B_TRUE; 5076 5077 /* 5078 * Update the config cache asynchronously in case we're the 5079 * root pool, in which case the config cache isn't writable yet. 5080 */ 5081 if (need_update) 5082 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 5083 } 5084 5085 static void 5086 spa_ld_prepare_for_reload(spa_t *spa) 5087 { 5088 spa_mode_t mode = spa->spa_mode; 5089 int async_suspended = spa->spa_async_suspended; 5090 5091 spa_unload(spa); 5092 spa_deactivate(spa); 5093 spa_activate(spa, mode); 5094 5095 /* 5096 * We save the value of spa_async_suspended as it gets reset to 0 by 5097 * spa_unload(). We want to restore it back to the original value before 5098 * returning as we might be calling spa_async_resume() later. 5099 */ 5100 spa->spa_async_suspended = async_suspended; 5101 } 5102 5103 static int 5104 spa_ld_read_checkpoint_txg(spa_t *spa) 5105 { 5106 uberblock_t checkpoint; 5107 int error = 0; 5108 5109 ASSERT0(spa->spa_checkpoint_txg); 5110 ASSERT(MUTEX_HELD(&spa_namespace_lock) || 5111 spa->spa_load_thread == curthread); 5112 5113 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 5114 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), 5115 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); 5116 5117 if (error == ENOENT) 5118 return (0); 5119 5120 if (error != 0) 5121 return (error); 5122 5123 ASSERT3U(checkpoint.ub_txg, !=, 0); 5124 ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0); 5125 ASSERT3U(checkpoint.ub_timestamp, !=, 0); 5126 spa->spa_checkpoint_txg = checkpoint.ub_txg; 5127 spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; 5128 5129 return (0); 5130 } 5131 5132 static int 5133 spa_ld_mos_init(spa_t *spa, spa_import_type_t type) 5134 { 5135 int error = 0; 5136 5137 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5138 ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); 5139 5140 /* 5141 * Never trust the config that is provided unless we are assembling 5142 * a pool following a split. 5143 * This means don't trust blkptrs and the vdev tree in general. This 5144 * also effectively puts the spa in read-only mode since 5145 * spa_writeable() checks for spa_trust_config to be true. 5146 * We will later load a trusted config from the MOS. 5147 */ 5148 if (type != SPA_IMPORT_ASSEMBLE) 5149 spa->spa_trust_config = B_FALSE; 5150 5151 /* 5152 * Parse the config provided to create a vdev tree. 5153 */ 5154 error = spa_ld_parse_config(spa, type); 5155 if (error != 0) 5156 return (error); 5157 5158 spa_import_progress_add(spa); 5159 5160 /* 5161 * Now that we have the vdev tree, try to open each vdev. This involves 5162 * opening the underlying physical device, retrieving its geometry and 5163 * probing the vdev with a dummy I/O. The state of each vdev will be set 5164 * based on the success of those operations. After this we'll be ready 5165 * to read from the vdevs. 5166 */ 5167 error = spa_ld_open_vdevs(spa); 5168 if (error != 0) 5169 return (error); 5170 5171 /* 5172 * Read the label of each vdev and make sure that the GUIDs stored 5173 * there match the GUIDs in the config provided. 
5174 * If we're assembling a new pool that's been split off from an 5175 * existing pool, the labels haven't yet been updated so we skip 5176 * validation for now. 5177 */ 5178 if (type != SPA_IMPORT_ASSEMBLE) { 5179 error = spa_ld_validate_vdevs(spa); 5180 if (error != 0) 5181 return (error); 5182 } 5183 5184 /* 5185 * Read all vdev labels to find the best uberblock (i.e. latest, 5186 * unless spa_load_max_txg is set) and store it in spa_uberblock. We 5187 * get the list of features required to read blkptrs in the MOS from 5188 * the vdev label with the best uberblock and verify that our version 5189 * of zfs supports them all. 5190 */ 5191 error = spa_ld_select_uberblock(spa, type); 5192 if (error != 0) 5193 return (error); 5194 5195 /* 5196 * Pass that uberblock to the dsl_pool layer which will open the root 5197 * blkptr. This blkptr points to the latest version of the MOS and will 5198 * allow us to read its contents. 5199 */ 5200 error = spa_ld_open_rootbp(spa); 5201 if (error != 0) 5202 return (error); 5203 5204 return (0); 5205 } 5206 5207 static int 5208 spa_ld_checkpoint_rewind(spa_t *spa) 5209 { 5210 uberblock_t checkpoint; 5211 int error = 0; 5212 5213 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5214 ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 5215 5216 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 5217 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), 5218 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); 5219 5220 if (error != 0) { 5221 spa_load_failed(spa, "unable to retrieve checkpointed " 5222 "uberblock from the MOS config [error=%d]", error); 5223 5224 if (error == ENOENT) 5225 error = ZFS_ERR_NO_CHECKPOINT; 5226 5227 return (error); 5228 } 5229 5230 ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg); 5231 ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg); 5232 5233 /* 5234 * We need to update the txg and timestamp of the checkpointed 5235 * uberblock to be higher than the latest one. This ensures that 5236 * the checkpointed uberblock is selected if we were to close and 5237 * reopen the pool right after we've written it in the vdev labels. 5238 * (also see block comment in vdev_uberblock_compare) 5239 */ 5240 checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1; 5241 checkpoint.ub_timestamp = gethrestime_sec(); 5242 5243 /* 5244 * Set current uberblock to be the checkpointed uberblock. 5245 */ 5246 spa->spa_uberblock = checkpoint; 5247 5248 /* 5249 * If we are doing a normal rewind, then the pool is open for 5250 * writing and we sync the "updated" checkpointed uberblock to 5251 * disk. Once this is done, we've basically rewound the whole 5252 * pool and there is no way back. 5253 * 5254 * There are cases when we don't want to attempt and sync the 5255 * checkpointed uberblock to disk because we are opening a 5256 * pool as read-only. Specifically, verifying the checkpointed 5257 * state with zdb, and importing the checkpointed state to get 5258 * a "preview" of its content. 
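 * In those read-only cases spa_writeable() is false and the label
 * write below is skipped entirely.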
5259 */ 5260 if (spa_writeable(spa)) { 5261 vdev_t *rvd = spa->spa_root_vdev; 5262 5263 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5264 vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; 5265 int svdcount = 0; 5266 int children = rvd->vdev_children; 5267 int c0 = random_in_range(children); 5268 5269 for (int c = 0; c < children; c++) { 5270 vdev_t *vd = rvd->vdev_child[(c0 + c) % children]; 5271 5272 /* Stop when revisiting the first vdev */ 5273 if (c > 0 && svd[0] == vd) 5274 break; 5275 5276 if (vd->vdev_ms_array == 0 || vd->vdev_islog || 5277 !vdev_is_concrete(vd)) 5278 continue; 5279 5280 svd[svdcount++] = vd; 5281 if (svdcount == SPA_SYNC_MIN_VDEVS) 5282 break; 5283 } 5284 error = vdev_config_sync(svd, svdcount, spa->spa_first_txg); 5285 if (error == 0) 5286 spa->spa_last_synced_guid = rvd->vdev_guid; 5287 spa_config_exit(spa, SCL_ALL, FTAG); 5288 5289 if (error != 0) { 5290 spa_load_failed(spa, "failed to write checkpointed " 5291 "uberblock to the vdev labels [error=%d]", error); 5292 return (error); 5293 } 5294 } 5295 5296 return (0); 5297 } 5298 5299 static int 5300 spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type, 5301 boolean_t *update_config_cache) 5302 { 5303 int error; 5304 5305 /* 5306 * Parse the config for pool, open and validate vdevs, 5307 * select an uberblock, and use that uberblock to open 5308 * the MOS. 5309 */ 5310 error = spa_ld_mos_init(spa, type); 5311 if (error != 0) 5312 return (error); 5313 5314 /* 5315 * Retrieve the trusted config stored in the MOS and use it to create 5316 * a new, exact version of the vdev tree, then reopen all vdevs. 5317 */ 5318 error = spa_ld_trusted_config(spa, type, B_FALSE); 5319 if (error == EAGAIN) { 5320 if (update_config_cache != NULL) 5321 *update_config_cache = B_TRUE; 5322 5323 /* 5324 * Redo the loading process with the trusted config if it is 5325 * too different from the untrusted config. 5326 */ 5327 spa_ld_prepare_for_reload(spa); 5328 spa_load_note(spa, "RELOADING"); 5329 error = spa_ld_mos_init(spa, type); 5330 if (error != 0) 5331 return (error); 5332 5333 error = spa_ld_trusted_config(spa, type, B_TRUE); 5334 if (error != 0) 5335 return (error); 5336 5337 } else if (error != 0) { 5338 return (error); 5339 } 5340 5341 return (0); 5342 } 5343 5344 /* 5345 * Load an existing storage pool, using the config provided. This config 5346 * describes which vdevs are part of the pool and is later validated against 5347 * partial configs present in each vdev's label and an entire copy of the 5348 * config stored in the MOS. 5349 */ 5350 static int 5351 spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) 5352 { 5353 int error = 0; 5354 boolean_t missing_feat_write = B_FALSE; 5355 boolean_t checkpoint_rewind = 5356 (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 5357 boolean_t update_config_cache = B_FALSE; 5358 hrtime_t load_start = gethrtime(); 5359 5360 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5361 ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); 5362 5363 spa_load_note(spa, "LOADING"); 5364 5365 error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache); 5366 if (error != 0) 5367 return (error); 5368 5369 /* 5370 * If we are rewinding to the checkpoint then we need to repeat 5371 * everything we've done so far in this function but this time 5372 * selecting the checkpointed uberblock and using that to open 5373 * the MOS. 5374 */ 5375 if (checkpoint_rewind) { 5376 /* 5377 * If we are rewinding to the checkpoint update config cache 5378 * anyway. 
5379 */ 5380 update_config_cache = B_TRUE; 5381 5382 /* 5383 * Extract the checkpointed uberblock from the current MOS 5384 * and use this as the pool's uberblock from now on. If the 5385 * pool is imported as writeable we also write the checkpoint 5386 * uberblock to the labels, making the rewind permanent. 5387 */ 5388 error = spa_ld_checkpoint_rewind(spa); 5389 if (error != 0) 5390 return (error); 5391 5392 /* 5393 * Redo the loading process again with the 5394 * checkpointed uberblock. 5395 */ 5396 spa_ld_prepare_for_reload(spa); 5397 spa_load_note(spa, "LOADING checkpointed uberblock"); 5398 error = spa_ld_mos_with_trusted_config(spa, type, NULL); 5399 if (error != 0) 5400 return (error); 5401 } 5402 5403 /* 5404 * Drop the namespace lock for the rest of the function. 5405 */ 5406 spa->spa_load_thread = curthread; 5407 mutex_exit(&spa_namespace_lock); 5408 5409 /* 5410 * Retrieve the checkpoint txg if the pool has a checkpoint. 5411 */ 5412 spa_import_progress_set_notes(spa, "Loading checkpoint txg"); 5413 error = spa_ld_read_checkpoint_txg(spa); 5414 if (error != 0) 5415 goto fail; 5416 5417 /* 5418 * Retrieve the mapping of indirect vdevs. Those vdevs were removed 5419 * from the pool and their contents were re-mapped to other vdevs. Note 5420 * that everything that we read before this step must have been 5421 * rewritten on concrete vdevs after the last device removal was 5422 * initiated. Otherwise we could be reading from indirect vdevs before 5423 * we have loaded their mappings. 5424 */ 5425 spa_import_progress_set_notes(spa, "Loading indirect vdev metadata"); 5426 error = spa_ld_open_indirect_vdev_metadata(spa); 5427 if (error != 0) 5428 goto fail; 5429 5430 /* 5431 * Retrieve the full list of active features from the MOS and check if 5432 * they are all supported. 5433 */ 5434 spa_import_progress_set_notes(spa, "Checking feature flags"); 5435 error = spa_ld_check_features(spa, &missing_feat_write); 5436 if (error != 0) 5437 goto fail; 5438 5439 /* 5440 * Load several special directories from the MOS needed by the dsl_pool 5441 * layer. 5442 */ 5443 spa_import_progress_set_notes(spa, "Loading special MOS directories"); 5444 error = spa_ld_load_special_directories(spa); 5445 if (error != 0) 5446 goto fail; 5447 5448 /* 5449 * Retrieve pool properties from the MOS. 5450 */ 5451 spa_import_progress_set_notes(spa, "Loading properties"); 5452 error = spa_ld_get_props(spa); 5453 if (error != 0) 5454 goto fail; 5455 5456 /* 5457 * Retrieve the list of auxiliary devices - cache devices and spares - 5458 * and open them. 5459 */ 5460 spa_import_progress_set_notes(spa, "Loading AUX vdevs"); 5461 error = spa_ld_open_aux_vdevs(spa, type); 5462 if (error != 0) 5463 goto fail; 5464 5465 /* 5466 * Load the metadata for all vdevs. Also check if unopenable devices 5467 * should be autoreplaced. 5468 */ 5469 spa_import_progress_set_notes(spa, "Loading vdev metadata"); 5470 error = spa_ld_load_vdev_metadata(spa); 5471 if (error != 0) 5472 goto fail; 5473 5474 spa_import_progress_set_notes(spa, "Loading dedup tables"); 5475 error = spa_ld_load_dedup_tables(spa); 5476 if (error != 0) 5477 goto fail; 5478 5479 spa_import_progress_set_notes(spa, "Loading BRT"); 5480 error = spa_ld_load_brt(spa); 5481 if (error != 0) 5482 goto fail; 5483 5484 /* 5485 * Verify the logs now to make sure we don't have any unexpected errors 5486 * when we claim log blocks later. 
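 * If top-level vdevs are known to be missing, a failed log check is
 * tolerated and the logs are dropped instead (see spa_ld_verify_logs).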
5487 */ 5488 spa_import_progress_set_notes(spa, "Verifying Log Devices"); 5489 error = spa_ld_verify_logs(spa, type, ereport); 5490 if (error != 0) 5491 goto fail; 5492 5493 if (missing_feat_write) { 5494 ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT); 5495 5496 /* 5497 * At this point, we know that we can open the pool in 5498 * read-only mode but not read-write mode. We now have enough 5499 * information and can return to userland. 5500 */ 5501 error = spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, 5502 ENOTSUP); 5503 goto fail; 5504 } 5505 5506 /* 5507 * Traverse the last txgs to make sure the pool was left off in a safe 5508 * state. When performing an extreme rewind, we verify the whole pool, 5509 * which can take a very long time. 5510 */ 5511 spa_import_progress_set_notes(spa, "Verifying pool data"); 5512 error = spa_ld_verify_pool_data(spa); 5513 if (error != 0) 5514 goto fail; 5515 5516 /* 5517 * Calculate the deflated space for the pool. This must be done before 5518 * we write anything to the pool because we'd need to update the space 5519 * accounting using the deflated sizes. 5520 */ 5521 spa_import_progress_set_notes(spa, "Calculating deflated space"); 5522 spa_update_dspace(spa); 5523 5524 /* 5525 * We have now retrieved all the information we needed to open the 5526 * pool. If we are importing the pool in read-write mode, a few 5527 * additional steps must be performed to finish the import. 5528 */ 5529 spa_import_progress_set_notes(spa, "Starting import"); 5530 if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER || 5531 spa->spa_load_max_txg == UINT64_MAX)) { 5532 uint64_t config_cache_txg = spa->spa_config_txg; 5533 5534 ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); 5535 5536 /* 5537 * Before we do any zio_write's, complete the raidz expansion 5538 * scratch space copying, if necessary. 5539 */ 5540 if (RRSS_GET_STATE(&spa->spa_uberblock) == RRSS_SCRATCH_VALID) 5541 vdev_raidz_reflow_copy_scratch(spa); 5542 5543 /* 5544 * In case of a checkpoint rewind, log the original txg 5545 * of the checkpointed uberblock. 5546 */ 5547 if (checkpoint_rewind) { 5548 spa_history_log_internal(spa, "checkpoint rewind", 5549 NULL, "rewound state to txg=%llu", 5550 (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg); 5551 } 5552 5553 spa_import_progress_set_notes(spa, "Claiming ZIL blocks"); 5554 /* 5555 * Traverse the ZIL and claim all blocks. 5556 */ 5557 spa_ld_claim_log_blocks(spa); 5558 5559 /* 5560 * Kick-off the syncing thread. 5561 */ 5562 spa->spa_sync_on = B_TRUE; 5563 txg_sync_start(spa->spa_dsl_pool); 5564 mmp_thread_start(spa); 5565 5566 /* 5567 * Wait for all claims to sync. We sync up to the highest 5568 * claimed log block birth time so that claimed log blocks 5569 * don't appear to be from the future. spa_claim_max_txg 5570 * will have been set for us by ZIL traversal operations 5571 * performed above. 5572 */ 5573 spa_import_progress_set_notes(spa, "Syncing ZIL claims"); 5574 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 5575 5576 /* 5577 * Check if we need to request an update of the config. On the 5578 * next sync, we would update the config stored in vdev labels 5579 * and the cachefile (by default /etc/zfs/zpool.cache). 5580 */ 5581 spa_import_progress_set_notes(spa, "Updating configs"); 5582 spa_ld_check_for_config_update(spa, config_cache_txg, 5583 update_config_cache); 5584 5585 /* 5586 * Check if a rebuild was in progress and if so resume it. 5587 * Then check all DTLs to see if anything needs resilvering. 
5588 * The resilver will be deferred if a rebuild was started. 5589 */ 5590 spa_import_progress_set_notes(spa, "Starting resilvers"); 5591 if (vdev_rebuild_active(spa->spa_root_vdev)) { 5592 vdev_rebuild_restart(spa); 5593 } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 5594 vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 5595 spa_async_request(spa, SPA_ASYNC_RESILVER); 5596 } 5597 5598 /* 5599 * Log the fact that we booted up (so that we can detect if 5600 * we rebooted in the middle of an operation). 5601 */ 5602 spa_history_log_version(spa, "open", NULL); 5603 5604 spa_import_progress_set_notes(spa, 5605 "Restarting device removals"); 5606 spa_restart_removal(spa); 5607 spa_spawn_aux_threads(spa); 5608 5609 /* 5610 * Delete any inconsistent datasets. 5611 * 5612 * Note: 5613 * Since we may be issuing deletes for clones here, 5614 * we make sure to do so after we've spawned all the 5615 * auxiliary threads above (the livelist deletion 5616 * zthr is one of them). 5617 */ 5618 spa_import_progress_set_notes(spa, 5619 "Cleaning up inconsistent objsets"); 5620 (void) dmu_objset_find(spa_name(spa), 5621 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 5622 5623 /* 5624 * Clean up any stale temporary dataset userrefs. 5625 */ 5626 spa_import_progress_set_notes(spa, 5627 "Cleaning up temporary userrefs"); 5628 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 5629 5630 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5631 spa_import_progress_set_notes(spa, "Restarting initialize"); 5632 vdev_initialize_restart(spa->spa_root_vdev); 5633 spa_import_progress_set_notes(spa, "Restarting TRIM"); 5634 vdev_trim_restart(spa->spa_root_vdev); 5635 vdev_autotrim_restart(spa); 5636 spa_config_exit(spa, SCL_CONFIG, FTAG); 5637 spa_import_progress_set_notes(spa, "Finished importing"); 5638 } 5639 zio_handle_import_delay(spa, gethrtime() - load_start); 5640 5641 spa_import_progress_remove(spa_guid(spa)); 5642 spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); 5643 5644 spa_load_note(spa, "LOADED"); 5645 fail: 5646 mutex_enter(&spa_namespace_lock); 5647 spa->spa_load_thread = NULL; 5648 cv_broadcast(&spa_namespace_cv); 5649 5650 return (error); 5651 5652 } 5653 5654 static int 5655 spa_load_retry(spa_t *spa, spa_load_state_t state) 5656 { 5657 spa_mode_t mode = spa->spa_mode; 5658 5659 spa_unload(spa); 5660 spa_deactivate(spa); 5661 5662 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; 5663 5664 spa_activate(spa, mode); 5665 spa_async_suspend(spa); 5666 5667 spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu", 5668 (u_longlong_t)spa->spa_load_max_txg); 5669 5670 return (spa_load(spa, state, SPA_IMPORT_EXISTING)); 5671 } 5672 5673 /* 5674 * If spa_load() fails this function will try loading prior txgs. If 5675 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool 5676 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this 5677 * function will not rewind the pool and will return the same error as 5678 * spa_load().
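* Each retry (see spa_load_retry()) lowers spa_load_max_txg to one less than the txg of the uberblock just tried, so successive attempts walk backwards through older uberblocks. Without ZPOOL_EXTREME_REWIND the walk stops once the txg drops below spa_last_ubsync_txg minus TXG_DEFER_SIZE; with it, the walk may continue all the way back to TXG_INITIAL.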
5679 */ 5680 static int 5681 spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request, 5682 int rewind_flags) 5683 { 5684 nvlist_t *loadinfo = NULL; 5685 nvlist_t *config = NULL; 5686 int load_error, rewind_error; 5687 uint64_t safe_rewind_txg; 5688 uint64_t min_txg; 5689 5690 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 5691 spa->spa_load_max_txg = spa->spa_load_txg; 5692 spa_set_log_state(spa, SPA_LOG_CLEAR); 5693 } else { 5694 spa->spa_load_max_txg = max_request; 5695 if (max_request != UINT64_MAX) 5696 spa->spa_extreme_rewind = B_TRUE; 5697 } 5698 5699 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING); 5700 if (load_error == 0) 5701 return (0); 5702 if (load_error == ZFS_ERR_NO_CHECKPOINT) { 5703 /* 5704 * When attempting checkpoint-rewind on a pool with no 5705 * checkpoint, we should not attempt to load uberblocks 5706 * from previous txgs when spa_load fails. 5707 */ 5708 ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 5709 spa_import_progress_remove(spa_guid(spa)); 5710 return (load_error); 5711 } 5712 5713 if (spa->spa_root_vdev != NULL) 5714 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 5715 5716 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 5717 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 5718 5719 if (rewind_flags & ZPOOL_NEVER_REWIND) { 5720 nvlist_free(config); 5721 spa_import_progress_remove(spa_guid(spa)); 5722 return (load_error); 5723 } 5724 5725 if (state == SPA_LOAD_RECOVER) { 5726 /* Price of rolling back is discarding txgs, including log */ 5727 spa_set_log_state(spa, SPA_LOG_CLEAR); 5728 } else { 5729 /* 5730 * If we aren't rolling back save the load info from our first 5731 * import attempt so that we can restore it after attempting 5732 * to rewind. 5733 */ 5734 loadinfo = spa->spa_load_info; 5735 spa->spa_load_info = fnvlist_alloc(); 5736 } 5737 5738 spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 5739 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 5740 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 5741 TXG_INITIAL : safe_rewind_txg; 5742 5743 /* 5744 * Continue as long as we're finding errors, we're still within 5745 * the acceptable rewind range, and we're still finding uberblocks 5746 */ 5747 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 5748 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 5749 if (spa->spa_load_max_txg < safe_rewind_txg) 5750 spa->spa_extreme_rewind = B_TRUE; 5751 rewind_error = spa_load_retry(spa, state); 5752 } 5753 5754 spa->spa_extreme_rewind = B_FALSE; 5755 spa->spa_load_max_txg = UINT64_MAX; 5756 5757 if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 5758 spa_config_set(spa, config); 5759 else 5760 nvlist_free(config); 5761 5762 if (state == SPA_LOAD_RECOVER) { 5763 ASSERT3P(loadinfo, ==, NULL); 5764 spa_import_progress_remove(spa_guid(spa)); 5765 return (rewind_error); 5766 } else { 5767 /* Store the rewind info as part of the initial load info */ 5768 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, 5769 spa->spa_load_info); 5770 5771 /* Restore the initial load info */ 5772 fnvlist_free(spa->spa_load_info); 5773 spa->spa_load_info = loadinfo; 5774 5775 spa_import_progress_remove(spa_guid(spa)); 5776 return (load_error); 5777 } 5778 } 5779 5780 /* 5781 * Pool Open/Import 5782 * 5783 * The import case is identical to an open except that the configuration is sent 5784 * down from userland, instead of grabbed from the configuration cache. 
For the 5785 * case of an open, the pool configuration will exist in the 5786 * POOL_STATE_UNINITIALIZED state. 5787 * 5788 * The stats information (gen/count/ustats) is used to gather vdev statistics at 5789 * the same time as opening the pool, without having to keep around the spa_t in some 5790 * ambiguous state. 5791 */ 5792 static int 5793 spa_open_common(const char *pool, spa_t **spapp, const void *tag, 5794 nvlist_t *nvpolicy, nvlist_t **config) 5795 { 5796 spa_t *spa; 5797 spa_load_state_t state = SPA_LOAD_OPEN; 5798 int error; 5799 int locked = B_FALSE; 5800 int firstopen = B_FALSE; 5801 5802 *spapp = NULL; 5803 5804 /* 5805 * As disgusting as this is, we need to support recursive calls to this 5806 * function because dsl_dir_open() is called during spa_load(), and ends 5807 * up calling spa_open() again. The real fix is to figure out how to 5808 * avoid dsl_dir_open() calling this in the first place. 5809 */ 5810 if (MUTEX_NOT_HELD(&spa_namespace_lock)) { 5811 mutex_enter(&spa_namespace_lock); 5812 locked = B_TRUE; 5813 } 5814 5815 if ((spa = spa_lookup(pool)) == NULL) { 5816 if (locked) 5817 mutex_exit(&spa_namespace_lock); 5818 return (SET_ERROR(ENOENT)); 5819 } 5820 5821 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 5822 zpool_load_policy_t policy; 5823 5824 firstopen = B_TRUE; 5825 5826 zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config, 5827 &policy); 5828 if (policy.zlp_rewind & ZPOOL_DO_REWIND) 5829 state = SPA_LOAD_RECOVER; 5830 5831 spa_activate(spa, spa_mode_global); 5832 5833 if (state != SPA_LOAD_RECOVER) 5834 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 5835 spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; 5836 5837 zfs_dbgmsg("spa_open_common: opening %s", pool); 5838 error = spa_load_best(spa, state, policy.zlp_txg, 5839 policy.zlp_rewind); 5840 5841 if (error == EBADF) { 5842 /* 5843 * If vdev_validate() returns failure (indicated by 5844 * EBADF), it means that one of the vdevs indicates 5845 * that the pool has been exported or destroyed. If 5846 * this is the case, the config cache is out of sync and 5847 * we should remove the pool from the namespace. 5848 */ 5849 spa_unload(spa); 5850 spa_deactivate(spa); 5851 spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE); 5852 spa_remove(spa); 5853 if (locked) 5854 mutex_exit(&spa_namespace_lock); 5855 return (SET_ERROR(ENOENT)); 5856 } 5857 5858 if (error) { 5859 /* 5860 * We can't open the pool, but we still have useful 5861 * information: the state of each vdev after the 5862 * attempted vdev_open(). Return this to the user. 5863 */ 5864 if (config != NULL && spa->spa_config) { 5865 *config = fnvlist_dup(spa->spa_config); 5866 fnvlist_add_nvlist(*config, 5867 ZPOOL_CONFIG_LOAD_INFO, 5868 spa->spa_load_info); 5869 } 5870 spa_unload(spa); 5871 spa_deactivate(spa); 5872 spa->spa_last_open_failed = error; 5873 if (locked) 5874 mutex_exit(&spa_namespace_lock); 5875 *spapp = NULL; 5876 return (error); 5877 } 5878 } 5879 5880 spa_open_ref(spa, tag); 5881 5882 if (config != NULL) 5883 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 5884 5885 /* 5886 * If we've recovered the pool, pass back any information we 5887 * gathered while doing the load.
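* (i.e. whatever was recorded in spa_load_info during the load attempts.)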
5888 */ 5889 if (state == SPA_LOAD_RECOVER && config != NULL) { 5890 fnvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 5891 spa->spa_load_info); 5892 } 5893 5894 if (locked) { 5895 spa->spa_last_open_failed = 0; 5896 spa->spa_last_ubsync_txg = 0; 5897 spa->spa_load_txg = 0; 5898 mutex_exit(&spa_namespace_lock); 5899 } 5900 5901 if (firstopen) 5902 zvol_create_minors_recursive(spa_name(spa)); 5903 5904 *spapp = spa; 5905 5906 return (0); 5907 } 5908 5909 int 5910 spa_open_rewind(const char *name, spa_t **spapp, const void *tag, 5911 nvlist_t *policy, nvlist_t **config) 5912 { 5913 return (spa_open_common(name, spapp, tag, policy, config)); 5914 } 5915 5916 int 5917 spa_open(const char *name, spa_t **spapp, const void *tag) 5918 { 5919 return (spa_open_common(name, spapp, tag, NULL, NULL)); 5920 } 5921 5922 /* 5923 * Lookup the given spa_t, incrementing the inject count in the process, 5924 * preventing it from being exported or destroyed. 5925 */ 5926 spa_t * 5927 spa_inject_addref(char *name) 5928 { 5929 spa_t *spa; 5930 5931 mutex_enter(&spa_namespace_lock); 5932 if ((spa = spa_lookup(name)) == NULL) { 5933 mutex_exit(&spa_namespace_lock); 5934 return (NULL); 5935 } 5936 spa->spa_inject_ref++; 5937 mutex_exit(&spa_namespace_lock); 5938 5939 return (spa); 5940 } 5941 5942 void 5943 spa_inject_delref(spa_t *spa) 5944 { 5945 mutex_enter(&spa_namespace_lock); 5946 spa->spa_inject_ref--; 5947 mutex_exit(&spa_namespace_lock); 5948 } 5949 5950 /* 5951 * Add spares device information to the nvlist. 5952 */ 5953 static void 5954 spa_add_spares(spa_t *spa, nvlist_t *config) 5955 { 5956 nvlist_t **spares; 5957 uint_t i, nspares; 5958 nvlist_t *nvroot; 5959 uint64_t guid; 5960 vdev_stat_t *vs; 5961 uint_t vsc; 5962 uint64_t pool; 5963 5964 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 5965 5966 if (spa->spa_spares.sav_count == 0) 5967 return; 5968 5969 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 5970 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 5971 ZPOOL_CONFIG_SPARES, &spares, &nspares)); 5972 if (nspares != 0) { 5973 fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 5974 (const nvlist_t * const *)spares, nspares); 5975 VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 5976 &spares, &nspares)); 5977 5978 /* 5979 * Go through and find any spares which have since been 5980 * repurposed as an active spare. If this is the case, update 5981 * their status appropriately. 5982 */ 5983 for (i = 0; i < nspares; i++) { 5984 guid = fnvlist_lookup_uint64(spares[i], 5985 ZPOOL_CONFIG_GUID); 5986 VERIFY0(nvlist_lookup_uint64_array(spares[i], 5987 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)); 5988 if (spa_spare_exists(guid, &pool, NULL) && 5989 pool != 0ULL) { 5990 vs->vs_state = VDEV_STATE_CANT_OPEN; 5991 vs->vs_aux = VDEV_AUX_SPARED; 5992 } else { 5993 vs->vs_state = 5994 spa->spa_spares.sav_vdevs[i]->vdev_state; 5995 } 5996 } 5997 } 5998 } 5999 6000 /* 6001 * Add l2cache device information to the nvlist, including vdev stats. 
6002 */ 6003 static void 6004 spa_add_l2cache(spa_t *spa, nvlist_t *config) 6005 { 6006 nvlist_t **l2cache; 6007 uint_t i, j, nl2cache; 6008 nvlist_t *nvroot; 6009 uint64_t guid; 6010 vdev_t *vd; 6011 vdev_stat_t *vs; 6012 uint_t vsc; 6013 6014 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 6015 6016 if (spa->spa_l2cache.sav_count == 0) 6017 return; 6018 6019 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 6020 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 6021 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); 6022 if (nl2cache != 0) { 6023 fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 6024 (const nvlist_t * const *)l2cache, nl2cache); 6025 VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 6026 &l2cache, &nl2cache)); 6027 6028 /* 6029 * Update level 2 cache device stats. 6030 */ 6031 6032 for (i = 0; i < nl2cache; i++) { 6033 guid = fnvlist_lookup_uint64(l2cache[i], 6034 ZPOOL_CONFIG_GUID); 6035 6036 vd = NULL; 6037 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 6038 if (guid == 6039 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 6040 vd = spa->spa_l2cache.sav_vdevs[j]; 6041 break; 6042 } 6043 } 6044 ASSERT(vd != NULL); 6045 6046 VERIFY0(nvlist_lookup_uint64_array(l2cache[i], 6047 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)); 6048 vdev_get_stats(vd, vs); 6049 vdev_config_generate_stats(vd, l2cache[i]); 6050 6051 } 6052 } 6053 } 6054 6055 static void 6056 spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features) 6057 { 6058 zap_cursor_t zc; 6059 zap_attribute_t *za = zap_attribute_alloc(); 6060 6061 if (spa->spa_feat_for_read_obj != 0) { 6062 for (zap_cursor_init(&zc, spa->spa_meta_objset, 6063 spa->spa_feat_for_read_obj); 6064 zap_cursor_retrieve(&zc, za) == 0; 6065 zap_cursor_advance(&zc)) { 6066 ASSERT(za->za_integer_length == sizeof (uint64_t) && 6067 za->za_num_integers == 1); 6068 VERIFY0(nvlist_add_uint64(features, za->za_name, 6069 za->za_first_integer)); 6070 } 6071 zap_cursor_fini(&zc); 6072 } 6073 6074 if (spa->spa_feat_for_write_obj != 0) { 6075 for (zap_cursor_init(&zc, spa->spa_meta_objset, 6076 spa->spa_feat_for_write_obj); 6077 zap_cursor_retrieve(&zc, za) == 0; 6078 zap_cursor_advance(&zc)) { 6079 ASSERT(za->za_integer_length == sizeof (uint64_t) && 6080 za->za_num_integers == 1); 6081 VERIFY0(nvlist_add_uint64(features, za->za_name, 6082 za->za_first_integer)); 6083 } 6084 zap_cursor_fini(&zc); 6085 } 6086 zap_attribute_free(za); 6087 } 6088 6089 static void 6090 spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features) 6091 { 6092 int i; 6093 6094 for (i = 0; i < SPA_FEATURES; i++) { 6095 zfeature_info_t feature = spa_feature_table[i]; 6096 uint64_t refcount; 6097 6098 if (feature_get_refcount(spa, &feature, &refcount) != 0) 6099 continue; 6100 6101 VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount)); 6102 } 6103 } 6104 6105 /* 6106 * Store a list of pool features and their reference counts in the 6107 * config. 6108 * 6109 * The first time this is called on a spa, allocate a new nvlist, fetch 6110 * the pool features and reference counts from disk, then save the list 6111 * in the spa. In subsequent calls on the same spa use the saved nvlist 6112 * and refresh its values from the cached reference counts. This 6113 * ensures we don't block here on I/O on a suspended pool so 'zpool 6114 * clear' can resume the pool. 
6115 */ 6116 static void 6117 spa_add_feature_stats(spa_t *spa, nvlist_t *config) 6118 { 6119 nvlist_t *features; 6120 6121 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 6122 6123 mutex_enter(&spa->spa_feat_stats_lock); 6124 features = spa->spa_feat_stats; 6125 6126 if (features != NULL) { 6127 spa_feature_stats_from_cache(spa, features); 6128 } else { 6129 VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP)); 6130 spa->spa_feat_stats = features; 6131 spa_feature_stats_from_disk(spa, features); 6132 } 6133 6134 VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 6135 features)); 6136 6137 mutex_exit(&spa->spa_feat_stats_lock); 6138 } 6139 6140 int 6141 spa_get_stats(const char *name, nvlist_t **config, 6142 char *altroot, size_t buflen) 6143 { 6144 int error; 6145 spa_t *spa; 6146 6147 *config = NULL; 6148 error = spa_open_common(name, &spa, FTAG, NULL, config); 6149 6150 if (spa != NULL) { 6151 /* 6152 * This still leaves a window of inconsistency where the spares 6153 * or l2cache devices could change and the config would be 6154 * self-inconsistent. 6155 */ 6156 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6157 6158 if (*config != NULL) { 6159 uint64_t loadtimes[2]; 6160 6161 loadtimes[0] = spa->spa_loaded_ts.tv_sec; 6162 loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 6163 fnvlist_add_uint64_array(*config, 6164 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2); 6165 6166 fnvlist_add_uint64(*config, 6167 ZPOOL_CONFIG_ERRCOUNT, 6168 spa_approx_errlog_size(spa)); 6169 6170 if (spa_suspended(spa)) { 6171 fnvlist_add_uint64(*config, 6172 ZPOOL_CONFIG_SUSPENDED, 6173 spa->spa_failmode); 6174 fnvlist_add_uint64(*config, 6175 ZPOOL_CONFIG_SUSPENDED_REASON, 6176 spa->spa_suspended); 6177 } 6178 6179 spa_add_spares(spa, *config); 6180 spa_add_l2cache(spa, *config); 6181 spa_add_feature_stats(spa, *config); 6182 } 6183 } 6184 6185 /* 6186 * We want to get the alternate root even for faulted pools, so we cheat 6187 * and call spa_lookup() directly. 6188 */ 6189 if (altroot) { 6190 if (spa == NULL) { 6191 mutex_enter(&spa_namespace_lock); 6192 spa = spa_lookup(name); 6193 if (spa) 6194 spa_altroot(spa, altroot, buflen); 6195 else 6196 altroot[0] = '\0'; 6197 spa = NULL; 6198 mutex_exit(&spa_namespace_lock); 6199 } else { 6200 spa_altroot(spa, altroot, buflen); 6201 } 6202 } 6203 6204 if (spa != NULL) { 6205 spa_config_exit(spa, SCL_CONFIG, FTAG); 6206 spa_close(spa, FTAG); 6207 } 6208 6209 return (error); 6210 } 6211 6212 /* 6213 * Validate that the auxiliary device array is well formed. We must have an 6214 * array of nvlists, each which describes a valid leaf vdev. If this is an 6215 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 6216 * specified, as long as they are well-formed. 6217 */ 6218 static int 6219 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 6220 spa_aux_vdev_t *sav, const char *config, uint64_t version, 6221 vdev_labeltype_t label) 6222 { 6223 nvlist_t **dev; 6224 uint_t i, ndev; 6225 vdev_t *vd; 6226 int error; 6227 6228 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 6229 6230 /* 6231 * It's acceptable to have no devs specified. 6232 */ 6233 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 6234 return (0); 6235 6236 if (ndev == 0) 6237 return (SET_ERROR(EINVAL)); 6238 6239 /* 6240 * Make sure the pool is formatted with a version that supports this 6241 * device type. 
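* (spa_validate_aux() passes SPA_VERSION_SPARES or SPA_VERSION_L2CACHE here, depending on which aux device list is being validated.)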
6242 */ 6243 if (spa_version(spa) < version) 6244 return (SET_ERROR(ENOTSUP)); 6245 6246 /* 6247 * Set the pending device list so we correctly handle device in-use 6248 * checking. 6249 */ 6250 sav->sav_pending = dev; 6251 sav->sav_npending = ndev; 6252 6253 for (i = 0; i < ndev; i++) { 6254 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 6255 mode)) != 0) 6256 goto out; 6257 6258 if (!vd->vdev_ops->vdev_op_leaf) { 6259 vdev_free(vd); 6260 error = SET_ERROR(EINVAL); 6261 goto out; 6262 } 6263 6264 vd->vdev_top = vd; 6265 6266 if ((error = vdev_open(vd)) == 0 && 6267 (error = vdev_label_init(vd, crtxg, label)) == 0) { 6268 fnvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 6269 vd->vdev_guid); 6270 } 6271 6272 vdev_free(vd); 6273 6274 if (error && 6275 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 6276 goto out; 6277 else 6278 error = 0; 6279 } 6280 6281 out: 6282 sav->sav_pending = NULL; 6283 sav->sav_npending = 0; 6284 return (error); 6285 } 6286 6287 static int 6288 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 6289 { 6290 int error; 6291 6292 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 6293 6294 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 6295 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 6296 VDEV_LABEL_SPARE)) != 0) { 6297 return (error); 6298 } 6299 6300 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 6301 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 6302 VDEV_LABEL_L2CACHE)); 6303 } 6304 6305 static void 6306 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 6307 const char *config) 6308 { 6309 int i; 6310 6311 if (sav->sav_config != NULL) { 6312 nvlist_t **olddevs; 6313 uint_t oldndevs; 6314 nvlist_t **newdevs; 6315 6316 /* 6317 * Generate new dev list by concatenating with the 6318 * current dev list. 6319 */ 6320 VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, config, 6321 &olddevs, &oldndevs)); 6322 6323 newdevs = kmem_alloc(sizeof (void *) * 6324 (ndevs + oldndevs), KM_SLEEP); 6325 for (i = 0; i < oldndevs; i++) 6326 newdevs[i] = fnvlist_dup(olddevs[i]); 6327 for (i = 0; i < ndevs; i++) 6328 newdevs[i + oldndevs] = fnvlist_dup(devs[i]); 6329 6330 fnvlist_remove(sav->sav_config, config); 6331 6332 fnvlist_add_nvlist_array(sav->sav_config, config, 6333 (const nvlist_t * const *)newdevs, ndevs + oldndevs); 6334 for (i = 0; i < oldndevs + ndevs; i++) 6335 nvlist_free(newdevs[i]); 6336 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 6337 } else { 6338 /* 6339 * Generate a new dev list. 6340 */ 6341 sav->sav_config = fnvlist_alloc(); 6342 fnvlist_add_nvlist_array(sav->sav_config, config, 6343 (const nvlist_t * const *)devs, ndevs); 6344 } 6345 } 6346 6347 /* 6348 * Stop and drop level 2 ARC devices 6349 */ 6350 void 6351 spa_l2cache_drop(spa_t *spa) 6352 { 6353 vdev_t *vd; 6354 int i; 6355 spa_aux_vdev_t *sav = &spa->spa_l2cache; 6356 6357 for (i = 0; i < sav->sav_count; i++) { 6358 uint64_t pool; 6359 6360 vd = sav->sav_vdevs[i]; 6361 ASSERT(vd != NULL); 6362 6363 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 6364 pool != 0ULL && l2arc_vdev_present(vd)) 6365 l2arc_remove_vdev(vd); 6366 } 6367 } 6368 6369 /* 6370 * Verify encryption parameters for spa creation. If we are encrypting, we must 6371 * have the encryption feature flag enabled. 
6372 */ 6373 static int 6374 spa_create_check_encryption_params(dsl_crypto_params_t *dcp, 6375 boolean_t has_encryption) 6376 { 6377 if (dcp->cp_crypt != ZIO_CRYPT_OFF && 6378 dcp->cp_crypt != ZIO_CRYPT_INHERIT && 6379 !has_encryption) 6380 return (SET_ERROR(ENOTSUP)); 6381 6382 return (dmu_objset_create_crypt_check(NULL, dcp, NULL)); 6383 } 6384 6385 /* 6386 * Pool Creation 6387 */ 6388 int 6389 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 6390 nvlist_t *zplprops, dsl_crypto_params_t *dcp) 6391 { 6392 spa_t *spa; 6393 const char *altroot = NULL; 6394 vdev_t *rvd; 6395 dsl_pool_t *dp; 6396 dmu_tx_t *tx; 6397 int error = 0; 6398 uint64_t txg = TXG_INITIAL; 6399 nvlist_t **spares, **l2cache; 6400 uint_t nspares, nl2cache; 6401 uint64_t version, obj, ndraid = 0; 6402 boolean_t has_features; 6403 boolean_t has_encryption; 6404 boolean_t has_allocclass; 6405 spa_feature_t feat; 6406 const char *feat_name; 6407 const char *poolname; 6408 nvlist_t *nvl; 6409 6410 if (props == NULL || 6411 nvlist_lookup_string(props, 6412 zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0) 6413 poolname = (char *)pool; 6414 6415 /* 6416 * If this pool already exists, return failure. 6417 */ 6418 mutex_enter(&spa_namespace_lock); 6419 if (spa_lookup(poolname) != NULL) { 6420 mutex_exit(&spa_namespace_lock); 6421 return (SET_ERROR(EEXIST)); 6422 } 6423 6424 /* 6425 * Allocate a new spa_t structure. 6426 */ 6427 nvl = fnvlist_alloc(); 6428 fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool); 6429 (void) nvlist_lookup_string(props, 6430 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 6431 spa = spa_add(poolname, nvl, altroot); 6432 fnvlist_free(nvl); 6433 spa_activate(spa, spa_mode_global); 6434 6435 if (props && (error = spa_prop_validate(spa, props))) { 6436 spa_deactivate(spa); 6437 spa_remove(spa); 6438 mutex_exit(&spa_namespace_lock); 6439 return (error); 6440 } 6441 6442 /* 6443 * Temporary pool names should never be written to disk. 
6444 */ 6445 if (poolname != pool) 6446 spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME; 6447 6448 has_features = B_FALSE; 6449 has_encryption = B_FALSE; 6450 has_allocclass = B_FALSE; 6451 for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 6452 elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 6453 if (zpool_prop_feature(nvpair_name(elem))) { 6454 has_features = B_TRUE; 6455 6456 feat_name = strchr(nvpair_name(elem), '@') + 1; 6457 VERIFY0(zfeature_lookup_name(feat_name, &feat)); 6458 if (feat == SPA_FEATURE_ENCRYPTION) 6459 has_encryption = B_TRUE; 6460 if (feat == SPA_FEATURE_ALLOCATION_CLASSES) 6461 has_allocclass = B_TRUE; 6462 } 6463 } 6464 6465 /* verify encryption params, if they were provided */ 6466 if (dcp != NULL) { 6467 error = spa_create_check_encryption_params(dcp, has_encryption); 6468 if (error != 0) { 6469 spa_deactivate(spa); 6470 spa_remove(spa); 6471 mutex_exit(&spa_namespace_lock); 6472 return (error); 6473 } 6474 } 6475 if (!has_allocclass && zfs_special_devs(nvroot, NULL)) { 6476 spa_deactivate(spa); 6477 spa_remove(spa); 6478 mutex_exit(&spa_namespace_lock); 6479 return (ENOTSUP); 6480 } 6481 6482 if (has_features || nvlist_lookup_uint64(props, 6483 zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 6484 version = SPA_VERSION; 6485 } 6486 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 6487 6488 spa->spa_first_txg = txg; 6489 spa->spa_uberblock.ub_txg = txg - 1; 6490 spa->spa_uberblock.ub_version = version; 6491 spa->spa_ubsync = spa->spa_uberblock; 6492 spa->spa_load_state = SPA_LOAD_CREATE; 6493 spa->spa_removing_phys.sr_state = DSS_NONE; 6494 spa->spa_removing_phys.sr_removing_vdev = -1; 6495 spa->spa_removing_phys.sr_prev_indirect_vdev = -1; 6496 spa->spa_indirect_vdevs_loaded = B_TRUE; 6497 6498 /* 6499 * Create "The Godfather" zio to hold all async IOs 6500 */ 6501 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 6502 KM_SLEEP); 6503 for (int i = 0; i < max_ncpus; i++) { 6504 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 6505 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 6506 ZIO_FLAG_GODFATHER); 6507 } 6508 6509 /* 6510 * Create the root vdev. 6511 */ 6512 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6513 6514 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 6515 6516 ASSERT(error != 0 || rvd != NULL); 6517 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 6518 6519 if (error == 0 && !zfs_allocatable_devs(nvroot)) 6520 error = SET_ERROR(EINVAL); 6521 6522 if (error == 0 && 6523 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 6524 (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 && 6525 (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { 6526 /* 6527 * instantiate the metaslab groups (this will dirty the vdevs) 6528 * we can no longer error exit past this point 6529 */ 6530 for (int c = 0; error == 0 && c < rvd->vdev_children; c++) { 6531 vdev_t *vd = rvd->vdev_child[c]; 6532 6533 vdev_metaslab_set_size(vd); 6534 vdev_expand(vd, txg); 6535 } 6536 } 6537 6538 spa_config_exit(spa, SCL_ALL, FTAG); 6539 6540 if (error != 0) { 6541 spa_unload(spa); 6542 spa_deactivate(spa); 6543 spa_remove(spa); 6544 mutex_exit(&spa_namespace_lock); 6545 return (error); 6546 } 6547 6548 /* 6549 * Get the list of spares, if specified. 
6550 */ 6551 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 6552 &spares, &nspares) == 0) { 6553 spa->spa_spares.sav_config = fnvlist_alloc(); 6554 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 6555 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, 6556 nspares); 6557 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6558 spa_load_spares(spa); 6559 spa_config_exit(spa, SCL_ALL, FTAG); 6560 spa->spa_spares.sav_sync = B_TRUE; 6561 } 6562 6563 /* 6564 * Get the list of level 2 cache devices, if specified. 6565 */ 6566 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 6567 &l2cache, &nl2cache) == 0) { 6568 VERIFY0(nvlist_alloc(&spa->spa_l2cache.sav_config, 6569 NV_UNIQUE_NAME, KM_SLEEP)); 6570 fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 6571 ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, 6572 nl2cache); 6573 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6574 spa_load_l2cache(spa); 6575 spa_config_exit(spa, SCL_ALL, FTAG); 6576 spa->spa_l2cache.sav_sync = B_TRUE; 6577 } 6578 6579 spa->spa_is_initializing = B_TRUE; 6580 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, dcp, txg); 6581 spa->spa_is_initializing = B_FALSE; 6582 6583 /* 6584 * Create DDTs (dedup tables). 6585 */ 6586 ddt_create(spa); 6587 /* 6588 * Create BRT table and BRT table object. 6589 */ 6590 brt_create(spa); 6591 6592 spa_update_dspace(spa); 6593 6594 tx = dmu_tx_create_assigned(dp, txg); 6595 6596 /* 6597 * Create the pool's history object. 6598 */ 6599 if (version >= SPA_VERSION_ZPOOL_HISTORY && !spa->spa_history) 6600 spa_history_create_obj(spa, tx); 6601 6602 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE); 6603 spa_history_log_version(spa, "create", tx); 6604 6605 /* 6606 * Create the pool config object. 6607 */ 6608 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 6609 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 6610 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 6611 6612 if (zap_add(spa->spa_meta_objset, 6613 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 6614 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 6615 cmn_err(CE_PANIC, "failed to add pool config"); 6616 } 6617 6618 if (zap_add(spa->spa_meta_objset, 6619 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 6620 sizeof (uint64_t), 1, &version, tx) != 0) { 6621 cmn_err(CE_PANIC, "failed to add pool version"); 6622 } 6623 6624 /* Newly created pools with the right version are always deflated. */ 6625 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 6626 spa->spa_deflate = TRUE; 6627 if (zap_add(spa->spa_meta_objset, 6628 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 6629 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 6630 cmn_err(CE_PANIC, "failed to add deflate"); 6631 } 6632 } 6633 6634 /* 6635 * Create the deferred-free bpobj. Turn off compression 6636 * because sync-to-convergence takes longer if the blocksize 6637 * keeps changing. 6638 */ 6639 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 6640 dmu_object_set_compress(spa->spa_meta_objset, obj, 6641 ZIO_COMPRESS_OFF, tx); 6642 if (zap_add(spa->spa_meta_objset, 6643 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 6644 sizeof (uint64_t), 1, &obj, tx) != 0) { 6645 cmn_err(CE_PANIC, "failed to add bpobj"); 6646 } 6647 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 6648 spa->spa_meta_objset, obj)); 6649 6650 /* 6651 * Generate some random noise for salted checksums to operate on. 
6652 */ 6653 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 6654 sizeof (spa->spa_cksum_salt.zcs_bytes)); 6655 6656 /* 6657 * Set pool properties. 6658 */ 6659 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 6660 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 6661 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 6662 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 6663 spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST); 6664 spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM); 6665 spa->spa_dedup_table_quota = 6666 zpool_prop_default_numeric(ZPOOL_PROP_DEDUP_TABLE_QUOTA); 6667 6668 if (props != NULL) { 6669 spa_configfile_set(spa, props, B_FALSE); 6670 spa_sync_props(props, tx); 6671 } 6672 6673 for (int i = 0; i < ndraid; i++) 6674 spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); 6675 6676 dmu_tx_commit(tx); 6677 6678 spa->spa_sync_on = B_TRUE; 6679 txg_sync_start(dp); 6680 mmp_thread_start(spa); 6681 txg_wait_synced(dp, txg); 6682 6683 spa_spawn_aux_threads(spa); 6684 6685 spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); 6686 6687 /* 6688 * Don't count references from objsets that are already closed 6689 * and are making their way through the eviction process. 6690 */ 6691 spa_evicting_os_wait(spa); 6692 spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); 6693 spa->spa_load_state = SPA_LOAD_NONE; 6694 6695 spa_import_os(spa); 6696 6697 mutex_exit(&spa_namespace_lock); 6698 6699 return (0); 6700 } 6701 6702 /* 6703 * Import a non-root pool into the system. 6704 */ 6705 int 6706 spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 6707 { 6708 spa_t *spa; 6709 const char *altroot = NULL; 6710 spa_load_state_t state = SPA_LOAD_IMPORT; 6711 zpool_load_policy_t policy; 6712 spa_mode_t mode = spa_mode_global; 6713 uint64_t readonly = B_FALSE; 6714 int error; 6715 nvlist_t *nvroot; 6716 nvlist_t **spares, **l2cache; 6717 uint_t nspares, nl2cache; 6718 6719 /* 6720 * If a pool with this name exists, return failure. 6721 */ 6722 mutex_enter(&spa_namespace_lock); 6723 if (spa_lookup(pool) != NULL) { 6724 mutex_exit(&spa_namespace_lock); 6725 return (SET_ERROR(EEXIST)); 6726 } 6727 6728 /* 6729 * Create and initialize the spa structure. 6730 */ 6731 (void) nvlist_lookup_string(props, 6732 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 6733 (void) nvlist_lookup_uint64(props, 6734 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 6735 if (readonly) 6736 mode = SPA_MODE_READ; 6737 spa = spa_add(pool, config, altroot); 6738 spa->spa_import_flags = flags; 6739 6740 /* 6741 * Verbatim import - Take a pool and insert it into the namespace 6742 * as if it had been loaded at boot. 6743 */ 6744 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 6745 if (props != NULL) 6746 spa_configfile_set(spa, props, B_FALSE); 6747 6748 spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); 6749 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); 6750 zfs_dbgmsg("spa_import: verbatim import of %s", pool); 6751 mutex_exit(&spa_namespace_lock); 6752 return (0); 6753 } 6754 6755 spa_activate(spa, mode); 6756 6757 /* 6758 * Don't start async tasks until we know everything is healthy. 
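* (They are resumed by the spa_async_resume() call below, once the load has succeeded and any properties have been applied.)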
6759 */ 6760 spa_async_suspend(spa); 6761 6762 zpool_get_load_policy(config, &policy); 6763 if (policy.zlp_rewind & ZPOOL_DO_REWIND) 6764 state = SPA_LOAD_RECOVER; 6765 6766 spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT; 6767 6768 if (state != SPA_LOAD_RECOVER) { 6769 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 6770 zfs_dbgmsg("spa_import: importing %s", pool); 6771 } else { 6772 zfs_dbgmsg("spa_import: importing %s, max_txg=%lld " 6773 "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg); 6774 } 6775 error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind); 6776 6777 /* 6778 * Propagate anything learned while loading the pool and pass it 6779 * back to caller (i.e. rewind info, missing devices, etc). 6780 */ 6781 fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info); 6782 6783 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6784 /* 6785 * Toss any existing sparelist, as it doesn't have any validity 6786 * anymore, and conflicts with spa_has_spare(). 6787 */ 6788 if (spa->spa_spares.sav_config) { 6789 nvlist_free(spa->spa_spares.sav_config); 6790 spa->spa_spares.sav_config = NULL; 6791 spa_load_spares(spa); 6792 } 6793 if (spa->spa_l2cache.sav_config) { 6794 nvlist_free(spa->spa_l2cache.sav_config); 6795 spa->spa_l2cache.sav_config = NULL; 6796 spa_load_l2cache(spa); 6797 } 6798 6799 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 6800 spa_config_exit(spa, SCL_ALL, FTAG); 6801 6802 if (props != NULL) 6803 spa_configfile_set(spa, props, B_FALSE); 6804 6805 if (error != 0 || (props && spa_writeable(spa) && 6806 (error = spa_prop_set(spa, props)))) { 6807 spa_unload(spa); 6808 spa_deactivate(spa); 6809 spa_remove(spa); 6810 mutex_exit(&spa_namespace_lock); 6811 return (error); 6812 } 6813 6814 spa_async_resume(spa); 6815 6816 /* 6817 * Override any spares and level 2 cache devices as specified by 6818 * the user, as these may have correct device names/devids, etc. 6819 */ 6820 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 6821 &spares, &nspares) == 0) { 6822 if (spa->spa_spares.sav_config) 6823 fnvlist_remove(spa->spa_spares.sav_config, 6824 ZPOOL_CONFIG_SPARES); 6825 else 6826 spa->spa_spares.sav_config = fnvlist_alloc(); 6827 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 6828 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, 6829 nspares); 6830 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6831 spa_load_spares(spa); 6832 spa_config_exit(spa, SCL_ALL, FTAG); 6833 spa->spa_spares.sav_sync = B_TRUE; 6834 spa->spa_spares.sav_label_sync = B_TRUE; 6835 } 6836 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 6837 &l2cache, &nl2cache) == 0) { 6838 if (spa->spa_l2cache.sav_config) 6839 fnvlist_remove(spa->spa_l2cache.sav_config, 6840 ZPOOL_CONFIG_L2CACHE); 6841 else 6842 spa->spa_l2cache.sav_config = fnvlist_alloc(); 6843 fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 6844 ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, 6845 nl2cache); 6846 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6847 spa_load_l2cache(spa); 6848 spa_config_exit(spa, SCL_ALL, FTAG); 6849 spa->spa_l2cache.sav_sync = B_TRUE; 6850 spa->spa_l2cache.sav_label_sync = B_TRUE; 6851 } 6852 6853 /* 6854 * Check for any removed devices. 6855 */ 6856 if (spa->spa_autoreplace) { 6857 spa_aux_check_removed(&spa->spa_spares); 6858 spa_aux_check_removed(&spa->spa_l2cache); 6859 } 6860 6861 if (spa_writeable(spa)) { 6862 /* 6863 * Update the config cache to include the newly-imported pool. 
6864 */ 6865 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 6866 } 6867 6868 /* 6869 * It's possible that the pool was expanded while it was exported. 6870 * We kick off an async task to handle this for us. 6871 */ 6872 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 6873 6874 spa_history_log_version(spa, "import", NULL); 6875 6876 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); 6877 6878 mutex_exit(&spa_namespace_lock); 6879 6880 zvol_create_minors_recursive(pool); 6881 6882 spa_import_os(spa); 6883 6884 return (0); 6885 } 6886 6887 nvlist_t * 6888 spa_tryimport(nvlist_t *tryconfig) 6889 { 6890 nvlist_t *config = NULL; 6891 const char *poolname, *cachefile; 6892 spa_t *spa; 6893 uint64_t state; 6894 int error; 6895 zpool_load_policy_t policy; 6896 6897 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 6898 return (NULL); 6899 6900 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 6901 return (NULL); 6902 6903 /* 6904 * Create and initialize the spa structure. 6905 */ 6906 char *name = kmem_alloc(MAXPATHLEN, KM_SLEEP); 6907 (void) snprintf(name, MAXPATHLEN, "%s-%llx-%s", 6908 TRYIMPORT_NAME, (u_longlong_t)(uintptr_t)curthread, poolname); 6909 6910 mutex_enter(&spa_namespace_lock); 6911 spa = spa_add(name, tryconfig, NULL); 6912 spa_activate(spa, SPA_MODE_READ); 6913 kmem_free(name, MAXPATHLEN); 6914 6915 /* 6916 * Rewind pool if a max txg was provided. 6917 */ 6918 zpool_get_load_policy(spa->spa_config, &policy); 6919 if (policy.zlp_txg != UINT64_MAX) { 6920 spa->spa_load_max_txg = policy.zlp_txg; 6921 spa->spa_extreme_rewind = B_TRUE; 6922 zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld", 6923 poolname, (longlong_t)policy.zlp_txg); 6924 } else { 6925 zfs_dbgmsg("spa_tryimport: importing %s", poolname); 6926 } 6927 6928 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile) 6929 == 0) { 6930 zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile); 6931 spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; 6932 } else { 6933 spa->spa_config_source = SPA_CONFIG_SRC_SCAN; 6934 } 6935 6936 /* 6937 * spa_import() relies on a pool config fetched by spa_try_import() 6938 * for spare/cache devices. Import flags are not passed to 6939 * spa_tryimport(), which makes it return early due to a missing log 6940 * device and missing retrieving the cache device and spare eventually. 6941 * Passing ZFS_IMPORT_MISSING_LOG to spa_tryimport() makes it fetch 6942 * the correct configuration regardless of the missing log device. 6943 */ 6944 spa->spa_import_flags |= ZFS_IMPORT_MISSING_LOG; 6945 6946 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING); 6947 6948 /* 6949 * If 'tryconfig' was at least parsable, return the current config. 6950 */ 6951 if (spa->spa_root_vdev != NULL) { 6952 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 6953 fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, poolname); 6954 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state); 6955 fnvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 6956 spa->spa_uberblock.ub_timestamp); 6957 fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 6958 spa->spa_load_info); 6959 fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, 6960 spa->spa_errata); 6961 6962 /* 6963 * If the bootfs property exists on this pool then we 6964 * copy it out so that external consumers can tell which 6965 * pools are bootable. 
6966 */ 6967 if ((!error || error == EEXIST) && spa->spa_bootfs) { 6968 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 6969 6970 /* 6971 * We have to play games with the name since the 6972 * pool was opened as TRYIMPORT_NAME. 6973 */ 6974 if (dsl_dsobj_to_dsname(spa_name(spa), 6975 spa->spa_bootfs, tmpname) == 0) { 6976 char *cp; 6977 char *dsname; 6978 6979 dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 6980 6981 cp = strchr(tmpname, '/'); 6982 if (cp == NULL) { 6983 (void) strlcpy(dsname, tmpname, 6984 MAXPATHLEN); 6985 } else { 6986 (void) snprintf(dsname, MAXPATHLEN, 6987 "%s/%s", poolname, ++cp); 6988 } 6989 fnvlist_add_string(config, ZPOOL_CONFIG_BOOTFS, 6990 dsname); 6991 kmem_free(dsname, MAXPATHLEN); 6992 } 6993 kmem_free(tmpname, MAXPATHLEN); 6994 } 6995 6996 /* 6997 * Add the list of hot spares and level 2 cache devices. 6998 */ 6999 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 7000 spa_add_spares(spa, config); 7001 spa_add_l2cache(spa, config); 7002 spa_config_exit(spa, SCL_CONFIG, FTAG); 7003 } 7004 7005 spa_unload(spa); 7006 spa_deactivate(spa); 7007 spa_remove(spa); 7008 mutex_exit(&spa_namespace_lock); 7009 7010 return (config); 7011 } 7012 7013 /* 7014 * Pool export/destroy 7015 * 7016 * The act of destroying or exporting a pool is very simple. We make sure there 7017 * is no more pending I/O and any references to the pool are gone. Then, we 7018 * update the pool state and sync all the labels to disk, removing the 7019 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 7020 * we don't sync the labels or remove the configuration cache. 7021 */ 7022 static int 7023 spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, 7024 boolean_t force, boolean_t hardforce) 7025 { 7026 int error = 0; 7027 spa_t *spa; 7028 hrtime_t export_start = gethrtime(); 7029 7030 if (oldconfig) 7031 *oldconfig = NULL; 7032 7033 if (!(spa_mode_global & SPA_MODE_WRITE)) 7034 return (SET_ERROR(EROFS)); 7035 7036 mutex_enter(&spa_namespace_lock); 7037 if ((spa = spa_lookup(pool)) == NULL) { 7038 mutex_exit(&spa_namespace_lock); 7039 return (SET_ERROR(ENOENT)); 7040 } 7041 7042 if (spa->spa_is_exporting) { 7043 /* the pool is being exported by another thread */ 7044 mutex_exit(&spa_namespace_lock); 7045 return (SET_ERROR(ZFS_ERR_EXPORT_IN_PROGRESS)); 7046 } 7047 spa->spa_is_exporting = B_TRUE; 7048 7049 /* 7050 * Put a hold on the pool, drop the namespace lock, stop async tasks 7051 * and see if we can export. 7052 */ 7053 spa_open_ref(spa, FTAG); 7054 mutex_exit(&spa_namespace_lock); 7055 spa_async_suspend(spa); 7056 if (spa->spa_zvol_taskq) { 7057 zvol_remove_minors(spa, spa_name(spa), B_TRUE); 7058 taskq_wait(spa->spa_zvol_taskq); 7059 } 7060 mutex_enter(&spa_namespace_lock); 7061 spa->spa_export_thread = curthread; 7062 spa_close(spa, FTAG); 7063 7064 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 7065 mutex_exit(&spa_namespace_lock); 7066 goto export_spa; 7067 } 7068 7069 /* 7070 * The pool will be in core if it's openable, in which case we can 7071 * modify its state. Objsets may be open only because they're dirty, 7072 * so we have to force it to sync before checking spa_refcnt. 7073 */ 7074 if (spa->spa_sync_on) { 7075 txg_wait_synced(spa->spa_dsl_pool, 0); 7076 spa_evicting_os_wait(spa); 7077 } 7078 7079 /* 7080 * A pool cannot be exported or destroyed if there are active 7081 * references. If we are resetting a pool, allow references by 7082 * fault injection handlers. 
7083 */ 7084 if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0)) { 7085 error = SET_ERROR(EBUSY); 7086 goto fail; 7087 } 7088 7089 mutex_exit(&spa_namespace_lock); 7090 /* 7091 * At this point we no longer hold the spa_namespace_lock and 7092 * there were no references on the spa. Future spa_lookups will 7093 * notice the spa->spa_export_thread and wait until we signal 7094 * that we are finished. 7095 */ 7096 7097 if (spa->spa_sync_on) { 7098 vdev_t *rvd = spa->spa_root_vdev; 7099 /* 7100 * A pool cannot be exported if it has an active shared spare. 7101 * This is to prevent other pools stealing the active spare 7102 * from an exported pool. The user can still force such a 7103 * pool to be exported. 7104 */ 7105 if (!force && new_state == POOL_STATE_EXPORTED && 7106 spa_has_active_shared_spare(spa)) { 7107 error = SET_ERROR(EXDEV); 7108 mutex_enter(&spa_namespace_lock); 7109 goto fail; 7110 } 7111 7112 /* 7113 * We're about to export or destroy this pool. Make sure 7114 * we stop all initialization and trim activity here before 7115 * we set the spa_final_txg. This will ensure that all 7116 * dirty data resulting from the initialization is 7117 * committed to disk before we unload the pool. 7118 */ 7119 vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE); 7120 vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE); 7121 vdev_autotrim_stop_all(spa); 7122 vdev_rebuild_stop_all(spa); 7123 l2arc_spa_rebuild_stop(spa); 7124 7125 /* 7126 * We want this to be reflected on every label, 7127 * so mark them all dirty. spa_unload() will do the 7128 * final sync that pushes these changes out. 7129 */ 7130 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 7131 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 7132 spa->spa_state = new_state; 7133 vdev_config_dirty(rvd); 7134 spa_config_exit(spa, SCL_ALL, FTAG); 7135 } 7136 7137 /* 7138 * If the log space map feature is enabled and the pool is 7139 * getting exported (but not destroyed), we want to spend some 7140 * time flushing as many metaslabs as we can in an attempt to 7141 * destroy log space maps and save import time. This has to be 7142 * done before we set the spa_final_txg, otherwise 7143 * spa_sync() -> spa_flush_metaslabs() may dirty the final TXGs. 7144 * spa_should_flush_logs_on_unload() should be called after 7145 * spa_state has been set to the new_state.
7146 */ 7147 if (spa_should_flush_logs_on_unload(spa)) 7148 spa_unload_log_sm_flush_all(spa); 7149 7150 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 7151 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 7152 spa->spa_final_txg = spa_last_synced_txg(spa) + 7153 TXG_DEFER_SIZE + 1; 7154 spa_config_exit(spa, SCL_ALL, FTAG); 7155 } 7156 } 7157 7158 export_spa: 7159 spa_export_os(spa); 7160 7161 if (new_state == POOL_STATE_DESTROYED) 7162 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY); 7163 else if (new_state == POOL_STATE_EXPORTED) 7164 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_EXPORT); 7165 7166 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 7167 spa_unload(spa); 7168 spa_deactivate(spa); 7169 } 7170 7171 if (oldconfig && spa->spa_config) 7172 *oldconfig = fnvlist_dup(spa->spa_config); 7173 7174 if (new_state == POOL_STATE_EXPORTED) 7175 zio_handle_export_delay(spa, gethrtime() - export_start); 7176 7177 /* 7178 * Take the namespace lock for the actual spa_t removal 7179 */ 7180 mutex_enter(&spa_namespace_lock); 7181 if (new_state != POOL_STATE_UNINITIALIZED) { 7182 if (!hardforce) 7183 spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE); 7184 spa_remove(spa); 7185 } else { 7186 /* 7187 * If spa_remove() is not called for this spa_t and 7188 * there is any possibility that it can be reused, 7189 * we make sure to reset the exporting flag. 7190 */ 7191 spa->spa_is_exporting = B_FALSE; 7192 spa->spa_export_thread = NULL; 7193 } 7194 7195 /* 7196 * Wake up any waiters in spa_lookup() 7197 */ 7198 cv_broadcast(&spa_namespace_cv); 7199 mutex_exit(&spa_namespace_lock); 7200 return (0); 7201 7202 fail: 7203 spa->spa_is_exporting = B_FALSE; 7204 spa->spa_export_thread = NULL; 7205 7206 spa_async_resume(spa); 7207 /* 7208 * Wake up any waiters in spa_lookup() 7209 */ 7210 cv_broadcast(&spa_namespace_cv); 7211 mutex_exit(&spa_namespace_lock); 7212 return (error); 7213 } 7214 7215 /* 7216 * Destroy a storage pool. 7217 */ 7218 int 7219 spa_destroy(const char *pool) 7220 { 7221 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 7222 B_FALSE, B_FALSE)); 7223 } 7224 7225 /* 7226 * Export a storage pool. 7227 */ 7228 int 7229 spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force, 7230 boolean_t hardforce) 7231 { 7232 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 7233 force, hardforce)); 7234 } 7235 7236 /* 7237 * Similar to spa_export(), this unloads the spa_t without actually removing it 7238 * from the namespace in any way. 7239 */ 7240 int 7241 spa_reset(const char *pool) 7242 { 7243 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 7244 B_FALSE, B_FALSE)); 7245 } 7246 7247 /* 7248 * ========================================================================== 7249 * Device manipulation 7250 * ========================================================================== 7251 */ 7252 7253 /* 7254 * This is called as a synctask to increment the draid feature flag 7255 */ 7256 static void 7257 spa_draid_feature_incr(void *arg, dmu_tx_t *tx) 7258 { 7259 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 7260 int draid = (int)(uintptr_t)arg; 7261 7262 for (int c = 0; c < draid; c++) 7263 spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); 7264 } 7265 7266 /* 7267 * Add a device to a storage pool. 
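* The incoming nvroot describes the new top-level vdevs plus any spare and l2cache devices; the whole operation is bracketed by spa_vdev_enter() and spa_vdev_exit(), which take care of config locking and syncing the updated config.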
7268 */ 7269 int 7270 spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t check_ashift) 7271 { 7272 uint64_t txg, ndraid = 0; 7273 int error; 7274 vdev_t *rvd = spa->spa_root_vdev; 7275 vdev_t *vd, *tvd; 7276 nvlist_t **spares, **l2cache; 7277 uint_t nspares, nl2cache; 7278 7279 ASSERT(spa_writeable(spa)); 7280 7281 txg = spa_vdev_enter(spa); 7282 7283 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 7284 VDEV_ALLOC_ADD)) != 0) 7285 return (spa_vdev_exit(spa, NULL, txg, error)); 7286 7287 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 7288 7289 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 7290 &nspares) != 0) 7291 nspares = 0; 7292 7293 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 7294 &nl2cache) != 0) 7295 nl2cache = 0; 7296 7297 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 7298 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 7299 7300 if (vd->vdev_children != 0 && 7301 (error = vdev_create(vd, txg, B_FALSE)) != 0) { 7302 return (spa_vdev_exit(spa, vd, txg, error)); 7303 } 7304 7305 /* 7306 * The virtual dRAID spares must be added after vdev tree is created 7307 * and the vdev guids are generated. The guid of their associated 7308 * dRAID is stored in the config and used when opening the spare. 7309 */ 7310 if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid, 7311 rvd->vdev_children)) == 0) { 7312 if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot, 7313 ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) 7314 nspares = 0; 7315 } else { 7316 return (spa_vdev_exit(spa, vd, txg, error)); 7317 } 7318 7319 /* 7320 * We must validate the spares and l2cache devices after checking the 7321 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 7322 */ 7323 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 7324 return (spa_vdev_exit(spa, vd, txg, error)); 7325 7326 /* 7327 * If we are in the middle of a device removal, we can only add 7328 * devices which match the existing devices in the pool. 7329 * If we are in the middle of a removal, or have some indirect 7330 * vdevs, we can not add raidz or dRAID top levels. 
7331 */ 7332 if (spa->spa_vdev_removal != NULL || 7333 spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { 7334 for (int c = 0; c < vd->vdev_children; c++) { 7335 tvd = vd->vdev_child[c]; 7336 if (spa->spa_vdev_removal != NULL && 7337 tvd->vdev_ashift != spa->spa_max_ashift) { 7338 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 7339 } 7340 /* Fail if top level vdev is raidz or a dRAID */ 7341 if (vdev_get_nparity(tvd) != 0) 7342 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 7343 7344 /* 7345 * Need the top level mirror to be 7346 * a mirror of leaf vdevs only 7347 */ 7348 if (tvd->vdev_ops == &vdev_mirror_ops) { 7349 for (uint64_t cid = 0; 7350 cid < tvd->vdev_children; cid++) { 7351 vdev_t *cvd = tvd->vdev_child[cid]; 7352 if (!cvd->vdev_ops->vdev_op_leaf) { 7353 return (spa_vdev_exit(spa, vd, 7354 txg, EINVAL)); 7355 } 7356 } 7357 } 7358 } 7359 } 7360 7361 if (check_ashift && spa->spa_max_ashift == spa->spa_min_ashift) { 7362 for (int c = 0; c < vd->vdev_children; c++) { 7363 tvd = vd->vdev_child[c]; 7364 if (tvd->vdev_ashift != spa->spa_max_ashift) { 7365 return (spa_vdev_exit(spa, vd, txg, 7366 ZFS_ERR_ASHIFT_MISMATCH)); 7367 } 7368 } 7369 } 7370 7371 for (int c = 0; c < vd->vdev_children; c++) { 7372 tvd = vd->vdev_child[c]; 7373 vdev_remove_child(vd, tvd); 7374 tvd->vdev_id = rvd->vdev_children; 7375 vdev_add_child(rvd, tvd); 7376 vdev_config_dirty(tvd); 7377 } 7378 7379 if (nspares != 0) { 7380 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 7381 ZPOOL_CONFIG_SPARES); 7382 spa_load_spares(spa); 7383 spa->spa_spares.sav_sync = B_TRUE; 7384 } 7385 7386 if (nl2cache != 0) { 7387 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 7388 ZPOOL_CONFIG_L2CACHE); 7389 spa_load_l2cache(spa); 7390 spa->spa_l2cache.sav_sync = B_TRUE; 7391 } 7392 7393 /* 7394 * We can't increment a feature while holding spa_vdev so we 7395 * have to do it in a synctask. 7396 */ 7397 if (ndraid != 0) { 7398 dmu_tx_t *tx; 7399 7400 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 7401 dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr, 7402 (void *)(uintptr_t)ndraid, tx); 7403 dmu_tx_commit(tx); 7404 } 7405 7406 /* 7407 * We have to be careful when adding new vdevs to an existing pool. 7408 * If other threads start allocating from these vdevs before we 7409 * sync the config cache, and we lose power, then upon reboot we may 7410 * fail to open the pool because there are DVAs that the config cache 7411 * can't translate. Therefore, we first add the vdevs without 7412 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 7413 * and then let spa_config_update() initialize the new metaslabs. 7414 * 7415 * spa_load() checks for added-but-not-initialized vdevs, so that 7416 * if we lose power at any point in this sequence, the remaining 7417 * steps will be completed the next time we load the pool. 7418 */ 7419 (void) spa_vdev_exit(spa, vd, txg, 0); 7420 7421 mutex_enter(&spa_namespace_lock); 7422 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 7423 spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD); 7424 mutex_exit(&spa_namespace_lock); 7425 7426 return (0); 7427 } 7428 7429 /* 7430 * Given a vdev to be replaced and its parent, check for a possible 7431 * "double spare" condition if a vdev is to be replaced by a spare. When this 7432 * happens, you can get two spares assigned to one failed vdev. 7433 * 7434 * To trigger a double spare condition: 7435 * 7436 * 1. disk1 fails 7437 * 2. 1st spare is kicked in for disk1 and it resilvers 7438 * 3. 
Someone replaces disk1 with a new blank disk 7439 * 4. New blank disk starts resilvering 7440 * 5. While resilvering, new blank disk has IO errors and faults 7441 * 6. 2nd spare is kicked in for new blank disk 7442 * 7. At this point two spares are kicked in for the original disk1. 7443 * 7444 * It looks like this: 7445 * 7446 * NAME STATE READ WRITE CKSUM 7447 * tank2 DEGRADED 0 0 0 7448 * draid2:6d:10c:2s-0 DEGRADED 0 0 0 7449 * scsi-0QEMU_QEMU_HARDDISK_d1 ONLINE 0 0 0 7450 * scsi-0QEMU_QEMU_HARDDISK_d2 ONLINE 0 0 0 7451 * scsi-0QEMU_QEMU_HARDDISK_d3 ONLINE 0 0 0 7452 * scsi-0QEMU_QEMU_HARDDISK_d4 ONLINE 0 0 0 7453 * scsi-0QEMU_QEMU_HARDDISK_d5 ONLINE 0 0 0 7454 * scsi-0QEMU_QEMU_HARDDISK_d6 ONLINE 0 0 0 7455 * scsi-0QEMU_QEMU_HARDDISK_d7 ONLINE 0 0 0 7456 * scsi-0QEMU_QEMU_HARDDISK_d8 ONLINE 0 0 0 7457 * scsi-0QEMU_QEMU_HARDDISK_d9 ONLINE 0 0 0 7458 * spare-9 DEGRADED 0 0 0 7459 * replacing-0 DEGRADED 0 93 0 7460 * scsi-0QEMU_QEMU_HARDDISK_d10-part1/old UNAVAIL 0 0 0 7461 * spare-1 DEGRADED 0 0 0 7462 * scsi-0QEMU_QEMU_HARDDISK_d10 REMOVED 0 0 0 7463 * draid2-0-0 ONLINE 0 0 0 7464 * draid2-0-1 ONLINE 0 0 0 7465 * spares 7466 * draid2-0-0 INUSE currently in use 7467 * draid2-0-1 INUSE currently in use 7468 * 7469 * ARGS: 7470 * 7471 * newvd: New spare disk 7472 * pvd: Parent vdev_t the spare should attach to 7473 * 7474 * This function returns B_TRUE if adding the new vdev would create a double 7475 * spare condition, B_FALSE otherwise. 7476 */ 7477 static boolean_t 7478 spa_vdev_new_spare_would_cause_double_spares(vdev_t *newvd, vdev_t *pvd) 7479 { 7480 vdev_t *ppvd; 7481 7482 ppvd = pvd->vdev_parent; 7483 if (ppvd == NULL) 7484 return (B_FALSE); 7485 7486 /* 7487 * To determine if this configuration would cause a double spare, we 7488 * look at the vdev_op of the parent vdev, and of the parent's parent 7489 * vdev. We also look at vdev_isspare on the new disk. A double spare 7490 * condition looks like this: 7491 * 7492 * 1. parent of parent's op is a spare or draid spare 7493 * 2. parent's op is replacing 7494 * 3. new disk is a spare 7495 */ 7496 if ((ppvd->vdev_ops == &vdev_spare_ops) || 7497 (ppvd->vdev_ops == &vdev_draid_spare_ops)) 7498 if (pvd->vdev_ops == &vdev_replacing_ops) 7499 if (newvd->vdev_isspare) 7500 return (B_TRUE); 7501 7502 return (B_FALSE); 7503 } 7504 7505 /* 7506 * Attach a device to a vdev specified by its guid. The vdev type can be 7507 * a mirror, a raidz, or a leaf device that is also a top-level (e.g. a 7508 * single device). When the vdev is a single device, a mirror vdev will be 7509 * automatically inserted. 7510 * 7511 * If 'replacing' is specified, the new device is intended to replace the 7512 * existing device; in this case the two devices are made into their own 7513 * mirror using the 'replacing' vdev, which is functionally identical to 7514 * the mirror vdev (it actually reuses all the same ops) but has a few 7515 * extra rules: you can't attach to it after it's been created, and upon 7516 * completion of resilvering, the first disk (the one being replaced) 7517 * is automatically detached. 7518 * 7519 * If 'rebuild' is specified, then sequential reconstruction (a.ka. rebuild) 7520 * should be performed instead of traditional healing reconstruction. From 7521 * an administrators perspective these are both resilver operations. 
7522 */
7523 int
7524 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
7525 int rebuild)
7526 {
7527 uint64_t txg, dtl_max_txg;
7528 vdev_t *rvd = spa->spa_root_vdev;
7529 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
7530 vdev_ops_t *pvops;
7531 char *oldvdpath, *newvdpath;
7532 int newvd_isspare = B_FALSE;
7533 int error;
7534
7535 ASSERT(spa_writeable(spa));
7536
7537 txg = spa_vdev_enter(spa);
7538
7539 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
7540
7541 ASSERT(MUTEX_HELD(&spa_namespace_lock));
7542 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
7543 error = (spa_has_checkpoint(spa)) ?
7544 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
7545 return (spa_vdev_exit(spa, NULL, txg, error));
7546 }
7547
7548 if (rebuild) {
7549 if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD))
7550 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
7551
7552 if (dsl_scan_resilvering(spa_get_dsl(spa)) ||
7553 dsl_scan_resilver_scheduled(spa_get_dsl(spa))) {
7554 return (spa_vdev_exit(spa, NULL, txg,
7555 ZFS_ERR_RESILVER_IN_PROGRESS));
7556 }
7557 } else {
7558 if (vdev_rebuild_active(rvd))
7559 return (spa_vdev_exit(spa, NULL, txg,
7560 ZFS_ERR_REBUILD_IN_PROGRESS));
7561 }
7562
7563 if (spa->spa_vdev_removal != NULL) {
7564 return (spa_vdev_exit(spa, NULL, txg,
7565 ZFS_ERR_DEVRM_IN_PROGRESS));
7566 }
7567
7568 if (oldvd == NULL)
7569 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
7570
7571 boolean_t raidz = oldvd->vdev_ops == &vdev_raidz_ops;
7572
7573 if (raidz) {
7574 if (!spa_feature_is_enabled(spa, SPA_FEATURE_RAIDZ_EXPANSION))
7575 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
7576
7577 /*
7578 * Can't expand a raidz while a prior expand is in progress.
7579 */
7580 if (spa->spa_raidz_expand != NULL) {
7581 return (spa_vdev_exit(spa, NULL, txg,
7582 ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS));
7583 }
7584 } else if (!oldvd->vdev_ops->vdev_op_leaf) {
7585 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
7586 }
7587
7588 if (raidz)
7589 pvd = oldvd;
7590 else
7591 pvd = oldvd->vdev_parent;
7592
7593 if (spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
7594 VDEV_ALLOC_ATTACH) != 0)
7595 return (spa_vdev_exit(spa, NULL, txg, EINVAL));
7596
7597 if (newrootvd->vdev_children != 1)
7598 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
7599
7600 newvd = newrootvd->vdev_child[0];
7601
7602 if (!newvd->vdev_ops->vdev_op_leaf)
7603 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
7604
7605 if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
7606 return (spa_vdev_exit(spa, newrootvd, txg, error));
7607
7608 /*
7609 * log, dedup and special vdevs should not be replaced by spares.
7610 */
7611 if ((oldvd->vdev_top->vdev_alloc_bias != VDEV_BIAS_NONE ||
7612 oldvd->vdev_top->vdev_islog) && newvd->vdev_isspare) {
7613 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
7614 }
7615
7616 /*
7617 * A dRAID spare can only replace a child of its parent dRAID vdev.
7618 */
7619 if (newvd->vdev_ops == &vdev_draid_spare_ops &&
7620 oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) {
7621 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
7622 }
7623
7624 if (rebuild) {
7625 /*
7626 * For rebuilds, the top vdev must support reconstruction
7627 * using only space maps. This means the only allowable
7628 * vdev types are the root vdev, a mirror, or dRAID.
7629 */ 7630 tvd = pvd; 7631 if (pvd->vdev_top != NULL) 7632 tvd = pvd->vdev_top; 7633 7634 if (tvd->vdev_ops != &vdev_mirror_ops && 7635 tvd->vdev_ops != &vdev_root_ops && 7636 tvd->vdev_ops != &vdev_draid_ops) { 7637 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7638 } 7639 } 7640 7641 if (!replacing) { 7642 /* 7643 * For attach, the only allowable parent is a mirror or 7644 * the root vdev. A raidz vdev can be attached to, but 7645 * you cannot attach to a raidz child. 7646 */ 7647 if (pvd->vdev_ops != &vdev_mirror_ops && 7648 pvd->vdev_ops != &vdev_root_ops && 7649 !raidz) 7650 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7651 7652 pvops = &vdev_mirror_ops; 7653 } else { 7654 /* 7655 * Active hot spares can only be replaced by inactive hot 7656 * spares. 7657 */ 7658 if (pvd->vdev_ops == &vdev_spare_ops && 7659 oldvd->vdev_isspare && 7660 !spa_has_spare(spa, newvd->vdev_guid)) 7661 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7662 7663 /* 7664 * If the source is a hot spare, and the parent isn't already a 7665 * spare, then we want to create a new hot spare. Otherwise, we 7666 * want to create a replacing vdev. The user is not allowed to 7667 * attach to a spared vdev child unless the 'isspare' state is 7668 * the same (spare replaces spare, non-spare replaces 7669 * non-spare). 7670 */ 7671 if (pvd->vdev_ops == &vdev_replacing_ops && 7672 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 7673 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7674 } else if (pvd->vdev_ops == &vdev_spare_ops && 7675 newvd->vdev_isspare != oldvd->vdev_isspare) { 7676 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7677 } 7678 7679 if (spa_vdev_new_spare_would_cause_double_spares(newvd, pvd)) { 7680 vdev_dbgmsg(newvd, 7681 "disk would create double spares, ignore."); 7682 return (spa_vdev_exit(spa, newrootvd, txg, EEXIST)); 7683 } 7684 7685 if (newvd->vdev_isspare) 7686 pvops = &vdev_spare_ops; 7687 else 7688 pvops = &vdev_replacing_ops; 7689 } 7690 7691 /* 7692 * Make sure the new device is big enough. 7693 */ 7694 vdev_t *min_vdev = raidz ? oldvd->vdev_child[0] : oldvd; 7695 if (newvd->vdev_asize < vdev_get_min_asize(min_vdev)) 7696 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 7697 7698 /* 7699 * The new device cannot have a higher alignment requirement 7700 * than the top-level vdev. 7701 */ 7702 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) { 7703 return (spa_vdev_exit(spa, newrootvd, txg, 7704 ZFS_ERR_ASHIFT_MISMATCH)); 7705 } 7706 7707 /* 7708 * RAIDZ-expansion-specific checks. 7709 */ 7710 if (raidz) { 7711 if (vdev_raidz_attach_check(newvd) != 0) 7712 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7713 7714 /* 7715 * Fail early if a child is not healthy or being replaced 7716 */ 7717 for (int i = 0; i < oldvd->vdev_children; i++) { 7718 if (vdev_is_dead(oldvd->vdev_child[i]) || 7719 !oldvd->vdev_child[i]->vdev_ops->vdev_op_leaf) { 7720 return (spa_vdev_exit(spa, newrootvd, txg, 7721 ENXIO)); 7722 } 7723 /* Also fail if reserved boot area is in-use */ 7724 if (vdev_check_boot_reserve(spa, oldvd->vdev_child[i]) 7725 != 0) { 7726 return (spa_vdev_exit(spa, newrootvd, txg, 7727 EADDRINUSE)); 7728 } 7729 } 7730 } 7731 7732 if (raidz) { 7733 /* 7734 * Note: oldvdpath is freed by spa_strfree(), but 7735 * kmem_asprintf() is freed by kmem_strfree(), so we have to 7736 * move it to a spa_strdup-ed string. 
7737 */ 7738 char *tmp = kmem_asprintf("raidz%u-%u", 7739 (uint_t)vdev_get_nparity(oldvd), (uint_t)oldvd->vdev_id); 7740 oldvdpath = spa_strdup(tmp); 7741 kmem_strfree(tmp); 7742 } else { 7743 oldvdpath = spa_strdup(oldvd->vdev_path); 7744 } 7745 newvdpath = spa_strdup(newvd->vdev_path); 7746 7747 /* 7748 * If this is an in-place replacement, update oldvd's path and devid 7749 * to make it distinguishable from newvd, and unopenable from now on. 7750 */ 7751 if (strcmp(oldvdpath, newvdpath) == 0) { 7752 spa_strfree(oldvd->vdev_path); 7753 oldvd->vdev_path = kmem_alloc(strlen(newvdpath) + 5, 7754 KM_SLEEP); 7755 (void) sprintf(oldvd->vdev_path, "%s/old", 7756 newvdpath); 7757 if (oldvd->vdev_devid != NULL) { 7758 spa_strfree(oldvd->vdev_devid); 7759 oldvd->vdev_devid = NULL; 7760 } 7761 spa_strfree(oldvdpath); 7762 oldvdpath = spa_strdup(oldvd->vdev_path); 7763 } 7764 7765 /* 7766 * If the parent is not a mirror, or if we're replacing, insert the new 7767 * mirror/replacing/spare vdev above oldvd. 7768 */ 7769 if (!raidz && pvd->vdev_ops != pvops) { 7770 pvd = vdev_add_parent(oldvd, pvops); 7771 ASSERT(pvd->vdev_ops == pvops); 7772 ASSERT(oldvd->vdev_parent == pvd); 7773 } 7774 7775 ASSERT(pvd->vdev_top->vdev_parent == rvd); 7776 7777 /* 7778 * Extract the new device from its root and add it to pvd. 7779 */ 7780 vdev_remove_child(newrootvd, newvd); 7781 newvd->vdev_id = pvd->vdev_children; 7782 newvd->vdev_crtxg = oldvd->vdev_crtxg; 7783 vdev_add_child(pvd, newvd); 7784 7785 /* 7786 * Reevaluate the parent vdev state. 7787 */ 7788 vdev_propagate_state(pvd); 7789 7790 tvd = newvd->vdev_top; 7791 ASSERT(pvd->vdev_top == tvd); 7792 ASSERT(tvd->vdev_parent == rvd); 7793 7794 vdev_config_dirty(tvd); 7795 7796 /* 7797 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 7798 * for any dmu_sync-ed blocks. It will propagate upward when 7799 * spa_vdev_exit() calls vdev_dtl_reassess(). 7800 */ 7801 dtl_max_txg = txg + TXG_CONCURRENT_STATES; 7802 7803 if (raidz) { 7804 /* 7805 * Wait for the youngest allocations and frees to sync, 7806 * and then wait for the deferral of those frees to finish. 7807 */ 7808 spa_vdev_config_exit(spa, NULL, 7809 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 7810 7811 vdev_initialize_stop_all(tvd, VDEV_INITIALIZE_ACTIVE); 7812 vdev_trim_stop_all(tvd, VDEV_TRIM_ACTIVE); 7813 vdev_autotrim_stop_wait(tvd); 7814 7815 dtl_max_txg = spa_vdev_config_enter(spa); 7816 7817 tvd->vdev_rz_expanding = B_TRUE; 7818 7819 vdev_dirty_leaves(tvd, VDD_DTL, dtl_max_txg); 7820 vdev_config_dirty(tvd); 7821 7822 dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, 7823 dtl_max_txg); 7824 dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_raidz_attach_sync, 7825 newvd, tx); 7826 dmu_tx_commit(tx); 7827 } else { 7828 vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, 7829 dtl_max_txg - TXG_INITIAL); 7830 7831 if (newvd->vdev_isspare) { 7832 spa_spare_activate(newvd); 7833 spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); 7834 } 7835 7836 newvd_isspare = newvd->vdev_isspare; 7837 7838 /* 7839 * Mark newvd's DTL dirty in this txg. 7840 */ 7841 vdev_dirty(tvd, VDD_DTL, newvd, txg); 7842 7843 /* 7844 * Schedule the resilver or rebuild to restart in the future. 7845 * We do this to ensure that dmu_sync-ed blocks have been 7846 * stitched into the respective datasets. 
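 * (In the rebuild case below this means setting vdev_rebuild_txg and
 * starting a sequential rebuild on the top-level vdev; in the resilver
 * case it means setting vdev_resilver_txg and either deferring the
 * resilver, when one is already running and RESILVER_DEFER is enabled,
 * or restarting the scan.)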
7847 */
7848 if (rebuild) {
7849 newvd->vdev_rebuild_txg = txg;
7850
7851 vdev_rebuild(tvd);
7852 } else {
7853 newvd->vdev_resilver_txg = txg;
7854
7855 if (dsl_scan_resilvering(spa_get_dsl(spa)) &&
7856 spa_feature_is_enabled(spa,
7857 SPA_FEATURE_RESILVER_DEFER)) {
7858 vdev_defer_resilver(newvd);
7859 } else {
7860 dsl_scan_restart_resilver(spa->spa_dsl_pool,
7861 dtl_max_txg);
7862 }
7863 }
7864 }
7865
7866 if (spa->spa_bootfs)
7867 spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH);
7868
7869 spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH);
7870
7871 /*
7872 * Commit the config
7873 */
7874 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
7875
7876 spa_history_log_internal(spa, "vdev attach", NULL,
7877 "%s vdev=%s %s vdev=%s",
7878 replacing && newvd_isspare ? "spare in" :
7879 replacing ? "replace" : "attach", newvdpath,
7880 replacing ? "for" : "to", oldvdpath);
7881
7882 spa_strfree(oldvdpath);
7883 spa_strfree(newvdpath);
7884
7885 return (0);
7886 }
7887
7888 /*
7889 * Detach a device from a mirror or replacing vdev.
7890 *
7891 * If 'replace_done' is specified, only detach if the parent
7892 * is a replacing or a spare vdev.
7893 */
7894 int
7895 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
7896 {
7897 uint64_t txg;
7898 int error;
7899 vdev_t *rvd __maybe_unused = spa->spa_root_vdev;
7900 vdev_t *vd, *pvd, *cvd, *tvd;
7901 boolean_t unspare = B_FALSE;
7902 uint64_t unspare_guid = 0;
7903 char *vdpath;
7904
7905 ASSERT(spa_writeable(spa));
7906
7907 txg = spa_vdev_detach_enter(spa, guid);
7908
7909 vd = spa_lookup_by_guid(spa, guid, B_FALSE);
7910
7911 /*
7912 * Besides being called directly from userland through the
7913 * ioctl interface, spa_vdev_detach() can be potentially called
7914 * at the end of spa_vdev_resilver_done().
7915 *
7916 * In the regular case, when we have a checkpoint this shouldn't
7917 * happen as we never empty the DTLs of a vdev during the scrub
7918 * [see comment in dsl_scan_done()]. Thus spa_vdev_resilver_done()
7919 * should never get here when we have a checkpoint.
7920 *
7921 * That said, even in the case where we checkpoint the pool exactly
7922 * as spa_vdev_resilver_done() calls this function, everything
7923 * should be fine as the resilver will return right away.
7924 */
7925 ASSERT(MUTEX_HELD(&spa_namespace_lock));
7926 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
7927 error = (spa_has_checkpoint(spa)) ?
7928 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
7929 return (spa_vdev_exit(spa, NULL, txg, error));
7930 }
7931
7932 if (vd == NULL)
7933 return (spa_vdev_exit(spa, NULL, txg, ENODEV));
7934
7935 if (!vd->vdev_ops->vdev_op_leaf)
7936 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
7937
7938 pvd = vd->vdev_parent;
7939
7940 /*
7941 * If the parent/child relationship is not as expected, don't do it.
7942 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
7943 * vdev that's replacing B with C. The user's intent in replacing
7944 * is to go from M(A,B) to M(A,C). If the user decides to cancel
7945 * the replace by detaching C, the expected behavior is to end up
7946 * M(A,B). But suppose that right after deciding to detach C,
7947 * the replacement of B completes. We would have M(A,C), and then
7948 * ask to detach C, which would leave us with just A -- not what
7949 * the user wanted.
To prevent this, we make sure that the 7950 * parent/child relationship hasn't changed -- in this example, 7951 * that C's parent is still the replacing vdev R. 7952 */ 7953 if (pvd->vdev_guid != pguid && pguid != 0) 7954 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 7955 7956 /* 7957 * Only 'replacing' or 'spare' vdevs can be replaced. 7958 */ 7959 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 7960 pvd->vdev_ops != &vdev_spare_ops) 7961 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7962 7963 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 7964 spa_version(spa) >= SPA_VERSION_SPARES); 7965 7966 /* 7967 * Only mirror, replacing, and spare vdevs support detach. 7968 */ 7969 if (pvd->vdev_ops != &vdev_replacing_ops && 7970 pvd->vdev_ops != &vdev_mirror_ops && 7971 pvd->vdev_ops != &vdev_spare_ops) 7972 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7973 7974 /* 7975 * If this device has the only valid copy of some data, 7976 * we cannot safely detach it. 7977 */ 7978 if (vdev_dtl_required(vd)) 7979 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 7980 7981 ASSERT(pvd->vdev_children >= 2); 7982 7983 /* 7984 * If we are detaching the second disk from a replacing vdev, then 7985 * check to see if we changed the original vdev's path to have "/old" 7986 * at the end in spa_vdev_attach(). If so, undo that change now. 7987 */ 7988 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 7989 vd->vdev_path != NULL) { 7990 size_t len = strlen(vd->vdev_path); 7991 7992 for (int c = 0; c < pvd->vdev_children; c++) { 7993 cvd = pvd->vdev_child[c]; 7994 7995 if (cvd == vd || cvd->vdev_path == NULL) 7996 continue; 7997 7998 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 7999 strcmp(cvd->vdev_path + len, "/old") == 0) { 8000 spa_strfree(cvd->vdev_path); 8001 cvd->vdev_path = spa_strdup(vd->vdev_path); 8002 break; 8003 } 8004 } 8005 } 8006 8007 /* 8008 * If we are detaching the original disk from a normal spare, then it 8009 * implies that the spare should become a real disk, and be removed 8010 * from the active spare list for the pool. dRAID spares on the 8011 * other hand are coupled to the pool and thus should never be removed 8012 * from the spares list. 8013 */ 8014 if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) { 8015 vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1]; 8016 8017 if (last_cvd->vdev_isspare && 8018 last_cvd->vdev_ops != &vdev_draid_spare_ops) { 8019 unspare = B_TRUE; 8020 } 8021 } 8022 8023 /* 8024 * Erase the disk labels so the disk can be used for other things. 8025 * This must be done after all other error cases are handled, 8026 * but before we disembowel vd (so we can still do I/O to it). 8027 * But if we can't do it, don't treat the error as fatal -- 8028 * it may be that the unwritability of the disk is the reason 8029 * it's being detached! 8030 */ 8031 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 8032 8033 /* 8034 * Remove vd from its parent and compact the parent's children. 8035 */ 8036 vdev_remove_child(pvd, vd); 8037 vdev_compact_children(pvd); 8038 8039 /* 8040 * Remember one of the remaining children so we can get tvd below. 8041 */ 8042 cvd = pvd->vdev_child[pvd->vdev_children - 1]; 8043 8044 /* 8045 * If we need to remove the remaining child from the list of hot spares, 8046 * do it now, marking the vdev as no longer a spare in the process. 8047 * We must do this before vdev_remove_parent(), because that can 8048 * change the GUID if it creates a new toplevel GUID. 
For a similar 8049 * reason, we must remove the spare now, in the same txg as the detach; 8050 * otherwise someone could attach a new sibling, change the GUID, and 8051 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 8052 */ 8053 if (unspare) { 8054 ASSERT(cvd->vdev_isspare); 8055 spa_spare_remove(cvd); 8056 unspare_guid = cvd->vdev_guid; 8057 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 8058 cvd->vdev_unspare = B_TRUE; 8059 } 8060 8061 /* 8062 * If the parent mirror/replacing vdev only has one child, 8063 * the parent is no longer needed. Remove it from the tree. 8064 */ 8065 if (pvd->vdev_children == 1) { 8066 if (pvd->vdev_ops == &vdev_spare_ops) 8067 cvd->vdev_unspare = B_FALSE; 8068 vdev_remove_parent(cvd); 8069 } 8070 8071 /* 8072 * We don't set tvd until now because the parent we just removed 8073 * may have been the previous top-level vdev. 8074 */ 8075 tvd = cvd->vdev_top; 8076 ASSERT(tvd->vdev_parent == rvd); 8077 8078 /* 8079 * Reevaluate the parent vdev state. 8080 */ 8081 vdev_propagate_state(cvd); 8082 8083 /* 8084 * If the 'autoexpand' property is set on the pool then automatically 8085 * try to expand the size of the pool. For example if the device we 8086 * just detached was smaller than the others, it may be possible to 8087 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 8088 * first so that we can obtain the updated sizes of the leaf vdevs. 8089 */ 8090 if (spa->spa_autoexpand) { 8091 vdev_reopen(tvd); 8092 vdev_expand(tvd, txg); 8093 } 8094 8095 vdev_config_dirty(tvd); 8096 8097 /* 8098 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 8099 * vd->vdev_detached is set and free vd's DTL object in syncing context. 8100 * But first make sure we're not on any *other* txg's DTL list, to 8101 * prevent vd from being accessed after it's freed. 8102 */ 8103 vdpath = spa_strdup(vd->vdev_path ? vd->vdev_path : "none"); 8104 for (int t = 0; t < TXG_SIZE; t++) 8105 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 8106 vd->vdev_detached = B_TRUE; 8107 vdev_dirty(tvd, VDD_DTL, vd, txg); 8108 8109 spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE); 8110 spa_notify_waiters(spa); 8111 8112 /* hang on to the spa before we release the lock */ 8113 spa_open_ref(spa, FTAG); 8114 8115 error = spa_vdev_exit(spa, vd, txg, 0); 8116 8117 spa_history_log_internal(spa, "detach", NULL, 8118 "vdev=%s", vdpath); 8119 spa_strfree(vdpath); 8120 8121 /* 8122 * If this was the removal of the original device in a hot spare vdev, 8123 * then we want to go through and remove the device from the hot spare 8124 * list of every other pool. 
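 * (Spares may be shared between pools, which is why the loop below walks
 * every other active pool and removes the spare by its guid there as
 * well.)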
8125 */ 8126 if (unspare) { 8127 spa_t *altspa = NULL; 8128 8129 mutex_enter(&spa_namespace_lock); 8130 while ((altspa = spa_next(altspa)) != NULL) { 8131 if (altspa->spa_state != POOL_STATE_ACTIVE || 8132 altspa == spa) 8133 continue; 8134 8135 spa_open_ref(altspa, FTAG); 8136 mutex_exit(&spa_namespace_lock); 8137 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 8138 mutex_enter(&spa_namespace_lock); 8139 spa_close(altspa, FTAG); 8140 } 8141 mutex_exit(&spa_namespace_lock); 8142 8143 /* search the rest of the vdevs for spares to remove */ 8144 spa_vdev_resilver_done(spa); 8145 } 8146 8147 /* all done with the spa; OK to release */ 8148 mutex_enter(&spa_namespace_lock); 8149 spa_close(spa, FTAG); 8150 mutex_exit(&spa_namespace_lock); 8151 8152 return (error); 8153 } 8154 8155 static int 8156 spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, 8157 list_t *vd_list) 8158 { 8159 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 8160 8161 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 8162 8163 /* Look up vdev and ensure it's a leaf. */ 8164 vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); 8165 if (vd == NULL || vd->vdev_detached) { 8166 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8167 return (SET_ERROR(ENODEV)); 8168 } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { 8169 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8170 return (SET_ERROR(EINVAL)); 8171 } else if (!vdev_writeable(vd)) { 8172 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8173 return (SET_ERROR(EROFS)); 8174 } 8175 mutex_enter(&vd->vdev_initialize_lock); 8176 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8177 8178 /* 8179 * When we activate an initialize action we check to see 8180 * if the vdev_initialize_thread is NULL. We do this instead 8181 * of using the vdev_initialize_state since there might be 8182 * a previous initialization process which has completed but 8183 * the thread is not exited. 
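 *
 * Summarizing the checks below: START and UNINIT require that no
 * initialize thread is running (and START also requires that the
 * top-level vdev is not being removed or RAIDZ-expanded), CANCEL
 * requires an ACTIVE or SUSPENDED initialization, and SUSPEND requires
 * an ACTIVE one.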
8184 */ 8185 if (cmd_type == POOL_INITIALIZE_START && 8186 (vd->vdev_initialize_thread != NULL || 8187 vd->vdev_top->vdev_removing || vd->vdev_top->vdev_rz_expanding)) { 8188 mutex_exit(&vd->vdev_initialize_lock); 8189 return (SET_ERROR(EBUSY)); 8190 } else if (cmd_type == POOL_INITIALIZE_CANCEL && 8191 (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE && 8192 vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) { 8193 mutex_exit(&vd->vdev_initialize_lock); 8194 return (SET_ERROR(ESRCH)); 8195 } else if (cmd_type == POOL_INITIALIZE_SUSPEND && 8196 vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) { 8197 mutex_exit(&vd->vdev_initialize_lock); 8198 return (SET_ERROR(ESRCH)); 8199 } else if (cmd_type == POOL_INITIALIZE_UNINIT && 8200 vd->vdev_initialize_thread != NULL) { 8201 mutex_exit(&vd->vdev_initialize_lock); 8202 return (SET_ERROR(EBUSY)); 8203 } 8204 8205 switch (cmd_type) { 8206 case POOL_INITIALIZE_START: 8207 vdev_initialize(vd); 8208 break; 8209 case POOL_INITIALIZE_CANCEL: 8210 vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, vd_list); 8211 break; 8212 case POOL_INITIALIZE_SUSPEND: 8213 vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list); 8214 break; 8215 case POOL_INITIALIZE_UNINIT: 8216 vdev_uninitialize(vd); 8217 break; 8218 default: 8219 panic("invalid cmd_type %llu", (unsigned long long)cmd_type); 8220 } 8221 mutex_exit(&vd->vdev_initialize_lock); 8222 8223 return (0); 8224 } 8225 8226 int 8227 spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, 8228 nvlist_t *vdev_errlist) 8229 { 8230 int total_errors = 0; 8231 list_t vd_list; 8232 8233 list_create(&vd_list, sizeof (vdev_t), 8234 offsetof(vdev_t, vdev_initialize_node)); 8235 8236 /* 8237 * We hold the namespace lock through the whole function 8238 * to prevent any changes to the pool while we're starting or 8239 * stopping initialization. The config and state locks are held so that 8240 * we can properly assess the vdev state before we commit to 8241 * the initializing operation. 8242 */ 8243 mutex_enter(&spa_namespace_lock); 8244 8245 for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); 8246 pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { 8247 uint64_t vdev_guid = fnvpair_value_uint64(pair); 8248 8249 int error = spa_vdev_initialize_impl(spa, vdev_guid, cmd_type, 8250 &vd_list); 8251 if (error != 0) { 8252 char guid_as_str[MAXNAMELEN]; 8253 8254 (void) snprintf(guid_as_str, sizeof (guid_as_str), 8255 "%llu", (unsigned long long)vdev_guid); 8256 fnvlist_add_int64(vdev_errlist, guid_as_str, error); 8257 total_errors++; 8258 } 8259 } 8260 8261 /* Wait for all initialize threads to stop. */ 8262 vdev_initialize_stop_wait(spa, &vd_list); 8263 8264 /* Sync out the initializing state */ 8265 txg_wait_synced(spa->spa_dsl_pool, 0); 8266 mutex_exit(&spa_namespace_lock); 8267 8268 list_destroy(&vd_list); 8269 8270 return (total_errors); 8271 } 8272 8273 static int 8274 spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, 8275 uint64_t rate, boolean_t partial, boolean_t secure, list_t *vd_list) 8276 { 8277 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 8278 8279 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 8280 8281 /* Look up vdev and ensure it's a leaf. 
*/
8282 vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
8283 if (vd == NULL || vd->vdev_detached) {
8284 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
8285 return (SET_ERROR(ENODEV));
8286 } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
8287 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
8288 return (SET_ERROR(EINVAL));
8289 } else if (!vdev_writeable(vd)) {
8290 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
8291 return (SET_ERROR(EROFS));
8292 } else if (!vd->vdev_has_trim) {
8293 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
8294 return (SET_ERROR(EOPNOTSUPP));
8295 } else if (secure && !vd->vdev_has_securetrim) {
8296 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
8297 return (SET_ERROR(EOPNOTSUPP));
8298 }
8299 mutex_enter(&vd->vdev_trim_lock);
8300 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
8301
8302 /*
8303 * When we activate a TRIM action we check to see if the
8304 * vdev_trim_thread is NULL. We do this instead of using the
8305 * vdev_trim_state since there might be a previous TRIM process
8306 * which has completed but the thread is not exited.
8307 */
8308 if (cmd_type == POOL_TRIM_START &&
8309 (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing ||
8310 vd->vdev_top->vdev_rz_expanding)) {
8311 mutex_exit(&vd->vdev_trim_lock);
8312 return (SET_ERROR(EBUSY));
8313 } else if (cmd_type == POOL_TRIM_CANCEL &&
8314 (vd->vdev_trim_state != VDEV_TRIM_ACTIVE &&
8315 vd->vdev_trim_state != VDEV_TRIM_SUSPENDED)) {
8316 mutex_exit(&vd->vdev_trim_lock);
8317 return (SET_ERROR(ESRCH));
8318 } else if (cmd_type == POOL_TRIM_SUSPEND &&
8319 vd->vdev_trim_state != VDEV_TRIM_ACTIVE) {
8320 mutex_exit(&vd->vdev_trim_lock);
8321 return (SET_ERROR(ESRCH));
8322 }
8323
8324 switch (cmd_type) {
8325 case POOL_TRIM_START:
8326 vdev_trim(vd, rate, partial, secure);
8327 break;
8328 case POOL_TRIM_CANCEL:
8329 vdev_trim_stop(vd, VDEV_TRIM_CANCELED, vd_list);
8330 break;
8331 case POOL_TRIM_SUSPEND:
8332 vdev_trim_stop(vd, VDEV_TRIM_SUSPENDED, vd_list);
8333 break;
8334 default:
8335 panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
8336 }
8337 mutex_exit(&vd->vdev_trim_lock);
8338
8339 return (0);
8340 }
8341
8342 /*
8343 * Initiates a manual TRIM for the requested vdevs. This kicks off individual
8344 * TRIM threads for each child vdev. These threads pass over all of the free
8345 * space in the vdev's metaslabs and issue TRIM commands for that space.
8346 */
8347 int
8348 spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate,
8349 boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist)
8350 {
8351 int total_errors = 0;
8352 list_t vd_list;
8353
8354 list_create(&vd_list, sizeof (vdev_t),
8355 offsetof(vdev_t, vdev_trim_node));
8356
8357 /*
8358 * We hold the namespace lock through the whole function
8359 * to prevent any changes to the pool while we're starting or
8360 * stopping TRIM. The config and state locks are held so that
8361 * we can properly assess the vdev state before we commit to
8362 * the TRIM operation.
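 *
 * Purely as an illustration (the pool name is hypothetical), requests
 * such as "zpool trim tank", "zpool trim -s tank" (suspend) and
 * "zpool trim -c tank" (cancel) arrive here with the corresponding
 * POOL_TRIM_* cmd_type and one nvpair per target vdev guid.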
8363 */ 8364 mutex_enter(&spa_namespace_lock); 8365 8366 for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); 8367 pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { 8368 uint64_t vdev_guid = fnvpair_value_uint64(pair); 8369 8370 int error = spa_vdev_trim_impl(spa, vdev_guid, cmd_type, 8371 rate, partial, secure, &vd_list); 8372 if (error != 0) { 8373 char guid_as_str[MAXNAMELEN]; 8374 8375 (void) snprintf(guid_as_str, sizeof (guid_as_str), 8376 "%llu", (unsigned long long)vdev_guid); 8377 fnvlist_add_int64(vdev_errlist, guid_as_str, error); 8378 total_errors++; 8379 } 8380 } 8381 8382 /* Wait for all TRIM threads to stop. */ 8383 vdev_trim_stop_wait(spa, &vd_list); 8384 8385 /* Sync out the TRIM state */ 8386 txg_wait_synced(spa->spa_dsl_pool, 0); 8387 mutex_exit(&spa_namespace_lock); 8388 8389 list_destroy(&vd_list); 8390 8391 return (total_errors); 8392 } 8393 8394 /* 8395 * Split a set of devices from their mirrors, and create a new pool from them. 8396 */ 8397 int 8398 spa_vdev_split_mirror(spa_t *spa, const char *newname, nvlist_t *config, 8399 nvlist_t *props, boolean_t exp) 8400 { 8401 int error = 0; 8402 uint64_t txg, *glist; 8403 spa_t *newspa; 8404 uint_t c, children, lastlog; 8405 nvlist_t **child, *nvl, *tmp; 8406 dmu_tx_t *tx; 8407 const char *altroot = NULL; 8408 vdev_t *rvd, **vml = NULL; /* vdev modify list */ 8409 boolean_t activate_slog; 8410 8411 ASSERT(spa_writeable(spa)); 8412 8413 txg = spa_vdev_enter(spa); 8414 8415 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 8416 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 8417 error = (spa_has_checkpoint(spa)) ? 8418 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 8419 return (spa_vdev_exit(spa, NULL, txg, error)); 8420 } 8421 8422 /* clear the log and flush everything up to now */ 8423 activate_slog = spa_passivate_log(spa); 8424 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 8425 error = spa_reset_logs(spa); 8426 txg = spa_vdev_config_enter(spa); 8427 8428 if (activate_slog) 8429 spa_activate_log(spa); 8430 8431 if (error != 0) 8432 return (spa_vdev_exit(spa, NULL, txg, error)); 8433 8434 /* check new spa name before going any further */ 8435 if (spa_lookup(newname) != NULL) 8436 return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 8437 8438 /* 8439 * scan through all the children to ensure they're all mirrors 8440 */ 8441 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 8442 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 8443 &children) != 0) 8444 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 8445 8446 /* first, check to ensure we've got the right child count */ 8447 rvd = spa->spa_root_vdev; 8448 lastlog = 0; 8449 for (c = 0; c < rvd->vdev_children; c++) { 8450 vdev_t *vd = rvd->vdev_child[c]; 8451 8452 /* don't count the holes & logs as children */ 8453 if (vd->vdev_islog || (vd->vdev_ops != &vdev_indirect_ops && 8454 !vdev_is_concrete(vd))) { 8455 if (lastlog == 0) 8456 lastlog = c; 8457 continue; 8458 } 8459 8460 lastlog = 0; 8461 } 8462 if (children != (lastlog != 0 ? 
lastlog : rvd->vdev_children)) 8463 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 8464 8465 /* next, ensure no spare or cache devices are part of the split */ 8466 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 8467 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 8468 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 8469 8470 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 8471 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 8472 8473 /* then, loop over each vdev and validate it */ 8474 for (c = 0; c < children; c++) { 8475 uint64_t is_hole = 0; 8476 8477 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 8478 &is_hole); 8479 8480 if (is_hole != 0) { 8481 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 8482 spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 8483 continue; 8484 } else { 8485 error = SET_ERROR(EINVAL); 8486 break; 8487 } 8488 } 8489 8490 /* deal with indirect vdevs */ 8491 if (spa->spa_root_vdev->vdev_child[c]->vdev_ops == 8492 &vdev_indirect_ops) 8493 continue; 8494 8495 /* which disk is going to be split? */ 8496 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 8497 &glist[c]) != 0) { 8498 error = SET_ERROR(EINVAL); 8499 break; 8500 } 8501 8502 /* look it up in the spa */ 8503 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 8504 if (vml[c] == NULL) { 8505 error = SET_ERROR(ENODEV); 8506 break; 8507 } 8508 8509 /* make sure there's nothing stopping the split */ 8510 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 8511 vml[c]->vdev_islog || 8512 !vdev_is_concrete(vml[c]) || 8513 vml[c]->vdev_isspare || 8514 vml[c]->vdev_isl2cache || 8515 !vdev_writeable(vml[c]) || 8516 vml[c]->vdev_children != 0 || 8517 vml[c]->vdev_state != VDEV_STATE_HEALTHY || 8518 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 8519 error = SET_ERROR(EINVAL); 8520 break; 8521 } 8522 8523 if (vdev_dtl_required(vml[c]) || 8524 vdev_resilver_needed(vml[c], NULL, NULL)) { 8525 error = SET_ERROR(EBUSY); 8526 break; 8527 } 8528 8529 /* we need certain info from the top level */ 8530 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 8531 vml[c]->vdev_top->vdev_ms_array); 8532 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 8533 vml[c]->vdev_top->vdev_ms_shift); 8534 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 8535 vml[c]->vdev_top->vdev_asize); 8536 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 8537 vml[c]->vdev_top->vdev_ashift); 8538 8539 /* transfer per-vdev ZAPs */ 8540 ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); 8541 VERIFY0(nvlist_add_uint64(child[c], 8542 ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); 8543 8544 ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); 8545 VERIFY0(nvlist_add_uint64(child[c], 8546 ZPOOL_CONFIG_VDEV_TOP_ZAP, 8547 vml[c]->vdev_parent->vdev_top_zap)); 8548 } 8549 8550 if (error != 0) { 8551 kmem_free(vml, children * sizeof (vdev_t *)); 8552 kmem_free(glist, children * sizeof (uint64_t)); 8553 return (spa_vdev_exit(spa, NULL, txg, error)); 8554 } 8555 8556 /* stop writers from using the disks */ 8557 for (c = 0; c < children; c++) { 8558 if (vml[c] != NULL) 8559 vml[c]->vdev_offline = B_TRUE; 8560 } 8561 vdev_reopen(spa->spa_root_vdev); 8562 8563 /* 8564 * Temporarily record the splitting vdevs in the spa config. This 8565 * will disappear once the config is regenerated. 
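 * The entry added to spa_config below has roughly this shape (the guids
 * are illustrative):
 *
 *   ZPOOL_CONFIG_SPLIT:
 *       ZPOOL_CONFIG_SPLIT_LIST: [ guid0, guid1, ... ]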
8566 */ 8567 nvl = fnvlist_alloc(); 8568 fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, glist, children); 8569 kmem_free(glist, children * sizeof (uint64_t)); 8570 8571 mutex_enter(&spa->spa_props_lock); 8572 fnvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, nvl); 8573 mutex_exit(&spa->spa_props_lock); 8574 spa->spa_config_splitting = nvl; 8575 vdev_config_dirty(spa->spa_root_vdev); 8576 8577 /* configure and create the new pool */ 8578 fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname); 8579 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 8580 exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE); 8581 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)); 8582 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg); 8583 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 8584 spa_generate_guid(NULL)); 8585 VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 8586 (void) nvlist_lookup_string(props, 8587 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 8588 8589 /* add the new pool to the namespace */ 8590 newspa = spa_add(newname, config, altroot); 8591 newspa->spa_avz_action = AVZ_ACTION_REBUILD; 8592 newspa->spa_config_txg = spa->spa_config_txg; 8593 spa_set_log_state(newspa, SPA_LOG_CLEAR); 8594 8595 /* release the spa config lock, retaining the namespace lock */ 8596 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 8597 8598 if (zio_injection_enabled) 8599 zio_handle_panic_injection(spa, FTAG, 1); 8600 8601 spa_activate(newspa, spa_mode_global); 8602 spa_async_suspend(newspa); 8603 8604 /* 8605 * Temporarily stop the initializing and TRIM activity. We set the 8606 * state to ACTIVE so that we know to resume initializing or TRIM 8607 * once the split has completed. 8608 */ 8609 list_t vd_initialize_list; 8610 list_create(&vd_initialize_list, sizeof (vdev_t), 8611 offsetof(vdev_t, vdev_initialize_node)); 8612 8613 list_t vd_trim_list; 8614 list_create(&vd_trim_list, sizeof (vdev_t), 8615 offsetof(vdev_t, vdev_trim_node)); 8616 8617 for (c = 0; c < children; c++) { 8618 if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { 8619 mutex_enter(&vml[c]->vdev_initialize_lock); 8620 vdev_initialize_stop(vml[c], 8621 VDEV_INITIALIZE_ACTIVE, &vd_initialize_list); 8622 mutex_exit(&vml[c]->vdev_initialize_lock); 8623 8624 mutex_enter(&vml[c]->vdev_trim_lock); 8625 vdev_trim_stop(vml[c], VDEV_TRIM_ACTIVE, &vd_trim_list); 8626 mutex_exit(&vml[c]->vdev_trim_lock); 8627 } 8628 } 8629 8630 vdev_initialize_stop_wait(spa, &vd_initialize_list); 8631 vdev_trim_stop_wait(spa, &vd_trim_list); 8632 8633 list_destroy(&vd_initialize_list); 8634 list_destroy(&vd_trim_list); 8635 8636 newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT; 8637 newspa->spa_is_splitting = B_TRUE; 8638 8639 /* create the new pool from the disks of the original pool */ 8640 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE); 8641 if (error) 8642 goto out; 8643 8644 /* if that worked, generate a real config for the new pool */ 8645 if (newspa->spa_root_vdev != NULL) { 8646 newspa->spa_config_splitting = fnvlist_alloc(); 8647 fnvlist_add_uint64(newspa->spa_config_splitting, 8648 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)); 8649 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 8650 B_TRUE)); 8651 } 8652 8653 /* set the props */ 8654 if (props != NULL) { 8655 spa_configfile_set(newspa, props, B_FALSE); 8656 error = spa_prop_set(newspa, props); 8657 if (error) 8658 goto out; 8659 } 8660 8661 /* flush everything */ 8662 txg = 
spa_vdev_config_enter(newspa); 8663 vdev_config_dirty(newspa->spa_root_vdev); 8664 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 8665 8666 if (zio_injection_enabled) 8667 zio_handle_panic_injection(spa, FTAG, 2); 8668 8669 spa_async_resume(newspa); 8670 8671 /* finally, update the original pool's config */ 8672 txg = spa_vdev_config_enter(spa); 8673 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 8674 error = dmu_tx_assign(tx, DMU_TX_WAIT); 8675 if (error != 0) 8676 dmu_tx_abort(tx); 8677 for (c = 0; c < children; c++) { 8678 if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { 8679 vdev_t *tvd = vml[c]->vdev_top; 8680 8681 /* 8682 * Need to be sure the detachable VDEV is not 8683 * on any *other* txg's DTL list to prevent it 8684 * from being accessed after it's freed. 8685 */ 8686 for (int t = 0; t < TXG_SIZE; t++) { 8687 (void) txg_list_remove_this( 8688 &tvd->vdev_dtl_list, vml[c], t); 8689 } 8690 8691 vdev_split(vml[c]); 8692 if (error == 0) 8693 spa_history_log_internal(spa, "detach", tx, 8694 "vdev=%s", vml[c]->vdev_path); 8695 8696 vdev_free(vml[c]); 8697 } 8698 } 8699 spa->spa_avz_action = AVZ_ACTION_REBUILD; 8700 vdev_config_dirty(spa->spa_root_vdev); 8701 spa->spa_config_splitting = NULL; 8702 nvlist_free(nvl); 8703 if (error == 0) 8704 dmu_tx_commit(tx); 8705 (void) spa_vdev_exit(spa, NULL, txg, 0); 8706 8707 if (zio_injection_enabled) 8708 zio_handle_panic_injection(spa, FTAG, 3); 8709 8710 /* split is complete; log a history record */ 8711 spa_history_log_internal(newspa, "split", NULL, 8712 "from pool %s", spa_name(spa)); 8713 8714 newspa->spa_is_splitting = B_FALSE; 8715 kmem_free(vml, children * sizeof (vdev_t *)); 8716 8717 /* if we're not going to mount the filesystems in userland, export */ 8718 if (exp) 8719 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 8720 B_FALSE, B_FALSE); 8721 8722 return (error); 8723 8724 out: 8725 spa_unload(newspa); 8726 spa_deactivate(newspa); 8727 spa_remove(newspa); 8728 8729 txg = spa_vdev_config_enter(spa); 8730 8731 /* re-online all offlined disks */ 8732 for (c = 0; c < children; c++) { 8733 if (vml[c] != NULL) 8734 vml[c]->vdev_offline = B_FALSE; 8735 } 8736 8737 /* restart initializing or trimming disks as necessary */ 8738 spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); 8739 spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); 8740 spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); 8741 8742 vdev_reopen(spa->spa_root_vdev); 8743 8744 nvlist_free(spa->spa_config_splitting); 8745 spa->spa_config_splitting = NULL; 8746 (void) spa_vdev_exit(spa, NULL, txg, error); 8747 8748 kmem_free(vml, children * sizeof (vdev_t *)); 8749 return (error); 8750 } 8751 8752 /* 8753 * Find any device that's done replacing, or a vdev marked 'unspare' that's 8754 * currently spared, so we can detach it. 8755 */ 8756 static vdev_t * 8757 spa_vdev_resilver_done_hunt(vdev_t *vd) 8758 { 8759 vdev_t *newvd, *oldvd; 8760 8761 for (int c = 0; c < vd->vdev_children; c++) { 8762 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 8763 if (oldvd != NULL) 8764 return (oldvd); 8765 } 8766 8767 /* 8768 * Check for a completed replacement. We always consider the first 8769 * vdev in the list to be the oldest vdev, and the last one to be 8770 * the newest (see spa_vdev_attach() for how that works). In 8771 * the case where the newest vdev is faulted, we will not automatically 8772 * remove it after a resilver completes. This is OK as it will require 8773 * user intervention to determine which disk the admin wishes to keep. 
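 * For example, given replacing-0 with children (old, new), old is
 * returned (and subsequently detached) once new's DTL_MISSING and
 * DTL_OUTAGE are empty and old is no longer required to satisfy any DTL.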
8774 */ 8775 if (vd->vdev_ops == &vdev_replacing_ops) { 8776 ASSERT(vd->vdev_children > 1); 8777 8778 newvd = vd->vdev_child[vd->vdev_children - 1]; 8779 oldvd = vd->vdev_child[0]; 8780 8781 if (vdev_dtl_empty(newvd, DTL_MISSING) && 8782 vdev_dtl_empty(newvd, DTL_OUTAGE) && 8783 !vdev_dtl_required(oldvd)) 8784 return (oldvd); 8785 } 8786 8787 /* 8788 * Check for a completed resilver with the 'unspare' flag set. 8789 * Also potentially update faulted state. 8790 */ 8791 if (vd->vdev_ops == &vdev_spare_ops) { 8792 vdev_t *first = vd->vdev_child[0]; 8793 vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 8794 8795 if (last->vdev_unspare) { 8796 oldvd = first; 8797 newvd = last; 8798 } else if (first->vdev_unspare) { 8799 oldvd = last; 8800 newvd = first; 8801 } else { 8802 oldvd = NULL; 8803 } 8804 8805 if (oldvd != NULL && 8806 vdev_dtl_empty(newvd, DTL_MISSING) && 8807 vdev_dtl_empty(newvd, DTL_OUTAGE) && 8808 !vdev_dtl_required(oldvd)) 8809 return (oldvd); 8810 8811 vdev_propagate_state(vd); 8812 8813 /* 8814 * If there are more than two spares attached to a disk, 8815 * and those spares are not required, then we want to 8816 * attempt to free them up now so that they can be used 8817 * by other pools. Once we're back down to a single 8818 * disk+spare, we stop removing them. 8819 */ 8820 if (vd->vdev_children > 2) { 8821 newvd = vd->vdev_child[1]; 8822 8823 if (newvd->vdev_isspare && last->vdev_isspare && 8824 vdev_dtl_empty(last, DTL_MISSING) && 8825 vdev_dtl_empty(last, DTL_OUTAGE) && 8826 !vdev_dtl_required(newvd)) 8827 return (newvd); 8828 } 8829 } 8830 8831 return (NULL); 8832 } 8833 8834 static void 8835 spa_vdev_resilver_done(spa_t *spa) 8836 { 8837 vdev_t *vd, *pvd, *ppvd; 8838 uint64_t guid, sguid, pguid, ppguid; 8839 8840 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 8841 8842 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 8843 pvd = vd->vdev_parent; 8844 ppvd = pvd->vdev_parent; 8845 guid = vd->vdev_guid; 8846 pguid = pvd->vdev_guid; 8847 ppguid = ppvd->vdev_guid; 8848 sguid = 0; 8849 /* 8850 * If we have just finished replacing a hot spared device, then 8851 * we need to detach the parent's first child (the original hot 8852 * spare) as well. 8853 */ 8854 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 8855 ppvd->vdev_children == 2) { 8856 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 8857 sguid = ppvd->vdev_child[1]->vdev_guid; 8858 } 8859 ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); 8860 8861 spa_config_exit(spa, SCL_ALL, FTAG); 8862 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 8863 return; 8864 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 8865 return; 8866 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 8867 } 8868 8869 spa_config_exit(spa, SCL_ALL, FTAG); 8870 8871 /* 8872 * If a detach was not performed above replace waiters will not have 8873 * been notified. In which case we must do so now. 8874 */ 8875 spa_notify_waiters(spa); 8876 } 8877 8878 /* 8879 * Update the stored path or FRU for this vdev. 
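 * 'value' replaces vdev_path when 'ispath' is set and vdev_fru otherwise;
 * the vdev is only synced out if the stored string actually changed.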
8880 */ 8881 static int 8882 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 8883 boolean_t ispath) 8884 { 8885 vdev_t *vd; 8886 boolean_t sync = B_FALSE; 8887 8888 ASSERT(spa_writeable(spa)); 8889 8890 spa_vdev_state_enter(spa, SCL_ALL); 8891 8892 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 8893 return (spa_vdev_state_exit(spa, NULL, ENOENT)); 8894 8895 if (!vd->vdev_ops->vdev_op_leaf) 8896 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 8897 8898 if (ispath) { 8899 if (strcmp(value, vd->vdev_path) != 0) { 8900 spa_strfree(vd->vdev_path); 8901 vd->vdev_path = spa_strdup(value); 8902 sync = B_TRUE; 8903 } 8904 } else { 8905 if (vd->vdev_fru == NULL) { 8906 vd->vdev_fru = spa_strdup(value); 8907 sync = B_TRUE; 8908 } else if (strcmp(value, vd->vdev_fru) != 0) { 8909 spa_strfree(vd->vdev_fru); 8910 vd->vdev_fru = spa_strdup(value); 8911 sync = B_TRUE; 8912 } 8913 } 8914 8915 return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); 8916 } 8917 8918 int 8919 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 8920 { 8921 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 8922 } 8923 8924 int 8925 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 8926 { 8927 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 8928 } 8929 8930 /* 8931 * ========================================================================== 8932 * SPA Scanning 8933 * ========================================================================== 8934 */ 8935 int 8936 spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd) 8937 { 8938 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 8939 8940 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 8941 return (SET_ERROR(EBUSY)); 8942 8943 return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd)); 8944 } 8945 8946 int 8947 spa_scan_stop(spa_t *spa) 8948 { 8949 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 8950 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 8951 return (SET_ERROR(EBUSY)); 8952 8953 return (dsl_scan_cancel(spa->spa_dsl_pool)); 8954 } 8955 8956 int 8957 spa_scan(spa_t *spa, pool_scan_func_t func) 8958 { 8959 return (spa_scan_range(spa, func, 0, 0)); 8960 } 8961 8962 int 8963 spa_scan_range(spa_t *spa, pool_scan_func_t func, uint64_t txgstart, 8964 uint64_t txgend) 8965 { 8966 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 8967 8968 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 8969 return (SET_ERROR(ENOTSUP)); 8970 8971 if (func == POOL_SCAN_RESILVER && 8972 !spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) 8973 return (SET_ERROR(ENOTSUP)); 8974 8975 if (func != POOL_SCAN_SCRUB && (txgstart != 0 || txgend != 0)) 8976 return (SET_ERROR(ENOTSUP)); 8977 8978 /* 8979 * If a resilver was requested, but there is no DTL on a 8980 * writeable leaf device, we have nothing to do. 
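 * In that case we only post SPA_ASYNC_RESILVER_DONE so that any
 * completed replacements or spares still get cleaned up.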
8981 */ 8982 if (func == POOL_SCAN_RESILVER && 8983 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 8984 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 8985 return (0); 8986 } 8987 8988 if (func == POOL_SCAN_ERRORSCRUB && 8989 !spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) 8990 return (SET_ERROR(ENOTSUP)); 8991 8992 return (dsl_scan(spa->spa_dsl_pool, func, txgstart, txgend)); 8993 } 8994 8995 /* 8996 * ========================================================================== 8997 * SPA async task processing 8998 * ========================================================================== 8999 */ 9000 9001 static void 9002 spa_async_remove(spa_t *spa, vdev_t *vd, boolean_t by_kernel) 9003 { 9004 if (vd->vdev_remove_wanted) { 9005 vd->vdev_remove_wanted = B_FALSE; 9006 vd->vdev_delayed_close = B_FALSE; 9007 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 9008 9009 /* 9010 * We want to clear the stats, but we don't want to do a full 9011 * vdev_clear() as that will cause us to throw away 9012 * degraded/faulted state as well as attempt to reopen the 9013 * device, all of which is a waste. 9014 */ 9015 vd->vdev_stat.vs_read_errors = 0; 9016 vd->vdev_stat.vs_write_errors = 0; 9017 vd->vdev_stat.vs_checksum_errors = 0; 9018 9019 vdev_state_dirty(vd->vdev_top); 9020 9021 /* Tell userspace that the vdev is gone. */ 9022 zfs_post_remove(spa, vd, by_kernel); 9023 } 9024 9025 for (int c = 0; c < vd->vdev_children; c++) 9026 spa_async_remove(spa, vd->vdev_child[c], by_kernel); 9027 } 9028 9029 static void 9030 spa_async_fault_vdev(vdev_t *vd, boolean_t *suspend) 9031 { 9032 if (vd->vdev_fault_wanted) { 9033 vdev_state_t newstate = VDEV_STATE_FAULTED; 9034 vd->vdev_fault_wanted = B_FALSE; 9035 9036 /* 9037 * If this device has the only valid copy of the data, then 9038 * back off and simply mark the vdev as degraded instead. 9039 */ 9040 if (!vd->vdev_top->vdev_islog && vd->vdev_aux == NULL && 9041 vdev_dtl_required(vd)) { 9042 newstate = VDEV_STATE_DEGRADED; 9043 /* A required disk is missing so suspend the pool */ 9044 *suspend = B_TRUE; 9045 } 9046 vdev_set_state(vd, B_TRUE, newstate, VDEV_AUX_ERR_EXCEEDED); 9047 } 9048 for (int c = 0; c < vd->vdev_children; c++) 9049 spa_async_fault_vdev(vd->vdev_child[c], suspend); 9050 } 9051 9052 static void 9053 spa_async_autoexpand(spa_t *spa, vdev_t *vd) 9054 { 9055 if (!spa->spa_autoexpand) 9056 return; 9057 9058 for (int c = 0; c < vd->vdev_children; c++) { 9059 vdev_t *cvd = vd->vdev_child[c]; 9060 spa_async_autoexpand(spa, cvd); 9061 } 9062 9063 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 9064 return; 9065 9066 spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_AUTOEXPAND); 9067 } 9068 9069 static __attribute__((noreturn)) void 9070 spa_async_thread(void *arg) 9071 { 9072 spa_t *spa = (spa_t *)arg; 9073 dsl_pool_t *dp = spa->spa_dsl_pool; 9074 int tasks; 9075 9076 ASSERT(spa->spa_sync_on); 9077 9078 mutex_enter(&spa->spa_async_lock); 9079 tasks = spa->spa_async_tasks; 9080 spa->spa_async_tasks = 0; 9081 mutex_exit(&spa->spa_async_lock); 9082 9083 /* 9084 * See if the config needs to be updated. 
9085 */ 9086 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 9087 uint64_t old_space, new_space; 9088 9089 mutex_enter(&spa_namespace_lock); 9090 old_space = metaslab_class_get_space(spa_normal_class(spa)); 9091 old_space += metaslab_class_get_space(spa_special_class(spa)); 9092 old_space += metaslab_class_get_space(spa_dedup_class(spa)); 9093 old_space += metaslab_class_get_space( 9094 spa_embedded_log_class(spa)); 9095 9096 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 9097 9098 new_space = metaslab_class_get_space(spa_normal_class(spa)); 9099 new_space += metaslab_class_get_space(spa_special_class(spa)); 9100 new_space += metaslab_class_get_space(spa_dedup_class(spa)); 9101 new_space += metaslab_class_get_space( 9102 spa_embedded_log_class(spa)); 9103 mutex_exit(&spa_namespace_lock); 9104 9105 /* 9106 * If the pool grew as a result of the config update, 9107 * then log an internal history event. 9108 */ 9109 if (new_space != old_space) { 9110 spa_history_log_internal(spa, "vdev online", NULL, 9111 "pool '%s' size: %llu(+%llu)", 9112 spa_name(spa), (u_longlong_t)new_space, 9113 (u_longlong_t)(new_space - old_space)); 9114 } 9115 } 9116 9117 /* 9118 * See if any devices need to be marked REMOVED. 9119 */ 9120 if (tasks & (SPA_ASYNC_REMOVE | SPA_ASYNC_REMOVE_BY_USER)) { 9121 boolean_t by_kernel = B_TRUE; 9122 if (tasks & SPA_ASYNC_REMOVE_BY_USER) 9123 by_kernel = B_FALSE; 9124 spa_vdev_state_enter(spa, SCL_NONE); 9125 spa_async_remove(spa, spa->spa_root_vdev, by_kernel); 9126 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 9127 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i], 9128 by_kernel); 9129 for (int i = 0; i < spa->spa_spares.sav_count; i++) 9130 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i], 9131 by_kernel); 9132 (void) spa_vdev_state_exit(spa, NULL, 0); 9133 } 9134 9135 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 9136 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 9137 spa_async_autoexpand(spa, spa->spa_root_vdev); 9138 spa_config_exit(spa, SCL_CONFIG, FTAG); 9139 } 9140 9141 /* 9142 * See if any devices need to be marked faulted. 9143 */ 9144 if (tasks & SPA_ASYNC_FAULT_VDEV) { 9145 spa_vdev_state_enter(spa, SCL_NONE); 9146 boolean_t suspend = B_FALSE; 9147 spa_async_fault_vdev(spa->spa_root_vdev, &suspend); 9148 (void) spa_vdev_state_exit(spa, NULL, 0); 9149 if (suspend) 9150 zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR); 9151 } 9152 9153 /* 9154 * If any devices are done replacing, detach them. 9155 */ 9156 if (tasks & SPA_ASYNC_RESILVER_DONE || 9157 tasks & SPA_ASYNC_REBUILD_DONE || 9158 tasks & SPA_ASYNC_DETACH_SPARE) { 9159 spa_vdev_resilver_done(spa); 9160 } 9161 9162 /* 9163 * Kick off a resilver. 
9164 */ 9165 if (tasks & SPA_ASYNC_RESILVER && 9166 !vdev_rebuild_active(spa->spa_root_vdev) && 9167 (!dsl_scan_resilvering(dp) || 9168 !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))) 9169 dsl_scan_restart_resilver(dp, 0); 9170 9171 if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { 9172 mutex_enter(&spa_namespace_lock); 9173 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 9174 vdev_initialize_restart(spa->spa_root_vdev); 9175 spa_config_exit(spa, SCL_CONFIG, FTAG); 9176 mutex_exit(&spa_namespace_lock); 9177 } 9178 9179 if (tasks & SPA_ASYNC_TRIM_RESTART) { 9180 mutex_enter(&spa_namespace_lock); 9181 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 9182 vdev_trim_restart(spa->spa_root_vdev); 9183 spa_config_exit(spa, SCL_CONFIG, FTAG); 9184 mutex_exit(&spa_namespace_lock); 9185 } 9186 9187 if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) { 9188 mutex_enter(&spa_namespace_lock); 9189 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 9190 vdev_autotrim_restart(spa); 9191 spa_config_exit(spa, SCL_CONFIG, FTAG); 9192 mutex_exit(&spa_namespace_lock); 9193 } 9194 9195 /* 9196 * Kick off L2 cache whole device TRIM. 9197 */ 9198 if (tasks & SPA_ASYNC_L2CACHE_TRIM) { 9199 mutex_enter(&spa_namespace_lock); 9200 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 9201 vdev_trim_l2arc(spa); 9202 spa_config_exit(spa, SCL_CONFIG, FTAG); 9203 mutex_exit(&spa_namespace_lock); 9204 } 9205 9206 /* 9207 * Kick off L2 cache rebuilding. 9208 */ 9209 if (tasks & SPA_ASYNC_L2CACHE_REBUILD) { 9210 mutex_enter(&spa_namespace_lock); 9211 spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER); 9212 l2arc_spa_rebuild_start(spa); 9213 spa_config_exit(spa, SCL_L2ARC, FTAG); 9214 mutex_exit(&spa_namespace_lock); 9215 } 9216 9217 /* 9218 * Let the world know that we're done. 
9219 */ 9220 mutex_enter(&spa->spa_async_lock); 9221 spa->spa_async_thread = NULL; 9222 cv_broadcast(&spa->spa_async_cv); 9223 mutex_exit(&spa->spa_async_lock); 9224 thread_exit(); 9225 } 9226 9227 void 9228 spa_async_suspend(spa_t *spa) 9229 { 9230 mutex_enter(&spa->spa_async_lock); 9231 spa->spa_async_suspended++; 9232 while (spa->spa_async_thread != NULL) 9233 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 9234 mutex_exit(&spa->spa_async_lock); 9235 9236 spa_vdev_remove_suspend(spa); 9237 9238 zthr_t *condense_thread = spa->spa_condense_zthr; 9239 if (condense_thread != NULL) 9240 zthr_cancel(condense_thread); 9241 9242 zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr; 9243 if (raidz_expand_thread != NULL) 9244 zthr_cancel(raidz_expand_thread); 9245 9246 zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; 9247 if (discard_thread != NULL) 9248 zthr_cancel(discard_thread); 9249 9250 zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; 9251 if (ll_delete_thread != NULL) 9252 zthr_cancel(ll_delete_thread); 9253 9254 zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; 9255 if (ll_condense_thread != NULL) 9256 zthr_cancel(ll_condense_thread); 9257 } 9258 9259 void 9260 spa_async_resume(spa_t *spa) 9261 { 9262 mutex_enter(&spa->spa_async_lock); 9263 ASSERT(spa->spa_async_suspended != 0); 9264 spa->spa_async_suspended--; 9265 mutex_exit(&spa->spa_async_lock); 9266 spa_restart_removal(spa); 9267 9268 zthr_t *condense_thread = spa->spa_condense_zthr; 9269 if (condense_thread != NULL) 9270 zthr_resume(condense_thread); 9271 9272 zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr; 9273 if (raidz_expand_thread != NULL) 9274 zthr_resume(raidz_expand_thread); 9275 9276 zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; 9277 if (discard_thread != NULL) 9278 zthr_resume(discard_thread); 9279 9280 zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; 9281 if (ll_delete_thread != NULL) 9282 zthr_resume(ll_delete_thread); 9283 9284 zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; 9285 if (ll_condense_thread != NULL) 9286 zthr_resume(ll_condense_thread); 9287 } 9288 9289 static boolean_t 9290 spa_async_tasks_pending(spa_t *spa) 9291 { 9292 uint_t non_config_tasks; 9293 uint_t config_task; 9294 boolean_t config_task_suspended; 9295 9296 non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE; 9297 config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; 9298 if (spa->spa_ccw_fail_time == 0) { 9299 config_task_suspended = B_FALSE; 9300 } else { 9301 config_task_suspended = 9302 (gethrtime() - spa->spa_ccw_fail_time) < 9303 ((hrtime_t)zfs_ccw_retry_interval * NANOSEC); 9304 } 9305 9306 return (non_config_tasks || (config_task && !config_task_suspended)); 9307 } 9308 9309 static void 9310 spa_async_dispatch(spa_t *spa) 9311 { 9312 mutex_enter(&spa->spa_async_lock); 9313 if (spa_async_tasks_pending(spa) && 9314 !spa->spa_async_suspended && 9315 spa->spa_async_thread == NULL) 9316 spa->spa_async_thread = thread_create(NULL, 0, 9317 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 9318 mutex_exit(&spa->spa_async_lock); 9319 } 9320 9321 void 9322 spa_async_request(spa_t *spa, int task) 9323 { 9324 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 9325 mutex_enter(&spa->spa_async_lock); 9326 spa->spa_async_tasks |= task; 9327 mutex_exit(&spa->spa_async_lock); 9328 } 9329 9330 int 9331 spa_async_tasks(spa_t *spa) 9332 { 9333 return (spa->spa_async_tasks); 9334 } 9335 9336 /* 9337 * 
========================================================================== 9338 * SPA syncing routines 9339 * ========================================================================== 9340 */ 9341 9342 9343 static int 9344 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 9345 dmu_tx_t *tx) 9346 { 9347 bpobj_t *bpo = arg; 9348 bpobj_enqueue(bpo, bp, bp_freed, tx); 9349 return (0); 9350 } 9351 9352 int 9353 bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 9354 { 9355 return (bpobj_enqueue_cb(arg, bp, B_FALSE, tx)); 9356 } 9357 9358 int 9359 bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 9360 { 9361 return (bpobj_enqueue_cb(arg, bp, B_TRUE, tx)); 9362 } 9363 9364 static int 9365 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 9366 { 9367 zio_t *pio = arg; 9368 9369 zio_nowait(zio_free_sync(pio, pio->io_spa, dmu_tx_get_txg(tx), bp, 9370 pio->io_flags)); 9371 return (0); 9372 } 9373 9374 static int 9375 bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 9376 dmu_tx_t *tx) 9377 { 9378 ASSERT(!bp_freed); 9379 return (spa_free_sync_cb(arg, bp, tx)); 9380 } 9381 9382 /* 9383 * Note: this simple function is not inlined to make it easier to dtrace the 9384 * amount of time spent syncing frees. 9385 */ 9386 static void 9387 spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) 9388 { 9389 zio_t *zio = zio_root(spa, NULL, NULL, 0); 9390 bplist_iterate(bpl, spa_free_sync_cb, zio, tx); 9391 VERIFY(zio_wait(zio) == 0); 9392 } 9393 9394 /* 9395 * Note: this simple function is not inlined to make it easier to dtrace the 9396 * amount of time spent syncing deferred frees. 9397 */ 9398 static void 9399 spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) 9400 { 9401 if (spa_sync_pass(spa) != 1) 9402 return; 9403 9404 /* 9405 * Note: 9406 * If the log space map feature is active, we stop deferring 9407 * frees to the next TXG and therefore running this function 9408 * would be considered a no-op as spa_deferred_bpobj should 9409 * not have any entries. 9410 * 9411 * That said we run this function anyway (instead of returning 9412 * immediately) for the edge-case scenario where we just 9413 * activated the log space map feature in this TXG but we have 9414 * deferred frees from the previous TXG. 9415 */ 9416 zio_t *zio = zio_root(spa, NULL, NULL, 0); 9417 VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, 9418 bpobj_spa_free_sync_cb, zio, tx), ==, 0); 9419 VERIFY0(zio_wait(zio)); 9420 } 9421 9422 static void 9423 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 9424 { 9425 char *packed = NULL; 9426 size_t bufsize; 9427 size_t nvsize = 0; 9428 dmu_buf_t *db; 9429 9430 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 9431 9432 /* 9433 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 9434 * information. This avoids the dmu_buf_will_dirty() path and 9435 * saves us a pre-read to get data we don't actually care about. 
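 * The packed nvlist is zero-padded out to the block boundary; the actual
 * packed size is recorded in the object's bonus buffer below so readers
 * know how many bytes to unpack.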
9436 */ 9437 bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); 9438 packed = vmem_alloc(bufsize, KM_SLEEP); 9439 9440 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 9441 KM_SLEEP) == 0); 9442 memset(packed + nvsize, 0, bufsize - nvsize); 9443 9444 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 9445 9446 vmem_free(packed, bufsize); 9447 9448 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 9449 dmu_buf_will_dirty(db, tx); 9450 *(uint64_t *)db->db_data = nvsize; 9451 dmu_buf_rele(db, FTAG); 9452 } 9453 9454 static void 9455 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 9456 const char *config, const char *entry) 9457 { 9458 nvlist_t *nvroot; 9459 nvlist_t **list; 9460 int i; 9461 9462 if (!sav->sav_sync) 9463 return; 9464 9465 /* 9466 * Update the MOS nvlist describing the list of available devices. 9467 * spa_validate_aux() will have already made sure this nvlist is 9468 * valid and the vdevs are labeled appropriately. 9469 */ 9470 if (sav->sav_object == 0) { 9471 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 9472 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 9473 sizeof (uint64_t), tx); 9474 VERIFY(zap_update(spa->spa_meta_objset, 9475 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 9476 &sav->sav_object, tx) == 0); 9477 } 9478 9479 nvroot = fnvlist_alloc(); 9480 if (sav->sav_count == 0) { 9481 fnvlist_add_nvlist_array(nvroot, config, 9482 (const nvlist_t * const *)NULL, 0); 9483 } else { 9484 list = kmem_alloc(sav->sav_count*sizeof (void *), KM_SLEEP); 9485 for (i = 0; i < sav->sav_count; i++) 9486 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 9487 B_FALSE, VDEV_CONFIG_L2CACHE); 9488 fnvlist_add_nvlist_array(nvroot, config, 9489 (const nvlist_t * const *)list, sav->sav_count); 9490 for (i = 0; i < sav->sav_count; i++) 9491 nvlist_free(list[i]); 9492 kmem_free(list, sav->sav_count * sizeof (void *)); 9493 } 9494 9495 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 9496 nvlist_free(nvroot); 9497 9498 sav->sav_sync = B_FALSE; 9499 } 9500 9501 /* 9502 * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t. 9503 * The all-vdev ZAP must be empty. 9504 */ 9505 static void 9506 spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) 9507 { 9508 spa_t *spa = vd->vdev_spa; 9509 9510 if (vd->vdev_root_zap != 0 && 9511 spa_feature_is_active(spa, SPA_FEATURE_AVZ_V2)) { 9512 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 9513 vd->vdev_root_zap, tx)); 9514 } 9515 if (vd->vdev_top_zap != 0) { 9516 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 9517 vd->vdev_top_zap, tx)); 9518 } 9519 if (vd->vdev_leaf_zap != 0) { 9520 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 9521 vd->vdev_leaf_zap, tx)); 9522 } 9523 for (uint64_t i = 0; i < vd->vdev_children; i++) { 9524 spa_avz_build(vd->vdev_child[i], avz, tx); 9525 } 9526 } 9527 9528 static void 9529 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 9530 { 9531 nvlist_t *config; 9532 9533 /* 9534 * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS, 9535 * its config may not be dirty but we still need to build per-vdev ZAPs. 9536 * Similarly, if the pool is being assembled (e.g. after a split), we 9537 * need to rebuild the AVZ although the config may not be dirty. 
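 * Otherwise, with a clean config and no pending AVZ action, there is
 * nothing for this function to do.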
9538 */ 9539 if (list_is_empty(&spa->spa_config_dirty_list) && 9540 spa->spa_avz_action == AVZ_ACTION_NONE) 9541 return; 9542 9543 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 9544 9545 ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE || 9546 spa->spa_avz_action == AVZ_ACTION_INITIALIZE || 9547 spa->spa_all_vdev_zaps != 0); 9548 9549 if (spa->spa_avz_action == AVZ_ACTION_REBUILD) { 9550 /* Make and build the new AVZ */ 9551 uint64_t new_avz = zap_create(spa->spa_meta_objset, 9552 DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); 9553 spa_avz_build(spa->spa_root_vdev, new_avz, tx); 9554 9555 /* Diff old AVZ with new one */ 9556 zap_cursor_t zc; 9557 zap_attribute_t *za = zap_attribute_alloc(); 9558 9559 for (zap_cursor_init(&zc, spa->spa_meta_objset, 9560 spa->spa_all_vdev_zaps); 9561 zap_cursor_retrieve(&zc, za) == 0; 9562 zap_cursor_advance(&zc)) { 9563 uint64_t vdzap = za->za_first_integer; 9564 if (zap_lookup_int(spa->spa_meta_objset, new_avz, 9565 vdzap) == ENOENT) { 9566 /* 9567 * ZAP is listed in old AVZ but not in new one; 9568 * destroy it 9569 */ 9570 VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap, 9571 tx)); 9572 } 9573 } 9574 9575 zap_cursor_fini(&zc); 9576 zap_attribute_free(za); 9577 9578 /* Destroy the old AVZ */ 9579 VERIFY0(zap_destroy(spa->spa_meta_objset, 9580 spa->spa_all_vdev_zaps, tx)); 9581 9582 /* Replace the old AVZ in the dir obj with the new one */ 9583 VERIFY0(zap_update(spa->spa_meta_objset, 9584 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, 9585 sizeof (new_avz), 1, &new_avz, tx)); 9586 9587 spa->spa_all_vdev_zaps = new_avz; 9588 } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) { 9589 zap_cursor_t zc; 9590 zap_attribute_t *za = zap_attribute_alloc(); 9591 9592 /* Walk through the AVZ and destroy all listed ZAPs */ 9593 for (zap_cursor_init(&zc, spa->spa_meta_objset, 9594 spa->spa_all_vdev_zaps); 9595 zap_cursor_retrieve(&zc, za) == 0; 9596 zap_cursor_advance(&zc)) { 9597 uint64_t zap = za->za_first_integer; 9598 VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx)); 9599 } 9600 9601 zap_cursor_fini(&zc); 9602 zap_attribute_free(za); 9603 9604 /* Destroy and unlink the AVZ itself */ 9605 VERIFY0(zap_destroy(spa->spa_meta_objset, 9606 spa->spa_all_vdev_zaps, tx)); 9607 VERIFY0(zap_remove(spa->spa_meta_objset, 9608 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx)); 9609 spa->spa_all_vdev_zaps = 0; 9610 } 9611 9612 if (spa->spa_all_vdev_zaps == 0) { 9613 spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset, 9614 DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, 9615 DMU_POOL_VDEV_ZAP_MAP, tx); 9616 } 9617 spa->spa_avz_action = AVZ_ACTION_NONE; 9618 9619 /* Create ZAPs for vdevs that don't have them. */ 9620 vdev_construct_zaps(spa->spa_root_vdev, tx); 9621 9622 config = spa_config_generate(spa, spa->spa_root_vdev, 9623 dmu_tx_get_txg(tx), B_FALSE); 9624 9625 /* 9626 * If we're upgrading the spa version then make sure that 9627 * the config object gets updated with the correct version. 
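 * (At this point spa_uberblock already carries the new version, while
 * spa_ubsync still holds the last version synced to disk.)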
9628 */ 9629 if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) 9630 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 9631 spa->spa_uberblock.ub_version); 9632 9633 spa_config_exit(spa, SCL_STATE, FTAG); 9634 9635 nvlist_free(spa->spa_config_syncing); 9636 spa->spa_config_syncing = config; 9637 9638 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 9639 } 9640 9641 static void 9642 spa_sync_version(void *arg, dmu_tx_t *tx) 9643 { 9644 uint64_t *versionp = arg; 9645 uint64_t version = *versionp; 9646 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 9647 9648 /* 9649 * Setting the version is special cased when first creating the pool. 9650 */ 9651 ASSERT(tx->tx_txg != TXG_INITIAL); 9652 9653 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 9654 ASSERT(version >= spa_version(spa)); 9655 9656 spa->spa_uberblock.ub_version = version; 9657 vdev_config_dirty(spa->spa_root_vdev); 9658 spa_history_log_internal(spa, "set", tx, "version=%lld", 9659 (longlong_t)version); 9660 } 9661 9662 /* 9663 * Set zpool properties. 9664 */ 9665 static void 9666 spa_sync_props(void *arg, dmu_tx_t *tx) 9667 { 9668 nvlist_t *nvp = arg; 9669 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 9670 objset_t *mos = spa->spa_meta_objset; 9671 nvpair_t *elem = NULL; 9672 9673 mutex_enter(&spa->spa_props_lock); 9674 9675 while ((elem = nvlist_next_nvpair(nvp, elem))) { 9676 uint64_t intval; 9677 const char *strval, *fname; 9678 zpool_prop_t prop; 9679 const char *propname; 9680 const char *elemname = nvpair_name(elem); 9681 zprop_type_t proptype; 9682 spa_feature_t fid; 9683 9684 switch (prop = zpool_name_to_prop(elemname)) { 9685 case ZPOOL_PROP_VERSION: 9686 intval = fnvpair_value_uint64(elem); 9687 /* 9688 * The version is synced separately before other 9689 * properties and should be correct by now. 9690 */ 9691 ASSERT3U(spa_version(spa), >=, intval); 9692 break; 9693 9694 case ZPOOL_PROP_ALTROOT: 9695 /* 9696 * 'altroot' is a non-persistent property. It should 9697 * have been set temporarily at creation or import time. 9698 */ 9699 ASSERT(spa->spa_root != NULL); 9700 break; 9701 9702 case ZPOOL_PROP_READONLY: 9703 case ZPOOL_PROP_CACHEFILE: 9704 /* 9705 * 'readonly' and 'cachefile' are also non-persistent 9706 * properties. 9707 */ 9708 break; 9709 case ZPOOL_PROP_COMMENT: 9710 strval = fnvpair_value_string(elem); 9711 if (spa->spa_comment != NULL) 9712 spa_strfree(spa->spa_comment); 9713 spa->spa_comment = spa_strdup(strval); 9714 /* 9715 * We need to dirty the configuration on all the vdevs 9716 * so that their labels get updated. We also need to 9717 * update the cache file to keep it in sync with the 9718 * MOS version. It's unnecessary to do this for pool 9719 * creation since the vdev's configuration has already 9720 * been dirtied. 9721 */ 9722 if (tx->tx_txg != TXG_INITIAL) { 9723 vdev_config_dirty(spa->spa_root_vdev); 9724 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 9725 } 9726 spa_history_log_internal(spa, "set", tx, 9727 "%s=%s", elemname, strval); 9728 break; 9729 case ZPOOL_PROP_COMPATIBILITY: 9730 strval = fnvpair_value_string(elem); 9731 if (spa->spa_compatibility != NULL) 9732 spa_strfree(spa->spa_compatibility); 9733 spa->spa_compatibility = spa_strdup(strval); 9734 /* 9735 * Dirty the configuration on vdevs as above. 
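 * (Skipped at pool creation, when the vdev configuration is already
 * dirty.)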
9736 */ 9737 if (tx->tx_txg != TXG_INITIAL) { 9738 vdev_config_dirty(spa->spa_root_vdev); 9739 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 9740 } 9741 9742 spa_history_log_internal(spa, "set", tx, 9743 "%s=%s", nvpair_name(elem), strval); 9744 break; 9745 9746 case ZPOOL_PROP_INVAL: 9747 if (zpool_prop_feature(elemname)) { 9748 fname = strchr(elemname, '@') + 1; 9749 VERIFY0(zfeature_lookup_name(fname, &fid)); 9750 9751 spa_feature_enable(spa, fid, tx); 9752 spa_history_log_internal(spa, "set", tx, 9753 "%s=enabled", elemname); 9754 break; 9755 } else if (!zfs_prop_user(elemname)) { 9756 ASSERT(zpool_prop_feature(elemname)); 9757 break; 9758 } 9759 zfs_fallthrough; 9760 default: 9761 /* 9762 * Set pool property values in the poolprops mos object. 9763 */ 9764 if (spa->spa_pool_props_object == 0) { 9765 spa->spa_pool_props_object = 9766 zap_create_link(mos, DMU_OT_POOL_PROPS, 9767 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 9768 tx); 9769 } 9770 9771 /* normalize the property name */ 9772 if (prop == ZPOOL_PROP_INVAL) { 9773 propname = elemname; 9774 proptype = PROP_TYPE_STRING; 9775 } else { 9776 propname = zpool_prop_to_name(prop); 9777 proptype = zpool_prop_get_type(prop); 9778 } 9779 9780 if (nvpair_type(elem) == DATA_TYPE_STRING) { 9781 ASSERT(proptype == PROP_TYPE_STRING); 9782 strval = fnvpair_value_string(elem); 9783 if (strlen(strval) == 0) { 9784 /* remove the property if value == "" */ 9785 (void) zap_remove(mos, 9786 spa->spa_pool_props_object, 9787 propname, tx); 9788 } else { 9789 VERIFY0(zap_update(mos, 9790 spa->spa_pool_props_object, 9791 propname, 1, strlen(strval) + 1, 9792 strval, tx)); 9793 } 9794 spa_history_log_internal(spa, "set", tx, 9795 "%s=%s", elemname, strval); 9796 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 9797 intval = fnvpair_value_uint64(elem); 9798 9799 if (proptype == PROP_TYPE_INDEX) { 9800 const char *unused; 9801 VERIFY0(zpool_prop_index_to_string( 9802 prop, intval, &unused)); 9803 } 9804 VERIFY0(zap_update(mos, 9805 spa->spa_pool_props_object, propname, 9806 8, 1, &intval, tx)); 9807 spa_history_log_internal(spa, "set", tx, 9808 "%s=%lld", elemname, 9809 (longlong_t)intval); 9810 9811 switch (prop) { 9812 case ZPOOL_PROP_DELEGATION: 9813 spa->spa_delegation = intval; 9814 break; 9815 case ZPOOL_PROP_BOOTFS: 9816 spa->spa_bootfs = intval; 9817 break; 9818 case ZPOOL_PROP_FAILUREMODE: 9819 spa->spa_failmode = intval; 9820 break; 9821 case ZPOOL_PROP_AUTOTRIM: 9822 spa->spa_autotrim = intval; 9823 spa_async_request(spa, 9824 SPA_ASYNC_AUTOTRIM_RESTART); 9825 break; 9826 case ZPOOL_PROP_AUTOEXPAND: 9827 spa->spa_autoexpand = intval; 9828 if (tx->tx_txg != TXG_INITIAL) 9829 spa_async_request(spa, 9830 SPA_ASYNC_AUTOEXPAND); 9831 break; 9832 case ZPOOL_PROP_MULTIHOST: 9833 spa->spa_multihost = intval; 9834 break; 9835 case ZPOOL_PROP_DEDUP_TABLE_QUOTA: 9836 spa->spa_dedup_table_quota = intval; 9837 break; 9838 default: 9839 break; 9840 } 9841 } else { 9842 ASSERT(0); /* not allowed */ 9843 } 9844 } 9845 9846 } 9847 9848 mutex_exit(&spa->spa_props_lock); 9849 } 9850 9851 /* 9852 * Perform one-time upgrade on-disk changes. spa_version() does not 9853 * reflect the new version this txg, so there must be no changes this 9854 * txg to anything that the upgrade code depends on after it executes. 9855 * Therefore this must be called after dsl_pool_sync() does the sync 9856 * tasks. 
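 * Each upgrade step below fires only when its version boundary is being
 * crossed in this txg, i.e. the previously synced version (spa_ubsync) is
 * still below the threshold while the version being synced
 * (spa_uberblock) has reached it.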
9857 */ 9858 static void 9859 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) 9860 { 9861 if (spa_sync_pass(spa) != 1) 9862 return; 9863 9864 dsl_pool_t *dp = spa->spa_dsl_pool; 9865 rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); 9866 9867 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 9868 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 9869 dsl_pool_create_origin(dp, tx); 9870 9871 /* Keeping the origin open increases spa_minref */ 9872 spa->spa_minref += 3; 9873 } 9874 9875 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 9876 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 9877 dsl_pool_upgrade_clones(dp, tx); 9878 } 9879 9880 if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && 9881 spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { 9882 dsl_pool_upgrade_dir_clones(dp, tx); 9883 9884 /* Keeping the freedir open increases spa_minref */ 9885 spa->spa_minref += 3; 9886 } 9887 9888 if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && 9889 spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 9890 spa_feature_create_zap_objects(spa, tx); 9891 } 9892 9893 /* 9894 * The LZ4_COMPRESS feature's behaviour was changed to activate_on_enable 9895 * when the ability to use lz4 compression for metadata was added. 9896 * Old pools that have this feature enabled must be upgraded to have 9897 * this feature active. 9898 */ 9899 if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 9900 boolean_t lz4_en = spa_feature_is_enabled(spa, 9901 SPA_FEATURE_LZ4_COMPRESS); 9902 boolean_t lz4_ac = spa_feature_is_active(spa, 9903 SPA_FEATURE_LZ4_COMPRESS); 9904 9905 if (lz4_en && !lz4_ac) 9906 spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); 9907 } 9908 9909 /* 9910 * If we haven't written the salt, do so now. Note that the 9911 * feature may not be activated yet, but that's fine since 9912 * the presence of this ZAP entry is backwards compatible. 9913 */ 9914 if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 9915 DMU_POOL_CHECKSUM_SALT) == ENOENT) { 9916 VERIFY0(zap_add(spa->spa_meta_objset, 9917 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, 9918 sizeof (spa->spa_cksum_salt.zcs_bytes), 9919 spa->spa_cksum_salt.zcs_bytes, tx)); 9920 } 9921 9922 rrw_exit(&dp->dp_config_rwlock, FTAG); 9923 } 9924 9925 static void 9926 vdev_indirect_state_sync_verify(vdev_t *vd) 9927 { 9928 vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping; 9929 vdev_indirect_births_t *vib __maybe_unused = vd->vdev_indirect_births; 9930 9931 if (vd->vdev_ops == &vdev_indirect_ops) { 9932 ASSERT(vim != NULL); 9933 ASSERT(vib != NULL); 9934 } 9935 9936 uint64_t obsolete_sm_object = 0; 9937 ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); 9938 if (obsolete_sm_object != 0) { 9939 ASSERT(vd->vdev_obsolete_sm != NULL); 9940 ASSERT(vd->vdev_removing || 9941 vd->vdev_ops == &vdev_indirect_ops); 9942 ASSERT(vdev_indirect_mapping_num_entries(vim) > 0); 9943 ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0); 9944 ASSERT3U(obsolete_sm_object, ==, 9945 space_map_object(vd->vdev_obsolete_sm)); 9946 ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=, 9947 space_map_allocated(vd->vdev_obsolete_sm)); 9948 } 9949 ASSERT(vd->vdev_obsolete_segments != NULL); 9950 9951 /* 9952 * Since frees / remaps to an indirect vdev can only 9953 * happen in syncing context, the obsolete segments 9954 * tree must be empty when we start syncing.
9955 */ 9956 ASSERT0(zfs_range_tree_space(vd->vdev_obsolete_segments)); 9957 } 9958 9959 /* 9960 * Set the top-level vdev's max queue depth. Evaluate each top-level's 9961 * async write queue depth in case it changed. The max queue depth will 9962 * not change in the middle of syncing out this txg. 9963 */ 9964 static void 9965 spa_sync_adjust_vdev_max_queue_depth(spa_t *spa) 9966 { 9967 ASSERT(spa_writeable(spa)); 9968 9969 metaslab_class_balance(spa_normal_class(spa), B_TRUE); 9970 metaslab_class_balance(spa_special_class(spa), B_TRUE); 9971 metaslab_class_balance(spa_dedup_class(spa), B_TRUE); 9972 } 9973 9974 static void 9975 spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx) 9976 { 9977 ASSERT(spa_writeable(spa)); 9978 9979 vdev_t *rvd = spa->spa_root_vdev; 9980 for (int c = 0; c < rvd->vdev_children; c++) { 9981 vdev_t *vd = rvd->vdev_child[c]; 9982 vdev_indirect_state_sync_verify(vd); 9983 9984 if (vdev_indirect_should_condense(vd)) { 9985 spa_condense_indirect_start_sync(vd, tx); 9986 break; 9987 } 9988 } 9989 } 9990 9991 static void 9992 spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) 9993 { 9994 objset_t *mos = spa->spa_meta_objset; 9995 dsl_pool_t *dp = spa->spa_dsl_pool; 9996 uint64_t txg = tx->tx_txg; 9997 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; 9998 9999 do { 10000 int pass = ++spa->spa_sync_pass; 10001 10002 spa_sync_config_object(spa, tx); 10003 spa_sync_aux_dev(spa, &spa->spa_spares, tx, 10004 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 10005 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 10006 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 10007 spa_errlog_sync(spa, txg); 10008 dsl_pool_sync(dp, txg); 10009 10010 if (pass < zfs_sync_pass_deferred_free || 10011 spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { 10012 /* 10013 * If the log space map feature is active we don't 10014 * care about deferred frees and the deferred bpobj 10015 * as the log space map should effectively have the 10016 * same results (i.e. appending only to one object). 10017 */ 10018 spa_sync_frees(spa, free_bpl, tx); 10019 } else { 10020 /* 10021 * We can not defer frees in pass 1, because 10022 * we sync the deferred frees later in pass 1. 10023 */ 10024 ASSERT3U(pass, >, 1); 10025 bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb, 10026 &spa->spa_deferred_bpobj, tx); 10027 } 10028 10029 brt_sync(spa, txg); 10030 ddt_sync(spa, txg); 10031 dsl_scan_sync(dp, tx); 10032 dsl_errorscrub_sync(dp, tx); 10033 svr_sync(spa, tx); 10034 spa_sync_upgrades(spa, tx); 10035 10036 spa_flush_metaslabs(spa, tx); 10037 10038 vdev_t *vd = NULL; 10039 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 10040 != NULL) 10041 vdev_sync(vd, txg); 10042 10043 if (pass == 1) { 10044 /* 10045 * dsl_pool_sync() -> dp_sync_tasks may have dirtied 10046 * the config. If that happens, this txg should not 10047 * be a no-op. So we must sync the config to the MOS 10048 * before checking for no-op. 10049 * 10050 * Note that when the config is dirty, it will 10051 * be written to the MOS (i.e. the MOS will be 10052 * dirtied) every time we call spa_sync_config_object() 10053 * in this txg. Therefore we can't call this after 10054 * dsl_pool_sync() every pass, because it would 10055 * prevent us from converging, since we'd dirty 10056 * the MOS every pass. 10057 * 10058 * Sync tasks can only be processed in pass 1, so 10059 * there's no need to do this in later passes. 
10060 */ 10061 spa_sync_config_object(spa, tx); 10062 } 10063 10064 /* 10065 * Note: We need to check if the MOS is dirty because we could 10066 * have marked the MOS dirty without updating the uberblock 10067 * (e.g. if we have sync tasks but no dirty user data). We need 10068 * to check the uberblock's rootbp because it is updated if we 10069 * have synced out dirty data (though in this case the MOS will 10070 * most likely also be dirty due to second order effects, we 10071 * don't want to rely on that here). 10072 */ 10073 if (pass == 1 && 10074 BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp) < txg && 10075 !dmu_objset_is_dirty(mos, txg)) { 10076 /* 10077 * Nothing changed on the first pass, therefore this 10078 * TXG is a no-op. Avoid syncing deferred frees, so 10079 * that we can keep this TXG as a no-op. 10080 */ 10081 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 10082 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 10083 ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); 10084 ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg)); 10085 break; 10086 } 10087 10088 spa_sync_deferred_frees(spa, tx); 10089 } while (dmu_objset_is_dirty(mos, txg)); 10090 } 10091 10092 /* 10093 * Rewrite the vdev configuration (which includes the uberblock) to 10094 * commit the transaction group. 10095 * 10096 * If there are no dirty vdevs, we sync the uberblock to a few random 10097 * top-level vdevs that are known to be visible in the config cache 10098 * (see spa_vdev_add() for a complete description). If there *are* dirty 10099 * vdevs, sync the uberblock to all vdevs. 10100 */ 10101 static void 10102 spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx) 10103 { 10104 vdev_t *rvd = spa->spa_root_vdev; 10105 uint64_t txg = tx->tx_txg; 10106 10107 for (;;) { 10108 int error = 0; 10109 10110 /* 10111 * We hold SCL_STATE to prevent vdev open/close/etc. 10112 * while we're attempting to write the vdev labels. 10113 */ 10114 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 10115 10116 if (list_is_empty(&spa->spa_config_dirty_list)) { 10117 vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; 10118 int svdcount = 0; 10119 int children = rvd->vdev_children; 10120 int c0 = random_in_range(children); 10121 10122 for (int c = 0; c < children; c++) { 10123 vdev_t *vd = 10124 rvd->vdev_child[(c0 + c) % children]; 10125 10126 /* Stop when revisiting the first vdev */ 10127 if (c > 0 && svd[0] == vd) 10128 break; 10129 10130 if (vd->vdev_ms_array == 0 || 10131 vd->vdev_islog || 10132 !vdev_is_concrete(vd)) 10133 continue; 10134 10135 svd[svdcount++] = vd; 10136 if (svdcount == SPA_SYNC_MIN_VDEVS) 10137 break; 10138 } 10139 error = vdev_config_sync(svd, svdcount, txg); 10140 } else { 10141 error = vdev_config_sync(rvd->vdev_child, 10142 rvd->vdev_children, txg); 10143 } 10144 10145 if (error == 0) 10146 spa->spa_last_synced_guid = rvd->vdev_guid; 10147 10148 spa_config_exit(spa, SCL_STATE, FTAG); 10149 10150 if (error == 0) 10151 break; 10152 zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR); 10153 zio_resume_wait(spa); 10154 } 10155 } 10156 10157 /* 10158 * Sync the specified transaction group. New blocks may be dirtied as 10159 * part of the process, so we iterate until it converges. 10160 */ 10161 void 10162 spa_sync(spa_t *spa, uint64_t txg) 10163 { 10164 vdev_t *vd = NULL; 10165 10166 VERIFY(spa_writeable(spa)); 10167 10168 /* 10169 * Wait for i/os issued in open context that need to complete 10170 * before this txg syncs. 
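 * A fresh root zio is then installed in this txg slot so that later
 * open-context i/os have a parent to attach to.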
10171 */ 10172 (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]); 10173 spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, 10174 ZIO_FLAG_CANFAIL); 10175 10176 /* 10177 * Now that there can be no more cloning in this transaction group, 10178 * but we are still before issuing frees, we can process pending BRT 10179 * updates. 10180 */ 10181 brt_pending_apply(spa, txg); 10182 10183 /* 10184 * Lock out configuration changes. 10185 */ 10186 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 10187 10188 spa->spa_syncing_txg = txg; 10189 spa->spa_sync_pass = 0; 10190 10191 /* 10192 * If there are any pending vdev state changes, convert them 10193 * into config changes that go out with this transaction group. 10194 */ 10195 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 10196 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 10197 /* Avoid holding the write lock unless actually necessary */ 10198 if (vd->vdev_aux == NULL) { 10199 vdev_state_clean(vd); 10200 vdev_config_dirty(vd); 10201 continue; 10202 } 10203 /* 10204 * We need the write lock here because, for aux vdevs, 10205 * calling vdev_config_dirty() modifies sav_config. 10206 * This is ugly and will become unnecessary when we 10207 * eliminate the aux vdev wart by integrating all vdevs 10208 * into the root vdev tree. 10209 */ 10210 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 10211 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 10212 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 10213 vdev_state_clean(vd); 10214 vdev_config_dirty(vd); 10215 } 10216 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 10217 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 10218 } 10219 spa_config_exit(spa, SCL_STATE, FTAG); 10220 10221 dsl_pool_t *dp = spa->spa_dsl_pool; 10222 dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); 10223 10224 spa->spa_sync_starttime = gethrtime(); 10225 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); 10226 spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, 10227 spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + 10228 NSEC_TO_TICK(spa->spa_deadman_synctime)); 10229 10230 /* 10231 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 10232 * set spa_deflate if we have no raid-z vdevs. 10233 */ 10234 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 10235 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 10236 vdev_t *rvd = spa->spa_root_vdev; 10237 10238 int i; 10239 for (i = 0; i < rvd->vdev_children; i++) { 10240 vd = rvd->vdev_child[i]; 10241 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 10242 break; 10243 } 10244 if (i == rvd->vdev_children) { 10245 spa->spa_deflate = TRUE; 10246 VERIFY0(zap_add(spa->spa_meta_objset, 10247 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 10248 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 10249 } 10250 } 10251 10252 spa_sync_adjust_vdev_max_queue_depth(spa); 10253 10254 spa_sync_condense_indirect(spa, tx); 10255 10256 spa_sync_iterate_to_convergence(spa, tx); 10257 10258 #ifdef ZFS_DEBUG 10259 if (!list_is_empty(&spa->spa_config_dirty_list)) { 10260 /* 10261 * Make sure that the number of ZAPs for all the vdevs matches 10262 * the number of ZAPs in the per-vdev ZAP list. This only gets 10263 * called if the config is dirty; otherwise there may be 10264 * outstanding AVZ operations that weren't completed in 10265 * spa_sync_config_object. 
10266 */ 10267 uint64_t all_vdev_zap_entry_count; 10268 ASSERT0(zap_count(spa->spa_meta_objset, 10269 spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); 10270 ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, 10271 all_vdev_zap_entry_count); 10272 } 10273 #endif 10274 10275 if (spa->spa_vdev_removal != NULL) { 10276 ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]); 10277 } 10278 10279 spa_sync_rewrite_vdev_config(spa, tx); 10280 dmu_tx_commit(tx); 10281 10282 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); 10283 spa->spa_deadman_tqid = 0; 10284 10285 /* 10286 * Clear the dirty config list. 10287 */ 10288 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 10289 vdev_config_clean(vd); 10290 10291 /* 10292 * Now that the new config has synced transactionally, 10293 * let it become visible to the config cache. 10294 */ 10295 if (spa->spa_config_syncing != NULL) { 10296 spa_config_set(spa, spa->spa_config_syncing); 10297 spa->spa_config_txg = txg; 10298 spa->spa_config_syncing = NULL; 10299 } 10300 10301 dsl_pool_sync_done(dp, txg); 10302 10303 /* 10304 * Update usable space statistics. 10305 */ 10306 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 10307 != NULL) 10308 vdev_sync_done(vd, txg); 10309 10310 metaslab_class_evict_old(spa->spa_normal_class, txg); 10311 metaslab_class_evict_old(spa->spa_log_class, txg); 10312 /* spa_embedded_log_class has only one metaslab per vdev. */ 10313 metaslab_class_evict_old(spa->spa_special_class, txg); 10314 metaslab_class_evict_old(spa->spa_dedup_class, txg); 10315 10316 spa_sync_close_syncing_log_sm(spa); 10317 10318 spa_update_dspace(spa); 10319 10320 if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) 10321 vdev_autotrim_kick(spa); 10322 10323 /* 10324 * It had better be the case that we didn't dirty anything 10325 * since vdev_config_sync(). 10326 */ 10327 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 10328 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 10329 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 10330 10331 while (zfs_pause_spa_sync) 10332 delay(1); 10333 10334 spa->spa_sync_pass = 0; 10335 10336 /* 10337 * Update the last synced uberblock here. We want to do this at 10338 * the end of spa_sync() so that consumers of spa_last_synced_txg() 10339 * will be guaranteed that all the processing associated with 10340 * that txg has been completed. 10341 */ 10342 spa->spa_ubsync = spa->spa_uberblock; 10343 spa_config_exit(spa, SCL_CONFIG, FTAG); 10344 10345 spa_handle_ignored_writes(spa); 10346 10347 /* 10348 * If any async tasks have been requested, kick them off. 10349 */ 10350 spa_async_dispatch(spa); 10351 } 10352 10353 /* 10354 * Sync all pools. We don't want to hold the namespace lock across these 10355 * operations, so we take a reference on the spa_t and drop the lock during the 10356 * sync. 
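 * The spa_open_ref()/spa_close() pair below keeps each spa_t from being
 * freed while the namespace lock is dropped around txg_wait_synced().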
10357 */ 10358 void 10359 spa_sync_allpools(void) 10360 { 10361 spa_t *spa = NULL; 10362 mutex_enter(&spa_namespace_lock); 10363 while ((spa = spa_next(spa)) != NULL) { 10364 if (spa_state(spa) != POOL_STATE_ACTIVE || 10365 !spa_writeable(spa) || spa_suspended(spa)) 10366 continue; 10367 spa_open_ref(spa, FTAG); 10368 mutex_exit(&spa_namespace_lock); 10369 txg_wait_synced(spa_get_dsl(spa), 0); 10370 mutex_enter(&spa_namespace_lock); 10371 spa_close(spa, FTAG); 10372 } 10373 mutex_exit(&spa_namespace_lock); 10374 } 10375 10376 taskq_t * 10377 spa_sync_tq_create(spa_t *spa, const char *name) 10378 { 10379 kthread_t **kthreads; 10380 10381 ASSERT(spa->spa_sync_tq == NULL); 10382 ASSERT3S(spa->spa_alloc_count, <=, boot_ncpus); 10383 10384 /* 10385 * - do not allow more allocators than cpus. 10386 * - there may be more cpus than allocators. 10387 * - do not allow more sync taskq threads than allocators or cpus. 10388 */ 10389 int nthreads = spa->spa_alloc_count; 10390 spa->spa_syncthreads = kmem_zalloc(sizeof (spa_syncthread_info_t) * 10391 nthreads, KM_SLEEP); 10392 10393 spa->spa_sync_tq = taskq_create_synced(name, nthreads, minclsyspri, 10394 nthreads, INT_MAX, TASKQ_PREPOPULATE, &kthreads); 10395 VERIFY(spa->spa_sync_tq != NULL); 10396 VERIFY(kthreads != NULL); 10397 10398 spa_syncthread_info_t *ti = spa->spa_syncthreads; 10399 for (int i = 0; i < nthreads; i++, ti++) { 10400 ti->sti_thread = kthreads[i]; 10401 ti->sti_allocator = i; 10402 } 10403 10404 kmem_free(kthreads, sizeof (*kthreads) * nthreads); 10405 return (spa->spa_sync_tq); 10406 } 10407 10408 void 10409 spa_sync_tq_destroy(spa_t *spa) 10410 { 10411 ASSERT(spa->spa_sync_tq != NULL); 10412 10413 taskq_wait(spa->spa_sync_tq); 10414 taskq_destroy(spa->spa_sync_tq); 10415 kmem_free(spa->spa_syncthreads, 10416 sizeof (spa_syncthread_info_t) * spa->spa_alloc_count); 10417 spa->spa_sync_tq = NULL; 10418 } 10419 10420 uint_t 10421 spa_acq_allocator(spa_t *spa) 10422 { 10423 int i; 10424 10425 if (spa->spa_alloc_count == 1) 10426 return (0); 10427 10428 mutex_enter(&spa->spa_allocs_use->sau_lock); 10429 uint_t r = spa->spa_allocs_use->sau_rotor; 10430 do { 10431 if (++r == spa->spa_alloc_count) 10432 r = 0; 10433 } while (spa->spa_allocs_use->sau_inuse[r]); 10434 spa->spa_allocs_use->sau_inuse[r] = B_TRUE; 10435 spa->spa_allocs_use->sau_rotor = r; 10436 mutex_exit(&spa->spa_allocs_use->sau_lock); 10437 10438 spa_syncthread_info_t *ti = spa->spa_syncthreads; 10439 for (i = 0; i < spa->spa_alloc_count; i++, ti++) { 10440 if (ti->sti_thread == curthread) { 10441 ti->sti_allocator = r; 10442 break; 10443 } 10444 } 10445 ASSERT3S(i, <, spa->spa_alloc_count); 10446 return (r); 10447 } 10448 10449 void 10450 spa_rel_allocator(spa_t *spa, uint_t allocator) 10451 { 10452 if (spa->spa_alloc_count > 1) 10453 spa->spa_allocs_use->sau_inuse[allocator] = B_FALSE; 10454 } 10455 10456 void 10457 spa_select_allocator(zio_t *zio) 10458 { 10459 zbookmark_phys_t *bm = &zio->io_bookmark; 10460 spa_t *spa = zio->io_spa; 10461 10462 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 10463 10464 /* 10465 * A gang block (for example) may have inherited its parent's 10466 * allocator, in which case there is nothing further to do here. 10467 */ 10468 if (ZIO_HAS_ALLOCATOR(zio)) 10469 return; 10470 10471 ASSERT(spa != NULL); 10472 ASSERT(bm != NULL); 10473 10474 /* 10475 * First try to use an allocator assigned to the syncthread, and set 10476 * the corresponding write issue taskq for the allocator. 10477 * Note, we must have an open pool to do this. 
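 * If the current thread is not one of the pool's sync threads, fall
 * through to the bookmark hash below.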
10478 */ 10479 if (spa->spa_sync_tq != NULL) { 10480 spa_syncthread_info_t *ti = spa->spa_syncthreads; 10481 for (int i = 0; i < spa->spa_alloc_count; i++, ti++) { 10482 if (ti->sti_thread == curthread) { 10483 zio->io_allocator = ti->sti_allocator; 10484 return; 10485 } 10486 } 10487 } 10488 10489 /* 10490 * We want to try to use as many allocators as possible to help improve 10491 * performance, but we also want logically adjacent IOs to be physically 10492 * adjacent to improve sequential read performance. We chunk each object 10493 * into 2^20 block regions, and then hash based on the objset, object, 10494 * level, and region to accomplish both of these goals. 10495 */ 10496 uint64_t hv = cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level, 10497 bm->zb_blkid >> 20); 10498 10499 zio->io_allocator = (uint_t)hv % spa->spa_alloc_count; 10500 } 10501 10502 /* 10503 * ========================================================================== 10504 * Miscellaneous routines 10505 * ========================================================================== 10506 */ 10507 10508 /* 10509 * Remove all pools in the system. 10510 */ 10511 void 10512 spa_evict_all(void) 10513 { 10514 spa_t *spa; 10515 10516 /* 10517 * Remove all cached state. All pools should be closed now, 10518 * so every spa in the AVL tree should be unreferenced. 10519 */ 10520 mutex_enter(&spa_namespace_lock); 10521 while ((spa = spa_next(NULL)) != NULL) { 10522 /* 10523 * Stop async tasks. The async thread may need to detach 10524 * a device that's been replaced, which requires grabbing 10525 * spa_namespace_lock, so we must drop it here. 10526 */ 10527 spa_open_ref(spa, FTAG); 10528 mutex_exit(&spa_namespace_lock); 10529 spa_async_suspend(spa); 10530 mutex_enter(&spa_namespace_lock); 10531 spa_close(spa, FTAG); 10532 10533 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 10534 spa_unload(spa); 10535 spa_deactivate(spa); 10536 } 10537 spa_remove(spa); 10538 } 10539 mutex_exit(&spa_namespace_lock); 10540 } 10541 10542 vdev_t * 10543 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 10544 { 10545 vdev_t *vd; 10546 int i; 10547 10548 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 10549 return (vd); 10550 10551 if (aux) { 10552 for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 10553 vd = spa->spa_l2cache.sav_vdevs[i]; 10554 if (vd->vdev_guid == guid) 10555 return (vd); 10556 } 10557 10558 for (i = 0; i < spa->spa_spares.sav_count; i++) { 10559 vd = spa->spa_spares.sav_vdevs[i]; 10560 if (vd->vdev_guid == guid) 10561 return (vd); 10562 } 10563 } 10564 10565 return (NULL); 10566 } 10567 10568 void 10569 spa_upgrade(spa_t *spa, uint64_t version) 10570 { 10571 ASSERT(spa_writeable(spa)); 10572 10573 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 10574 10575 /* 10576 * This should only be called for a non-faulted pool, and since a 10577 * future version would result in an unopenable pool, this shouldn't be 10578 * possible. 
10579 */ 10580 ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); 10581 ASSERT3U(version, >=, spa->spa_uberblock.ub_version); 10582 10583 spa->spa_uberblock.ub_version = version; 10584 vdev_config_dirty(spa->spa_root_vdev); 10585 10586 spa_config_exit(spa, SCL_ALL, FTAG); 10587 10588 txg_wait_synced(spa_get_dsl(spa), 0); 10589 } 10590 10591 static boolean_t 10592 spa_has_aux_vdev(spa_t *spa, uint64_t guid, spa_aux_vdev_t *sav) 10593 { 10594 (void) spa; 10595 int i; 10596 uint64_t vdev_guid; 10597 10598 for (i = 0; i < sav->sav_count; i++) 10599 if (sav->sav_vdevs[i]->vdev_guid == guid) 10600 return (B_TRUE); 10601 10602 for (i = 0; i < sav->sav_npending; i++) { 10603 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 10604 &vdev_guid) == 0 && vdev_guid == guid) 10605 return (B_TRUE); 10606 } 10607 10608 return (B_FALSE); 10609 } 10610 10611 boolean_t 10612 spa_has_l2cache(spa_t *spa, uint64_t guid) 10613 { 10614 return (spa_has_aux_vdev(spa, guid, &spa->spa_l2cache)); 10615 } 10616 10617 boolean_t 10618 spa_has_spare(spa_t *spa, uint64_t guid) 10619 { 10620 return (spa_has_aux_vdev(spa, guid, &spa->spa_spares)); 10621 } 10622 10623 /* 10624 * Check if a pool has an active shared spare device. 10625 * Note: reference count of an active spare is 2, as a spare and as a replace 10626 */ 10627 static boolean_t 10628 spa_has_active_shared_spare(spa_t *spa) 10629 { 10630 int i, refcnt; 10631 uint64_t pool; 10632 spa_aux_vdev_t *sav = &spa->spa_spares; 10633 10634 for (i = 0; i < sav->sav_count; i++) { 10635 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 10636 &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 10637 refcnt > 2) 10638 return (B_TRUE); 10639 } 10640 10641 return (B_FALSE); 10642 } 10643 10644 uint64_t 10645 spa_total_metaslabs(spa_t *spa) 10646 { 10647 vdev_t *rvd = spa->spa_root_vdev; 10648 10649 uint64_t m = 0; 10650 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 10651 vdev_t *vd = rvd->vdev_child[c]; 10652 if (!vdev_is_concrete(vd)) 10653 continue; 10654 m += vd->vdev_ms_count; 10655 } 10656 return (m); 10657 } 10658 10659 /* 10660 * Notify any waiting threads that some activity has switched from being in- 10661 * progress to not-in-progress so that the thread can wake up and determine 10662 * whether it is finished waiting. 10663 */ 10664 void 10665 spa_notify_waiters(spa_t *spa) 10666 { 10667 /* 10668 * Acquiring spa_activities_lock here prevents the cv_broadcast from 10669 * happening between the waiting thread's check and cv_wait. 10670 */ 10671 mutex_enter(&spa->spa_activities_lock); 10672 cv_broadcast(&spa->spa_activities_cv); 10673 mutex_exit(&spa->spa_activities_lock); 10674 } 10675 10676 /* 10677 * Notify any waiting threads that the pool is exporting, and then block until 10678 * they are finished using the spa_t. 10679 */ 10680 void 10681 spa_wake_waiters(spa_t *spa) 10682 { 10683 mutex_enter(&spa->spa_activities_lock); 10684 spa->spa_waiters_cancel = B_TRUE; 10685 cv_broadcast(&spa->spa_activities_cv); 10686 while (spa->spa_waiters != 0) 10687 cv_wait(&spa->spa_waiters_cv, &spa->spa_activities_lock); 10688 spa->spa_waiters_cancel = B_FALSE; 10689 mutex_exit(&spa->spa_activities_lock); 10690 } 10691 10692 /* Whether the vdev or any of its descendants are being initialized/trimmed. 
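 * The check recurses through the children, briefly dropping
 * spa_activities_lock to take each vdev's initialize or trim lock while
 * sampling its state.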
*/ 10693 static boolean_t 10694 spa_vdev_activity_in_progress_impl(vdev_t *vd, zpool_wait_activity_t activity) 10695 { 10696 spa_t *spa = vd->vdev_spa; 10697 10698 ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER)); 10699 ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); 10700 ASSERT(activity == ZPOOL_WAIT_INITIALIZE || 10701 activity == ZPOOL_WAIT_TRIM); 10702 10703 kmutex_t *lock = activity == ZPOOL_WAIT_INITIALIZE ? 10704 &vd->vdev_initialize_lock : &vd->vdev_trim_lock; 10705 10706 mutex_exit(&spa->spa_activities_lock); 10707 mutex_enter(lock); 10708 mutex_enter(&spa->spa_activities_lock); 10709 10710 boolean_t in_progress = (activity == ZPOOL_WAIT_INITIALIZE) ? 10711 (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) : 10712 (vd->vdev_trim_state == VDEV_TRIM_ACTIVE); 10713 mutex_exit(lock); 10714 10715 if (in_progress) 10716 return (B_TRUE); 10717 10718 for (int i = 0; i < vd->vdev_children; i++) { 10719 if (spa_vdev_activity_in_progress_impl(vd->vdev_child[i], 10720 activity)) 10721 return (B_TRUE); 10722 } 10723 10724 return (B_FALSE); 10725 } 10726 10727 /* 10728 * If use_guid is true, this checks whether the vdev specified by guid is 10729 * being initialized/trimmed. Otherwise, it checks whether any vdev in the pool 10730 * is being initialized/trimmed. The caller must hold the config lock and 10731 * spa_activities_lock. 10732 */ 10733 static int 10734 spa_vdev_activity_in_progress(spa_t *spa, boolean_t use_guid, uint64_t guid, 10735 zpool_wait_activity_t activity, boolean_t *in_progress) 10736 { 10737 mutex_exit(&spa->spa_activities_lock); 10738 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 10739 mutex_enter(&spa->spa_activities_lock); 10740 10741 vdev_t *vd; 10742 if (use_guid) { 10743 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 10744 if (vd == NULL || !vd->vdev_ops->vdev_op_leaf) { 10745 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 10746 return (EINVAL); 10747 } 10748 } else { 10749 vd = spa->spa_root_vdev; 10750 } 10751 10752 *in_progress = spa_vdev_activity_in_progress_impl(vd, activity); 10753 10754 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 10755 return (0); 10756 } 10757 10758 /* 10759 * Locking for waiting threads 10760 * --------------------------- 10761 * 10762 * Waiting threads need a way to check whether a given activity is in progress, 10763 * and then, if it is, wait for it to complete. Each activity will have some 10764 * in-memory representation of the relevant on-disk state which can be used to 10765 * determine whether or not the activity is in progress. The in-memory state and 10766 * the locking used to protect it will be different for each activity, and may 10767 * not be suitable for use with a cvar (e.g., some state is protected by the 10768 * config lock). To allow waiting threads to wait without any races, another 10769 * lock, spa_activities_lock, is used. 10770 * 10771 * When the state is checked, both the activity-specific lock (if there is one) 10772 * and spa_activities_lock are held. In some cases, the activity-specific lock 10773 * is acquired explicitly (e.g. the config lock). In others, the locking is 10774 * internal to some check (e.g. bpobj_is_empty). After checking, the waiting 10775 * thread releases the activity-specific lock and, if the activity is in 10776 * progress, then cv_waits using spa_activities_lock. 
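 *
 * As a rough sketch of the waiter side (simplified from spa_wait_common()
 * below; error handling and waiter accounting omitted):
 *
 *	mutex_enter(&spa->spa_activities_lock);
 *	while (<activity in progress> && !spa->spa_waiters_cancel)
 *		(void) cv_wait_sig(&spa->spa_activities_cv,
 *		    &spa->spa_activities_lock);
 *	mutex_exit(&spa->spa_activities_lock);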
10777 * 10778 * The waiting thread is woken when another thread, one completing some 10779 * activity, updates the state of the activity and then calls 10780 * spa_notify_waiters, which will cv_broadcast. This 'completing' thread only 10781 * needs to hold its activity-specific lock when updating the state, and this 10782 * lock can (but doesn't have to) be dropped before calling spa_notify_waiters. 10783 * 10784 * Because spa_notify_waiters acquires spa_activities_lock before broadcasting, 10785 * and because it is held when the waiting thread checks the state of the 10786 * activity, it can never be the case that the completing thread both updates 10787 * the activity state and cv_broadcasts in between the waiting thread's check 10788 * and cv_wait. Thus, a waiting thread can never miss a wakeup. 10789 * 10790 * In order to prevent deadlock, when the waiting thread does its check, in some 10791 * cases it will temporarily drop spa_activities_lock in order to acquire the 10792 * activity-specific lock. The order in which spa_activities_lock and the 10793 * activity specific lock are acquired in the waiting thread is determined by 10794 * the order in which they are acquired in the completing thread; if the 10795 * completing thread calls spa_notify_waiters with the activity-specific lock 10796 * held, then the waiting thread must also acquire the activity-specific lock 10797 * first. 10798 */ 10799 10800 static int 10801 spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, 10802 boolean_t use_tag, uint64_t tag, boolean_t *in_progress) 10803 { 10804 int error = 0; 10805 10806 ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); 10807 10808 switch (activity) { 10809 case ZPOOL_WAIT_CKPT_DISCARD: 10810 *in_progress = 10811 (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT) && 10812 zap_contains(spa_meta_objset(spa), 10813 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT) == 10814 ENOENT); 10815 break; 10816 case ZPOOL_WAIT_FREE: 10817 *in_progress = ((spa_version(spa) >= SPA_VERSION_DEADLISTS && 10818 !bpobj_is_empty(&spa->spa_dsl_pool->dp_free_bpobj)) || 10819 spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY) || 10820 spa_livelist_delete_check(spa)); 10821 break; 10822 case ZPOOL_WAIT_INITIALIZE: 10823 case ZPOOL_WAIT_TRIM: 10824 error = spa_vdev_activity_in_progress(spa, use_tag, tag, 10825 activity, in_progress); 10826 break; 10827 case ZPOOL_WAIT_REPLACE: 10828 mutex_exit(&spa->spa_activities_lock); 10829 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 10830 mutex_enter(&spa->spa_activities_lock); 10831 10832 *in_progress = vdev_replace_in_progress(spa->spa_root_vdev); 10833 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 10834 break; 10835 case ZPOOL_WAIT_REMOVE: 10836 *in_progress = (spa->spa_removing_phys.sr_state == 10837 DSS_SCANNING); 10838 break; 10839 case ZPOOL_WAIT_RESILVER: 10840 *in_progress = vdev_rebuild_active(spa->spa_root_vdev); 10841 if (*in_progress) 10842 break; 10843 zfs_fallthrough; 10844 case ZPOOL_WAIT_SCRUB: 10845 { 10846 boolean_t scanning, paused, is_scrub; 10847 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; 10848 10849 is_scrub = (scn->scn_phys.scn_func == POOL_SCAN_SCRUB); 10850 scanning = (scn->scn_phys.scn_state == DSS_SCANNING); 10851 paused = dsl_scan_is_paused_scrub(scn); 10852 *in_progress = (scanning && !paused && 10853 is_scrub == (activity == ZPOOL_WAIT_SCRUB)); 10854 break; 10855 } 10856 case ZPOOL_WAIT_RAIDZ_EXPAND: 10857 { 10858 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 10859 *in_progress = (vre 
!= NULL && vre->vre_state == DSS_SCANNING); 10860 break; 10861 } 10862 default: 10863 panic("unrecognized value for activity %d", activity); 10864 } 10865 10866 return (error); 10867 } 10868 10869 static int 10870 spa_wait_common(const char *pool, zpool_wait_activity_t activity, 10871 boolean_t use_tag, uint64_t tag, boolean_t *waited) 10872 { 10873 /* 10874 * The tag is used to distinguish between instances of an activity. 10875 * 'initialize' and 'trim' are the only activities that we use this for. 10876 * The other activities can only have a single instance in progress in a 10877 * pool at one time, making the tag unnecessary. 10878 * 10879 * There can be multiple devices being replaced at once, but since they 10880 * all finish once resilvering finishes, we don't bother keeping track 10881 * of them individually, we just wait for them all to finish. 10882 */ 10883 if (use_tag && activity != ZPOOL_WAIT_INITIALIZE && 10884 activity != ZPOOL_WAIT_TRIM) 10885 return (EINVAL); 10886 10887 if (activity < 0 || activity >= ZPOOL_WAIT_NUM_ACTIVITIES) 10888 return (EINVAL); 10889 10890 spa_t *spa; 10891 int error = spa_open(pool, &spa, FTAG); 10892 if (error != 0) 10893 return (error); 10894 10895 /* 10896 * Increment the spa's waiter count so that we can call spa_close and 10897 * still ensure that the spa_t doesn't get freed before this thread is 10898 * finished with it when the pool is exported. We want to call spa_close 10899 * before we start waiting because otherwise the additional ref would 10900 * prevent the pool from being exported or destroyed throughout the 10901 * potentially long wait. 10902 */ 10903 mutex_enter(&spa->spa_activities_lock); 10904 spa->spa_waiters++; 10905 spa_close(spa, FTAG); 10906 10907 *waited = B_FALSE; 10908 for (;;) { 10909 boolean_t in_progress; 10910 error = spa_activity_in_progress(spa, activity, use_tag, tag, 10911 &in_progress); 10912 10913 if (error || !in_progress || spa->spa_waiters_cancel) 10914 break; 10915 10916 *waited = B_TRUE; 10917 10918 if (cv_wait_sig(&spa->spa_activities_cv, 10919 &spa->spa_activities_lock) == 0) { 10920 error = EINTR; 10921 break; 10922 } 10923 } 10924 10925 spa->spa_waiters--; 10926 cv_signal(&spa->spa_waiters_cv); 10927 mutex_exit(&spa->spa_activities_lock); 10928 10929 return (error); 10930 } 10931 10932 /* 10933 * Wait for a particular instance of the specified activity to complete, where 10934 * the instance is identified by 'tag'. 10935 */ 10936 int 10937 spa_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag, 10938 boolean_t *waited) 10939 { 10940 return (spa_wait_common(pool, activity, B_TRUE, tag, waited)); 10941 } 10942 10943 /* 10944 * Wait for all instances of the specified activity to complete 10945 */ 10946 int 10947 spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited) 10948 { 10949 10950 return (spa_wait_common(pool, activity, B_FALSE, 0, waited)); 10951 } 10952 10953 sysevent_t * 10954 spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) 10955 { 10956 sysevent_t *ev = NULL; 10957 #ifdef _KERNEL 10958 nvlist_t *resource; 10959 10960 resource = zfs_event_create(spa, vd, FM_SYSEVENT_CLASS, name, hist_nvl); 10961 if (resource) { 10962 ev = kmem_alloc(sizeof (sysevent_t), KM_SLEEP); 10963 ev->resource = resource; 10964 } 10965 #else 10966 (void) spa, (void) vd, (void) hist_nvl, (void) name; 10967 #endif 10968 return (ev); 10969 } 10970 10971 void 10972 spa_event_post(sysevent_t *ev) 10973 { 10974 #ifdef _KERNEL 10975 if (ev) { 10976
zfs_zevent_post(ev->resource, NULL, zfs_zevent_post_cb); 10977 kmem_free(ev, sizeof (*ev)); 10978 } 10979 #else 10980 (void) ev; 10981 #endif 10982 } 10983 10984 /* 10985 * Post a zevent corresponding to the given sysevent. The 'name' must be one 10986 * of the event definitions in sys/sysevent/eventdefs.h. The payload will be 10987 * filled in from the spa and (optionally) the vdev. This doesn't do anything 10988 * in the userland libzpool, as we don't want consumers to misinterpret ztest 10989 * or zdb as real changes. 10990 */ 10991 void 10992 spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) 10993 { 10994 spa_event_post(spa_event_create(spa, vd, hist_nvl, name)); 10995 } 10996 10997 /* state manipulation functions */ 10998 EXPORT_SYMBOL(spa_open); 10999 EXPORT_SYMBOL(spa_open_rewind); 11000 EXPORT_SYMBOL(spa_get_stats); 11001 EXPORT_SYMBOL(spa_create); 11002 EXPORT_SYMBOL(spa_import); 11003 EXPORT_SYMBOL(spa_tryimport); 11004 EXPORT_SYMBOL(spa_destroy); 11005 EXPORT_SYMBOL(spa_export); 11006 EXPORT_SYMBOL(spa_reset); 11007 EXPORT_SYMBOL(spa_async_request); 11008 EXPORT_SYMBOL(spa_async_suspend); 11009 EXPORT_SYMBOL(spa_async_resume); 11010 EXPORT_SYMBOL(spa_inject_addref); 11011 EXPORT_SYMBOL(spa_inject_delref); 11012 EXPORT_SYMBOL(spa_scan_stat_init); 11013 EXPORT_SYMBOL(spa_scan_get_stats); 11014 11015 /* device manipulation */ 11016 EXPORT_SYMBOL(spa_vdev_add); 11017 EXPORT_SYMBOL(spa_vdev_attach); 11018 EXPORT_SYMBOL(spa_vdev_detach); 11019 EXPORT_SYMBOL(spa_vdev_setpath); 11020 EXPORT_SYMBOL(spa_vdev_setfru); 11021 EXPORT_SYMBOL(spa_vdev_split_mirror); 11022 11023 /* spare state (which is global across all pools) */ 11024 EXPORT_SYMBOL(spa_spare_add); 11025 EXPORT_SYMBOL(spa_spare_remove); 11026 EXPORT_SYMBOL(spa_spare_exists); 11027 EXPORT_SYMBOL(spa_spare_activate); 11028 11029 /* L2ARC state (which is global across all pools) */ 11030 EXPORT_SYMBOL(spa_l2cache_add); 11031 EXPORT_SYMBOL(spa_l2cache_remove); 11032 EXPORT_SYMBOL(spa_l2cache_exists); 11033 EXPORT_SYMBOL(spa_l2cache_activate); 11034 EXPORT_SYMBOL(spa_l2cache_drop); 11035 11036 /* scanning */ 11037 EXPORT_SYMBOL(spa_scan); 11038 EXPORT_SYMBOL(spa_scan_range); 11039 EXPORT_SYMBOL(spa_scan_stop); 11040 11041 /* spa syncing */ 11042 EXPORT_SYMBOL(spa_sync); /* only for DMU use */ 11043 EXPORT_SYMBOL(spa_sync_allpools); 11044 11045 /* properties */ 11046 EXPORT_SYMBOL(spa_prop_set); 11047 EXPORT_SYMBOL(spa_prop_get); 11048 EXPORT_SYMBOL(spa_prop_clear_bootfs); 11049 11050 /* asynchronous event notification */ 11051 EXPORT_SYMBOL(spa_event_notify); 11052 11053 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_pct, UINT, ZMOD_RW, 11054 "Percentage of CPUs to run a metaslab preload taskq"); 11055 11056 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, UINT, ZMOD_RW, 11057 "log2 fraction of arc that can be used by inflight I/Os when " 11058 "verifying pool during import"); 11059 11060 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_metadata, INT, ZMOD_RW, 11061 "Set to traverse metadata on pool import"); 11062 11063 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW, 11064 "Set to traverse data on pool import"); 11065 11066 ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW, 11067 "Print vdev tree to zfs_dbgmsg during pool import"); 11068 11069 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RW, 11070 "Percentage of CPUs to run an IO worker thread"); 11071 11072 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RW, 11073 "Number of threads per IO worker
taskqueue"); 11074 11075 ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, U64, ZMOD_RW, 11076 "Allow importing pool with up to this number of missing top-level " 11077 "vdevs (in read-only mode)"); 11078 11079 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT, 11080 ZMOD_RW, "Set the livelist condense zthr to pause"); 11081 11082 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT, 11083 ZMOD_RW, "Set the livelist condense synctask to pause"); 11084 11085 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_cancel, 11086 INT, ZMOD_RW, 11087 "Whether livelist condensing was canceled in the synctask"); 11088 11089 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel, 11090 INT, ZMOD_RW, 11091 "Whether livelist condensing was canceled in the zthr function"); 11092 11093 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, 11094 ZMOD_RW, 11095 "Whether extra ALLOC blkptrs were added to a livelist entry while it " 11096 "was being condensed"); 11097 11098 #ifdef _KERNEL 11099 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read, 11100 spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RW, 11101 "Configure IO queues for read IO"); 11102 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write, 11103 spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RW, 11104 "Configure IO queues for write IO"); 11105 #endif 11106 11107 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_write_tpq, UINT, ZMOD_RW, 11108 "Number of CPUs per write issue taskq"); 11109