1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2011, 2020 by Delphix. All rights reserved. 25 * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved. 26 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 27 * Copyright 2013 Saso Kiselkov. All rights reserved. 28 * Copyright (c) 2014 Integros [integros.com] 29 * Copyright 2016 Toomas Soome <tsoome@me.com> 30 * Copyright (c) 2016 Actifio, Inc. All rights reserved. 31 * Copyright 2018 Joyent, Inc. 32 * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. 33 * Copyright 2017 Joyent, Inc. 34 * Copyright (c) 2017, Intel Corporation. 35 * Copyright (c) 2021, Colm Buckley <colm@tuatha.org> 36 * Copyright (c) 2023 Hewlett Packard Enterprise Development LP. 37 */ 38 39 /* 40 * SPA: Storage Pool Allocator 41 * 42 * This file contains all the routines used when modifying on-disk SPA state. 43 * This includes opening, importing, destroying, exporting a pool, and syncing a 44 * pool. 
45 */ 46 47 #include <sys/zfs_context.h> 48 #include <sys/fm/fs/zfs.h> 49 #include <sys/spa_impl.h> 50 #include <sys/zio.h> 51 #include <sys/zio_checksum.h> 52 #include <sys/dmu.h> 53 #include <sys/dmu_tx.h> 54 #include <sys/zap.h> 55 #include <sys/zil.h> 56 #include <sys/brt.h> 57 #include <sys/ddt.h> 58 #include <sys/vdev_impl.h> 59 #include <sys/vdev_removal.h> 60 #include <sys/vdev_indirect_mapping.h> 61 #include <sys/vdev_indirect_births.h> 62 #include <sys/vdev_initialize.h> 63 #include <sys/vdev_rebuild.h> 64 #include <sys/vdev_trim.h> 65 #include <sys/vdev_disk.h> 66 #include <sys/vdev_raidz.h> 67 #include <sys/vdev_draid.h> 68 #include <sys/metaslab.h> 69 #include <sys/metaslab_impl.h> 70 #include <sys/mmp.h> 71 #include <sys/uberblock_impl.h> 72 #include <sys/txg.h> 73 #include <sys/avl.h> 74 #include <sys/bpobj.h> 75 #include <sys/dmu_traverse.h> 76 #include <sys/dmu_objset.h> 77 #include <sys/unique.h> 78 #include <sys/dsl_pool.h> 79 #include <sys/dsl_dataset.h> 80 #include <sys/dsl_dir.h> 81 #include <sys/dsl_prop.h> 82 #include <sys/dsl_synctask.h> 83 #include <sys/fs/zfs.h> 84 #include <sys/arc.h> 85 #include <sys/callb.h> 86 #include <sys/systeminfo.h> 87 #include <sys/zfs_ioctl.h> 88 #include <sys/dsl_scan.h> 89 #include <sys/zfeature.h> 90 #include <sys/dsl_destroy.h> 91 #include <sys/zvol.h> 92 93 #ifdef _KERNEL 94 #include <sys/fm/protocol.h> 95 #include <sys/fm/util.h> 96 #include <sys/callb.h> 97 #include <sys/zone.h> 98 #include <sys/vmsystm.h> 99 #endif /* _KERNEL */ 100 101 #include "zfs_prop.h" 102 #include "zfs_comutil.h" 103 #include <cityhash.h> 104 105 /* 106 * spa_thread() existed on Illumos as a parent thread for the various worker 107 * threads that actually run the pool, as a way to both reference the entire 108 * pool work as a single object, and to share properties like scheduling 109 * options. It has not yet been adapted to Linux or FreeBSD. This define is 110 * used to mark related parts of the code to make things easier for the reader, 111 * and to compile this code out. It can be removed when someone implements it, 112 * moves it to some Illumos-specific place, or removes it entirely. 113 */ 114 #undef HAVE_SPA_THREAD 115 116 /* 117 * The "System Duty Cycle" scheduling class is an Illumos feature to help 118 * prevent CPU-intensive kernel threads from affecting latency on interactive 119 * threads. It doesn't exist on Linux or FreeBSD, so the supporting code is 120 * gated behind a define. On Illumos SDC depends on spa_thread(), but 121 * spa_thread() also has other uses, so this is a separate define. 122 */ 123 #undef HAVE_SYSDC 124 125 /* 126 * The interval, in seconds, at which failed configuration cache file writes 127 * should be retried. 128 */ 129 int zfs_ccw_retry_interval = 300; 130 131 typedef enum zti_modes { 132 ZTI_MODE_FIXED, /* value is # of threads (min 1) */ 133 ZTI_MODE_SCALE, /* Taskqs scale with CPUs. 
*/ 134 ZTI_MODE_SYNC, /* sync thread assigned */ 135 ZTI_MODE_NULL, /* don't create a taskq */ 136 ZTI_NMODES 137 } zti_modes_t; 138 139 #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } 140 #define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 } 141 #define ZTI_SCALE { ZTI_MODE_SCALE, 0, 1 } 142 #define ZTI_SYNC { ZTI_MODE_SYNC, 0, 1 } 143 #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } 144 145 #define ZTI_N(n) ZTI_P(n, 1) 146 #define ZTI_ONE ZTI_N(1) 147 148 typedef struct zio_taskq_info { 149 zti_modes_t zti_mode; 150 uint_t zti_value; 151 uint_t zti_count; 152 } zio_taskq_info_t; 153 154 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { 155 "iss", "iss_h", "int", "int_h" 156 }; 157 158 /* 159 * This table defines the taskq settings for each ZFS I/O type. When 160 * initializing a pool, we use this table to create an appropriately sized 161 * taskq. Some operations are low volume and therefore have a small, static 162 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE 163 * macros. Other operations process a large amount of data; the ZTI_SCALE 164 * macro causes us to create a taskq oriented for throughput. Some operations 165 * are so high frequency and short-lived that the taskq itself can become a 166 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an 167 * additional degree of parallelism specified by the number of threads per- 168 * taskq and the number of taskqs; when dispatching an event in this case, the 169 * particular taskq is chosen at random. ZTI_SCALE uses a number of taskqs 170 * that scales with the number of CPUs. 171 * 172 * The different taskq priorities are to handle the different contexts (issue 173 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that 174 * need to be handled with minimum delay. 175 */ 176 static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { 177 /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ 178 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ 179 { ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */ 180 { ZTI_SYNC, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */ 181 { ZTI_SCALE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ 182 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ 183 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ 184 { ZTI_N(4), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* TRIM */ 185 }; 186 187 static void spa_sync_version(void *arg, dmu_tx_t *tx); 188 static void spa_sync_props(void *arg, dmu_tx_t *tx); 189 static boolean_t spa_has_active_shared_spare(spa_t *spa); 190 static int spa_load_impl(spa_t *spa, spa_import_type_t type, 191 const char **ereport); 192 static void spa_vdev_resilver_done(spa_t *spa); 193 194 /* 195 * Percentage of all CPUs that can be used by the metaslab preload taskq. 196 */ 197 static uint_t metaslab_preload_pct = 50; 198 199 static uint_t zio_taskq_batch_pct = 80; /* 1 thread per cpu in pset */ 200 static uint_t zio_taskq_batch_tpq; /* threads per taskq */ 201 202 #ifdef HAVE_SYSDC 203 static const boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ 204 static const uint_t zio_taskq_basedc = 80; /* base duty cycle */ 205 #endif 206 207 #ifdef HAVE_SPA_THREAD 208 static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */ 209 #endif 210 211 static uint_t zio_taskq_wr_iss_ncpus = 0; 212 213 /* 214 * Report any spa_load_verify errors found, but do not fail spa_load. 215 * This is used by zdb to analyze non-idle pools. 
216 */ 217 boolean_t spa_load_verify_dryrun = B_FALSE; 218 219 /* 220 * Allow read spacemaps in case of readonly import (spa_mode == SPA_MODE_READ). 221 * This is used by zdb for spacemaps verification. 222 */ 223 boolean_t spa_mode_readable_spacemaps = B_FALSE; 224 225 /* 226 * This (illegal) pool name is used when temporarily importing a spa_t in order 227 * to get the vdev stats associated with the imported devices. 228 */ 229 #define TRYIMPORT_NAME "$import" 230 231 /* 232 * For debugging purposes: print out vdev tree during pool import. 233 */ 234 static int spa_load_print_vdev_tree = B_FALSE; 235 236 /* 237 * A non-zero value for zfs_max_missing_tvds means that we allow importing 238 * pools with missing top-level vdevs. This is strictly intended for advanced 239 * pool recovery cases since missing data is almost inevitable. Pools with 240 * missing devices can only be imported read-only for safety reasons, and their 241 * fail-mode will be automatically set to "continue". 242 * 243 * With 1 missing vdev we should be able to import the pool and mount all 244 * datasets. User data that was not modified after the missing device has been 245 * added should be recoverable. This means that snapshots created prior to the 246 * addition of that device should be completely intact. 247 * 248 * With 2 missing vdevs, some datasets may fail to mount since there are 249 * dataset statistics that are stored as regular metadata. Some data might be 250 * recoverable if those vdevs were added recently. 251 * 252 * With 3 or more missing vdevs, the pool is severely damaged and MOS entries 253 * may be missing entirely. Chances of data recovery are very low. Note that 254 * there are also risks of performing an inadvertent rewind as we might be 255 * missing all the vdevs with the latest uberblocks. 256 */ 257 uint64_t zfs_max_missing_tvds = 0; 258 259 /* 260 * The parameters below are similar to zfs_max_missing_tvds but are only 261 * intended for a preliminary open of the pool with an untrusted config which 262 * might be incomplete or out-dated. 263 * 264 * We are more tolerant for pools opened from a cachefile since we could have 265 * an out-dated cachefile where a device removal was not registered. 266 * We could have set the limit arbitrarily high but in the case where devices 267 * are really missing we would want to return the proper error codes; we chose 268 * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available 269 * and we get a chance to retrieve the trusted config. 270 */ 271 uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1; 272 273 /* 274 * In the case where config was assembled by scanning device paths (/dev/dsks 275 * by default) we are less tolerant since all the existing devices should have 276 * been detected and we want spa_load to return the right error codes. 277 */ 278 uint64_t zfs_max_missing_tvds_scan = 0; 279 280 /* 281 * Debugging aid that pauses spa_sync() towards the end. 282 */ 283 static const boolean_t zfs_pause_spa_sync = B_FALSE; 284 285 /* 286 * Variables to indicate the livelist condense zthr func should wait at certain 287 * points for the livelist to be removed - used to test condense/destroy races 288 */ 289 static int zfs_livelist_condense_zthr_pause = 0; 290 static int zfs_livelist_condense_sync_pause = 0; 291 292 /* 293 * Variables to track whether or not condense cancellation has been 294 * triggered in testing. 
295 */ 296 static int zfs_livelist_condense_sync_cancel = 0; 297 static int zfs_livelist_condense_zthr_cancel = 0; 298 299 /* 300 * Variable to track whether or not extra ALLOC blkptrs were added to a 301 * livelist entry while it was being condensed (caused by the way we track 302 * remapped blkptrs in dbuf_remap_impl) 303 */ 304 static int zfs_livelist_condense_new_alloc = 0; 305 306 /* 307 * ========================================================================== 308 * SPA properties routines 309 * ========================================================================== 310 */ 311 312 /* 313 * Add a (source=src, propname=propval) list to an nvlist. 314 */ 315 static void 316 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, const char *strval, 317 uint64_t intval, zprop_source_t src) 318 { 319 const char *propname = zpool_prop_to_name(prop); 320 nvlist_t *propval; 321 322 propval = fnvlist_alloc(); 323 fnvlist_add_uint64(propval, ZPROP_SOURCE, src); 324 325 if (strval != NULL) 326 fnvlist_add_string(propval, ZPROP_VALUE, strval); 327 else 328 fnvlist_add_uint64(propval, ZPROP_VALUE, intval); 329 330 fnvlist_add_nvlist(nvl, propname, propval); 331 nvlist_free(propval); 332 } 333 334 /* 335 * Add a user property (source=src, propname=propval) to an nvlist. 336 */ 337 static void 338 spa_prop_add_user(nvlist_t *nvl, const char *propname, char *strval, 339 zprop_source_t src) 340 { 341 nvlist_t *propval; 342 343 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 344 VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); 345 VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); 346 VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); 347 nvlist_free(propval); 348 } 349 350 /* 351 * Get property values from the spa configuration. 352 */ 353 static void 354 spa_prop_get_config(spa_t *spa, nvlist_t **nvp) 355 { 356 vdev_t *rvd = spa->spa_root_vdev; 357 dsl_pool_t *pool = spa->spa_dsl_pool; 358 uint64_t size, alloc, cap, version; 359 const zprop_source_t src = ZPROP_SRC_NONE; 360 spa_config_dirent_t *dp; 361 metaslab_class_t *mc = spa_normal_class(spa); 362 363 ASSERT(MUTEX_HELD(&spa->spa_props_lock)); 364 365 if (rvd != NULL) { 366 alloc = metaslab_class_get_alloc(mc); 367 alloc += metaslab_class_get_alloc(spa_special_class(spa)); 368 alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); 369 alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa)); 370 371 size = metaslab_class_get_space(mc); 372 size += metaslab_class_get_space(spa_special_class(spa)); 373 size += metaslab_class_get_space(spa_dedup_class(spa)); 374 size += metaslab_class_get_space(spa_embedded_log_class(spa)); 375 376 spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); 377 spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); 378 spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); 379 spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, 380 size - alloc, src); 381 spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL, 382 spa->spa_checkpoint_info.sci_dspace, src); 383 384 spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL, 385 metaslab_class_fragmentation(mc), src); 386 spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, 387 metaslab_class_expandable_space(mc), src); 388 spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, 389 (spa_mode(spa) == SPA_MODE_READ), src); 390 391 cap = (size == 0) ? 
0 : (alloc * 100 / size); 392 spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); 393 394 spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, 395 ddt_get_pool_dedup_ratio(spa), src); 396 spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONEUSED, NULL, 397 brt_get_used(spa), src); 398 spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONESAVED, NULL, 399 brt_get_saved(spa), src); 400 spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONERATIO, NULL, 401 brt_get_ratio(spa), src); 402 403 spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, 404 rvd->vdev_state, src); 405 406 version = spa_version(spa); 407 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) { 408 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, 409 version, ZPROP_SRC_DEFAULT); 410 } else { 411 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, 412 version, ZPROP_SRC_LOCAL); 413 } 414 spa_prop_add_list(*nvp, ZPOOL_PROP_LOAD_GUID, 415 NULL, spa_load_guid(spa), src); 416 } 417 418 if (pool != NULL) { 419 /* 420 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, 421 * when opening pools before this version freedir will be NULL. 422 */ 423 if (pool->dp_free_dir != NULL) { 424 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, 425 dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes, 426 src); 427 } else { 428 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, 429 NULL, 0, src); 430 } 431 432 if (pool->dp_leak_dir != NULL) { 433 spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, 434 dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes, 435 src); 436 } else { 437 spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, 438 NULL, 0, src); 439 } 440 } 441 442 spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); 443 444 if (spa->spa_comment != NULL) { 445 spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, 446 0, ZPROP_SRC_LOCAL); 447 } 448 449 if (spa->spa_compatibility != NULL) { 450 spa_prop_add_list(*nvp, ZPOOL_PROP_COMPATIBILITY, 451 spa->spa_compatibility, 0, ZPROP_SRC_LOCAL); 452 } 453 454 if (spa->spa_root != NULL) 455 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 456 0, ZPROP_SRC_LOCAL); 457 458 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { 459 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, 460 MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE); 461 } else { 462 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, 463 SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); 464 } 465 466 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { 467 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, 468 DNODE_MAX_SIZE, ZPROP_SRC_NONE); 469 } else { 470 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, 471 DNODE_MIN_SIZE, ZPROP_SRC_NONE); 472 } 473 474 if ((dp = list_head(&spa->spa_config_list)) != NULL) { 475 if (dp->scd_path == NULL) { 476 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 477 "none", 0, ZPROP_SRC_LOCAL); 478 } else if (strcmp(dp->scd_path, spa_config_path) != 0) { 479 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 480 dp->scd_path, 0, ZPROP_SRC_LOCAL); 481 } 482 } 483 } 484 485 /* 486 * Get zpool property values. 487 */ 488 int 489 spa_prop_get(spa_t *spa, nvlist_t **nvp) 490 { 491 objset_t *mos = spa->spa_meta_objset; 492 zap_cursor_t zc; 493 zap_attribute_t za; 494 dsl_pool_t *dp; 495 int err; 496 497 err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP); 498 if (err) 499 return (err); 500 501 dp = spa_get_dsl(spa); 502 dsl_pool_config_enter(dp, FTAG); 503 mutex_enter(&spa->spa_props_lock); 504 505 /* 506 * Get properties from the spa config. 
507 */ 508 spa_prop_get_config(spa, nvp); 509 510 /* If no pool property object, no more prop to get. */ 511 if (mos == NULL || spa->spa_pool_props_object == 0) 512 goto out; 513 514 /* 515 * Get properties from the MOS pool property object. 516 */ 517 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 518 (err = zap_cursor_retrieve(&zc, &za)) == 0; 519 zap_cursor_advance(&zc)) { 520 uint64_t intval = 0; 521 char *strval = NULL; 522 zprop_source_t src = ZPROP_SRC_DEFAULT; 523 zpool_prop_t prop; 524 525 if ((prop = zpool_name_to_prop(za.za_name)) == 526 ZPOOL_PROP_INVAL && !zfs_prop_user(za.za_name)) 527 continue; 528 529 switch (za.za_integer_length) { 530 case 8: 531 /* integer property */ 532 if (za.za_first_integer != 533 zpool_prop_default_numeric(prop)) 534 src = ZPROP_SRC_LOCAL; 535 536 if (prop == ZPOOL_PROP_BOOTFS) { 537 dsl_dataset_t *ds = NULL; 538 539 err = dsl_dataset_hold_obj(dp, 540 za.za_first_integer, FTAG, &ds); 541 if (err != 0) 542 break; 543 544 strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, 545 KM_SLEEP); 546 dsl_dataset_name(ds, strval); 547 dsl_dataset_rele(ds, FTAG); 548 } else { 549 strval = NULL; 550 intval = za.za_first_integer; 551 } 552 553 spa_prop_add_list(*nvp, prop, strval, intval, src); 554 555 if (strval != NULL) 556 kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN); 557 558 break; 559 560 case 1: 561 /* string property */ 562 strval = kmem_alloc(za.za_num_integers, KM_SLEEP); 563 err = zap_lookup(mos, spa->spa_pool_props_object, 564 za.za_name, 1, za.za_num_integers, strval); 565 if (err) { 566 kmem_free(strval, za.za_num_integers); 567 break; 568 } 569 if (prop != ZPOOL_PROP_INVAL) { 570 spa_prop_add_list(*nvp, prop, strval, 0, src); 571 } else { 572 src = ZPROP_SRC_LOCAL; 573 spa_prop_add_user(*nvp, za.za_name, strval, 574 src); 575 } 576 kmem_free(strval, za.za_num_integers); 577 break; 578 579 default: 580 break; 581 } 582 } 583 zap_cursor_fini(&zc); 584 out: 585 mutex_exit(&spa->spa_props_lock); 586 dsl_pool_config_exit(dp, FTAG); 587 if (err && err != ENOENT) { 588 nvlist_free(*nvp); 589 *nvp = NULL; 590 return (err); 591 } 592 593 return (0); 594 } 595 596 /* 597 * Validate the given pool properties nvlist and modify the list 598 * for the property values to be set. 599 */ 600 static int 601 spa_prop_validate(spa_t *spa, nvlist_t *props) 602 { 603 nvpair_t *elem; 604 int error = 0, reset_bootfs = 0; 605 uint64_t objnum = 0; 606 boolean_t has_feature = B_FALSE; 607 608 elem = NULL; 609 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 610 uint64_t intval; 611 const char *strval, *slash, *check, *fname; 612 const char *propname = nvpair_name(elem); 613 zpool_prop_t prop = zpool_name_to_prop(propname); 614 615 switch (prop) { 616 case ZPOOL_PROP_INVAL: 617 /* 618 * Sanitize the input. 
619 */ 620 if (zfs_prop_user(propname)) { 621 if (strlen(propname) >= ZAP_MAXNAMELEN) { 622 error = SET_ERROR(ENAMETOOLONG); 623 break; 624 } 625 626 if (strlen(fnvpair_value_string(elem)) >= 627 ZAP_MAXVALUELEN) { 628 error = SET_ERROR(E2BIG); 629 break; 630 } 631 } else if (zpool_prop_feature(propname)) { 632 if (nvpair_type(elem) != DATA_TYPE_UINT64) { 633 error = SET_ERROR(EINVAL); 634 break; 635 } 636 637 if (nvpair_value_uint64(elem, &intval) != 0) { 638 error = SET_ERROR(EINVAL); 639 break; 640 } 641 642 if (intval != 0) { 643 error = SET_ERROR(EINVAL); 644 break; 645 } 646 647 fname = strchr(propname, '@') + 1; 648 if (zfeature_lookup_name(fname, NULL) != 0) { 649 error = SET_ERROR(EINVAL); 650 break; 651 } 652 653 has_feature = B_TRUE; 654 } else { 655 error = SET_ERROR(EINVAL); 656 break; 657 } 658 break; 659 660 case ZPOOL_PROP_VERSION: 661 error = nvpair_value_uint64(elem, &intval); 662 if (!error && 663 (intval < spa_version(spa) || 664 intval > SPA_VERSION_BEFORE_FEATURES || 665 has_feature)) 666 error = SET_ERROR(EINVAL); 667 break; 668 669 case ZPOOL_PROP_DELEGATION: 670 case ZPOOL_PROP_AUTOREPLACE: 671 case ZPOOL_PROP_LISTSNAPS: 672 case ZPOOL_PROP_AUTOEXPAND: 673 case ZPOOL_PROP_AUTOTRIM: 674 error = nvpair_value_uint64(elem, &intval); 675 if (!error && intval > 1) 676 error = SET_ERROR(EINVAL); 677 break; 678 679 case ZPOOL_PROP_MULTIHOST: 680 error = nvpair_value_uint64(elem, &intval); 681 if (!error && intval > 1) 682 error = SET_ERROR(EINVAL); 683 684 if (!error) { 685 uint32_t hostid = zone_get_hostid(NULL); 686 if (hostid) 687 spa->spa_hostid = hostid; 688 else 689 error = SET_ERROR(ENOTSUP); 690 } 691 692 break; 693 694 case ZPOOL_PROP_BOOTFS: 695 /* 696 * If the pool version is less than SPA_VERSION_BOOTFS, 697 * or the pool is still being created (version == 0), 698 * the bootfs property cannot be set. 699 */ 700 if (spa_version(spa) < SPA_VERSION_BOOTFS) { 701 error = SET_ERROR(ENOTSUP); 702 break; 703 } 704 705 /* 706 * Make sure the vdev config is bootable 707 */ 708 if (!vdev_is_bootable(spa->spa_root_vdev)) { 709 error = SET_ERROR(ENOTSUP); 710 break; 711 } 712 713 reset_bootfs = 1; 714 715 error = nvpair_value_string(elem, &strval); 716 717 if (!error) { 718 objset_t *os; 719 720 if (strval == NULL || strval[0] == '\0') { 721 objnum = zpool_prop_default_numeric( 722 ZPOOL_PROP_BOOTFS); 723 break; 724 } 725 726 error = dmu_objset_hold(strval, FTAG, &os); 727 if (error != 0) 728 break; 729 730 /* Must be ZPL. */ 731 if (dmu_objset_type(os) != DMU_OST_ZFS) { 732 error = SET_ERROR(ENOTSUP); 733 } else { 734 objnum = dmu_objset_id(os); 735 } 736 dmu_objset_rele(os, FTAG); 737 } 738 break; 739 740 case ZPOOL_PROP_FAILUREMODE: 741 error = nvpair_value_uint64(elem, &intval); 742 if (!error && intval > ZIO_FAILURE_MODE_PANIC) 743 error = SET_ERROR(EINVAL); 744 745 /* 746 * This is a special case which only occurs when 747 * the pool has completely failed. This allows 748 * the user to change the in-core failmode property 749 * without syncing it out to disk (I/Os might 750 * currently be blocked). We do this by returning 751 * EIO to the caller (spa_prop_set) to trick it 752 * into thinking we encountered a property validation 753 * error. 
754 */ 755 if (!error && spa_suspended(spa)) { 756 spa->spa_failmode = intval; 757 error = SET_ERROR(EIO); 758 } 759 break; 760 761 case ZPOOL_PROP_CACHEFILE: 762 if ((error = nvpair_value_string(elem, &strval)) != 0) 763 break; 764 765 if (strval[0] == '\0') 766 break; 767 768 if (strcmp(strval, "none") == 0) 769 break; 770 771 if (strval[0] != '/') { 772 error = SET_ERROR(EINVAL); 773 break; 774 } 775 776 slash = strrchr(strval, '/'); 777 ASSERT(slash != NULL); 778 779 if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 780 strcmp(slash, "/..") == 0) 781 error = SET_ERROR(EINVAL); 782 break; 783 784 case ZPOOL_PROP_COMMENT: 785 if ((error = nvpair_value_string(elem, &strval)) != 0) 786 break; 787 for (check = strval; *check != '\0'; check++) { 788 if (!isprint(*check)) { 789 error = SET_ERROR(EINVAL); 790 break; 791 } 792 } 793 if (strlen(strval) > ZPROP_MAX_COMMENT) 794 error = SET_ERROR(E2BIG); 795 break; 796 797 default: 798 break; 799 } 800 801 if (error) 802 break; 803 } 804 805 (void) nvlist_remove_all(props, 806 zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO)); 807 808 if (!error && reset_bootfs) { 809 error = nvlist_remove(props, 810 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 811 812 if (!error) { 813 error = nvlist_add_uint64(props, 814 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 815 } 816 } 817 818 return (error); 819 } 820 821 void 822 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) 823 { 824 const char *cachefile; 825 spa_config_dirent_t *dp; 826 827 if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), 828 &cachefile) != 0) 829 return; 830 831 dp = kmem_alloc(sizeof (spa_config_dirent_t), 832 KM_SLEEP); 833 834 if (cachefile[0] == '\0') 835 dp->scd_path = spa_strdup(spa_config_path); 836 else if (strcmp(cachefile, "none") == 0) 837 dp->scd_path = NULL; 838 else 839 dp->scd_path = spa_strdup(cachefile); 840 841 list_insert_head(&spa->spa_config_list, dp); 842 if (need_sync) 843 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 844 } 845 846 int 847 spa_prop_set(spa_t *spa, nvlist_t *nvp) 848 { 849 int error; 850 nvpair_t *elem = NULL; 851 boolean_t need_sync = B_FALSE; 852 853 if ((error = spa_prop_validate(spa, nvp)) != 0) 854 return (error); 855 856 while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { 857 zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); 858 859 if (prop == ZPOOL_PROP_CACHEFILE || 860 prop == ZPOOL_PROP_ALTROOT || 861 prop == ZPOOL_PROP_READONLY) 862 continue; 863 864 if (prop == ZPOOL_PROP_INVAL && 865 zfs_prop_user(nvpair_name(elem))) { 866 need_sync = B_TRUE; 867 break; 868 } 869 870 if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) { 871 uint64_t ver = 0; 872 873 if (prop == ZPOOL_PROP_VERSION) { 874 VERIFY(nvpair_value_uint64(elem, &ver) == 0); 875 } else { 876 ASSERT(zpool_prop_feature(nvpair_name(elem))); 877 ver = SPA_VERSION_FEATURES; 878 need_sync = B_TRUE; 879 } 880 881 /* Save time if the version is already set. */ 882 if (ver == spa_version(spa)) 883 continue; 884 885 /* 886 * In addition to the pool directory object, we might 887 * create the pool properties object, the features for 888 * read object, the features for write object, or the 889 * feature descriptions object. 
890 */ 891 error = dsl_sync_task(spa->spa_name, NULL, 892 spa_sync_version, &ver, 893 6, ZFS_SPACE_CHECK_RESERVED); 894 if (error) 895 return (error); 896 continue; 897 } 898 899 need_sync = B_TRUE; 900 break; 901 } 902 903 if (need_sync) { 904 return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, 905 nvp, 6, ZFS_SPACE_CHECK_RESERVED)); 906 } 907 908 return (0); 909 } 910 911 /* 912 * If the bootfs property value is dsobj, clear it. 913 */ 914 void 915 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 916 { 917 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 918 VERIFY(zap_remove(spa->spa_meta_objset, 919 spa->spa_pool_props_object, 920 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 921 spa->spa_bootfs = 0; 922 } 923 } 924 925 static int 926 spa_change_guid_check(void *arg, dmu_tx_t *tx) 927 { 928 uint64_t *newguid __maybe_unused = arg; 929 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 930 vdev_t *rvd = spa->spa_root_vdev; 931 uint64_t vdev_state; 932 933 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 934 int error = (spa_has_checkpoint(spa)) ? 935 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 936 return (SET_ERROR(error)); 937 } 938 939 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 940 vdev_state = rvd->vdev_state; 941 spa_config_exit(spa, SCL_STATE, FTAG); 942 943 if (vdev_state != VDEV_STATE_HEALTHY) 944 return (SET_ERROR(ENXIO)); 945 946 ASSERT3U(spa_guid(spa), !=, *newguid); 947 948 return (0); 949 } 950 951 static void 952 spa_change_guid_sync(void *arg, dmu_tx_t *tx) 953 { 954 uint64_t *newguid = arg; 955 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 956 uint64_t oldguid; 957 vdev_t *rvd = spa->spa_root_vdev; 958 959 oldguid = spa_guid(spa); 960 961 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 962 rvd->vdev_guid = *newguid; 963 rvd->vdev_guid_sum += (*newguid - oldguid); 964 vdev_config_dirty(rvd); 965 spa_config_exit(spa, SCL_STATE, FTAG); 966 967 spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", 968 (u_longlong_t)oldguid, (u_longlong_t)*newguid); 969 } 970 971 /* 972 * Change the GUID for the pool. This is done so that we can later 973 * re-import a pool built from a clone of our own vdevs. We will modify 974 * the root vdev's guid, our own pool guid, and then mark all of our 975 * vdevs dirty. Note that we must make sure that all our vdevs are 976 * online when we do this, or else any vdevs that weren't present 977 * would be orphaned from our pool. We are also going to issue a 978 * sysevent to update any watchers. 979 */ 980 int 981 spa_change_guid(spa_t *spa) 982 { 983 int error; 984 uint64_t guid; 985 986 mutex_enter(&spa->spa_vdev_top_lock); 987 mutex_enter(&spa_namespace_lock); 988 guid = spa_generate_guid(NULL); 989 990 error = dsl_sync_task(spa->spa_name, spa_change_guid_check, 991 spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); 992 993 if (error == 0) { 994 /* 995 * Clear the kobj flag from all the vdevs to allow 996 * vdev_cache_process_kobj_evt() to post events to all the 997 * vdevs since GUID is updated. 
998 */ 999 vdev_clear_kobj_evt(spa->spa_root_vdev); 1000 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 1001 vdev_clear_kobj_evt(spa->spa_l2cache.sav_vdevs[i]); 1002 1003 spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); 1004 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); 1005 } 1006 1007 mutex_exit(&spa_namespace_lock); 1008 mutex_exit(&spa->spa_vdev_top_lock); 1009 1010 return (error); 1011 } 1012 1013 /* 1014 * ========================================================================== 1015 * SPA state manipulation (open/create/destroy/import/export) 1016 * ========================================================================== 1017 */ 1018 1019 static int 1020 spa_error_entry_compare(const void *a, const void *b) 1021 { 1022 const spa_error_entry_t *sa = (const spa_error_entry_t *)a; 1023 const spa_error_entry_t *sb = (const spa_error_entry_t *)b; 1024 int ret; 1025 1026 ret = memcmp(&sa->se_bookmark, &sb->se_bookmark, 1027 sizeof (zbookmark_phys_t)); 1028 1029 return (TREE_ISIGN(ret)); 1030 } 1031 1032 /* 1033 * Utility function which retrieves copies of the current logs and 1034 * re-initializes them in the process. 1035 */ 1036 void 1037 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 1038 { 1039 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 1040 1041 memcpy(last, &spa->spa_errlist_last, sizeof (avl_tree_t)); 1042 memcpy(scrub, &spa->spa_errlist_scrub, sizeof (avl_tree_t)); 1043 1044 avl_create(&spa->spa_errlist_scrub, 1045 spa_error_entry_compare, sizeof (spa_error_entry_t), 1046 offsetof(spa_error_entry_t, se_avl)); 1047 avl_create(&spa->spa_errlist_last, 1048 spa_error_entry_compare, sizeof (spa_error_entry_t), 1049 offsetof(spa_error_entry_t, se_avl)); 1050 } 1051 1052 static void 1053 spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) 1054 { 1055 const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; 1056 enum zti_modes mode = ztip->zti_mode; 1057 uint_t value = ztip->zti_value; 1058 uint_t count = ztip->zti_count; 1059 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1060 uint_t cpus, flags = TASKQ_DYNAMIC; 1061 1062 switch (mode) { 1063 case ZTI_MODE_FIXED: 1064 ASSERT3U(value, >, 0); 1065 break; 1066 1067 case ZTI_MODE_SYNC: 1068 1069 /* 1070 * Create one wr_iss taskq for every 'zio_taskq_wr_iss_ncpus', 1071 * not to exceed the number of spa allocators. 1072 */ 1073 if (zio_taskq_wr_iss_ncpus == 0) { 1074 count = MAX(boot_ncpus / spa->spa_alloc_count, 1); 1075 } else { 1076 count = MAX(1, 1077 boot_ncpus / MAX(1, zio_taskq_wr_iss_ncpus)); 1078 } 1079 count = MAX(count, (zio_taskq_batch_pct + 99) / 100); 1080 count = MIN(count, spa->spa_alloc_count); 1081 1082 /* 1083 * zio_taskq_batch_pct is unbounded and may exceed 100%, but no 1084 * single taskq may have more threads than 100% of online cpus. 1085 */ 1086 value = (zio_taskq_batch_pct + count / 2) / count; 1087 value = MIN(value, 100); 1088 flags |= TASKQ_THREADS_CPU_PCT; 1089 break; 1090 1091 case ZTI_MODE_SCALE: 1092 flags |= TASKQ_THREADS_CPU_PCT; 1093 /* 1094 * We want more taskqs to reduce lock contention, but we want 1095 * less for better request ordering and CPU utilization. 1096 */ 1097 cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100); 1098 if (zio_taskq_batch_tpq > 0) { 1099 count = MAX(1, (cpus + zio_taskq_batch_tpq / 2) / 1100 zio_taskq_batch_tpq); 1101 } else { 1102 /* 1103 * Prefer 6 threads per taskq, but no more taskqs 1104 * than threads in them on large systems. 
For 80%: 1105 * 1106 * taskq taskq total 1107 * cpus taskqs percent threads threads 1108 * ------- ------- ------- ------- ------- 1109 * 1 1 80% 1 1 1110 * 2 1 80% 1 1 1111 * 4 1 80% 3 3 1112 * 8 2 40% 3 6 1113 * 16 3 27% 4 12 1114 * 32 5 16% 5 25 1115 * 64 7 11% 7 49 1116 * 128 10 8% 10 100 1117 * 256 14 6% 15 210 1118 */ 1119 count = 1 + cpus / 6; 1120 while (count * count > cpus) 1121 count--; 1122 } 1123 /* Limit each taskq within 100% to not trigger assertion. */ 1124 count = MAX(count, (zio_taskq_batch_pct + 99) / 100); 1125 value = (zio_taskq_batch_pct + count / 2) / count; 1126 break; 1127 1128 case ZTI_MODE_NULL: 1129 tqs->stqs_count = 0; 1130 tqs->stqs_taskq = NULL; 1131 return; 1132 1133 default: 1134 panic("unrecognized mode for %s_%s taskq (%u:%u) in " 1135 "spa_taskqs_init()", 1136 zio_type_name[t], zio_taskq_types[q], mode, value); 1137 break; 1138 } 1139 1140 ASSERT3U(count, >, 0); 1141 tqs->stqs_count = count; 1142 tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); 1143 1144 for (uint_t i = 0; i < count; i++) { 1145 taskq_t *tq; 1146 char name[32]; 1147 1148 if (count > 1) 1149 (void) snprintf(name, sizeof (name), "%s_%s_%u", 1150 zio_type_name[t], zio_taskq_types[q], i); 1151 else 1152 (void) snprintf(name, sizeof (name), "%s_%s", 1153 zio_type_name[t], zio_taskq_types[q]); 1154 1155 #ifdef HAVE_SYSDC 1156 if (zio_taskq_sysdc && spa->spa_proc != &p0) { 1157 (void) zio_taskq_basedc; 1158 tq = taskq_create_sysdc(name, value, 50, INT_MAX, 1159 spa->spa_proc, zio_taskq_basedc, flags); 1160 } else { 1161 #endif 1162 pri_t pri = maxclsyspri; 1163 /* 1164 * The write issue taskq can be extremely CPU 1165 * intensive. Run it at slightly less important 1166 * priority than the other taskqs. 1167 * 1168 * Under Linux and FreeBSD this means incrementing 1169 * the priority value as opposed to platforms like 1170 * illumos where it should be decremented. 1171 * 1172 * On FreeBSD, if priorities divided by four (RQ_PPQ) 1173 * are equal then a difference between them is 1174 * insignificant. 1175 */ 1176 if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) { 1177 #if defined(__linux__) 1178 pri++; 1179 #elif defined(__FreeBSD__) 1180 pri += 4; 1181 #else 1182 #error "unknown OS" 1183 #endif 1184 } 1185 tq = taskq_create_proc(name, value, pri, 50, 1186 INT_MAX, spa->spa_proc, flags); 1187 #ifdef HAVE_SYSDC 1188 } 1189 #endif 1190 1191 tqs->stqs_taskq[i] = tq; 1192 } 1193 } 1194 1195 static void 1196 spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) 1197 { 1198 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1199 1200 if (tqs->stqs_taskq == NULL) { 1201 ASSERT3U(tqs->stqs_count, ==, 0); 1202 return; 1203 } 1204 1205 for (uint_t i = 0; i < tqs->stqs_count; i++) { 1206 ASSERT3P(tqs->stqs_taskq[i], !=, NULL); 1207 taskq_destroy(tqs->stqs_taskq[i]); 1208 } 1209 1210 kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *)); 1211 tqs->stqs_taskq = NULL; 1212 } 1213 1214 #ifdef _KERNEL 1215 /* 1216 * The READ and WRITE rows of zio_taskqs are configurable at module load time 1217 * by setting zio_taskq_read or zio_taskq_write. 1218 * 1219 * Example (the defaults for READ and WRITE) 1220 * zio_taskq_read='fixed,1,8 null scale null' 1221 * zio_taskq_write='sync fixed,1,5 scale fixed,1,5' 1222 * 1223 * Each sets the entire row at a time. 1224 * 1225 * 'fixed' is parameterised: fixed,Q,T where Q is number of taskqs, T is number 1226 * of threads per taskq. 
1227 * 1228 * 'null' can only be set on the high-priority queues (queue selection for 1229 * high-priority queues will fall back to the regular queue if the high-pri 1230 * is NULL. 1231 */ 1232 static const char *const modes[ZTI_NMODES] = { 1233 "fixed", "scale", "sync", "null" 1234 }; 1235 1236 /* Parse the incoming config string. Modifies cfg */ 1237 static int 1238 spa_taskq_param_set(zio_type_t t, char *cfg) 1239 { 1240 int err = 0; 1241 1242 zio_taskq_info_t row[ZIO_TASKQ_TYPES] = {{0}}; 1243 1244 char *next = cfg, *tok, *c; 1245 1246 /* 1247 * Parse out each element from the string and fill `row`. The entire 1248 * row has to be set at once, so any errors are flagged by just 1249 * breaking out of this loop early. 1250 */ 1251 uint_t q; 1252 for (q = 0; q < ZIO_TASKQ_TYPES; q++) { 1253 /* `next` is the start of the config */ 1254 if (next == NULL) 1255 break; 1256 1257 /* Eat up leading space */ 1258 while (isspace(*next)) 1259 next++; 1260 if (*next == '\0') 1261 break; 1262 1263 /* Mode ends at space or end of string */ 1264 tok = next; 1265 next = strchr(tok, ' '); 1266 if (next != NULL) *next++ = '\0'; 1267 1268 /* Parameters start after a comma */ 1269 c = strchr(tok, ','); 1270 if (c != NULL) *c++ = '\0'; 1271 1272 /* Match mode string */ 1273 uint_t mode; 1274 for (mode = 0; mode < ZTI_NMODES; mode++) 1275 if (strcmp(tok, modes[mode]) == 0) 1276 break; 1277 if (mode == ZTI_NMODES) 1278 break; 1279 1280 /* Invalid canary */ 1281 row[q].zti_mode = ZTI_NMODES; 1282 1283 /* Per-mode setup */ 1284 switch (mode) { 1285 1286 /* 1287 * FIXED is parameterised: number of queues, and number of 1288 * threads per queue. 1289 */ 1290 case ZTI_MODE_FIXED: { 1291 /* No parameters? */ 1292 if (c == NULL || *c == '\0') 1293 break; 1294 1295 /* Find next parameter */ 1296 tok = c; 1297 c = strchr(tok, ','); 1298 if (c == NULL) 1299 break; 1300 1301 /* Take digits and convert */ 1302 unsigned long long nq; 1303 if (!(isdigit(*tok))) 1304 break; 1305 err = ddi_strtoull(tok, &tok, 10, &nq); 1306 /* Must succeed and also end at the next param sep */ 1307 if (err != 0 || tok != c) 1308 break; 1309 1310 /* Move past the comma */ 1311 tok++; 1312 /* Need another number */ 1313 if (!(isdigit(*tok))) 1314 break; 1315 /* Remember start to make sure we moved */ 1316 c = tok; 1317 1318 /* Take digits */ 1319 unsigned long long ntpq; 1320 err = ddi_strtoull(tok, &tok, 10, &ntpq); 1321 /* Must succeed, and moved forward */ 1322 if (err != 0 || tok == c || *tok != '\0') 1323 break; 1324 1325 /* 1326 * sanity; zero queues/threads make no sense, and 1327 * 16K is almost certainly more than anyone will ever 1328 * need and avoids silly numbers like UINT32_MAX 1329 */ 1330 if (nq == 0 || nq >= 16384 || 1331 ntpq == 0 || ntpq >= 16384) 1332 break; 1333 1334 const zio_taskq_info_t zti = ZTI_P(ntpq, nq); 1335 row[q] = zti; 1336 break; 1337 } 1338 1339 case ZTI_MODE_SCALE: { 1340 const zio_taskq_info_t zti = ZTI_SCALE; 1341 row[q] = zti; 1342 break; 1343 } 1344 1345 case ZTI_MODE_SYNC: { 1346 const zio_taskq_info_t zti = ZTI_SYNC; 1347 row[q] = zti; 1348 break; 1349 } 1350 1351 case ZTI_MODE_NULL: { 1352 /* 1353 * Can only null the high-priority queues; the general- 1354 * purpose ones have to exist. 
1355 */ 1356 if (q != ZIO_TASKQ_ISSUE_HIGH && 1357 q != ZIO_TASKQ_INTERRUPT_HIGH) 1358 break; 1359 1360 const zio_taskq_info_t zti = ZTI_NULL; 1361 row[q] = zti; 1362 break; 1363 } 1364 1365 default: 1366 break; 1367 } 1368 1369 /* Ensure we set a mode */ 1370 if (row[q].zti_mode == ZTI_NMODES) 1371 break; 1372 } 1373 1374 /* Didn't get a full row, fail */ 1375 if (q < ZIO_TASKQ_TYPES) 1376 return (SET_ERROR(EINVAL)); 1377 1378 /* Eat trailing space */ 1379 if (next != NULL) 1380 while (isspace(*next)) 1381 next++; 1382 1383 /* If there's anything left over then fail */ 1384 if (next != NULL && *next != '\0') 1385 return (SET_ERROR(EINVAL)); 1386 1387 /* Success! Copy it into the real config */ 1388 for (q = 0; q < ZIO_TASKQ_TYPES; q++) 1389 zio_taskqs[t][q] = row[q]; 1390 1391 return (0); 1392 } 1393 1394 static int 1395 spa_taskq_param_get(zio_type_t t, char *buf) 1396 { 1397 int pos = 0; 1398 1399 /* Build paramater string from live config */ 1400 const char *sep = ""; 1401 for (uint_t q = 0; q < ZIO_TASKQ_TYPES; q++) { 1402 const zio_taskq_info_t *zti = &zio_taskqs[t][q]; 1403 if (zti->zti_mode == ZTI_MODE_FIXED) 1404 pos += sprintf(&buf[pos], "%s%s,%u,%u", sep, 1405 modes[zti->zti_mode], zti->zti_count, 1406 zti->zti_value); 1407 else 1408 pos += sprintf(&buf[pos], "%s%s", sep, 1409 modes[zti->zti_mode]); 1410 sep = " "; 1411 } 1412 1413 buf[pos++] = '\n'; 1414 buf[pos] = '\0'; 1415 1416 return (pos); 1417 } 1418 1419 #ifdef __linux__ 1420 static int 1421 spa_taskq_read_param_set(const char *val, zfs_kernel_param_t *kp) 1422 { 1423 char *cfg = kmem_strdup(val); 1424 int err = spa_taskq_param_set(ZIO_TYPE_READ, cfg); 1425 kmem_free(cfg, strlen(val)+1); 1426 return (-err); 1427 } 1428 static int 1429 spa_taskq_read_param_get(char *buf, zfs_kernel_param_t *kp) 1430 { 1431 return (spa_taskq_param_get(ZIO_TYPE_READ, buf)); 1432 } 1433 1434 static int 1435 spa_taskq_write_param_set(const char *val, zfs_kernel_param_t *kp) 1436 { 1437 char *cfg = kmem_strdup(val); 1438 int err = spa_taskq_param_set(ZIO_TYPE_WRITE, cfg); 1439 kmem_free(cfg, strlen(val)+1); 1440 return (-err); 1441 } 1442 static int 1443 spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp) 1444 { 1445 return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf)); 1446 } 1447 #else 1448 /* 1449 * On FreeBSD load-time parameters can be set up before malloc() is available, 1450 * so we have to do all the parsing work on the stack. 1451 */ 1452 #define SPA_TASKQ_PARAM_MAX (128) 1453 1454 static int 1455 spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS) 1456 { 1457 char buf[SPA_TASKQ_PARAM_MAX]; 1458 int err; 1459 1460 (void) spa_taskq_param_get(ZIO_TYPE_READ, buf); 1461 err = sysctl_handle_string(oidp, buf, sizeof (buf), req); 1462 if (err || req->newptr == NULL) 1463 return (err); 1464 return (spa_taskq_param_set(ZIO_TYPE_READ, buf)); 1465 } 1466 1467 static int 1468 spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS) 1469 { 1470 char buf[SPA_TASKQ_PARAM_MAX]; 1471 int err; 1472 1473 (void) spa_taskq_param_get(ZIO_TYPE_WRITE, buf); 1474 err = sysctl_handle_string(oidp, buf, sizeof (buf), req); 1475 if (err || req->newptr == NULL) 1476 return (err); 1477 return (spa_taskq_param_set(ZIO_TYPE_WRITE, buf)); 1478 } 1479 #endif 1480 #endif /* _KERNEL */ 1481 1482 /* 1483 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. 1484 * Note that a type may have multiple discrete taskqs to avoid lock contention 1485 * on the taskq itself. 
1486 */ 1487 static taskq_t * 1488 spa_taskq_dispatch_select(spa_t *spa, zio_type_t t, zio_taskq_type_t q, 1489 zio_t *zio) 1490 { 1491 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1492 taskq_t *tq; 1493 1494 ASSERT3P(tqs->stqs_taskq, !=, NULL); 1495 ASSERT3U(tqs->stqs_count, !=, 0); 1496 1497 if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) && 1498 (zio != NULL) && (zio->io_wr_iss_tq != NULL)) { 1499 /* dispatch to assigned write issue taskq */ 1500 tq = zio->io_wr_iss_tq; 1501 return (tq); 1502 } 1503 1504 if (tqs->stqs_count == 1) { 1505 tq = tqs->stqs_taskq[0]; 1506 } else { 1507 tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; 1508 } 1509 return (tq); 1510 } 1511 1512 void 1513 spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, 1514 task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent, 1515 zio_t *zio) 1516 { 1517 taskq_t *tq = spa_taskq_dispatch_select(spa, t, q, zio); 1518 taskq_dispatch_ent(tq, func, arg, flags, ent); 1519 } 1520 1521 /* 1522 * Same as spa_taskq_dispatch_ent() but block on the task until completion. 1523 */ 1524 void 1525 spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q, 1526 task_func_t *func, void *arg, uint_t flags) 1527 { 1528 taskq_t *tq = spa_taskq_dispatch_select(spa, t, q, NULL); 1529 taskqid_t id = taskq_dispatch(tq, func, arg, flags); 1530 if (id) 1531 taskq_wait_id(tq, id); 1532 } 1533 1534 static void 1535 spa_create_zio_taskqs(spa_t *spa) 1536 { 1537 for (int t = 0; t < ZIO_TYPES; t++) { 1538 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 1539 spa_taskqs_init(spa, t, q); 1540 } 1541 } 1542 } 1543 1544 #if defined(_KERNEL) && defined(HAVE_SPA_THREAD) 1545 static void 1546 spa_thread(void *arg) 1547 { 1548 psetid_t zio_taskq_psrset_bind = PS_NONE; 1549 callb_cpr_t cprinfo; 1550 1551 spa_t *spa = arg; 1552 user_t *pu = PTOU(curproc); 1553 1554 CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, 1555 spa->spa_name); 1556 1557 ASSERT(curproc != &p0); 1558 (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), 1559 "zpool-%s", spa->spa_name); 1560 (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); 1561 1562 /* bind this thread to the requested psrset */ 1563 if (zio_taskq_psrset_bind != PS_NONE) { 1564 pool_lock(); 1565 mutex_enter(&cpu_lock); 1566 mutex_enter(&pidlock); 1567 mutex_enter(&curproc->p_lock); 1568 1569 if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, 1570 0, NULL, NULL) == 0) { 1571 curthread->t_bind_pset = zio_taskq_psrset_bind; 1572 } else { 1573 cmn_err(CE_WARN, 1574 "Couldn't bind process for zfs pool \"%s\" to " 1575 "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); 1576 } 1577 1578 mutex_exit(&curproc->p_lock); 1579 mutex_exit(&pidlock); 1580 mutex_exit(&cpu_lock); 1581 pool_unlock(); 1582 } 1583 1584 #ifdef HAVE_SYSDC 1585 if (zio_taskq_sysdc) { 1586 sysdc_thread_enter(curthread, 100, 0); 1587 } 1588 #endif 1589 1590 spa->spa_proc = curproc; 1591 spa->spa_did = curthread->t_did; 1592 1593 spa_create_zio_taskqs(spa); 1594 1595 mutex_enter(&spa->spa_proc_lock); 1596 ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); 1597 1598 spa->spa_proc_state = SPA_PROC_ACTIVE; 1599 cv_broadcast(&spa->spa_proc_cv); 1600 1601 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1602 while (spa->spa_proc_state == SPA_PROC_ACTIVE) 1603 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1604 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); 1605 1606 ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); 1607 spa->spa_proc_state = SPA_PROC_GONE; 1608 spa->spa_proc = &p0; 1609 
cv_broadcast(&spa->spa_proc_cv); 1610 CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ 1611 1612 mutex_enter(&curproc->p_lock); 1613 lwp_exit(); 1614 } 1615 #endif 1616 1617 extern metaslab_ops_t *metaslab_allocator(spa_t *spa); 1618 1619 /* 1620 * Activate an uninitialized pool. 1621 */ 1622 static void 1623 spa_activate(spa_t *spa, spa_mode_t mode) 1624 { 1625 metaslab_ops_t *msp = metaslab_allocator(spa); 1626 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 1627 1628 spa->spa_state = POOL_STATE_ACTIVE; 1629 spa->spa_mode = mode; 1630 spa->spa_read_spacemaps = spa_mode_readable_spacemaps; 1631 1632 spa->spa_normal_class = metaslab_class_create(spa, msp); 1633 spa->spa_log_class = metaslab_class_create(spa, msp); 1634 spa->spa_embedded_log_class = metaslab_class_create(spa, msp); 1635 spa->spa_special_class = metaslab_class_create(spa, msp); 1636 spa->spa_dedup_class = metaslab_class_create(spa, msp); 1637 1638 /* Try to create a covering process */ 1639 mutex_enter(&spa->spa_proc_lock); 1640 ASSERT(spa->spa_proc_state == SPA_PROC_NONE); 1641 ASSERT(spa->spa_proc == &p0); 1642 spa->spa_did = 0; 1643 1644 #ifdef HAVE_SPA_THREAD 1645 /* Only create a process if we're going to be around a while. */ 1646 if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { 1647 if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, 1648 NULL, 0) == 0) { 1649 spa->spa_proc_state = SPA_PROC_CREATED; 1650 while (spa->spa_proc_state == SPA_PROC_CREATED) { 1651 cv_wait(&spa->spa_proc_cv, 1652 &spa->spa_proc_lock); 1653 } 1654 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1655 ASSERT(spa->spa_proc != &p0); 1656 ASSERT(spa->spa_did != 0); 1657 } else { 1658 #ifdef _KERNEL 1659 cmn_err(CE_WARN, 1660 "Couldn't create process for zfs pool \"%s\"\n", 1661 spa->spa_name); 1662 #endif 1663 } 1664 } 1665 #endif /* HAVE_SPA_THREAD */ 1666 mutex_exit(&spa->spa_proc_lock); 1667 1668 /* If we didn't create a process, we need to create our taskqs. */ 1669 if (spa->spa_proc == &p0) { 1670 spa_create_zio_taskqs(spa); 1671 } 1672 1673 for (size_t i = 0; i < TXG_SIZE; i++) { 1674 spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, 1675 ZIO_FLAG_CANFAIL); 1676 } 1677 1678 list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), 1679 offsetof(vdev_t, vdev_config_dirty_node)); 1680 list_create(&spa->spa_evicting_os_list, sizeof (objset_t), 1681 offsetof(objset_t, os_evicting_node)); 1682 list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), 1683 offsetof(vdev_t, vdev_state_dirty_node)); 1684 1685 txg_list_create(&spa->spa_vdev_txg_list, spa, 1686 offsetof(struct vdev, vdev_txg_node)); 1687 1688 avl_create(&spa->spa_errlist_scrub, 1689 spa_error_entry_compare, sizeof (spa_error_entry_t), 1690 offsetof(spa_error_entry_t, se_avl)); 1691 avl_create(&spa->spa_errlist_last, 1692 spa_error_entry_compare, sizeof (spa_error_entry_t), 1693 offsetof(spa_error_entry_t, se_avl)); 1694 avl_create(&spa->spa_errlist_healed, 1695 spa_error_entry_compare, sizeof (spa_error_entry_t), 1696 offsetof(spa_error_entry_t, se_avl)); 1697 1698 spa_activate_os(spa); 1699 1700 spa_keystore_init(&spa->spa_keystore); 1701 1702 /* 1703 * This taskq is used to perform zvol-minor-related tasks 1704 * asynchronously. This has several advantages, including easy 1705 * resolution of various deadlocks. 1706 * 1707 * The taskq must be single threaded to ensure tasks are always 1708 * processed in the order in which they were dispatched. 1709 * 1710 * A taskq per pool allows one to keep the pools independent. 
1711 * This way if one pool is suspended, it will not impact another. 1712 * 1713 * The preferred location to dispatch a zvol minor task is a sync 1714 * task. In this context, there is easy access to the spa_t and minimal 1715 * error handling is required because the sync task must succeed. 1716 */ 1717 spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri, 1718 1, INT_MAX, 0); 1719 1720 /* 1721 * The taskq to preload metaslabs. 1722 */ 1723 spa->spa_metaslab_taskq = taskq_create("z_metaslab", 1724 metaslab_preload_pct, maxclsyspri, 1, INT_MAX, 1725 TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); 1726 1727 /* 1728 * Taskq dedicated to prefetcher threads: this is used to prevent the 1729 * pool traverse code from monopolizing the global (and limited) 1730 * system_taskq by inappropriately scheduling long running tasks on it. 1731 */ 1732 spa->spa_prefetch_taskq = taskq_create("z_prefetch", 100, 1733 defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); 1734 1735 /* 1736 * The taskq to upgrade datasets in this pool. Currently used by 1737 * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA. 1738 */ 1739 spa->spa_upgrade_taskq = taskq_create("z_upgrade", 100, 1740 defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); 1741 } 1742 1743 /* 1744 * Opposite of spa_activate(). 1745 */ 1746 static void 1747 spa_deactivate(spa_t *spa) 1748 { 1749 ASSERT(spa->spa_sync_on == B_FALSE); 1750 ASSERT(spa->spa_dsl_pool == NULL); 1751 ASSERT(spa->spa_root_vdev == NULL); 1752 ASSERT(spa->spa_async_zio_root == NULL); 1753 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 1754 1755 spa_evicting_os_wait(spa); 1756 1757 if (spa->spa_zvol_taskq) { 1758 taskq_destroy(spa->spa_zvol_taskq); 1759 spa->spa_zvol_taskq = NULL; 1760 } 1761 1762 if (spa->spa_metaslab_taskq) { 1763 taskq_destroy(spa->spa_metaslab_taskq); 1764 spa->spa_metaslab_taskq = NULL; 1765 } 1766 1767 if (spa->spa_prefetch_taskq) { 1768 taskq_destroy(spa->spa_prefetch_taskq); 1769 spa->spa_prefetch_taskq = NULL; 1770 } 1771 1772 if (spa->spa_upgrade_taskq) { 1773 taskq_destroy(spa->spa_upgrade_taskq); 1774 spa->spa_upgrade_taskq = NULL; 1775 } 1776 1777 txg_list_destroy(&spa->spa_vdev_txg_list); 1778 1779 list_destroy(&spa->spa_config_dirty_list); 1780 list_destroy(&spa->spa_evicting_os_list); 1781 list_destroy(&spa->spa_state_dirty_list); 1782 1783 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); 1784 1785 for (int t = 0; t < ZIO_TYPES; t++) { 1786 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 1787 spa_taskqs_fini(spa, t, q); 1788 } 1789 } 1790 1791 for (size_t i = 0; i < TXG_SIZE; i++) { 1792 ASSERT3P(spa->spa_txg_zio[i], !=, NULL); 1793 VERIFY0(zio_wait(spa->spa_txg_zio[i])); 1794 spa->spa_txg_zio[i] = NULL; 1795 } 1796 1797 metaslab_class_destroy(spa->spa_normal_class); 1798 spa->spa_normal_class = NULL; 1799 1800 metaslab_class_destroy(spa->spa_log_class); 1801 spa->spa_log_class = NULL; 1802 1803 metaslab_class_destroy(spa->spa_embedded_log_class); 1804 spa->spa_embedded_log_class = NULL; 1805 1806 metaslab_class_destroy(spa->spa_special_class); 1807 spa->spa_special_class = NULL; 1808 1809 metaslab_class_destroy(spa->spa_dedup_class); 1810 spa->spa_dedup_class = NULL; 1811 1812 /* 1813 * If this was part of an import or the open otherwise failed, we may 1814 * still have errors left in the queues. Empty them just in case. 
1815 */ 1816 spa_errlog_drain(spa); 1817 avl_destroy(&spa->spa_errlist_scrub); 1818 avl_destroy(&spa->spa_errlist_last); 1819 avl_destroy(&spa->spa_errlist_healed); 1820 1821 spa_keystore_fini(&spa->spa_keystore); 1822 1823 spa->spa_state = POOL_STATE_UNINITIALIZED; 1824 1825 mutex_enter(&spa->spa_proc_lock); 1826 if (spa->spa_proc_state != SPA_PROC_NONE) { 1827 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1828 spa->spa_proc_state = SPA_PROC_DEACTIVATE; 1829 cv_broadcast(&spa->spa_proc_cv); 1830 while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { 1831 ASSERT(spa->spa_proc != &p0); 1832 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1833 } 1834 ASSERT(spa->spa_proc_state == SPA_PROC_GONE); 1835 spa->spa_proc_state = SPA_PROC_NONE; 1836 } 1837 ASSERT(spa->spa_proc == &p0); 1838 mutex_exit(&spa->spa_proc_lock); 1839 1840 /* 1841 * We want to make sure spa_thread() has actually exited the ZFS 1842 * module, so that the module can't be unloaded out from underneath 1843 * it. 1844 */ 1845 if (spa->spa_did != 0) { 1846 thread_join(spa->spa_did); 1847 spa->spa_did = 0; 1848 } 1849 1850 spa_deactivate_os(spa); 1851 1852 } 1853 1854 /* 1855 * Verify a pool configuration, and construct the vdev tree appropriately. This 1856 * will create all the necessary vdevs in the appropriate layout, with each vdev 1857 * in the CLOSED state. This will prep the pool before open/creation/import. 1858 * All vdev validation is done by the vdev_alloc() routine. 1859 */ 1860 int 1861 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 1862 uint_t id, int atype) 1863 { 1864 nvlist_t **child; 1865 uint_t children; 1866 int error; 1867 1868 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 1869 return (error); 1870 1871 if ((*vdp)->vdev_ops->vdev_op_leaf) 1872 return (0); 1873 1874 error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 1875 &child, &children); 1876 1877 if (error == ENOENT) 1878 return (0); 1879 1880 if (error) { 1881 vdev_free(*vdp); 1882 *vdp = NULL; 1883 return (SET_ERROR(EINVAL)); 1884 } 1885 1886 for (int c = 0; c < children; c++) { 1887 vdev_t *vd; 1888 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 1889 atype)) != 0) { 1890 vdev_free(*vdp); 1891 *vdp = NULL; 1892 return (error); 1893 } 1894 } 1895 1896 ASSERT(*vdp != NULL); 1897 1898 return (0); 1899 } 1900 1901 static boolean_t 1902 spa_should_flush_logs_on_unload(spa_t *spa) 1903 { 1904 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) 1905 return (B_FALSE); 1906 1907 if (!spa_writeable(spa)) 1908 return (B_FALSE); 1909 1910 if (!spa->spa_sync_on) 1911 return (B_FALSE); 1912 1913 if (spa_state(spa) != POOL_STATE_EXPORTED) 1914 return (B_FALSE); 1915 1916 if (zfs_keep_log_spacemaps_at_export) 1917 return (B_FALSE); 1918 1919 return (B_TRUE); 1920 } 1921 1922 /* 1923 * Opens a transaction that will set the flag that will instruct 1924 * spa_sync to attempt to flush all the metaslabs for that txg. 
1925 */ 1926 static void 1927 spa_unload_log_sm_flush_all(spa_t *spa) 1928 { 1929 dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 1930 VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); 1931 1932 ASSERT3U(spa->spa_log_flushall_txg, ==, 0); 1933 spa->spa_log_flushall_txg = dmu_tx_get_txg(tx); 1934 1935 dmu_tx_commit(tx); 1936 txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg); 1937 } 1938 1939 static void 1940 spa_unload_log_sm_metadata(spa_t *spa) 1941 { 1942 void *cookie = NULL; 1943 spa_log_sm_t *sls; 1944 log_summary_entry_t *e; 1945 1946 while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg, 1947 &cookie)) != NULL) { 1948 VERIFY0(sls->sls_mscount); 1949 kmem_free(sls, sizeof (spa_log_sm_t)); 1950 } 1951 1952 while ((e = list_remove_head(&spa->spa_log_summary)) != NULL) { 1953 VERIFY0(e->lse_mscount); 1954 kmem_free(e, sizeof (log_summary_entry_t)); 1955 } 1956 1957 spa->spa_unflushed_stats.sus_nblocks = 0; 1958 spa->spa_unflushed_stats.sus_memused = 0; 1959 spa->spa_unflushed_stats.sus_blocklimit = 0; 1960 } 1961 1962 static void 1963 spa_destroy_aux_threads(spa_t *spa) 1964 { 1965 if (spa->spa_condense_zthr != NULL) { 1966 zthr_destroy(spa->spa_condense_zthr); 1967 spa->spa_condense_zthr = NULL; 1968 } 1969 if (spa->spa_checkpoint_discard_zthr != NULL) { 1970 zthr_destroy(spa->spa_checkpoint_discard_zthr); 1971 spa->spa_checkpoint_discard_zthr = NULL; 1972 } 1973 if (spa->spa_livelist_delete_zthr != NULL) { 1974 zthr_destroy(spa->spa_livelist_delete_zthr); 1975 spa->spa_livelist_delete_zthr = NULL; 1976 } 1977 if (spa->spa_livelist_condense_zthr != NULL) { 1978 zthr_destroy(spa->spa_livelist_condense_zthr); 1979 spa->spa_livelist_condense_zthr = NULL; 1980 } 1981 if (spa->spa_raidz_expand_zthr != NULL) { 1982 zthr_destroy(spa->spa_raidz_expand_zthr); 1983 spa->spa_raidz_expand_zthr = NULL; 1984 } 1985 } 1986 1987 /* 1988 * Opposite of spa_load(). 1989 */ 1990 static void 1991 spa_unload(spa_t *spa) 1992 { 1993 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1994 ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED); 1995 1996 spa_import_progress_remove(spa_guid(spa)); 1997 spa_load_note(spa, "UNLOADING"); 1998 1999 spa_wake_waiters(spa); 2000 2001 /* 2002 * If we have set the spa_final_txg, we have already performed the 2003 * tasks below in spa_export_common(). We should not redo it here since 2004 * we delay the final TXGs beyond what spa_final_txg is set at. 2005 */ 2006 if (spa->spa_final_txg == UINT64_MAX) { 2007 /* 2008 * If the log space map feature is enabled and the pool is 2009 * getting exported (but not destroyed), we want to spend some 2010 * time flushing as many metaslabs as we can in an attempt to 2011 * destroy log space maps and save import time. 2012 */ 2013 if (spa_should_flush_logs_on_unload(spa)) 2014 spa_unload_log_sm_flush_all(spa); 2015 2016 /* 2017 * Stop async tasks. 2018 */ 2019 spa_async_suspend(spa); 2020 2021 if (spa->spa_root_vdev) { 2022 vdev_t *root_vdev = spa->spa_root_vdev; 2023 vdev_initialize_stop_all(root_vdev, 2024 VDEV_INITIALIZE_ACTIVE); 2025 vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE); 2026 vdev_autotrim_stop_all(spa); 2027 vdev_rebuild_stop_all(spa); 2028 } 2029 } 2030 2031 /* 2032 * Stop syncing. 2033 */ 2034 if (spa->spa_sync_on) { 2035 txg_sync_stop(spa->spa_dsl_pool); 2036 spa->spa_sync_on = B_FALSE; 2037 } 2038 2039 /* 2040 * This ensures that there is no async metaslab prefetching 2041 * while we attempt to unload the spa. 
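 * The taskq_wait() below blocks until every preload task already dispatched to spa_metaslab_taskq has finished.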
2042 */ 2043 taskq_wait(spa->spa_metaslab_taskq); 2044 2045 if (spa->spa_mmp.mmp_thread) 2046 mmp_thread_stop(spa); 2047 2048 /* 2049 * Wait for any outstanding async I/O to complete. 2050 */ 2051 if (spa->spa_async_zio_root != NULL) { 2052 for (int i = 0; i < max_ncpus; i++) 2053 (void) zio_wait(spa->spa_async_zio_root[i]); 2054 kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *)); 2055 spa->spa_async_zio_root = NULL; 2056 } 2057 2058 if (spa->spa_vdev_removal != NULL) { 2059 spa_vdev_removal_destroy(spa->spa_vdev_removal); 2060 spa->spa_vdev_removal = NULL; 2061 } 2062 2063 spa_destroy_aux_threads(spa); 2064 2065 spa_condense_fini(spa); 2066 2067 bpobj_close(&spa->spa_deferred_bpobj); 2068 2069 spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); 2070 2071 /* 2072 * Close all vdevs. 2073 */ 2074 if (spa->spa_root_vdev) 2075 vdev_free(spa->spa_root_vdev); 2076 ASSERT(spa->spa_root_vdev == NULL); 2077 2078 /* 2079 * Close the dsl pool. 2080 */ 2081 if (spa->spa_dsl_pool) { 2082 dsl_pool_close(spa->spa_dsl_pool); 2083 spa->spa_dsl_pool = NULL; 2084 spa->spa_meta_objset = NULL; 2085 } 2086 2087 ddt_unload(spa); 2088 brt_unload(spa); 2089 spa_unload_log_sm_metadata(spa); 2090 2091 /* 2092 * Drop and purge level 2 cache 2093 */ 2094 spa_l2cache_drop(spa); 2095 2096 if (spa->spa_spares.sav_vdevs) { 2097 for (int i = 0; i < spa->spa_spares.sav_count; i++) 2098 vdev_free(spa->spa_spares.sav_vdevs[i]); 2099 kmem_free(spa->spa_spares.sav_vdevs, 2100 spa->spa_spares.sav_count * sizeof (void *)); 2101 spa->spa_spares.sav_vdevs = NULL; 2102 } 2103 if (spa->spa_spares.sav_config) { 2104 nvlist_free(spa->spa_spares.sav_config); 2105 spa->spa_spares.sav_config = NULL; 2106 } 2107 spa->spa_spares.sav_count = 0; 2108 2109 if (spa->spa_l2cache.sav_vdevs) { 2110 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { 2111 vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); 2112 vdev_free(spa->spa_l2cache.sav_vdevs[i]); 2113 } 2114 kmem_free(spa->spa_l2cache.sav_vdevs, 2115 spa->spa_l2cache.sav_count * sizeof (void *)); 2116 spa->spa_l2cache.sav_vdevs = NULL; 2117 } 2118 if (spa->spa_l2cache.sav_config) { 2119 nvlist_free(spa->spa_l2cache.sav_config); 2120 spa->spa_l2cache.sav_config = NULL; 2121 } 2122 spa->spa_l2cache.sav_count = 0; 2123 2124 spa->spa_async_suspended = 0; 2125 2126 spa->spa_indirect_vdevs_loaded = B_FALSE; 2127 2128 if (spa->spa_comment != NULL) { 2129 spa_strfree(spa->spa_comment); 2130 spa->spa_comment = NULL; 2131 } 2132 if (spa->spa_compatibility != NULL) { 2133 spa_strfree(spa->spa_compatibility); 2134 spa->spa_compatibility = NULL; 2135 } 2136 2137 spa->spa_raidz_expand = NULL; 2138 2139 spa_config_exit(spa, SCL_ALL, spa); 2140 } 2141 2142 /* 2143 * Load (or re-load) the current list of vdevs describing the active spares for 2144 * this pool. When this is called, we have some form of basic information in 2145 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and 2146 * then re-generate a more complete list including status information. 2147 */ 2148 void 2149 spa_load_spares(spa_t *spa) 2150 { 2151 nvlist_t **spares; 2152 uint_t nspares; 2153 int i; 2154 vdev_t *vd, *tvd; 2155 2156 #ifndef _KERNEL 2157 /* 2158 * zdb opens both the current state of the pool and the 2159 * checkpointed state (if present), with a different spa_t. 2160 * 2161 * As spare vdevs are shared among open pools, we skip loading 2162 * them when we load the checkpointed state of the pool. 
2163 */ 2164 if (!spa_writeable(spa)) 2165 return; 2166 #endif 2167 2168 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 2169 2170 /* 2171 * First, close and free any existing spare vdevs. 2172 */ 2173 if (spa->spa_spares.sav_vdevs) { 2174 for (i = 0; i < spa->spa_spares.sav_count; i++) { 2175 vd = spa->spa_spares.sav_vdevs[i]; 2176 2177 /* Undo the call to spa_activate() below */ 2178 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 2179 B_FALSE)) != NULL && tvd->vdev_isspare) 2180 spa_spare_remove(tvd); 2181 vdev_close(vd); 2182 vdev_free(vd); 2183 } 2184 2185 kmem_free(spa->spa_spares.sav_vdevs, 2186 spa->spa_spares.sav_count * sizeof (void *)); 2187 } 2188 2189 if (spa->spa_spares.sav_config == NULL) 2190 nspares = 0; 2191 else 2192 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 2193 ZPOOL_CONFIG_SPARES, &spares, &nspares)); 2194 2195 spa->spa_spares.sav_count = (int)nspares; 2196 spa->spa_spares.sav_vdevs = NULL; 2197 2198 if (nspares == 0) 2199 return; 2200 2201 /* 2202 * Construct the array of vdevs, opening them to get status in the 2203 * process. For each spare, there are potentially two different vdev_t 2204 * structures associated with it: one in the list of spares (used only 2205 * for basic validation purposes) and one in the active vdev 2206 * configuration (if it's spared in). During this phase we open and 2207 * validate each vdev on the spare list. If the vdev also exists in the 2208 * active configuration, then we also mark this vdev as an active spare. 2209 */ 2210 spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *), 2211 KM_SLEEP); 2212 for (i = 0; i < spa->spa_spares.sav_count; i++) { 2213 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 2214 VDEV_ALLOC_SPARE) == 0); 2215 ASSERT(vd != NULL); 2216 2217 spa->spa_spares.sav_vdevs[i] = vd; 2218 2219 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 2220 B_FALSE)) != NULL) { 2221 if (!tvd->vdev_isspare) 2222 spa_spare_add(tvd); 2223 2224 /* 2225 * We only mark the spare active if we were successfully 2226 * able to load the vdev. Otherwise, importing a pool 2227 * with a bad active spare would result in strange 2228 * behavior, because multiple pools would think the spare 2229 * is actively in use. 2230 * 2231 * There is a vulnerability here to an equally bizarre 2232 * circumstance, where a dead active spare is later 2233 * brought back to life (onlined or otherwise). Given 2234 * the rarity of this scenario, and the extra complexity 2235 * it adds, we ignore the possibility. 2236 */ 2237 if (!vdev_is_dead(tvd)) 2238 spa_spare_activate(tvd); 2239 } 2240 2241 vd->vdev_top = vd; 2242 vd->vdev_aux = &spa->spa_spares; 2243 2244 if (vdev_open(vd) != 0) 2245 continue; 2246 2247 if (vdev_validate_aux(vd) == 0) 2248 spa_spare_add(vd); 2249 } 2250 2251 /* 2252 * Recompute the stashed list of spares, with status information 2253 * this time.
2254 */ 2255 fnvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES); 2256 2257 spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), 2258 KM_SLEEP); 2259 for (i = 0; i < spa->spa_spares.sav_count; i++) 2260 spares[i] = vdev_config_generate(spa, 2261 spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); 2262 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 2263 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, 2264 spa->spa_spares.sav_count); 2265 for (i = 0; i < spa->spa_spares.sav_count; i++) 2266 nvlist_free(spares[i]); 2267 kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); 2268 } 2269 2270 /* 2271 * Load (or re-load) the current list of vdevs describing the active l2cache for 2272 * this pool. When this is called, we have some form of basic information in 2273 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and 2274 * then re-generate a more complete list including status information. 2275 * Devices which are already active have their details maintained, and are 2276 * not re-opened. 2277 */ 2278 void 2279 spa_load_l2cache(spa_t *spa) 2280 { 2281 nvlist_t **l2cache = NULL; 2282 uint_t nl2cache; 2283 int i, j, oldnvdevs; 2284 uint64_t guid; 2285 vdev_t *vd, **oldvdevs, **newvdevs; 2286 spa_aux_vdev_t *sav = &spa->spa_l2cache; 2287 2288 #ifndef _KERNEL 2289 /* 2290 * zdb opens both the current state of the pool and the 2291 * checkpointed state (if present), with a different spa_t. 2292 * 2293 * As L2 caches are part of the ARC which is shared among open 2294 * pools, we skip loading them when we load the checkpointed 2295 * state of the pool. 2296 */ 2297 if (!spa_writeable(spa)) 2298 return; 2299 #endif 2300 2301 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 2302 2303 oldvdevs = sav->sav_vdevs; 2304 oldnvdevs = sav->sav_count; 2305 sav->sav_vdevs = NULL; 2306 sav->sav_count = 0; 2307 2308 if (sav->sav_config == NULL) { 2309 nl2cache = 0; 2310 newvdevs = NULL; 2311 goto out; 2312 } 2313 2314 VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, 2315 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); 2316 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 2317 2318 /* 2319 * Process new nvlist of vdevs. 2320 */ 2321 for (i = 0; i < nl2cache; i++) { 2322 guid = fnvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID); 2323 2324 newvdevs[i] = NULL; 2325 for (j = 0; j < oldnvdevs; j++) { 2326 vd = oldvdevs[j]; 2327 if (vd != NULL && guid == vd->vdev_guid) { 2328 /* 2329 * Retain previous vdev for add/remove ops. 2330 */ 2331 newvdevs[i] = vd; 2332 oldvdevs[j] = NULL; 2333 break; 2334 } 2335 } 2336 2337 if (newvdevs[i] == NULL) { 2338 /* 2339 * Create new vdev 2340 */ 2341 VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 2342 VDEV_ALLOC_L2CACHE) == 0); 2343 ASSERT(vd != NULL); 2344 newvdevs[i] = vd; 2345 2346 /* 2347 * Commit this vdev as an l2cache device, 2348 * even if it fails to open. 2349 */ 2350 spa_l2cache_add(vd); 2351 2352 vd->vdev_top = vd; 2353 vd->vdev_aux = sav; 2354 2355 spa_l2cache_activate(vd); 2356 2357 if (vdev_open(vd) != 0) 2358 continue; 2359 2360 (void) vdev_validate_aux(vd); 2361 2362 if (!vdev_is_dead(vd)) 2363 l2arc_add_vdev(spa, vd); 2364 2365 /* 2366 * Upon cache device addition to a pool or pool 2367 * creation with a cache device or if the header 2368 * of the device is invalid we issue an async 2369 * TRIM command for the whole device which will 2370 * execute if l2arc_trim_ahead > 0. 
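 * spa_async_request() below only records the request; the TRIM itself is carried out later by the pool's async task processing.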
2371 */ 2372 spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); 2373 } 2374 } 2375 2376 sav->sav_vdevs = newvdevs; 2377 sav->sav_count = (int)nl2cache; 2378 2379 /* 2380 * Recompute the stashed list of l2cache devices, with status 2381 * information this time. 2382 */ 2383 fnvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE); 2384 2385 if (sav->sav_count > 0) 2386 l2cache = kmem_alloc(sav->sav_count * sizeof (void *), 2387 KM_SLEEP); 2388 for (i = 0; i < sav->sav_count; i++) 2389 l2cache[i] = vdev_config_generate(spa, 2390 sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); 2391 fnvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 2392 (const nvlist_t * const *)l2cache, sav->sav_count); 2393 2394 out: 2395 /* 2396 * Purge vdevs that were dropped 2397 */ 2398 if (oldvdevs) { 2399 for (i = 0; i < oldnvdevs; i++) { 2400 uint64_t pool; 2401 2402 vd = oldvdevs[i]; 2403 if (vd != NULL) { 2404 ASSERT(vd->vdev_isl2cache); 2405 2406 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 2407 pool != 0ULL && l2arc_vdev_present(vd)) 2408 l2arc_remove_vdev(vd); 2409 vdev_clear_stats(vd); 2410 vdev_free(vd); 2411 } 2412 } 2413 2414 kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 2415 } 2416 2417 for (i = 0; i < sav->sav_count; i++) 2418 nvlist_free(l2cache[i]); 2419 if (sav->sav_count) 2420 kmem_free(l2cache, sav->sav_count * sizeof (void *)); 2421 } 2422 2423 static int 2424 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 2425 { 2426 dmu_buf_t *db; 2427 char *packed = NULL; 2428 size_t nvsize = 0; 2429 int error; 2430 *value = NULL; 2431 2432 error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db); 2433 if (error) 2434 return (error); 2435 2436 nvsize = *(uint64_t *)db->db_data; 2437 dmu_buf_rele(db, FTAG); 2438 2439 packed = vmem_alloc(nvsize, KM_SLEEP); 2440 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, 2441 DMU_READ_PREFETCH); 2442 if (error == 0) 2443 error = nvlist_unpack(packed, nvsize, value, 0); 2444 vmem_free(packed, nvsize); 2445 2446 return (error); 2447 } 2448 2449 /* 2450 * Concrete top-level vdevs that are not missing and are not logs. At every 2451 * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds. 2452 */ 2453 static uint64_t 2454 spa_healthy_core_tvds(spa_t *spa) 2455 { 2456 vdev_t *rvd = spa->spa_root_vdev; 2457 uint64_t tvds = 0; 2458 2459 for (uint64_t i = 0; i < rvd->vdev_children; i++) { 2460 vdev_t *vd = rvd->vdev_child[i]; 2461 if (vd->vdev_islog) 2462 continue; 2463 if (vdev_is_concrete(vd) && !vdev_is_dead(vd)) 2464 tvds++; 2465 } 2466 2467 return (tvds); 2468 } 2469 2470 /* 2471 * Checks to see if the given vdev could not be opened, in which case we post a 2472 * sysevent to notify the autoreplace code that the device has been removed. 2473 */ 2474 static void 2475 spa_check_removed(vdev_t *vd) 2476 { 2477 for (uint64_t c = 0; c < vd->vdev_children; c++) 2478 spa_check_removed(vd->vdev_child[c]); 2479 2480 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && 2481 vdev_is_concrete(vd)) { 2482 zfs_post_autoreplace(vd->vdev_spa, vd); 2483 spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK); 2484 } 2485 } 2486 2487 static int 2488 spa_check_for_missing_logs(spa_t *spa) 2489 { 2490 vdev_t *rvd = spa->spa_root_vdev; 2491 2492 /* 2493 * If we're doing a normal import, then build up any additional 2494 * diagnostic information about missing log devices. 2495 * We'll pass this up to the user for further processing. 
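 * Concretely, a config nvlist is generated below for each missing log vdev and the set is attached to spa_load_info as ZPOOL_CONFIG_MISSING_DEVICES so userland can report it.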
2496 */ 2497 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { 2498 nvlist_t **child, *nv; 2499 uint64_t idx = 0; 2500 2501 child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t *), 2502 KM_SLEEP); 2503 nv = fnvlist_alloc(); 2504 2505 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 2506 vdev_t *tvd = rvd->vdev_child[c]; 2507 2508 /* 2509 * We consider a device as missing only if it failed 2510 * to open (i.e. offline or faulted is not considered 2511 * as missing). 2512 */ 2513 if (tvd->vdev_islog && 2514 tvd->vdev_state == VDEV_STATE_CANT_OPEN) { 2515 child[idx++] = vdev_config_generate(spa, tvd, 2516 B_FALSE, VDEV_CONFIG_MISSING); 2517 } 2518 } 2519 2520 if (idx > 0) { 2521 fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 2522 (const nvlist_t * const *)child, idx); 2523 fnvlist_add_nvlist(spa->spa_load_info, 2524 ZPOOL_CONFIG_MISSING_DEVICES, nv); 2525 2526 for (uint64_t i = 0; i < idx; i++) 2527 nvlist_free(child[i]); 2528 } 2529 nvlist_free(nv); 2530 kmem_free(child, rvd->vdev_children * sizeof (char **)); 2531 2532 if (idx > 0) { 2533 spa_load_failed(spa, "some log devices are missing"); 2534 vdev_dbgmsg_print_tree(rvd, 2); 2535 return (SET_ERROR(ENXIO)); 2536 } 2537 } else { 2538 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 2539 vdev_t *tvd = rvd->vdev_child[c]; 2540 2541 if (tvd->vdev_islog && 2542 tvd->vdev_state == VDEV_STATE_CANT_OPEN) { 2543 spa_set_log_state(spa, SPA_LOG_CLEAR); 2544 spa_load_note(spa, "some log devices are " 2545 "missing, ZIL is dropped."); 2546 vdev_dbgmsg_print_tree(rvd, 2); 2547 break; 2548 } 2549 } 2550 } 2551 2552 return (0); 2553 } 2554 2555 /* 2556 * Check for missing log devices 2557 */ 2558 static boolean_t 2559 spa_check_logs(spa_t *spa) 2560 { 2561 boolean_t rv = B_FALSE; 2562 dsl_pool_t *dp = spa_get_dsl(spa); 2563 2564 switch (spa->spa_log_state) { 2565 default: 2566 break; 2567 case SPA_LOG_MISSING: 2568 /* need to recheck in case slog has been restored */ 2569 case SPA_LOG_UNKNOWN: 2570 rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 2571 zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0); 2572 if (rv) 2573 spa_set_log_state(spa, SPA_LOG_MISSING); 2574 break; 2575 } 2576 return (rv); 2577 } 2578 2579 /* 2580 * Passivate any log vdevs (note, does not apply to embedded log metaslabs). 2581 */ 2582 static boolean_t 2583 spa_passivate_log(spa_t *spa) 2584 { 2585 vdev_t *rvd = spa->spa_root_vdev; 2586 boolean_t slog_found = B_FALSE; 2587 2588 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 2589 2590 for (int c = 0; c < rvd->vdev_children; c++) { 2591 vdev_t *tvd = rvd->vdev_child[c]; 2592 2593 if (tvd->vdev_islog) { 2594 ASSERT3P(tvd->vdev_log_mg, ==, NULL); 2595 metaslab_group_passivate(tvd->vdev_mg); 2596 slog_found = B_TRUE; 2597 } 2598 } 2599 2600 return (slog_found); 2601 } 2602 2603 /* 2604 * Activate any log vdevs (note, does not apply to embedded log metaslabs). 
2605 */ 2606 static void 2607 spa_activate_log(spa_t *spa) 2608 { 2609 vdev_t *rvd = spa->spa_root_vdev; 2610 2611 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 2612 2613 for (int c = 0; c < rvd->vdev_children; c++) { 2614 vdev_t *tvd = rvd->vdev_child[c]; 2615 2616 if (tvd->vdev_islog) { 2617 ASSERT3P(tvd->vdev_log_mg, ==, NULL); 2618 metaslab_group_activate(tvd->vdev_mg); 2619 } 2620 } 2621 } 2622 2623 int 2624 spa_reset_logs(spa_t *spa) 2625 { 2626 int error; 2627 2628 error = dmu_objset_find(spa_name(spa), zil_reset, 2629 NULL, DS_FIND_CHILDREN); 2630 if (error == 0) { 2631 /* 2632 * We successfully offlined the log device, sync out the 2633 * current txg so that the "stubby" block can be removed 2634 * by zil_sync(). 2635 */ 2636 txg_wait_synced(spa->spa_dsl_pool, 0); 2637 } 2638 return (error); 2639 } 2640 2641 static void 2642 spa_aux_check_removed(spa_aux_vdev_t *sav) 2643 { 2644 for (int i = 0; i < sav->sav_count; i++) 2645 spa_check_removed(sav->sav_vdevs[i]); 2646 } 2647 2648 void 2649 spa_claim_notify(zio_t *zio) 2650 { 2651 spa_t *spa = zio->io_spa; 2652 2653 if (zio->io_error) 2654 return; 2655 2656 mutex_enter(&spa->spa_props_lock); /* any mutex will do */ 2657 if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) 2658 spa->spa_claim_max_txg = zio->io_bp->blk_birth; 2659 mutex_exit(&spa->spa_props_lock); 2660 } 2661 2662 typedef struct spa_load_error { 2663 boolean_t sle_verify_data; 2664 uint64_t sle_meta_count; 2665 uint64_t sle_data_count; 2666 } spa_load_error_t; 2667 2668 static void 2669 spa_load_verify_done(zio_t *zio) 2670 { 2671 blkptr_t *bp = zio->io_bp; 2672 spa_load_error_t *sle = zio->io_private; 2673 dmu_object_type_t type = BP_GET_TYPE(bp); 2674 int error = zio->io_error; 2675 spa_t *spa = zio->io_spa; 2676 2677 abd_free(zio->io_abd); 2678 if (error) { 2679 if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && 2680 type != DMU_OT_INTENT_LOG) 2681 atomic_inc_64(&sle->sle_meta_count); 2682 else 2683 atomic_inc_64(&sle->sle_data_count); 2684 } 2685 2686 mutex_enter(&spa->spa_scrub_lock); 2687 spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp); 2688 cv_broadcast(&spa->spa_scrub_io_cv); 2689 mutex_exit(&spa->spa_scrub_lock); 2690 } 2691 2692 /* 2693 * Maximum number of inflight bytes is the log2 fraction of the arc size. 2694 * By default, we set it to 1/16th of the arc. 2695 */ 2696 static uint_t spa_load_verify_shift = 4; 2697 static int spa_load_verify_metadata = B_TRUE; 2698 static int spa_load_verify_data = B_TRUE; 2699 2700 static int 2701 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 2702 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 2703 { 2704 zio_t *rio = arg; 2705 spa_load_error_t *sle = rio->io_private; 2706 2707 (void) zilog, (void) dnp; 2708 2709 /* 2710 * Note: normally this routine will not be called if 2711 * spa_load_verify_metadata is not set. However, it may be useful 2712 * to manually set the flag after the traversal has begun. 2713 */ 2714 if (!spa_load_verify_metadata) 2715 return (0); 2716 2717 /* 2718 * Sanity check the block pointer in order to detect obvious damage 2719 * before using the contents in subsequent checks or in zio_read(). 2720 * When damaged consider it to be a metadata error since we cannot 2721 * trust the BP_GET_TYPE and BP_GET_LEVEL values. 
2722 */ 2723 if (!zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { 2724 atomic_inc_64(&sle->sle_meta_count); 2725 return (0); 2726 } 2727 2728 if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || 2729 BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp)) 2730 return (0); 2731 2732 if (!BP_IS_METADATA(bp) && 2733 (!spa_load_verify_data || !sle->sle_verify_data)) 2734 return (0); 2735 2736 uint64_t maxinflight_bytes = 2737 arc_target_bytes() >> spa_load_verify_shift; 2738 size_t size = BP_GET_PSIZE(bp); 2739 2740 mutex_enter(&spa->spa_scrub_lock); 2741 while (spa->spa_load_verify_bytes >= maxinflight_bytes) 2742 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2743 spa->spa_load_verify_bytes += size; 2744 mutex_exit(&spa->spa_scrub_lock); 2745 2746 zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size, 2747 spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, 2748 ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | 2749 ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); 2750 return (0); 2751 } 2752 2753 static int 2754 verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) 2755 { 2756 (void) dp, (void) arg; 2757 2758 if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN) 2759 return (SET_ERROR(ENAMETOOLONG)); 2760 2761 return (0); 2762 } 2763 2764 static int 2765 spa_load_verify(spa_t *spa) 2766 { 2767 zio_t *rio; 2768 spa_load_error_t sle = { 0 }; 2769 zpool_load_policy_t policy; 2770 boolean_t verify_ok = B_FALSE; 2771 int error = 0; 2772 2773 zpool_get_load_policy(spa->spa_config, &policy); 2774 2775 if (policy.zlp_rewind & ZPOOL_NEVER_REWIND || 2776 policy.zlp_maxmeta == UINT64_MAX) 2777 return (0); 2778 2779 dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); 2780 error = dmu_objset_find_dp(spa->spa_dsl_pool, 2781 spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL, 2782 DS_FIND_CHILDREN); 2783 dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); 2784 if (error != 0) 2785 return (error); 2786 2787 /* 2788 * Verify data only if we are rewinding or the error limit was set. 2789 * Otherwise nothing except dbgmsg cares about it; don't waste the time. 2790 */ 2791 sle.sle_verify_data = (policy.zlp_rewind & ZPOOL_REWIND_MASK) || 2792 (policy.zlp_maxdata < UINT64_MAX); 2793 2794 rio = zio_root(spa, NULL, &sle, 2795 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 2796 2797 if (spa_load_verify_metadata) { 2798 if (spa->spa_extreme_rewind) { 2799 spa_load_note(spa, "performing a complete scan of the " 2800 "pool since extreme rewind is on. 
This may take " 2801 "a very long time.\n (spa_load_verify_data=%u, " 2802 "spa_load_verify_metadata=%u)", 2803 spa_load_verify_data, spa_load_verify_metadata); 2804 } 2805 2806 error = traverse_pool(spa, spa->spa_verify_min_txg, 2807 TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | 2808 TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio); 2809 } 2810 2811 (void) zio_wait(rio); 2812 ASSERT0(spa->spa_load_verify_bytes); 2813 2814 spa->spa_load_meta_errors = sle.sle_meta_count; 2815 spa->spa_load_data_errors = sle.sle_data_count; 2816 2817 if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) { 2818 spa_load_note(spa, "spa_load_verify found %llu metadata errors " 2819 "and %llu data errors", (u_longlong_t)sle.sle_meta_count, 2820 (u_longlong_t)sle.sle_data_count); 2821 } 2822 2823 if (spa_load_verify_dryrun || 2824 (!error && sle.sle_meta_count <= policy.zlp_maxmeta && 2825 sle.sle_data_count <= policy.zlp_maxdata)) { 2826 int64_t loss = 0; 2827 2828 verify_ok = B_TRUE; 2829 spa->spa_load_txg = spa->spa_uberblock.ub_txg; 2830 spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; 2831 2832 loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; 2833 fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_TIME, 2834 spa->spa_load_txg_ts); 2835 fnvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME, 2836 loss); 2837 fnvlist_add_uint64(spa->spa_load_info, 2838 ZPOOL_CONFIG_LOAD_META_ERRORS, sle.sle_meta_count); 2839 fnvlist_add_uint64(spa->spa_load_info, 2840 ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count); 2841 } else { 2842 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; 2843 } 2844 2845 if (spa_load_verify_dryrun) 2846 return (0); 2847 2848 if (error) { 2849 if (error != ENXIO && error != EIO) 2850 error = SET_ERROR(EIO); 2851 return (error); 2852 } 2853 2854 return (verify_ok ? 0 : EIO); 2855 } 2856 2857 /* 2858 * Find a value in the pool props object. 2859 */ 2860 static void 2861 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 2862 { 2863 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 2864 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 2865 } 2866 2867 /* 2868 * Find a value in the pool directory object. 
2869 */ 2870 static int 2871 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent) 2872 { 2873 int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2874 name, sizeof (uint64_t), 1, val); 2875 2876 if (error != 0 && (error != ENOENT || log_enoent)) { 2877 spa_load_failed(spa, "couldn't get '%s' value in MOS directory " 2878 "[error=%d]", name, error); 2879 } 2880 2881 return (error); 2882 } 2883 2884 static int 2885 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 2886 { 2887 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 2888 return (SET_ERROR(err)); 2889 } 2890 2891 boolean_t 2892 spa_livelist_delete_check(spa_t *spa) 2893 { 2894 return (spa->spa_livelists_to_delete != 0); 2895 } 2896 2897 static boolean_t 2898 spa_livelist_delete_cb_check(void *arg, zthr_t *z) 2899 { 2900 (void) z; 2901 spa_t *spa = arg; 2902 return (spa_livelist_delete_check(spa)); 2903 } 2904 2905 static int 2906 delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 2907 { 2908 spa_t *spa = arg; 2909 zio_free(spa, tx->tx_txg, bp); 2910 dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, 2911 -bp_get_dsize_sync(spa, bp), 2912 -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); 2913 return (0); 2914 } 2915 2916 static int 2917 dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp) 2918 { 2919 int err; 2920 zap_cursor_t zc; 2921 zap_attribute_t za; 2922 zap_cursor_init(&zc, os, zap_obj); 2923 err = zap_cursor_retrieve(&zc, &za); 2924 zap_cursor_fini(&zc); 2925 if (err == 0) 2926 *llp = za.za_first_integer; 2927 return (err); 2928 } 2929 2930 /* 2931 * Components of livelist deletion that must be performed in syncing 2932 * context: freeing block pointers and updating the pool-wide data 2933 * structures to indicate how much work is left to do 2934 */ 2935 typedef struct sublist_delete_arg { 2936 spa_t *spa; 2937 dsl_deadlist_t *ll; 2938 uint64_t key; 2939 bplist_t *to_free; 2940 } sublist_delete_arg_t; 2941 2942 static void 2943 sublist_delete_sync(void *arg, dmu_tx_t *tx) 2944 { 2945 sublist_delete_arg_t *sda = arg; 2946 spa_t *spa = sda->spa; 2947 dsl_deadlist_t *ll = sda->ll; 2948 uint64_t key = sda->key; 2949 bplist_t *to_free = sda->to_free; 2950 2951 bplist_iterate(to_free, delete_blkptr_cb, spa, tx); 2952 dsl_deadlist_remove_entry(ll, key, tx); 2953 } 2954 2955 typedef struct livelist_delete_arg { 2956 spa_t *spa; 2957 uint64_t ll_obj; 2958 uint64_t zap_obj; 2959 } livelist_delete_arg_t; 2960 2961 static void 2962 livelist_delete_sync(void *arg, dmu_tx_t *tx) 2963 { 2964 livelist_delete_arg_t *lda = arg; 2965 spa_t *spa = lda->spa; 2966 uint64_t ll_obj = lda->ll_obj; 2967 uint64_t zap_obj = lda->zap_obj; 2968 objset_t *mos = spa->spa_meta_objset; 2969 uint64_t count; 2970 2971 /* free the livelist and decrement the feature count */ 2972 VERIFY0(zap_remove_int(mos, zap_obj, ll_obj, tx)); 2973 dsl_deadlist_free(mos, ll_obj, tx); 2974 spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx); 2975 VERIFY0(zap_count(mos, zap_obj, &count)); 2976 if (count == 0) { 2977 /* no more livelists to delete */ 2978 VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, 2979 DMU_POOL_DELETED_CLONES, tx)); 2980 VERIFY0(zap_destroy(mos, zap_obj, tx)); 2981 spa->spa_livelists_to_delete = 0; 2982 spa_notify_waiters(spa); 2983 } 2984 } 2985 2986 /* 2987 * Load in the value for the livelist to be removed and open it. Then, 2988 * load its first sublist and determine which block pointers should actually 2989 * be freed. 
Then, call a synctask which performs the actual frees and updates 2990 * the pool-wide livelist data. 2991 */ 2992 static void 2993 spa_livelist_delete_cb(void *arg, zthr_t *z) 2994 { 2995 spa_t *spa = arg; 2996 uint64_t ll_obj = 0, count; 2997 objset_t *mos = spa->spa_meta_objset; 2998 uint64_t zap_obj = spa->spa_livelists_to_delete; 2999 /* 3000 * Determine the next livelist to delete. This function should only 3001 * be called if there is at least one deleted clone. 3002 */ 3003 VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj)); 3004 VERIFY0(zap_count(mos, ll_obj, &count)); 3005 if (count > 0) { 3006 dsl_deadlist_t *ll; 3007 dsl_deadlist_entry_t *dle; 3008 bplist_t to_free; 3009 ll = kmem_zalloc(sizeof (dsl_deadlist_t), KM_SLEEP); 3010 dsl_deadlist_open(ll, mos, ll_obj); 3011 dle = dsl_deadlist_first(ll); 3012 ASSERT3P(dle, !=, NULL); 3013 bplist_create(&to_free); 3014 int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free, 3015 z, NULL); 3016 if (err == 0) { 3017 sublist_delete_arg_t sync_arg = { 3018 .spa = spa, 3019 .ll = ll, 3020 .key = dle->dle_mintxg, 3021 .to_free = &to_free 3022 }; 3023 zfs_dbgmsg("deleting sublist (id %llu) from" 3024 " livelist %llu, %lld remaining", 3025 (u_longlong_t)dle->dle_bpobj.bpo_object, 3026 (u_longlong_t)ll_obj, (longlong_t)count - 1); 3027 VERIFY0(dsl_sync_task(spa_name(spa), NULL, 3028 sublist_delete_sync, &sync_arg, 0, 3029 ZFS_SPACE_CHECK_DESTROY)); 3030 } else { 3031 VERIFY3U(err, ==, EINTR); 3032 } 3033 bplist_clear(&to_free); 3034 bplist_destroy(&to_free); 3035 dsl_deadlist_close(ll); 3036 kmem_free(ll, sizeof (dsl_deadlist_t)); 3037 } else { 3038 livelist_delete_arg_t sync_arg = { 3039 .spa = spa, 3040 .ll_obj = ll_obj, 3041 .zap_obj = zap_obj 3042 }; 3043 zfs_dbgmsg("deletion of livelist %llu completed", 3044 (u_longlong_t)ll_obj); 3045 VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync, 3046 &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY)); 3047 } 3048 } 3049 3050 static void 3051 spa_start_livelist_destroy_thread(spa_t *spa) 3052 { 3053 ASSERT3P(spa->spa_livelist_delete_zthr, ==, NULL); 3054 spa->spa_livelist_delete_zthr = 3055 zthr_create("z_livelist_destroy", 3056 spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa, 3057 minclsyspri); 3058 } 3059 3060 typedef struct livelist_new_arg { 3061 bplist_t *allocs; 3062 bplist_t *frees; 3063 } livelist_new_arg_t; 3064 3065 static int 3066 livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 3067 dmu_tx_t *tx) 3068 { 3069 ASSERT(tx == NULL); 3070 livelist_new_arg_t *lna = arg; 3071 if (bp_freed) { 3072 bplist_append(lna->frees, bp); 3073 } else { 3074 bplist_append(lna->allocs, bp); 3075 zfs_livelist_condense_new_alloc++; 3076 } 3077 return (0); 3078 } 3079 3080 typedef struct livelist_condense_arg { 3081 spa_t *spa; 3082 bplist_t to_keep; 3083 uint64_t first_size; 3084 uint64_t next_size; 3085 } livelist_condense_arg_t; 3086 3087 static void 3088 spa_livelist_condense_sync(void *arg, dmu_tx_t *tx) 3089 { 3090 livelist_condense_arg_t *lca = arg; 3091 spa_t *spa = lca->spa; 3092 bplist_t new_frees; 3093 dsl_dataset_t *ds = spa->spa_to_condense.ds; 3094 3095 /* Have we been cancelled? 
*/ 3096 if (spa->spa_to_condense.cancelled) { 3097 zfs_livelist_condense_sync_cancel++; 3098 goto out; 3099 } 3100 3101 dsl_deadlist_entry_t *first = spa->spa_to_condense.first; 3102 dsl_deadlist_entry_t *next = spa->spa_to_condense.next; 3103 dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist; 3104 3105 /* 3106 * It's possible that the livelist was changed while the zthr was 3107 * running. Therefore, we need to check for new blkptrs in the two 3108 * entries being condensed and continue to track them in the livelist. 3109 * Because of the way we handle remapped blkptrs (see dbuf_remap_impl), 3110 * it's possible that the newly added blkptrs are FREEs or ALLOCs so 3111 * we need to sort them into two different bplists. 3112 */ 3113 uint64_t first_obj = first->dle_bpobj.bpo_object; 3114 uint64_t next_obj = next->dle_bpobj.bpo_object; 3115 uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs; 3116 uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs; 3117 3118 bplist_create(&new_frees); 3119 livelist_new_arg_t new_bps = { 3120 .allocs = &lca->to_keep, 3121 .frees = &new_frees, 3122 }; 3123 3124 if (cur_first_size > lca->first_size) { 3125 VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj, 3126 livelist_track_new_cb, &new_bps, lca->first_size)); 3127 } 3128 if (cur_next_size > lca->next_size) { 3129 VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj, 3130 livelist_track_new_cb, &new_bps, lca->next_size)); 3131 } 3132 3133 dsl_deadlist_clear_entry(first, ll, tx); 3134 ASSERT(bpobj_is_empty(&first->dle_bpobj)); 3135 dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx); 3136 3137 bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx); 3138 bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx); 3139 bplist_destroy(&new_frees); 3140 3141 char dsname[ZFS_MAX_DATASET_NAME_LEN]; 3142 dsl_dataset_name(ds, dsname); 3143 zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu " 3144 "(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu " 3145 "(%llu blkptrs)", (u_longlong_t)tx->tx_txg, dsname, 3146 (u_longlong_t)ds->ds_object, (u_longlong_t)first_obj, 3147 (u_longlong_t)cur_first_size, (u_longlong_t)next_obj, 3148 (u_longlong_t)cur_next_size, 3149 (u_longlong_t)first->dle_bpobj.bpo_object, 3150 (u_longlong_t)first->dle_bpobj.bpo_phys->bpo_num_blkptrs); 3151 out: 3152 dmu_buf_rele(ds->ds_dbuf, spa); 3153 spa->spa_to_condense.ds = NULL; 3154 bplist_clear(&lca->to_keep); 3155 bplist_destroy(&lca->to_keep); 3156 kmem_free(lca, sizeof (livelist_condense_arg_t)); 3157 spa->spa_to_condense.syncing = B_FALSE; 3158 } 3159 3160 static void 3161 spa_livelist_condense_cb(void *arg, zthr_t *t) 3162 { 3163 while (zfs_livelist_condense_zthr_pause && 3164 !(zthr_has_waiters(t) || zthr_iscancelled(t))) 3165 delay(1); 3166 3167 spa_t *spa = arg; 3168 dsl_deadlist_entry_t *first = spa->spa_to_condense.first; 3169 dsl_deadlist_entry_t *next = spa->spa_to_condense.next; 3170 uint64_t first_size, next_size; 3171 3172 livelist_condense_arg_t *lca = 3173 kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP); 3174 bplist_create(&lca->to_keep); 3175 3176 /* 3177 * Process the livelists (matching FREEs and ALLOCs) in open context 3178 * so we have minimal work in syncing context to condense. 3179 * 3180 * We save bpobj sizes (first_size and next_size) to use later in 3181 * syncing context to determine if entries were added to these sublists 3182 * while in open context. 
This is possible because the clone is still 3183 * active and open for normal writes and we want to make sure the new, 3184 * unprocessed blockpointers are inserted into the livelist normally. 3185 * 3186 * Note that dsl_process_sub_livelist() both stores the size number of 3187 * blockpointers and iterates over them while the bpobj's lock held, so 3188 * the sizes returned to us are consistent which what was actually 3189 * processed. 3190 */ 3191 int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t, 3192 &first_size); 3193 if (err == 0) 3194 err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep, 3195 t, &next_size); 3196 3197 if (err == 0) { 3198 while (zfs_livelist_condense_sync_pause && 3199 !(zthr_has_waiters(t) || zthr_iscancelled(t))) 3200 delay(1); 3201 3202 dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 3203 dmu_tx_mark_netfree(tx); 3204 dmu_tx_hold_space(tx, 1); 3205 err = dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE); 3206 if (err == 0) { 3207 /* 3208 * Prevent the condense zthr restarting before 3209 * the synctask completes. 3210 */ 3211 spa->spa_to_condense.syncing = B_TRUE; 3212 lca->spa = spa; 3213 lca->first_size = first_size; 3214 lca->next_size = next_size; 3215 dsl_sync_task_nowait(spa_get_dsl(spa), 3216 spa_livelist_condense_sync, lca, tx); 3217 dmu_tx_commit(tx); 3218 return; 3219 } 3220 } 3221 /* 3222 * Condensing can not continue: either it was externally stopped or 3223 * we were unable to assign to a tx because the pool has run out of 3224 * space. In the second case, we'll just end up trying to condense 3225 * again in a later txg. 3226 */ 3227 ASSERT(err != 0); 3228 bplist_clear(&lca->to_keep); 3229 bplist_destroy(&lca->to_keep); 3230 kmem_free(lca, sizeof (livelist_condense_arg_t)); 3231 dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa); 3232 spa->spa_to_condense.ds = NULL; 3233 if (err == EINTR) 3234 zfs_livelist_condense_zthr_cancel++; 3235 } 3236 3237 /* 3238 * Check that there is something to condense but that a condense is not 3239 * already in progress and that condensing has not been cancelled. 
3240 */ 3241 static boolean_t 3242 spa_livelist_condense_cb_check(void *arg, zthr_t *z) 3243 { 3244 (void) z; 3245 spa_t *spa = arg; 3246 if ((spa->spa_to_condense.ds != NULL) && 3247 (spa->spa_to_condense.syncing == B_FALSE) && 3248 (spa->spa_to_condense.cancelled == B_FALSE)) { 3249 return (B_TRUE); 3250 } 3251 return (B_FALSE); 3252 } 3253 3254 static void 3255 spa_start_livelist_condensing_thread(spa_t *spa) 3256 { 3257 spa->spa_to_condense.ds = NULL; 3258 spa->spa_to_condense.first = NULL; 3259 spa->spa_to_condense.next = NULL; 3260 spa->spa_to_condense.syncing = B_FALSE; 3261 spa->spa_to_condense.cancelled = B_FALSE; 3262 3263 ASSERT3P(spa->spa_livelist_condense_zthr, ==, NULL); 3264 spa->spa_livelist_condense_zthr = 3265 zthr_create("z_livelist_condense", 3266 spa_livelist_condense_cb_check, 3267 spa_livelist_condense_cb, spa, minclsyspri); 3268 } 3269 3270 static void 3271 spa_spawn_aux_threads(spa_t *spa) 3272 { 3273 ASSERT(spa_writeable(spa)); 3274 3275 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 3276 3277 spa_start_raidz_expansion_thread(spa); 3278 spa_start_indirect_condensing_thread(spa); 3279 spa_start_livelist_destroy_thread(spa); 3280 spa_start_livelist_condensing_thread(spa); 3281 3282 ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL); 3283 spa->spa_checkpoint_discard_zthr = 3284 zthr_create("z_checkpoint_discard", 3285 spa_checkpoint_discard_thread_check, 3286 spa_checkpoint_discard_thread, spa, minclsyspri); 3287 } 3288 3289 /* 3290 * Fix up config after a partly-completed split. This is done with the 3291 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 3292 * pool have that entry in their config, but only the splitting one contains 3293 * a list of all the guids of the vdevs that are being split off. 3294 * 3295 * This function determines what to do with that list: either rejoin 3296 * all the disks to the pool, or complete the splitting process. To attempt 3297 * the rejoin, each disk that is offlined is marked online again, and 3298 * we do a reopen() call. If the vdev label for every disk that was 3299 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 3300 * then we call vdev_split() on each disk, and complete the split. 3301 * 3302 * Otherwise we leave the config alone, with all the vdevs in place in 3303 * the original pool. 3304 */ 3305 static void 3306 spa_try_repair(spa_t *spa, nvlist_t *config) 3307 { 3308 uint_t extracted; 3309 uint64_t *glist; 3310 uint_t i, gcount; 3311 nvlist_t *nvl; 3312 vdev_t **vd; 3313 boolean_t attempt_reopen; 3314 3315 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 3316 return; 3317 3318 /* check that the config is complete */ 3319 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 3320 &glist, &gcount) != 0) 3321 return; 3322 3323 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 3324 3325 /* attempt to online all the vdevs & validate */ 3326 attempt_reopen = B_TRUE; 3327 for (i = 0; i < gcount; i++) { 3328 if (glist[i] == 0) /* vdev is hole */ 3329 continue; 3330 3331 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 3332 if (vd[i] == NULL) { 3333 /* 3334 * Don't bother attempting to reopen the disks; 3335 * just do the split. 
3336 */ 3337 attempt_reopen = B_FALSE; 3338 } else { 3339 /* attempt to re-online it */ 3340 vd[i]->vdev_offline = B_FALSE; 3341 } 3342 } 3343 3344 if (attempt_reopen) { 3345 vdev_reopen(spa->spa_root_vdev); 3346 3347 /* check each device to see what state it's in */ 3348 for (extracted = 0, i = 0; i < gcount; i++) { 3349 if (vd[i] != NULL && 3350 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 3351 break; 3352 ++extracted; 3353 } 3354 } 3355 3356 /* 3357 * If every disk has been moved to the new pool, or if we never 3358 * even attempted to look at them, then we split them off for 3359 * good. 3360 */ 3361 if (!attempt_reopen || gcount == extracted) { 3362 for (i = 0; i < gcount; i++) 3363 if (vd[i] != NULL) 3364 vdev_split(vd[i]); 3365 vdev_reopen(spa->spa_root_vdev); 3366 } 3367 3368 kmem_free(vd, gcount * sizeof (vdev_t *)); 3369 } 3370 3371 static int 3372 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) 3373 { 3374 const char *ereport = FM_EREPORT_ZFS_POOL; 3375 int error; 3376 3377 spa->spa_load_state = state; 3378 (void) spa_import_progress_set_state(spa_guid(spa), 3379 spa_load_state(spa)); 3380 spa_import_progress_set_notes(spa, "spa_load()"); 3381 3382 gethrestime(&spa->spa_loaded_ts); 3383 error = spa_load_impl(spa, type, &ereport); 3384 3385 /* 3386 * Don't count references from objsets that are already closed 3387 * and are making their way through the eviction process. 3388 */ 3389 spa_evicting_os_wait(spa); 3390 spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); 3391 if (error) { 3392 if (error != EEXIST) { 3393 spa->spa_loaded_ts.tv_sec = 0; 3394 spa->spa_loaded_ts.tv_nsec = 0; 3395 } 3396 if (error != EBADF) { 3397 (void) zfs_ereport_post(ereport, spa, 3398 NULL, NULL, NULL, 0); 3399 } 3400 } 3401 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 3402 spa->spa_ena = 0; 3403 3404 (void) spa_import_progress_set_state(spa_guid(spa), 3405 spa_load_state(spa)); 3406 3407 return (error); 3408 } 3409 3410 #ifdef ZFS_DEBUG 3411 /* 3412 * Count the number of per-vdev ZAPs associated with all of the vdevs in the 3413 * vdev tree rooted in the given vd, and ensure that each ZAP is present in the 3414 * spa's per-vdev ZAP list. 3415 */ 3416 static uint64_t 3417 vdev_count_verify_zaps(vdev_t *vd) 3418 { 3419 spa_t *spa = vd->vdev_spa; 3420 uint64_t total = 0; 3421 3422 if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2) && 3423 vd->vdev_root_zap != 0) { 3424 total++; 3425 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 3426 spa->spa_all_vdev_zaps, vd->vdev_root_zap)); 3427 } 3428 if (vd->vdev_top_zap != 0) { 3429 total++; 3430 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 3431 spa->spa_all_vdev_zaps, vd->vdev_top_zap)); 3432 } 3433 if (vd->vdev_leaf_zap != 0) { 3434 total++; 3435 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 3436 spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); 3437 } 3438 3439 for (uint64_t i = 0; i < vd->vdev_children; i++) { 3440 total += vdev_count_verify_zaps(vd->vdev_child[i]); 3441 } 3442 3443 return (total); 3444 } 3445 #else 3446 #define vdev_count_verify_zaps(vd) ((void) sizeof (vd), 0) 3447 #endif 3448 3449 /* 3450 * Determine whether the activity check is required. 
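 * The check is skipped when the caller explicitly asked to skip it (ZFS_IMPORT_SKIP_MMP, used by zdb), when multihost is not in use, when an earlier tryimport already examined the same uberblock, when the pool was last imported by this host, or when it was cleanly exported.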
3451 */ 3452 static boolean_t 3453 spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label, 3454 nvlist_t *config) 3455 { 3456 uint64_t state = 0; 3457 uint64_t hostid = 0; 3458 uint64_t tryconfig_txg = 0; 3459 uint64_t tryconfig_timestamp = 0; 3460 uint16_t tryconfig_mmp_seq = 0; 3461 nvlist_t *nvinfo; 3462 3463 if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { 3464 nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); 3465 (void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG, 3466 &tryconfig_txg); 3467 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 3468 &tryconfig_timestamp); 3469 (void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ, 3470 &tryconfig_mmp_seq); 3471 } 3472 3473 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state); 3474 3475 /* 3476 * Disable the MMP activity check - This is used by zdb which 3477 * is intended to be used on potentially active pools. 3478 */ 3479 if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) 3480 return (B_FALSE); 3481 3482 /* 3483 * Skip the activity check when the MMP feature is disabled. 3484 */ 3485 if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0) 3486 return (B_FALSE); 3487 3488 /* 3489 * If the tryconfig_ values are nonzero, they are the results of an 3490 * earlier tryimport. If they all match the uberblock we just found, 3491 * then the pool has not changed and we return false so we do not test 3492 * a second time. 3493 */ 3494 if (tryconfig_txg && tryconfig_txg == ub->ub_txg && 3495 tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp && 3496 tryconfig_mmp_seq && tryconfig_mmp_seq == 3497 (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) 3498 return (B_FALSE); 3499 3500 /* 3501 * Allow the activity check to be skipped when importing the pool 3502 * on the same host which last imported it. Since the hostid from 3503 * configuration may be stale use the one read from the label. 3504 */ 3505 if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID)) 3506 hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID); 3507 3508 if (hostid == spa_get_hostid(spa)) 3509 return (B_FALSE); 3510 3511 /* 3512 * Skip the activity test when the pool was cleanly exported. 3513 */ 3514 if (state != POOL_STATE_ACTIVE) 3515 return (B_FALSE); 3516 3517 return (B_TRUE); 3518 } 3519 3520 /* 3521 * Nanoseconds the activity check must watch for changes on-disk. 3522 */ 3523 static uint64_t 3524 spa_activity_check_duration(spa_t *spa, uberblock_t *ub) 3525 { 3526 uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1); 3527 uint64_t multihost_interval = MSEC2NSEC( 3528 MMP_INTERVAL_OK(zfs_multihost_interval)); 3529 uint64_t import_delay = MAX(NANOSEC, import_intervals * 3530 multihost_interval); 3531 3532 /* 3533 * Local tunables determine a minimum duration except for the case 3534 * where we know when the remote host will suspend the pool if MMP 3535 * writes do not land. 3536 * 3537 * See Big Theory comment at the top of mmp.c for the reasoning behind 3538 * these cases and times. 
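 * For a concrete sense of the baseline computed above: with a multihost interval of 1000ms and 20 import intervals, for example, import_delay starts at MAX(1s, 20 * 1s) = 20 seconds before the cases below adjust it.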
3539 */ 3540 3541 ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100); 3542 3543 if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && 3544 MMP_FAIL_INT(ub) > 0) { 3545 3546 /* MMP on remote host will suspend pool after failed writes */ 3547 import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) * 3548 MMP_IMPORT_SAFETY_FACTOR / 100; 3549 3550 zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp " 3551 "mmp_fails=%llu ub_mmp mmp_interval=%llu " 3552 "import_intervals=%llu", (u_longlong_t)import_delay, 3553 (u_longlong_t)MMP_FAIL_INT(ub), 3554 (u_longlong_t)MMP_INTERVAL(ub), 3555 (u_longlong_t)import_intervals); 3556 3557 } else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && 3558 MMP_FAIL_INT(ub) == 0) { 3559 3560 /* MMP on remote host will never suspend pool */ 3561 import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) + 3562 ub->ub_mmp_delay) * import_intervals); 3563 3564 zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp " 3565 "mmp_interval=%llu ub_mmp_delay=%llu " 3566 "import_intervals=%llu", (u_longlong_t)import_delay, 3567 (u_longlong_t)MMP_INTERVAL(ub), 3568 (u_longlong_t)ub->ub_mmp_delay, 3569 (u_longlong_t)import_intervals); 3570 3571 } else if (MMP_VALID(ub)) { 3572 /* 3573 * zfs-0.7 compatibility case 3574 */ 3575 3576 import_delay = MAX(import_delay, (multihost_interval + 3577 ub->ub_mmp_delay) * import_intervals); 3578 3579 zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu " 3580 "import_intervals=%llu leaves=%u", 3581 (u_longlong_t)import_delay, 3582 (u_longlong_t)ub->ub_mmp_delay, 3583 (u_longlong_t)import_intervals, 3584 vdev_count_leaves(spa)); 3585 } else { 3586 /* Using local tunings is the only reasonable option */ 3587 zfs_dbgmsg("pool last imported on non-MMP aware " 3588 "host using import_delay=%llu multihost_interval=%llu " 3589 "import_intervals=%llu", (u_longlong_t)import_delay, 3590 (u_longlong_t)multihost_interval, 3591 (u_longlong_t)import_intervals); 3592 } 3593 3594 return (import_delay); 3595 } 3596 3597 /* 3598 * Perform the import activity check. If the user canceled the import or 3599 * we detected activity then fail. 3600 */ 3601 static int 3602 spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) 3603 { 3604 uint64_t txg = ub->ub_txg; 3605 uint64_t timestamp = ub->ub_timestamp; 3606 uint64_t mmp_config = ub->ub_mmp_config; 3607 uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0; 3608 uint64_t import_delay; 3609 hrtime_t import_expire, now; 3610 nvlist_t *mmp_label = NULL; 3611 vdev_t *rvd = spa->spa_root_vdev; 3612 kcondvar_t cv; 3613 kmutex_t mtx; 3614 int error = 0; 3615 3616 cv_init(&cv, NULL, CV_DEFAULT, NULL); 3617 mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL); 3618 mutex_enter(&mtx); 3619 3620 /* 3621 * If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed 3622 * during the earlier tryimport. If the txg recorded there is 0 then 3623 * the pool is known to be active on another host. 3624 * 3625 * Otherwise, the pool might be in use on another host. Check for 3626 * changes in the uberblocks on disk if necessary. 
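 * In the known-active case we fail immediately with EREMOTEIO; the label is still loaded first so the error path below can report the other host's name and hostid.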
3627 */ 3628 if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { 3629 nvlist_t *nvinfo = fnvlist_lookup_nvlist(config, 3630 ZPOOL_CONFIG_LOAD_INFO); 3631 3632 if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) && 3633 fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) { 3634 vdev_uberblock_load(rvd, ub, &mmp_label); 3635 error = SET_ERROR(EREMOTEIO); 3636 goto out; 3637 } 3638 } 3639 3640 import_delay = spa_activity_check_duration(spa, ub); 3641 3642 /* Add a small random factor in case of simultaneous imports (0-25%) */ 3643 import_delay += import_delay * random_in_range(250) / 1000; 3644 3645 import_expire = gethrtime() + import_delay; 3646 3647 spa_import_progress_set_notes(spa, "Checking MMP activity, waiting " 3648 "%llu ms", (u_longlong_t)NSEC2MSEC(import_delay)); 3649 3650 int iterations = 0; 3651 while ((now = gethrtime()) < import_expire) { 3652 if (iterations++ % 30 == 0) { 3653 spa_import_progress_set_notes(spa, "Checking MMP " 3654 "activity, %llu ms remaining", 3655 (u_longlong_t)NSEC2MSEC(import_expire - now)); 3656 } 3657 3658 (void) spa_import_progress_set_mmp_check(spa_guid(spa), 3659 NSEC2SEC(import_expire - gethrtime())); 3660 3661 vdev_uberblock_load(rvd, ub, &mmp_label); 3662 3663 if (txg != ub->ub_txg || timestamp != ub->ub_timestamp || 3664 mmp_seq != (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) { 3665 zfs_dbgmsg("multihost activity detected " 3666 "txg %llu ub_txg %llu " 3667 "timestamp %llu ub_timestamp %llu " 3668 "mmp_config %#llx ub_mmp_config %#llx", 3669 (u_longlong_t)txg, (u_longlong_t)ub->ub_txg, 3670 (u_longlong_t)timestamp, 3671 (u_longlong_t)ub->ub_timestamp, 3672 (u_longlong_t)mmp_config, 3673 (u_longlong_t)ub->ub_mmp_config); 3674 3675 error = SET_ERROR(EREMOTEIO); 3676 break; 3677 } 3678 3679 if (mmp_label) { 3680 nvlist_free(mmp_label); 3681 mmp_label = NULL; 3682 } 3683 3684 error = cv_timedwait_sig(&cv, &mtx, ddi_get_lbolt() + hz); 3685 if (error != -1) { 3686 error = SET_ERROR(EINTR); 3687 break; 3688 } 3689 error = 0; 3690 } 3691 3692 out: 3693 mutex_exit(&mtx); 3694 mutex_destroy(&mtx); 3695 cv_destroy(&cv); 3696 3697 /* 3698 * If the pool is determined to be active, store the status in the 3699 * spa->spa_load_info nvlist. If the remote hostname or hostid are 3700 * available from the configuration read from disk, store them as well. 3701 * This allows 'zpool import' to generate a more useful message.
3702 * 3703 * ZPOOL_CONFIG_MMP_STATE - observed pool status (mandatory) 3704 * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool 3705 * ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool 3706 */ 3707 if (error == EREMOTEIO) { 3708 const char *hostname = "<unknown>"; 3709 uint64_t hostid = 0; 3710 3711 if (mmp_label) { 3712 if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) { 3713 hostname = fnvlist_lookup_string(mmp_label, 3714 ZPOOL_CONFIG_HOSTNAME); 3715 fnvlist_add_string(spa->spa_load_info, 3716 ZPOOL_CONFIG_MMP_HOSTNAME, hostname); 3717 } 3718 3719 if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) { 3720 hostid = fnvlist_lookup_uint64(mmp_label, 3721 ZPOOL_CONFIG_HOSTID); 3722 fnvlist_add_uint64(spa->spa_load_info, 3723 ZPOOL_CONFIG_MMP_HOSTID, hostid); 3724 } 3725 } 3726 3727 fnvlist_add_uint64(spa->spa_load_info, 3728 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE); 3729 fnvlist_add_uint64(spa->spa_load_info, 3730 ZPOOL_CONFIG_MMP_TXG, 0); 3731 3732 error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO); 3733 } 3734 3735 if (mmp_label) 3736 nvlist_free(mmp_label); 3737 3738 return (error); 3739 } 3740 3741 static int 3742 spa_verify_host(spa_t *spa, nvlist_t *mos_config) 3743 { 3744 uint64_t hostid; 3745 const char *hostname; 3746 uint64_t myhostid = 0; 3747 3748 if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config, 3749 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 3750 hostname = fnvlist_lookup_string(mos_config, 3751 ZPOOL_CONFIG_HOSTNAME); 3752 3753 myhostid = zone_get_hostid(NULL); 3754 3755 if (hostid != 0 && myhostid != 0 && hostid != myhostid) { 3756 cmn_err(CE_WARN, "pool '%s' could not be " 3757 "loaded as it was last accessed by " 3758 "another system (host: %s hostid: 0x%llx). " 3759 "See: https://openzfs.github.io/openzfs-docs/msg/" 3760 "ZFS-8000-EY", 3761 spa_name(spa), hostname, (u_longlong_t)hostid); 3762 spa_load_failed(spa, "hostid verification failed: pool " 3763 "last accessed by host: %s (hostid: 0x%llx)", 3764 hostname, (u_longlong_t)hostid); 3765 return (SET_ERROR(EBADF)); 3766 } 3767 } 3768 3769 return (0); 3770 } 3771 3772 static int 3773 spa_ld_parse_config(spa_t *spa, spa_import_type_t type) 3774 { 3775 int error = 0; 3776 nvlist_t *nvtree, *nvl, *config = spa->spa_config; 3777 int parse; 3778 vdev_t *rvd; 3779 uint64_t pool_guid; 3780 const char *comment; 3781 const char *compatibility; 3782 3783 /* 3784 * Versioning wasn't explicitly added to the label until later, so if 3785 * it's not present treat it as the initial version. 3786 */ 3787 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 3788 &spa->spa_ubsync.ub_version) != 0) 3789 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 3790 3791 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 3792 spa_load_failed(spa, "invalid config provided: '%s' missing", 3793 ZPOOL_CONFIG_POOL_GUID); 3794 return (SET_ERROR(EINVAL)); 3795 } 3796 3797 /* 3798 * If we are doing an import, ensure that the pool is not already 3799 * imported by checking if its pool guid already exists in the 3800 * spa namespace. 3801 * 3802 * The only case that we allow an already imported pool to be 3803 * imported again, is when the pool is checkpointed and we want to 3804 * look at its checkpointed state from userland tools like zdb. 
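 * The #ifdef below reflects this: the checkpoint exception is only compiled into the userland build, so in the kernel a duplicate pool guid during import is always rejected with EEXIST.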
3805 */ 3806 #ifdef _KERNEL 3807 if ((spa->spa_load_state == SPA_LOAD_IMPORT || 3808 spa->spa_load_state == SPA_LOAD_TRYIMPORT) && 3809 spa_guid_exists(pool_guid, 0)) { 3810 #else 3811 if ((spa->spa_load_state == SPA_LOAD_IMPORT || 3812 spa->spa_load_state == SPA_LOAD_TRYIMPORT) && 3813 spa_guid_exists(pool_guid, 0) && 3814 !spa_importing_readonly_checkpoint(spa)) { 3815 #endif 3816 spa_load_failed(spa, "a pool with guid %llu is already open", 3817 (u_longlong_t)pool_guid); 3818 return (SET_ERROR(EEXIST)); 3819 } 3820 3821 spa->spa_config_guid = pool_guid; 3822 3823 nvlist_free(spa->spa_load_info); 3824 spa->spa_load_info = fnvlist_alloc(); 3825 3826 ASSERT(spa->spa_comment == NULL); 3827 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 3828 spa->spa_comment = spa_strdup(comment); 3829 3830 ASSERT(spa->spa_compatibility == NULL); 3831 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMPATIBILITY, 3832 &compatibility) == 0) 3833 spa->spa_compatibility = spa_strdup(compatibility); 3834 3835 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 3836 &spa->spa_config_txg); 3837 3838 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0) 3839 spa->spa_config_splitting = fnvlist_dup(nvl); 3840 3841 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) { 3842 spa_load_failed(spa, "invalid config provided: '%s' missing", 3843 ZPOOL_CONFIG_VDEV_TREE); 3844 return (SET_ERROR(EINVAL)); 3845 } 3846 3847 /* 3848 * Create "The Godfather" zio to hold all async IOs 3849 */ 3850 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 3851 KM_SLEEP); 3852 for (int i = 0; i < max_ncpus; i++) { 3853 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 3854 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 3855 ZIO_FLAG_GODFATHER); 3856 } 3857 3858 /* 3859 * Parse the configuration into a vdev tree. We explicitly set the 3860 * value that will be returned by spa_version() since parsing the 3861 * configuration requires knowing the version number. 3862 */ 3863 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3864 parse = (type == SPA_IMPORT_EXISTING ? 3865 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 3866 error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse); 3867 spa_config_exit(spa, SCL_ALL, FTAG); 3868 3869 if (error != 0) { 3870 spa_load_failed(spa, "unable to parse config [error=%d]", 3871 error); 3872 return (error); 3873 } 3874 3875 ASSERT(spa->spa_root_vdev == rvd); 3876 ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); 3877 ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); 3878 3879 if (type != SPA_IMPORT_ASSEMBLE) { 3880 ASSERT(spa_guid(spa) == pool_guid); 3881 } 3882 3883 return (0); 3884 } 3885 3886 /* 3887 * Recursively open all vdevs in the vdev tree. This function is called twice: 3888 * first with the untrusted config, then with the trusted config. 3889 */ 3890 static int 3891 spa_ld_open_vdevs(spa_t *spa) 3892 { 3893 int error = 0; 3894 3895 /* 3896 * spa_missing_tvds_allowed defines how many top-level vdevs can be 3897 * missing/unopenable for the root vdev to be still considered openable. 
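* The limit chosen below depends on how much we trust the config: a trusted MOS config gets zfs_max_missing_tvds, cachefile- and scan-derived configs get their own tunables, and anything else gets zero; the result is then raised to at least zfs_max_missing_tvds.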
3898 */ 3899 if (spa->spa_trust_config) { 3900 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds; 3901 } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) { 3902 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile; 3903 } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) { 3904 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan; 3905 } else { 3906 spa->spa_missing_tvds_allowed = 0; 3907 } 3908 3909 spa->spa_missing_tvds_allowed = 3910 MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed); 3911 3912 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3913 error = vdev_open(spa->spa_root_vdev); 3914 spa_config_exit(spa, SCL_ALL, FTAG); 3915 3916 if (spa->spa_missing_tvds != 0) { 3917 spa_load_note(spa, "vdev tree has %lld missing top-level " 3918 "vdevs.", (u_longlong_t)spa->spa_missing_tvds); 3919 if (spa->spa_trust_config && (spa->spa_mode & SPA_MODE_WRITE)) { 3920 /* 3921 * Although theoretically we could allow users to open 3922 * incomplete pools in RW mode, we'd need to add a lot 3923 * of extra logic (e.g. adjust pool space to account 3924 * for missing vdevs). 3925 * This limitation also prevents users from accidentally 3926 * opening the pool in RW mode during data recovery and 3927 * damaging it further. 3928 */ 3929 spa_load_note(spa, "pools with missing top-level " 3930 "vdevs can only be opened in read-only mode."); 3931 error = SET_ERROR(ENXIO); 3932 } else { 3933 spa_load_note(spa, "current settings allow for maximum " 3934 "%lld missing top-level vdevs at this stage.", 3935 (u_longlong_t)spa->spa_missing_tvds_allowed); 3936 } 3937 } 3938 if (error != 0) { 3939 spa_load_failed(spa, "unable to open vdev tree [error=%d]", 3940 error); 3941 } 3942 if (spa->spa_missing_tvds != 0 || error != 0) 3943 vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2); 3944 3945 return (error); 3946 } 3947 3948 /* 3949 * We need to validate the vdev labels against the configuration that 3950 * we have in hand. This function is called twice: first with an untrusted 3951 * config, then with a trusted config. The validation is more strict when the 3952 * config is trusted. 3953 */ 3954 static int 3955 spa_ld_validate_vdevs(spa_t *spa) 3956 { 3957 int error = 0; 3958 vdev_t *rvd = spa->spa_root_vdev; 3959 3960 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3961 error = vdev_validate(rvd); 3962 spa_config_exit(spa, SCL_ALL, FTAG); 3963 3964 if (error != 0) { 3965 spa_load_failed(spa, "vdev_validate failed [error=%d]", error); 3966 return (error); 3967 } 3968 3969 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 3970 spa_load_failed(spa, "cannot open vdev tree after invalidating " 3971 "some vdevs"); 3972 vdev_dbgmsg_print_tree(rvd, 2); 3973 return (SET_ERROR(ENXIO)); 3974 } 3975 3976 return (0); 3977 } 3978 3979 static void 3980 spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub) 3981 { 3982 spa->spa_state = POOL_STATE_ACTIVE; 3983 spa->spa_ubsync = spa->spa_uberblock; 3984 spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 3985 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 3986 spa->spa_first_txg = spa->spa_last_ubsync_txg ? 
3987 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 3988 spa->spa_claim_max_txg = spa->spa_first_txg; 3989 spa->spa_prev_software_version = ub->ub_software_version; 3990 } 3991 3992 static int 3993 spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) 3994 { 3995 vdev_t *rvd = spa->spa_root_vdev; 3996 nvlist_t *label; 3997 uberblock_t *ub = &spa->spa_uberblock; 3998 boolean_t activity_check = B_FALSE; 3999 4000 /* 4001 * If we are opening the checkpointed state of the pool by 4002 * rewinding to it, at this point we will have written the 4003 * checkpointed uberblock to the vdev labels, so searching 4004 * the labels will find the right uberblock. However, if 4005 * we are opening the checkpointed state read-only, we have 4006 * not modified the labels. Therefore, we must ignore the 4007 * labels and continue using the spa_uberblock that was set 4008 * by spa_ld_checkpoint_rewind. 4009 * 4010 * Note that it would be fine to ignore the labels when 4011 * rewinding (opening writeable) as well. However, if we 4012 * crash just after writing the labels, we will end up 4013 * searching the labels. Doing so in the common case means 4014 * that this code path gets exercised normally, rather than 4015 * just in the edge case. 4016 */ 4017 if (ub->ub_checkpoint_txg != 0 && 4018 spa_importing_readonly_checkpoint(spa)) { 4019 spa_ld_select_uberblock_done(spa, ub); 4020 return (0); 4021 } 4022 4023 /* 4024 * Find the best uberblock. 4025 */ 4026 vdev_uberblock_load(rvd, ub, &label); 4027 4028 /* 4029 * If we weren't able to find a single valid uberblock, return failure. 4030 */ 4031 if (ub->ub_txg == 0) { 4032 nvlist_free(label); 4033 spa_load_failed(spa, "no valid uberblock found"); 4034 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 4035 } 4036 4037 if (spa->spa_load_max_txg != UINT64_MAX) { 4038 (void) spa_import_progress_set_max_txg(spa_guid(spa), 4039 (u_longlong_t)spa->spa_load_max_txg); 4040 } 4041 spa_load_note(spa, "using uberblock with txg=%llu", 4042 (u_longlong_t)ub->ub_txg); 4043 if (ub->ub_raidz_reflow_info != 0) { 4044 spa_load_note(spa, "uberblock raidz_reflow_info: " 4045 "state=%u offset=%llu", 4046 (int)RRSS_GET_STATE(ub), 4047 (u_longlong_t)RRSS_GET_OFFSET(ub)); 4048 } 4049 4050 4051 /* 4052 * For pools which have the multihost property on determine if the 4053 * pool is truly inactive and can be safely imported. Prevent 4054 * hosts which don't have a hostid set from importing the pool. 4055 */ 4056 activity_check = spa_activity_check_required(spa, ub, label, 4057 spa->spa_config); 4058 if (activity_check) { 4059 if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay && 4060 spa_get_hostid(spa) == 0) { 4061 nvlist_free(label); 4062 fnvlist_add_uint64(spa->spa_load_info, 4063 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); 4064 return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); 4065 } 4066 4067 int error = spa_activity_check(spa, ub, spa->spa_config); 4068 if (error) { 4069 nvlist_free(label); 4070 return (error); 4071 } 4072 4073 fnvlist_add_uint64(spa->spa_load_info, 4074 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE); 4075 fnvlist_add_uint64(spa->spa_load_info, 4076 ZPOOL_CONFIG_MMP_TXG, ub->ub_txg); 4077 fnvlist_add_uint16(spa->spa_load_info, 4078 ZPOOL_CONFIG_MMP_SEQ, 4079 (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)); 4080 } 4081 4082 /* 4083 * If the pool has an unsupported version we can't open it. 
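* A pool whose on-disk version this build does not understand fails here with VDEV_AUX_VERSION_NEWER / ENOTSUP.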
4084 */ 4085 if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 4086 nvlist_free(label); 4087 spa_load_failed(spa, "version %llu is not supported", 4088 (u_longlong_t)ub->ub_version); 4089 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 4090 } 4091 4092 if (ub->ub_version >= SPA_VERSION_FEATURES) { 4093 nvlist_t *features; 4094 4095 /* 4096 * If we weren't able to find what's necessary for reading the 4097 * MOS in the label, return failure. 4098 */ 4099 if (label == NULL) { 4100 spa_load_failed(spa, "label config unavailable"); 4101 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 4102 ENXIO)); 4103 } 4104 4105 if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ, 4106 &features) != 0) { 4107 nvlist_free(label); 4108 spa_load_failed(spa, "invalid label: '%s' missing", 4109 ZPOOL_CONFIG_FEATURES_FOR_READ); 4110 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 4111 ENXIO)); 4112 } 4113 4114 /* 4115 * Update our in-core representation with the definitive values 4116 * from the label. 4117 */ 4118 nvlist_free(spa->spa_label_features); 4119 spa->spa_label_features = fnvlist_dup(features); 4120 } 4121 4122 nvlist_free(label); 4123 4124 /* 4125 * Look through entries in the label nvlist's features_for_read. If 4126 * there is a feature listed there which we don't understand then we 4127 * cannot open a pool. 4128 */ 4129 if (ub->ub_version >= SPA_VERSION_FEATURES) { 4130 nvlist_t *unsup_feat; 4131 4132 unsup_feat = fnvlist_alloc(); 4133 4134 for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, 4135 NULL); nvp != NULL; 4136 nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { 4137 if (!zfeature_is_supported(nvpair_name(nvp))) { 4138 fnvlist_add_string(unsup_feat, 4139 nvpair_name(nvp), ""); 4140 } 4141 } 4142 4143 if (!nvlist_empty(unsup_feat)) { 4144 fnvlist_add_nvlist(spa->spa_load_info, 4145 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 4146 nvlist_free(unsup_feat); 4147 spa_load_failed(spa, "some features are unsupported"); 4148 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 4149 ENOTSUP)); 4150 } 4151 4152 nvlist_free(unsup_feat); 4153 } 4154 4155 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 4156 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4157 spa_try_repair(spa, spa->spa_config); 4158 spa_config_exit(spa, SCL_ALL, FTAG); 4159 nvlist_free(spa->spa_config_splitting); 4160 spa->spa_config_splitting = NULL; 4161 } 4162 4163 /* 4164 * Initialize internal SPA structures. 
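* spa_ld_select_uberblock_done() marks the pool active, copies the chosen uberblock into spa_ubsync, and derives spa_first_txg and spa_claim_max_txg from it.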
4165 */ 4166 spa_ld_select_uberblock_done(spa, ub); 4167 4168 return (0); 4169 } 4170 4171 static int 4172 spa_ld_open_rootbp(spa_t *spa) 4173 { 4174 int error = 0; 4175 vdev_t *rvd = spa->spa_root_vdev; 4176 4177 error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 4178 if (error != 0) { 4179 spa_load_failed(spa, "unable to open rootbp in dsl_pool_init " 4180 "[error=%d]", error); 4181 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4182 } 4183 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 4184 4185 return (0); 4186 } 4187 4188 static int 4189 spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, 4190 boolean_t reloading) 4191 { 4192 vdev_t *mrvd, *rvd = spa->spa_root_vdev; 4193 nvlist_t *nv, *mos_config, *policy; 4194 int error = 0, copy_error; 4195 uint64_t healthy_tvds, healthy_tvds_mos; 4196 uint64_t mos_config_txg; 4197 4198 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE) 4199 != 0) 4200 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4201 4202 /* 4203 * If we're assembling a pool from a split, the config provided is 4204 * already trusted so there is nothing to do. 4205 */ 4206 if (type == SPA_IMPORT_ASSEMBLE) 4207 return (0); 4208 4209 healthy_tvds = spa_healthy_core_tvds(spa); 4210 4211 if (load_nvlist(spa, spa->spa_config_object, &mos_config) 4212 != 0) { 4213 spa_load_failed(spa, "unable to retrieve MOS config"); 4214 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4215 } 4216 4217 /* 4218 * If we are doing an open, pool owner wasn't verified yet, thus do 4219 * the verification here. 4220 */ 4221 if (spa->spa_load_state == SPA_LOAD_OPEN) { 4222 error = spa_verify_host(spa, mos_config); 4223 if (error != 0) { 4224 nvlist_free(mos_config); 4225 return (error); 4226 } 4227 } 4228 4229 nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE); 4230 4231 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4232 4233 /* 4234 * Build a new vdev tree from the trusted config 4235 */ 4236 error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD); 4237 if (error != 0) { 4238 nvlist_free(mos_config); 4239 spa_config_exit(spa, SCL_ALL, FTAG); 4240 spa_load_failed(spa, "spa_config_parse failed [error=%d]", 4241 error); 4242 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4243 } 4244 4245 /* 4246 * Vdev paths in the MOS may be obsolete. If the untrusted config was 4247 * obtained by scanning /dev/dsk, then it will have the right vdev 4248 * paths. We update the trusted MOS config with this information. 4249 * We first try to copy the paths with vdev_copy_path_strict, which 4250 * succeeds only when both configs have exactly the same vdev tree. 4251 * If that fails, we fall back to a more flexible method that has a 4252 * best effort policy. 
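* (The fallback is vdev_copy_path_relaxed(), used below; when the strict copy fails, both vdev trees are also dumped to the debug log.)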
4253 */ 4254 copy_error = vdev_copy_path_strict(rvd, mrvd); 4255 if (copy_error != 0 || spa_load_print_vdev_tree) { 4256 spa_load_note(spa, "provided vdev tree:"); 4257 vdev_dbgmsg_print_tree(rvd, 2); 4258 spa_load_note(spa, "MOS vdev tree:"); 4259 vdev_dbgmsg_print_tree(mrvd, 2); 4260 } 4261 if (copy_error != 0) { 4262 spa_load_note(spa, "vdev_copy_path_strict failed, falling " 4263 "back to vdev_copy_path_relaxed"); 4264 vdev_copy_path_relaxed(rvd, mrvd); 4265 } 4266 4267 vdev_close(rvd); 4268 vdev_free(rvd); 4269 spa->spa_root_vdev = mrvd; 4270 rvd = mrvd; 4271 spa_config_exit(spa, SCL_ALL, FTAG); 4272 4273 /* 4274 * If 'zpool import' used a cached config, then the on-disk hostid and 4275 * hostname may be different to the cached config in ways that should 4276 * prevent import. Userspace can't discover this without a scan, but 4277 * we know, so we add these values to LOAD_INFO so the caller can know 4278 * the difference. 4279 * 4280 * Note that we have to do this before the config is regenerated, 4281 * because the new config will have the hostid and hostname for this 4282 * host, in readiness for import. 4283 */ 4284 if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTID)) 4285 fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_HOSTID, 4286 fnvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID)); 4287 if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTNAME)) 4288 fnvlist_add_string(spa->spa_load_info, ZPOOL_CONFIG_HOSTNAME, 4289 fnvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME)); 4290 4291 /* 4292 * We will use spa_config if we decide to reload the spa or if spa_load 4293 * fails and we rewind. We must thus regenerate the config using the 4294 * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to 4295 * pass settings on how to load the pool and is not stored in the MOS. 4296 * We copy it over to our new, trusted config. 4297 */ 4298 mos_config_txg = fnvlist_lookup_uint64(mos_config, 4299 ZPOOL_CONFIG_POOL_TXG); 4300 nvlist_free(mos_config); 4301 mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE); 4302 if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY, 4303 &policy) == 0) 4304 fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy); 4305 spa_config_set(spa, mos_config); 4306 spa->spa_config_source = SPA_CONFIG_SRC_MOS; 4307 4308 /* 4309 * Now that we got the config from the MOS, we should be more strict 4310 * in checking blkptrs and can make assumptions about the consistency 4311 * of the vdev tree. spa_trust_config must be set to true before opening 4312 * vdevs in order for them to be writeable. 4313 */ 4314 spa->spa_trust_config = B_TRUE; 4315 4316 /* 4317 * Open and validate the new vdev tree 4318 */ 4319 error = spa_ld_open_vdevs(spa); 4320 if (error != 0) 4321 return (error); 4322 4323 error = spa_ld_validate_vdevs(spa); 4324 if (error != 0) 4325 return (error); 4326 4327 if (copy_error != 0 || spa_load_print_vdev_tree) { 4328 spa_load_note(spa, "final vdev tree:"); 4329 vdev_dbgmsg_print_tree(rvd, 2); 4330 } 4331 4332 if (spa->spa_load_state != SPA_LOAD_TRYIMPORT && 4333 !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) { 4334 /* 4335 * Sanity check to make sure that we are indeed loading the 4336 * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds 4337 * in the config provided and they happened to be the only ones 4338 * to have the latest uberblock, we could involuntarily perform 4339 * an extreme rewind. 
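* If the MOS config shows at least SPA_SYNC_MIN_VDEVS more healthy top-level vdevs than the provided config, we return EAGAIN below so the pool is reloaded from the MOS config; if that reload already happened, the condition is treated as fatal.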
4340 */ 4341 healthy_tvds_mos = spa_healthy_core_tvds(spa); 4342 if (healthy_tvds_mos - healthy_tvds >= 4343 SPA_SYNC_MIN_VDEVS) { 4344 spa_load_note(spa, "config provided misses too many " 4345 "top-level vdevs compared to MOS (%lld vs %lld). ", 4346 (u_longlong_t)healthy_tvds, 4347 (u_longlong_t)healthy_tvds_mos); 4348 spa_load_note(spa, "vdev tree:"); 4349 vdev_dbgmsg_print_tree(rvd, 2); 4350 if (reloading) { 4351 spa_load_failed(spa, "config was already " 4352 "provided from MOS. Aborting."); 4353 return (spa_vdev_err(rvd, 4354 VDEV_AUX_CORRUPT_DATA, EIO)); 4355 } 4356 spa_load_note(spa, "spa must be reloaded using MOS " 4357 "config"); 4358 return (SET_ERROR(EAGAIN)); 4359 } 4360 } 4361 4362 error = spa_check_for_missing_logs(spa); 4363 if (error != 0) 4364 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 4365 4366 if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) { 4367 spa_load_failed(spa, "uberblock guid sum doesn't match MOS " 4368 "guid sum (%llu != %llu)", 4369 (u_longlong_t)spa->spa_uberblock.ub_guid_sum, 4370 (u_longlong_t)rvd->vdev_guid_sum); 4371 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 4372 ENXIO)); 4373 } 4374 4375 return (0); 4376 } 4377 4378 static int 4379 spa_ld_open_indirect_vdev_metadata(spa_t *spa) 4380 { 4381 int error = 0; 4382 vdev_t *rvd = spa->spa_root_vdev; 4383 4384 /* 4385 * Everything that we read before spa_remove_init() must be stored 4386 * on concreted vdevs. Therefore we do this as early as possible. 4387 */ 4388 error = spa_remove_init(spa); 4389 if (error != 0) { 4390 spa_load_failed(spa, "spa_remove_init failed [error=%d]", 4391 error); 4392 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4393 } 4394 4395 /* 4396 * Retrieve information needed to condense indirect vdev mappings. 
4397 */ 4398 error = spa_condense_init(spa); 4399 if (error != 0) { 4400 spa_load_failed(spa, "spa_condense_init failed [error=%d]", 4401 error); 4402 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4403 } 4404 4405 return (0); 4406 } 4407 4408 static int 4409 spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep) 4410 { 4411 int error = 0; 4412 vdev_t *rvd = spa->spa_root_vdev; 4413 4414 if (spa_version(spa) >= SPA_VERSION_FEATURES) { 4415 boolean_t missing_feat_read = B_FALSE; 4416 nvlist_t *unsup_feat, *enabled_feat; 4417 4418 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 4419 &spa->spa_feat_for_read_obj, B_TRUE) != 0) { 4420 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4421 } 4422 4423 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 4424 &spa->spa_feat_for_write_obj, B_TRUE) != 0) { 4425 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4426 } 4427 4428 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 4429 &spa->spa_feat_desc_obj, B_TRUE) != 0) { 4430 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4431 } 4432 4433 enabled_feat = fnvlist_alloc(); 4434 unsup_feat = fnvlist_alloc(); 4435 4436 if (!spa_features_check(spa, B_FALSE, 4437 unsup_feat, enabled_feat)) 4438 missing_feat_read = B_TRUE; 4439 4440 if (spa_writeable(spa) || 4441 spa->spa_load_state == SPA_LOAD_TRYIMPORT) { 4442 if (!spa_features_check(spa, B_TRUE, 4443 unsup_feat, enabled_feat)) { 4444 *missing_feat_writep = B_TRUE; 4445 } 4446 } 4447 4448 fnvlist_add_nvlist(spa->spa_load_info, 4449 ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 4450 4451 if (!nvlist_empty(unsup_feat)) { 4452 fnvlist_add_nvlist(spa->spa_load_info, 4453 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 4454 } 4455 4456 fnvlist_free(enabled_feat); 4457 fnvlist_free(unsup_feat); 4458 4459 if (!missing_feat_read) { 4460 fnvlist_add_boolean(spa->spa_load_info, 4461 ZPOOL_CONFIG_CAN_RDONLY); 4462 } 4463 4464 /* 4465 * If the state is SPA_LOAD_TRYIMPORT, our objective is 4466 * twofold: to determine whether the pool is available for 4467 * import in read-write mode and (if it is not) whether the 4468 * pool is available for import in read-only mode. If the pool 4469 * is available for import in read-write mode, it is displayed 4470 * as available in userland; if it is not available for import 4471 * in read-only mode, it is displayed as unavailable in 4472 * userland. If the pool is available for import in read-only 4473 * mode but not read-write mode, it is displayed as unavailable 4474 * in userland with a special note that the pool is actually 4475 * available for open in read-only mode. 4476 * 4477 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 4478 * missing a feature for write, we must first determine whether 4479 * the pool can be opened read-only before returning to 4480 * userland in order to know whether to display the 4481 * abovementioned note. 4482 */ 4483 if (missing_feat_read || (*missing_feat_writep && 4484 spa_writeable(spa))) { 4485 spa_load_failed(spa, "pool uses unsupported features"); 4486 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 4487 ENOTSUP)); 4488 } 4489 4490 /* 4491 * Load refcounts for ZFS features from disk into an in-memory 4492 * cache during SPA initialization. 
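* Features with no refcount object on disk (ENOTSUP from feature_get_refcount_from_disk()) are cached as SPA_FEATURE_DISABLED; any other failure is treated as MOS corruption.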
4493 */ 4494 for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { 4495 uint64_t refcount; 4496 4497 error = feature_get_refcount_from_disk(spa, 4498 &spa_feature_table[i], &refcount); 4499 if (error == 0) { 4500 spa->spa_feat_refcount_cache[i] = refcount; 4501 } else if (error == ENOTSUP) { 4502 spa->spa_feat_refcount_cache[i] = 4503 SPA_FEATURE_DISABLED; 4504 } else { 4505 spa_load_failed(spa, "error getting refcount " 4506 "for feature %s [error=%d]", 4507 spa_feature_table[i].fi_guid, error); 4508 return (spa_vdev_err(rvd, 4509 VDEV_AUX_CORRUPT_DATA, EIO)); 4510 } 4511 } 4512 } 4513 4514 if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { 4515 if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, 4516 &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0) 4517 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4518 } 4519 4520 /* 4521 * Encryption was added before bookmark_v2, even though bookmark_v2 4522 * is now a dependency. If this pool has encryption enabled without 4523 * bookmark_v2, trigger an errata message. 4524 */ 4525 if (spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) && 4526 !spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) { 4527 spa->spa_errata = ZPOOL_ERRATA_ZOL_8308_ENCRYPTION; 4528 } 4529 4530 return (0); 4531 } 4532 4533 static int 4534 spa_ld_load_special_directories(spa_t *spa) 4535 { 4536 int error = 0; 4537 vdev_t *rvd = spa->spa_root_vdev; 4538 4539 spa->spa_is_initializing = B_TRUE; 4540 error = dsl_pool_open(spa->spa_dsl_pool); 4541 spa->spa_is_initializing = B_FALSE; 4542 if (error != 0) { 4543 spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error); 4544 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4545 } 4546 4547 return (0); 4548 } 4549 4550 static int 4551 spa_ld_get_props(spa_t *spa) 4552 { 4553 int error = 0; 4554 uint64_t obj; 4555 vdev_t *rvd = spa->spa_root_vdev; 4556 4557 /* Grab the checksum salt from the MOS. */ 4558 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 4559 DMU_POOL_CHECKSUM_SALT, 1, 4560 sizeof (spa->spa_cksum_salt.zcs_bytes), 4561 spa->spa_cksum_salt.zcs_bytes); 4562 if (error == ENOENT) { 4563 /* Generate a new salt for subsequent use */ 4564 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 4565 sizeof (spa->spa_cksum_salt.zcs_bytes)); 4566 } else if (error != 0) { 4567 spa_load_failed(spa, "unable to retrieve checksum salt from " 4568 "MOS [error=%d]", error); 4569 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4570 } 4571 4572 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0) 4573 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4574 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 4575 if (error != 0) { 4576 spa_load_failed(spa, "error opening deferred-frees bpobj " 4577 "[error=%d]", error); 4578 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4579 } 4580 4581 /* 4582 * Load the bit that tells us to use the new accounting function 4583 * (raid-z deflation). If we have an older pool, this will not 4584 * be present. 4585 */ 4586 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE); 4587 if (error != 0 && error != ENOENT) 4588 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4589 4590 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 4591 &spa->spa_creation_version, B_FALSE); 4592 if (error != 0 && error != ENOENT) 4593 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4594 4595 /* 4596 * Load the persistent error log. If we have an older pool, this will 4597 * not be present. 
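* As with the other optional directory objects loaded here, ENOENT is not treated as an error.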
4598 */ 4599 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last, 4600 B_FALSE); 4601 if (error != 0 && error != ENOENT) 4602 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4603 4604 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 4605 &spa->spa_errlog_scrub, B_FALSE); 4606 if (error != 0 && error != ENOENT) 4607 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4608 4609 /* 4610 * Load the livelist deletion field. If a livelist is queued for 4611 * deletion, indicate that in the spa 4612 */ 4613 error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES, 4614 &spa->spa_livelists_to_delete, B_FALSE); 4615 if (error != 0 && error != ENOENT) 4616 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4617 4618 /* 4619 * Load the history object. If we have an older pool, this 4620 * will not be present. 4621 */ 4622 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE); 4623 if (error != 0 && error != ENOENT) 4624 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4625 4626 /* 4627 * Load the per-vdev ZAP map. If we have an older pool, this will not 4628 * be present; in this case, defer its creation to a later time to 4629 * avoid dirtying the MOS this early / out of sync context. See 4630 * spa_sync_config_object. 4631 */ 4632 4633 /* The sentinel is only available in the MOS config. */ 4634 nvlist_t *mos_config; 4635 if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { 4636 spa_load_failed(spa, "unable to retrieve MOS config"); 4637 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4638 } 4639 4640 error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, 4641 &spa->spa_all_vdev_zaps, B_FALSE); 4642 4643 if (error == ENOENT) { 4644 VERIFY(!nvlist_exists(mos_config, 4645 ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 4646 spa->spa_avz_action = AVZ_ACTION_INITIALIZE; 4647 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 4648 } else if (error != 0) { 4649 nvlist_free(mos_config); 4650 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4651 } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { 4652 /* 4653 * An older version of ZFS overwrote the sentinel value, so 4654 * we have orphaned per-vdev ZAPs in the MOS. Defer their 4655 * destruction to later; see spa_sync_config_object. 4656 */ 4657 spa->spa_avz_action = AVZ_ACTION_DESTROY; 4658 /* 4659 * We're assuming that no vdevs have had their ZAPs created 4660 * before this. Better be sure of it. 
4661 */ 4662 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 4663 } 4664 nvlist_free(mos_config); 4665 4666 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 4667 4668 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object, 4669 B_FALSE); 4670 if (error && error != ENOENT) 4671 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4672 4673 if (error == 0) { 4674 uint64_t autoreplace = 0; 4675 4676 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 4677 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 4678 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 4679 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 4680 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 4681 spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost); 4682 spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim); 4683 spa->spa_autoreplace = (autoreplace != 0); 4684 } 4685 4686 /* 4687 * If we are importing a pool with missing top-level vdevs, 4688 * we enforce that the pool doesn't panic or get suspended on 4689 * error since the likelihood of missing data is extremely high. 4690 */ 4691 if (spa->spa_missing_tvds > 0 && 4692 spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE && 4693 spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 4694 spa_load_note(spa, "forcing failmode to 'continue' " 4695 "as some top level vdevs are missing"); 4696 spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE; 4697 } 4698 4699 return (0); 4700 } 4701 4702 static int 4703 spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type) 4704 { 4705 int error = 0; 4706 vdev_t *rvd = spa->spa_root_vdev; 4707 4708 /* 4709 * If we're assembling the pool from the split-off vdevs of 4710 * an existing pool, we don't want to attach the spares & cache 4711 * devices. 4712 */ 4713 4714 /* 4715 * Load any hot spares for this pool. 4716 */ 4717 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object, 4718 B_FALSE); 4719 if (error != 0 && error != ENOENT) 4720 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4721 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 4722 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 4723 if (load_nvlist(spa, spa->spa_spares.sav_object, 4724 &spa->spa_spares.sav_config) != 0) { 4725 spa_load_failed(spa, "error loading spares nvlist"); 4726 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4727 } 4728 4729 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4730 spa_load_spares(spa); 4731 spa_config_exit(spa, SCL_ALL, FTAG); 4732 } else if (error == 0) { 4733 spa->spa_spares.sav_sync = B_TRUE; 4734 } 4735 4736 /* 4737 * Load any level 2 ARC devices for this pool. 
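* As with the spares above, the nvlist is only loaded and the devices brought online when we are not assembling from a split; for a split we only mark sav_sync so the setting is synced out later.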
4738 */ 4739 error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 4740 &spa->spa_l2cache.sav_object, B_FALSE); 4741 if (error != 0 && error != ENOENT) 4742 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4743 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 4744 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 4745 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 4746 &spa->spa_l2cache.sav_config) != 0) { 4747 spa_load_failed(spa, "error loading l2cache nvlist"); 4748 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4749 } 4750 4751 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4752 spa_load_l2cache(spa); 4753 spa_config_exit(spa, SCL_ALL, FTAG); 4754 } else if (error == 0) { 4755 spa->spa_l2cache.sav_sync = B_TRUE; 4756 } 4757 4758 return (0); 4759 } 4760 4761 static int 4762 spa_ld_load_vdev_metadata(spa_t *spa) 4763 { 4764 int error = 0; 4765 vdev_t *rvd = spa->spa_root_vdev; 4766 4767 /* 4768 * If the 'multihost' property is set, then never allow a pool to 4769 * be imported when the system hostid is zero. The exception to 4770 * this rule is zdb which is always allowed to access pools. 4771 */ 4772 if (spa_multihost(spa) && spa_get_hostid(spa) == 0 && 4773 (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) { 4774 fnvlist_add_uint64(spa->spa_load_info, 4775 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); 4776 return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); 4777 } 4778 4779 /* 4780 * If the 'autoreplace' property is set, then post a resource notifying 4781 * the ZFS DE that it should not issue any faults for unopenable 4782 * devices. We also iterate over the vdevs, and post a sysevent for any 4783 * unopenable vdevs so that the normal autoreplace handler can take 4784 * over. 4785 */ 4786 if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 4787 spa_check_removed(spa->spa_root_vdev); 4788 /* 4789 * For the import case, this is done in spa_import(), because 4790 * at this point we're using the spare definitions from 4791 * the MOS config, not necessarily from the userland config. 4792 */ 4793 if (spa->spa_load_state != SPA_LOAD_IMPORT) { 4794 spa_aux_check_removed(&spa->spa_spares); 4795 spa_aux_check_removed(&spa->spa_l2cache); 4796 } 4797 } 4798 4799 /* 4800 * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc. 4801 */ 4802 error = vdev_load(rvd); 4803 if (error != 0) { 4804 spa_load_failed(spa, "vdev_load failed [error=%d]", error); 4805 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4806 } 4807 4808 error = spa_ld_log_spacemaps(spa); 4809 if (error != 0) { 4810 spa_load_failed(spa, "spa_ld_log_spacemaps failed [error=%d]", 4811 error); 4812 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4813 } 4814 4815 /* 4816 * Propagate the leaf DTLs we just loaded all the way up the vdev tree. 
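* Done under SCL_ALL so the vdev tree cannot change while the DTLs are recomputed.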
4817 */ 4818 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4819 vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE); 4820 spa_config_exit(spa, SCL_ALL, FTAG); 4821 4822 return (0); 4823 } 4824 4825 static int 4826 spa_ld_load_dedup_tables(spa_t *spa) 4827 { 4828 int error = 0; 4829 vdev_t *rvd = spa->spa_root_vdev; 4830 4831 error = ddt_load(spa); 4832 if (error != 0) { 4833 spa_load_failed(spa, "ddt_load failed [error=%d]", error); 4834 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4835 } 4836 4837 return (0); 4838 } 4839 4840 static int 4841 spa_ld_load_brt(spa_t *spa) 4842 { 4843 int error = 0; 4844 vdev_t *rvd = spa->spa_root_vdev; 4845 4846 error = brt_load(spa); 4847 if (error != 0) { 4848 spa_load_failed(spa, "brt_load failed [error=%d]", error); 4849 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4850 } 4851 4852 return (0); 4853 } 4854 4855 static int 4856 spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, const char **ereport) 4857 { 4858 vdev_t *rvd = spa->spa_root_vdev; 4859 4860 if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) { 4861 boolean_t missing = spa_check_logs(spa); 4862 if (missing) { 4863 if (spa->spa_missing_tvds != 0) { 4864 spa_load_note(spa, "spa_check_logs failed " 4865 "so dropping the logs"); 4866 } else { 4867 *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 4868 spa_load_failed(spa, "spa_check_logs failed"); 4869 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, 4870 ENXIO)); 4871 } 4872 } 4873 } 4874 4875 return (0); 4876 } 4877 4878 static int 4879 spa_ld_verify_pool_data(spa_t *spa) 4880 { 4881 int error = 0; 4882 vdev_t *rvd = spa->spa_root_vdev; 4883 4884 /* 4885 * We've successfully opened the pool, verify that we're ready 4886 * to start pushing transactions. 4887 */ 4888 if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 4889 error = spa_load_verify(spa); 4890 if (error != 0) { 4891 spa_load_failed(spa, "spa_load_verify failed " 4892 "[error=%d]", error); 4893 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 4894 error)); 4895 } 4896 } 4897 4898 return (0); 4899 } 4900 4901 static void 4902 spa_ld_claim_log_blocks(spa_t *spa) 4903 { 4904 dmu_tx_t *tx; 4905 dsl_pool_t *dp = spa_get_dsl(spa); 4906 4907 /* 4908 * Claim log blocks that haven't been committed yet. 4909 * This must all happen in a single txg. 4910 * Note: spa_claim_max_txg is updated by spa_claim_notify(), 4911 * invoked from zil_claim_log_block()'s i/o done callback. 4912 * Price of rollback is that we abandon the log. 4913 */ 4914 spa->spa_claiming = B_TRUE; 4915 4916 tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); 4917 (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 4918 zil_claim, tx, DS_FIND_CHILDREN); 4919 dmu_tx_commit(tx); 4920 4921 spa->spa_claiming = B_FALSE; 4922 4923 spa_set_log_state(spa, SPA_LOG_GOOD); 4924 } 4925 4926 static void 4927 spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg, 4928 boolean_t update_config_cache) 4929 { 4930 vdev_t *rvd = spa->spa_root_vdev; 4931 int need_update = B_FALSE; 4932 4933 /* 4934 * If the config cache is stale, or we have uninitialized 4935 * metaslabs (see spa_vdev_add()), then update the config. 4936 * 4937 * If this is a verbatim import, trust the current 4938 * in-core spa_config and update the disk labels. 
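* The update itself is requested asynchronously via SPA_ASYNC_CONFIG_UPDATE, since for the root pool the config cache may not be writable yet.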
4939 */ 4940 if (update_config_cache || config_cache_txg != spa->spa_config_txg || 4941 spa->spa_load_state == SPA_LOAD_IMPORT || 4942 spa->spa_load_state == SPA_LOAD_RECOVER || 4943 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 4944 need_update = B_TRUE; 4945 4946 for (int c = 0; c < rvd->vdev_children; c++) 4947 if (rvd->vdev_child[c]->vdev_ms_array == 0) 4948 need_update = B_TRUE; 4949 4950 /* 4951 * Update the config cache asynchronously in case we're the 4952 * root pool, in which case the config cache isn't writable yet. 4953 */ 4954 if (need_update) 4955 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 4956 } 4957 4958 static void 4959 spa_ld_prepare_for_reload(spa_t *spa) 4960 { 4961 spa_mode_t mode = spa->spa_mode; 4962 int async_suspended = spa->spa_async_suspended; 4963 4964 spa_unload(spa); 4965 spa_deactivate(spa); 4966 spa_activate(spa, mode); 4967 4968 /* 4969 * We save the value of spa_async_suspended as it gets reset to 0 by 4970 * spa_unload(). We want to restore it back to the original value before 4971 * returning as we might be calling spa_async_resume() later. 4972 */ 4973 spa->spa_async_suspended = async_suspended; 4974 } 4975 4976 static int 4977 spa_ld_read_checkpoint_txg(spa_t *spa) 4978 { 4979 uberblock_t checkpoint; 4980 int error = 0; 4981 4982 ASSERT0(spa->spa_checkpoint_txg); 4983 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4984 4985 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 4986 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), 4987 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); 4988 4989 if (error == ENOENT) 4990 return (0); 4991 4992 if (error != 0) 4993 return (error); 4994 4995 ASSERT3U(checkpoint.ub_txg, !=, 0); 4996 ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0); 4997 ASSERT3U(checkpoint.ub_timestamp, !=, 0); 4998 spa->spa_checkpoint_txg = checkpoint.ub_txg; 4999 spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; 5000 5001 return (0); 5002 } 5003 5004 static int 5005 spa_ld_mos_init(spa_t *spa, spa_import_type_t type) 5006 { 5007 int error = 0; 5008 5009 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5010 ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); 5011 5012 /* 5013 * Never trust the config that is provided unless we are assembling 5014 * a pool following a split. 5015 * This means don't trust blkptrs and the vdev tree in general. This 5016 * also effectively puts the spa in read-only mode since 5017 * spa_writeable() checks for spa_trust_config to be true. 5018 * We will later load a trusted config from the MOS. 5019 */ 5020 if (type != SPA_IMPORT_ASSEMBLE) 5021 spa->spa_trust_config = B_FALSE; 5022 5023 /* 5024 * Parse the config provided to create a vdev tree. 5025 */ 5026 error = spa_ld_parse_config(spa, type); 5027 if (error != 0) 5028 return (error); 5029 5030 spa_import_progress_add(spa); 5031 5032 /* 5033 * Now that we have the vdev tree, try to open each vdev. This involves 5034 * opening the underlying physical device, retrieving its geometry and 5035 * probing the vdev with a dummy I/O. The state of each vdev will be set 5036 * based on the success of those operations. After this we'll be ready 5037 * to read from the vdevs. 5038 */ 5039 error = spa_ld_open_vdevs(spa); 5040 if (error != 0) 5041 return (error); 5042 5043 /* 5044 * Read the label of each vdev and make sure that the GUIDs stored 5045 * there match the GUIDs in the config provided. 
5046 * If we're assembling a new pool that's been split off from an 5047 * existing pool, the labels haven't yet been updated so we skip 5048 * validation for now. 5049 */ 5050 if (type != SPA_IMPORT_ASSEMBLE) { 5051 error = spa_ld_validate_vdevs(spa); 5052 if (error != 0) 5053 return (error); 5054 } 5055 5056 /* 5057 * Read all vdev labels to find the best uberblock (i.e. latest, 5058 * unless spa_load_max_txg is set) and store it in spa_uberblock. We 5059 * get the list of features required to read blkptrs in the MOS from 5060 * the vdev label with the best uberblock and verify that our version 5061 * of zfs supports them all. 5062 */ 5063 error = spa_ld_select_uberblock(spa, type); 5064 if (error != 0) 5065 return (error); 5066 5067 /* 5068 * Pass that uberblock to the dsl_pool layer which will open the root 5069 * blkptr. This blkptr points to the latest version of the MOS and will 5070 * allow us to read its contents. 5071 */ 5072 error = spa_ld_open_rootbp(spa); 5073 if (error != 0) 5074 return (error); 5075 5076 return (0); 5077 } 5078 5079 static int 5080 spa_ld_checkpoint_rewind(spa_t *spa) 5081 { 5082 uberblock_t checkpoint; 5083 int error = 0; 5084 5085 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5086 ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 5087 5088 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 5089 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), 5090 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); 5091 5092 if (error != 0) { 5093 spa_load_failed(spa, "unable to retrieve checkpointed " 5094 "uberblock from the MOS config [error=%d]", error); 5095 5096 if (error == ENOENT) 5097 error = ZFS_ERR_NO_CHECKPOINT; 5098 5099 return (error); 5100 } 5101 5102 ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg); 5103 ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg); 5104 5105 /* 5106 * We need to update the txg and timestamp of the checkpointed 5107 * uberblock to be higher than the latest one. This ensures that 5108 * the checkpointed uberblock is selected if we were to close and 5109 * reopen the pool right after we've written it in the vdev labels. 5110 * (also see block comment in vdev_uberblock_compare) 5111 */ 5112 checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1; 5113 checkpoint.ub_timestamp = gethrestime_sec(); 5114 5115 /* 5116 * Set current uberblock to be the checkpointed uberblock. 5117 */ 5118 spa->spa_uberblock = checkpoint; 5119 5120 /* 5121 * If we are doing a normal rewind, then the pool is open for 5122 * writing and we sync the "updated" checkpointed uberblock to 5123 * disk. Once this is done, we've basically rewound the whole 5124 * pool and there is no way back. 5125 * 5126 * There are cases when we don't want to attempt and sync the 5127 * checkpointed uberblock to disk because we are opening a 5128 * pool as read-only. Specifically, verifying the checkpointed 5129 * state with zdb, and importing the checkpointed state to get 5130 * a "preview" of its content. 
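* In those read-only cases the rewound uberblock exists only in memory and the on-disk labels are left untouched.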
5131 */ 5132 if (spa_writeable(spa)) { 5133 vdev_t *rvd = spa->spa_root_vdev; 5134 5135 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5136 vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; 5137 int svdcount = 0; 5138 int children = rvd->vdev_children; 5139 int c0 = random_in_range(children); 5140 5141 for (int c = 0; c < children; c++) { 5142 vdev_t *vd = rvd->vdev_child[(c0 + c) % children]; 5143 5144 /* Stop when revisiting the first vdev */ 5145 if (c > 0 && svd[0] == vd) 5146 break; 5147 5148 if (vd->vdev_ms_array == 0 || vd->vdev_islog || 5149 !vdev_is_concrete(vd)) 5150 continue; 5151 5152 svd[svdcount++] = vd; 5153 if (svdcount == SPA_SYNC_MIN_VDEVS) 5154 break; 5155 } 5156 error = vdev_config_sync(svd, svdcount, spa->spa_first_txg); 5157 if (error == 0) 5158 spa->spa_last_synced_guid = rvd->vdev_guid; 5159 spa_config_exit(spa, SCL_ALL, FTAG); 5160 5161 if (error != 0) { 5162 spa_load_failed(spa, "failed to write checkpointed " 5163 "uberblock to the vdev labels [error=%d]", error); 5164 return (error); 5165 } 5166 } 5167 5168 return (0); 5169 } 5170 5171 static int 5172 spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type, 5173 boolean_t *update_config_cache) 5174 { 5175 int error; 5176 5177 /* 5178 * Parse the config for pool, open and validate vdevs, 5179 * select an uberblock, and use that uberblock to open 5180 * the MOS. 5181 */ 5182 error = spa_ld_mos_init(spa, type); 5183 if (error != 0) 5184 return (error); 5185 5186 /* 5187 * Retrieve the trusted config stored in the MOS and use it to create 5188 * a new, exact version of the vdev tree, then reopen all vdevs. 5189 */ 5190 error = spa_ld_trusted_config(spa, type, B_FALSE); 5191 if (error == EAGAIN) { 5192 if (update_config_cache != NULL) 5193 *update_config_cache = B_TRUE; 5194 5195 /* 5196 * Redo the loading process with the trusted config if it is 5197 * too different from the untrusted config. 5198 */ 5199 spa_ld_prepare_for_reload(spa); 5200 spa_load_note(spa, "RELOADING"); 5201 error = spa_ld_mos_init(spa, type); 5202 if (error != 0) 5203 return (error); 5204 5205 error = spa_ld_trusted_config(spa, type, B_TRUE); 5206 if (error != 0) 5207 return (error); 5208 5209 } else if (error != 0) { 5210 return (error); 5211 } 5212 5213 return (0); 5214 } 5215 5216 /* 5217 * Load an existing storage pool, using the config provided. This config 5218 * describes which vdevs are part of the pool and is later validated against 5219 * partial configs present in each vdev's label and an entire copy of the 5220 * config stored in the MOS. 5221 */ 5222 static int 5223 spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) 5224 { 5225 int error = 0; 5226 boolean_t missing_feat_write = B_FALSE; 5227 boolean_t checkpoint_rewind = 5228 (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 5229 boolean_t update_config_cache = B_FALSE; 5230 5231 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5232 ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); 5233 5234 spa_load_note(spa, "LOADING"); 5235 5236 error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache); 5237 if (error != 0) 5238 return (error); 5239 5240 /* 5241 * If we are rewinding to the checkpoint then we need to repeat 5242 * everything we've done so far in this function but this time 5243 * selecting the checkpointed uberblock and using that to open 5244 * the MOS. 5245 */ 5246 if (checkpoint_rewind) { 5247 /* 5248 * If we are rewinding to the checkpoint update config cache 5249 * anyway. 
5250 */ 5251 update_config_cache = B_TRUE; 5252 5253 /* 5254 * Extract the checkpointed uberblock from the current MOS 5255 * and use this as the pool's uberblock from now on. If the 5256 * pool is imported as writeable we also write the checkpoint 5257 * uberblock to the labels, making the rewind permanent. 5258 */ 5259 error = spa_ld_checkpoint_rewind(spa); 5260 if (error != 0) 5261 return (error); 5262 5263 /* 5264 * Redo the loading process again with the 5265 * checkpointed uberblock. 5266 */ 5267 spa_ld_prepare_for_reload(spa); 5268 spa_load_note(spa, "LOADING checkpointed uberblock"); 5269 error = spa_ld_mos_with_trusted_config(spa, type, NULL); 5270 if (error != 0) 5271 return (error); 5272 } 5273 5274 /* 5275 * Retrieve the checkpoint txg if the pool has a checkpoint. 5276 */ 5277 spa_import_progress_set_notes(spa, "Loading checkpoint txg"); 5278 error = spa_ld_read_checkpoint_txg(spa); 5279 if (error != 0) 5280 return (error); 5281 5282 /* 5283 * Retrieve the mapping of indirect vdevs. Those vdevs were removed 5284 * from the pool and their contents were re-mapped to other vdevs. Note 5285 * that everything that we read before this step must have been 5286 * rewritten on concrete vdevs after the last device removal was 5287 * initiated. Otherwise we could be reading from indirect vdevs before 5288 * we have loaded their mappings. 5289 */ 5290 spa_import_progress_set_notes(spa, "Loading indirect vdev metadata"); 5291 error = spa_ld_open_indirect_vdev_metadata(spa); 5292 if (error != 0) 5293 return (error); 5294 5295 /* 5296 * Retrieve the full list of active features from the MOS and check if 5297 * they are all supported. 5298 */ 5299 spa_import_progress_set_notes(spa, "Checking feature flags"); 5300 error = spa_ld_check_features(spa, &missing_feat_write); 5301 if (error != 0) 5302 return (error); 5303 5304 /* 5305 * Load several special directories from the MOS needed by the dsl_pool 5306 * layer. 5307 */ 5308 spa_import_progress_set_notes(spa, "Loading special MOS directories"); 5309 error = spa_ld_load_special_directories(spa); 5310 if (error != 0) 5311 return (error); 5312 5313 /* 5314 * Retrieve pool properties from the MOS. 5315 */ 5316 spa_import_progress_set_notes(spa, "Loading properties"); 5317 error = spa_ld_get_props(spa); 5318 if (error != 0) 5319 return (error); 5320 5321 /* 5322 * Retrieve the list of auxiliary devices - cache devices and spares - 5323 * and open them. 5324 */ 5325 spa_import_progress_set_notes(spa, "Loading AUX vdevs"); 5326 error = spa_ld_open_aux_vdevs(spa, type); 5327 if (error != 0) 5328 return (error); 5329 5330 /* 5331 * Load the metadata for all vdevs. Also check if unopenable devices 5332 * should be autoreplaced. 5333 */ 5334 spa_import_progress_set_notes(spa, "Loading vdev metadata"); 5335 error = spa_ld_load_vdev_metadata(spa); 5336 if (error != 0) 5337 return (error); 5338 5339 spa_import_progress_set_notes(spa, "Loading dedup tables"); 5340 error = spa_ld_load_dedup_tables(spa); 5341 if (error != 0) 5342 return (error); 5343 5344 spa_import_progress_set_notes(spa, "Loading BRT"); 5345 error = spa_ld_load_brt(spa); 5346 if (error != 0) 5347 return (error); 5348 5349 /* 5350 * Verify the logs now to make sure we don't have any unexpected errors 5351 * when we claim log blocks later. 
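* (When top-level vdevs are missing, spa_ld_verify_logs() drops the logs rather than failing the load.)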
5352 */ 5353 spa_import_progress_set_notes(spa, "Verifying Log Devices"); 5354 error = spa_ld_verify_logs(spa, type, ereport); 5355 if (error != 0) 5356 return (error); 5357 5358 if (missing_feat_write) { 5359 ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT); 5360 5361 /* 5362 * At this point, we know that we can open the pool in 5363 * read-only mode but not read-write mode. We now have enough 5364 * information and can return to userland. 5365 */ 5366 return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, 5367 ENOTSUP)); 5368 } 5369 5370 /* 5371 * Traverse the last txgs to make sure the pool was left off in a safe 5372 * state. When performing an extreme rewind, we verify the whole pool, 5373 * which can take a very long time. 5374 */ 5375 spa_import_progress_set_notes(spa, "Verifying pool data"); 5376 error = spa_ld_verify_pool_data(spa); 5377 if (error != 0) 5378 return (error); 5379 5380 /* 5381 * Calculate the deflated space for the pool. This must be done before 5382 * we write anything to the pool because we'd need to update the space 5383 * accounting using the deflated sizes. 5384 */ 5385 spa_import_progress_set_notes(spa, "Calculating deflated space"); 5386 spa_update_dspace(spa); 5387 5388 /* 5389 * We have now retrieved all the information we needed to open the 5390 * pool. If we are importing the pool in read-write mode, a few 5391 * additional steps must be performed to finish the import. 5392 */ 5393 spa_import_progress_set_notes(spa, "Starting import"); 5394 if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER || 5395 spa->spa_load_max_txg == UINT64_MAX)) { 5396 uint64_t config_cache_txg = spa->spa_config_txg; 5397 5398 ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); 5399 5400 /* 5401 * Before we do any zio_write's, complete the raidz expansion 5402 * scratch space copying, if necessary. 5403 */ 5404 if (RRSS_GET_STATE(&spa->spa_uberblock) == RRSS_SCRATCH_VALID) 5405 vdev_raidz_reflow_copy_scratch(spa); 5406 5407 /* 5408 * In case of a checkpoint rewind, log the original txg 5409 * of the checkpointed uberblock. 5410 */ 5411 if (checkpoint_rewind) { 5412 spa_history_log_internal(spa, "checkpoint rewind", 5413 NULL, "rewound state to txg=%llu", 5414 (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg); 5415 } 5416 5417 spa_import_progress_set_notes(spa, "Claiming ZIL blocks"); 5418 /* 5419 * Traverse the ZIL and claim all blocks. 5420 */ 5421 spa_ld_claim_log_blocks(spa); 5422 5423 /* 5424 * Kick-off the syncing thread. 5425 */ 5426 spa->spa_sync_on = B_TRUE; 5427 txg_sync_start(spa->spa_dsl_pool); 5428 mmp_thread_start(spa); 5429 5430 /* 5431 * Wait for all claims to sync. We sync up to the highest 5432 * claimed log block birth time so that claimed log blocks 5433 * don't appear to be from the future. spa_claim_max_txg 5434 * will have been set for us by ZIL traversal operations 5435 * performed above. 5436 */ 5437 spa_import_progress_set_notes(spa, "Syncing ZIL claims"); 5438 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 5439 5440 /* 5441 * Check if we need to request an update of the config. On the 5442 * next sync, we would update the config stored in vdev labels 5443 * and the cachefile (by default /etc/zfs/zpool.cache). 5444 */ 5445 spa_import_progress_set_notes(spa, "Updating configs"); 5446 spa_ld_check_for_config_update(spa, config_cache_txg, 5447 update_config_cache); 5448 5449 /* 5450 * Check if a rebuild was in progress and if so resume it. 5451 * Then check all DTLs to see if anything needs resilvering. 
5452 * The resilver will be deferred if a rebuild was started. 5453 */ 5454 spa_import_progress_set_notes(spa, "Starting resilvers"); 5455 if (vdev_rebuild_active(spa->spa_root_vdev)) { 5456 vdev_rebuild_restart(spa); 5457 } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 5458 vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 5459 spa_async_request(spa, SPA_ASYNC_RESILVER); 5460 } 5461 5462 /* 5463 * Log the fact that we booted up (so that we can detect if 5464 * we rebooted in the middle of an operation). 5465 */ 5466 spa_history_log_version(spa, "open", NULL); 5467 5468 spa_import_progress_set_notes(spa, 5469 "Restarting device removals"); 5470 spa_restart_removal(spa); 5471 spa_spawn_aux_threads(spa); 5472 5473 /* 5474 * Delete any inconsistent datasets. 5475 * 5476 * Note: 5477 * Since we may be issuing deletes for clones here, 5478 * we make sure to do so after we've spawned all the 5479 * auxiliary threads above (from which the livelist 5480 * deletion zthr is part of). 5481 */ 5482 spa_import_progress_set_notes(spa, 5483 "Cleaning up inconsistent objsets"); 5484 (void) dmu_objset_find(spa_name(spa), 5485 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 5486 5487 /* 5488 * Clean up any stale temporary dataset userrefs. 5489 */ 5490 spa_import_progress_set_notes(spa, 5491 "Cleaning up temporary userrefs"); 5492 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 5493 5494 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5495 spa_import_progress_set_notes(spa, "Restarting initialize"); 5496 vdev_initialize_restart(spa->spa_root_vdev); 5497 spa_import_progress_set_notes(spa, "Restarting TRIM"); 5498 vdev_trim_restart(spa->spa_root_vdev); 5499 vdev_autotrim_restart(spa); 5500 spa_config_exit(spa, SCL_CONFIG, FTAG); 5501 spa_import_progress_set_notes(spa, "Finished importing"); 5502 } 5503 5504 spa_import_progress_remove(spa_guid(spa)); 5505 spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); 5506 5507 spa_load_note(spa, "LOADED"); 5508 5509 return (0); 5510 } 5511 5512 static int 5513 spa_load_retry(spa_t *spa, spa_load_state_t state) 5514 { 5515 spa_mode_t mode = spa->spa_mode; 5516 5517 spa_unload(spa); 5518 spa_deactivate(spa); 5519 5520 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; 5521 5522 spa_activate(spa, mode); 5523 spa_async_suspend(spa); 5524 5525 spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu", 5526 (u_longlong_t)spa->spa_load_max_txg); 5527 5528 return (spa_load(spa, state, SPA_IMPORT_EXISTING)); 5529 } 5530 5531 /* 5532 * If spa_load() fails this function will try loading prior txg's. If 5533 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool 5534 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this 5535 * function will not rewind the pool and will return the same error as 5536 * spa_load(). 
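* Unless ZPOOL_EXTREME_REWIND is requested, the retry loop only walks back as far as spa_last_ubsync_txg - TXG_DEFER_SIZE.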
5537 */ 5538 static int 5539 spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request, 5540 int rewind_flags) 5541 { 5542 nvlist_t *loadinfo = NULL; 5543 nvlist_t *config = NULL; 5544 int load_error, rewind_error; 5545 uint64_t safe_rewind_txg; 5546 uint64_t min_txg; 5547 5548 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 5549 spa->spa_load_max_txg = spa->spa_load_txg; 5550 spa_set_log_state(spa, SPA_LOG_CLEAR); 5551 } else { 5552 spa->spa_load_max_txg = max_request; 5553 if (max_request != UINT64_MAX) 5554 spa->spa_extreme_rewind = B_TRUE; 5555 } 5556 5557 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING); 5558 if (load_error == 0) 5559 return (0); 5560 if (load_error == ZFS_ERR_NO_CHECKPOINT) { 5561 /* 5562 * When attempting checkpoint-rewind on a pool with no 5563 * checkpoint, we should not attempt to load uberblocks 5564 * from previous txgs when spa_load fails. 5565 */ 5566 ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 5567 spa_import_progress_remove(spa_guid(spa)); 5568 return (load_error); 5569 } 5570 5571 if (spa->spa_root_vdev != NULL) 5572 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 5573 5574 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 5575 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 5576 5577 if (rewind_flags & ZPOOL_NEVER_REWIND) { 5578 nvlist_free(config); 5579 spa_import_progress_remove(spa_guid(spa)); 5580 return (load_error); 5581 } 5582 5583 if (state == SPA_LOAD_RECOVER) { 5584 /* Price of rolling back is discarding txgs, including log */ 5585 spa_set_log_state(spa, SPA_LOG_CLEAR); 5586 } else { 5587 /* 5588 * If we aren't rolling back save the load info from our first 5589 * import attempt so that we can restore it after attempting 5590 * to rewind. 5591 */ 5592 loadinfo = spa->spa_load_info; 5593 spa->spa_load_info = fnvlist_alloc(); 5594 } 5595 5596 spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 5597 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 5598 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 5599 TXG_INITIAL : safe_rewind_txg; 5600 5601 /* 5602 * Continue as long as we're finding errors, we're still within 5603 * the acceptable rewind range, and we're still finding uberblocks 5604 */ 5605 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 5606 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 5607 if (spa->spa_load_max_txg < safe_rewind_txg) 5608 spa->spa_extreme_rewind = B_TRUE; 5609 rewind_error = spa_load_retry(spa, state); 5610 } 5611 5612 spa->spa_extreme_rewind = B_FALSE; 5613 spa->spa_load_max_txg = UINT64_MAX; 5614 5615 if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 5616 spa_config_set(spa, config); 5617 else 5618 nvlist_free(config); 5619 5620 if (state == SPA_LOAD_RECOVER) { 5621 ASSERT3P(loadinfo, ==, NULL); 5622 spa_import_progress_remove(spa_guid(spa)); 5623 return (rewind_error); 5624 } else { 5625 /* Store the rewind info as part of the initial load info */ 5626 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, 5627 spa->spa_load_info); 5628 5629 /* Restore the initial load info */ 5630 fnvlist_free(spa->spa_load_info); 5631 spa->spa_load_info = loadinfo; 5632 5633 spa_import_progress_remove(spa_guid(spa)); 5634 return (load_error); 5635 } 5636 } 5637 5638 /* 5639 * Pool Open/Import 5640 * 5641 * The import case is identical to an open except that the configuration is sent 5642 * down from userland, instead of grabbed from the configuration cache. 
For the 5643 * case of an open, the pool configuration will exist in the 5644 * POOL_STATE_UNINITIALIZED state. 5645 * 5646 * The stats information (gen/count/ustats) is used to gather vdev statistics at 5647 * the same time open the pool, without having to keep around the spa_t in some 5648 * ambiguous state. 5649 */ 5650 static int 5651 spa_open_common(const char *pool, spa_t **spapp, const void *tag, 5652 nvlist_t *nvpolicy, nvlist_t **config) 5653 { 5654 spa_t *spa; 5655 spa_load_state_t state = SPA_LOAD_OPEN; 5656 int error; 5657 int locked = B_FALSE; 5658 int firstopen = B_FALSE; 5659 5660 *spapp = NULL; 5661 5662 /* 5663 * As disgusting as this is, we need to support recursive calls to this 5664 * function because dsl_dir_open() is called during spa_load(), and ends 5665 * up calling spa_open() again. The real fix is to figure out how to 5666 * avoid dsl_dir_open() calling this in the first place. 5667 */ 5668 if (MUTEX_NOT_HELD(&spa_namespace_lock)) { 5669 mutex_enter(&spa_namespace_lock); 5670 locked = B_TRUE; 5671 } 5672 5673 if ((spa = spa_lookup(pool)) == NULL) { 5674 if (locked) 5675 mutex_exit(&spa_namespace_lock); 5676 return (SET_ERROR(ENOENT)); 5677 } 5678 5679 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 5680 zpool_load_policy_t policy; 5681 5682 firstopen = B_TRUE; 5683 5684 zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config, 5685 &policy); 5686 if (policy.zlp_rewind & ZPOOL_DO_REWIND) 5687 state = SPA_LOAD_RECOVER; 5688 5689 spa_activate(spa, spa_mode_global); 5690 5691 if (state != SPA_LOAD_RECOVER) 5692 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 5693 spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; 5694 5695 zfs_dbgmsg("spa_open_common: opening %s", pool); 5696 error = spa_load_best(spa, state, policy.zlp_txg, 5697 policy.zlp_rewind); 5698 5699 if (error == EBADF) { 5700 /* 5701 * If vdev_validate() returns failure (indicated by 5702 * EBADF), it indicates that one of the vdevs indicates 5703 * that the pool has been exported or destroyed. If 5704 * this is the case, the config cache is out of sync and 5705 * we should remove the pool from the namespace. 5706 */ 5707 spa_unload(spa); 5708 spa_deactivate(spa); 5709 spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE); 5710 spa_remove(spa); 5711 if (locked) 5712 mutex_exit(&spa_namespace_lock); 5713 return (SET_ERROR(ENOENT)); 5714 } 5715 5716 if (error) { 5717 /* 5718 * We can't open the pool, but we still have useful 5719 * information: the state of each vdev after the 5720 * attempted vdev_open(). Return this to the user. 5721 */ 5722 if (config != NULL && spa->spa_config) { 5723 *config = fnvlist_dup(spa->spa_config); 5724 fnvlist_add_nvlist(*config, 5725 ZPOOL_CONFIG_LOAD_INFO, 5726 spa->spa_load_info); 5727 } 5728 spa_unload(spa); 5729 spa_deactivate(spa); 5730 spa->spa_last_open_failed = error; 5731 if (locked) 5732 mutex_exit(&spa_namespace_lock); 5733 *spapp = NULL; 5734 return (error); 5735 } 5736 } 5737 5738 spa_open_ref(spa, tag); 5739 5740 if (config != NULL) 5741 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 5742 5743 /* 5744 * If we've recovered the pool, pass back any information we 5745 * gathered while doing the load. 
5746 */ 5747 if (state == SPA_LOAD_RECOVER && config != NULL) { 5748 fnvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 5749 spa->spa_load_info); 5750 } 5751 5752 if (locked) { 5753 spa->spa_last_open_failed = 0; 5754 spa->spa_last_ubsync_txg = 0; 5755 spa->spa_load_txg = 0; 5756 mutex_exit(&spa_namespace_lock); 5757 } 5758 5759 if (firstopen) 5760 zvol_create_minors_recursive(spa_name(spa)); 5761 5762 *spapp = spa; 5763 5764 return (0); 5765 } 5766 5767 int 5768 spa_open_rewind(const char *name, spa_t **spapp, const void *tag, 5769 nvlist_t *policy, nvlist_t **config) 5770 { 5771 return (spa_open_common(name, spapp, tag, policy, config)); 5772 } 5773 5774 int 5775 spa_open(const char *name, spa_t **spapp, const void *tag) 5776 { 5777 return (spa_open_common(name, spapp, tag, NULL, NULL)); 5778 } 5779 5780 /* 5781 * Lookup the given spa_t, incrementing the inject count in the process, 5782 * preventing it from being exported or destroyed. 5783 */ 5784 spa_t * 5785 spa_inject_addref(char *name) 5786 { 5787 spa_t *spa; 5788 5789 mutex_enter(&spa_namespace_lock); 5790 if ((spa = spa_lookup(name)) == NULL) { 5791 mutex_exit(&spa_namespace_lock); 5792 return (NULL); 5793 } 5794 spa->spa_inject_ref++; 5795 mutex_exit(&spa_namespace_lock); 5796 5797 return (spa); 5798 } 5799 5800 void 5801 spa_inject_delref(spa_t *spa) 5802 { 5803 mutex_enter(&spa_namespace_lock); 5804 spa->spa_inject_ref--; 5805 mutex_exit(&spa_namespace_lock); 5806 } 5807 5808 /* 5809 * Add spares device information to the nvlist. 5810 */ 5811 static void 5812 spa_add_spares(spa_t *spa, nvlist_t *config) 5813 { 5814 nvlist_t **spares; 5815 uint_t i, nspares; 5816 nvlist_t *nvroot; 5817 uint64_t guid; 5818 vdev_stat_t *vs; 5819 uint_t vsc; 5820 uint64_t pool; 5821 5822 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 5823 5824 if (spa->spa_spares.sav_count == 0) 5825 return; 5826 5827 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 5828 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 5829 ZPOOL_CONFIG_SPARES, &spares, &nspares)); 5830 if (nspares != 0) { 5831 fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 5832 (const nvlist_t * const *)spares, nspares); 5833 VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 5834 &spares, &nspares)); 5835 5836 /* 5837 * Go through and find any spares which have since been 5838 * repurposed as an active spare. If this is the case, update 5839 * their status appropriately. 5840 */ 5841 for (i = 0; i < nspares; i++) { 5842 guid = fnvlist_lookup_uint64(spares[i], 5843 ZPOOL_CONFIG_GUID); 5844 VERIFY0(nvlist_lookup_uint64_array(spares[i], 5845 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)); 5846 if (spa_spare_exists(guid, &pool, NULL) && 5847 pool != 0ULL) { 5848 vs->vs_state = VDEV_STATE_CANT_OPEN; 5849 vs->vs_aux = VDEV_AUX_SPARED; 5850 } else { 5851 vs->vs_state = 5852 spa->spa_spares.sav_vdevs[i]->vdev_state; 5853 } 5854 } 5855 } 5856 } 5857 5858 /* 5859 * Add l2cache device information to the nvlist, including vdev stats. 
5860 */ 5861 static void 5862 spa_add_l2cache(spa_t *spa, nvlist_t *config) 5863 { 5864 nvlist_t **l2cache; 5865 uint_t i, j, nl2cache; 5866 nvlist_t *nvroot; 5867 uint64_t guid; 5868 vdev_t *vd; 5869 vdev_stat_t *vs; 5870 uint_t vsc; 5871 5872 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 5873 5874 if (spa->spa_l2cache.sav_count == 0) 5875 return; 5876 5877 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 5878 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 5879 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); 5880 if (nl2cache != 0) { 5881 fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 5882 (const nvlist_t * const *)l2cache, nl2cache); 5883 VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 5884 &l2cache, &nl2cache)); 5885 5886 /* 5887 * Update level 2 cache device stats. 5888 */ 5889 5890 for (i = 0; i < nl2cache; i++) { 5891 guid = fnvlist_lookup_uint64(l2cache[i], 5892 ZPOOL_CONFIG_GUID); 5893 5894 vd = NULL; 5895 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 5896 if (guid == 5897 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 5898 vd = spa->spa_l2cache.sav_vdevs[j]; 5899 break; 5900 } 5901 } 5902 ASSERT(vd != NULL); 5903 5904 VERIFY0(nvlist_lookup_uint64_array(l2cache[i], 5905 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)); 5906 vdev_get_stats(vd, vs); 5907 vdev_config_generate_stats(vd, l2cache[i]); 5908 5909 } 5910 } 5911 } 5912 5913 static void 5914 spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features) 5915 { 5916 zap_cursor_t zc; 5917 zap_attribute_t za; 5918 5919 if (spa->spa_feat_for_read_obj != 0) { 5920 for (zap_cursor_init(&zc, spa->spa_meta_objset, 5921 spa->spa_feat_for_read_obj); 5922 zap_cursor_retrieve(&zc, &za) == 0; 5923 zap_cursor_advance(&zc)) { 5924 ASSERT(za.za_integer_length == sizeof (uint64_t) && 5925 za.za_num_integers == 1); 5926 VERIFY0(nvlist_add_uint64(features, za.za_name, 5927 za.za_first_integer)); 5928 } 5929 zap_cursor_fini(&zc); 5930 } 5931 5932 if (spa->spa_feat_for_write_obj != 0) { 5933 for (zap_cursor_init(&zc, spa->spa_meta_objset, 5934 spa->spa_feat_for_write_obj); 5935 zap_cursor_retrieve(&zc, &za) == 0; 5936 zap_cursor_advance(&zc)) { 5937 ASSERT(za.za_integer_length == sizeof (uint64_t) && 5938 za.za_num_integers == 1); 5939 VERIFY0(nvlist_add_uint64(features, za.za_name, 5940 za.za_first_integer)); 5941 } 5942 zap_cursor_fini(&zc); 5943 } 5944 } 5945 5946 static void 5947 spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features) 5948 { 5949 int i; 5950 5951 for (i = 0; i < SPA_FEATURES; i++) { 5952 zfeature_info_t feature = spa_feature_table[i]; 5953 uint64_t refcount; 5954 5955 if (feature_get_refcount(spa, &feature, &refcount) != 0) 5956 continue; 5957 5958 VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount)); 5959 } 5960 } 5961 5962 /* 5963 * Store a list of pool features and their reference counts in the 5964 * config. 5965 * 5966 * The first time this is called on a spa, allocate a new nvlist, fetch 5967 * the pool features and reference counts from disk, then save the list 5968 * in the spa. In subsequent calls on the same spa use the saved nvlist 5969 * and refresh its values from the cached reference counts. This 5970 * ensures we don't block here on I/O on a suspended pool so 'zpool 5971 * clear' can resume the pool. 
5972 */ 5973 static void 5974 spa_add_feature_stats(spa_t *spa, nvlist_t *config) 5975 { 5976 nvlist_t *features; 5977 5978 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 5979 5980 mutex_enter(&spa->spa_feat_stats_lock); 5981 features = spa->spa_feat_stats; 5982 5983 if (features != NULL) { 5984 spa_feature_stats_from_cache(spa, features); 5985 } else { 5986 VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP)); 5987 spa->spa_feat_stats = features; 5988 spa_feature_stats_from_disk(spa, features); 5989 } 5990 5991 VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 5992 features)); 5993 5994 mutex_exit(&spa->spa_feat_stats_lock); 5995 } 5996 5997 int 5998 spa_get_stats(const char *name, nvlist_t **config, 5999 char *altroot, size_t buflen) 6000 { 6001 int error; 6002 spa_t *spa; 6003 6004 *config = NULL; 6005 error = spa_open_common(name, &spa, FTAG, NULL, config); 6006 6007 if (spa != NULL) { 6008 /* 6009 * This still leaves a window of inconsistency where the spares 6010 * or l2cache devices could change and the config would be 6011 * self-inconsistent. 6012 */ 6013 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6014 6015 if (*config != NULL) { 6016 uint64_t loadtimes[2]; 6017 6018 loadtimes[0] = spa->spa_loaded_ts.tv_sec; 6019 loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 6020 fnvlist_add_uint64_array(*config, 6021 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2); 6022 6023 fnvlist_add_uint64(*config, 6024 ZPOOL_CONFIG_ERRCOUNT, 6025 spa_approx_errlog_size(spa)); 6026 6027 if (spa_suspended(spa)) { 6028 fnvlist_add_uint64(*config, 6029 ZPOOL_CONFIG_SUSPENDED, 6030 spa->spa_failmode); 6031 fnvlist_add_uint64(*config, 6032 ZPOOL_CONFIG_SUSPENDED_REASON, 6033 spa->spa_suspended); 6034 } 6035 6036 spa_add_spares(spa, *config); 6037 spa_add_l2cache(spa, *config); 6038 spa_add_feature_stats(spa, *config); 6039 } 6040 } 6041 6042 /* 6043 * We want to get the alternate root even for faulted pools, so we cheat 6044 * and call spa_lookup() directly. 6045 */ 6046 if (altroot) { 6047 if (spa == NULL) { 6048 mutex_enter(&spa_namespace_lock); 6049 spa = spa_lookup(name); 6050 if (spa) 6051 spa_altroot(spa, altroot, buflen); 6052 else 6053 altroot[0] = '\0'; 6054 spa = NULL; 6055 mutex_exit(&spa_namespace_lock); 6056 } else { 6057 spa_altroot(spa, altroot, buflen); 6058 } 6059 } 6060 6061 if (spa != NULL) { 6062 spa_config_exit(spa, SCL_CONFIG, FTAG); 6063 spa_close(spa, FTAG); 6064 } 6065 6066 return (error); 6067 } 6068 6069 /* 6070 * Validate that the auxiliary device array is well formed. We must have an 6071 * array of nvlists, each which describes a valid leaf vdev. If this is an 6072 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 6073 * specified, as long as they are well-formed. 6074 */ 6075 static int 6076 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 6077 spa_aux_vdev_t *sav, const char *config, uint64_t version, 6078 vdev_labeltype_t label) 6079 { 6080 nvlist_t **dev; 6081 uint_t i, ndev; 6082 vdev_t *vd; 6083 int error; 6084 6085 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 6086 6087 /* 6088 * It's acceptable to have no devs specified. 6089 */ 6090 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 6091 return (0); 6092 6093 if (ndev == 0) 6094 return (SET_ERROR(EINVAL)); 6095 6096 /* 6097 * Make sure the pool is formatted with a version that supports this 6098 * device type. 
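 * (As used by spa_validate_aux() below: hot spares require at least SPA_VERSION_SPARES and level 2 cache devices at least SPA_VERSION_L2CACHE.)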
6099 */ 6100 if (spa_version(spa) < version) 6101 return (SET_ERROR(ENOTSUP)); 6102 6103 /* 6104 * Set the pending device list so we correctly handle device in-use 6105 * checking. 6106 */ 6107 sav->sav_pending = dev; 6108 sav->sav_npending = ndev; 6109 6110 for (i = 0; i < ndev; i++) { 6111 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 6112 mode)) != 0) 6113 goto out; 6114 6115 if (!vd->vdev_ops->vdev_op_leaf) { 6116 vdev_free(vd); 6117 error = SET_ERROR(EINVAL); 6118 goto out; 6119 } 6120 6121 vd->vdev_top = vd; 6122 6123 if ((error = vdev_open(vd)) == 0 && 6124 (error = vdev_label_init(vd, crtxg, label)) == 0) { 6125 fnvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 6126 vd->vdev_guid); 6127 } 6128 6129 vdev_free(vd); 6130 6131 if (error && 6132 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 6133 goto out; 6134 else 6135 error = 0; 6136 } 6137 6138 out: 6139 sav->sav_pending = NULL; 6140 sav->sav_npending = 0; 6141 return (error); 6142 } 6143 6144 static int 6145 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 6146 { 6147 int error; 6148 6149 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 6150 6151 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 6152 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 6153 VDEV_LABEL_SPARE)) != 0) { 6154 return (error); 6155 } 6156 6157 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 6158 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 6159 VDEV_LABEL_L2CACHE)); 6160 } 6161 6162 static void 6163 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 6164 const char *config) 6165 { 6166 int i; 6167 6168 if (sav->sav_config != NULL) { 6169 nvlist_t **olddevs; 6170 uint_t oldndevs; 6171 nvlist_t **newdevs; 6172 6173 /* 6174 * Generate new dev list by concatenating with the 6175 * current dev list. 6176 */ 6177 VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, config, 6178 &olddevs, &oldndevs)); 6179 6180 newdevs = kmem_alloc(sizeof (void *) * 6181 (ndevs + oldndevs), KM_SLEEP); 6182 for (i = 0; i < oldndevs; i++) 6183 newdevs[i] = fnvlist_dup(olddevs[i]); 6184 for (i = 0; i < ndevs; i++) 6185 newdevs[i + oldndevs] = fnvlist_dup(devs[i]); 6186 6187 fnvlist_remove(sav->sav_config, config); 6188 6189 fnvlist_add_nvlist_array(sav->sav_config, config, 6190 (const nvlist_t * const *)newdevs, ndevs + oldndevs); 6191 for (i = 0; i < oldndevs + ndevs; i++) 6192 nvlist_free(newdevs[i]); 6193 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 6194 } else { 6195 /* 6196 * Generate a new dev list. 6197 */ 6198 sav->sav_config = fnvlist_alloc(); 6199 fnvlist_add_nvlist_array(sav->sav_config, config, 6200 (const nvlist_t * const *)devs, ndevs); 6201 } 6202 } 6203 6204 /* 6205 * Stop and drop level 2 ARC devices 6206 */ 6207 void 6208 spa_l2cache_drop(spa_t *spa) 6209 { 6210 vdev_t *vd; 6211 int i; 6212 spa_aux_vdev_t *sav = &spa->spa_l2cache; 6213 6214 for (i = 0; i < sav->sav_count; i++) { 6215 uint64_t pool; 6216 6217 vd = sav->sav_vdevs[i]; 6218 ASSERT(vd != NULL); 6219 6220 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 6221 pool != 0ULL && l2arc_vdev_present(vd)) 6222 l2arc_remove_vdev(vd); 6223 } 6224 } 6225 6226 /* 6227 * Verify encryption parameters for spa creation. If we are encrypting, we must 6228 * have the encryption feature flag enabled. 
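 * (Illustrative example: a create request that asks for a crypt setting other than "off" or "inherit" while feature@encryption was left disabled, e.g. via 'zpool create -d', is rejected below with ENOTSUP.)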
6229 */ 6230 static int 6231 spa_create_check_encryption_params(dsl_crypto_params_t *dcp, 6232 boolean_t has_encryption) 6233 { 6234 if (dcp->cp_crypt != ZIO_CRYPT_OFF && 6235 dcp->cp_crypt != ZIO_CRYPT_INHERIT && 6236 !has_encryption) 6237 return (SET_ERROR(ENOTSUP)); 6238 6239 return (dmu_objset_create_crypt_check(NULL, dcp, NULL)); 6240 } 6241 6242 /* 6243 * Pool Creation 6244 */ 6245 int 6246 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 6247 nvlist_t *zplprops, dsl_crypto_params_t *dcp) 6248 { 6249 spa_t *spa; 6250 const char *altroot = NULL; 6251 vdev_t *rvd; 6252 dsl_pool_t *dp; 6253 dmu_tx_t *tx; 6254 int error = 0; 6255 uint64_t txg = TXG_INITIAL; 6256 nvlist_t **spares, **l2cache; 6257 uint_t nspares, nl2cache; 6258 uint64_t version, obj, ndraid = 0; 6259 boolean_t has_features; 6260 boolean_t has_encryption; 6261 boolean_t has_allocclass; 6262 spa_feature_t feat; 6263 const char *feat_name; 6264 const char *poolname; 6265 nvlist_t *nvl; 6266 6267 if (props == NULL || 6268 nvlist_lookup_string(props, "tname", &poolname) != 0) 6269 poolname = (char *)pool; 6270 6271 /* 6272 * If this pool already exists, return failure. 6273 */ 6274 mutex_enter(&spa_namespace_lock); 6275 if (spa_lookup(poolname) != NULL) { 6276 mutex_exit(&spa_namespace_lock); 6277 return (SET_ERROR(EEXIST)); 6278 } 6279 6280 /* 6281 * Allocate a new spa_t structure. 6282 */ 6283 nvl = fnvlist_alloc(); 6284 fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool); 6285 (void) nvlist_lookup_string(props, 6286 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 6287 spa = spa_add(poolname, nvl, altroot); 6288 fnvlist_free(nvl); 6289 spa_activate(spa, spa_mode_global); 6290 6291 if (props && (error = spa_prop_validate(spa, props))) { 6292 spa_deactivate(spa); 6293 spa_remove(spa); 6294 mutex_exit(&spa_namespace_lock); 6295 return (error); 6296 } 6297 6298 /* 6299 * Temporary pool names should never be written to disk. 
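 * (A temporary name is supplied via the "tname" property looked up above, e.g. by 'zpool create -t <tmpname>'; only the permanent name passed in 'pool' ends up in the on-disk config.)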
6300 */ 6301 if (poolname != pool) 6302 spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME; 6303 6304 has_features = B_FALSE; 6305 has_encryption = B_FALSE; 6306 has_allocclass = B_FALSE; 6307 for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 6308 elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 6309 if (zpool_prop_feature(nvpair_name(elem))) { 6310 has_features = B_TRUE; 6311 6312 feat_name = strchr(nvpair_name(elem), '@') + 1; 6313 VERIFY0(zfeature_lookup_name(feat_name, &feat)); 6314 if (feat == SPA_FEATURE_ENCRYPTION) 6315 has_encryption = B_TRUE; 6316 if (feat == SPA_FEATURE_ALLOCATION_CLASSES) 6317 has_allocclass = B_TRUE; 6318 } 6319 } 6320 6321 /* verify encryption params, if they were provided */ 6322 if (dcp != NULL) { 6323 error = spa_create_check_encryption_params(dcp, has_encryption); 6324 if (error != 0) { 6325 spa_deactivate(spa); 6326 spa_remove(spa); 6327 mutex_exit(&spa_namespace_lock); 6328 return (error); 6329 } 6330 } 6331 if (!has_allocclass && zfs_special_devs(nvroot, NULL)) { 6332 spa_deactivate(spa); 6333 spa_remove(spa); 6334 mutex_exit(&spa_namespace_lock); 6335 return (ENOTSUP); 6336 } 6337 6338 if (has_features || nvlist_lookup_uint64(props, 6339 zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 6340 version = SPA_VERSION; 6341 } 6342 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 6343 6344 spa->spa_first_txg = txg; 6345 spa->spa_uberblock.ub_txg = txg - 1; 6346 spa->spa_uberblock.ub_version = version; 6347 spa->spa_ubsync = spa->spa_uberblock; 6348 spa->spa_load_state = SPA_LOAD_CREATE; 6349 spa->spa_removing_phys.sr_state = DSS_NONE; 6350 spa->spa_removing_phys.sr_removing_vdev = -1; 6351 spa->spa_removing_phys.sr_prev_indirect_vdev = -1; 6352 spa->spa_indirect_vdevs_loaded = B_TRUE; 6353 6354 /* 6355 * Create "The Godfather" zio to hold all async IOs 6356 */ 6357 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 6358 KM_SLEEP); 6359 for (int i = 0; i < max_ncpus; i++) { 6360 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 6361 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 6362 ZIO_FLAG_GODFATHER); 6363 } 6364 6365 /* 6366 * Create the root vdev. 6367 */ 6368 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6369 6370 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 6371 6372 ASSERT(error != 0 || rvd != NULL); 6373 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 6374 6375 if (error == 0 && !zfs_allocatable_devs(nvroot)) 6376 error = SET_ERROR(EINVAL); 6377 6378 if (error == 0 && 6379 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 6380 (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 && 6381 (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { 6382 /* 6383 * instantiate the metaslab groups (this will dirty the vdevs) 6384 * we can no longer error exit past this point 6385 */ 6386 for (int c = 0; error == 0 && c < rvd->vdev_children; c++) { 6387 vdev_t *vd = rvd->vdev_child[c]; 6388 6389 vdev_metaslab_set_size(vd); 6390 vdev_expand(vd, txg); 6391 } 6392 } 6393 6394 spa_config_exit(spa, SCL_ALL, FTAG); 6395 6396 if (error != 0) { 6397 spa_unload(spa); 6398 spa_deactivate(spa); 6399 spa_remove(spa); 6400 mutex_exit(&spa_namespace_lock); 6401 return (error); 6402 } 6403 6404 /* 6405 * Get the list of spares, if specified. 
6406 */ 6407 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 6408 &spares, &nspares) == 0) { 6409 spa->spa_spares.sav_config = fnvlist_alloc(); 6410 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 6411 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, 6412 nspares); 6413 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6414 spa_load_spares(spa); 6415 spa_config_exit(spa, SCL_ALL, FTAG); 6416 spa->spa_spares.sav_sync = B_TRUE; 6417 } 6418 6419 /* 6420 * Get the list of level 2 cache devices, if specified. 6421 */ 6422 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 6423 &l2cache, &nl2cache) == 0) { 6424 VERIFY0(nvlist_alloc(&spa->spa_l2cache.sav_config, 6425 NV_UNIQUE_NAME, KM_SLEEP)); 6426 fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 6427 ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, 6428 nl2cache); 6429 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6430 spa_load_l2cache(spa); 6431 spa_config_exit(spa, SCL_ALL, FTAG); 6432 spa->spa_l2cache.sav_sync = B_TRUE; 6433 } 6434 6435 spa->spa_is_initializing = B_TRUE; 6436 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, dcp, txg); 6437 spa->spa_is_initializing = B_FALSE; 6438 6439 /* 6440 * Create DDTs (dedup tables). 6441 */ 6442 ddt_create(spa); 6443 /* 6444 * Create BRT table and BRT table object. 6445 */ 6446 brt_create(spa); 6447 6448 spa_update_dspace(spa); 6449 6450 tx = dmu_tx_create_assigned(dp, txg); 6451 6452 /* 6453 * Create the pool's history object. 6454 */ 6455 if (version >= SPA_VERSION_ZPOOL_HISTORY && !spa->spa_history) 6456 spa_history_create_obj(spa, tx); 6457 6458 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE); 6459 spa_history_log_version(spa, "create", tx); 6460 6461 /* 6462 * Create the pool config object. 6463 */ 6464 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 6465 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 6466 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 6467 6468 if (zap_add(spa->spa_meta_objset, 6469 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 6470 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 6471 cmn_err(CE_PANIC, "failed to add pool config"); 6472 } 6473 6474 if (zap_add(spa->spa_meta_objset, 6475 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 6476 sizeof (uint64_t), 1, &version, tx) != 0) { 6477 cmn_err(CE_PANIC, "failed to add pool version"); 6478 } 6479 6480 /* Newly created pools with the right version are always deflated. */ 6481 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 6482 spa->spa_deflate = TRUE; 6483 if (zap_add(spa->spa_meta_objset, 6484 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 6485 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 6486 cmn_err(CE_PANIC, "failed to add deflate"); 6487 } 6488 } 6489 6490 /* 6491 * Create the deferred-free bpobj. Turn off compression 6492 * because sync-to-convergence takes longer if the blocksize 6493 * keeps changing. 6494 */ 6495 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 6496 dmu_object_set_compress(spa->spa_meta_objset, obj, 6497 ZIO_COMPRESS_OFF, tx); 6498 if (zap_add(spa->spa_meta_objset, 6499 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 6500 sizeof (uint64_t), 1, &obj, tx) != 0) { 6501 cmn_err(CE_PANIC, "failed to add bpobj"); 6502 } 6503 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 6504 spa->spa_meta_objset, obj)); 6505 6506 /* 6507 * Generate some random noise for salted checksums to operate on. 
6508 */ 6509 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 6510 sizeof (spa->spa_cksum_salt.zcs_bytes)); 6511 6512 /* 6513 * Set pool properties. 6514 */ 6515 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 6516 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 6517 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 6518 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 6519 spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST); 6520 spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM); 6521 6522 if (props != NULL) { 6523 spa_configfile_set(spa, props, B_FALSE); 6524 spa_sync_props(props, tx); 6525 } 6526 6527 for (int i = 0; i < ndraid; i++) 6528 spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); 6529 6530 dmu_tx_commit(tx); 6531 6532 spa->spa_sync_on = B_TRUE; 6533 txg_sync_start(dp); 6534 mmp_thread_start(spa); 6535 txg_wait_synced(dp, txg); 6536 6537 spa_spawn_aux_threads(spa); 6538 6539 spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); 6540 6541 /* 6542 * Don't count references from objsets that are already closed 6543 * and are making their way through the eviction process. 6544 */ 6545 spa_evicting_os_wait(spa); 6546 spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); 6547 spa->spa_load_state = SPA_LOAD_NONE; 6548 6549 spa_import_os(spa); 6550 6551 mutex_exit(&spa_namespace_lock); 6552 6553 return (0); 6554 } 6555 6556 /* 6557 * Import a non-root pool into the system. 6558 */ 6559 int 6560 spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 6561 { 6562 spa_t *spa; 6563 const char *altroot = NULL; 6564 spa_load_state_t state = SPA_LOAD_IMPORT; 6565 zpool_load_policy_t policy; 6566 spa_mode_t mode = spa_mode_global; 6567 uint64_t readonly = B_FALSE; 6568 int error; 6569 nvlist_t *nvroot; 6570 nvlist_t **spares, **l2cache; 6571 uint_t nspares, nl2cache; 6572 6573 /* 6574 * If a pool with this name exists, return failure. 6575 */ 6576 mutex_enter(&spa_namespace_lock); 6577 if (spa_lookup(pool) != NULL) { 6578 mutex_exit(&spa_namespace_lock); 6579 return (SET_ERROR(EEXIST)); 6580 } 6581 6582 /* 6583 * Create and initialize the spa structure. 6584 */ 6585 (void) nvlist_lookup_string(props, 6586 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 6587 (void) nvlist_lookup_uint64(props, 6588 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 6589 if (readonly) 6590 mode = SPA_MODE_READ; 6591 spa = spa_add(pool, config, altroot); 6592 spa->spa_import_flags = flags; 6593 6594 /* 6595 * Verbatim import - Take a pool and insert it into the namespace 6596 * as if it had been loaded at boot. 6597 */ 6598 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 6599 if (props != NULL) 6600 spa_configfile_set(spa, props, B_FALSE); 6601 6602 spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); 6603 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); 6604 zfs_dbgmsg("spa_import: verbatim import of %s", pool); 6605 mutex_exit(&spa_namespace_lock); 6606 return (0); 6607 } 6608 6609 spa_activate(spa, mode); 6610 6611 /* 6612 * Don't start async tasks until we know everything is healthy. 
6613 */ 6614 spa_async_suspend(spa); 6615 6616 zpool_get_load_policy(config, &policy); 6617 if (policy.zlp_rewind & ZPOOL_DO_REWIND) 6618 state = SPA_LOAD_RECOVER; 6619 6620 spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT; 6621 6622 if (state != SPA_LOAD_RECOVER) { 6623 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 6624 zfs_dbgmsg("spa_import: importing %s", pool); 6625 } else { 6626 zfs_dbgmsg("spa_import: importing %s, max_txg=%lld " 6627 "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg); 6628 } 6629 error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind); 6630 6631 /* 6632 * Propagate anything learned while loading the pool and pass it 6633 * back to caller (i.e. rewind info, missing devices, etc). 6634 */ 6635 fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info); 6636 6637 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6638 /* 6639 * Toss any existing sparelist, as it doesn't have any validity 6640 * anymore, and conflicts with spa_has_spare(). 6641 */ 6642 if (spa->spa_spares.sav_config) { 6643 nvlist_free(spa->spa_spares.sav_config); 6644 spa->spa_spares.sav_config = NULL; 6645 spa_load_spares(spa); 6646 } 6647 if (spa->spa_l2cache.sav_config) { 6648 nvlist_free(spa->spa_l2cache.sav_config); 6649 spa->spa_l2cache.sav_config = NULL; 6650 spa_load_l2cache(spa); 6651 } 6652 6653 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 6654 spa_config_exit(spa, SCL_ALL, FTAG); 6655 6656 if (props != NULL) 6657 spa_configfile_set(spa, props, B_FALSE); 6658 6659 if (error != 0 || (props && spa_writeable(spa) && 6660 (error = spa_prop_set(spa, props)))) { 6661 spa_unload(spa); 6662 spa_deactivate(spa); 6663 spa_remove(spa); 6664 mutex_exit(&spa_namespace_lock); 6665 return (error); 6666 } 6667 6668 spa_async_resume(spa); 6669 6670 /* 6671 * Override any spares and level 2 cache devices as specified by 6672 * the user, as these may have correct device names/devids, etc. 6673 */ 6674 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 6675 &spares, &nspares) == 0) { 6676 if (spa->spa_spares.sav_config) 6677 fnvlist_remove(spa->spa_spares.sav_config, 6678 ZPOOL_CONFIG_SPARES); 6679 else 6680 spa->spa_spares.sav_config = fnvlist_alloc(); 6681 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 6682 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, 6683 nspares); 6684 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6685 spa_load_spares(spa); 6686 spa_config_exit(spa, SCL_ALL, FTAG); 6687 spa->spa_spares.sav_sync = B_TRUE; 6688 } 6689 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 6690 &l2cache, &nl2cache) == 0) { 6691 if (spa->spa_l2cache.sav_config) 6692 fnvlist_remove(spa->spa_l2cache.sav_config, 6693 ZPOOL_CONFIG_L2CACHE); 6694 else 6695 spa->spa_l2cache.sav_config = fnvlist_alloc(); 6696 fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 6697 ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, 6698 nl2cache); 6699 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6700 spa_load_l2cache(spa); 6701 spa_config_exit(spa, SCL_ALL, FTAG); 6702 spa->spa_l2cache.sav_sync = B_TRUE; 6703 } 6704 6705 /* 6706 * Check for any removed devices. 6707 */ 6708 if (spa->spa_autoreplace) { 6709 spa_aux_check_removed(&spa->spa_spares); 6710 spa_aux_check_removed(&spa->spa_l2cache); 6711 } 6712 6713 if (spa_writeable(spa)) { 6714 /* 6715 * Update the config cache to include the newly-imported pool. 
6716 */ 6717 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 6718 } 6719 6720 /* 6721 * It's possible that the pool was expanded while it was exported. 6722 * We kick off an async task to handle this for us. 6723 */ 6724 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 6725 6726 spa_history_log_version(spa, "import", NULL); 6727 6728 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); 6729 6730 mutex_exit(&spa_namespace_lock); 6731 6732 zvol_create_minors_recursive(pool); 6733 6734 spa_import_os(spa); 6735 6736 return (0); 6737 } 6738 6739 nvlist_t * 6740 spa_tryimport(nvlist_t *tryconfig) 6741 { 6742 nvlist_t *config = NULL; 6743 const char *poolname, *cachefile; 6744 spa_t *spa; 6745 uint64_t state; 6746 int error; 6747 zpool_load_policy_t policy; 6748 6749 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 6750 return (NULL); 6751 6752 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 6753 return (NULL); 6754 6755 /* 6756 * Create and initialize the spa structure. 6757 */ 6758 mutex_enter(&spa_namespace_lock); 6759 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 6760 spa_activate(spa, SPA_MODE_READ); 6761 6762 /* 6763 * Rewind pool if a max txg was provided. 6764 */ 6765 zpool_get_load_policy(spa->spa_config, &policy); 6766 if (policy.zlp_txg != UINT64_MAX) { 6767 spa->spa_load_max_txg = policy.zlp_txg; 6768 spa->spa_extreme_rewind = B_TRUE; 6769 zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld", 6770 poolname, (longlong_t)policy.zlp_txg); 6771 } else { 6772 zfs_dbgmsg("spa_tryimport: importing %s", poolname); 6773 } 6774 6775 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile) 6776 == 0) { 6777 zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile); 6778 spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; 6779 } else { 6780 spa->spa_config_source = SPA_CONFIG_SRC_SCAN; 6781 } 6782 6783 /* 6784 * spa_import() relies on the pool config fetched by spa_tryimport() 6785 * for spare/cache devices. Import flags are not passed to 6786 * spa_tryimport(), so a missing log device would make it return early 6787 * and the spare and cache devices would never be retrieved. 6788 * Passing ZFS_IMPORT_MISSING_LOG to spa_tryimport() makes it fetch 6789 * the correct configuration regardless of the missing log device. 6790 */ 6791 spa->spa_import_flags |= ZFS_IMPORT_MISSING_LOG; 6792 6793 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING); 6794 6795 /* 6796 * If 'tryconfig' was at least parsable, return the current config. 6797 */ 6798 if (spa->spa_root_vdev != NULL) { 6799 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 6800 fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, poolname); 6801 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state); 6802 fnvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 6803 spa->spa_uberblock.ub_timestamp); 6804 fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 6805 spa->spa_load_info); 6806 fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, 6807 spa->spa_errata); 6808 6809 /* 6810 * If the bootfs property exists on this pool then we 6811 * copy it out so that external consumers can tell which 6812 * pools are bootable. 6813 */ 6814 if ((!error || error == EEXIST) && spa->spa_bootfs) { 6815 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 6816 6817 /* 6818 * We have to play games with the name since the 6819 * pool was opened as TRYIMPORT_NAME.
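 * Illustrative example: if the bootfs resolves to "<TRYIMPORT_NAME>/ROOT/default", the code below rewrites it as "<poolname>/ROOT/default" before adding it to the returned config.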
6820 */ 6821 if (dsl_dsobj_to_dsname(spa_name(spa), 6822 spa->spa_bootfs, tmpname) == 0) { 6823 char *cp; 6824 char *dsname; 6825 6826 dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 6827 6828 cp = strchr(tmpname, '/'); 6829 if (cp == NULL) { 6830 (void) strlcpy(dsname, tmpname, 6831 MAXPATHLEN); 6832 } else { 6833 (void) snprintf(dsname, MAXPATHLEN, 6834 "%s/%s", poolname, ++cp); 6835 } 6836 fnvlist_add_string(config, ZPOOL_CONFIG_BOOTFS, 6837 dsname); 6838 kmem_free(dsname, MAXPATHLEN); 6839 } 6840 kmem_free(tmpname, MAXPATHLEN); 6841 } 6842 6843 /* 6844 * Add the list of hot spares and level 2 cache devices. 6845 */ 6846 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6847 spa_add_spares(spa, config); 6848 spa_add_l2cache(spa, config); 6849 spa_config_exit(spa, SCL_CONFIG, FTAG); 6850 } 6851 6852 spa_unload(spa); 6853 spa_deactivate(spa); 6854 spa_remove(spa); 6855 mutex_exit(&spa_namespace_lock); 6856 6857 return (config); 6858 } 6859 6860 /* 6861 * Pool export/destroy 6862 * 6863 * The act of destroying or exporting a pool is very simple. We make sure there 6864 * is no more pending I/O and any references to the pool are gone. Then, we 6865 * update the pool state and sync all the labels to disk, removing the 6866 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 6867 * we don't sync the labels or remove the configuration cache. 6868 */ 6869 static int 6870 spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, 6871 boolean_t force, boolean_t hardforce) 6872 { 6873 int error; 6874 spa_t *spa; 6875 6876 if (oldconfig) 6877 *oldconfig = NULL; 6878 6879 if (!(spa_mode_global & SPA_MODE_WRITE)) 6880 return (SET_ERROR(EROFS)); 6881 6882 mutex_enter(&spa_namespace_lock); 6883 if ((spa = spa_lookup(pool)) == NULL) { 6884 mutex_exit(&spa_namespace_lock); 6885 return (SET_ERROR(ENOENT)); 6886 } 6887 6888 if (spa->spa_is_exporting) { 6889 /* the pool is being exported by another thread */ 6890 mutex_exit(&spa_namespace_lock); 6891 return (SET_ERROR(ZFS_ERR_EXPORT_IN_PROGRESS)); 6892 } 6893 spa->spa_is_exporting = B_TRUE; 6894 6895 /* 6896 * Put a hold on the pool, drop the namespace lock, stop async tasks, 6897 * reacquire the namespace lock, and see if we can export. 6898 */ 6899 spa_open_ref(spa, FTAG); 6900 mutex_exit(&spa_namespace_lock); 6901 spa_async_suspend(spa); 6902 if (spa->spa_zvol_taskq) { 6903 zvol_remove_minors(spa, spa_name(spa), B_TRUE); 6904 taskq_wait(spa->spa_zvol_taskq); 6905 } 6906 mutex_enter(&spa_namespace_lock); 6907 spa_close(spa, FTAG); 6908 6909 if (spa->spa_state == POOL_STATE_UNINITIALIZED) 6910 goto export_spa; 6911 /* 6912 * The pool will be in core if it's openable, in which case we can 6913 * modify its state. Objsets may be open only because they're dirty, 6914 * so we have to force it to sync before checking spa_refcnt. 6915 */ 6916 if (spa->spa_sync_on) { 6917 txg_wait_synced(spa->spa_dsl_pool, 0); 6918 spa_evicting_os_wait(spa); 6919 } 6920 6921 /* 6922 * A pool cannot be exported or destroyed if there are active 6923 * references. If we are resetting a pool, allow references by 6924 * fault injection handlers. 6925 */ 6926 if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0)) { 6927 error = SET_ERROR(EBUSY); 6928 goto fail; 6929 } 6930 6931 if (spa->spa_sync_on) { 6932 vdev_t *rvd = spa->spa_root_vdev; 6933 /* 6934 * A pool cannot be exported if it has an active shared spare. 6935 * This is to prevent other pools stealing the active spare 6936 * from an exported pool. 
The user can, however, still 6937 * force the export. 6938 */ 6939 if (!force && new_state == POOL_STATE_EXPORTED && 6940 spa_has_active_shared_spare(spa)) { 6941 error = SET_ERROR(EXDEV); 6942 goto fail; 6943 } 6944 6945 /* 6946 * We're about to export or destroy this pool. Make sure 6947 * we stop all initialization and trim activity here before 6948 * we set the spa_final_txg. This will ensure that all 6949 * dirty data resulting from the initialization is 6950 * committed to disk before we unload the pool. 6951 */ 6952 vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE); 6953 vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE); 6954 vdev_autotrim_stop_all(spa); 6955 vdev_rebuild_stop_all(spa); 6956 6957 /* 6958 * We want this to be reflected on every label, 6959 * so mark them all dirty. spa_unload() will do the 6960 * final sync that pushes these changes out. 6961 */ 6962 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 6963 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6964 spa->spa_state = new_state; 6965 vdev_config_dirty(rvd); 6966 spa_config_exit(spa, SCL_ALL, FTAG); 6967 } 6968 6969 /* 6970 * If the log space map feature is enabled and the pool is 6971 * getting exported (but not destroyed), we want to spend some 6972 * time flushing as many metaslabs as we can in an attempt to 6973 * destroy log space maps and save import time. This has to be 6974 * done before we set the spa_final_txg, otherwise 6975 * spa_sync() -> spa_flush_metaslabs() may dirty the final TXGs. 6976 * spa_should_flush_logs_on_unload() should be called after 6977 * spa_state has been set to the new_state. 6978 */ 6979 if (spa_should_flush_logs_on_unload(spa)) 6980 spa_unload_log_sm_flush_all(spa); 6981 6982 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 6983 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6984 spa->spa_final_txg = spa_last_synced_txg(spa) + 6985 TXG_DEFER_SIZE + 1; 6986 spa_config_exit(spa, SCL_ALL, FTAG); 6987 } 6988 } 6989 6990 export_spa: 6991 spa_export_os(spa); 6992 6993 if (new_state == POOL_STATE_DESTROYED) 6994 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY); 6995 else if (new_state == POOL_STATE_EXPORTED) 6996 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_EXPORT); 6997 6998 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 6999 spa_unload(spa); 7000 spa_deactivate(spa); 7001 } 7002 7003 if (oldconfig && spa->spa_config) 7004 *oldconfig = fnvlist_dup(spa->spa_config); 7005 7006 if (new_state != POOL_STATE_UNINITIALIZED) { 7007 if (!hardforce) 7008 spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE); 7009 spa_remove(spa); 7010 } else { 7011 /* 7012 * If spa_remove() is not called for this spa_t and 7013 * there is any possibility that it can be reused, 7014 * we make sure to reset the exporting flag. 7015 */ 7016 spa->spa_is_exporting = B_FALSE; 7017 } 7018 7019 mutex_exit(&spa_namespace_lock); 7020 return (0); 7021 7022 fail: 7023 spa->spa_is_exporting = B_FALSE; 7024 spa_async_resume(spa); 7025 mutex_exit(&spa_namespace_lock); 7026 return (error); 7027 } 7028 7029 /* 7030 * Destroy a storage pool. 7031 */ 7032 int 7033 spa_destroy(const char *pool) 7034 { 7035 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 7036 B_FALSE, B_FALSE)); 7037 } 7038 7039 /* 7040 * Export a storage pool.
7041 */ 7042 int 7043 spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force, 7044 boolean_t hardforce) 7045 { 7046 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 7047 force, hardforce)); 7048 } 7049 7050 /* 7051 * Similar to spa_export(), this unloads the spa_t without actually removing it 7052 * from the namespace in any way. 7053 */ 7054 int 7055 spa_reset(const char *pool) 7056 { 7057 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 7058 B_FALSE, B_FALSE)); 7059 } 7060 7061 /* 7062 * ========================================================================== 7063 * Device manipulation 7064 * ========================================================================== 7065 */ 7066 7067 /* 7068 * This is called as a synctask to increment the draid feature flag 7069 */ 7070 static void 7071 spa_draid_feature_incr(void *arg, dmu_tx_t *tx) 7072 { 7073 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 7074 int draid = (int)(uintptr_t)arg; 7075 7076 for (int c = 0; c < draid; c++) 7077 spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); 7078 } 7079 7080 /* 7081 * Add a device to a storage pool. 7082 */ 7083 int 7084 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 7085 { 7086 uint64_t txg, ndraid = 0; 7087 int error; 7088 vdev_t *rvd = spa->spa_root_vdev; 7089 vdev_t *vd, *tvd; 7090 nvlist_t **spares, **l2cache; 7091 uint_t nspares, nl2cache; 7092 7093 ASSERT(spa_writeable(spa)); 7094 7095 txg = spa_vdev_enter(spa); 7096 7097 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 7098 VDEV_ALLOC_ADD)) != 0) 7099 return (spa_vdev_exit(spa, NULL, txg, error)); 7100 7101 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 7102 7103 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 7104 &nspares) != 0) 7105 nspares = 0; 7106 7107 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 7108 &nl2cache) != 0) 7109 nl2cache = 0; 7110 7111 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 7112 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 7113 7114 if (vd->vdev_children != 0 && 7115 (error = vdev_create(vd, txg, B_FALSE)) != 0) { 7116 return (spa_vdev_exit(spa, vd, txg, error)); 7117 } 7118 7119 /* 7120 * The virtual dRAID spares must be added after vdev tree is created 7121 * and the vdev guids are generated. The guid of their associated 7122 * dRAID is stored in the config and used when opening the spare. 7123 */ 7124 if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid, 7125 rvd->vdev_children)) == 0) { 7126 if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot, 7127 ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) 7128 nspares = 0; 7129 } else { 7130 return (spa_vdev_exit(spa, vd, txg, error)); 7131 } 7132 7133 /* 7134 * We must validate the spares and l2cache devices after checking the 7135 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 7136 */ 7137 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 7138 return (spa_vdev_exit(spa, vd, txg, error)); 7139 7140 /* 7141 * If we are in the middle of a device removal, we can only add 7142 * devices which match the existing devices in the pool. 7143 * If we are in the middle of a removal, or have some indirect 7144 * vdevs, we can not add raidz or dRAID top levels. 
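 * Illustrative example: while a removal is active on an ashift=9 pool, adding an ashift=12 top-level vdev is rejected below with EINVAL, as is adding any raidz or dRAID top level while indirect vdevs remain.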
7145 */ 7146 if (spa->spa_vdev_removal != NULL || 7147 spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { 7148 for (int c = 0; c < vd->vdev_children; c++) { 7149 tvd = vd->vdev_child[c]; 7150 if (spa->spa_vdev_removal != NULL && 7151 tvd->vdev_ashift != spa->spa_max_ashift) { 7152 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 7153 } 7154 /* Fail if top level vdev is raidz or a dRAID */ 7155 if (vdev_get_nparity(tvd) != 0) 7156 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 7157 7158 /* 7159 * Need the top level mirror to be 7160 * a mirror of leaf vdevs only 7161 */ 7162 if (tvd->vdev_ops == &vdev_mirror_ops) { 7163 for (uint64_t cid = 0; 7164 cid < tvd->vdev_children; cid++) { 7165 vdev_t *cvd = tvd->vdev_child[cid]; 7166 if (!cvd->vdev_ops->vdev_op_leaf) { 7167 return (spa_vdev_exit(spa, vd, 7168 txg, EINVAL)); 7169 } 7170 } 7171 } 7172 } 7173 } 7174 7175 for (int c = 0; c < vd->vdev_children; c++) { 7176 tvd = vd->vdev_child[c]; 7177 vdev_remove_child(vd, tvd); 7178 tvd->vdev_id = rvd->vdev_children; 7179 vdev_add_child(rvd, tvd); 7180 vdev_config_dirty(tvd); 7181 } 7182 7183 if (nspares != 0) { 7184 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 7185 ZPOOL_CONFIG_SPARES); 7186 spa_load_spares(spa); 7187 spa->spa_spares.sav_sync = B_TRUE; 7188 } 7189 7190 if (nl2cache != 0) { 7191 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 7192 ZPOOL_CONFIG_L2CACHE); 7193 spa_load_l2cache(spa); 7194 spa->spa_l2cache.sav_sync = B_TRUE; 7195 } 7196 7197 /* 7198 * We can't increment a feature while holding spa_vdev so we 7199 * have to do it in a synctask. 7200 */ 7201 if (ndraid != 0) { 7202 dmu_tx_t *tx; 7203 7204 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 7205 dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr, 7206 (void *)(uintptr_t)ndraid, tx); 7207 dmu_tx_commit(tx); 7208 } 7209 7210 /* 7211 * We have to be careful when adding new vdevs to an existing pool. 7212 * If other threads start allocating from these vdevs before we 7213 * sync the config cache, and we lose power, then upon reboot we may 7214 * fail to open the pool because there are DVAs that the config cache 7215 * can't translate. Therefore, we first add the vdevs without 7216 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 7217 * and then let spa_config_update() initialize the new metaslabs. 7218 * 7219 * spa_load() checks for added-but-not-initialized vdevs, so that 7220 * if we lose power at any point in this sequence, the remaining 7221 * steps will be completed the next time we load the pool. 7222 */ 7223 (void) spa_vdev_exit(spa, vd, txg, 0); 7224 7225 mutex_enter(&spa_namespace_lock); 7226 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 7227 spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD); 7228 mutex_exit(&spa_namespace_lock); 7229 7230 return (0); 7231 } 7232 7233 /* 7234 * Attach a device to a vdev specified by its guid. The vdev type can be 7235 * a mirror, a raidz, or a leaf device that is also a top-level (e.g. a 7236 * single device). When the vdev is a single device, a mirror vdev will be 7237 * automatically inserted. 
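 * (For example, attaching a new disk to a single-disk top-level vdev with 'zpool attach <pool> <existing-disk> <new-disk>' turns that vdev into a two-way mirror.)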
7238 * 7239 * If 'replacing' is specified, the new device is intended to replace the 7240 * existing device; in this case the two devices are made into their own 7241 * mirror using the 'replacing' vdev, which is functionally identical to 7242 * the mirror vdev (it actually reuses all the same ops) but has a few 7243 * extra rules: you can't attach to it after it's been created, and upon 7244 * completion of resilvering, the first disk (the one being replaced) 7245 * is automatically detached. 7246 * 7247 * If 'rebuild' is specified, then sequential reconstruction (a.k.a. rebuild) 7248 * should be performed instead of traditional healing reconstruction. From 7249 * an administrator's perspective these are both resilver operations. 7250 */ 7251 int 7252 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, 7253 int rebuild) 7254 { 7255 uint64_t txg, dtl_max_txg; 7256 vdev_t *rvd = spa->spa_root_vdev; 7257 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 7258 vdev_ops_t *pvops; 7259 char *oldvdpath, *newvdpath; 7260 int newvd_isspare = B_FALSE; 7261 int error; 7262 7263 ASSERT(spa_writeable(spa)); 7264 7265 txg = spa_vdev_enter(spa); 7266 7267 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 7268 7269 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 7270 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 7271 error = (spa_has_checkpoint(spa)) ? 7272 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 7273 return (spa_vdev_exit(spa, NULL, txg, error)); 7274 } 7275 7276 if (rebuild) { 7277 if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) 7278 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7279 7280 if (dsl_scan_resilvering(spa_get_dsl(spa)) || 7281 dsl_scan_resilver_scheduled(spa_get_dsl(spa))) { 7282 return (spa_vdev_exit(spa, NULL, txg, 7283 ZFS_ERR_RESILVER_IN_PROGRESS)); 7284 } 7285 } else { 7286 if (vdev_rebuild_active(rvd)) 7287 return (spa_vdev_exit(spa, NULL, txg, 7288 ZFS_ERR_REBUILD_IN_PROGRESS)); 7289 } 7290 7291 if (spa->spa_vdev_removal != NULL) { 7292 return (spa_vdev_exit(spa, NULL, txg, 7293 ZFS_ERR_DEVRM_IN_PROGRESS)); 7294 } 7295 7296 if (oldvd == NULL) 7297 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 7298 7299 boolean_t raidz = oldvd->vdev_ops == &vdev_raidz_ops; 7300 7301 if (raidz) { 7302 if (!spa_feature_is_enabled(spa, SPA_FEATURE_RAIDZ_EXPANSION)) 7303 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7304 7305 /* 7306 * Can't expand a raidz while prior expand is in progress. 7307 */ 7308 if (spa->spa_raidz_expand != NULL) { 7309 return (spa_vdev_exit(spa, NULL, txg, 7310 ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS)); 7311 } 7312 } else if (!oldvd->vdev_ops->vdev_op_leaf) { 7313 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7314 } 7315 7316 if (raidz) 7317 pvd = oldvd; 7318 else 7319 pvd = oldvd->vdev_parent; 7320 7321 if (spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 7322 VDEV_ALLOC_ATTACH) != 0) 7323 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 7324 7325 if (newrootvd->vdev_children != 1) 7326 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 7327 7328 newvd = newrootvd->vdev_child[0]; 7329 7330 if (!newvd->vdev_ops->vdev_op_leaf) 7331 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 7332 7333 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 7334 return (spa_vdev_exit(spa, newrootvd, txg, error)); 7335 7336 /* 7337 * log, dedup and special vdevs should not be replaced by spares.
7338 */ 7339 if ((oldvd->vdev_top->vdev_alloc_bias != VDEV_BIAS_NONE || 7340 oldvd->vdev_top->vdev_islog) && newvd->vdev_isspare) { 7341 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7342 } 7343 7344 /* 7345 * A dRAID spare can only replace a child of its parent dRAID vdev. 7346 */ 7347 if (newvd->vdev_ops == &vdev_draid_spare_ops && 7348 oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) { 7349 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7350 } 7351 7352 if (rebuild) { 7353 /* 7354 * For rebuilds, the top vdev must support reconstruction 7355 * using only space maps. This means the only allowable 7356 * vdevs types are the root vdev, a mirror, or dRAID. 7357 */ 7358 tvd = pvd; 7359 if (pvd->vdev_top != NULL) 7360 tvd = pvd->vdev_top; 7361 7362 if (tvd->vdev_ops != &vdev_mirror_ops && 7363 tvd->vdev_ops != &vdev_root_ops && 7364 tvd->vdev_ops != &vdev_draid_ops) { 7365 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7366 } 7367 } 7368 7369 if (!replacing) { 7370 /* 7371 * For attach, the only allowable parent is a mirror or 7372 * the root vdev. A raidz vdev can be attached to, but 7373 * you cannot attach to a raidz child. 7374 */ 7375 if (pvd->vdev_ops != &vdev_mirror_ops && 7376 pvd->vdev_ops != &vdev_root_ops && 7377 !raidz) 7378 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7379 7380 pvops = &vdev_mirror_ops; 7381 } else { 7382 /* 7383 * Active hot spares can only be replaced by inactive hot 7384 * spares. 7385 */ 7386 if (pvd->vdev_ops == &vdev_spare_ops && 7387 oldvd->vdev_isspare && 7388 !spa_has_spare(spa, newvd->vdev_guid)) 7389 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7390 7391 /* 7392 * If the source is a hot spare, and the parent isn't already a 7393 * spare, then we want to create a new hot spare. Otherwise, we 7394 * want to create a replacing vdev. The user is not allowed to 7395 * attach to a spared vdev child unless the 'isspare' state is 7396 * the same (spare replaces spare, non-spare replaces 7397 * non-spare). 7398 */ 7399 if (pvd->vdev_ops == &vdev_replacing_ops && 7400 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 7401 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7402 } else if (pvd->vdev_ops == &vdev_spare_ops && 7403 newvd->vdev_isspare != oldvd->vdev_isspare) { 7404 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7405 } 7406 7407 if (newvd->vdev_isspare) 7408 pvops = &vdev_spare_ops; 7409 else 7410 pvops = &vdev_replacing_ops; 7411 } 7412 7413 /* 7414 * Make sure the new device is big enough. 7415 */ 7416 vdev_t *min_vdev = raidz ? oldvd->vdev_child[0] : oldvd; 7417 if (newvd->vdev_asize < vdev_get_min_asize(min_vdev)) 7418 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 7419 7420 /* 7421 * The new device cannot have a higher alignment requirement 7422 * than the top-level vdev. 7423 */ 7424 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 7425 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7426 7427 /* 7428 * RAIDZ-expansion-specific checks. 
7429 */ 7430 if (raidz) { 7431 if (vdev_raidz_attach_check(newvd) != 0) 7432 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7433 7434 /* 7435 * Fail early if a child is not healthy or being replaced 7436 */ 7437 for (int i = 0; i < oldvd->vdev_children; i++) { 7438 if (vdev_is_dead(oldvd->vdev_child[i]) || 7439 !oldvd->vdev_child[i]->vdev_ops->vdev_op_leaf) { 7440 return (spa_vdev_exit(spa, newrootvd, txg, 7441 ENXIO)); 7442 } 7443 /* Also fail if reserved boot area is in-use */ 7444 if (vdev_check_boot_reserve(spa, oldvd->vdev_child[i]) 7445 != 0) { 7446 return (spa_vdev_exit(spa, newrootvd, txg, 7447 EADDRINUSE)); 7448 } 7449 } 7450 } 7451 7452 if (raidz) { 7453 /* 7454 * Note: oldvdpath is freed by spa_strfree(), but 7455 * kmem_asprintf() is freed by kmem_strfree(), so we have to 7456 * move it to a spa_strdup-ed string. 7457 */ 7458 char *tmp = kmem_asprintf("raidz%u-%u", 7459 (uint_t)vdev_get_nparity(oldvd), (uint_t)oldvd->vdev_id); 7460 oldvdpath = spa_strdup(tmp); 7461 kmem_strfree(tmp); 7462 } else { 7463 oldvdpath = spa_strdup(oldvd->vdev_path); 7464 } 7465 newvdpath = spa_strdup(newvd->vdev_path); 7466 7467 /* 7468 * If this is an in-place replacement, update oldvd's path and devid 7469 * to make it distinguishable from newvd, and unopenable from now on. 7470 */ 7471 if (strcmp(oldvdpath, newvdpath) == 0) { 7472 spa_strfree(oldvd->vdev_path); 7473 oldvd->vdev_path = kmem_alloc(strlen(newvdpath) + 5, 7474 KM_SLEEP); 7475 (void) sprintf(oldvd->vdev_path, "%s/old", 7476 newvdpath); 7477 if (oldvd->vdev_devid != NULL) { 7478 spa_strfree(oldvd->vdev_devid); 7479 oldvd->vdev_devid = NULL; 7480 } 7481 spa_strfree(oldvdpath); 7482 oldvdpath = spa_strdup(oldvd->vdev_path); 7483 } 7484 7485 /* 7486 * If the parent is not a mirror, or if we're replacing, insert the new 7487 * mirror/replacing/spare vdev above oldvd. 7488 */ 7489 if (!raidz && pvd->vdev_ops != pvops) { 7490 pvd = vdev_add_parent(oldvd, pvops); 7491 ASSERT(pvd->vdev_ops == pvops); 7492 ASSERT(oldvd->vdev_parent == pvd); 7493 } 7494 7495 ASSERT(pvd->vdev_top->vdev_parent == rvd); 7496 7497 /* 7498 * Extract the new device from its root and add it to pvd. 7499 */ 7500 vdev_remove_child(newrootvd, newvd); 7501 newvd->vdev_id = pvd->vdev_children; 7502 newvd->vdev_crtxg = oldvd->vdev_crtxg; 7503 vdev_add_child(pvd, newvd); 7504 7505 /* 7506 * Reevaluate the parent vdev state. 7507 */ 7508 vdev_propagate_state(pvd); 7509 7510 tvd = newvd->vdev_top; 7511 ASSERT(pvd->vdev_top == tvd); 7512 ASSERT(tvd->vdev_parent == rvd); 7513 7514 vdev_config_dirty(tvd); 7515 7516 /* 7517 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 7518 * for any dmu_sync-ed blocks. It will propagate upward when 7519 * spa_vdev_exit() calls vdev_dtl_reassess(). 7520 */ 7521 dtl_max_txg = txg + TXG_CONCURRENT_STATES; 7522 7523 if (raidz) { 7524 /* 7525 * Wait for the youngest allocations and frees to sync, 7526 * and then wait for the deferral of those frees to finish. 
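 *
 * A small worked example, assuming TXG_CONCURRENT_STATES is 3 and
 * TXG_DEFER_SIZE is 2: with txg == 100, the spa_vdev_config_exit()
 * below waits for txg 105 to sync before the initialize, trim and
 * autotrim activity is stopped and the config lock is re-entered.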
7527 */ 7528 spa_vdev_config_exit(spa, NULL, 7529 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 7530 7531 vdev_initialize_stop_all(tvd, VDEV_INITIALIZE_ACTIVE); 7532 vdev_trim_stop_all(tvd, VDEV_TRIM_ACTIVE); 7533 vdev_autotrim_stop_wait(tvd); 7534 7535 dtl_max_txg = spa_vdev_config_enter(spa); 7536 7537 tvd->vdev_rz_expanding = B_TRUE; 7538 7539 vdev_dirty_leaves(tvd, VDD_DTL, dtl_max_txg); 7540 vdev_config_dirty(tvd); 7541 7542 dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, 7543 dtl_max_txg); 7544 dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_raidz_attach_sync, 7545 newvd, tx); 7546 dmu_tx_commit(tx); 7547 } else { 7548 vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, 7549 dtl_max_txg - TXG_INITIAL); 7550 7551 if (newvd->vdev_isspare) { 7552 spa_spare_activate(newvd); 7553 spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); 7554 } 7555 7556 newvd_isspare = newvd->vdev_isspare; 7557 7558 /* 7559 * Mark newvd's DTL dirty in this txg. 7560 */ 7561 vdev_dirty(tvd, VDD_DTL, newvd, txg); 7562 7563 /* 7564 * Schedule the resilver or rebuild to restart in the future. 7565 * We do this to ensure that dmu_sync-ed blocks have been 7566 * stitched into the respective datasets. 7567 */ 7568 if (rebuild) { 7569 newvd->vdev_rebuild_txg = txg; 7570 7571 vdev_rebuild(tvd); 7572 } else { 7573 newvd->vdev_resilver_txg = txg; 7574 7575 if (dsl_scan_resilvering(spa_get_dsl(spa)) && 7576 spa_feature_is_enabled(spa, 7577 SPA_FEATURE_RESILVER_DEFER)) { 7578 vdev_defer_resilver(newvd); 7579 } else { 7580 dsl_scan_restart_resilver(spa->spa_dsl_pool, 7581 dtl_max_txg); 7582 } 7583 } 7584 } 7585 7586 if (spa->spa_bootfs) 7587 spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); 7588 7589 spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH); 7590 7591 /* 7592 * Commit the config 7593 */ 7594 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 7595 7596 spa_history_log_internal(spa, "vdev attach", NULL, 7597 "%s vdev=%s %s vdev=%s", 7598 replacing && newvd_isspare ? "spare in" : 7599 replacing ? "replace" : "attach", newvdpath, 7600 replacing ? "for" : "to", oldvdpath); 7601 7602 spa_strfree(oldvdpath); 7603 spa_strfree(newvdpath); 7604 7605 return (0); 7606 } 7607 7608 /* 7609 * Detach a device from a mirror or replacing vdev. 7610 * 7611 * If 'replace_done' is specified, only detach if the parent 7612 * is a replacing or a spare vdev. 7613 */ 7614 int 7615 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 7616 { 7617 uint64_t txg; 7618 int error; 7619 vdev_t *rvd __maybe_unused = spa->spa_root_vdev; 7620 vdev_t *vd, *pvd, *cvd, *tvd; 7621 boolean_t unspare = B_FALSE; 7622 uint64_t unspare_guid = 0; 7623 char *vdpath; 7624 7625 ASSERT(spa_writeable(spa)); 7626 7627 txg = spa_vdev_detach_enter(spa, guid); 7628 7629 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 7630 7631 /* 7632 * Besides being called directly from userland through the 7633 * ioctl interface, spa_vdev_detach() can potentially be called 7634 * at the end of spa_vdev_resilver_done(). 7635 * 7636 * In the regular case, when we have a checkpoint, this shouldn't 7637 * happen as we never empty the DTLs of a vdev during the scrub 7638 * [see comment in dsl_scan_done()]. Thus spa_vdev_resilver_done() 7639 * should never get here when we have a checkpoint. 7640 * 7641 * That said, even in the case where we checkpoint the pool exactly 7642 * as spa_vdev_resilver_done() calls this function, everything 7643 * should be fine as the resilver will return right away.
7644 */ 7645 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 7646 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 7647 error = (spa_has_checkpoint(spa)) ? 7648 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 7649 return (spa_vdev_exit(spa, NULL, txg, error)); 7650 } 7651 7652 if (vd == NULL) 7653 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 7654 7655 if (!vd->vdev_ops->vdev_op_leaf) 7656 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7657 7658 pvd = vd->vdev_parent; 7659 7660 /* 7661 * If the parent/child relationship is not as expected, don't do it. 7662 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 7663 * vdev that's replacing B with C. The user's intent in replacing 7664 * is to go from M(A,B) to M(A,C). If the user decides to cancel 7665 * the replace by detaching C, the expected behavior is to end up 7666 * M(A,B). But suppose that right after deciding to detach C, 7667 * the replacement of B completes. We would have M(A,C), and then 7668 * ask to detach C, which would leave us with just A -- not what 7669 * the user wanted. To prevent this, we make sure that the 7670 * parent/child relationship hasn't changed -- in this example, 7671 * that C's parent is still the replacing vdev R. 7672 */ 7673 if (pvd->vdev_guid != pguid && pguid != 0) 7674 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 7675 7676 /* 7677 * Only 'replacing' or 'spare' vdevs can be replaced. 7678 */ 7679 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 7680 pvd->vdev_ops != &vdev_spare_ops) 7681 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7682 7683 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 7684 spa_version(spa) >= SPA_VERSION_SPARES); 7685 7686 /* 7687 * Only mirror, replacing, and spare vdevs support detach. 7688 */ 7689 if (pvd->vdev_ops != &vdev_replacing_ops && 7690 pvd->vdev_ops != &vdev_mirror_ops && 7691 pvd->vdev_ops != &vdev_spare_ops) 7692 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7693 7694 /* 7695 * If this device has the only valid copy of some data, 7696 * we cannot safely detach it. 7697 */ 7698 if (vdev_dtl_required(vd)) 7699 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 7700 7701 ASSERT(pvd->vdev_children >= 2); 7702 7703 /* 7704 * If we are detaching the second disk from a replacing vdev, then 7705 * check to see if we changed the original vdev's path to have "/old" 7706 * at the end in spa_vdev_attach(). If so, undo that change now. 7707 */ 7708 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 7709 vd->vdev_path != NULL) { 7710 size_t len = strlen(vd->vdev_path); 7711 7712 for (int c = 0; c < pvd->vdev_children; c++) { 7713 cvd = pvd->vdev_child[c]; 7714 7715 if (cvd == vd || cvd->vdev_path == NULL) 7716 continue; 7717 7718 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 7719 strcmp(cvd->vdev_path + len, "/old") == 0) { 7720 spa_strfree(cvd->vdev_path); 7721 cvd->vdev_path = spa_strdup(vd->vdev_path); 7722 break; 7723 } 7724 } 7725 } 7726 7727 /* 7728 * If we are detaching the original disk from a normal spare, then it 7729 * implies that the spare should become a real disk, and be removed 7730 * from the active spare list for the pool. dRAID spares on the 7731 * other hand are coupled to the pool and thus should never be removed 7732 * from the spares list. 
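 *
 * For example (hypothetical layout): detaching disk A from
 * spare(A, sdb), where sdb is a traditional hot spare, promotes sdb
 * to a regular pool disk via the 'unspare' path below, whereas
 * detaching A from spare(A, draid1-0-0) leaves the dRAID distributed
 * spare on the pool's spare list untouched.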
7733 */ 7734 if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) { 7735 vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1]; 7736 7737 if (last_cvd->vdev_isspare && 7738 last_cvd->vdev_ops != &vdev_draid_spare_ops) { 7739 unspare = B_TRUE; 7740 } 7741 } 7742 7743 /* 7744 * Erase the disk labels so the disk can be used for other things. 7745 * This must be done after all other error cases are handled, 7746 * but before we disembowel vd (so we can still do I/O to it). 7747 * But if we can't do it, don't treat the error as fatal -- 7748 * it may be that the unwritability of the disk is the reason 7749 * it's being detached! 7750 */ 7751 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 7752 7753 /* 7754 * Remove vd from its parent and compact the parent's children. 7755 */ 7756 vdev_remove_child(pvd, vd); 7757 vdev_compact_children(pvd); 7758 7759 /* 7760 * Remember one of the remaining children so we can get tvd below. 7761 */ 7762 cvd = pvd->vdev_child[pvd->vdev_children - 1]; 7763 7764 /* 7765 * If we need to remove the remaining child from the list of hot spares, 7766 * do it now, marking the vdev as no longer a spare in the process. 7767 * We must do this before vdev_remove_parent(), because that can 7768 * change the GUID if it creates a new toplevel GUID. For a similar 7769 * reason, we must remove the spare now, in the same txg as the detach; 7770 * otherwise someone could attach a new sibling, change the GUID, and 7771 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 7772 */ 7773 if (unspare) { 7774 ASSERT(cvd->vdev_isspare); 7775 spa_spare_remove(cvd); 7776 unspare_guid = cvd->vdev_guid; 7777 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 7778 cvd->vdev_unspare = B_TRUE; 7779 } 7780 7781 /* 7782 * If the parent mirror/replacing vdev only has one child, 7783 * the parent is no longer needed. Remove it from the tree. 7784 */ 7785 if (pvd->vdev_children == 1) { 7786 if (pvd->vdev_ops == &vdev_spare_ops) 7787 cvd->vdev_unspare = B_FALSE; 7788 vdev_remove_parent(cvd); 7789 } 7790 7791 /* 7792 * We don't set tvd until now because the parent we just removed 7793 * may have been the previous top-level vdev. 7794 */ 7795 tvd = cvd->vdev_top; 7796 ASSERT(tvd->vdev_parent == rvd); 7797 7798 /* 7799 * Reevaluate the parent vdev state. 7800 */ 7801 vdev_propagate_state(cvd); 7802 7803 /* 7804 * If the 'autoexpand' property is set on the pool then automatically 7805 * try to expand the size of the pool. For example if the device we 7806 * just detached was smaller than the others, it may be possible to 7807 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 7808 * first so that we can obtain the updated sizes of the leaf vdevs. 7809 */ 7810 if (spa->spa_autoexpand) { 7811 vdev_reopen(tvd); 7812 vdev_expand(tvd, txg); 7813 } 7814 7815 vdev_config_dirty(tvd); 7816 7817 /* 7818 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 7819 * vd->vdev_detached is set and free vd's DTL object in syncing context. 7820 * But first make sure we're not on any *other* txg's DTL list, to 7821 * prevent vd from being accessed after it's freed. 7822 */ 7823 vdpath = spa_strdup(vd->vdev_path ? 
vd->vdev_path : "none"); 7824 for (int t = 0; t < TXG_SIZE; t++) 7825 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 7826 vd->vdev_detached = B_TRUE; 7827 vdev_dirty(tvd, VDD_DTL, vd, txg); 7828 7829 spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE); 7830 spa_notify_waiters(spa); 7831 7832 /* hang on to the spa before we release the lock */ 7833 spa_open_ref(spa, FTAG); 7834 7835 error = spa_vdev_exit(spa, vd, txg, 0); 7836 7837 spa_history_log_internal(spa, "detach", NULL, 7838 "vdev=%s", vdpath); 7839 spa_strfree(vdpath); 7840 7841 /* 7842 * If this was the removal of the original device in a hot spare vdev, 7843 * then we want to go through and remove the device from the hot spare 7844 * list of every other pool. 7845 */ 7846 if (unspare) { 7847 spa_t *altspa = NULL; 7848 7849 mutex_enter(&spa_namespace_lock); 7850 while ((altspa = spa_next(altspa)) != NULL) { 7851 if (altspa->spa_state != POOL_STATE_ACTIVE || 7852 altspa == spa) 7853 continue; 7854 7855 spa_open_ref(altspa, FTAG); 7856 mutex_exit(&spa_namespace_lock); 7857 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 7858 mutex_enter(&spa_namespace_lock); 7859 spa_close(altspa, FTAG); 7860 } 7861 mutex_exit(&spa_namespace_lock); 7862 7863 /* search the rest of the vdevs for spares to remove */ 7864 spa_vdev_resilver_done(spa); 7865 } 7866 7867 /* all done with the spa; OK to release */ 7868 mutex_enter(&spa_namespace_lock); 7869 spa_close(spa, FTAG); 7870 mutex_exit(&spa_namespace_lock); 7871 7872 return (error); 7873 } 7874 7875 static int 7876 spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, 7877 list_t *vd_list) 7878 { 7879 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 7880 7881 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 7882 7883 /* Look up vdev and ensure it's a leaf. */ 7884 vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); 7885 if (vd == NULL || vd->vdev_detached) { 7886 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7887 return (SET_ERROR(ENODEV)); 7888 } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { 7889 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7890 return (SET_ERROR(EINVAL)); 7891 } else if (!vdev_writeable(vd)) { 7892 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7893 return (SET_ERROR(EROFS)); 7894 } 7895 mutex_enter(&vd->vdev_initialize_lock); 7896 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7897 7898 /* 7899 * When we activate an initialize action we check to see 7900 * if the vdev_initialize_thread is NULL. We do this instead 7901 * of using the vdev_initialize_state since there might be 7902 * a previous initialization process which has completed but 7903 * whose thread has not yet exited.
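 *
 * For orientation (a sketch of the usual flow, not an exhaustive
 * list): "zpool initialize <pool> [<device> ...]" issues
 * POOL_INITIALIZE_START, while the -c, -s and -u options map to
 * POOL_INITIALIZE_CANCEL, POOL_INITIALIZE_SUSPEND and
 * POOL_INITIALIZE_UNINIT respectively.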
7904 */ 7905 if (cmd_type == POOL_INITIALIZE_START && 7906 (vd->vdev_initialize_thread != NULL || 7907 vd->vdev_top->vdev_removing || vd->vdev_top->vdev_rz_expanding)) { 7908 mutex_exit(&vd->vdev_initialize_lock); 7909 return (SET_ERROR(EBUSY)); 7910 } else if (cmd_type == POOL_INITIALIZE_CANCEL && 7911 (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE && 7912 vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) { 7913 mutex_exit(&vd->vdev_initialize_lock); 7914 return (SET_ERROR(ESRCH)); 7915 } else if (cmd_type == POOL_INITIALIZE_SUSPEND && 7916 vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) { 7917 mutex_exit(&vd->vdev_initialize_lock); 7918 return (SET_ERROR(ESRCH)); 7919 } else if (cmd_type == POOL_INITIALIZE_UNINIT && 7920 vd->vdev_initialize_thread != NULL) { 7921 mutex_exit(&vd->vdev_initialize_lock); 7922 return (SET_ERROR(EBUSY)); 7923 } 7924 7925 switch (cmd_type) { 7926 case POOL_INITIALIZE_START: 7927 vdev_initialize(vd); 7928 break; 7929 case POOL_INITIALIZE_CANCEL: 7930 vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, vd_list); 7931 break; 7932 case POOL_INITIALIZE_SUSPEND: 7933 vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list); 7934 break; 7935 case POOL_INITIALIZE_UNINIT: 7936 vdev_uninitialize(vd); 7937 break; 7938 default: 7939 panic("invalid cmd_type %llu", (unsigned long long)cmd_type); 7940 } 7941 mutex_exit(&vd->vdev_initialize_lock); 7942 7943 return (0); 7944 } 7945 7946 int 7947 spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, 7948 nvlist_t *vdev_errlist) 7949 { 7950 int total_errors = 0; 7951 list_t vd_list; 7952 7953 list_create(&vd_list, sizeof (vdev_t), 7954 offsetof(vdev_t, vdev_initialize_node)); 7955 7956 /* 7957 * We hold the namespace lock through the whole function 7958 * to prevent any changes to the pool while we're starting or 7959 * stopping initialization. The config and state locks are held so that 7960 * we can properly assess the vdev state before we commit to 7961 * the initializing operation. 7962 */ 7963 mutex_enter(&spa_namespace_lock); 7964 7965 for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); 7966 pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { 7967 uint64_t vdev_guid = fnvpair_value_uint64(pair); 7968 7969 int error = spa_vdev_initialize_impl(spa, vdev_guid, cmd_type, 7970 &vd_list); 7971 if (error != 0) { 7972 char guid_as_str[MAXNAMELEN]; 7973 7974 (void) snprintf(guid_as_str, sizeof (guid_as_str), 7975 "%llu", (unsigned long long)vdev_guid); 7976 fnvlist_add_int64(vdev_errlist, guid_as_str, error); 7977 total_errors++; 7978 } 7979 } 7980 7981 /* Wait for all initialize threads to stop. */ 7982 vdev_initialize_stop_wait(spa, &vd_list); 7983 7984 /* Sync out the initializing state */ 7985 txg_wait_synced(spa->spa_dsl_pool, 0); 7986 mutex_exit(&spa_namespace_lock); 7987 7988 list_destroy(&vd_list); 7989 7990 return (total_errors); 7991 } 7992 7993 static int 7994 spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, 7995 uint64_t rate, boolean_t partial, boolean_t secure, list_t *vd_list) 7996 { 7997 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 7998 7999 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 8000 8001 /* Look up vdev and ensure it's a leaf. 
*/ 8002 vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); 8003 if (vd == NULL || vd->vdev_detached) { 8004 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8005 return (SET_ERROR(ENODEV)); 8006 } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { 8007 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8008 return (SET_ERROR(EINVAL)); 8009 } else if (!vdev_writeable(vd)) { 8010 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8011 return (SET_ERROR(EROFS)); 8012 } else if (!vd->vdev_has_trim) { 8013 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8014 return (SET_ERROR(EOPNOTSUPP)); 8015 } else if (secure && !vd->vdev_has_securetrim) { 8016 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8017 return (SET_ERROR(EOPNOTSUPP)); 8018 } 8019 mutex_enter(&vd->vdev_trim_lock); 8020 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8021 8022 /* 8023 * When we activate a TRIM action we check to see if the 8024 * vdev_trim_thread is NULL. We do this instead of using the 8025 * vdev_trim_state since there might be a previous TRIM process 8026 * which has completed but the thread has not yet exited. 8027 */ 8028 if (cmd_type == POOL_TRIM_START && 8029 (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing || 8030 vd->vdev_top->vdev_rz_expanding)) { 8031 mutex_exit(&vd->vdev_trim_lock); 8032 return (SET_ERROR(EBUSY)); 8033 } else if (cmd_type == POOL_TRIM_CANCEL && 8034 (vd->vdev_trim_state != VDEV_TRIM_ACTIVE && 8035 vd->vdev_trim_state != VDEV_TRIM_SUSPENDED)) { 8036 mutex_exit(&vd->vdev_trim_lock); 8037 return (SET_ERROR(ESRCH)); 8038 } else if (cmd_type == POOL_TRIM_SUSPEND && 8039 vd->vdev_trim_state != VDEV_TRIM_ACTIVE) { 8040 mutex_exit(&vd->vdev_trim_lock); 8041 return (SET_ERROR(ESRCH)); 8042 } 8043 8044 switch (cmd_type) { 8045 case POOL_TRIM_START: 8046 vdev_trim(vd, rate, partial, secure); 8047 break; 8048 case POOL_TRIM_CANCEL: 8049 vdev_trim_stop(vd, VDEV_TRIM_CANCELED, vd_list); 8050 break; 8051 case POOL_TRIM_SUSPEND: 8052 vdev_trim_stop(vd, VDEV_TRIM_SUSPENDED, vd_list); 8053 break; 8054 default: 8055 panic("invalid cmd_type %llu", (unsigned long long)cmd_type); 8056 } 8057 mutex_exit(&vd->vdev_trim_lock); 8058 8059 return (0); 8060 } 8061 8062 /* 8063 * Initiates a manual TRIM for the requested vdevs. This kicks off individual 8064 * TRIM threads for each child vdev. These threads pass over all of the free 8065 * space in the vdev's metaslabs and issue TRIM commands for that space. 8066 */ 8067 int 8068 spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate, 8069 boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist) 8070 { 8071 int total_errors = 0; 8072 list_t vd_list; 8073 8074 list_create(&vd_list, sizeof (vdev_t), 8075 offsetof(vdev_t, vdev_trim_node)); 8076 8077 /* 8078 * We hold the namespace lock through the whole function 8079 * to prevent any changes to the pool while we're starting or 8080 * stopping TRIM. The config and state locks are held so that 8081 * we can properly assess the vdev state before we commit to 8082 * the TRIM operation.
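 *
 * A minimal sketch of a caller (hypothetical values): a manual
 * "zpool trim tank sda" request arrives here roughly as
 * spa_vdev_trim(spa, nv, POOL_TRIM_START, 0, B_FALSE, B_FALSE,
 * errlist), where nv carries the guid of each target leaf and a
 * rate of 0 is assumed to mean "no rate limit".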
8083 */ 8084 mutex_enter(&spa_namespace_lock); 8085 8086 for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); 8087 pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { 8088 uint64_t vdev_guid = fnvpair_value_uint64(pair); 8089 8090 int error = spa_vdev_trim_impl(spa, vdev_guid, cmd_type, 8091 rate, partial, secure, &vd_list); 8092 if (error != 0) { 8093 char guid_as_str[MAXNAMELEN]; 8094 8095 (void) snprintf(guid_as_str, sizeof (guid_as_str), 8096 "%llu", (unsigned long long)vdev_guid); 8097 fnvlist_add_int64(vdev_errlist, guid_as_str, error); 8098 total_errors++; 8099 } 8100 } 8101 8102 /* Wait for all TRIM threads to stop. */ 8103 vdev_trim_stop_wait(spa, &vd_list); 8104 8105 /* Sync out the TRIM state */ 8106 txg_wait_synced(spa->spa_dsl_pool, 0); 8107 mutex_exit(&spa_namespace_lock); 8108 8109 list_destroy(&vd_list); 8110 8111 return (total_errors); 8112 } 8113 8114 /* 8115 * Split a set of devices from their mirrors, and create a new pool from them. 8116 */ 8117 int 8118 spa_vdev_split_mirror(spa_t *spa, const char *newname, nvlist_t *config, 8119 nvlist_t *props, boolean_t exp) 8120 { 8121 int error = 0; 8122 uint64_t txg, *glist; 8123 spa_t *newspa; 8124 uint_t c, children, lastlog; 8125 nvlist_t **child, *nvl, *tmp; 8126 dmu_tx_t *tx; 8127 const char *altroot = NULL; 8128 vdev_t *rvd, **vml = NULL; /* vdev modify list */ 8129 boolean_t activate_slog; 8130 8131 ASSERT(spa_writeable(spa)); 8132 8133 txg = spa_vdev_enter(spa); 8134 8135 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 8136 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 8137 error = (spa_has_checkpoint(spa)) ? 8138 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 8139 return (spa_vdev_exit(spa, NULL, txg, error)); 8140 } 8141 8142 /* clear the log and flush everything up to now */ 8143 activate_slog = spa_passivate_log(spa); 8144 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 8145 error = spa_reset_logs(spa); 8146 txg = spa_vdev_config_enter(spa); 8147 8148 if (activate_slog) 8149 spa_activate_log(spa); 8150 8151 if (error != 0) 8152 return (spa_vdev_exit(spa, NULL, txg, error)); 8153 8154 /* check new spa name before going any further */ 8155 if (spa_lookup(newname) != NULL) 8156 return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 8157 8158 /* 8159 * scan through all the children to ensure they're all mirrors 8160 */ 8161 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 8162 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 8163 &children) != 0) 8164 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 8165 8166 /* first, check to ensure we've got the right child count */ 8167 rvd = spa->spa_root_vdev; 8168 lastlog = 0; 8169 for (c = 0; c < rvd->vdev_children; c++) { 8170 vdev_t *vd = rvd->vdev_child[c]; 8171 8172 /* don't count the holes & logs as children */ 8173 if (vd->vdev_islog || (vd->vdev_ops != &vdev_indirect_ops && 8174 !vdev_is_concrete(vd))) { 8175 if (lastlog == 0) 8176 lastlog = c; 8177 continue; 8178 } 8179 8180 lastlog = 0; 8181 } 8182 if (children != (lastlog != 0 ? 
lastlog : rvd->vdev_children)) 8183 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 8184 8185 /* next, ensure no spare or cache devices are part of the split */ 8186 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 8187 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 8188 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 8189 8190 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 8191 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 8192 8193 /* then, loop over each vdev and validate it */ 8194 for (c = 0; c < children; c++) { 8195 uint64_t is_hole = 0; 8196 8197 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 8198 &is_hole); 8199 8200 if (is_hole != 0) { 8201 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 8202 spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 8203 continue; 8204 } else { 8205 error = SET_ERROR(EINVAL); 8206 break; 8207 } 8208 } 8209 8210 /* deal with indirect vdevs */ 8211 if (spa->spa_root_vdev->vdev_child[c]->vdev_ops == 8212 &vdev_indirect_ops) 8213 continue; 8214 8215 /* which disk is going to be split? */ 8216 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 8217 &glist[c]) != 0) { 8218 error = SET_ERROR(EINVAL); 8219 break; 8220 } 8221 8222 /* look it up in the spa */ 8223 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 8224 if (vml[c] == NULL) { 8225 error = SET_ERROR(ENODEV); 8226 break; 8227 } 8228 8229 /* make sure there's nothing stopping the split */ 8230 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 8231 vml[c]->vdev_islog || 8232 !vdev_is_concrete(vml[c]) || 8233 vml[c]->vdev_isspare || 8234 vml[c]->vdev_isl2cache || 8235 !vdev_writeable(vml[c]) || 8236 vml[c]->vdev_children != 0 || 8237 vml[c]->vdev_state != VDEV_STATE_HEALTHY || 8238 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 8239 error = SET_ERROR(EINVAL); 8240 break; 8241 } 8242 8243 if (vdev_dtl_required(vml[c]) || 8244 vdev_resilver_needed(vml[c], NULL, NULL)) { 8245 error = SET_ERROR(EBUSY); 8246 break; 8247 } 8248 8249 /* we need certain info from the top level */ 8250 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 8251 vml[c]->vdev_top->vdev_ms_array); 8252 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 8253 vml[c]->vdev_top->vdev_ms_shift); 8254 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 8255 vml[c]->vdev_top->vdev_asize); 8256 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 8257 vml[c]->vdev_top->vdev_ashift); 8258 8259 /* transfer per-vdev ZAPs */ 8260 ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); 8261 VERIFY0(nvlist_add_uint64(child[c], 8262 ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); 8263 8264 ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); 8265 VERIFY0(nvlist_add_uint64(child[c], 8266 ZPOOL_CONFIG_VDEV_TOP_ZAP, 8267 vml[c]->vdev_parent->vdev_top_zap)); 8268 } 8269 8270 if (error != 0) { 8271 kmem_free(vml, children * sizeof (vdev_t *)); 8272 kmem_free(glist, children * sizeof (uint64_t)); 8273 return (spa_vdev_exit(spa, NULL, txg, error)); 8274 } 8275 8276 /* stop writers from using the disks */ 8277 for (c = 0; c < children; c++) { 8278 if (vml[c] != NULL) 8279 vml[c]->vdev_offline = B_TRUE; 8280 } 8281 vdev_reopen(spa->spa_root_vdev); 8282 8283 /* 8284 * Temporarily record the splitting vdevs in the spa config. This 8285 * will disappear once the config is regenerated. 
8286 */ 8287 nvl = fnvlist_alloc(); 8288 fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, glist, children); 8289 kmem_free(glist, children * sizeof (uint64_t)); 8290 8291 mutex_enter(&spa->spa_props_lock); 8292 fnvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, nvl); 8293 mutex_exit(&spa->spa_props_lock); 8294 spa->spa_config_splitting = nvl; 8295 vdev_config_dirty(spa->spa_root_vdev); 8296 8297 /* configure and create the new pool */ 8298 fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname); 8299 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 8300 exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE); 8301 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)); 8302 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg); 8303 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 8304 spa_generate_guid(NULL)); 8305 VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 8306 (void) nvlist_lookup_string(props, 8307 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 8308 8309 /* add the new pool to the namespace */ 8310 newspa = spa_add(newname, config, altroot); 8311 newspa->spa_avz_action = AVZ_ACTION_REBUILD; 8312 newspa->spa_config_txg = spa->spa_config_txg; 8313 spa_set_log_state(newspa, SPA_LOG_CLEAR); 8314 8315 /* release the spa config lock, retaining the namespace lock */ 8316 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 8317 8318 if (zio_injection_enabled) 8319 zio_handle_panic_injection(spa, FTAG, 1); 8320 8321 spa_activate(newspa, spa_mode_global); 8322 spa_async_suspend(newspa); 8323 8324 /* 8325 * Temporarily stop the initializing and TRIM activity. We set the 8326 * state to ACTIVE so that we know to resume initializing or TRIM 8327 * once the split has completed. 8328 */ 8329 list_t vd_initialize_list; 8330 list_create(&vd_initialize_list, sizeof (vdev_t), 8331 offsetof(vdev_t, vdev_initialize_node)); 8332 8333 list_t vd_trim_list; 8334 list_create(&vd_trim_list, sizeof (vdev_t), 8335 offsetof(vdev_t, vdev_trim_node)); 8336 8337 for (c = 0; c < children; c++) { 8338 if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { 8339 mutex_enter(&vml[c]->vdev_initialize_lock); 8340 vdev_initialize_stop(vml[c], 8341 VDEV_INITIALIZE_ACTIVE, &vd_initialize_list); 8342 mutex_exit(&vml[c]->vdev_initialize_lock); 8343 8344 mutex_enter(&vml[c]->vdev_trim_lock); 8345 vdev_trim_stop(vml[c], VDEV_TRIM_ACTIVE, &vd_trim_list); 8346 mutex_exit(&vml[c]->vdev_trim_lock); 8347 } 8348 } 8349 8350 vdev_initialize_stop_wait(spa, &vd_initialize_list); 8351 vdev_trim_stop_wait(spa, &vd_trim_list); 8352 8353 list_destroy(&vd_initialize_list); 8354 list_destroy(&vd_trim_list); 8355 8356 newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT; 8357 newspa->spa_is_splitting = B_TRUE; 8358 8359 /* create the new pool from the disks of the original pool */ 8360 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE); 8361 if (error) 8362 goto out; 8363 8364 /* if that worked, generate a real config for the new pool */ 8365 if (newspa->spa_root_vdev != NULL) { 8366 newspa->spa_config_splitting = fnvlist_alloc(); 8367 fnvlist_add_uint64(newspa->spa_config_splitting, 8368 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)); 8369 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 8370 B_TRUE)); 8371 } 8372 8373 /* set the props */ 8374 if (props != NULL) { 8375 spa_configfile_set(newspa, props, B_FALSE); 8376 error = spa_prop_set(newspa, props); 8377 if (error) 8378 goto out; 8379 } 8380 8381 /* flush everything */ 8382 txg = 
spa_vdev_config_enter(newspa); 8383 vdev_config_dirty(newspa->spa_root_vdev); 8384 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 8385 8386 if (zio_injection_enabled) 8387 zio_handle_panic_injection(spa, FTAG, 2); 8388 8389 spa_async_resume(newspa); 8390 8391 /* finally, update the original pool's config */ 8392 txg = spa_vdev_config_enter(spa); 8393 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 8394 error = dmu_tx_assign(tx, TXG_WAIT); 8395 if (error != 0) 8396 dmu_tx_abort(tx); 8397 for (c = 0; c < children; c++) { 8398 if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { 8399 vdev_t *tvd = vml[c]->vdev_top; 8400 8401 /* 8402 * Need to be sure the detachable VDEV is not 8403 * on any *other* txg's DTL list to prevent it 8404 * from being accessed after it's freed. 8405 */ 8406 for (int t = 0; t < TXG_SIZE; t++) { 8407 (void) txg_list_remove_this( 8408 &tvd->vdev_dtl_list, vml[c], t); 8409 } 8410 8411 vdev_split(vml[c]); 8412 if (error == 0) 8413 spa_history_log_internal(spa, "detach", tx, 8414 "vdev=%s", vml[c]->vdev_path); 8415 8416 vdev_free(vml[c]); 8417 } 8418 } 8419 spa->spa_avz_action = AVZ_ACTION_REBUILD; 8420 vdev_config_dirty(spa->spa_root_vdev); 8421 spa->spa_config_splitting = NULL; 8422 nvlist_free(nvl); 8423 if (error == 0) 8424 dmu_tx_commit(tx); 8425 (void) spa_vdev_exit(spa, NULL, txg, 0); 8426 8427 if (zio_injection_enabled) 8428 zio_handle_panic_injection(spa, FTAG, 3); 8429 8430 /* split is complete; log a history record */ 8431 spa_history_log_internal(newspa, "split", NULL, 8432 "from pool %s", spa_name(spa)); 8433 8434 newspa->spa_is_splitting = B_FALSE; 8435 kmem_free(vml, children * sizeof (vdev_t *)); 8436 8437 /* if we're not going to mount the filesystems in userland, export */ 8438 if (exp) 8439 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 8440 B_FALSE, B_FALSE); 8441 8442 return (error); 8443 8444 out: 8445 spa_unload(newspa); 8446 spa_deactivate(newspa); 8447 spa_remove(newspa); 8448 8449 txg = spa_vdev_config_enter(spa); 8450 8451 /* re-online all offlined disks */ 8452 for (c = 0; c < children; c++) { 8453 if (vml[c] != NULL) 8454 vml[c]->vdev_offline = B_FALSE; 8455 } 8456 8457 /* restart initializing or trimming disks as necessary */ 8458 spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); 8459 spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); 8460 spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); 8461 8462 vdev_reopen(spa->spa_root_vdev); 8463 8464 nvlist_free(spa->spa_config_splitting); 8465 spa->spa_config_splitting = NULL; 8466 (void) spa_vdev_exit(spa, NULL, txg, error); 8467 8468 kmem_free(vml, children * sizeof (vdev_t *)); 8469 return (error); 8470 } 8471 8472 /* 8473 * Find any device that's done replacing, or a vdev marked 'unspare' that's 8474 * currently spared, so we can detach it. 8475 */ 8476 static vdev_t * 8477 spa_vdev_resilver_done_hunt(vdev_t *vd) 8478 { 8479 vdev_t *newvd, *oldvd; 8480 8481 for (int c = 0; c < vd->vdev_children; c++) { 8482 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 8483 if (oldvd != NULL) 8484 return (oldvd); 8485 } 8486 8487 /* 8488 * Check for a completed replacement. We always consider the first 8489 * vdev in the list to be the oldest vdev, and the last one to be 8490 * the newest (see spa_vdev_attach() for how that works). In 8491 * the case where the newest vdev is faulted, we will not automatically 8492 * remove it after a resilver completes. This is OK as it will require 8493 * user intervention to determine which disk the admin wishes to keep. 
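 *
 * Concretely (a toy example): for a replacing(B, C) vdev in which C
 * has finished resilvering, both of C's DTLs are empty and B is no
 * longer required, so B (the oldest child) is returned and the
 * caller detaches it, collapsing the group back down to just C.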
8494 */ 8495 if (vd->vdev_ops == &vdev_replacing_ops) { 8496 ASSERT(vd->vdev_children > 1); 8497 8498 newvd = vd->vdev_child[vd->vdev_children - 1]; 8499 oldvd = vd->vdev_child[0]; 8500 8501 if (vdev_dtl_empty(newvd, DTL_MISSING) && 8502 vdev_dtl_empty(newvd, DTL_OUTAGE) && 8503 !vdev_dtl_required(oldvd)) 8504 return (oldvd); 8505 } 8506 8507 /* 8508 * Check for a completed resilver with the 'unspare' flag set. 8509 * Also potentially update faulted state. 8510 */ 8511 if (vd->vdev_ops == &vdev_spare_ops) { 8512 vdev_t *first = vd->vdev_child[0]; 8513 vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 8514 8515 if (last->vdev_unspare) { 8516 oldvd = first; 8517 newvd = last; 8518 } else if (first->vdev_unspare) { 8519 oldvd = last; 8520 newvd = first; 8521 } else { 8522 oldvd = NULL; 8523 } 8524 8525 if (oldvd != NULL && 8526 vdev_dtl_empty(newvd, DTL_MISSING) && 8527 vdev_dtl_empty(newvd, DTL_OUTAGE) && 8528 !vdev_dtl_required(oldvd)) 8529 return (oldvd); 8530 8531 vdev_propagate_state(vd); 8532 8533 /* 8534 * If there are more than two spares attached to a disk, 8535 * and those spares are not required, then we want to 8536 * attempt to free them up now so that they can be used 8537 * by other pools. Once we're back down to a single 8538 * disk+spare, we stop removing them. 8539 */ 8540 if (vd->vdev_children > 2) { 8541 newvd = vd->vdev_child[1]; 8542 8543 if (newvd->vdev_isspare && last->vdev_isspare && 8544 vdev_dtl_empty(last, DTL_MISSING) && 8545 vdev_dtl_empty(last, DTL_OUTAGE) && 8546 !vdev_dtl_required(newvd)) 8547 return (newvd); 8548 } 8549 } 8550 8551 return (NULL); 8552 } 8553 8554 static void 8555 spa_vdev_resilver_done(spa_t *spa) 8556 { 8557 vdev_t *vd, *pvd, *ppvd; 8558 uint64_t guid, sguid, pguid, ppguid; 8559 8560 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 8561 8562 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 8563 pvd = vd->vdev_parent; 8564 ppvd = pvd->vdev_parent; 8565 guid = vd->vdev_guid; 8566 pguid = pvd->vdev_guid; 8567 ppguid = ppvd->vdev_guid; 8568 sguid = 0; 8569 /* 8570 * If we have just finished replacing a hot spared device, then 8571 * we need to detach the parent's first child (the original hot 8572 * spare) as well. 8573 */ 8574 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 8575 ppvd->vdev_children == 2) { 8576 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 8577 sguid = ppvd->vdev_child[1]->vdev_guid; 8578 } 8579 ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); 8580 8581 spa_config_exit(spa, SCL_ALL, FTAG); 8582 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 8583 return; 8584 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 8585 return; 8586 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 8587 } 8588 8589 spa_config_exit(spa, SCL_ALL, FTAG); 8590 8591 /* 8592 * If a detach was not performed above replace waiters will not have 8593 * been notified. In which case we must do so now. 8594 */ 8595 spa_notify_waiters(spa); 8596 } 8597 8598 /* 8599 * Update the stored path or FRU for this vdev. 
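 *
 * The usual entry points are the two thin wrappers below, e.g.
 * (illustrative path only) spa_vdev_setpath(spa, guid,
 * "/dev/disk/by-id/ata-EXAMPLE") to rewrite vdev_path, or
 * spa_vdev_setfru() to update vdev_fru in the same way.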
8600 */ 8601 static int 8602 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 8603 boolean_t ispath) 8604 { 8605 vdev_t *vd; 8606 boolean_t sync = B_FALSE; 8607 8608 ASSERT(spa_writeable(spa)); 8609 8610 spa_vdev_state_enter(spa, SCL_ALL); 8611 8612 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 8613 return (spa_vdev_state_exit(spa, NULL, ENOENT)); 8614 8615 if (!vd->vdev_ops->vdev_op_leaf) 8616 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 8617 8618 if (ispath) { 8619 if (strcmp(value, vd->vdev_path) != 0) { 8620 spa_strfree(vd->vdev_path); 8621 vd->vdev_path = spa_strdup(value); 8622 sync = B_TRUE; 8623 } 8624 } else { 8625 if (vd->vdev_fru == NULL) { 8626 vd->vdev_fru = spa_strdup(value); 8627 sync = B_TRUE; 8628 } else if (strcmp(value, vd->vdev_fru) != 0) { 8629 spa_strfree(vd->vdev_fru); 8630 vd->vdev_fru = spa_strdup(value); 8631 sync = B_TRUE; 8632 } 8633 } 8634 8635 return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); 8636 } 8637 8638 int 8639 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 8640 { 8641 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 8642 } 8643 8644 int 8645 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 8646 { 8647 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 8648 } 8649 8650 /* 8651 * ========================================================================== 8652 * SPA Scanning 8653 * ========================================================================== 8654 */ 8655 int 8656 spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd) 8657 { 8658 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 8659 8660 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 8661 return (SET_ERROR(EBUSY)); 8662 8663 return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd)); 8664 } 8665 8666 int 8667 spa_scan_stop(spa_t *spa) 8668 { 8669 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 8670 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 8671 return (SET_ERROR(EBUSY)); 8672 8673 return (dsl_scan_cancel(spa->spa_dsl_pool)); 8674 } 8675 8676 int 8677 spa_scan(spa_t *spa, pool_scan_func_t func) 8678 { 8679 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 8680 8681 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 8682 return (SET_ERROR(ENOTSUP)); 8683 8684 if (func == POOL_SCAN_RESILVER && 8685 !spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) 8686 return (SET_ERROR(ENOTSUP)); 8687 8688 /* 8689 * If a resilver was requested, but there is no DTL on a 8690 * writeable leaf device, we have nothing to do. 
8691 */ 8692 if (func == POOL_SCAN_RESILVER && 8693 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 8694 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 8695 return (0); 8696 } 8697 8698 if (func == POOL_SCAN_ERRORSCRUB && 8699 !spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) 8700 return (SET_ERROR(ENOTSUP)); 8701 8702 return (dsl_scan(spa->spa_dsl_pool, func)); 8703 } 8704 8705 /* 8706 * ========================================================================== 8707 * SPA async task processing 8708 * ========================================================================== 8709 */ 8710 8711 static void 8712 spa_async_remove(spa_t *spa, vdev_t *vd) 8713 { 8714 if (vd->vdev_remove_wanted) { 8715 vd->vdev_remove_wanted = B_FALSE; 8716 vd->vdev_delayed_close = B_FALSE; 8717 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 8718 8719 /* 8720 * We want to clear the stats, but we don't want to do a full 8721 * vdev_clear() as that will cause us to throw away 8722 * degraded/faulted state as well as attempt to reopen the 8723 * device, all of which is a waste. 8724 */ 8725 vd->vdev_stat.vs_read_errors = 0; 8726 vd->vdev_stat.vs_write_errors = 0; 8727 vd->vdev_stat.vs_checksum_errors = 0; 8728 8729 vdev_state_dirty(vd->vdev_top); 8730 8731 /* Tell userspace that the vdev is gone. */ 8732 zfs_post_remove(spa, vd); 8733 } 8734 8735 for (int c = 0; c < vd->vdev_children; c++) 8736 spa_async_remove(spa, vd->vdev_child[c]); 8737 } 8738 8739 static void 8740 spa_async_probe(spa_t *spa, vdev_t *vd) 8741 { 8742 if (vd->vdev_probe_wanted) { 8743 vd->vdev_probe_wanted = B_FALSE; 8744 vdev_reopen(vd); /* vdev_open() does the actual probe */ 8745 } 8746 8747 for (int c = 0; c < vd->vdev_children; c++) 8748 spa_async_probe(spa, vd->vdev_child[c]); 8749 } 8750 8751 static void 8752 spa_async_autoexpand(spa_t *spa, vdev_t *vd) 8753 { 8754 if (!spa->spa_autoexpand) 8755 return; 8756 8757 for (int c = 0; c < vd->vdev_children; c++) { 8758 vdev_t *cvd = vd->vdev_child[c]; 8759 spa_async_autoexpand(spa, cvd); 8760 } 8761 8762 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 8763 return; 8764 8765 spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_AUTOEXPAND); 8766 } 8767 8768 static __attribute__((noreturn)) void 8769 spa_async_thread(void *arg) 8770 { 8771 spa_t *spa = (spa_t *)arg; 8772 dsl_pool_t *dp = spa->spa_dsl_pool; 8773 int tasks; 8774 8775 ASSERT(spa->spa_sync_on); 8776 8777 mutex_enter(&spa->spa_async_lock); 8778 tasks = spa->spa_async_tasks; 8779 spa->spa_async_tasks = 0; 8780 mutex_exit(&spa->spa_async_lock); 8781 8782 /* 8783 * See if the config needs to be updated. 8784 */ 8785 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 8786 uint64_t old_space, new_space; 8787 8788 mutex_enter(&spa_namespace_lock); 8789 old_space = metaslab_class_get_space(spa_normal_class(spa)); 8790 old_space += metaslab_class_get_space(spa_special_class(spa)); 8791 old_space += metaslab_class_get_space(spa_dedup_class(spa)); 8792 old_space += metaslab_class_get_space( 8793 spa_embedded_log_class(spa)); 8794 8795 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 8796 8797 new_space = metaslab_class_get_space(spa_normal_class(spa)); 8798 new_space += metaslab_class_get_space(spa_special_class(spa)); 8799 new_space += metaslab_class_get_space(spa_dedup_class(spa)); 8800 new_space += metaslab_class_get_space( 8801 spa_embedded_log_class(spa)); 8802 mutex_exit(&spa_namespace_lock); 8803 8804 /* 8805 * If the pool grew as a result of the config update, 8806 * then log an internal history event. 
8807 */ 8808 if (new_space != old_space) { 8809 spa_history_log_internal(spa, "vdev online", NULL, 8810 "pool '%s' size: %llu(+%llu)", 8811 spa_name(spa), (u_longlong_t)new_space, 8812 (u_longlong_t)(new_space - old_space)); 8813 } 8814 } 8815 8816 /* 8817 * See if any devices need to be marked REMOVED. 8818 */ 8819 if (tasks & SPA_ASYNC_REMOVE) { 8820 spa_vdev_state_enter(spa, SCL_NONE); 8821 spa_async_remove(spa, spa->spa_root_vdev); 8822 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 8823 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 8824 for (int i = 0; i < spa->spa_spares.sav_count; i++) 8825 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 8826 (void) spa_vdev_state_exit(spa, NULL, 0); 8827 } 8828 8829 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 8830 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8831 spa_async_autoexpand(spa, spa->spa_root_vdev); 8832 spa_config_exit(spa, SCL_CONFIG, FTAG); 8833 } 8834 8835 /* 8836 * See if any devices need to be probed. 8837 */ 8838 if (tasks & SPA_ASYNC_PROBE) { 8839 spa_vdev_state_enter(spa, SCL_NONE); 8840 spa_async_probe(spa, spa->spa_root_vdev); 8841 (void) spa_vdev_state_exit(spa, NULL, 0); 8842 } 8843 8844 /* 8845 * If any devices are done replacing, detach them. 8846 */ 8847 if (tasks & SPA_ASYNC_RESILVER_DONE || 8848 tasks & SPA_ASYNC_REBUILD_DONE || 8849 tasks & SPA_ASYNC_DETACH_SPARE) { 8850 spa_vdev_resilver_done(spa); 8851 } 8852 8853 /* 8854 * Kick off a resilver. 8855 */ 8856 if (tasks & SPA_ASYNC_RESILVER && 8857 !vdev_rebuild_active(spa->spa_root_vdev) && 8858 (!dsl_scan_resilvering(dp) || 8859 !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))) 8860 dsl_scan_restart_resilver(dp, 0); 8861 8862 if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { 8863 mutex_enter(&spa_namespace_lock); 8864 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8865 vdev_initialize_restart(spa->spa_root_vdev); 8866 spa_config_exit(spa, SCL_CONFIG, FTAG); 8867 mutex_exit(&spa_namespace_lock); 8868 } 8869 8870 if (tasks & SPA_ASYNC_TRIM_RESTART) { 8871 mutex_enter(&spa_namespace_lock); 8872 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8873 vdev_trim_restart(spa->spa_root_vdev); 8874 spa_config_exit(spa, SCL_CONFIG, FTAG); 8875 mutex_exit(&spa_namespace_lock); 8876 } 8877 8878 if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) { 8879 mutex_enter(&spa_namespace_lock); 8880 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8881 vdev_autotrim_restart(spa); 8882 spa_config_exit(spa, SCL_CONFIG, FTAG); 8883 mutex_exit(&spa_namespace_lock); 8884 } 8885 8886 /* 8887 * Kick off L2 cache whole device TRIM. 8888 */ 8889 if (tasks & SPA_ASYNC_L2CACHE_TRIM) { 8890 mutex_enter(&spa_namespace_lock); 8891 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8892 vdev_trim_l2arc(spa); 8893 spa_config_exit(spa, SCL_CONFIG, FTAG); 8894 mutex_exit(&spa_namespace_lock); 8895 } 8896 8897 /* 8898 * Kick off L2 cache rebuilding. 8899 */ 8900 if (tasks & SPA_ASYNC_L2CACHE_REBUILD) { 8901 mutex_enter(&spa_namespace_lock); 8902 spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER); 8903 l2arc_spa_rebuild_start(spa); 8904 spa_config_exit(spa, SCL_L2ARC, FTAG); 8905 mutex_exit(&spa_namespace_lock); 8906 } 8907 8908 /* 8909 * Let the world know that we're done. 
8910 */ 8911 mutex_enter(&spa->spa_async_lock); 8912 spa->spa_async_thread = NULL; 8913 cv_broadcast(&spa->spa_async_cv); 8914 mutex_exit(&spa->spa_async_lock); 8915 thread_exit(); 8916 } 8917 8918 void 8919 spa_async_suspend(spa_t *spa) 8920 { 8921 mutex_enter(&spa->spa_async_lock); 8922 spa->spa_async_suspended++; 8923 while (spa->spa_async_thread != NULL) 8924 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 8925 mutex_exit(&spa->spa_async_lock); 8926 8927 spa_vdev_remove_suspend(spa); 8928 8929 zthr_t *condense_thread = spa->spa_condense_zthr; 8930 if (condense_thread != NULL) 8931 zthr_cancel(condense_thread); 8932 8933 zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr; 8934 if (raidz_expand_thread != NULL) 8935 zthr_cancel(raidz_expand_thread); 8936 8937 zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; 8938 if (discard_thread != NULL) 8939 zthr_cancel(discard_thread); 8940 8941 zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; 8942 if (ll_delete_thread != NULL) 8943 zthr_cancel(ll_delete_thread); 8944 8945 zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; 8946 if (ll_condense_thread != NULL) 8947 zthr_cancel(ll_condense_thread); 8948 } 8949 8950 void 8951 spa_async_resume(spa_t *spa) 8952 { 8953 mutex_enter(&spa->spa_async_lock); 8954 ASSERT(spa->spa_async_suspended != 0); 8955 spa->spa_async_suspended--; 8956 mutex_exit(&spa->spa_async_lock); 8957 spa_restart_removal(spa); 8958 8959 zthr_t *condense_thread = spa->spa_condense_zthr; 8960 if (condense_thread != NULL) 8961 zthr_resume(condense_thread); 8962 8963 zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr; 8964 if (raidz_expand_thread != NULL) 8965 zthr_resume(raidz_expand_thread); 8966 8967 zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; 8968 if (discard_thread != NULL) 8969 zthr_resume(discard_thread); 8970 8971 zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; 8972 if (ll_delete_thread != NULL) 8973 zthr_resume(ll_delete_thread); 8974 8975 zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; 8976 if (ll_condense_thread != NULL) 8977 zthr_resume(ll_condense_thread); 8978 } 8979 8980 static boolean_t 8981 spa_async_tasks_pending(spa_t *spa) 8982 { 8983 uint_t non_config_tasks; 8984 uint_t config_task; 8985 boolean_t config_task_suspended; 8986 8987 non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE; 8988 config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; 8989 if (spa->spa_ccw_fail_time == 0) { 8990 config_task_suspended = B_FALSE; 8991 } else { 8992 config_task_suspended = 8993 (gethrtime() - spa->spa_ccw_fail_time) < 8994 ((hrtime_t)zfs_ccw_retry_interval * NANOSEC); 8995 } 8996 8997 return (non_config_tasks || (config_task && !config_task_suspended)); 8998 } 8999 9000 static void 9001 spa_async_dispatch(spa_t *spa) 9002 { 9003 mutex_enter(&spa->spa_async_lock); 9004 if (spa_async_tasks_pending(spa) && 9005 !spa->spa_async_suspended && 9006 spa->spa_async_thread == NULL) 9007 spa->spa_async_thread = thread_create(NULL, 0, 9008 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 9009 mutex_exit(&spa->spa_async_lock); 9010 } 9011 9012 void 9013 spa_async_request(spa_t *spa, int task) 9014 { 9015 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 9016 mutex_enter(&spa->spa_async_lock); 9017 spa->spa_async_tasks |= task; 9018 mutex_exit(&spa->spa_async_lock); 9019 } 9020 9021 int 9022 spa_async_tasks(spa_t *spa) 9023 { 9024 return (spa->spa_async_tasks); 9025 } 9026 9027 /* 9028 * 
========================================================================== 9029 * SPA syncing routines 9030 * ========================================================================== 9031 */ 9032 9033 9034 static int 9035 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 9036 dmu_tx_t *tx) 9037 { 9038 bpobj_t *bpo = arg; 9039 bpobj_enqueue(bpo, bp, bp_freed, tx); 9040 return (0); 9041 } 9042 9043 int 9044 bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 9045 { 9046 return (bpobj_enqueue_cb(arg, bp, B_FALSE, tx)); 9047 } 9048 9049 int 9050 bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 9051 { 9052 return (bpobj_enqueue_cb(arg, bp, B_TRUE, tx)); 9053 } 9054 9055 static int 9056 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 9057 { 9058 zio_t *pio = arg; 9059 9060 zio_nowait(zio_free_sync(pio, pio->io_spa, dmu_tx_get_txg(tx), bp, 9061 pio->io_flags)); 9062 return (0); 9063 } 9064 9065 static int 9066 bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 9067 dmu_tx_t *tx) 9068 { 9069 ASSERT(!bp_freed); 9070 return (spa_free_sync_cb(arg, bp, tx)); 9071 } 9072 9073 /* 9074 * Note: this simple function is not inlined to make it easier to dtrace the 9075 * amount of time spent syncing frees. 9076 */ 9077 static void 9078 spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) 9079 { 9080 zio_t *zio = zio_root(spa, NULL, NULL, 0); 9081 bplist_iterate(bpl, spa_free_sync_cb, zio, tx); 9082 VERIFY(zio_wait(zio) == 0); 9083 } 9084 9085 /* 9086 * Note: this simple function is not inlined to make it easier to dtrace the 9087 * amount of time spent syncing deferred frees. 9088 */ 9089 static void 9090 spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) 9091 { 9092 if (spa_sync_pass(spa) != 1) 9093 return; 9094 9095 /* 9096 * Note: 9097 * If the log space map feature is active, we stop deferring 9098 * frees to the next TXG and therefore running this function 9099 * would be considered a no-op as spa_deferred_bpobj should 9100 * not have any entries. 9101 * 9102 * That said we run this function anyway (instead of returning 9103 * immediately) for the edge-case scenario where we just 9104 * activated the log space map feature in this TXG but we have 9105 * deferred frees from the previous TXG. 9106 */ 9107 zio_t *zio = zio_root(spa, NULL, NULL, 0); 9108 VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, 9109 bpobj_spa_free_sync_cb, zio, tx), ==, 0); 9110 VERIFY0(zio_wait(zio)); 9111 } 9112 9113 static void 9114 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 9115 { 9116 char *packed = NULL; 9117 size_t bufsize; 9118 size_t nvsize = 0; 9119 dmu_buf_t *db; 9120 9121 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 9122 9123 /* 9124 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 9125 * information. This avoids the dmu_buf_will_dirty() path and 9126 * saves us a pre-read to get data we don't actually care about. 
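 *
 * For example, with an illustrative 4 KB block size,
 * P2ROUNDUP(3000, 4096) == 4096: the packed nvlist is zero-padded up
 * to the next whole block so the dmu_write() below always covers
 * complete blocks.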
9127 */ 9128 bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); 9129 packed = vmem_alloc(bufsize, KM_SLEEP); 9130 9131 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 9132 KM_SLEEP) == 0); 9133 memset(packed + nvsize, 0, bufsize - nvsize); 9134 9135 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 9136 9137 vmem_free(packed, bufsize); 9138 9139 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 9140 dmu_buf_will_dirty(db, tx); 9141 *(uint64_t *)db->db_data = nvsize; 9142 dmu_buf_rele(db, FTAG); 9143 } 9144 9145 static void 9146 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 9147 const char *config, const char *entry) 9148 { 9149 nvlist_t *nvroot; 9150 nvlist_t **list; 9151 int i; 9152 9153 if (!sav->sav_sync) 9154 return; 9155 9156 /* 9157 * Update the MOS nvlist describing the list of available devices. 9158 * spa_validate_aux() will have already made sure this nvlist is 9159 * valid and the vdevs are labeled appropriately. 9160 */ 9161 if (sav->sav_object == 0) { 9162 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 9163 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 9164 sizeof (uint64_t), tx); 9165 VERIFY(zap_update(spa->spa_meta_objset, 9166 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 9167 &sav->sav_object, tx) == 0); 9168 } 9169 9170 nvroot = fnvlist_alloc(); 9171 if (sav->sav_count == 0) { 9172 fnvlist_add_nvlist_array(nvroot, config, 9173 (const nvlist_t * const *)NULL, 0); 9174 } else { 9175 list = kmem_alloc(sav->sav_count*sizeof (void *), KM_SLEEP); 9176 for (i = 0; i < sav->sav_count; i++) 9177 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 9178 B_FALSE, VDEV_CONFIG_L2CACHE); 9179 fnvlist_add_nvlist_array(nvroot, config, 9180 (const nvlist_t * const *)list, sav->sav_count); 9181 for (i = 0; i < sav->sav_count; i++) 9182 nvlist_free(list[i]); 9183 kmem_free(list, sav->sav_count * sizeof (void *)); 9184 } 9185 9186 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 9187 nvlist_free(nvroot); 9188 9189 sav->sav_sync = B_FALSE; 9190 } 9191 9192 /* 9193 * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t. 9194 * The all-vdev ZAP must be empty. 9195 */ 9196 static void 9197 spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) 9198 { 9199 spa_t *spa = vd->vdev_spa; 9200 9201 if (vd->vdev_root_zap != 0 && 9202 spa_feature_is_active(spa, SPA_FEATURE_AVZ_V2)) { 9203 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 9204 vd->vdev_root_zap, tx)); 9205 } 9206 if (vd->vdev_top_zap != 0) { 9207 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 9208 vd->vdev_top_zap, tx)); 9209 } 9210 if (vd->vdev_leaf_zap != 0) { 9211 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 9212 vd->vdev_leaf_zap, tx)); 9213 } 9214 for (uint64_t i = 0; i < vd->vdev_children; i++) { 9215 spa_avz_build(vd->vdev_child[i], avz, tx); 9216 } 9217 } 9218 9219 static void 9220 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 9221 { 9222 nvlist_t *config; 9223 9224 /* 9225 * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS, 9226 * its config may not be dirty but we still need to build per-vdev ZAPs. 9227 * Similarly, if the pool is being assembled (e.g. after a split), we 9228 * need to rebuild the AVZ although the config may not be dirty. 
9229 */ 9230 if (list_is_empty(&spa->spa_config_dirty_list) && 9231 spa->spa_avz_action == AVZ_ACTION_NONE) 9232 return; 9233 9234 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 9235 9236 ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE || 9237 spa->spa_avz_action == AVZ_ACTION_INITIALIZE || 9238 spa->spa_all_vdev_zaps != 0); 9239 9240 if (spa->spa_avz_action == AVZ_ACTION_REBUILD) { 9241 /* Make and build the new AVZ */ 9242 uint64_t new_avz = zap_create(spa->spa_meta_objset, 9243 DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); 9244 spa_avz_build(spa->spa_root_vdev, new_avz, tx); 9245 9246 /* Diff old AVZ with new one */ 9247 zap_cursor_t zc; 9248 zap_attribute_t za; 9249 9250 for (zap_cursor_init(&zc, spa->spa_meta_objset, 9251 spa->spa_all_vdev_zaps); 9252 zap_cursor_retrieve(&zc, &za) == 0; 9253 zap_cursor_advance(&zc)) { 9254 uint64_t vdzap = za.za_first_integer; 9255 if (zap_lookup_int(spa->spa_meta_objset, new_avz, 9256 vdzap) == ENOENT) { 9257 /* 9258 * ZAP is listed in old AVZ but not in new one; 9259 * destroy it 9260 */ 9261 VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap, 9262 tx)); 9263 } 9264 } 9265 9266 zap_cursor_fini(&zc); 9267 9268 /* Destroy the old AVZ */ 9269 VERIFY0(zap_destroy(spa->spa_meta_objset, 9270 spa->spa_all_vdev_zaps, tx)); 9271 9272 /* Replace the old AVZ in the dir obj with the new one */ 9273 VERIFY0(zap_update(spa->spa_meta_objset, 9274 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, 9275 sizeof (new_avz), 1, &new_avz, tx)); 9276 9277 spa->spa_all_vdev_zaps = new_avz; 9278 } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) { 9279 zap_cursor_t zc; 9280 zap_attribute_t za; 9281 9282 /* Walk through the AVZ and destroy all listed ZAPs */ 9283 for (zap_cursor_init(&zc, spa->spa_meta_objset, 9284 spa->spa_all_vdev_zaps); 9285 zap_cursor_retrieve(&zc, &za) == 0; 9286 zap_cursor_advance(&zc)) { 9287 uint64_t zap = za.za_first_integer; 9288 VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx)); 9289 } 9290 9291 zap_cursor_fini(&zc); 9292 9293 /* Destroy and unlink the AVZ itself */ 9294 VERIFY0(zap_destroy(spa->spa_meta_objset, 9295 spa->spa_all_vdev_zaps, tx)); 9296 VERIFY0(zap_remove(spa->spa_meta_objset, 9297 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx)); 9298 spa->spa_all_vdev_zaps = 0; 9299 } 9300 9301 if (spa->spa_all_vdev_zaps == 0) { 9302 spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset, 9303 DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, 9304 DMU_POOL_VDEV_ZAP_MAP, tx); 9305 } 9306 spa->spa_avz_action = AVZ_ACTION_NONE; 9307 9308 /* Create ZAPs for vdevs that don't have them. */ 9309 vdev_construct_zaps(spa->spa_root_vdev, tx); 9310 9311 config = spa_config_generate(spa, spa->spa_root_vdev, 9312 dmu_tx_get_txg(tx), B_FALSE); 9313 9314 /* 9315 * If we're upgrading the spa version then make sure that 9316 * the config object gets updated with the correct version. 9317 */ 9318 if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) 9319 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 9320 spa->spa_uberblock.ub_version); 9321 9322 spa_config_exit(spa, SCL_STATE, FTAG); 9323 9324 nvlist_free(spa->spa_config_syncing); 9325 spa->spa_config_syncing = config; 9326 9327 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 9328 } 9329 9330 static void 9331 spa_sync_version(void *arg, dmu_tx_t *tx) 9332 { 9333 uint64_t *versionp = arg; 9334 uint64_t version = *versionp; 9335 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 9336 9337 /* 9338 * Setting the version is special cased when first creating the pool. 
9339 */ 9340 ASSERT(tx->tx_txg != TXG_INITIAL); 9341 9342 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 9343 ASSERT(version >= spa_version(spa)); 9344 9345 spa->spa_uberblock.ub_version = version; 9346 vdev_config_dirty(spa->spa_root_vdev); 9347 spa_history_log_internal(spa, "set", tx, "version=%lld", 9348 (longlong_t)version); 9349 } 9350 9351 /* 9352 * Set zpool properties. 9353 */ 9354 static void 9355 spa_sync_props(void *arg, dmu_tx_t *tx) 9356 { 9357 nvlist_t *nvp = arg; 9358 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 9359 objset_t *mos = spa->spa_meta_objset; 9360 nvpair_t *elem = NULL; 9361 9362 mutex_enter(&spa->spa_props_lock); 9363 9364 while ((elem = nvlist_next_nvpair(nvp, elem))) { 9365 uint64_t intval; 9366 const char *strval, *fname; 9367 zpool_prop_t prop; 9368 const char *propname; 9369 const char *elemname = nvpair_name(elem); 9370 zprop_type_t proptype; 9371 spa_feature_t fid; 9372 9373 switch (prop = zpool_name_to_prop(elemname)) { 9374 case ZPOOL_PROP_VERSION: 9375 intval = fnvpair_value_uint64(elem); 9376 /* 9377 * The version is synced separately before other 9378 * properties and should be correct by now. 9379 */ 9380 ASSERT3U(spa_version(spa), >=, intval); 9381 break; 9382 9383 case ZPOOL_PROP_ALTROOT: 9384 /* 9385 * 'altroot' is a non-persistent property. It should 9386 * have been set temporarily at creation or import time. 9387 */ 9388 ASSERT(spa->spa_root != NULL); 9389 break; 9390 9391 case ZPOOL_PROP_READONLY: 9392 case ZPOOL_PROP_CACHEFILE: 9393 /* 9394 * 'readonly' and 'cachefile' are also non-persistent 9395 * properties. 9396 */ 9397 break; 9398 case ZPOOL_PROP_COMMENT: 9399 strval = fnvpair_value_string(elem); 9400 if (spa->spa_comment != NULL) 9401 spa_strfree(spa->spa_comment); 9402 spa->spa_comment = spa_strdup(strval); 9403 /* 9404 * We need to dirty the configuration on all the vdevs 9405 * so that their labels get updated. We also need to 9406 * update the cache file to keep it in sync with the 9407 * MOS version. It's unnecessary to do this for pool 9408 * creation since the vdev's configuration has already 9409 * been dirtied. 9410 */ 9411 if (tx->tx_txg != TXG_INITIAL) { 9412 vdev_config_dirty(spa->spa_root_vdev); 9413 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 9414 } 9415 spa_history_log_internal(spa, "set", tx, 9416 "%s=%s", elemname, strval); 9417 break; 9418 case ZPOOL_PROP_COMPATIBILITY: 9419 strval = fnvpair_value_string(elem); 9420 if (spa->spa_compatibility != NULL) 9421 spa_strfree(spa->spa_compatibility); 9422 spa->spa_compatibility = spa_strdup(strval); 9423 /* 9424 * Dirty the configuration on vdevs as above. 9425 */ 9426 if (tx->tx_txg != TXG_INITIAL) { 9427 vdev_config_dirty(spa->spa_root_vdev); 9428 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 9429 } 9430 9431 spa_history_log_internal(spa, "set", tx, 9432 "%s=%s", nvpair_name(elem), strval); 9433 break; 9434 9435 case ZPOOL_PROP_INVAL: 9436 if (zpool_prop_feature(elemname)) { 9437 fname = strchr(elemname, '@') + 1; 9438 VERIFY0(zfeature_lookup_name(fname, &fid)); 9439 9440 spa_feature_enable(spa, fid, tx); 9441 spa_history_log_internal(spa, "set", tx, 9442 "%s=enabled", elemname); 9443 break; 9444 } else if (!zfs_prop_user(elemname)) { 9445 ASSERT(zpool_prop_feature(elemname)); 9446 break; 9447 } 9448 zfs_fallthrough; 9449 default: 9450 /* 9451 * Set pool property values in the poolprops mos object. 
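 * (The object is a ZAP created on first use and linked from the MOS
 * directory as DMU_POOL_PROPS; string values are stored as byte
 * arrays of strlen() + 1 bytes and numeric values as a single
 * uint64, matching the zap_update() calls below.)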
9452 */ 9453 if (spa->spa_pool_props_object == 0) { 9454 spa->spa_pool_props_object = 9455 zap_create_link(mos, DMU_OT_POOL_PROPS, 9456 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 9457 tx); 9458 } 9459 9460 /* normalize the property name */ 9461 if (prop == ZPOOL_PROP_INVAL) { 9462 propname = elemname; 9463 proptype = PROP_TYPE_STRING; 9464 } else { 9465 propname = zpool_prop_to_name(prop); 9466 proptype = zpool_prop_get_type(prop); 9467 } 9468 9469 if (nvpair_type(elem) == DATA_TYPE_STRING) { 9470 ASSERT(proptype == PROP_TYPE_STRING); 9471 strval = fnvpair_value_string(elem); 9472 VERIFY0(zap_update(mos, 9473 spa->spa_pool_props_object, propname, 9474 1, strlen(strval) + 1, strval, tx)); 9475 spa_history_log_internal(spa, "set", tx, 9476 "%s=%s", elemname, strval); 9477 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 9478 intval = fnvpair_value_uint64(elem); 9479 9480 if (proptype == PROP_TYPE_INDEX) { 9481 const char *unused; 9482 VERIFY0(zpool_prop_index_to_string( 9483 prop, intval, &unused)); 9484 } 9485 VERIFY0(zap_update(mos, 9486 spa->spa_pool_props_object, propname, 9487 8, 1, &intval, tx)); 9488 spa_history_log_internal(spa, "set", tx, 9489 "%s=%lld", elemname, 9490 (longlong_t)intval); 9491 9492 switch (prop) { 9493 case ZPOOL_PROP_DELEGATION: 9494 spa->spa_delegation = intval; 9495 break; 9496 case ZPOOL_PROP_BOOTFS: 9497 spa->spa_bootfs = intval; 9498 break; 9499 case ZPOOL_PROP_FAILUREMODE: 9500 spa->spa_failmode = intval; 9501 break; 9502 case ZPOOL_PROP_AUTOTRIM: 9503 spa->spa_autotrim = intval; 9504 spa_async_request(spa, 9505 SPA_ASYNC_AUTOTRIM_RESTART); 9506 break; 9507 case ZPOOL_PROP_AUTOEXPAND: 9508 spa->spa_autoexpand = intval; 9509 if (tx->tx_txg != TXG_INITIAL) 9510 spa_async_request(spa, 9511 SPA_ASYNC_AUTOEXPAND); 9512 break; 9513 case ZPOOL_PROP_MULTIHOST: 9514 spa->spa_multihost = intval; 9515 break; 9516 default: 9517 break; 9518 } 9519 } else { 9520 ASSERT(0); /* not allowed */ 9521 } 9522 } 9523 9524 } 9525 9526 mutex_exit(&spa->spa_props_lock); 9527 } 9528 9529 /* 9530 * Perform one-time upgrade on-disk changes. spa_version() does not 9531 * reflect the new version this txg, so there must be no changes this 9532 * txg to anything that the upgrade code depends on after it executes. 9533 * Therefore this must be called after dsl_pool_sync() does the sync 9534 * tasks. 
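 *
 * (spa_sync_iterate_to_convergence() honours this by calling
 * spa_sync_upgrades() only after dsl_pool_sync() has run in the same
 * sync pass.)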
9535 */ 9536 static void 9537 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) 9538 { 9539 if (spa_sync_pass(spa) != 1) 9540 return; 9541 9542 dsl_pool_t *dp = spa->spa_dsl_pool; 9543 rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); 9544 9545 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 9546 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 9547 dsl_pool_create_origin(dp, tx); 9548 9549 /* Keeping the origin open increases spa_minref */ 9550 spa->spa_minref += 3; 9551 } 9552 9553 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 9554 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 9555 dsl_pool_upgrade_clones(dp, tx); 9556 } 9557 9558 if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && 9559 spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { 9560 dsl_pool_upgrade_dir_clones(dp, tx); 9561 9562 /* Keeping the freedir open increases spa_minref */ 9563 spa->spa_minref += 3; 9564 } 9565 9566 if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && 9567 spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 9568 spa_feature_create_zap_objects(spa, tx); 9569 } 9570 9571 /* 9572 * LZ4_COMPRESS feature's behaviour was changed to activate_on_enable 9573 * when possibility to use lz4 compression for metadata was added 9574 * Old pools that have this feature enabled must be upgraded to have 9575 * this feature active 9576 */ 9577 if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 9578 boolean_t lz4_en = spa_feature_is_enabled(spa, 9579 SPA_FEATURE_LZ4_COMPRESS); 9580 boolean_t lz4_ac = spa_feature_is_active(spa, 9581 SPA_FEATURE_LZ4_COMPRESS); 9582 9583 if (lz4_en && !lz4_ac) 9584 spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); 9585 } 9586 9587 /* 9588 * If we haven't written the salt, do so now. Note that the 9589 * feature may not be activated yet, but that's fine since 9590 * the presence of this ZAP entry is backwards compatible. 9591 */ 9592 if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 9593 DMU_POOL_CHECKSUM_SALT) == ENOENT) { 9594 VERIFY0(zap_add(spa->spa_meta_objset, 9595 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, 9596 sizeof (spa->spa_cksum_salt.zcs_bytes), 9597 spa->spa_cksum_salt.zcs_bytes, tx)); 9598 } 9599 9600 rrw_exit(&dp->dp_config_rwlock, FTAG); 9601 } 9602 9603 static void 9604 vdev_indirect_state_sync_verify(vdev_t *vd) 9605 { 9606 vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping; 9607 vdev_indirect_births_t *vib __maybe_unused = vd->vdev_indirect_births; 9608 9609 if (vd->vdev_ops == &vdev_indirect_ops) { 9610 ASSERT(vim != NULL); 9611 ASSERT(vib != NULL); 9612 } 9613 9614 uint64_t obsolete_sm_object = 0; 9615 ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); 9616 if (obsolete_sm_object != 0) { 9617 ASSERT(vd->vdev_obsolete_sm != NULL); 9618 ASSERT(vd->vdev_removing || 9619 vd->vdev_ops == &vdev_indirect_ops); 9620 ASSERT(vdev_indirect_mapping_num_entries(vim) > 0); 9621 ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0); 9622 ASSERT3U(obsolete_sm_object, ==, 9623 space_map_object(vd->vdev_obsolete_sm)); 9624 ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=, 9625 space_map_allocated(vd->vdev_obsolete_sm)); 9626 } 9627 ASSERT(vd->vdev_obsolete_segments != NULL); 9628 9629 /* 9630 * Since frees / remaps to an indirect vdev can only 9631 * happen in syncing context, the obsolete segments 9632 * tree must be empty when we start syncing. 
9633 */ 9634 ASSERT0(range_tree_space(vd->vdev_obsolete_segments)); 9635 } 9636 9637 /* 9638 * Set the top-level vdev's max queue depth. Evaluate each top-level's 9639 * async write queue depth in case it changed. The max queue depth will 9640 * not change in the middle of syncing out this txg. 9641 */ 9642 static void 9643 spa_sync_adjust_vdev_max_queue_depth(spa_t *spa) 9644 { 9645 ASSERT(spa_writeable(spa)); 9646 9647 vdev_t *rvd = spa->spa_root_vdev; 9648 uint32_t max_queue_depth = zfs_vdev_async_write_max_active * 9649 zfs_vdev_queue_depth_pct / 100; 9650 metaslab_class_t *normal = spa_normal_class(spa); 9651 metaslab_class_t *special = spa_special_class(spa); 9652 metaslab_class_t *dedup = spa_dedup_class(spa); 9653 9654 uint64_t slots_per_allocator = 0; 9655 for (int c = 0; c < rvd->vdev_children; c++) { 9656 vdev_t *tvd = rvd->vdev_child[c]; 9657 9658 metaslab_group_t *mg = tvd->vdev_mg; 9659 if (mg == NULL || !metaslab_group_initialized(mg)) 9660 continue; 9661 9662 metaslab_class_t *mc = mg->mg_class; 9663 if (mc != normal && mc != special && mc != dedup) 9664 continue; 9665 9666 /* 9667 * It is safe to do a lock-free check here because only async 9668 * allocations look at mg_max_alloc_queue_depth, and async 9669 * allocations all happen from spa_sync(). 9670 */ 9671 for (int i = 0; i < mg->mg_allocators; i++) { 9672 ASSERT0(zfs_refcount_count( 9673 &(mg->mg_allocator[i].mga_alloc_queue_depth))); 9674 } 9675 mg->mg_max_alloc_queue_depth = max_queue_depth; 9676 9677 for (int i = 0; i < mg->mg_allocators; i++) { 9678 mg->mg_allocator[i].mga_cur_max_alloc_queue_depth = 9679 zfs_vdev_def_queue_depth; 9680 } 9681 slots_per_allocator += zfs_vdev_def_queue_depth; 9682 } 9683 9684 for (int i = 0; i < spa->spa_alloc_count; i++) { 9685 ASSERT0(zfs_refcount_count(&normal->mc_allocator[i]. 9686 mca_alloc_slots)); 9687 ASSERT0(zfs_refcount_count(&special->mc_allocator[i]. 9688 mca_alloc_slots)); 9689 ASSERT0(zfs_refcount_count(&dedup->mc_allocator[i]. 
9690 mca_alloc_slots)); 9691 normal->mc_allocator[i].mca_alloc_max_slots = 9692 slots_per_allocator; 9693 special->mc_allocator[i].mca_alloc_max_slots = 9694 slots_per_allocator; 9695 dedup->mc_allocator[i].mca_alloc_max_slots = 9696 slots_per_allocator; 9697 } 9698 normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; 9699 special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; 9700 dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; 9701 } 9702 9703 static void 9704 spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx) 9705 { 9706 ASSERT(spa_writeable(spa)); 9707 9708 vdev_t *rvd = spa->spa_root_vdev; 9709 for (int c = 0; c < rvd->vdev_children; c++) { 9710 vdev_t *vd = rvd->vdev_child[c]; 9711 vdev_indirect_state_sync_verify(vd); 9712 9713 if (vdev_indirect_should_condense(vd)) { 9714 spa_condense_indirect_start_sync(vd, tx); 9715 break; 9716 } 9717 } 9718 } 9719 9720 static void 9721 spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) 9722 { 9723 objset_t *mos = spa->spa_meta_objset; 9724 dsl_pool_t *dp = spa->spa_dsl_pool; 9725 uint64_t txg = tx->tx_txg; 9726 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; 9727 9728 do { 9729 int pass = ++spa->spa_sync_pass; 9730 9731 spa_sync_config_object(spa, tx); 9732 spa_sync_aux_dev(spa, &spa->spa_spares, tx, 9733 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 9734 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 9735 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 9736 spa_errlog_sync(spa, txg); 9737 dsl_pool_sync(dp, txg); 9738 9739 if (pass < zfs_sync_pass_deferred_free || 9740 spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { 9741 /* 9742 * If the log space map feature is active we don't 9743 * care about deferred frees and the deferred bpobj 9744 * as the log space map should effectively have the 9745 * same results (i.e. appending only to one object). 9746 */ 9747 spa_sync_frees(spa, free_bpl, tx); 9748 } else { 9749 /* 9750 * We can not defer frees in pass 1, because 9751 * we sync the deferred frees later in pass 1. 9752 */ 9753 ASSERT3U(pass, >, 1); 9754 bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb, 9755 &spa->spa_deferred_bpobj, tx); 9756 } 9757 9758 brt_sync(spa, txg); 9759 ddt_sync(spa, txg); 9760 dsl_scan_sync(dp, tx); 9761 dsl_errorscrub_sync(dp, tx); 9762 svr_sync(spa, tx); 9763 spa_sync_upgrades(spa, tx); 9764 9765 spa_flush_metaslabs(spa, tx); 9766 9767 vdev_t *vd = NULL; 9768 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 9769 != NULL) 9770 vdev_sync(vd, txg); 9771 9772 if (pass == 1) { 9773 /* 9774 * dsl_pool_sync() -> dp_sync_tasks may have dirtied 9775 * the config. If that happens, this txg should not 9776 * be a no-op. So we must sync the config to the MOS 9777 * before checking for no-op. 9778 * 9779 * Note that when the config is dirty, it will 9780 * be written to the MOS (i.e. the MOS will be 9781 * dirtied) every time we call spa_sync_config_object() 9782 * in this txg. Therefore we can't call this after 9783 * dsl_pool_sync() every pass, because it would 9784 * prevent us from converging, since we'd dirty 9785 * the MOS every pass. 9786 * 9787 * Sync tasks can only be processed in pass 1, so 9788 * there's no need to do this in later passes. 9789 */ 9790 spa_sync_config_object(spa, tx); 9791 } 9792 9793 /* 9794 * Note: We need to check if the MOS is dirty because we could 9795 * have marked the MOS dirty without updating the uberblock 9796 * (e.g. if we have sync tasks but no dirty user data). 
We need 9797 * to check the uberblock's rootbp because it is updated if we 9798 * have synced out dirty data (though in this case the MOS will 9799 * most likely also be dirty due to second order effects, we 9800 * don't want to rely on that here). 9801 */ 9802 if (pass == 1 && 9803 spa->spa_uberblock.ub_rootbp.blk_birth < txg && 9804 !dmu_objset_is_dirty(mos, txg)) { 9805 /* 9806 * Nothing changed on the first pass, therefore this 9807 * TXG is a no-op. Avoid syncing deferred frees, so 9808 * that we can keep this TXG as a no-op. 9809 */ 9810 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 9811 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 9812 ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); 9813 ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg)); 9814 break; 9815 } 9816 9817 spa_sync_deferred_frees(spa, tx); 9818 } while (dmu_objset_is_dirty(mos, txg)); 9819 } 9820 9821 /* 9822 * Rewrite the vdev configuration (which includes the uberblock) to 9823 * commit the transaction group. 9824 * 9825 * If there are no dirty vdevs, we sync the uberblock to a few random 9826 * top-level vdevs that are known to be visible in the config cache 9827 * (see spa_vdev_add() for a complete description). If there *are* dirty 9828 * vdevs, sync the uberblock to all vdevs. 9829 */ 9830 static void 9831 spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx) 9832 { 9833 vdev_t *rvd = spa->spa_root_vdev; 9834 uint64_t txg = tx->tx_txg; 9835 9836 for (;;) { 9837 int error = 0; 9838 9839 /* 9840 * We hold SCL_STATE to prevent vdev open/close/etc. 9841 * while we're attempting to write the vdev labels. 9842 */ 9843 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 9844 9845 if (list_is_empty(&spa->spa_config_dirty_list)) { 9846 vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; 9847 int svdcount = 0; 9848 int children = rvd->vdev_children; 9849 int c0 = random_in_range(children); 9850 9851 for (int c = 0; c < children; c++) { 9852 vdev_t *vd = 9853 rvd->vdev_child[(c0 + c) % children]; 9854 9855 /* Stop when revisiting the first vdev */ 9856 if (c > 0 && svd[0] == vd) 9857 break; 9858 9859 if (vd->vdev_ms_array == 0 || 9860 vd->vdev_islog || 9861 !vdev_is_concrete(vd)) 9862 continue; 9863 9864 svd[svdcount++] = vd; 9865 if (svdcount == SPA_SYNC_MIN_VDEVS) 9866 break; 9867 } 9868 error = vdev_config_sync(svd, svdcount, txg); 9869 } else { 9870 error = vdev_config_sync(rvd->vdev_child, 9871 rvd->vdev_children, txg); 9872 } 9873 9874 if (error == 0) 9875 spa->spa_last_synced_guid = rvd->vdev_guid; 9876 9877 spa_config_exit(spa, SCL_STATE, FTAG); 9878 9879 if (error == 0) 9880 break; 9881 zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR); 9882 zio_resume_wait(spa); 9883 } 9884 } 9885 9886 /* 9887 * Sync the specified transaction group. New blocks may be dirtied as 9888 * part of the process, so we iterate until it converges. 9889 */ 9890 void 9891 spa_sync(spa_t *spa, uint64_t txg) 9892 { 9893 vdev_t *vd = NULL; 9894 9895 VERIFY(spa_writeable(spa)); 9896 9897 /* 9898 * Wait for i/os issued in open context that need to complete 9899 * before this txg syncs. 9900 */ 9901 (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]); 9902 spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, 9903 ZIO_FLAG_CANFAIL); 9904 9905 /* 9906 * Now that there can be no more cloning in this transaction group, 9907 * but we are still before issuing frees, we can process pending BRT 9908 * updates. 9909 */ 9910 brt_pending_apply(spa, txg); 9911 9912 /* 9913 * Lock out configuration changes. 
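 * Holding SCL_CONFIG as reader for the duration of the sync is
 * sufficient: operations that change the configuration (e.g. adding
 * or removing vdevs) must take the lock as writer, so they are held
 * off until this txg has finished syncing.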
9914 */ 9915 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 9916 9917 spa->spa_syncing_txg = txg; 9918 spa->spa_sync_pass = 0; 9919 9920 for (int i = 0; i < spa->spa_alloc_count; i++) { 9921 mutex_enter(&spa->spa_allocs[i].spaa_lock); 9922 VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree)); 9923 mutex_exit(&spa->spa_allocs[i].spaa_lock); 9924 } 9925 9926 /* 9927 * If there are any pending vdev state changes, convert them 9928 * into config changes that go out with this transaction group. 9929 */ 9930 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 9931 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 9932 /* Avoid holding the write lock unless actually necessary */ 9933 if (vd->vdev_aux == NULL) { 9934 vdev_state_clean(vd); 9935 vdev_config_dirty(vd); 9936 continue; 9937 } 9938 /* 9939 * We need the write lock here because, for aux vdevs, 9940 * calling vdev_config_dirty() modifies sav_config. 9941 * This is ugly and will become unnecessary when we 9942 * eliminate the aux vdev wart by integrating all vdevs 9943 * into the root vdev tree. 9944 */ 9945 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 9946 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 9947 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 9948 vdev_state_clean(vd); 9949 vdev_config_dirty(vd); 9950 } 9951 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 9952 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 9953 } 9954 spa_config_exit(spa, SCL_STATE, FTAG); 9955 9956 dsl_pool_t *dp = spa->spa_dsl_pool; 9957 dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); 9958 9959 spa->spa_sync_starttime = gethrtime(); 9960 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); 9961 spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, 9962 spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + 9963 NSEC_TO_TICK(spa->spa_deadman_synctime)); 9964 9965 /* 9966 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 9967 * set spa_deflate if we have no raid-z vdevs. 9968 */ 9969 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 9970 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 9971 vdev_t *rvd = spa->spa_root_vdev; 9972 9973 int i; 9974 for (i = 0; i < rvd->vdev_children; i++) { 9975 vd = rvd->vdev_child[i]; 9976 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 9977 break; 9978 } 9979 if (i == rvd->vdev_children) { 9980 spa->spa_deflate = TRUE; 9981 VERIFY0(zap_add(spa->spa_meta_objset, 9982 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 9983 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 9984 } 9985 } 9986 9987 spa_sync_adjust_vdev_max_queue_depth(spa); 9988 9989 spa_sync_condense_indirect(spa, tx); 9990 9991 spa_sync_iterate_to_convergence(spa, tx); 9992 9993 #ifdef ZFS_DEBUG 9994 if (!list_is_empty(&spa->spa_config_dirty_list)) { 9995 /* 9996 * Make sure that the number of ZAPs for all the vdevs matches 9997 * the number of ZAPs in the per-vdev ZAP list. This only gets 9998 * called if the config is dirty; otherwise there may be 9999 * outstanding AVZ operations that weren't completed in 10000 * spa_sync_config_object. 
10001 */ 10002 uint64_t all_vdev_zap_entry_count; 10003 ASSERT0(zap_count(spa->spa_meta_objset, 10004 spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); 10005 ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, 10006 all_vdev_zap_entry_count); 10007 } 10008 #endif 10009 10010 if (spa->spa_vdev_removal != NULL) { 10011 ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]); 10012 } 10013 10014 spa_sync_rewrite_vdev_config(spa, tx); 10015 dmu_tx_commit(tx); 10016 10017 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); 10018 spa->spa_deadman_tqid = 0; 10019 10020 /* 10021 * Clear the dirty config list. 10022 */ 10023 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 10024 vdev_config_clean(vd); 10025 10026 /* 10027 * Now that the new config has synced transactionally, 10028 * let it become visible to the config cache. 10029 */ 10030 if (spa->spa_config_syncing != NULL) { 10031 spa_config_set(spa, spa->spa_config_syncing); 10032 spa->spa_config_txg = txg; 10033 spa->spa_config_syncing = NULL; 10034 } 10035 10036 dsl_pool_sync_done(dp, txg); 10037 10038 for (int i = 0; i < spa->spa_alloc_count; i++) { 10039 mutex_enter(&spa->spa_allocs[i].spaa_lock); 10040 VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree)); 10041 mutex_exit(&spa->spa_allocs[i].spaa_lock); 10042 } 10043 10044 /* 10045 * Update usable space statistics. 10046 */ 10047 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 10048 != NULL) 10049 vdev_sync_done(vd, txg); 10050 10051 metaslab_class_evict_old(spa->spa_normal_class, txg); 10052 metaslab_class_evict_old(spa->spa_log_class, txg); 10053 10054 spa_sync_close_syncing_log_sm(spa); 10055 10056 spa_update_dspace(spa); 10057 10058 if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) 10059 vdev_autotrim_kick(spa); 10060 10061 /* 10062 * It had better be the case that we didn't dirty anything 10063 * since vdev_config_sync(). 10064 */ 10065 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 10066 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 10067 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 10068 10069 while (zfs_pause_spa_sync) 10070 delay(1); 10071 10072 spa->spa_sync_pass = 0; 10073 10074 /* 10075 * Update the last synced uberblock here. We want to do this at 10076 * the end of spa_sync() so that consumers of spa_last_synced_txg() 10077 * will be guaranteed that all the processing associated with 10078 * that txg has been completed. 10079 */ 10080 spa->spa_ubsync = spa->spa_uberblock; 10081 spa_config_exit(spa, SCL_CONFIG, FTAG); 10082 10083 spa_handle_ignored_writes(spa); 10084 10085 /* 10086 * If any async tasks have been requested, kick them off. 10087 */ 10088 spa_async_dispatch(spa); 10089 } 10090 10091 /* 10092 * Sync all pools. We don't want to hold the namespace lock across these 10093 * operations, so we take a reference on the spa_t and drop the lock during the 10094 * sync. 
10095 */ 10096 void 10097 spa_sync_allpools(void) 10098 { 10099 spa_t *spa = NULL; 10100 mutex_enter(&spa_namespace_lock); 10101 while ((spa = spa_next(spa)) != NULL) { 10102 if (spa_state(spa) != POOL_STATE_ACTIVE || 10103 !spa_writeable(spa) || spa_suspended(spa)) 10104 continue; 10105 spa_open_ref(spa, FTAG); 10106 mutex_exit(&spa_namespace_lock); 10107 txg_wait_synced(spa_get_dsl(spa), 0); 10108 mutex_enter(&spa_namespace_lock); 10109 spa_close(spa, FTAG); 10110 } 10111 mutex_exit(&spa_namespace_lock); 10112 } 10113 10114 taskq_t * 10115 spa_sync_tq_create(spa_t *spa, const char *name) 10116 { 10117 kthread_t **kthreads; 10118 10119 ASSERT(spa->spa_sync_tq == NULL); 10120 ASSERT3S(spa->spa_alloc_count, <=, boot_ncpus); 10121 10122 /* 10123 * - do not allow more allocators than cpus. 10124 * - there may be more cpus than allocators. 10125 * - do not allow more sync taskq threads than allocators or cpus. 10126 */ 10127 int nthreads = spa->spa_alloc_count; 10128 spa->spa_syncthreads = kmem_zalloc(sizeof (spa_syncthread_info_t) * 10129 nthreads, KM_SLEEP); 10130 10131 spa->spa_sync_tq = taskq_create_synced(name, nthreads, minclsyspri, 10132 nthreads, INT_MAX, TASKQ_PREPOPULATE, &kthreads); 10133 VERIFY(spa->spa_sync_tq != NULL); 10134 VERIFY(kthreads != NULL); 10135 10136 spa_taskqs_t *tqs = 10137 &spa->spa_zio_taskq[ZIO_TYPE_WRITE][ZIO_TASKQ_ISSUE]; 10138 10139 spa_syncthread_info_t *ti = spa->spa_syncthreads; 10140 for (int i = 0, w = 0; i < nthreads; i++, w++, ti++) { 10141 ti->sti_thread = kthreads[i]; 10142 if (w == tqs->stqs_count) { 10143 w = 0; 10144 } 10145 ti->sti_wr_iss_tq = tqs->stqs_taskq[w]; 10146 } 10147 10148 kmem_free(kthreads, sizeof (*kthreads) * nthreads); 10149 return (spa->spa_sync_tq); 10150 } 10151 10152 void 10153 spa_sync_tq_destroy(spa_t *spa) 10154 { 10155 ASSERT(spa->spa_sync_tq != NULL); 10156 10157 taskq_wait(spa->spa_sync_tq); 10158 taskq_destroy(spa->spa_sync_tq); 10159 kmem_free(spa->spa_syncthreads, 10160 sizeof (spa_syncthread_info_t) * spa->spa_alloc_count); 10161 spa->spa_sync_tq = NULL; 10162 } 10163 10164 void 10165 spa_select_allocator(zio_t *zio) 10166 { 10167 zbookmark_phys_t *bm = &zio->io_bookmark; 10168 spa_t *spa = zio->io_spa; 10169 10170 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 10171 10172 /* 10173 * A gang block (for example) may have inherited its parent's 10174 * allocator, in which case there is nothing further to do here. 10175 */ 10176 if (ZIO_HAS_ALLOCATOR(zio)) 10177 return; 10178 10179 ASSERT(spa != NULL); 10180 ASSERT(bm != NULL); 10181 10182 /* 10183 * First try to use an allocator assigned to the syncthread, and set 10184 * the corresponding write issue taskq for the allocator. 10185 * Note, we must have an open pool to do this. 10186 */ 10187 if (spa->spa_sync_tq != NULL) { 10188 spa_syncthread_info_t *ti = spa->spa_syncthreads; 10189 for (int i = 0; i < spa->spa_alloc_count; i++, ti++) { 10190 if (ti->sti_thread == curthread) { 10191 zio->io_allocator = i; 10192 zio->io_wr_iss_tq = ti->sti_wr_iss_tq; 10193 return; 10194 } 10195 } 10196 } 10197 10198 /* 10199 * We want to try to use as many allocators as possible to help improve 10200 * performance, but we also want logically adjacent IOs to be physically 10201 * adjacent to improve sequential read performance. We chunk each object 10202 * into 2^20 block regions, and then hash based on the objset, object, 10203 * level, and region to accomplish both of these goals. 
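 *
 * Illustrative example (hypothetical numbers): with 128K blocks, a
 * 2^20-blkid region covers 128GB of a single object, so writes within
 * that window all land on one allocator, while a different window (or
 * a different object) may hash to another.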
10204 */ 10205 uint64_t hv = cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level, 10206 bm->zb_blkid >> 20); 10207 10208 zio->io_allocator = (uint_t)hv % spa->spa_alloc_count; 10209 zio->io_wr_iss_tq = NULL; 10210 } 10211 10212 /* 10213 * ========================================================================== 10214 * Miscellaneous routines 10215 * ========================================================================== 10216 */ 10217 10218 /* 10219 * Remove all pools in the system. 10220 */ 10221 void 10222 spa_evict_all(void) 10223 { 10224 spa_t *spa; 10225 10226 /* 10227 * Remove all cached state. All pools should be closed now, 10228 * so every spa in the AVL tree should be unreferenced. 10229 */ 10230 mutex_enter(&spa_namespace_lock); 10231 while ((spa = spa_next(NULL)) != NULL) { 10232 /* 10233 * Stop async tasks. The async thread may need to detach 10234 * a device that's been replaced, which requires grabbing 10235 * spa_namespace_lock, so we must drop it here. 10236 */ 10237 spa_open_ref(spa, FTAG); 10238 mutex_exit(&spa_namespace_lock); 10239 spa_async_suspend(spa); 10240 mutex_enter(&spa_namespace_lock); 10241 spa_close(spa, FTAG); 10242 10243 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 10244 spa_unload(spa); 10245 spa_deactivate(spa); 10246 } 10247 spa_remove(spa); 10248 } 10249 mutex_exit(&spa_namespace_lock); 10250 } 10251 10252 vdev_t * 10253 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 10254 { 10255 vdev_t *vd; 10256 int i; 10257 10258 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 10259 return (vd); 10260 10261 if (aux) { 10262 for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 10263 vd = spa->spa_l2cache.sav_vdevs[i]; 10264 if (vd->vdev_guid == guid) 10265 return (vd); 10266 } 10267 10268 for (i = 0; i < spa->spa_spares.sav_count; i++) { 10269 vd = spa->spa_spares.sav_vdevs[i]; 10270 if (vd->vdev_guid == guid) 10271 return (vd); 10272 } 10273 } 10274 10275 return (NULL); 10276 } 10277 10278 void 10279 spa_upgrade(spa_t *spa, uint64_t version) 10280 { 10281 ASSERT(spa_writeable(spa)); 10282 10283 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 10284 10285 /* 10286 * This should only be called for a non-faulted pool, and since a 10287 * future version would result in an unopenable pool, this shouldn't be 10288 * possible. 
10289 */ 10290 ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); 10291 ASSERT3U(version, >=, spa->spa_uberblock.ub_version); 10292 10293 spa->spa_uberblock.ub_version = version; 10294 vdev_config_dirty(spa->spa_root_vdev); 10295 10296 spa_config_exit(spa, SCL_ALL, FTAG); 10297 10298 txg_wait_synced(spa_get_dsl(spa), 0); 10299 } 10300 10301 static boolean_t 10302 spa_has_aux_vdev(spa_t *spa, uint64_t guid, spa_aux_vdev_t *sav) 10303 { 10304 (void) spa; 10305 int i; 10306 uint64_t vdev_guid; 10307 10308 for (i = 0; i < sav->sav_count; i++) 10309 if (sav->sav_vdevs[i]->vdev_guid == guid) 10310 return (B_TRUE); 10311 10312 for (i = 0; i < sav->sav_npending; i++) { 10313 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 10314 &vdev_guid) == 0 && vdev_guid == guid) 10315 return (B_TRUE); 10316 } 10317 10318 return (B_FALSE); 10319 } 10320 10321 boolean_t 10322 spa_has_l2cache(spa_t *spa, uint64_t guid) 10323 { 10324 return (spa_has_aux_vdev(spa, guid, &spa->spa_l2cache)); 10325 } 10326 10327 boolean_t 10328 spa_has_spare(spa_t *spa, uint64_t guid) 10329 { 10330 return (spa_has_aux_vdev(spa, guid, &spa->spa_spares)); 10331 } 10332 10333 /* 10334 * Check if a pool has an active shared spare device. 10335 * Note: reference count of an active spare is 2, as a spare and as a replace 10336 */ 10337 static boolean_t 10338 spa_has_active_shared_spare(spa_t *spa) 10339 { 10340 int i, refcnt; 10341 uint64_t pool; 10342 spa_aux_vdev_t *sav = &spa->spa_spares; 10343 10344 for (i = 0; i < sav->sav_count; i++) { 10345 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 10346 &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 10347 refcnt > 2) 10348 return (B_TRUE); 10349 } 10350 10351 return (B_FALSE); 10352 } 10353 10354 uint64_t 10355 spa_total_metaslabs(spa_t *spa) 10356 { 10357 vdev_t *rvd = spa->spa_root_vdev; 10358 10359 uint64_t m = 0; 10360 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 10361 vdev_t *vd = rvd->vdev_child[c]; 10362 if (!vdev_is_concrete(vd)) 10363 continue; 10364 m += vd->vdev_ms_count; 10365 } 10366 return (m); 10367 } 10368 10369 /* 10370 * Notify any waiting threads that some activity has switched from being in- 10371 * progress to not-in-progress so that the thread can wake up and determine 10372 * whether it is finished waiting. 10373 */ 10374 void 10375 spa_notify_waiters(spa_t *spa) 10376 { 10377 /* 10378 * Acquiring spa_activities_lock here prevents the cv_broadcast from 10379 * happening between the waiting thread's check and cv_wait. 10380 */ 10381 mutex_enter(&spa->spa_activities_lock); 10382 cv_broadcast(&spa->spa_activities_cv); 10383 mutex_exit(&spa->spa_activities_lock); 10384 } 10385 10386 /* 10387 * Notify any waiting threads that the pool is exporting, and then block until 10388 * they are finished using the spa_t. 10389 */ 10390 void 10391 spa_wake_waiters(spa_t *spa) 10392 { 10393 mutex_enter(&spa->spa_activities_lock); 10394 spa->spa_waiters_cancel = B_TRUE; 10395 cv_broadcast(&spa->spa_activities_cv); 10396 while (spa->spa_waiters != 0) 10397 cv_wait(&spa->spa_waiters_cv, &spa->spa_activities_lock); 10398 spa->spa_waiters_cancel = B_FALSE; 10399 mutex_exit(&spa->spa_activities_lock); 10400 } 10401 10402 /* Whether the vdev or any of its descendants are being initialized/trimmed. 
*/ 10403 static boolean_t 10404 spa_vdev_activity_in_progress_impl(vdev_t *vd, zpool_wait_activity_t activity) 10405 { 10406 spa_t *spa = vd->vdev_spa; 10407 10408 ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER)); 10409 ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); 10410 ASSERT(activity == ZPOOL_WAIT_INITIALIZE || 10411 activity == ZPOOL_WAIT_TRIM); 10412 10413 kmutex_t *lock = activity == ZPOOL_WAIT_INITIALIZE ? 10414 &vd->vdev_initialize_lock : &vd->vdev_trim_lock; 10415 10416 mutex_exit(&spa->spa_activities_lock); 10417 mutex_enter(lock); 10418 mutex_enter(&spa->spa_activities_lock); 10419 10420 boolean_t in_progress = (activity == ZPOOL_WAIT_INITIALIZE) ? 10421 (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) : 10422 (vd->vdev_trim_state == VDEV_TRIM_ACTIVE); 10423 mutex_exit(lock); 10424 10425 if (in_progress) 10426 return (B_TRUE); 10427 10428 for (int i = 0; i < vd->vdev_children; i++) { 10429 if (spa_vdev_activity_in_progress_impl(vd->vdev_child[i], 10430 activity)) 10431 return (B_TRUE); 10432 } 10433 10434 return (B_FALSE); 10435 } 10436 10437 /* 10438 * If use_guid is true, this checks whether the vdev specified by guid is 10439 * being initialized/trimmed. Otherwise, it checks whether any vdev in the pool 10440 * is being initialized/trimmed. The caller must hold the config lock and 10441 * spa_activities_lock. 10442 */ 10443 static int 10444 spa_vdev_activity_in_progress(spa_t *spa, boolean_t use_guid, uint64_t guid, 10445 zpool_wait_activity_t activity, boolean_t *in_progress) 10446 { 10447 mutex_exit(&spa->spa_activities_lock); 10448 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 10449 mutex_enter(&spa->spa_activities_lock); 10450 10451 vdev_t *vd; 10452 if (use_guid) { 10453 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 10454 if (vd == NULL || !vd->vdev_ops->vdev_op_leaf) { 10455 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 10456 return (EINVAL); 10457 } 10458 } else { 10459 vd = spa->spa_root_vdev; 10460 } 10461 10462 *in_progress = spa_vdev_activity_in_progress_impl(vd, activity); 10463 10464 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 10465 return (0); 10466 } 10467 10468 /* 10469 * Locking for waiting threads 10470 * --------------------------- 10471 * 10472 * Waiting threads need a way to check whether a given activity is in progress, 10473 * and then, if it is, wait for it to complete. Each activity will have some 10474 * in-memory representation of the relevant on-disk state which can be used to 10475 * determine whether or not the activity is in progress. The in-memory state and 10476 * the locking used to protect it will be different for each activity, and may 10477 * not be suitable for use with a cvar (e.g., some state is protected by the 10478 * config lock). To allow waiting threads to wait without any races, another 10479 * lock, spa_activities_lock, is used. 10480 * 10481 * When the state is checked, both the activity-specific lock (if there is one) 10482 * and spa_activities_lock are held. In some cases, the activity-specific lock 10483 * is acquired explicitly (e.g. the config lock). In others, the locking is 10484 * internal to some check (e.g. bpobj_is_empty). After checking, the waiting 10485 * thread releases the activity-specific lock and, if the activity is in 10486 * progress, then cv_waits using spa_activities_lock. 
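 *
 * In outline, a waiting thread does roughly the following
 * (illustrative sketch only; the real loop is in spa_wait_common()
 * below):
 *
 *	mutex_enter(&spa->spa_activities_lock);
 *	for (;;) {
 *		check the activity's state, possibly dropping and
 *		    retaking spa_activities_lock around the
 *		    activity-specific lock as described above;
 *		if (!in_progress)
 *			break;
 *		cv_wait_sig(&spa->spa_activities_cv,
 *		    &spa->spa_activities_lock);
 *	}
 *	mutex_exit(&spa->spa_activities_lock);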
10487 * 10488 * The waiting thread is woken when another thread, one completing some 10489 * activity, updates the state of the activity and then calls 10490 * spa_notify_waiters, which will cv_broadcast. This 'completing' thread only 10491 * needs to hold its activity-specific lock when updating the state, and this 10492 * lock can (but doesn't have to) be dropped before calling spa_notify_waiters. 10493 * 10494 * Because spa_notify_waiters acquires spa_activities_lock before broadcasting, 10495 * and because it is held when the waiting thread checks the state of the 10496 * activity, it can never be the case that the completing thread both updates 10497 * the activity state and cv_broadcasts in between the waiting thread's check 10498 * and cv_wait. Thus, a waiting thread can never miss a wakeup. 10499 * 10500 * In order to prevent deadlock, when the waiting thread does its check, in some 10501 * cases it will temporarily drop spa_activities_lock in order to acquire the 10502 * activity-specific lock. The order in which spa_activities_lock and the 10503 * activity specific lock are acquired in the waiting thread is determined by 10504 * the order in which they are acquired in the completing thread; if the 10505 * completing thread calls spa_notify_waiters with the activity-specific lock 10506 * held, then the waiting thread must also acquire the activity-specific lock 10507 * first. 10508 */ 10509 10510 static int 10511 spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, 10512 boolean_t use_tag, uint64_t tag, boolean_t *in_progress) 10513 { 10514 int error = 0; 10515 10516 ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); 10517 10518 switch (activity) { 10519 case ZPOOL_WAIT_CKPT_DISCARD: 10520 *in_progress = 10521 (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT) && 10522 zap_contains(spa_meta_objset(spa), 10523 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT) == 10524 ENOENT); 10525 break; 10526 case ZPOOL_WAIT_FREE: 10527 *in_progress = ((spa_version(spa) >= SPA_VERSION_DEADLISTS && 10528 !bpobj_is_empty(&spa->spa_dsl_pool->dp_free_bpobj)) || 10529 spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY) || 10530 spa_livelist_delete_check(spa)); 10531 break; 10532 case ZPOOL_WAIT_INITIALIZE: 10533 case ZPOOL_WAIT_TRIM: 10534 error = spa_vdev_activity_in_progress(spa, use_tag, tag, 10535 activity, in_progress); 10536 break; 10537 case ZPOOL_WAIT_REPLACE: 10538 mutex_exit(&spa->spa_activities_lock); 10539 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 10540 mutex_enter(&spa->spa_activities_lock); 10541 10542 *in_progress = vdev_replace_in_progress(spa->spa_root_vdev); 10543 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 10544 break; 10545 case ZPOOL_WAIT_REMOVE: 10546 *in_progress = (spa->spa_removing_phys.sr_state == 10547 DSS_SCANNING); 10548 break; 10549 case ZPOOL_WAIT_RESILVER: 10550 *in_progress = vdev_rebuild_active(spa->spa_root_vdev); 10551 if (*in_progress) 10552 break; 10553 zfs_fallthrough; 10554 case ZPOOL_WAIT_SCRUB: 10555 { 10556 boolean_t scanning, paused, is_scrub; 10557 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; 10558 10559 is_scrub = (scn->scn_phys.scn_func == POOL_SCAN_SCRUB); 10560 scanning = (scn->scn_phys.scn_state == DSS_SCANNING); 10561 paused = dsl_scan_is_paused_scrub(scn); 10562 *in_progress = (scanning && !paused && 10563 is_scrub == (activity == ZPOOL_WAIT_SCRUB)); 10564 break; 10565 } 10566 case ZPOOL_WAIT_RAIDZ_EXPAND: 10567 { 10568 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 10569 *in_progress = (vre 
	    != NULL && vre->vre_state == DSS_SCANNING);
		break;
	}
	default:
		panic("unrecognized value for activity %d", activity);
	}

	return (error);
}

static int
spa_wait_common(const char *pool, zpool_wait_activity_t activity,
    boolean_t use_tag, uint64_t tag, boolean_t *waited)
{
	/*
	 * The tag is used to distinguish between instances of an activity.
	 * 'initialize' and 'trim' are the only activities that we use this for.
	 * The other activities can only have a single instance in progress in a
	 * pool at one time, making the tag unnecessary.
	 *
	 * There can be multiple devices being replaced at once, but since they
	 * all finish once resilvering finishes, we don't bother keeping track
	 * of them individually, we just wait for them all to finish.
	 */
	if (use_tag && activity != ZPOOL_WAIT_INITIALIZE &&
	    activity != ZPOOL_WAIT_TRIM)
		return (EINVAL);

	if (activity < 0 || activity >= ZPOOL_WAIT_NUM_ACTIVITIES)
		return (EINVAL);

	spa_t *spa;
	int error = spa_open(pool, &spa, FTAG);
	if (error != 0)
		return (error);

	/*
	 * Increment the spa's waiter count so that we can call spa_close and
	 * still ensure that the spa_t doesn't get freed before this thread is
	 * finished with it when the pool is exported. We want to call spa_close
	 * before we start waiting because otherwise the additional ref would
	 * prevent the pool from being exported or destroyed throughout the
	 * potentially long wait.
	 */
	mutex_enter(&spa->spa_activities_lock);
	spa->spa_waiters++;
	spa_close(spa, FTAG);

	*waited = B_FALSE;
	for (;;) {
		boolean_t in_progress;
		error = spa_activity_in_progress(spa, activity, use_tag, tag,
		    &in_progress);

		if (error || !in_progress || spa->spa_waiters_cancel)
			break;

		*waited = B_TRUE;

		if (cv_wait_sig(&spa->spa_activities_cv,
		    &spa->spa_activities_lock) == 0) {
			error = EINTR;
			break;
		}
	}

	spa->spa_waiters--;
	cv_signal(&spa->spa_waiters_cv);
	mutex_exit(&spa->spa_activities_lock);

	return (error);
}

/*
 * Wait for a particular instance of the specified activity to complete, where
 * the instance is identified by 'tag'
 */
int
spa_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag,
    boolean_t *waited)
{
	return (spa_wait_common(pool, activity, B_TRUE, tag, waited));
}

/*
 * Wait for all instances of the specified activity to complete.
 */
int
spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited)
{
	return (spa_wait_common(pool, activity, B_FALSE, 0, waited));
}

sysevent_t *
spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
{
	sysevent_t *ev = NULL;
#ifdef _KERNEL
	nvlist_t *resource;

	resource = zfs_event_create(spa, vd, FM_SYSEVENT_CLASS, name, hist_nvl);
	if (resource) {
		ev = kmem_alloc(sizeof (sysevent_t), KM_SLEEP);
		ev->resource = resource;
	}
#else
	(void) spa, (void) vd, (void) hist_nvl, (void) name;
#endif
	return (ev);
}

void
spa_event_post(sysevent_t *ev)
{
#ifdef _KERNEL
	if (ev) {
		zfs_zevent_post(ev->resource, NULL, zfs_zevent_post_cb);
		kmem_free(ev, sizeof (*ev));
	}
#else
	(void) ev;
#endif
}

/*
 * Post a zevent corresponding to the given sysevent. The 'name' must be one
 * of the event definitions in sys/sysevent/eventdefs.h. The payload will be
 * filled in from the spa and (optionally) the vdev. This doesn't do anything
 * in the userland libzpool, as we don't want consumers to misinterpret ztest
 * or zdb as real changes.
 */
void
spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
{
	spa_event_post(spa_event_create(spa, vd, hist_nvl, name));
}

/* state manipulation functions */
EXPORT_SYMBOL(spa_open);
EXPORT_SYMBOL(spa_open_rewind);
EXPORT_SYMBOL(spa_get_stats);
EXPORT_SYMBOL(spa_create);
EXPORT_SYMBOL(spa_import);
EXPORT_SYMBOL(spa_tryimport);
EXPORT_SYMBOL(spa_destroy);
EXPORT_SYMBOL(spa_export);
EXPORT_SYMBOL(spa_reset);
EXPORT_SYMBOL(spa_async_request);
EXPORT_SYMBOL(spa_async_suspend);
EXPORT_SYMBOL(spa_async_resume);
EXPORT_SYMBOL(spa_inject_addref);
EXPORT_SYMBOL(spa_inject_delref);
EXPORT_SYMBOL(spa_scan_stat_init);
EXPORT_SYMBOL(spa_scan_get_stats);

/* device manipulation */
EXPORT_SYMBOL(spa_vdev_add);
EXPORT_SYMBOL(spa_vdev_attach);
EXPORT_SYMBOL(spa_vdev_detach);
EXPORT_SYMBOL(spa_vdev_setpath);
EXPORT_SYMBOL(spa_vdev_setfru);
EXPORT_SYMBOL(spa_vdev_split_mirror);

/* spare state (which is global across all pools) */
EXPORT_SYMBOL(spa_spare_add);
EXPORT_SYMBOL(spa_spare_remove);
EXPORT_SYMBOL(spa_spare_exists);
EXPORT_SYMBOL(spa_spare_activate);

/* L2ARC state (which is global across all pools) */
EXPORT_SYMBOL(spa_l2cache_add);
EXPORT_SYMBOL(spa_l2cache_remove);
EXPORT_SYMBOL(spa_l2cache_exists);
EXPORT_SYMBOL(spa_l2cache_activate);
EXPORT_SYMBOL(spa_l2cache_drop);

/* scanning */
EXPORT_SYMBOL(spa_scan);
EXPORT_SYMBOL(spa_scan_stop);

/* spa syncing */
EXPORT_SYMBOL(spa_sync); /* only for DMU use */
EXPORT_SYMBOL(spa_sync_allpools);

/* properties */
EXPORT_SYMBOL(spa_prop_set);
EXPORT_SYMBOL(spa_prop_get);
EXPORT_SYMBOL(spa_prop_clear_bootfs);

/* asynchronous event notification */
EXPORT_SYMBOL(spa_event_notify);

ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_pct, UINT, ZMOD_RW,
	"Percentage of CPUs to run a metaslab preload taskq");

/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, UINT, ZMOD_RW,
	"log2 fraction of arc that can be used by inflight I/Os when "
	"verifying pool during import");
/* END CSTYLED */

ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_metadata, INT, ZMOD_RW,
	"Set to traverse metadata on pool import");

ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW,
	"Set to traverse data on pool import");

ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW,
	"Print vdev tree to zfs_dbgmsg during pool import");

ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD,
	"Percentage of CPUs to run an IO worker thread");

ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RD,
	"Number of threads per IO
worker taskqueue"); 10785 10786 /* BEGIN CSTYLED */ 10787 ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, U64, ZMOD_RW, 10788 "Allow importing pool with up to this number of missing top-level " 10789 "vdevs (in read-only mode)"); 10790 /* END CSTYLED */ 10791 10792 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT, 10793 ZMOD_RW, "Set the livelist condense zthr to pause"); 10794 10795 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT, 10796 ZMOD_RW, "Set the livelist condense synctask to pause"); 10797 10798 /* BEGIN CSTYLED */ 10799 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_cancel, 10800 INT, ZMOD_RW, 10801 "Whether livelist condensing was canceled in the synctask"); 10802 10803 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel, 10804 INT, ZMOD_RW, 10805 "Whether livelist condensing was canceled in the zthr function"); 10806 10807 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, 10808 ZMOD_RW, 10809 "Whether extra ALLOC blkptrs were added to a livelist entry while it " 10810 "was being condensed"); 10811 10812 #ifdef _KERNEL 10813 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read, 10814 spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RD, 10815 "Configure IO queues for read IO"); 10816 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write, 10817 spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RD, 10818 "Configure IO queues for write IO"); 10819 #endif 10820 /* END CSTYLED */ 10821 10822 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_wr_iss_ncpus, UINT, ZMOD_RW, 10823 "Number of CPUs to run write issue taskqs"); 10824