1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2011, 2024 by Delphix. All rights reserved. 25 * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved. 26 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 27 * Copyright 2013 Saso Kiselkov. All rights reserved. 28 * Copyright (c) 2014 Integros [integros.com] 29 * Copyright 2016 Toomas Soome <tsoome@me.com> 30 * Copyright (c) 2016 Actifio, Inc. All rights reserved. 31 * Copyright 2018 Joyent, Inc. 32 * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. 33 * Copyright 2017 Joyent, Inc. 34 * Copyright (c) 2017, Intel Corporation. 35 * Copyright (c) 2021, Colm Buckley <colm@tuatha.org> 36 * Copyright (c) 2023 Hewlett Packard Enterprise Development LP. 37 */ 38 39 /* 40 * SPA: Storage Pool Allocator 41 * 42 * This file contains all the routines used when modifying on-disk SPA state. 43 * This includes opening, importing, destroying, exporting a pool, and syncing a 44 * pool. 45 */ 46 47 #include <sys/zfs_context.h> 48 #include <sys/fm/fs/zfs.h> 49 #include <sys/spa_impl.h> 50 #include <sys/zio.h> 51 #include <sys/zio_checksum.h> 52 #include <sys/dmu.h> 53 #include <sys/dmu_tx.h> 54 #include <sys/zap.h> 55 #include <sys/zil.h> 56 #include <sys/brt.h> 57 #include <sys/ddt.h> 58 #include <sys/vdev_impl.h> 59 #include <sys/vdev_removal.h> 60 #include <sys/vdev_indirect_mapping.h> 61 #include <sys/vdev_indirect_births.h> 62 #include <sys/vdev_initialize.h> 63 #include <sys/vdev_rebuild.h> 64 #include <sys/vdev_trim.h> 65 #include <sys/vdev_disk.h> 66 #include <sys/vdev_raidz.h> 67 #include <sys/vdev_draid.h> 68 #include <sys/metaslab.h> 69 #include <sys/metaslab_impl.h> 70 #include <sys/mmp.h> 71 #include <sys/uberblock_impl.h> 72 #include <sys/txg.h> 73 #include <sys/avl.h> 74 #include <sys/bpobj.h> 75 #include <sys/dmu_traverse.h> 76 #include <sys/dmu_objset.h> 77 #include <sys/unique.h> 78 #include <sys/dsl_pool.h> 79 #include <sys/dsl_dataset.h> 80 #include <sys/dsl_dir.h> 81 #include <sys/dsl_prop.h> 82 #include <sys/dsl_synctask.h> 83 #include <sys/fs/zfs.h> 84 #include <sys/arc.h> 85 #include <sys/callb.h> 86 #include <sys/systeminfo.h> 87 #include <sys/zfs_ioctl.h> 88 #include <sys/dsl_scan.h> 89 #include <sys/zfeature.h> 90 #include <sys/dsl_destroy.h> 91 #include <sys/zvol.h> 92 93 #ifdef _KERNEL 94 #include <sys/fm/protocol.h> 95 #include <sys/fm/util.h> 96 #include <sys/callb.h> 97 #include <sys/zone.h> 98 #include <sys/vmsystm.h> 99 #endif /* _KERNEL */ 100 101 #include "zfs_prop.h" 102 #include "zfs_comutil.h" 103 #include <cityhash.h> 104 105 /* 106 * spa_thread() existed on Illumos as a parent thread for the various worker 107 * threads that actually run the pool, as a way to both reference the entire 108 * pool work as a single object, and to share properties like scheduling 109 * options. It has not yet been adapted to Linux or FreeBSD. This define is 110 * used to mark related parts of the code to make things easier for the reader, 111 * and to compile this code out. It can be removed when someone implements it, 112 * moves it to some Illumos-specific place, or removes it entirely. 113 */ 114 #undef HAVE_SPA_THREAD 115 116 /* 117 * The "System Duty Cycle" scheduling class is an Illumos feature to help 118 * prevent CPU-intensive kernel threads from affecting latency on interactive 119 * threads. It doesn't exist on Linux or FreeBSD, so the supporting code is 120 * gated behind a define. On Illumos SDC depends on spa_thread(), but 121 * spa_thread() also has other uses, so this is a separate define. 122 */ 123 #undef HAVE_SYSDC 124 125 /* 126 * The interval, in seconds, at which failed configuration cache file writes 127 * should be retried. 128 */ 129 int zfs_ccw_retry_interval = 300; 130 131 typedef enum zti_modes { 132 ZTI_MODE_FIXED, /* value is # of threads (min 1) */ 133 ZTI_MODE_SCALE, /* Taskqs scale with CPUs. */ 134 ZTI_MODE_SYNC, /* sync thread assigned */ 135 ZTI_MODE_NULL, /* don't create a taskq */ 136 ZTI_NMODES 137 } zti_modes_t; 138 139 #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } 140 #define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 } 141 #define ZTI_SCALE { ZTI_MODE_SCALE, 0, 1 } 142 #define ZTI_SYNC { ZTI_MODE_SYNC, 0, 1 } 143 #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } 144 145 #define ZTI_N(n) ZTI_P(n, 1) 146 #define ZTI_ONE ZTI_N(1) 147 148 typedef struct zio_taskq_info { 149 zti_modes_t zti_mode; 150 uint_t zti_value; 151 uint_t zti_count; 152 } zio_taskq_info_t; 153 154 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { 155 "iss", "iss_h", "int", "int_h" 156 }; 157 158 /* 159 * This table defines the taskq settings for each ZFS I/O type. When 160 * initializing a pool, we use this table to create an appropriately sized 161 * taskq. Some operations are low volume and therefore have a small, static 162 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE 163 * macros. Other operations process a large amount of data; the ZTI_SCALE 164 * macro causes us to create a taskq oriented for throughput. Some operations 165 * are so high frequency and short-lived that the taskq itself can become a 166 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an 167 * additional degree of parallelism specified by the number of threads per- 168 * taskq and the number of taskqs; when dispatching an event in this case, the 169 * particular taskq is chosen at random. ZTI_SCALE uses a number of taskqs 170 * that scales with the number of CPUs. 171 * 172 * The different taskq priorities are to handle the different contexts (issue 173 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that 174 * need to be handled with minimum delay. 175 */ 176 static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { 177 /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ 178 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ 179 { ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */ 180 { ZTI_SYNC, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */ 181 { ZTI_SCALE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ 182 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ 183 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FLUSH */ 184 { ZTI_N(4), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* TRIM */ 185 }; 186 187 static void spa_sync_version(void *arg, dmu_tx_t *tx); 188 static void spa_sync_props(void *arg, dmu_tx_t *tx); 189 static boolean_t spa_has_active_shared_spare(spa_t *spa); 190 static int spa_load_impl(spa_t *spa, spa_import_type_t type, 191 const char **ereport); 192 static void spa_vdev_resilver_done(spa_t *spa); 193 194 /* 195 * Percentage of all CPUs that can be used by the metaslab preload taskq. 196 */ 197 static uint_t metaslab_preload_pct = 50; 198 199 static uint_t zio_taskq_batch_pct = 80; /* 1 thread per cpu in pset */ 200 static uint_t zio_taskq_batch_tpq; /* threads per taskq */ 201 202 #ifdef HAVE_SYSDC 203 static const boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ 204 static const uint_t zio_taskq_basedc = 80; /* base duty cycle */ 205 #endif 206 207 #ifdef HAVE_SPA_THREAD 208 static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */ 209 #endif 210 211 static uint_t zio_taskq_write_tpq = 16; 212 213 /* 214 * Report any spa_load_verify errors found, but do not fail spa_load. 215 * This is used by zdb to analyze non-idle pools. 216 */ 217 boolean_t spa_load_verify_dryrun = B_FALSE; 218 219 /* 220 * Allow read spacemaps in case of readonly import (spa_mode == SPA_MODE_READ). 221 * This is used by zdb for spacemaps verification. 222 */ 223 boolean_t spa_mode_readable_spacemaps = B_FALSE; 224 225 /* 226 * This (illegal) pool name is used when temporarily importing a spa_t in order 227 * to get the vdev stats associated with the imported devices. 228 */ 229 #define TRYIMPORT_NAME "$import" 230 231 /* 232 * For debugging purposes: print out vdev tree during pool import. 233 */ 234 static int spa_load_print_vdev_tree = B_FALSE; 235 236 /* 237 * A non-zero value for zfs_max_missing_tvds means that we allow importing 238 * pools with missing top-level vdevs. This is strictly intended for advanced 239 * pool recovery cases since missing data is almost inevitable. Pools with 240 * missing devices can only be imported read-only for safety reasons, and their 241 * fail-mode will be automatically set to "continue". 242 * 243 * With 1 missing vdev we should be able to import the pool and mount all 244 * datasets. User data that was not modified after the missing device has been 245 * added should be recoverable. This means that snapshots created prior to the 246 * addition of that device should be completely intact. 247 * 248 * With 2 missing vdevs, some datasets may fail to mount since there are 249 * dataset statistics that are stored as regular metadata. Some data might be 250 * recoverable if those vdevs were added recently. 251 * 252 * With 3 or more missing vdevs, the pool is severely damaged and MOS entries 253 * may be missing entirely. Chances of data recovery are very low. Note that 254 * there are also risks of performing an inadvertent rewind as we might be 255 * missing all the vdevs with the latest uberblocks. 256 */ 257 uint64_t zfs_max_missing_tvds = 0; 258 259 /* 260 * The parameters below are similar to zfs_max_missing_tvds but are only 261 * intended for a preliminary open of the pool with an untrusted config which 262 * might be incomplete or out-dated. 263 * 264 * We are more tolerant for pools opened from a cachefile since we could have 265 * an out-dated cachefile where a device removal was not registered. 266 * We could have set the limit arbitrarily high but in the case where devices 267 * are really missing we would want to return the proper error codes; we chose 268 * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available 269 * and we get a chance to retrieve the trusted config. 270 */ 271 uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1; 272 273 /* 274 * In the case where config was assembled by scanning device paths (/dev/dsks 275 * by default) we are less tolerant since all the existing devices should have 276 * been detected and we want spa_load to return the right error codes. 277 */ 278 uint64_t zfs_max_missing_tvds_scan = 0; 279 280 /* 281 * Debugging aid that pauses spa_sync() towards the end. 282 */ 283 static const boolean_t zfs_pause_spa_sync = B_FALSE; 284 285 /* 286 * Variables to indicate the livelist condense zthr func should wait at certain 287 * points for the livelist to be removed - used to test condense/destroy races 288 */ 289 static int zfs_livelist_condense_zthr_pause = 0; 290 static int zfs_livelist_condense_sync_pause = 0; 291 292 /* 293 * Variables to track whether or not condense cancellation has been 294 * triggered in testing. 295 */ 296 static int zfs_livelist_condense_sync_cancel = 0; 297 static int zfs_livelist_condense_zthr_cancel = 0; 298 299 /* 300 * Variable to track whether or not extra ALLOC blkptrs were added to a 301 * livelist entry while it was being condensed (caused by the way we track 302 * remapped blkptrs in dbuf_remap_impl) 303 */ 304 static int zfs_livelist_condense_new_alloc = 0; 305 306 /* 307 * ========================================================================== 308 * SPA properties routines 309 * ========================================================================== 310 */ 311 312 /* 313 * Add a (source=src, propname=propval) list to an nvlist. 314 */ 315 static void 316 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, const char *strval, 317 uint64_t intval, zprop_source_t src) 318 { 319 const char *propname = zpool_prop_to_name(prop); 320 nvlist_t *propval; 321 322 propval = fnvlist_alloc(); 323 fnvlist_add_uint64(propval, ZPROP_SOURCE, src); 324 325 if (strval != NULL) 326 fnvlist_add_string(propval, ZPROP_VALUE, strval); 327 else 328 fnvlist_add_uint64(propval, ZPROP_VALUE, intval); 329 330 fnvlist_add_nvlist(nvl, propname, propval); 331 nvlist_free(propval); 332 } 333 334 /* 335 * Add a user property (source=src, propname=propval) to an nvlist. 336 */ 337 static void 338 spa_prop_add_user(nvlist_t *nvl, const char *propname, char *strval, 339 zprop_source_t src) 340 { 341 nvlist_t *propval; 342 343 VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); 344 VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); 345 VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); 346 VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); 347 nvlist_free(propval); 348 } 349 350 /* 351 * Get property values from the spa configuration. 352 */ 353 static void 354 spa_prop_get_config(spa_t *spa, nvlist_t **nvp) 355 { 356 vdev_t *rvd = spa->spa_root_vdev; 357 dsl_pool_t *pool = spa->spa_dsl_pool; 358 uint64_t size, alloc, cap, version; 359 const zprop_source_t src = ZPROP_SRC_NONE; 360 spa_config_dirent_t *dp; 361 metaslab_class_t *mc = spa_normal_class(spa); 362 363 ASSERT(MUTEX_HELD(&spa->spa_props_lock)); 364 365 if (rvd != NULL) { 366 alloc = metaslab_class_get_alloc(mc); 367 alloc += metaslab_class_get_alloc(spa_special_class(spa)); 368 alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); 369 alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa)); 370 371 size = metaslab_class_get_space(mc); 372 size += metaslab_class_get_space(spa_special_class(spa)); 373 size += metaslab_class_get_space(spa_dedup_class(spa)); 374 size += metaslab_class_get_space(spa_embedded_log_class(spa)); 375 376 spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); 377 spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); 378 spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); 379 spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, 380 size - alloc, src); 381 spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL, 382 spa->spa_checkpoint_info.sci_dspace, src); 383 384 spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL, 385 metaslab_class_fragmentation(mc), src); 386 spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, 387 metaslab_class_expandable_space(mc), src); 388 spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, 389 (spa_mode(spa) == SPA_MODE_READ), src); 390 391 cap = (size == 0) ? 0 : (alloc * 100 / size); 392 spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); 393 394 spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, 395 ddt_get_pool_dedup_ratio(spa), src); 396 spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONEUSED, NULL, 397 brt_get_used(spa), src); 398 spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONESAVED, NULL, 399 brt_get_saved(spa), src); 400 spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONERATIO, NULL, 401 brt_get_ratio(spa), src); 402 403 spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, 404 rvd->vdev_state, src); 405 406 version = spa_version(spa); 407 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) { 408 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, 409 version, ZPROP_SRC_DEFAULT); 410 } else { 411 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, 412 version, ZPROP_SRC_LOCAL); 413 } 414 spa_prop_add_list(*nvp, ZPOOL_PROP_LOAD_GUID, 415 NULL, spa_load_guid(spa), src); 416 } 417 418 if (pool != NULL) { 419 /* 420 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, 421 * when opening pools before this version freedir will be NULL. 422 */ 423 if (pool->dp_free_dir != NULL) { 424 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, 425 dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes, 426 src); 427 } else { 428 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, 429 NULL, 0, src); 430 } 431 432 if (pool->dp_leak_dir != NULL) { 433 spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, 434 dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes, 435 src); 436 } else { 437 spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, 438 NULL, 0, src); 439 } 440 } 441 442 spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); 443 444 if (spa->spa_comment != NULL) { 445 spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, 446 0, ZPROP_SRC_LOCAL); 447 } 448 449 if (spa->spa_compatibility != NULL) { 450 spa_prop_add_list(*nvp, ZPOOL_PROP_COMPATIBILITY, 451 spa->spa_compatibility, 0, ZPROP_SRC_LOCAL); 452 } 453 454 if (spa->spa_root != NULL) 455 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 456 0, ZPROP_SRC_LOCAL); 457 458 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { 459 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, 460 MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE); 461 } else { 462 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, 463 SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); 464 } 465 466 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { 467 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, 468 DNODE_MAX_SIZE, ZPROP_SRC_NONE); 469 } else { 470 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, 471 DNODE_MIN_SIZE, ZPROP_SRC_NONE); 472 } 473 474 if ((dp = list_head(&spa->spa_config_list)) != NULL) { 475 if (dp->scd_path == NULL) { 476 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 477 "none", 0, ZPROP_SRC_LOCAL); 478 } else if (strcmp(dp->scd_path, spa_config_path) != 0) { 479 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 480 dp->scd_path, 0, ZPROP_SRC_LOCAL); 481 } 482 } 483 } 484 485 /* 486 * Get zpool property values. 487 */ 488 int 489 spa_prop_get(spa_t *spa, nvlist_t **nvp) 490 { 491 objset_t *mos = spa->spa_meta_objset; 492 zap_cursor_t zc; 493 zap_attribute_t za; 494 dsl_pool_t *dp; 495 int err; 496 497 err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP); 498 if (err) 499 return (err); 500 501 dp = spa_get_dsl(spa); 502 dsl_pool_config_enter(dp, FTAG); 503 mutex_enter(&spa->spa_props_lock); 504 505 /* 506 * Get properties from the spa config. 507 */ 508 spa_prop_get_config(spa, nvp); 509 510 /* If no pool property object, no more prop to get. */ 511 if (mos == NULL || spa->spa_pool_props_object == 0) 512 goto out; 513 514 /* 515 * Get properties from the MOS pool property object. 516 */ 517 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 518 (err = zap_cursor_retrieve(&zc, &za)) == 0; 519 zap_cursor_advance(&zc)) { 520 uint64_t intval = 0; 521 char *strval = NULL; 522 zprop_source_t src = ZPROP_SRC_DEFAULT; 523 zpool_prop_t prop; 524 525 if ((prop = zpool_name_to_prop(za.za_name)) == 526 ZPOOL_PROP_INVAL && !zfs_prop_user(za.za_name)) 527 continue; 528 529 switch (za.za_integer_length) { 530 case 8: 531 /* integer property */ 532 if (za.za_first_integer != 533 zpool_prop_default_numeric(prop)) 534 src = ZPROP_SRC_LOCAL; 535 536 if (prop == ZPOOL_PROP_BOOTFS) { 537 dsl_dataset_t *ds = NULL; 538 539 err = dsl_dataset_hold_obj(dp, 540 za.za_first_integer, FTAG, &ds); 541 if (err != 0) 542 break; 543 544 strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, 545 KM_SLEEP); 546 dsl_dataset_name(ds, strval); 547 dsl_dataset_rele(ds, FTAG); 548 } else { 549 strval = NULL; 550 intval = za.za_first_integer; 551 } 552 553 spa_prop_add_list(*nvp, prop, strval, intval, src); 554 555 if (strval != NULL) 556 kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN); 557 558 break; 559 560 case 1: 561 /* string property */ 562 strval = kmem_alloc(za.za_num_integers, KM_SLEEP); 563 err = zap_lookup(mos, spa->spa_pool_props_object, 564 za.za_name, 1, za.za_num_integers, strval); 565 if (err) { 566 kmem_free(strval, za.za_num_integers); 567 break; 568 } 569 if (prop != ZPOOL_PROP_INVAL) { 570 spa_prop_add_list(*nvp, prop, strval, 0, src); 571 } else { 572 src = ZPROP_SRC_LOCAL; 573 spa_prop_add_user(*nvp, za.za_name, strval, 574 src); 575 } 576 kmem_free(strval, za.za_num_integers); 577 break; 578 579 default: 580 break; 581 } 582 } 583 zap_cursor_fini(&zc); 584 out: 585 mutex_exit(&spa->spa_props_lock); 586 dsl_pool_config_exit(dp, FTAG); 587 if (err && err != ENOENT) { 588 nvlist_free(*nvp); 589 *nvp = NULL; 590 return (err); 591 } 592 593 return (0); 594 } 595 596 /* 597 * Validate the given pool properties nvlist and modify the list 598 * for the property values to be set. 599 */ 600 static int 601 spa_prop_validate(spa_t *spa, nvlist_t *props) 602 { 603 nvpair_t *elem; 604 int error = 0, reset_bootfs = 0; 605 uint64_t objnum = 0; 606 boolean_t has_feature = B_FALSE; 607 608 elem = NULL; 609 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 610 uint64_t intval; 611 const char *strval, *slash, *check, *fname; 612 const char *propname = nvpair_name(elem); 613 zpool_prop_t prop = zpool_name_to_prop(propname); 614 615 switch (prop) { 616 case ZPOOL_PROP_INVAL: 617 /* 618 * Sanitize the input. 619 */ 620 if (zfs_prop_user(propname)) { 621 if (strlen(propname) >= ZAP_MAXNAMELEN) { 622 error = SET_ERROR(ENAMETOOLONG); 623 break; 624 } 625 626 if (strlen(fnvpair_value_string(elem)) >= 627 ZAP_MAXVALUELEN) { 628 error = SET_ERROR(E2BIG); 629 break; 630 } 631 } else if (zpool_prop_feature(propname)) { 632 if (nvpair_type(elem) != DATA_TYPE_UINT64) { 633 error = SET_ERROR(EINVAL); 634 break; 635 } 636 637 if (nvpair_value_uint64(elem, &intval) != 0) { 638 error = SET_ERROR(EINVAL); 639 break; 640 } 641 642 if (intval != 0) { 643 error = SET_ERROR(EINVAL); 644 break; 645 } 646 647 fname = strchr(propname, '@') + 1; 648 if (zfeature_lookup_name(fname, NULL) != 0) { 649 error = SET_ERROR(EINVAL); 650 break; 651 } 652 653 has_feature = B_TRUE; 654 } else { 655 error = SET_ERROR(EINVAL); 656 break; 657 } 658 break; 659 660 case ZPOOL_PROP_VERSION: 661 error = nvpair_value_uint64(elem, &intval); 662 if (!error && 663 (intval < spa_version(spa) || 664 intval > SPA_VERSION_BEFORE_FEATURES || 665 has_feature)) 666 error = SET_ERROR(EINVAL); 667 break; 668 669 case ZPOOL_PROP_DELEGATION: 670 case ZPOOL_PROP_AUTOREPLACE: 671 case ZPOOL_PROP_LISTSNAPS: 672 case ZPOOL_PROP_AUTOEXPAND: 673 case ZPOOL_PROP_AUTOTRIM: 674 error = nvpair_value_uint64(elem, &intval); 675 if (!error && intval > 1) 676 error = SET_ERROR(EINVAL); 677 break; 678 679 case ZPOOL_PROP_MULTIHOST: 680 error = nvpair_value_uint64(elem, &intval); 681 if (!error && intval > 1) 682 error = SET_ERROR(EINVAL); 683 684 if (!error) { 685 uint32_t hostid = zone_get_hostid(NULL); 686 if (hostid) 687 spa->spa_hostid = hostid; 688 else 689 error = SET_ERROR(ENOTSUP); 690 } 691 692 break; 693 694 case ZPOOL_PROP_BOOTFS: 695 /* 696 * If the pool version is less than SPA_VERSION_BOOTFS, 697 * or the pool is still being created (version == 0), 698 * the bootfs property cannot be set. 699 */ 700 if (spa_version(spa) < SPA_VERSION_BOOTFS) { 701 error = SET_ERROR(ENOTSUP); 702 break; 703 } 704 705 /* 706 * Make sure the vdev config is bootable 707 */ 708 if (!vdev_is_bootable(spa->spa_root_vdev)) { 709 error = SET_ERROR(ENOTSUP); 710 break; 711 } 712 713 reset_bootfs = 1; 714 715 error = nvpair_value_string(elem, &strval); 716 717 if (!error) { 718 objset_t *os; 719 720 if (strval == NULL || strval[0] == '\0') { 721 objnum = zpool_prop_default_numeric( 722 ZPOOL_PROP_BOOTFS); 723 break; 724 } 725 726 error = dmu_objset_hold(strval, FTAG, &os); 727 if (error != 0) 728 break; 729 730 /* Must be ZPL. */ 731 if (dmu_objset_type(os) != DMU_OST_ZFS) { 732 error = SET_ERROR(ENOTSUP); 733 } else { 734 objnum = dmu_objset_id(os); 735 } 736 dmu_objset_rele(os, FTAG); 737 } 738 break; 739 740 case ZPOOL_PROP_FAILUREMODE: 741 error = nvpair_value_uint64(elem, &intval); 742 if (!error && intval > ZIO_FAILURE_MODE_PANIC) 743 error = SET_ERROR(EINVAL); 744 745 /* 746 * This is a special case which only occurs when 747 * the pool has completely failed. This allows 748 * the user to change the in-core failmode property 749 * without syncing it out to disk (I/Os might 750 * currently be blocked). We do this by returning 751 * EIO to the caller (spa_prop_set) to trick it 752 * into thinking we encountered a property validation 753 * error. 754 */ 755 if (!error && spa_suspended(spa)) { 756 spa->spa_failmode = intval; 757 error = SET_ERROR(EIO); 758 } 759 break; 760 761 case ZPOOL_PROP_CACHEFILE: 762 if ((error = nvpair_value_string(elem, &strval)) != 0) 763 break; 764 765 if (strval[0] == '\0') 766 break; 767 768 if (strcmp(strval, "none") == 0) 769 break; 770 771 if (strval[0] != '/') { 772 error = SET_ERROR(EINVAL); 773 break; 774 } 775 776 slash = strrchr(strval, '/'); 777 ASSERT(slash != NULL); 778 779 if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 780 strcmp(slash, "/..") == 0) 781 error = SET_ERROR(EINVAL); 782 break; 783 784 case ZPOOL_PROP_COMMENT: 785 if ((error = nvpair_value_string(elem, &strval)) != 0) 786 break; 787 for (check = strval; *check != '\0'; check++) { 788 if (!isprint(*check)) { 789 error = SET_ERROR(EINVAL); 790 break; 791 } 792 } 793 if (strlen(strval) > ZPROP_MAX_COMMENT) 794 error = SET_ERROR(E2BIG); 795 break; 796 797 default: 798 break; 799 } 800 801 if (error) 802 break; 803 } 804 805 (void) nvlist_remove_all(props, 806 zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO)); 807 808 if (!error && reset_bootfs) { 809 error = nvlist_remove(props, 810 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 811 812 if (!error) { 813 error = nvlist_add_uint64(props, 814 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 815 } 816 } 817 818 return (error); 819 } 820 821 void 822 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) 823 { 824 const char *cachefile; 825 spa_config_dirent_t *dp; 826 827 if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), 828 &cachefile) != 0) 829 return; 830 831 dp = kmem_alloc(sizeof (spa_config_dirent_t), 832 KM_SLEEP); 833 834 if (cachefile[0] == '\0') 835 dp->scd_path = spa_strdup(spa_config_path); 836 else if (strcmp(cachefile, "none") == 0) 837 dp->scd_path = NULL; 838 else 839 dp->scd_path = spa_strdup(cachefile); 840 841 list_insert_head(&spa->spa_config_list, dp); 842 if (need_sync) 843 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 844 } 845 846 int 847 spa_prop_set(spa_t *spa, nvlist_t *nvp) 848 { 849 int error; 850 nvpair_t *elem = NULL; 851 boolean_t need_sync = B_FALSE; 852 853 if ((error = spa_prop_validate(spa, nvp)) != 0) 854 return (error); 855 856 while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { 857 zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); 858 859 if (prop == ZPOOL_PROP_CACHEFILE || 860 prop == ZPOOL_PROP_ALTROOT || 861 prop == ZPOOL_PROP_READONLY) 862 continue; 863 864 if (prop == ZPOOL_PROP_INVAL && 865 zfs_prop_user(nvpair_name(elem))) { 866 need_sync = B_TRUE; 867 break; 868 } 869 870 if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) { 871 uint64_t ver = 0; 872 873 if (prop == ZPOOL_PROP_VERSION) { 874 VERIFY(nvpair_value_uint64(elem, &ver) == 0); 875 } else { 876 ASSERT(zpool_prop_feature(nvpair_name(elem))); 877 ver = SPA_VERSION_FEATURES; 878 need_sync = B_TRUE; 879 } 880 881 /* Save time if the version is already set. */ 882 if (ver == spa_version(spa)) 883 continue; 884 885 /* 886 * In addition to the pool directory object, we might 887 * create the pool properties object, the features for 888 * read object, the features for write object, or the 889 * feature descriptions object. 890 */ 891 error = dsl_sync_task(spa->spa_name, NULL, 892 spa_sync_version, &ver, 893 6, ZFS_SPACE_CHECK_RESERVED); 894 if (error) 895 return (error); 896 continue; 897 } 898 899 need_sync = B_TRUE; 900 break; 901 } 902 903 if (need_sync) { 904 return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, 905 nvp, 6, ZFS_SPACE_CHECK_RESERVED)); 906 } 907 908 return (0); 909 } 910 911 /* 912 * If the bootfs property value is dsobj, clear it. 913 */ 914 void 915 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 916 { 917 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 918 VERIFY(zap_remove(spa->spa_meta_objset, 919 spa->spa_pool_props_object, 920 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 921 spa->spa_bootfs = 0; 922 } 923 } 924 925 static int 926 spa_change_guid_check(void *arg, dmu_tx_t *tx) 927 { 928 uint64_t *newguid __maybe_unused = arg; 929 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 930 vdev_t *rvd = spa->spa_root_vdev; 931 uint64_t vdev_state; 932 933 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 934 int error = (spa_has_checkpoint(spa)) ? 935 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 936 return (SET_ERROR(error)); 937 } 938 939 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 940 vdev_state = rvd->vdev_state; 941 spa_config_exit(spa, SCL_STATE, FTAG); 942 943 if (vdev_state != VDEV_STATE_HEALTHY) 944 return (SET_ERROR(ENXIO)); 945 946 ASSERT3U(spa_guid(spa), !=, *newguid); 947 948 return (0); 949 } 950 951 static void 952 spa_change_guid_sync(void *arg, dmu_tx_t *tx) 953 { 954 uint64_t *newguid = arg; 955 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 956 uint64_t oldguid; 957 vdev_t *rvd = spa->spa_root_vdev; 958 959 oldguid = spa_guid(spa); 960 961 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 962 rvd->vdev_guid = *newguid; 963 rvd->vdev_guid_sum += (*newguid - oldguid); 964 vdev_config_dirty(rvd); 965 spa_config_exit(spa, SCL_STATE, FTAG); 966 967 spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", 968 (u_longlong_t)oldguid, (u_longlong_t)*newguid); 969 } 970 971 /* 972 * Change the GUID for the pool. This is done so that we can later 973 * re-import a pool built from a clone of our own vdevs. We will modify 974 * the root vdev's guid, our own pool guid, and then mark all of our 975 * vdevs dirty. Note that we must make sure that all our vdevs are 976 * online when we do this, or else any vdevs that weren't present 977 * would be orphaned from our pool. We are also going to issue a 978 * sysevent to update any watchers. 979 */ 980 int 981 spa_change_guid(spa_t *spa) 982 { 983 int error; 984 uint64_t guid; 985 986 mutex_enter(&spa->spa_vdev_top_lock); 987 mutex_enter(&spa_namespace_lock); 988 guid = spa_generate_guid(NULL); 989 990 error = dsl_sync_task(spa->spa_name, spa_change_guid_check, 991 spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); 992 993 if (error == 0) { 994 /* 995 * Clear the kobj flag from all the vdevs to allow 996 * vdev_cache_process_kobj_evt() to post events to all the 997 * vdevs since GUID is updated. 998 */ 999 vdev_clear_kobj_evt(spa->spa_root_vdev); 1000 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 1001 vdev_clear_kobj_evt(spa->spa_l2cache.sav_vdevs[i]); 1002 1003 spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); 1004 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); 1005 } 1006 1007 mutex_exit(&spa_namespace_lock); 1008 mutex_exit(&spa->spa_vdev_top_lock); 1009 1010 return (error); 1011 } 1012 1013 /* 1014 * ========================================================================== 1015 * SPA state manipulation (open/create/destroy/import/export) 1016 * ========================================================================== 1017 */ 1018 1019 static int 1020 spa_error_entry_compare(const void *a, const void *b) 1021 { 1022 const spa_error_entry_t *sa = (const spa_error_entry_t *)a; 1023 const spa_error_entry_t *sb = (const spa_error_entry_t *)b; 1024 int ret; 1025 1026 ret = memcmp(&sa->se_bookmark, &sb->se_bookmark, 1027 sizeof (zbookmark_phys_t)); 1028 1029 return (TREE_ISIGN(ret)); 1030 } 1031 1032 /* 1033 * Utility function which retrieves copies of the current logs and 1034 * re-initializes them in the process. 1035 */ 1036 void 1037 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 1038 { 1039 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 1040 1041 memcpy(last, &spa->spa_errlist_last, sizeof (avl_tree_t)); 1042 memcpy(scrub, &spa->spa_errlist_scrub, sizeof (avl_tree_t)); 1043 1044 avl_create(&spa->spa_errlist_scrub, 1045 spa_error_entry_compare, sizeof (spa_error_entry_t), 1046 offsetof(spa_error_entry_t, se_avl)); 1047 avl_create(&spa->spa_errlist_last, 1048 spa_error_entry_compare, sizeof (spa_error_entry_t), 1049 offsetof(spa_error_entry_t, se_avl)); 1050 } 1051 1052 static void 1053 spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) 1054 { 1055 const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; 1056 enum zti_modes mode = ztip->zti_mode; 1057 uint_t value = ztip->zti_value; 1058 uint_t count = ztip->zti_count; 1059 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1060 uint_t cpus, flags = TASKQ_DYNAMIC; 1061 1062 switch (mode) { 1063 case ZTI_MODE_FIXED: 1064 ASSERT3U(value, >, 0); 1065 break; 1066 1067 case ZTI_MODE_SYNC: 1068 1069 /* 1070 * Create one wr_iss taskq for every 'zio_taskq_write_tpq' CPUs, 1071 * not to exceed the number of spa allocators, and align to it. 1072 */ 1073 cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100); 1074 count = MAX(1, cpus / MAX(1, zio_taskq_write_tpq)); 1075 count = MAX(count, (zio_taskq_batch_pct + 99) / 100); 1076 count = MIN(count, spa->spa_alloc_count); 1077 while (spa->spa_alloc_count % count != 0 && 1078 spa->spa_alloc_count < count * 2) 1079 count--; 1080 1081 /* 1082 * zio_taskq_batch_pct is unbounded and may exceed 100%, but no 1083 * single taskq may have more threads than 100% of online cpus. 1084 */ 1085 value = (zio_taskq_batch_pct + count / 2) / count; 1086 value = MIN(value, 100); 1087 flags |= TASKQ_THREADS_CPU_PCT; 1088 break; 1089 1090 case ZTI_MODE_SCALE: 1091 flags |= TASKQ_THREADS_CPU_PCT; 1092 /* 1093 * We want more taskqs to reduce lock contention, but we want 1094 * less for better request ordering and CPU utilization. 1095 */ 1096 cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100); 1097 if (zio_taskq_batch_tpq > 0) { 1098 count = MAX(1, (cpus + zio_taskq_batch_tpq / 2) / 1099 zio_taskq_batch_tpq); 1100 } else { 1101 /* 1102 * Prefer 6 threads per taskq, but no more taskqs 1103 * than threads in them on large systems. For 80%: 1104 * 1105 * taskq taskq total 1106 * cpus taskqs percent threads threads 1107 * ------- ------- ------- ------- ------- 1108 * 1 1 80% 1 1 1109 * 2 1 80% 1 1 1110 * 4 1 80% 3 3 1111 * 8 2 40% 3 6 1112 * 16 3 27% 4 12 1113 * 32 5 16% 5 25 1114 * 64 7 11% 7 49 1115 * 128 10 8% 10 100 1116 * 256 14 6% 15 210 1117 */ 1118 count = 1 + cpus / 6; 1119 while (count * count > cpus) 1120 count--; 1121 } 1122 /* Limit each taskq within 100% to not trigger assertion. */ 1123 count = MAX(count, (zio_taskq_batch_pct + 99) / 100); 1124 value = (zio_taskq_batch_pct + count / 2) / count; 1125 break; 1126 1127 case ZTI_MODE_NULL: 1128 tqs->stqs_count = 0; 1129 tqs->stqs_taskq = NULL; 1130 return; 1131 1132 default: 1133 panic("unrecognized mode for %s_%s taskq (%u:%u) in " 1134 "spa_taskqs_init()", 1135 zio_type_name[t], zio_taskq_types[q], mode, value); 1136 break; 1137 } 1138 1139 ASSERT3U(count, >, 0); 1140 tqs->stqs_count = count; 1141 tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); 1142 1143 for (uint_t i = 0; i < count; i++) { 1144 taskq_t *tq; 1145 char name[32]; 1146 1147 if (count > 1) 1148 (void) snprintf(name, sizeof (name), "%s_%s_%u", 1149 zio_type_name[t], zio_taskq_types[q], i); 1150 else 1151 (void) snprintf(name, sizeof (name), "%s_%s", 1152 zio_type_name[t], zio_taskq_types[q]); 1153 1154 #ifdef HAVE_SYSDC 1155 if (zio_taskq_sysdc && spa->spa_proc != &p0) { 1156 (void) zio_taskq_basedc; 1157 tq = taskq_create_sysdc(name, value, 50, INT_MAX, 1158 spa->spa_proc, zio_taskq_basedc, flags); 1159 } else { 1160 #endif 1161 pri_t pri = maxclsyspri; 1162 /* 1163 * The write issue taskq can be extremely CPU 1164 * intensive. Run it at slightly less important 1165 * priority than the other taskqs. 1166 * 1167 * Under Linux and FreeBSD this means incrementing 1168 * the priority value as opposed to platforms like 1169 * illumos where it should be decremented. 1170 * 1171 * On FreeBSD, if priorities divided by four (RQ_PPQ) 1172 * are equal then a difference between them is 1173 * insignificant. 1174 */ 1175 if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) { 1176 #if defined(__linux__) 1177 pri++; 1178 #elif defined(__FreeBSD__) 1179 pri += 4; 1180 #else 1181 #error "unknown OS" 1182 #endif 1183 } 1184 tq = taskq_create_proc(name, value, pri, 50, 1185 INT_MAX, spa->spa_proc, flags); 1186 #ifdef HAVE_SYSDC 1187 } 1188 #endif 1189 1190 tqs->stqs_taskq[i] = tq; 1191 } 1192 } 1193 1194 static void 1195 spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) 1196 { 1197 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1198 1199 if (tqs->stqs_taskq == NULL) { 1200 ASSERT3U(tqs->stqs_count, ==, 0); 1201 return; 1202 } 1203 1204 for (uint_t i = 0; i < tqs->stqs_count; i++) { 1205 ASSERT3P(tqs->stqs_taskq[i], !=, NULL); 1206 taskq_destroy(tqs->stqs_taskq[i]); 1207 } 1208 1209 kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *)); 1210 tqs->stqs_taskq = NULL; 1211 } 1212 1213 #ifdef _KERNEL 1214 /* 1215 * The READ and WRITE rows of zio_taskqs are configurable at module load time 1216 * by setting zio_taskq_read or zio_taskq_write. 1217 * 1218 * Example (the defaults for READ and WRITE) 1219 * zio_taskq_read='fixed,1,8 null scale null' 1220 * zio_taskq_write='sync fixed,1,5 scale fixed,1,5' 1221 * 1222 * Each sets the entire row at a time. 1223 * 1224 * 'fixed' is parameterised: fixed,Q,T where Q is number of taskqs, T is number 1225 * of threads per taskq. 1226 * 1227 * 'null' can only be set on the high-priority queues (queue selection for 1228 * high-priority queues will fall back to the regular queue if the high-pri 1229 * is NULL. 1230 */ 1231 static const char *const modes[ZTI_NMODES] = { 1232 "fixed", "scale", "sync", "null" 1233 }; 1234 1235 /* Parse the incoming config string. Modifies cfg */ 1236 static int 1237 spa_taskq_param_set(zio_type_t t, char *cfg) 1238 { 1239 int err = 0; 1240 1241 zio_taskq_info_t row[ZIO_TASKQ_TYPES] = {{0}}; 1242 1243 char *next = cfg, *tok, *c; 1244 1245 /* 1246 * Parse out each element from the string and fill `row`. The entire 1247 * row has to be set at once, so any errors are flagged by just 1248 * breaking out of this loop early. 1249 */ 1250 uint_t q; 1251 for (q = 0; q < ZIO_TASKQ_TYPES; q++) { 1252 /* `next` is the start of the config */ 1253 if (next == NULL) 1254 break; 1255 1256 /* Eat up leading space */ 1257 while (isspace(*next)) 1258 next++; 1259 if (*next == '\0') 1260 break; 1261 1262 /* Mode ends at space or end of string */ 1263 tok = next; 1264 next = strchr(tok, ' '); 1265 if (next != NULL) *next++ = '\0'; 1266 1267 /* Parameters start after a comma */ 1268 c = strchr(tok, ','); 1269 if (c != NULL) *c++ = '\0'; 1270 1271 /* Match mode string */ 1272 uint_t mode; 1273 for (mode = 0; mode < ZTI_NMODES; mode++) 1274 if (strcmp(tok, modes[mode]) == 0) 1275 break; 1276 if (mode == ZTI_NMODES) 1277 break; 1278 1279 /* Invalid canary */ 1280 row[q].zti_mode = ZTI_NMODES; 1281 1282 /* Per-mode setup */ 1283 switch (mode) { 1284 1285 /* 1286 * FIXED is parameterised: number of queues, and number of 1287 * threads per queue. 1288 */ 1289 case ZTI_MODE_FIXED: { 1290 /* No parameters? */ 1291 if (c == NULL || *c == '\0') 1292 break; 1293 1294 /* Find next parameter */ 1295 tok = c; 1296 c = strchr(tok, ','); 1297 if (c == NULL) 1298 break; 1299 1300 /* Take digits and convert */ 1301 unsigned long long nq; 1302 if (!(isdigit(*tok))) 1303 break; 1304 err = ddi_strtoull(tok, &tok, 10, &nq); 1305 /* Must succeed and also end at the next param sep */ 1306 if (err != 0 || tok != c) 1307 break; 1308 1309 /* Move past the comma */ 1310 tok++; 1311 /* Need another number */ 1312 if (!(isdigit(*tok))) 1313 break; 1314 /* Remember start to make sure we moved */ 1315 c = tok; 1316 1317 /* Take digits */ 1318 unsigned long long ntpq; 1319 err = ddi_strtoull(tok, &tok, 10, &ntpq); 1320 /* Must succeed, and moved forward */ 1321 if (err != 0 || tok == c || *tok != '\0') 1322 break; 1323 1324 /* 1325 * sanity; zero queues/threads make no sense, and 1326 * 16K is almost certainly more than anyone will ever 1327 * need and avoids silly numbers like UINT32_MAX 1328 */ 1329 if (nq == 0 || nq >= 16384 || 1330 ntpq == 0 || ntpq >= 16384) 1331 break; 1332 1333 const zio_taskq_info_t zti = ZTI_P(ntpq, nq); 1334 row[q] = zti; 1335 break; 1336 } 1337 1338 case ZTI_MODE_SCALE: { 1339 const zio_taskq_info_t zti = ZTI_SCALE; 1340 row[q] = zti; 1341 break; 1342 } 1343 1344 case ZTI_MODE_SYNC: { 1345 const zio_taskq_info_t zti = ZTI_SYNC; 1346 row[q] = zti; 1347 break; 1348 } 1349 1350 case ZTI_MODE_NULL: { 1351 /* 1352 * Can only null the high-priority queues; the general- 1353 * purpose ones have to exist. 1354 */ 1355 if (q != ZIO_TASKQ_ISSUE_HIGH && 1356 q != ZIO_TASKQ_INTERRUPT_HIGH) 1357 break; 1358 1359 const zio_taskq_info_t zti = ZTI_NULL; 1360 row[q] = zti; 1361 break; 1362 } 1363 1364 default: 1365 break; 1366 } 1367 1368 /* Ensure we set a mode */ 1369 if (row[q].zti_mode == ZTI_NMODES) 1370 break; 1371 } 1372 1373 /* Didn't get a full row, fail */ 1374 if (q < ZIO_TASKQ_TYPES) 1375 return (SET_ERROR(EINVAL)); 1376 1377 /* Eat trailing space */ 1378 if (next != NULL) 1379 while (isspace(*next)) 1380 next++; 1381 1382 /* If there's anything left over then fail */ 1383 if (next != NULL && *next != '\0') 1384 return (SET_ERROR(EINVAL)); 1385 1386 /* Success! Copy it into the real config */ 1387 for (q = 0; q < ZIO_TASKQ_TYPES; q++) 1388 zio_taskqs[t][q] = row[q]; 1389 1390 return (0); 1391 } 1392 1393 static int 1394 spa_taskq_param_get(zio_type_t t, char *buf, boolean_t add_newline) 1395 { 1396 int pos = 0; 1397 1398 /* Build paramater string from live config */ 1399 const char *sep = ""; 1400 for (uint_t q = 0; q < ZIO_TASKQ_TYPES; q++) { 1401 const zio_taskq_info_t *zti = &zio_taskqs[t][q]; 1402 if (zti->zti_mode == ZTI_MODE_FIXED) 1403 pos += sprintf(&buf[pos], "%s%s,%u,%u", sep, 1404 modes[zti->zti_mode], zti->zti_count, 1405 zti->zti_value); 1406 else 1407 pos += sprintf(&buf[pos], "%s%s", sep, 1408 modes[zti->zti_mode]); 1409 sep = " "; 1410 } 1411 1412 if (add_newline) 1413 buf[pos++] = '\n'; 1414 buf[pos] = '\0'; 1415 1416 return (pos); 1417 } 1418 1419 #ifdef __linux__ 1420 static int 1421 spa_taskq_read_param_set(const char *val, zfs_kernel_param_t *kp) 1422 { 1423 char *cfg = kmem_strdup(val); 1424 int err = spa_taskq_param_set(ZIO_TYPE_READ, cfg); 1425 kmem_free(cfg, strlen(val)+1); 1426 return (-err); 1427 } 1428 static int 1429 spa_taskq_read_param_get(char *buf, zfs_kernel_param_t *kp) 1430 { 1431 return (spa_taskq_param_get(ZIO_TYPE_READ, buf, TRUE)); 1432 } 1433 1434 static int 1435 spa_taskq_write_param_set(const char *val, zfs_kernel_param_t *kp) 1436 { 1437 char *cfg = kmem_strdup(val); 1438 int err = spa_taskq_param_set(ZIO_TYPE_WRITE, cfg); 1439 kmem_free(cfg, strlen(val)+1); 1440 return (-err); 1441 } 1442 static int 1443 spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp) 1444 { 1445 return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf, TRUE)); 1446 } 1447 #else 1448 /* 1449 * On FreeBSD load-time parameters can be set up before malloc() is available, 1450 * so we have to do all the parsing work on the stack. 1451 */ 1452 #define SPA_TASKQ_PARAM_MAX (128) 1453 1454 static int 1455 spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS) 1456 { 1457 char buf[SPA_TASKQ_PARAM_MAX]; 1458 int err; 1459 1460 (void) spa_taskq_param_get(ZIO_TYPE_READ, buf, FALSE); 1461 err = sysctl_handle_string(oidp, buf, sizeof (buf), req); 1462 if (err || req->newptr == NULL) 1463 return (err); 1464 return (spa_taskq_param_set(ZIO_TYPE_READ, buf)); 1465 } 1466 1467 static int 1468 spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS) 1469 { 1470 char buf[SPA_TASKQ_PARAM_MAX]; 1471 int err; 1472 1473 (void) spa_taskq_param_get(ZIO_TYPE_WRITE, buf, FALSE); 1474 err = sysctl_handle_string(oidp, buf, sizeof (buf), req); 1475 if (err || req->newptr == NULL) 1476 return (err); 1477 return (spa_taskq_param_set(ZIO_TYPE_WRITE, buf)); 1478 } 1479 #endif 1480 #endif /* _KERNEL */ 1481 1482 /* 1483 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. 1484 * Note that a type may have multiple discrete taskqs to avoid lock contention 1485 * on the taskq itself. 1486 */ 1487 static taskq_t * 1488 spa_taskq_dispatch_select(spa_t *spa, zio_type_t t, zio_taskq_type_t q, 1489 zio_t *zio) 1490 { 1491 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1492 taskq_t *tq; 1493 1494 ASSERT3P(tqs->stqs_taskq, !=, NULL); 1495 ASSERT3U(tqs->stqs_count, !=, 0); 1496 1497 if (tqs->stqs_count == 1) { 1498 tq = tqs->stqs_taskq[0]; 1499 } else if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) && 1500 (zio != NULL) && ZIO_HAS_ALLOCATOR(zio)) { 1501 tq = tqs->stqs_taskq[zio->io_allocator % tqs->stqs_count]; 1502 } else { 1503 tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; 1504 } 1505 return (tq); 1506 } 1507 1508 void 1509 spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, 1510 task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent, 1511 zio_t *zio) 1512 { 1513 taskq_t *tq = spa_taskq_dispatch_select(spa, t, q, zio); 1514 taskq_dispatch_ent(tq, func, arg, flags, ent); 1515 } 1516 1517 /* 1518 * Same as spa_taskq_dispatch_ent() but block on the task until completion. 1519 */ 1520 void 1521 spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q, 1522 task_func_t *func, void *arg, uint_t flags) 1523 { 1524 taskq_t *tq = spa_taskq_dispatch_select(spa, t, q, NULL); 1525 taskqid_t id = taskq_dispatch(tq, func, arg, flags); 1526 if (id) 1527 taskq_wait_id(tq, id); 1528 } 1529 1530 static void 1531 spa_create_zio_taskqs(spa_t *spa) 1532 { 1533 for (int t = 0; t < ZIO_TYPES; t++) { 1534 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 1535 spa_taskqs_init(spa, t, q); 1536 } 1537 } 1538 } 1539 1540 #if defined(_KERNEL) && defined(HAVE_SPA_THREAD) 1541 static void 1542 spa_thread(void *arg) 1543 { 1544 psetid_t zio_taskq_psrset_bind = PS_NONE; 1545 callb_cpr_t cprinfo; 1546 1547 spa_t *spa = arg; 1548 user_t *pu = PTOU(curproc); 1549 1550 CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, 1551 spa->spa_name); 1552 1553 ASSERT(curproc != &p0); 1554 (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), 1555 "zpool-%s", spa->spa_name); 1556 (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); 1557 1558 /* bind this thread to the requested psrset */ 1559 if (zio_taskq_psrset_bind != PS_NONE) { 1560 pool_lock(); 1561 mutex_enter(&cpu_lock); 1562 mutex_enter(&pidlock); 1563 mutex_enter(&curproc->p_lock); 1564 1565 if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, 1566 0, NULL, NULL) == 0) { 1567 curthread->t_bind_pset = zio_taskq_psrset_bind; 1568 } else { 1569 cmn_err(CE_WARN, 1570 "Couldn't bind process for zfs pool \"%s\" to " 1571 "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); 1572 } 1573 1574 mutex_exit(&curproc->p_lock); 1575 mutex_exit(&pidlock); 1576 mutex_exit(&cpu_lock); 1577 pool_unlock(); 1578 } 1579 1580 #ifdef HAVE_SYSDC 1581 if (zio_taskq_sysdc) { 1582 sysdc_thread_enter(curthread, 100, 0); 1583 } 1584 #endif 1585 1586 spa->spa_proc = curproc; 1587 spa->spa_did = curthread->t_did; 1588 1589 spa_create_zio_taskqs(spa); 1590 1591 mutex_enter(&spa->spa_proc_lock); 1592 ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); 1593 1594 spa->spa_proc_state = SPA_PROC_ACTIVE; 1595 cv_broadcast(&spa->spa_proc_cv); 1596 1597 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1598 while (spa->spa_proc_state == SPA_PROC_ACTIVE) 1599 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1600 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); 1601 1602 ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); 1603 spa->spa_proc_state = SPA_PROC_GONE; 1604 spa->spa_proc = &p0; 1605 cv_broadcast(&spa->spa_proc_cv); 1606 CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ 1607 1608 mutex_enter(&curproc->p_lock); 1609 lwp_exit(); 1610 } 1611 #endif 1612 1613 extern metaslab_ops_t *metaslab_allocator(spa_t *spa); 1614 1615 /* 1616 * Activate an uninitialized pool. 1617 */ 1618 static void 1619 spa_activate(spa_t *spa, spa_mode_t mode) 1620 { 1621 metaslab_ops_t *msp = metaslab_allocator(spa); 1622 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 1623 1624 spa->spa_state = POOL_STATE_ACTIVE; 1625 spa->spa_mode = mode; 1626 spa->spa_read_spacemaps = spa_mode_readable_spacemaps; 1627 1628 spa->spa_normal_class = metaslab_class_create(spa, msp); 1629 spa->spa_log_class = metaslab_class_create(spa, msp); 1630 spa->spa_embedded_log_class = metaslab_class_create(spa, msp); 1631 spa->spa_special_class = metaslab_class_create(spa, msp); 1632 spa->spa_dedup_class = metaslab_class_create(spa, msp); 1633 1634 /* Try to create a covering process */ 1635 mutex_enter(&spa->spa_proc_lock); 1636 ASSERT(spa->spa_proc_state == SPA_PROC_NONE); 1637 ASSERT(spa->spa_proc == &p0); 1638 spa->spa_did = 0; 1639 1640 #ifdef HAVE_SPA_THREAD 1641 /* Only create a process if we're going to be around a while. */ 1642 if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { 1643 if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, 1644 NULL, 0) == 0) { 1645 spa->spa_proc_state = SPA_PROC_CREATED; 1646 while (spa->spa_proc_state == SPA_PROC_CREATED) { 1647 cv_wait(&spa->spa_proc_cv, 1648 &spa->spa_proc_lock); 1649 } 1650 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1651 ASSERT(spa->spa_proc != &p0); 1652 ASSERT(spa->spa_did != 0); 1653 } else { 1654 #ifdef _KERNEL 1655 cmn_err(CE_WARN, 1656 "Couldn't create process for zfs pool \"%s\"\n", 1657 spa->spa_name); 1658 #endif 1659 } 1660 } 1661 #endif /* HAVE_SPA_THREAD */ 1662 mutex_exit(&spa->spa_proc_lock); 1663 1664 /* If we didn't create a process, we need to create our taskqs. */ 1665 if (spa->spa_proc == &p0) { 1666 spa_create_zio_taskqs(spa); 1667 } 1668 1669 for (size_t i = 0; i < TXG_SIZE; i++) { 1670 spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, 1671 ZIO_FLAG_CANFAIL); 1672 } 1673 1674 list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), 1675 offsetof(vdev_t, vdev_config_dirty_node)); 1676 list_create(&spa->spa_evicting_os_list, sizeof (objset_t), 1677 offsetof(objset_t, os_evicting_node)); 1678 list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), 1679 offsetof(vdev_t, vdev_state_dirty_node)); 1680 1681 txg_list_create(&spa->spa_vdev_txg_list, spa, 1682 offsetof(struct vdev, vdev_txg_node)); 1683 1684 avl_create(&spa->spa_errlist_scrub, 1685 spa_error_entry_compare, sizeof (spa_error_entry_t), 1686 offsetof(spa_error_entry_t, se_avl)); 1687 avl_create(&spa->spa_errlist_last, 1688 spa_error_entry_compare, sizeof (spa_error_entry_t), 1689 offsetof(spa_error_entry_t, se_avl)); 1690 avl_create(&spa->spa_errlist_healed, 1691 spa_error_entry_compare, sizeof (spa_error_entry_t), 1692 offsetof(spa_error_entry_t, se_avl)); 1693 1694 spa_activate_os(spa); 1695 1696 spa_keystore_init(&spa->spa_keystore); 1697 1698 /* 1699 * This taskq is used to perform zvol-minor-related tasks 1700 * asynchronously. This has several advantages, including easy 1701 * resolution of various deadlocks. 1702 * 1703 * The taskq must be single threaded to ensure tasks are always 1704 * processed in the order in which they were dispatched. 1705 * 1706 * A taskq per pool allows one to keep the pools independent. 1707 * This way if one pool is suspended, it will not impact another. 1708 * 1709 * The preferred location to dispatch a zvol minor task is a sync 1710 * task. In this context, there is easy access to the spa_t and minimal 1711 * error handling is required because the sync task must succeed. 1712 */ 1713 spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri, 1714 1, INT_MAX, 0); 1715 1716 /* 1717 * The taskq to preload metaslabs. 1718 */ 1719 spa->spa_metaslab_taskq = taskq_create("z_metaslab", 1720 metaslab_preload_pct, maxclsyspri, 1, INT_MAX, 1721 TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); 1722 1723 /* 1724 * Taskq dedicated to prefetcher threads: this is used to prevent the 1725 * pool traverse code from monopolizing the global (and limited) 1726 * system_taskq by inappropriately scheduling long running tasks on it. 1727 */ 1728 spa->spa_prefetch_taskq = taskq_create("z_prefetch", 100, 1729 defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); 1730 1731 /* 1732 * The taskq to upgrade datasets in this pool. Currently used by 1733 * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA. 1734 */ 1735 spa->spa_upgrade_taskq = taskq_create("z_upgrade", 100, 1736 defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); 1737 } 1738 1739 /* 1740 * Opposite of spa_activate(). 1741 */ 1742 static void 1743 spa_deactivate(spa_t *spa) 1744 { 1745 ASSERT(spa->spa_sync_on == B_FALSE); 1746 ASSERT(spa->spa_dsl_pool == NULL); 1747 ASSERT(spa->spa_root_vdev == NULL); 1748 ASSERT(spa->spa_async_zio_root == NULL); 1749 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 1750 1751 spa_evicting_os_wait(spa); 1752 1753 if (spa->spa_zvol_taskq) { 1754 taskq_destroy(spa->spa_zvol_taskq); 1755 spa->spa_zvol_taskq = NULL; 1756 } 1757 1758 if (spa->spa_metaslab_taskq) { 1759 taskq_destroy(spa->spa_metaslab_taskq); 1760 spa->spa_metaslab_taskq = NULL; 1761 } 1762 1763 if (spa->spa_prefetch_taskq) { 1764 taskq_destroy(spa->spa_prefetch_taskq); 1765 spa->spa_prefetch_taskq = NULL; 1766 } 1767 1768 if (spa->spa_upgrade_taskq) { 1769 taskq_destroy(spa->spa_upgrade_taskq); 1770 spa->spa_upgrade_taskq = NULL; 1771 } 1772 1773 txg_list_destroy(&spa->spa_vdev_txg_list); 1774 1775 list_destroy(&spa->spa_config_dirty_list); 1776 list_destroy(&spa->spa_evicting_os_list); 1777 list_destroy(&spa->spa_state_dirty_list); 1778 1779 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); 1780 1781 for (int t = 0; t < ZIO_TYPES; t++) { 1782 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 1783 spa_taskqs_fini(spa, t, q); 1784 } 1785 } 1786 1787 for (size_t i = 0; i < TXG_SIZE; i++) { 1788 ASSERT3P(spa->spa_txg_zio[i], !=, NULL); 1789 VERIFY0(zio_wait(spa->spa_txg_zio[i])); 1790 spa->spa_txg_zio[i] = NULL; 1791 } 1792 1793 metaslab_class_destroy(spa->spa_normal_class); 1794 spa->spa_normal_class = NULL; 1795 1796 metaslab_class_destroy(spa->spa_log_class); 1797 spa->spa_log_class = NULL; 1798 1799 metaslab_class_destroy(spa->spa_embedded_log_class); 1800 spa->spa_embedded_log_class = NULL; 1801 1802 metaslab_class_destroy(spa->spa_special_class); 1803 spa->spa_special_class = NULL; 1804 1805 metaslab_class_destroy(spa->spa_dedup_class); 1806 spa->spa_dedup_class = NULL; 1807 1808 /* 1809 * If this was part of an import or the open otherwise failed, we may 1810 * still have errors left in the queues. Empty them just in case. 1811 */ 1812 spa_errlog_drain(spa); 1813 avl_destroy(&spa->spa_errlist_scrub); 1814 avl_destroy(&spa->spa_errlist_last); 1815 avl_destroy(&spa->spa_errlist_healed); 1816 1817 spa_keystore_fini(&spa->spa_keystore); 1818 1819 spa->spa_state = POOL_STATE_UNINITIALIZED; 1820 1821 mutex_enter(&spa->spa_proc_lock); 1822 if (spa->spa_proc_state != SPA_PROC_NONE) { 1823 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1824 spa->spa_proc_state = SPA_PROC_DEACTIVATE; 1825 cv_broadcast(&spa->spa_proc_cv); 1826 while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { 1827 ASSERT(spa->spa_proc != &p0); 1828 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1829 } 1830 ASSERT(spa->spa_proc_state == SPA_PROC_GONE); 1831 spa->spa_proc_state = SPA_PROC_NONE; 1832 } 1833 ASSERT(spa->spa_proc == &p0); 1834 mutex_exit(&spa->spa_proc_lock); 1835 1836 /* 1837 * We want to make sure spa_thread() has actually exited the ZFS 1838 * module, so that the module can't be unloaded out from underneath 1839 * it. 1840 */ 1841 if (spa->spa_did != 0) { 1842 thread_join(spa->spa_did); 1843 spa->spa_did = 0; 1844 } 1845 1846 spa_deactivate_os(spa); 1847 1848 } 1849 1850 /* 1851 * Verify a pool configuration, and construct the vdev tree appropriately. This 1852 * will create all the necessary vdevs in the appropriate layout, with each vdev 1853 * in the CLOSED state. This will prep the pool before open/creation/import. 1854 * All vdev validation is done by the vdev_alloc() routine. 1855 */ 1856 int 1857 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 1858 uint_t id, int atype) 1859 { 1860 nvlist_t **child; 1861 uint_t children; 1862 int error; 1863 1864 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 1865 return (error); 1866 1867 if ((*vdp)->vdev_ops->vdev_op_leaf) 1868 return (0); 1869 1870 error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 1871 &child, &children); 1872 1873 if (error == ENOENT) 1874 return (0); 1875 1876 if (error) { 1877 vdev_free(*vdp); 1878 *vdp = NULL; 1879 return (SET_ERROR(EINVAL)); 1880 } 1881 1882 for (int c = 0; c < children; c++) { 1883 vdev_t *vd; 1884 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 1885 atype)) != 0) { 1886 vdev_free(*vdp); 1887 *vdp = NULL; 1888 return (error); 1889 } 1890 } 1891 1892 ASSERT(*vdp != NULL); 1893 1894 return (0); 1895 } 1896 1897 static boolean_t 1898 spa_should_flush_logs_on_unload(spa_t *spa) 1899 { 1900 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) 1901 return (B_FALSE); 1902 1903 if (!spa_writeable(spa)) 1904 return (B_FALSE); 1905 1906 if (!spa->spa_sync_on) 1907 return (B_FALSE); 1908 1909 if (spa_state(spa) != POOL_STATE_EXPORTED) 1910 return (B_FALSE); 1911 1912 if (zfs_keep_log_spacemaps_at_export) 1913 return (B_FALSE); 1914 1915 return (B_TRUE); 1916 } 1917 1918 /* 1919 * Opens a transaction that will set the flag that will instruct 1920 * spa_sync to attempt to flush all the metaslabs for that txg. 1921 */ 1922 static void 1923 spa_unload_log_sm_flush_all(spa_t *spa) 1924 { 1925 dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 1926 VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); 1927 1928 ASSERT3U(spa->spa_log_flushall_txg, ==, 0); 1929 spa->spa_log_flushall_txg = dmu_tx_get_txg(tx); 1930 1931 dmu_tx_commit(tx); 1932 txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg); 1933 } 1934 1935 static void 1936 spa_unload_log_sm_metadata(spa_t *spa) 1937 { 1938 void *cookie = NULL; 1939 spa_log_sm_t *sls; 1940 log_summary_entry_t *e; 1941 1942 while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg, 1943 &cookie)) != NULL) { 1944 VERIFY0(sls->sls_mscount); 1945 kmem_free(sls, sizeof (spa_log_sm_t)); 1946 } 1947 1948 while ((e = list_remove_head(&spa->spa_log_summary)) != NULL) { 1949 VERIFY0(e->lse_mscount); 1950 kmem_free(e, sizeof (log_summary_entry_t)); 1951 } 1952 1953 spa->spa_unflushed_stats.sus_nblocks = 0; 1954 spa->spa_unflushed_stats.sus_memused = 0; 1955 spa->spa_unflushed_stats.sus_blocklimit = 0; 1956 } 1957 1958 static void 1959 spa_destroy_aux_threads(spa_t *spa) 1960 { 1961 if (spa->spa_condense_zthr != NULL) { 1962 zthr_destroy(spa->spa_condense_zthr); 1963 spa->spa_condense_zthr = NULL; 1964 } 1965 if (spa->spa_checkpoint_discard_zthr != NULL) { 1966 zthr_destroy(spa->spa_checkpoint_discard_zthr); 1967 spa->spa_checkpoint_discard_zthr = NULL; 1968 } 1969 if (spa->spa_livelist_delete_zthr != NULL) { 1970 zthr_destroy(spa->spa_livelist_delete_zthr); 1971 spa->spa_livelist_delete_zthr = NULL; 1972 } 1973 if (spa->spa_livelist_condense_zthr != NULL) { 1974 zthr_destroy(spa->spa_livelist_condense_zthr); 1975 spa->spa_livelist_condense_zthr = NULL; 1976 } 1977 if (spa->spa_raidz_expand_zthr != NULL) { 1978 zthr_destroy(spa->spa_raidz_expand_zthr); 1979 spa->spa_raidz_expand_zthr = NULL; 1980 } 1981 } 1982 1983 /* 1984 * Opposite of spa_load(). 1985 */ 1986 static void 1987 spa_unload(spa_t *spa) 1988 { 1989 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1990 ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED); 1991 1992 spa_import_progress_remove(spa_guid(spa)); 1993 spa_load_note(spa, "UNLOADING"); 1994 1995 spa_wake_waiters(spa); 1996 1997 /* 1998 * If we have set the spa_final_txg, we have already performed the 1999 * tasks below in spa_export_common(). We should not redo it here since 2000 * we delay the final TXGs beyond what spa_final_txg is set at. 2001 */ 2002 if (spa->spa_final_txg == UINT64_MAX) { 2003 /* 2004 * If the log space map feature is enabled and the pool is 2005 * getting exported (but not destroyed), we want to spend some 2006 * time flushing as many metaslabs as we can in an attempt to 2007 * destroy log space maps and save import time. 2008 */ 2009 if (spa_should_flush_logs_on_unload(spa)) 2010 spa_unload_log_sm_flush_all(spa); 2011 2012 /* 2013 * Stop async tasks. 2014 */ 2015 spa_async_suspend(spa); 2016 2017 if (spa->spa_root_vdev) { 2018 vdev_t *root_vdev = spa->spa_root_vdev; 2019 vdev_initialize_stop_all(root_vdev, 2020 VDEV_INITIALIZE_ACTIVE); 2021 vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE); 2022 vdev_autotrim_stop_all(spa); 2023 vdev_rebuild_stop_all(spa); 2024 } 2025 } 2026 2027 /* 2028 * Stop syncing. 2029 */ 2030 if (spa->spa_sync_on) { 2031 txg_sync_stop(spa->spa_dsl_pool); 2032 spa->spa_sync_on = B_FALSE; 2033 } 2034 2035 /* 2036 * This ensures that there is no async metaslab prefetching 2037 * while we attempt to unload the spa. 2038 */ 2039 taskq_wait(spa->spa_metaslab_taskq); 2040 2041 if (spa->spa_mmp.mmp_thread) 2042 mmp_thread_stop(spa); 2043 2044 /* 2045 * Wait for any outstanding async I/O to complete. 2046 */ 2047 if (spa->spa_async_zio_root != NULL) { 2048 for (int i = 0; i < max_ncpus; i++) 2049 (void) zio_wait(spa->spa_async_zio_root[i]); 2050 kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *)); 2051 spa->spa_async_zio_root = NULL; 2052 } 2053 2054 if (spa->spa_vdev_removal != NULL) { 2055 spa_vdev_removal_destroy(spa->spa_vdev_removal); 2056 spa->spa_vdev_removal = NULL; 2057 } 2058 2059 spa_destroy_aux_threads(spa); 2060 2061 spa_condense_fini(spa); 2062 2063 bpobj_close(&spa->spa_deferred_bpobj); 2064 2065 spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); 2066 2067 /* 2068 * Close all vdevs. 2069 */ 2070 if (spa->spa_root_vdev) 2071 vdev_free(spa->spa_root_vdev); 2072 ASSERT(spa->spa_root_vdev == NULL); 2073 2074 /* 2075 * Close the dsl pool. 2076 */ 2077 if (spa->spa_dsl_pool) { 2078 dsl_pool_close(spa->spa_dsl_pool); 2079 spa->spa_dsl_pool = NULL; 2080 spa->spa_meta_objset = NULL; 2081 } 2082 2083 ddt_unload(spa); 2084 brt_unload(spa); 2085 spa_unload_log_sm_metadata(spa); 2086 2087 /* 2088 * Drop and purge level 2 cache 2089 */ 2090 spa_l2cache_drop(spa); 2091 2092 if (spa->spa_spares.sav_vdevs) { 2093 for (int i = 0; i < spa->spa_spares.sav_count; i++) 2094 vdev_free(spa->spa_spares.sav_vdevs[i]); 2095 kmem_free(spa->spa_spares.sav_vdevs, 2096 spa->spa_spares.sav_count * sizeof (void *)); 2097 spa->spa_spares.sav_vdevs = NULL; 2098 } 2099 if (spa->spa_spares.sav_config) { 2100 nvlist_free(spa->spa_spares.sav_config); 2101 spa->spa_spares.sav_config = NULL; 2102 } 2103 spa->spa_spares.sav_count = 0; 2104 2105 if (spa->spa_l2cache.sav_vdevs) { 2106 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { 2107 vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); 2108 vdev_free(spa->spa_l2cache.sav_vdevs[i]); 2109 } 2110 kmem_free(spa->spa_l2cache.sav_vdevs, 2111 spa->spa_l2cache.sav_count * sizeof (void *)); 2112 spa->spa_l2cache.sav_vdevs = NULL; 2113 } 2114 if (spa->spa_l2cache.sav_config) { 2115 nvlist_free(spa->spa_l2cache.sav_config); 2116 spa->spa_l2cache.sav_config = NULL; 2117 } 2118 spa->spa_l2cache.sav_count = 0; 2119 2120 spa->spa_async_suspended = 0; 2121 2122 spa->spa_indirect_vdevs_loaded = B_FALSE; 2123 2124 if (spa->spa_comment != NULL) { 2125 spa_strfree(spa->spa_comment); 2126 spa->spa_comment = NULL; 2127 } 2128 if (spa->spa_compatibility != NULL) { 2129 spa_strfree(spa->spa_compatibility); 2130 spa->spa_compatibility = NULL; 2131 } 2132 2133 spa->spa_raidz_expand = NULL; 2134 2135 spa_config_exit(spa, SCL_ALL, spa); 2136 } 2137 2138 /* 2139 * Load (or re-load) the current list of vdevs describing the active spares for 2140 * this pool. When this is called, we have some form of basic information in 2141 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and 2142 * then re-generate a more complete list including status information. 2143 */ 2144 void 2145 spa_load_spares(spa_t *spa) 2146 { 2147 nvlist_t **spares; 2148 uint_t nspares; 2149 int i; 2150 vdev_t *vd, *tvd; 2151 2152 #ifndef _KERNEL 2153 /* 2154 * zdb opens both the current state of the pool and the 2155 * checkpointed state (if present), with a different spa_t. 2156 * 2157 * As spare vdevs are shared among open pools, we skip loading 2158 * them when we load the checkpointed state of the pool. 2159 */ 2160 if (!spa_writeable(spa)) 2161 return; 2162 #endif 2163 2164 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 2165 2166 /* 2167 * First, close and free any existing spare vdevs. 2168 */ 2169 if (spa->spa_spares.sav_vdevs) { 2170 for (i = 0; i < spa->spa_spares.sav_count; i++) { 2171 vd = spa->spa_spares.sav_vdevs[i]; 2172 2173 /* Undo the call to spa_activate() below */ 2174 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 2175 B_FALSE)) != NULL && tvd->vdev_isspare) 2176 spa_spare_remove(tvd); 2177 vdev_close(vd); 2178 vdev_free(vd); 2179 } 2180 2181 kmem_free(spa->spa_spares.sav_vdevs, 2182 spa->spa_spares.sav_count * sizeof (void *)); 2183 } 2184 2185 if (spa->spa_spares.sav_config == NULL) 2186 nspares = 0; 2187 else 2188 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 2189 ZPOOL_CONFIG_SPARES, &spares, &nspares)); 2190 2191 spa->spa_spares.sav_count = (int)nspares; 2192 spa->spa_spares.sav_vdevs = NULL; 2193 2194 if (nspares == 0) 2195 return; 2196 2197 /* 2198 * Construct the array of vdevs, opening them to get status in the 2199 * process. For each spare, there is potentially two different vdev_t 2200 * structures associated with it: one in the list of spares (used only 2201 * for basic validation purposes) and one in the active vdev 2202 * configuration (if it's spared in). During this phase we open and 2203 * validate each vdev on the spare list. If the vdev also exists in the 2204 * active configuration, then we also mark this vdev as an active spare. 2205 */ 2206 spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *), 2207 KM_SLEEP); 2208 for (i = 0; i < spa->spa_spares.sav_count; i++) { 2209 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 2210 VDEV_ALLOC_SPARE) == 0); 2211 ASSERT(vd != NULL); 2212 2213 spa->spa_spares.sav_vdevs[i] = vd; 2214 2215 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 2216 B_FALSE)) != NULL) { 2217 if (!tvd->vdev_isspare) 2218 spa_spare_add(tvd); 2219 2220 /* 2221 * We only mark the spare active if we were successfully 2222 * able to load the vdev. Otherwise, importing a pool 2223 * with a bad active spare would result in strange 2224 * behavior, because multiple pool would think the spare 2225 * is actively in use. 2226 * 2227 * There is a vulnerability here to an equally bizarre 2228 * circumstance, where a dead active spare is later 2229 * brought back to life (onlined or otherwise). Given 2230 * the rarity of this scenario, and the extra complexity 2231 * it adds, we ignore the possibility. 2232 */ 2233 if (!vdev_is_dead(tvd)) 2234 spa_spare_activate(tvd); 2235 } 2236 2237 vd->vdev_top = vd; 2238 vd->vdev_aux = &spa->spa_spares; 2239 2240 if (vdev_open(vd) != 0) 2241 continue; 2242 2243 if (vdev_validate_aux(vd) == 0) 2244 spa_spare_add(vd); 2245 } 2246 2247 /* 2248 * Recompute the stashed list of spares, with status information 2249 * this time. 2250 */ 2251 fnvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES); 2252 2253 spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), 2254 KM_SLEEP); 2255 for (i = 0; i < spa->spa_spares.sav_count; i++) 2256 spares[i] = vdev_config_generate(spa, 2257 spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); 2258 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 2259 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, 2260 spa->spa_spares.sav_count); 2261 for (i = 0; i < spa->spa_spares.sav_count; i++) 2262 nvlist_free(spares[i]); 2263 kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); 2264 } 2265 2266 /* 2267 * Load (or re-load) the current list of vdevs describing the active l2cache for 2268 * this pool. When this is called, we have some form of basic information in 2269 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and 2270 * then re-generate a more complete list including status information. 2271 * Devices which are already active have their details maintained, and are 2272 * not re-opened. 2273 */ 2274 void 2275 spa_load_l2cache(spa_t *spa) 2276 { 2277 nvlist_t **l2cache = NULL; 2278 uint_t nl2cache; 2279 int i, j, oldnvdevs; 2280 uint64_t guid; 2281 vdev_t *vd, **oldvdevs, **newvdevs; 2282 spa_aux_vdev_t *sav = &spa->spa_l2cache; 2283 2284 #ifndef _KERNEL 2285 /* 2286 * zdb opens both the current state of the pool and the 2287 * checkpointed state (if present), with a different spa_t. 2288 * 2289 * As L2 caches are part of the ARC which is shared among open 2290 * pools, we skip loading them when we load the checkpointed 2291 * state of the pool. 2292 */ 2293 if (!spa_writeable(spa)) 2294 return; 2295 #endif 2296 2297 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 2298 2299 oldvdevs = sav->sav_vdevs; 2300 oldnvdevs = sav->sav_count; 2301 sav->sav_vdevs = NULL; 2302 sav->sav_count = 0; 2303 2304 if (sav->sav_config == NULL) { 2305 nl2cache = 0; 2306 newvdevs = NULL; 2307 goto out; 2308 } 2309 2310 VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, 2311 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); 2312 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 2313 2314 /* 2315 * Process new nvlist of vdevs. 2316 */ 2317 for (i = 0; i < nl2cache; i++) { 2318 guid = fnvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID); 2319 2320 newvdevs[i] = NULL; 2321 for (j = 0; j < oldnvdevs; j++) { 2322 vd = oldvdevs[j]; 2323 if (vd != NULL && guid == vd->vdev_guid) { 2324 /* 2325 * Retain previous vdev for add/remove ops. 2326 */ 2327 newvdevs[i] = vd; 2328 oldvdevs[j] = NULL; 2329 break; 2330 } 2331 } 2332 2333 if (newvdevs[i] == NULL) { 2334 /* 2335 * Create new vdev 2336 */ 2337 VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 2338 VDEV_ALLOC_L2CACHE) == 0); 2339 ASSERT(vd != NULL); 2340 newvdevs[i] = vd; 2341 2342 /* 2343 * Commit this vdev as an l2cache device, 2344 * even if it fails to open. 2345 */ 2346 spa_l2cache_add(vd); 2347 2348 vd->vdev_top = vd; 2349 vd->vdev_aux = sav; 2350 2351 spa_l2cache_activate(vd); 2352 2353 if (vdev_open(vd) != 0) 2354 continue; 2355 2356 (void) vdev_validate_aux(vd); 2357 2358 if (!vdev_is_dead(vd)) 2359 l2arc_add_vdev(spa, vd); 2360 2361 /* 2362 * Upon cache device addition to a pool or pool 2363 * creation with a cache device or if the header 2364 * of the device is invalid we issue an async 2365 * TRIM command for the whole device which will 2366 * execute if l2arc_trim_ahead > 0. 2367 */ 2368 spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); 2369 } 2370 } 2371 2372 sav->sav_vdevs = newvdevs; 2373 sav->sav_count = (int)nl2cache; 2374 2375 /* 2376 * Recompute the stashed list of l2cache devices, with status 2377 * information this time. 2378 */ 2379 fnvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE); 2380 2381 if (sav->sav_count > 0) 2382 l2cache = kmem_alloc(sav->sav_count * sizeof (void *), 2383 KM_SLEEP); 2384 for (i = 0; i < sav->sav_count; i++) 2385 l2cache[i] = vdev_config_generate(spa, 2386 sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); 2387 fnvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 2388 (const nvlist_t * const *)l2cache, sav->sav_count); 2389 2390 out: 2391 /* 2392 * Purge vdevs that were dropped 2393 */ 2394 if (oldvdevs) { 2395 for (i = 0; i < oldnvdevs; i++) { 2396 uint64_t pool; 2397 2398 vd = oldvdevs[i]; 2399 if (vd != NULL) { 2400 ASSERT(vd->vdev_isl2cache); 2401 2402 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 2403 pool != 0ULL && l2arc_vdev_present(vd)) 2404 l2arc_remove_vdev(vd); 2405 vdev_clear_stats(vd); 2406 vdev_free(vd); 2407 } 2408 } 2409 2410 kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 2411 } 2412 2413 for (i = 0; i < sav->sav_count; i++) 2414 nvlist_free(l2cache[i]); 2415 if (sav->sav_count) 2416 kmem_free(l2cache, sav->sav_count * sizeof (void *)); 2417 } 2418 2419 static int 2420 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 2421 { 2422 dmu_buf_t *db; 2423 char *packed = NULL; 2424 size_t nvsize = 0; 2425 int error; 2426 *value = NULL; 2427 2428 error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db); 2429 if (error) 2430 return (error); 2431 2432 nvsize = *(uint64_t *)db->db_data; 2433 dmu_buf_rele(db, FTAG); 2434 2435 packed = vmem_alloc(nvsize, KM_SLEEP); 2436 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, 2437 DMU_READ_PREFETCH); 2438 if (error == 0) 2439 error = nvlist_unpack(packed, nvsize, value, 0); 2440 vmem_free(packed, nvsize); 2441 2442 return (error); 2443 } 2444 2445 /* 2446 * Concrete top-level vdevs that are not missing and are not logs. At every 2447 * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds. 2448 */ 2449 static uint64_t 2450 spa_healthy_core_tvds(spa_t *spa) 2451 { 2452 vdev_t *rvd = spa->spa_root_vdev; 2453 uint64_t tvds = 0; 2454 2455 for (uint64_t i = 0; i < rvd->vdev_children; i++) { 2456 vdev_t *vd = rvd->vdev_child[i]; 2457 if (vd->vdev_islog) 2458 continue; 2459 if (vdev_is_concrete(vd) && !vdev_is_dead(vd)) 2460 tvds++; 2461 } 2462 2463 return (tvds); 2464 } 2465 2466 /* 2467 * Checks to see if the given vdev could not be opened, in which case we post a 2468 * sysevent to notify the autoreplace code that the device has been removed. 2469 */ 2470 static void 2471 spa_check_removed(vdev_t *vd) 2472 { 2473 for (uint64_t c = 0; c < vd->vdev_children; c++) 2474 spa_check_removed(vd->vdev_child[c]); 2475 2476 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && 2477 vdev_is_concrete(vd)) { 2478 zfs_post_autoreplace(vd->vdev_spa, vd); 2479 spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK); 2480 } 2481 } 2482 2483 static int 2484 spa_check_for_missing_logs(spa_t *spa) 2485 { 2486 vdev_t *rvd = spa->spa_root_vdev; 2487 2488 /* 2489 * If we're doing a normal import, then build up any additional 2490 * diagnostic information about missing log devices. 2491 * We'll pass this up to the user for further processing. 2492 */ 2493 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { 2494 nvlist_t **child, *nv; 2495 uint64_t idx = 0; 2496 2497 child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t *), 2498 KM_SLEEP); 2499 nv = fnvlist_alloc(); 2500 2501 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 2502 vdev_t *tvd = rvd->vdev_child[c]; 2503 2504 /* 2505 * We consider a device as missing only if it failed 2506 * to open (i.e. offline or faulted is not considered 2507 * as missing). 2508 */ 2509 if (tvd->vdev_islog && 2510 tvd->vdev_state == VDEV_STATE_CANT_OPEN) { 2511 child[idx++] = vdev_config_generate(spa, tvd, 2512 B_FALSE, VDEV_CONFIG_MISSING); 2513 } 2514 } 2515 2516 if (idx > 0) { 2517 fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 2518 (const nvlist_t * const *)child, idx); 2519 fnvlist_add_nvlist(spa->spa_load_info, 2520 ZPOOL_CONFIG_MISSING_DEVICES, nv); 2521 2522 for (uint64_t i = 0; i < idx; i++) 2523 nvlist_free(child[i]); 2524 } 2525 nvlist_free(nv); 2526 kmem_free(child, rvd->vdev_children * sizeof (char **)); 2527 2528 if (idx > 0) { 2529 spa_load_failed(spa, "some log devices are missing"); 2530 vdev_dbgmsg_print_tree(rvd, 2); 2531 return (SET_ERROR(ENXIO)); 2532 } 2533 } else { 2534 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 2535 vdev_t *tvd = rvd->vdev_child[c]; 2536 2537 if (tvd->vdev_islog && 2538 tvd->vdev_state == VDEV_STATE_CANT_OPEN) { 2539 spa_set_log_state(spa, SPA_LOG_CLEAR); 2540 spa_load_note(spa, "some log devices are " 2541 "missing, ZIL is dropped."); 2542 vdev_dbgmsg_print_tree(rvd, 2); 2543 break; 2544 } 2545 } 2546 } 2547 2548 return (0); 2549 } 2550 2551 /* 2552 * Check for missing log devices 2553 */ 2554 static boolean_t 2555 spa_check_logs(spa_t *spa) 2556 { 2557 boolean_t rv = B_FALSE; 2558 dsl_pool_t *dp = spa_get_dsl(spa); 2559 2560 switch (spa->spa_log_state) { 2561 default: 2562 break; 2563 case SPA_LOG_MISSING: 2564 /* need to recheck in case slog has been restored */ 2565 case SPA_LOG_UNKNOWN: 2566 rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 2567 zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0); 2568 if (rv) 2569 spa_set_log_state(spa, SPA_LOG_MISSING); 2570 break; 2571 } 2572 return (rv); 2573 } 2574 2575 /* 2576 * Passivate any log vdevs (note, does not apply to embedded log metaslabs). 2577 */ 2578 static boolean_t 2579 spa_passivate_log(spa_t *spa) 2580 { 2581 vdev_t *rvd = spa->spa_root_vdev; 2582 boolean_t slog_found = B_FALSE; 2583 2584 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 2585 2586 for (int c = 0; c < rvd->vdev_children; c++) { 2587 vdev_t *tvd = rvd->vdev_child[c]; 2588 2589 if (tvd->vdev_islog) { 2590 ASSERT3P(tvd->vdev_log_mg, ==, NULL); 2591 metaslab_group_passivate(tvd->vdev_mg); 2592 slog_found = B_TRUE; 2593 } 2594 } 2595 2596 return (slog_found); 2597 } 2598 2599 /* 2600 * Activate any log vdevs (note, does not apply to embedded log metaslabs). 2601 */ 2602 static void 2603 spa_activate_log(spa_t *spa) 2604 { 2605 vdev_t *rvd = spa->spa_root_vdev; 2606 2607 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 2608 2609 for (int c = 0; c < rvd->vdev_children; c++) { 2610 vdev_t *tvd = rvd->vdev_child[c]; 2611 2612 if (tvd->vdev_islog) { 2613 ASSERT3P(tvd->vdev_log_mg, ==, NULL); 2614 metaslab_group_activate(tvd->vdev_mg); 2615 } 2616 } 2617 } 2618 2619 int 2620 spa_reset_logs(spa_t *spa) 2621 { 2622 int error; 2623 2624 error = dmu_objset_find(spa_name(spa), zil_reset, 2625 NULL, DS_FIND_CHILDREN); 2626 if (error == 0) { 2627 /* 2628 * We successfully offlined the log device, sync out the 2629 * current txg so that the "stubby" block can be removed 2630 * by zil_sync(). 2631 */ 2632 txg_wait_synced(spa->spa_dsl_pool, 0); 2633 } 2634 return (error); 2635 } 2636 2637 static void 2638 spa_aux_check_removed(spa_aux_vdev_t *sav) 2639 { 2640 for (int i = 0; i < sav->sav_count; i++) 2641 spa_check_removed(sav->sav_vdevs[i]); 2642 } 2643 2644 void 2645 spa_claim_notify(zio_t *zio) 2646 { 2647 spa_t *spa = zio->io_spa; 2648 2649 if (zio->io_error) 2650 return; 2651 2652 mutex_enter(&spa->spa_props_lock); /* any mutex will do */ 2653 if (spa->spa_claim_max_txg < BP_GET_LOGICAL_BIRTH(zio->io_bp)) 2654 spa->spa_claim_max_txg = BP_GET_LOGICAL_BIRTH(zio->io_bp); 2655 mutex_exit(&spa->spa_props_lock); 2656 } 2657 2658 typedef struct spa_load_error { 2659 boolean_t sle_verify_data; 2660 uint64_t sle_meta_count; 2661 uint64_t sle_data_count; 2662 } spa_load_error_t; 2663 2664 static void 2665 spa_load_verify_done(zio_t *zio) 2666 { 2667 blkptr_t *bp = zio->io_bp; 2668 spa_load_error_t *sle = zio->io_private; 2669 dmu_object_type_t type = BP_GET_TYPE(bp); 2670 int error = zio->io_error; 2671 spa_t *spa = zio->io_spa; 2672 2673 abd_free(zio->io_abd); 2674 if (error) { 2675 if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && 2676 type != DMU_OT_INTENT_LOG) 2677 atomic_inc_64(&sle->sle_meta_count); 2678 else 2679 atomic_inc_64(&sle->sle_data_count); 2680 } 2681 2682 mutex_enter(&spa->spa_scrub_lock); 2683 spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp); 2684 cv_broadcast(&spa->spa_scrub_io_cv); 2685 mutex_exit(&spa->spa_scrub_lock); 2686 } 2687 2688 /* 2689 * Maximum number of inflight bytes is the log2 fraction of the arc size. 2690 * By default, we set it to 1/16th of the arc. 2691 */ 2692 static uint_t spa_load_verify_shift = 4; 2693 static int spa_load_verify_metadata = B_TRUE; 2694 static int spa_load_verify_data = B_TRUE; 2695 2696 static int 2697 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 2698 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 2699 { 2700 zio_t *rio = arg; 2701 spa_load_error_t *sle = rio->io_private; 2702 2703 (void) zilog, (void) dnp; 2704 2705 /* 2706 * Note: normally this routine will not be called if 2707 * spa_load_verify_metadata is not set. However, it may be useful 2708 * to manually set the flag after the traversal has begun. 2709 */ 2710 if (!spa_load_verify_metadata) 2711 return (0); 2712 2713 /* 2714 * Sanity check the block pointer in order to detect obvious damage 2715 * before using the contents in subsequent checks or in zio_read(). 2716 * When damaged consider it to be a metadata error since we cannot 2717 * trust the BP_GET_TYPE and BP_GET_LEVEL values. 2718 */ 2719 if (!zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) { 2720 atomic_inc_64(&sle->sle_meta_count); 2721 return (0); 2722 } 2723 2724 if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || 2725 BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp)) 2726 return (0); 2727 2728 if (!BP_IS_METADATA(bp) && 2729 (!spa_load_verify_data || !sle->sle_verify_data)) 2730 return (0); 2731 2732 uint64_t maxinflight_bytes = 2733 arc_target_bytes() >> spa_load_verify_shift; 2734 size_t size = BP_GET_PSIZE(bp); 2735 2736 mutex_enter(&spa->spa_scrub_lock); 2737 while (spa->spa_load_verify_bytes >= maxinflight_bytes) 2738 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2739 spa->spa_load_verify_bytes += size; 2740 mutex_exit(&spa->spa_scrub_lock); 2741 2742 zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size, 2743 spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, 2744 ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | 2745 ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); 2746 return (0); 2747 } 2748 2749 static int 2750 verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) 2751 { 2752 (void) dp, (void) arg; 2753 2754 if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN) 2755 return (SET_ERROR(ENAMETOOLONG)); 2756 2757 return (0); 2758 } 2759 2760 static int 2761 spa_load_verify(spa_t *spa) 2762 { 2763 zio_t *rio; 2764 spa_load_error_t sle = { 0 }; 2765 zpool_load_policy_t policy; 2766 boolean_t verify_ok = B_FALSE; 2767 int error = 0; 2768 2769 zpool_get_load_policy(spa->spa_config, &policy); 2770 2771 if (policy.zlp_rewind & ZPOOL_NEVER_REWIND || 2772 policy.zlp_maxmeta == UINT64_MAX) 2773 return (0); 2774 2775 dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); 2776 error = dmu_objset_find_dp(spa->spa_dsl_pool, 2777 spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL, 2778 DS_FIND_CHILDREN); 2779 dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); 2780 if (error != 0) 2781 return (error); 2782 2783 /* 2784 * Verify data only if we are rewinding or error limit was set. 2785 * Otherwise nothing except dbgmsg care about it to waste time. 2786 */ 2787 sle.sle_verify_data = (policy.zlp_rewind & ZPOOL_REWIND_MASK) || 2788 (policy.zlp_maxdata < UINT64_MAX); 2789 2790 rio = zio_root(spa, NULL, &sle, 2791 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 2792 2793 if (spa_load_verify_metadata) { 2794 if (spa->spa_extreme_rewind) { 2795 spa_load_note(spa, "performing a complete scan of the " 2796 "pool since extreme rewind is on. This may take " 2797 "a very long time.\n (spa_load_verify_data=%u, " 2798 "spa_load_verify_metadata=%u)", 2799 spa_load_verify_data, spa_load_verify_metadata); 2800 } 2801 2802 error = traverse_pool(spa, spa->spa_verify_min_txg, 2803 TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | 2804 TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio); 2805 } 2806 2807 (void) zio_wait(rio); 2808 ASSERT0(spa->spa_load_verify_bytes); 2809 2810 spa->spa_load_meta_errors = sle.sle_meta_count; 2811 spa->spa_load_data_errors = sle.sle_data_count; 2812 2813 if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) { 2814 spa_load_note(spa, "spa_load_verify found %llu metadata errors " 2815 "and %llu data errors", (u_longlong_t)sle.sle_meta_count, 2816 (u_longlong_t)sle.sle_data_count); 2817 } 2818 2819 if (spa_load_verify_dryrun || 2820 (!error && sle.sle_meta_count <= policy.zlp_maxmeta && 2821 sle.sle_data_count <= policy.zlp_maxdata)) { 2822 int64_t loss = 0; 2823 2824 verify_ok = B_TRUE; 2825 spa->spa_load_txg = spa->spa_uberblock.ub_txg; 2826 spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; 2827 2828 loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; 2829 fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_TIME, 2830 spa->spa_load_txg_ts); 2831 fnvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME, 2832 loss); 2833 fnvlist_add_uint64(spa->spa_load_info, 2834 ZPOOL_CONFIG_LOAD_META_ERRORS, sle.sle_meta_count); 2835 fnvlist_add_uint64(spa->spa_load_info, 2836 ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count); 2837 } else { 2838 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; 2839 } 2840 2841 if (spa_load_verify_dryrun) 2842 return (0); 2843 2844 if (error) { 2845 if (error != ENXIO && error != EIO) 2846 error = SET_ERROR(EIO); 2847 return (error); 2848 } 2849 2850 return (verify_ok ? 0 : EIO); 2851 } 2852 2853 /* 2854 * Find a value in the pool props object. 2855 */ 2856 static void 2857 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 2858 { 2859 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 2860 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 2861 } 2862 2863 /* 2864 * Find a value in the pool directory object. 2865 */ 2866 static int 2867 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent) 2868 { 2869 int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2870 name, sizeof (uint64_t), 1, val); 2871 2872 if (error != 0 && (error != ENOENT || log_enoent)) { 2873 spa_load_failed(spa, "couldn't get '%s' value in MOS directory " 2874 "[error=%d]", name, error); 2875 } 2876 2877 return (error); 2878 } 2879 2880 static int 2881 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 2882 { 2883 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 2884 return (SET_ERROR(err)); 2885 } 2886 2887 boolean_t 2888 spa_livelist_delete_check(spa_t *spa) 2889 { 2890 return (spa->spa_livelists_to_delete != 0); 2891 } 2892 2893 static boolean_t 2894 spa_livelist_delete_cb_check(void *arg, zthr_t *z) 2895 { 2896 (void) z; 2897 spa_t *spa = arg; 2898 return (spa_livelist_delete_check(spa)); 2899 } 2900 2901 static int 2902 delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 2903 { 2904 spa_t *spa = arg; 2905 zio_free(spa, tx->tx_txg, bp); 2906 dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, 2907 -bp_get_dsize_sync(spa, bp), 2908 -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); 2909 return (0); 2910 } 2911 2912 static int 2913 dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp) 2914 { 2915 int err; 2916 zap_cursor_t zc; 2917 zap_attribute_t za; 2918 zap_cursor_init(&zc, os, zap_obj); 2919 err = zap_cursor_retrieve(&zc, &za); 2920 zap_cursor_fini(&zc); 2921 if (err == 0) 2922 *llp = za.za_first_integer; 2923 return (err); 2924 } 2925 2926 /* 2927 * Components of livelist deletion that must be performed in syncing 2928 * context: freeing block pointers and updating the pool-wide data 2929 * structures to indicate how much work is left to do 2930 */ 2931 typedef struct sublist_delete_arg { 2932 spa_t *spa; 2933 dsl_deadlist_t *ll; 2934 uint64_t key; 2935 bplist_t *to_free; 2936 } sublist_delete_arg_t; 2937 2938 static void 2939 sublist_delete_sync(void *arg, dmu_tx_t *tx) 2940 { 2941 sublist_delete_arg_t *sda = arg; 2942 spa_t *spa = sda->spa; 2943 dsl_deadlist_t *ll = sda->ll; 2944 uint64_t key = sda->key; 2945 bplist_t *to_free = sda->to_free; 2946 2947 bplist_iterate(to_free, delete_blkptr_cb, spa, tx); 2948 dsl_deadlist_remove_entry(ll, key, tx); 2949 } 2950 2951 typedef struct livelist_delete_arg { 2952 spa_t *spa; 2953 uint64_t ll_obj; 2954 uint64_t zap_obj; 2955 } livelist_delete_arg_t; 2956 2957 static void 2958 livelist_delete_sync(void *arg, dmu_tx_t *tx) 2959 { 2960 livelist_delete_arg_t *lda = arg; 2961 spa_t *spa = lda->spa; 2962 uint64_t ll_obj = lda->ll_obj; 2963 uint64_t zap_obj = lda->zap_obj; 2964 objset_t *mos = spa->spa_meta_objset; 2965 uint64_t count; 2966 2967 /* free the livelist and decrement the feature count */ 2968 VERIFY0(zap_remove_int(mos, zap_obj, ll_obj, tx)); 2969 dsl_deadlist_free(mos, ll_obj, tx); 2970 spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx); 2971 VERIFY0(zap_count(mos, zap_obj, &count)); 2972 if (count == 0) { 2973 /* no more livelists to delete */ 2974 VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, 2975 DMU_POOL_DELETED_CLONES, tx)); 2976 VERIFY0(zap_destroy(mos, zap_obj, tx)); 2977 spa->spa_livelists_to_delete = 0; 2978 spa_notify_waiters(spa); 2979 } 2980 } 2981 2982 /* 2983 * Load in the value for the livelist to be removed and open it. Then, 2984 * load its first sublist and determine which block pointers should actually 2985 * be freed. Then, call a synctask which performs the actual frees and updates 2986 * the pool-wide livelist data. 2987 */ 2988 static void 2989 spa_livelist_delete_cb(void *arg, zthr_t *z) 2990 { 2991 spa_t *spa = arg; 2992 uint64_t ll_obj = 0, count; 2993 objset_t *mos = spa->spa_meta_objset; 2994 uint64_t zap_obj = spa->spa_livelists_to_delete; 2995 /* 2996 * Determine the next livelist to delete. This function should only 2997 * be called if there is at least one deleted clone. 2998 */ 2999 VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj)); 3000 VERIFY0(zap_count(mos, ll_obj, &count)); 3001 if (count > 0) { 3002 dsl_deadlist_t *ll; 3003 dsl_deadlist_entry_t *dle; 3004 bplist_t to_free; 3005 ll = kmem_zalloc(sizeof (dsl_deadlist_t), KM_SLEEP); 3006 dsl_deadlist_open(ll, mos, ll_obj); 3007 dle = dsl_deadlist_first(ll); 3008 ASSERT3P(dle, !=, NULL); 3009 bplist_create(&to_free); 3010 int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free, 3011 z, NULL); 3012 if (err == 0) { 3013 sublist_delete_arg_t sync_arg = { 3014 .spa = spa, 3015 .ll = ll, 3016 .key = dle->dle_mintxg, 3017 .to_free = &to_free 3018 }; 3019 zfs_dbgmsg("deleting sublist (id %llu) from" 3020 " livelist %llu, %lld remaining", 3021 (u_longlong_t)dle->dle_bpobj.bpo_object, 3022 (u_longlong_t)ll_obj, (longlong_t)count - 1); 3023 VERIFY0(dsl_sync_task(spa_name(spa), NULL, 3024 sublist_delete_sync, &sync_arg, 0, 3025 ZFS_SPACE_CHECK_DESTROY)); 3026 } else { 3027 VERIFY3U(err, ==, EINTR); 3028 } 3029 bplist_clear(&to_free); 3030 bplist_destroy(&to_free); 3031 dsl_deadlist_close(ll); 3032 kmem_free(ll, sizeof (dsl_deadlist_t)); 3033 } else { 3034 livelist_delete_arg_t sync_arg = { 3035 .spa = spa, 3036 .ll_obj = ll_obj, 3037 .zap_obj = zap_obj 3038 }; 3039 zfs_dbgmsg("deletion of livelist %llu completed", 3040 (u_longlong_t)ll_obj); 3041 VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync, 3042 &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY)); 3043 } 3044 } 3045 3046 static void 3047 spa_start_livelist_destroy_thread(spa_t *spa) 3048 { 3049 ASSERT3P(spa->spa_livelist_delete_zthr, ==, NULL); 3050 spa->spa_livelist_delete_zthr = 3051 zthr_create("z_livelist_destroy", 3052 spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa, 3053 minclsyspri); 3054 } 3055 3056 typedef struct livelist_new_arg { 3057 bplist_t *allocs; 3058 bplist_t *frees; 3059 } livelist_new_arg_t; 3060 3061 static int 3062 livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 3063 dmu_tx_t *tx) 3064 { 3065 ASSERT(tx == NULL); 3066 livelist_new_arg_t *lna = arg; 3067 if (bp_freed) { 3068 bplist_append(lna->frees, bp); 3069 } else { 3070 bplist_append(lna->allocs, bp); 3071 zfs_livelist_condense_new_alloc++; 3072 } 3073 return (0); 3074 } 3075 3076 typedef struct livelist_condense_arg { 3077 spa_t *spa; 3078 bplist_t to_keep; 3079 uint64_t first_size; 3080 uint64_t next_size; 3081 } livelist_condense_arg_t; 3082 3083 static void 3084 spa_livelist_condense_sync(void *arg, dmu_tx_t *tx) 3085 { 3086 livelist_condense_arg_t *lca = arg; 3087 spa_t *spa = lca->spa; 3088 bplist_t new_frees; 3089 dsl_dataset_t *ds = spa->spa_to_condense.ds; 3090 3091 /* Have we been cancelled? */ 3092 if (spa->spa_to_condense.cancelled) { 3093 zfs_livelist_condense_sync_cancel++; 3094 goto out; 3095 } 3096 3097 dsl_deadlist_entry_t *first = spa->spa_to_condense.first; 3098 dsl_deadlist_entry_t *next = spa->spa_to_condense.next; 3099 dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist; 3100 3101 /* 3102 * It's possible that the livelist was changed while the zthr was 3103 * running. Therefore, we need to check for new blkptrs in the two 3104 * entries being condensed and continue to track them in the livelist. 3105 * Because of the way we handle remapped blkptrs (see dbuf_remap_impl), 3106 * it's possible that the newly added blkptrs are FREEs or ALLOCs so 3107 * we need to sort them into two different bplists. 3108 */ 3109 uint64_t first_obj = first->dle_bpobj.bpo_object; 3110 uint64_t next_obj = next->dle_bpobj.bpo_object; 3111 uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs; 3112 uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs; 3113 3114 bplist_create(&new_frees); 3115 livelist_new_arg_t new_bps = { 3116 .allocs = &lca->to_keep, 3117 .frees = &new_frees, 3118 }; 3119 3120 if (cur_first_size > lca->first_size) { 3121 VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj, 3122 livelist_track_new_cb, &new_bps, lca->first_size)); 3123 } 3124 if (cur_next_size > lca->next_size) { 3125 VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj, 3126 livelist_track_new_cb, &new_bps, lca->next_size)); 3127 } 3128 3129 dsl_deadlist_clear_entry(first, ll, tx); 3130 ASSERT(bpobj_is_empty(&first->dle_bpobj)); 3131 dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx); 3132 3133 bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx); 3134 bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx); 3135 bplist_destroy(&new_frees); 3136 3137 char dsname[ZFS_MAX_DATASET_NAME_LEN]; 3138 dsl_dataset_name(ds, dsname); 3139 zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu " 3140 "(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu " 3141 "(%llu blkptrs)", (u_longlong_t)tx->tx_txg, dsname, 3142 (u_longlong_t)ds->ds_object, (u_longlong_t)first_obj, 3143 (u_longlong_t)cur_first_size, (u_longlong_t)next_obj, 3144 (u_longlong_t)cur_next_size, 3145 (u_longlong_t)first->dle_bpobj.bpo_object, 3146 (u_longlong_t)first->dle_bpobj.bpo_phys->bpo_num_blkptrs); 3147 out: 3148 dmu_buf_rele(ds->ds_dbuf, spa); 3149 spa->spa_to_condense.ds = NULL; 3150 bplist_clear(&lca->to_keep); 3151 bplist_destroy(&lca->to_keep); 3152 kmem_free(lca, sizeof (livelist_condense_arg_t)); 3153 spa->spa_to_condense.syncing = B_FALSE; 3154 } 3155 3156 static void 3157 spa_livelist_condense_cb(void *arg, zthr_t *t) 3158 { 3159 while (zfs_livelist_condense_zthr_pause && 3160 !(zthr_has_waiters(t) || zthr_iscancelled(t))) 3161 delay(1); 3162 3163 spa_t *spa = arg; 3164 dsl_deadlist_entry_t *first = spa->spa_to_condense.first; 3165 dsl_deadlist_entry_t *next = spa->spa_to_condense.next; 3166 uint64_t first_size, next_size; 3167 3168 livelist_condense_arg_t *lca = 3169 kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP); 3170 bplist_create(&lca->to_keep); 3171 3172 /* 3173 * Process the livelists (matching FREEs and ALLOCs) in open context 3174 * so we have minimal work in syncing context to condense. 3175 * 3176 * We save bpobj sizes (first_size and next_size) to use later in 3177 * syncing context to determine if entries were added to these sublists 3178 * while in open context. This is possible because the clone is still 3179 * active and open for normal writes and we want to make sure the new, 3180 * unprocessed blockpointers are inserted into the livelist normally. 3181 * 3182 * Note that dsl_process_sub_livelist() both stores the size number of 3183 * blockpointers and iterates over them while the bpobj's lock held, so 3184 * the sizes returned to us are consistent which what was actually 3185 * processed. 3186 */ 3187 int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t, 3188 &first_size); 3189 if (err == 0) 3190 err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep, 3191 t, &next_size); 3192 3193 if (err == 0) { 3194 while (zfs_livelist_condense_sync_pause && 3195 !(zthr_has_waiters(t) || zthr_iscancelled(t))) 3196 delay(1); 3197 3198 dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 3199 dmu_tx_mark_netfree(tx); 3200 dmu_tx_hold_space(tx, 1); 3201 err = dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE); 3202 if (err == 0) { 3203 /* 3204 * Prevent the condense zthr restarting before 3205 * the synctask completes. 3206 */ 3207 spa->spa_to_condense.syncing = B_TRUE; 3208 lca->spa = spa; 3209 lca->first_size = first_size; 3210 lca->next_size = next_size; 3211 dsl_sync_task_nowait(spa_get_dsl(spa), 3212 spa_livelist_condense_sync, lca, tx); 3213 dmu_tx_commit(tx); 3214 return; 3215 } 3216 } 3217 /* 3218 * Condensing can not continue: either it was externally stopped or 3219 * we were unable to assign to a tx because the pool has run out of 3220 * space. In the second case, we'll just end up trying to condense 3221 * again in a later txg. 3222 */ 3223 ASSERT(err != 0); 3224 bplist_clear(&lca->to_keep); 3225 bplist_destroy(&lca->to_keep); 3226 kmem_free(lca, sizeof (livelist_condense_arg_t)); 3227 dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa); 3228 spa->spa_to_condense.ds = NULL; 3229 if (err == EINTR) 3230 zfs_livelist_condense_zthr_cancel++; 3231 } 3232 3233 /* 3234 * Check that there is something to condense but that a condense is not 3235 * already in progress and that condensing has not been cancelled. 3236 */ 3237 static boolean_t 3238 spa_livelist_condense_cb_check(void *arg, zthr_t *z) 3239 { 3240 (void) z; 3241 spa_t *spa = arg; 3242 if ((spa->spa_to_condense.ds != NULL) && 3243 (spa->spa_to_condense.syncing == B_FALSE) && 3244 (spa->spa_to_condense.cancelled == B_FALSE)) { 3245 return (B_TRUE); 3246 } 3247 return (B_FALSE); 3248 } 3249 3250 static void 3251 spa_start_livelist_condensing_thread(spa_t *spa) 3252 { 3253 spa->spa_to_condense.ds = NULL; 3254 spa->spa_to_condense.first = NULL; 3255 spa->spa_to_condense.next = NULL; 3256 spa->spa_to_condense.syncing = B_FALSE; 3257 spa->spa_to_condense.cancelled = B_FALSE; 3258 3259 ASSERT3P(spa->spa_livelist_condense_zthr, ==, NULL); 3260 spa->spa_livelist_condense_zthr = 3261 zthr_create("z_livelist_condense", 3262 spa_livelist_condense_cb_check, 3263 spa_livelist_condense_cb, spa, minclsyspri); 3264 } 3265 3266 static void 3267 spa_spawn_aux_threads(spa_t *spa) 3268 { 3269 ASSERT(spa_writeable(spa)); 3270 3271 spa_start_raidz_expansion_thread(spa); 3272 spa_start_indirect_condensing_thread(spa); 3273 spa_start_livelist_destroy_thread(spa); 3274 spa_start_livelist_condensing_thread(spa); 3275 3276 ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL); 3277 spa->spa_checkpoint_discard_zthr = 3278 zthr_create("z_checkpoint_discard", 3279 spa_checkpoint_discard_thread_check, 3280 spa_checkpoint_discard_thread, spa, minclsyspri); 3281 } 3282 3283 /* 3284 * Fix up config after a partly-completed split. This is done with the 3285 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 3286 * pool have that entry in their config, but only the splitting one contains 3287 * a list of all the guids of the vdevs that are being split off. 3288 * 3289 * This function determines what to do with that list: either rejoin 3290 * all the disks to the pool, or complete the splitting process. To attempt 3291 * the rejoin, each disk that is offlined is marked online again, and 3292 * we do a reopen() call. If the vdev label for every disk that was 3293 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 3294 * then we call vdev_split() on each disk, and complete the split. 3295 * 3296 * Otherwise we leave the config alone, with all the vdevs in place in 3297 * the original pool. 3298 */ 3299 static void 3300 spa_try_repair(spa_t *spa, nvlist_t *config) 3301 { 3302 uint_t extracted; 3303 uint64_t *glist; 3304 uint_t i, gcount; 3305 nvlist_t *nvl; 3306 vdev_t **vd; 3307 boolean_t attempt_reopen; 3308 3309 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 3310 return; 3311 3312 /* check that the config is complete */ 3313 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 3314 &glist, &gcount) != 0) 3315 return; 3316 3317 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 3318 3319 /* attempt to online all the vdevs & validate */ 3320 attempt_reopen = B_TRUE; 3321 for (i = 0; i < gcount; i++) { 3322 if (glist[i] == 0) /* vdev is hole */ 3323 continue; 3324 3325 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 3326 if (vd[i] == NULL) { 3327 /* 3328 * Don't bother attempting to reopen the disks; 3329 * just do the split. 3330 */ 3331 attempt_reopen = B_FALSE; 3332 } else { 3333 /* attempt to re-online it */ 3334 vd[i]->vdev_offline = B_FALSE; 3335 } 3336 } 3337 3338 if (attempt_reopen) { 3339 vdev_reopen(spa->spa_root_vdev); 3340 3341 /* check each device to see what state it's in */ 3342 for (extracted = 0, i = 0; i < gcount; i++) { 3343 if (vd[i] != NULL && 3344 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 3345 break; 3346 ++extracted; 3347 } 3348 } 3349 3350 /* 3351 * If every disk has been moved to the new pool, or if we never 3352 * even attempted to look at them, then we split them off for 3353 * good. 3354 */ 3355 if (!attempt_reopen || gcount == extracted) { 3356 for (i = 0; i < gcount; i++) 3357 if (vd[i] != NULL) 3358 vdev_split(vd[i]); 3359 vdev_reopen(spa->spa_root_vdev); 3360 } 3361 3362 kmem_free(vd, gcount * sizeof (vdev_t *)); 3363 } 3364 3365 static int 3366 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) 3367 { 3368 const char *ereport = FM_EREPORT_ZFS_POOL; 3369 int error; 3370 3371 spa->spa_load_state = state; 3372 (void) spa_import_progress_set_state(spa_guid(spa), 3373 spa_load_state(spa)); 3374 spa_import_progress_set_notes(spa, "spa_load()"); 3375 3376 gethrestime(&spa->spa_loaded_ts); 3377 error = spa_load_impl(spa, type, &ereport); 3378 3379 /* 3380 * Don't count references from objsets that are already closed 3381 * and are making their way through the eviction process. 3382 */ 3383 spa_evicting_os_wait(spa); 3384 spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); 3385 if (error) { 3386 if (error != EEXIST) { 3387 spa->spa_loaded_ts.tv_sec = 0; 3388 spa->spa_loaded_ts.tv_nsec = 0; 3389 } 3390 if (error != EBADF) { 3391 (void) zfs_ereport_post(ereport, spa, 3392 NULL, NULL, NULL, 0); 3393 } 3394 } 3395 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 3396 spa->spa_ena = 0; 3397 3398 (void) spa_import_progress_set_state(spa_guid(spa), 3399 spa_load_state(spa)); 3400 3401 return (error); 3402 } 3403 3404 #ifdef ZFS_DEBUG 3405 /* 3406 * Count the number of per-vdev ZAPs associated with all of the vdevs in the 3407 * vdev tree rooted in the given vd, and ensure that each ZAP is present in the 3408 * spa's per-vdev ZAP list. 3409 */ 3410 static uint64_t 3411 vdev_count_verify_zaps(vdev_t *vd) 3412 { 3413 spa_t *spa = vd->vdev_spa; 3414 uint64_t total = 0; 3415 3416 if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2) && 3417 vd->vdev_root_zap != 0) { 3418 total++; 3419 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 3420 spa->spa_all_vdev_zaps, vd->vdev_root_zap)); 3421 } 3422 if (vd->vdev_top_zap != 0) { 3423 total++; 3424 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 3425 spa->spa_all_vdev_zaps, vd->vdev_top_zap)); 3426 } 3427 if (vd->vdev_leaf_zap != 0) { 3428 total++; 3429 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 3430 spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); 3431 } 3432 3433 for (uint64_t i = 0; i < vd->vdev_children; i++) { 3434 total += vdev_count_verify_zaps(vd->vdev_child[i]); 3435 } 3436 3437 return (total); 3438 } 3439 #else 3440 #define vdev_count_verify_zaps(vd) ((void) sizeof (vd), 0) 3441 #endif 3442 3443 /* 3444 * Determine whether the activity check is required. 3445 */ 3446 static boolean_t 3447 spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label, 3448 nvlist_t *config) 3449 { 3450 uint64_t state = 0; 3451 uint64_t hostid = 0; 3452 uint64_t tryconfig_txg = 0; 3453 uint64_t tryconfig_timestamp = 0; 3454 uint16_t tryconfig_mmp_seq = 0; 3455 nvlist_t *nvinfo; 3456 3457 if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { 3458 nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); 3459 (void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG, 3460 &tryconfig_txg); 3461 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 3462 &tryconfig_timestamp); 3463 (void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ, 3464 &tryconfig_mmp_seq); 3465 } 3466 3467 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state); 3468 3469 /* 3470 * Disable the MMP activity check - This is used by zdb which 3471 * is intended to be used on potentially active pools. 3472 */ 3473 if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) 3474 return (B_FALSE); 3475 3476 /* 3477 * Skip the activity check when the MMP feature is disabled. 3478 */ 3479 if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0) 3480 return (B_FALSE); 3481 3482 /* 3483 * If the tryconfig_ values are nonzero, they are the results of an 3484 * earlier tryimport. If they all match the uberblock we just found, 3485 * then the pool has not changed and we return false so we do not test 3486 * a second time. 3487 */ 3488 if (tryconfig_txg && tryconfig_txg == ub->ub_txg && 3489 tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp && 3490 tryconfig_mmp_seq && tryconfig_mmp_seq == 3491 (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) 3492 return (B_FALSE); 3493 3494 /* 3495 * Allow the activity check to be skipped when importing the pool 3496 * on the same host which last imported it. Since the hostid from 3497 * configuration may be stale use the one read from the label. 3498 */ 3499 if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID)) 3500 hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID); 3501 3502 if (hostid == spa_get_hostid(spa)) 3503 return (B_FALSE); 3504 3505 /* 3506 * Skip the activity test when the pool was cleanly exported. 3507 */ 3508 if (state != POOL_STATE_ACTIVE) 3509 return (B_FALSE); 3510 3511 return (B_TRUE); 3512 } 3513 3514 /* 3515 * Nanoseconds the activity check must watch for changes on-disk. 3516 */ 3517 static uint64_t 3518 spa_activity_check_duration(spa_t *spa, uberblock_t *ub) 3519 { 3520 uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1); 3521 uint64_t multihost_interval = MSEC2NSEC( 3522 MMP_INTERVAL_OK(zfs_multihost_interval)); 3523 uint64_t import_delay = MAX(NANOSEC, import_intervals * 3524 multihost_interval); 3525 3526 /* 3527 * Local tunables determine a minimum duration except for the case 3528 * where we know when the remote host will suspend the pool if MMP 3529 * writes do not land. 3530 * 3531 * See Big Theory comment at the top of mmp.c for the reasoning behind 3532 * these cases and times. 3533 */ 3534 3535 ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100); 3536 3537 if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && 3538 MMP_FAIL_INT(ub) > 0) { 3539 3540 /* MMP on remote host will suspend pool after failed writes */ 3541 import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) * 3542 MMP_IMPORT_SAFETY_FACTOR / 100; 3543 3544 zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp " 3545 "mmp_fails=%llu ub_mmp mmp_interval=%llu " 3546 "import_intervals=%llu", (u_longlong_t)import_delay, 3547 (u_longlong_t)MMP_FAIL_INT(ub), 3548 (u_longlong_t)MMP_INTERVAL(ub), 3549 (u_longlong_t)import_intervals); 3550 3551 } else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && 3552 MMP_FAIL_INT(ub) == 0) { 3553 3554 /* MMP on remote host will never suspend pool */ 3555 import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) + 3556 ub->ub_mmp_delay) * import_intervals); 3557 3558 zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp " 3559 "mmp_interval=%llu ub_mmp_delay=%llu " 3560 "import_intervals=%llu", (u_longlong_t)import_delay, 3561 (u_longlong_t)MMP_INTERVAL(ub), 3562 (u_longlong_t)ub->ub_mmp_delay, 3563 (u_longlong_t)import_intervals); 3564 3565 } else if (MMP_VALID(ub)) { 3566 /* 3567 * zfs-0.7 compatibility case 3568 */ 3569 3570 import_delay = MAX(import_delay, (multihost_interval + 3571 ub->ub_mmp_delay) * import_intervals); 3572 3573 zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu " 3574 "import_intervals=%llu leaves=%u", 3575 (u_longlong_t)import_delay, 3576 (u_longlong_t)ub->ub_mmp_delay, 3577 (u_longlong_t)import_intervals, 3578 vdev_count_leaves(spa)); 3579 } else { 3580 /* Using local tunings is the only reasonable option */ 3581 zfs_dbgmsg("pool last imported on non-MMP aware " 3582 "host using import_delay=%llu multihost_interval=%llu " 3583 "import_intervals=%llu", (u_longlong_t)import_delay, 3584 (u_longlong_t)multihost_interval, 3585 (u_longlong_t)import_intervals); 3586 } 3587 3588 return (import_delay); 3589 } 3590 3591 /* 3592 * Remote host activity check. 3593 * 3594 * error results: 3595 * 0 - no activity detected 3596 * EREMOTEIO - remote activity detected 3597 * EINTR - user canceled the operation 3598 */ 3599 static int 3600 spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config, 3601 boolean_t importing) 3602 { 3603 uint64_t txg = ub->ub_txg; 3604 uint64_t timestamp = ub->ub_timestamp; 3605 uint64_t mmp_config = ub->ub_mmp_config; 3606 uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0; 3607 uint64_t import_delay; 3608 hrtime_t import_expire, now; 3609 nvlist_t *mmp_label = NULL; 3610 vdev_t *rvd = spa->spa_root_vdev; 3611 kcondvar_t cv; 3612 kmutex_t mtx; 3613 int error = 0; 3614 3615 cv_init(&cv, NULL, CV_DEFAULT, NULL); 3616 mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL); 3617 mutex_enter(&mtx); 3618 3619 /* 3620 * If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed 3621 * during the earlier tryimport. If the txg recorded there is 0 then 3622 * the pool is known to be active on another host. 3623 * 3624 * Otherwise, the pool might be in use on another host. Check for 3625 * changes in the uberblocks on disk if necessary. 3626 */ 3627 if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { 3628 nvlist_t *nvinfo = fnvlist_lookup_nvlist(config, 3629 ZPOOL_CONFIG_LOAD_INFO); 3630 3631 if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) && 3632 fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) { 3633 vdev_uberblock_load(rvd, ub, &mmp_label); 3634 error = SET_ERROR(EREMOTEIO); 3635 goto out; 3636 } 3637 } 3638 3639 import_delay = spa_activity_check_duration(spa, ub); 3640 3641 /* Add a small random factor in case of simultaneous imports (0-25%) */ 3642 import_delay += import_delay * random_in_range(250) / 1000; 3643 3644 import_expire = gethrtime() + import_delay; 3645 3646 if (importing) { 3647 spa_import_progress_set_notes(spa, "Checking MMP activity, " 3648 "waiting %llu ms", (u_longlong_t)NSEC2MSEC(import_delay)); 3649 } 3650 3651 int iterations = 0; 3652 while ((now = gethrtime()) < import_expire) { 3653 if (importing && iterations++ % 30 == 0) { 3654 spa_import_progress_set_notes(spa, "Checking MMP " 3655 "activity, %llu ms remaining", 3656 (u_longlong_t)NSEC2MSEC(import_expire - now)); 3657 } 3658 3659 if (importing) { 3660 (void) spa_import_progress_set_mmp_check(spa_guid(spa), 3661 NSEC2SEC(import_expire - gethrtime())); 3662 } 3663 3664 vdev_uberblock_load(rvd, ub, &mmp_label); 3665 3666 if (txg != ub->ub_txg || timestamp != ub->ub_timestamp || 3667 mmp_seq != (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) { 3668 zfs_dbgmsg("multihost activity detected " 3669 "txg %llu ub_txg %llu " 3670 "timestamp %llu ub_timestamp %llu " 3671 "mmp_config %#llx ub_mmp_config %#llx", 3672 (u_longlong_t)txg, (u_longlong_t)ub->ub_txg, 3673 (u_longlong_t)timestamp, 3674 (u_longlong_t)ub->ub_timestamp, 3675 (u_longlong_t)mmp_config, 3676 (u_longlong_t)ub->ub_mmp_config); 3677 3678 error = SET_ERROR(EREMOTEIO); 3679 break; 3680 } 3681 3682 if (mmp_label) { 3683 nvlist_free(mmp_label); 3684 mmp_label = NULL; 3685 } 3686 3687 error = cv_timedwait_sig(&cv, &mtx, ddi_get_lbolt() + hz); 3688 if (error != -1) { 3689 error = SET_ERROR(EINTR); 3690 break; 3691 } 3692 error = 0; 3693 } 3694 3695 out: 3696 mutex_exit(&mtx); 3697 mutex_destroy(&mtx); 3698 cv_destroy(&cv); 3699 3700 /* 3701 * If the pool is determined to be active store the status in the 3702 * spa->spa_load_info nvlist. If the remote hostname or hostid are 3703 * available from configuration read from disk store them as well. 3704 * This allows 'zpool import' to generate a more useful message. 3705 * 3706 * ZPOOL_CONFIG_MMP_STATE - observed pool status (mandatory) 3707 * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool 3708 * ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool 3709 */ 3710 if (error == EREMOTEIO) { 3711 const char *hostname = "<unknown>"; 3712 uint64_t hostid = 0; 3713 3714 if (mmp_label) { 3715 if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) { 3716 hostname = fnvlist_lookup_string(mmp_label, 3717 ZPOOL_CONFIG_HOSTNAME); 3718 fnvlist_add_string(spa->spa_load_info, 3719 ZPOOL_CONFIG_MMP_HOSTNAME, hostname); 3720 } 3721 3722 if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) { 3723 hostid = fnvlist_lookup_uint64(mmp_label, 3724 ZPOOL_CONFIG_HOSTID); 3725 fnvlist_add_uint64(spa->spa_load_info, 3726 ZPOOL_CONFIG_MMP_HOSTID, hostid); 3727 } 3728 } 3729 3730 fnvlist_add_uint64(spa->spa_load_info, 3731 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE); 3732 fnvlist_add_uint64(spa->spa_load_info, 3733 ZPOOL_CONFIG_MMP_TXG, 0); 3734 3735 error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO); 3736 } 3737 3738 if (mmp_label) 3739 nvlist_free(mmp_label); 3740 3741 return (error); 3742 } 3743 3744 /* 3745 * Called from zfs_ioc_clear for a pool that was suspended 3746 * after failing mmp write checks. 3747 */ 3748 boolean_t 3749 spa_mmp_remote_host_activity(spa_t *spa) 3750 { 3751 ASSERT(spa_multihost(spa) && spa_suspended(spa)); 3752 3753 nvlist_t *best_label; 3754 uberblock_t best_ub; 3755 3756 /* 3757 * Locate the best uberblock on disk 3758 */ 3759 vdev_uberblock_load(spa->spa_root_vdev, &best_ub, &best_label); 3760 if (best_label) { 3761 /* 3762 * confirm that the best hostid matches our hostid 3763 */ 3764 if (nvlist_exists(best_label, ZPOOL_CONFIG_HOSTID) && 3765 spa_get_hostid(spa) != 3766 fnvlist_lookup_uint64(best_label, ZPOOL_CONFIG_HOSTID)) { 3767 nvlist_free(best_label); 3768 return (B_TRUE); 3769 } 3770 nvlist_free(best_label); 3771 } else { 3772 return (B_TRUE); 3773 } 3774 3775 if (!MMP_VALID(&best_ub) || 3776 !MMP_FAIL_INT_VALID(&best_ub) || 3777 MMP_FAIL_INT(&best_ub) == 0) { 3778 return (B_TRUE); 3779 } 3780 3781 if (best_ub.ub_txg != spa->spa_uberblock.ub_txg || 3782 best_ub.ub_timestamp != spa->spa_uberblock.ub_timestamp) { 3783 zfs_dbgmsg("txg mismatch detected during pool clear " 3784 "txg %llu ub_txg %llu timestamp %llu ub_timestamp %llu", 3785 (u_longlong_t)spa->spa_uberblock.ub_txg, 3786 (u_longlong_t)best_ub.ub_txg, 3787 (u_longlong_t)spa->spa_uberblock.ub_timestamp, 3788 (u_longlong_t)best_ub.ub_timestamp); 3789 return (B_TRUE); 3790 } 3791 3792 /* 3793 * Perform an activity check looking for any remote writer 3794 */ 3795 return (spa_activity_check(spa, &spa->spa_uberblock, spa->spa_config, 3796 B_FALSE) != 0); 3797 } 3798 3799 static int 3800 spa_verify_host(spa_t *spa, nvlist_t *mos_config) 3801 { 3802 uint64_t hostid; 3803 const char *hostname; 3804 uint64_t myhostid = 0; 3805 3806 if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config, 3807 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 3808 hostname = fnvlist_lookup_string(mos_config, 3809 ZPOOL_CONFIG_HOSTNAME); 3810 3811 myhostid = zone_get_hostid(NULL); 3812 3813 if (hostid != 0 && myhostid != 0 && hostid != myhostid) { 3814 cmn_err(CE_WARN, "pool '%s' could not be " 3815 "loaded as it was last accessed by " 3816 "another system (host: %s hostid: 0x%llx). " 3817 "See: https://openzfs.github.io/openzfs-docs/msg/" 3818 "ZFS-8000-EY", 3819 spa_name(spa), hostname, (u_longlong_t)hostid); 3820 spa_load_failed(spa, "hostid verification failed: pool " 3821 "last accessed by host: %s (hostid: 0x%llx)", 3822 hostname, (u_longlong_t)hostid); 3823 return (SET_ERROR(EBADF)); 3824 } 3825 } 3826 3827 return (0); 3828 } 3829 3830 static int 3831 spa_ld_parse_config(spa_t *spa, spa_import_type_t type) 3832 { 3833 int error = 0; 3834 nvlist_t *nvtree, *nvl, *config = spa->spa_config; 3835 int parse; 3836 vdev_t *rvd; 3837 uint64_t pool_guid; 3838 const char *comment; 3839 const char *compatibility; 3840 3841 /* 3842 * Versioning wasn't explicitly added to the label until later, so if 3843 * it's not present treat it as the initial version. 3844 */ 3845 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 3846 &spa->spa_ubsync.ub_version) != 0) 3847 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 3848 3849 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 3850 spa_load_failed(spa, "invalid config provided: '%s' missing", 3851 ZPOOL_CONFIG_POOL_GUID); 3852 return (SET_ERROR(EINVAL)); 3853 } 3854 3855 /* 3856 * If we are doing an import, ensure that the pool is not already 3857 * imported by checking if its pool guid already exists in the 3858 * spa namespace. 3859 * 3860 * The only case that we allow an already imported pool to be 3861 * imported again, is when the pool is checkpointed and we want to 3862 * look at its checkpointed state from userland tools like zdb. 3863 */ 3864 #ifdef _KERNEL 3865 if ((spa->spa_load_state == SPA_LOAD_IMPORT || 3866 spa->spa_load_state == SPA_LOAD_TRYIMPORT) && 3867 spa_guid_exists(pool_guid, 0)) { 3868 #else 3869 if ((spa->spa_load_state == SPA_LOAD_IMPORT || 3870 spa->spa_load_state == SPA_LOAD_TRYIMPORT) && 3871 spa_guid_exists(pool_guid, 0) && 3872 !spa_importing_readonly_checkpoint(spa)) { 3873 #endif 3874 spa_load_failed(spa, "a pool with guid %llu is already open", 3875 (u_longlong_t)pool_guid); 3876 return (SET_ERROR(EEXIST)); 3877 } 3878 3879 spa->spa_config_guid = pool_guid; 3880 3881 nvlist_free(spa->spa_load_info); 3882 spa->spa_load_info = fnvlist_alloc(); 3883 3884 ASSERT(spa->spa_comment == NULL); 3885 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 3886 spa->spa_comment = spa_strdup(comment); 3887 3888 ASSERT(spa->spa_compatibility == NULL); 3889 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMPATIBILITY, 3890 &compatibility) == 0) 3891 spa->spa_compatibility = spa_strdup(compatibility); 3892 3893 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 3894 &spa->spa_config_txg); 3895 3896 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0) 3897 spa->spa_config_splitting = fnvlist_dup(nvl); 3898 3899 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) { 3900 spa_load_failed(spa, "invalid config provided: '%s' missing", 3901 ZPOOL_CONFIG_VDEV_TREE); 3902 return (SET_ERROR(EINVAL)); 3903 } 3904 3905 /* 3906 * Create "The Godfather" zio to hold all async IOs 3907 */ 3908 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 3909 KM_SLEEP); 3910 for (int i = 0; i < max_ncpus; i++) { 3911 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 3912 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 3913 ZIO_FLAG_GODFATHER); 3914 } 3915 3916 /* 3917 * Parse the configuration into a vdev tree. We explicitly set the 3918 * value that will be returned by spa_version() since parsing the 3919 * configuration requires knowing the version number. 3920 */ 3921 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3922 parse = (type == SPA_IMPORT_EXISTING ? 3923 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 3924 error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse); 3925 spa_config_exit(spa, SCL_ALL, FTAG); 3926 3927 if (error != 0) { 3928 spa_load_failed(spa, "unable to parse config [error=%d]", 3929 error); 3930 return (error); 3931 } 3932 3933 ASSERT(spa->spa_root_vdev == rvd); 3934 ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); 3935 ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); 3936 3937 if (type != SPA_IMPORT_ASSEMBLE) { 3938 ASSERT(spa_guid(spa) == pool_guid); 3939 } 3940 3941 return (0); 3942 } 3943 3944 /* 3945 * Recursively open all vdevs in the vdev tree. This function is called twice: 3946 * first with the untrusted config, then with the trusted config. 3947 */ 3948 static int 3949 spa_ld_open_vdevs(spa_t *spa) 3950 { 3951 int error = 0; 3952 3953 /* 3954 * spa_missing_tvds_allowed defines how many top-level vdevs can be 3955 * missing/unopenable for the root vdev to be still considered openable. 3956 */ 3957 if (spa->spa_trust_config) { 3958 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds; 3959 } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) { 3960 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile; 3961 } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) { 3962 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan; 3963 } else { 3964 spa->spa_missing_tvds_allowed = 0; 3965 } 3966 3967 spa->spa_missing_tvds_allowed = 3968 MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed); 3969 3970 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3971 error = vdev_open(spa->spa_root_vdev); 3972 spa_config_exit(spa, SCL_ALL, FTAG); 3973 3974 if (spa->spa_missing_tvds != 0) { 3975 spa_load_note(spa, "vdev tree has %lld missing top-level " 3976 "vdevs.", (u_longlong_t)spa->spa_missing_tvds); 3977 if (spa->spa_trust_config && (spa->spa_mode & SPA_MODE_WRITE)) { 3978 /* 3979 * Although theoretically we could allow users to open 3980 * incomplete pools in RW mode, we'd need to add a lot 3981 * of extra logic (e.g. adjust pool space to account 3982 * for missing vdevs). 3983 * This limitation also prevents users from accidentally 3984 * opening the pool in RW mode during data recovery and 3985 * damaging it further. 3986 */ 3987 spa_load_note(spa, "pools with missing top-level " 3988 "vdevs can only be opened in read-only mode."); 3989 error = SET_ERROR(ENXIO); 3990 } else { 3991 spa_load_note(spa, "current settings allow for maximum " 3992 "%lld missing top-level vdevs at this stage.", 3993 (u_longlong_t)spa->spa_missing_tvds_allowed); 3994 } 3995 } 3996 if (error != 0) { 3997 spa_load_failed(spa, "unable to open vdev tree [error=%d]", 3998 error); 3999 } 4000 if (spa->spa_missing_tvds != 0 || error != 0) 4001 vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2); 4002 4003 return (error); 4004 } 4005 4006 /* 4007 * We need to validate the vdev labels against the configuration that 4008 * we have in hand. This function is called twice: first with an untrusted 4009 * config, then with a trusted config. The validation is more strict when the 4010 * config is trusted. 4011 */ 4012 static int 4013 spa_ld_validate_vdevs(spa_t *spa) 4014 { 4015 int error = 0; 4016 vdev_t *rvd = spa->spa_root_vdev; 4017 4018 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4019 error = vdev_validate(rvd); 4020 spa_config_exit(spa, SCL_ALL, FTAG); 4021 4022 if (error != 0) { 4023 spa_load_failed(spa, "vdev_validate failed [error=%d]", error); 4024 return (error); 4025 } 4026 4027 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 4028 spa_load_failed(spa, "cannot open vdev tree after invalidating " 4029 "some vdevs"); 4030 vdev_dbgmsg_print_tree(rvd, 2); 4031 return (SET_ERROR(ENXIO)); 4032 } 4033 4034 return (0); 4035 } 4036 4037 static void 4038 spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub) 4039 { 4040 spa->spa_state = POOL_STATE_ACTIVE; 4041 spa->spa_ubsync = spa->spa_uberblock; 4042 spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 4043 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 4044 spa->spa_first_txg = spa->spa_last_ubsync_txg ? 4045 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 4046 spa->spa_claim_max_txg = spa->spa_first_txg; 4047 spa->spa_prev_software_version = ub->ub_software_version; 4048 } 4049 4050 static int 4051 spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) 4052 { 4053 vdev_t *rvd = spa->spa_root_vdev; 4054 nvlist_t *label; 4055 uberblock_t *ub = &spa->spa_uberblock; 4056 boolean_t activity_check = B_FALSE; 4057 4058 /* 4059 * If we are opening the checkpointed state of the pool by 4060 * rewinding to it, at this point we will have written the 4061 * checkpointed uberblock to the vdev labels, so searching 4062 * the labels will find the right uberblock. However, if 4063 * we are opening the checkpointed state read-only, we have 4064 * not modified the labels. Therefore, we must ignore the 4065 * labels and continue using the spa_uberblock that was set 4066 * by spa_ld_checkpoint_rewind. 4067 * 4068 * Note that it would be fine to ignore the labels when 4069 * rewinding (opening writeable) as well. However, if we 4070 * crash just after writing the labels, we will end up 4071 * searching the labels. Doing so in the common case means 4072 * that this code path gets exercised normally, rather than 4073 * just in the edge case. 4074 */ 4075 if (ub->ub_checkpoint_txg != 0 && 4076 spa_importing_readonly_checkpoint(spa)) { 4077 spa_ld_select_uberblock_done(spa, ub); 4078 return (0); 4079 } 4080 4081 /* 4082 * Find the best uberblock. 4083 */ 4084 vdev_uberblock_load(rvd, ub, &label); 4085 4086 /* 4087 * If we weren't able to find a single valid uberblock, return failure. 4088 */ 4089 if (ub->ub_txg == 0) { 4090 nvlist_free(label); 4091 spa_load_failed(spa, "no valid uberblock found"); 4092 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 4093 } 4094 4095 if (spa->spa_load_max_txg != UINT64_MAX) { 4096 (void) spa_import_progress_set_max_txg(spa_guid(spa), 4097 (u_longlong_t)spa->spa_load_max_txg); 4098 } 4099 spa_load_note(spa, "using uberblock with txg=%llu", 4100 (u_longlong_t)ub->ub_txg); 4101 if (ub->ub_raidz_reflow_info != 0) { 4102 spa_load_note(spa, "uberblock raidz_reflow_info: " 4103 "state=%u offset=%llu", 4104 (int)RRSS_GET_STATE(ub), 4105 (u_longlong_t)RRSS_GET_OFFSET(ub)); 4106 } 4107 4108 4109 /* 4110 * For pools which have the multihost property on determine if the 4111 * pool is truly inactive and can be safely imported. Prevent 4112 * hosts which don't have a hostid set from importing the pool. 4113 */ 4114 activity_check = spa_activity_check_required(spa, ub, label, 4115 spa->spa_config); 4116 if (activity_check) { 4117 if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay && 4118 spa_get_hostid(spa) == 0) { 4119 nvlist_free(label); 4120 fnvlist_add_uint64(spa->spa_load_info, 4121 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); 4122 return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); 4123 } 4124 4125 int error = 4126 spa_activity_check(spa, ub, spa->spa_config, B_TRUE); 4127 if (error) { 4128 nvlist_free(label); 4129 return (error); 4130 } 4131 4132 fnvlist_add_uint64(spa->spa_load_info, 4133 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE); 4134 fnvlist_add_uint64(spa->spa_load_info, 4135 ZPOOL_CONFIG_MMP_TXG, ub->ub_txg); 4136 fnvlist_add_uint16(spa->spa_load_info, 4137 ZPOOL_CONFIG_MMP_SEQ, 4138 (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)); 4139 } 4140 4141 /* 4142 * If the pool has an unsupported version we can't open it. 4143 */ 4144 if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 4145 nvlist_free(label); 4146 spa_load_failed(spa, "version %llu is not supported", 4147 (u_longlong_t)ub->ub_version); 4148 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 4149 } 4150 4151 if (ub->ub_version >= SPA_VERSION_FEATURES) { 4152 nvlist_t *features; 4153 4154 /* 4155 * If we weren't able to find what's necessary for reading the 4156 * MOS in the label, return failure. 4157 */ 4158 if (label == NULL) { 4159 spa_load_failed(spa, "label config unavailable"); 4160 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 4161 ENXIO)); 4162 } 4163 4164 if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ, 4165 &features) != 0) { 4166 nvlist_free(label); 4167 spa_load_failed(spa, "invalid label: '%s' missing", 4168 ZPOOL_CONFIG_FEATURES_FOR_READ); 4169 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 4170 ENXIO)); 4171 } 4172 4173 /* 4174 * Update our in-core representation with the definitive values 4175 * from the label. 4176 */ 4177 nvlist_free(spa->spa_label_features); 4178 spa->spa_label_features = fnvlist_dup(features); 4179 } 4180 4181 nvlist_free(label); 4182 4183 /* 4184 * Look through entries in the label nvlist's features_for_read. If 4185 * there is a feature listed there which we don't understand then we 4186 * cannot open a pool. 4187 */ 4188 if (ub->ub_version >= SPA_VERSION_FEATURES) { 4189 nvlist_t *unsup_feat; 4190 4191 unsup_feat = fnvlist_alloc(); 4192 4193 for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, 4194 NULL); nvp != NULL; 4195 nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { 4196 if (!zfeature_is_supported(nvpair_name(nvp))) { 4197 fnvlist_add_string(unsup_feat, 4198 nvpair_name(nvp), ""); 4199 } 4200 } 4201 4202 if (!nvlist_empty(unsup_feat)) { 4203 fnvlist_add_nvlist(spa->spa_load_info, 4204 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 4205 nvlist_free(unsup_feat); 4206 spa_load_failed(spa, "some features are unsupported"); 4207 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 4208 ENOTSUP)); 4209 } 4210 4211 nvlist_free(unsup_feat); 4212 } 4213 4214 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 4215 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4216 spa_try_repair(spa, spa->spa_config); 4217 spa_config_exit(spa, SCL_ALL, FTAG); 4218 nvlist_free(spa->spa_config_splitting); 4219 spa->spa_config_splitting = NULL; 4220 } 4221 4222 /* 4223 * Initialize internal SPA structures. 4224 */ 4225 spa_ld_select_uberblock_done(spa, ub); 4226 4227 return (0); 4228 } 4229 4230 static int 4231 spa_ld_open_rootbp(spa_t *spa) 4232 { 4233 int error = 0; 4234 vdev_t *rvd = spa->spa_root_vdev; 4235 4236 error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 4237 if (error != 0) { 4238 spa_load_failed(spa, "unable to open rootbp in dsl_pool_init " 4239 "[error=%d]", error); 4240 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4241 } 4242 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 4243 4244 return (0); 4245 } 4246 4247 static int 4248 spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, 4249 boolean_t reloading) 4250 { 4251 vdev_t *mrvd, *rvd = spa->spa_root_vdev; 4252 nvlist_t *nv, *mos_config, *policy; 4253 int error = 0, copy_error; 4254 uint64_t healthy_tvds, healthy_tvds_mos; 4255 uint64_t mos_config_txg; 4256 4257 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE) 4258 != 0) 4259 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4260 4261 /* 4262 * If we're assembling a pool from a split, the config provided is 4263 * already trusted so there is nothing to do. 4264 */ 4265 if (type == SPA_IMPORT_ASSEMBLE) 4266 return (0); 4267 4268 healthy_tvds = spa_healthy_core_tvds(spa); 4269 4270 if (load_nvlist(spa, spa->spa_config_object, &mos_config) 4271 != 0) { 4272 spa_load_failed(spa, "unable to retrieve MOS config"); 4273 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4274 } 4275 4276 /* 4277 * If we are doing an open, pool owner wasn't verified yet, thus do 4278 * the verification here. 4279 */ 4280 if (spa->spa_load_state == SPA_LOAD_OPEN) { 4281 error = spa_verify_host(spa, mos_config); 4282 if (error != 0) { 4283 nvlist_free(mos_config); 4284 return (error); 4285 } 4286 } 4287 4288 nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE); 4289 4290 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4291 4292 /* 4293 * Build a new vdev tree from the trusted config 4294 */ 4295 error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD); 4296 if (error != 0) { 4297 nvlist_free(mos_config); 4298 spa_config_exit(spa, SCL_ALL, FTAG); 4299 spa_load_failed(spa, "spa_config_parse failed [error=%d]", 4300 error); 4301 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4302 } 4303 4304 /* 4305 * Vdev paths in the MOS may be obsolete. If the untrusted config was 4306 * obtained by scanning /dev/dsk, then it will have the right vdev 4307 * paths. We update the trusted MOS config with this information. 4308 * We first try to copy the paths with vdev_copy_path_strict, which 4309 * succeeds only when both configs have exactly the same vdev tree. 4310 * If that fails, we fall back to a more flexible method that has a 4311 * best effort policy. 4312 */ 4313 copy_error = vdev_copy_path_strict(rvd, mrvd); 4314 if (copy_error != 0 || spa_load_print_vdev_tree) { 4315 spa_load_note(spa, "provided vdev tree:"); 4316 vdev_dbgmsg_print_tree(rvd, 2); 4317 spa_load_note(spa, "MOS vdev tree:"); 4318 vdev_dbgmsg_print_tree(mrvd, 2); 4319 } 4320 if (copy_error != 0) { 4321 spa_load_note(spa, "vdev_copy_path_strict failed, falling " 4322 "back to vdev_copy_path_relaxed"); 4323 vdev_copy_path_relaxed(rvd, mrvd); 4324 } 4325 4326 vdev_close(rvd); 4327 vdev_free(rvd); 4328 spa->spa_root_vdev = mrvd; 4329 rvd = mrvd; 4330 spa_config_exit(spa, SCL_ALL, FTAG); 4331 4332 /* 4333 * If 'zpool import' used a cached config, then the on-disk hostid and 4334 * hostname may be different to the cached config in ways that should 4335 * prevent import. Userspace can't discover this without a scan, but 4336 * we know, so we add these values to LOAD_INFO so the caller can know 4337 * the difference. 4338 * 4339 * Note that we have to do this before the config is regenerated, 4340 * because the new config will have the hostid and hostname for this 4341 * host, in readiness for import. 4342 */ 4343 if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTID)) 4344 fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_HOSTID, 4345 fnvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID)); 4346 if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTNAME)) 4347 fnvlist_add_string(spa->spa_load_info, ZPOOL_CONFIG_HOSTNAME, 4348 fnvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME)); 4349 4350 /* 4351 * We will use spa_config if we decide to reload the spa or if spa_load 4352 * fails and we rewind. We must thus regenerate the config using the 4353 * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to 4354 * pass settings on how to load the pool and is not stored in the MOS. 4355 * We copy it over to our new, trusted config. 4356 */ 4357 mos_config_txg = fnvlist_lookup_uint64(mos_config, 4358 ZPOOL_CONFIG_POOL_TXG); 4359 nvlist_free(mos_config); 4360 mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE); 4361 if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY, 4362 &policy) == 0) 4363 fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy); 4364 spa_config_set(spa, mos_config); 4365 spa->spa_config_source = SPA_CONFIG_SRC_MOS; 4366 4367 /* 4368 * Now that we got the config from the MOS, we should be more strict 4369 * in checking blkptrs and can make assumptions about the consistency 4370 * of the vdev tree. spa_trust_config must be set to true before opening 4371 * vdevs in order for them to be writeable. 4372 */ 4373 spa->spa_trust_config = B_TRUE; 4374 4375 /* 4376 * Open and validate the new vdev tree 4377 */ 4378 error = spa_ld_open_vdevs(spa); 4379 if (error != 0) 4380 return (error); 4381 4382 error = spa_ld_validate_vdevs(spa); 4383 if (error != 0) 4384 return (error); 4385 4386 if (copy_error != 0 || spa_load_print_vdev_tree) { 4387 spa_load_note(spa, "final vdev tree:"); 4388 vdev_dbgmsg_print_tree(rvd, 2); 4389 } 4390 4391 if (spa->spa_load_state != SPA_LOAD_TRYIMPORT && 4392 !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) { 4393 /* 4394 * Sanity check to make sure that we are indeed loading the 4395 * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds 4396 * in the config provided and they happened to be the only ones 4397 * to have the latest uberblock, we could involuntarily perform 4398 * an extreme rewind. 4399 */ 4400 healthy_tvds_mos = spa_healthy_core_tvds(spa); 4401 if (healthy_tvds_mos - healthy_tvds >= 4402 SPA_SYNC_MIN_VDEVS) { 4403 spa_load_note(spa, "config provided misses too many " 4404 "top-level vdevs compared to MOS (%lld vs %lld). ", 4405 (u_longlong_t)healthy_tvds, 4406 (u_longlong_t)healthy_tvds_mos); 4407 spa_load_note(spa, "vdev tree:"); 4408 vdev_dbgmsg_print_tree(rvd, 2); 4409 if (reloading) { 4410 spa_load_failed(spa, "config was already " 4411 "provided from MOS. Aborting."); 4412 return (spa_vdev_err(rvd, 4413 VDEV_AUX_CORRUPT_DATA, EIO)); 4414 } 4415 spa_load_note(spa, "spa must be reloaded using MOS " 4416 "config"); 4417 return (SET_ERROR(EAGAIN)); 4418 } 4419 } 4420 4421 error = spa_check_for_missing_logs(spa); 4422 if (error != 0) 4423 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 4424 4425 if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) { 4426 spa_load_failed(spa, "uberblock guid sum doesn't match MOS " 4427 "guid sum (%llu != %llu)", 4428 (u_longlong_t)spa->spa_uberblock.ub_guid_sum, 4429 (u_longlong_t)rvd->vdev_guid_sum); 4430 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 4431 ENXIO)); 4432 } 4433 4434 return (0); 4435 } 4436 4437 static int 4438 spa_ld_open_indirect_vdev_metadata(spa_t *spa) 4439 { 4440 int error = 0; 4441 vdev_t *rvd = spa->spa_root_vdev; 4442 4443 /* 4444 * Everything that we read before spa_remove_init() must be stored 4445 * on concreted vdevs. Therefore we do this as early as possible. 4446 */ 4447 error = spa_remove_init(spa); 4448 if (error != 0) { 4449 spa_load_failed(spa, "spa_remove_init failed [error=%d]", 4450 error); 4451 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4452 } 4453 4454 /* 4455 * Retrieve information needed to condense indirect vdev mappings. 4456 */ 4457 error = spa_condense_init(spa); 4458 if (error != 0) { 4459 spa_load_failed(spa, "spa_condense_init failed [error=%d]", 4460 error); 4461 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4462 } 4463 4464 return (0); 4465 } 4466 4467 static int 4468 spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep) 4469 { 4470 int error = 0; 4471 vdev_t *rvd = spa->spa_root_vdev; 4472 4473 if (spa_version(spa) >= SPA_VERSION_FEATURES) { 4474 boolean_t missing_feat_read = B_FALSE; 4475 nvlist_t *unsup_feat, *enabled_feat; 4476 4477 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 4478 &spa->spa_feat_for_read_obj, B_TRUE) != 0) { 4479 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4480 } 4481 4482 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 4483 &spa->spa_feat_for_write_obj, B_TRUE) != 0) { 4484 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4485 } 4486 4487 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 4488 &spa->spa_feat_desc_obj, B_TRUE) != 0) { 4489 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4490 } 4491 4492 enabled_feat = fnvlist_alloc(); 4493 unsup_feat = fnvlist_alloc(); 4494 4495 if (!spa_features_check(spa, B_FALSE, 4496 unsup_feat, enabled_feat)) 4497 missing_feat_read = B_TRUE; 4498 4499 if (spa_writeable(spa) || 4500 spa->spa_load_state == SPA_LOAD_TRYIMPORT) { 4501 if (!spa_features_check(spa, B_TRUE, 4502 unsup_feat, enabled_feat)) { 4503 *missing_feat_writep = B_TRUE; 4504 } 4505 } 4506 4507 fnvlist_add_nvlist(spa->spa_load_info, 4508 ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 4509 4510 if (!nvlist_empty(unsup_feat)) { 4511 fnvlist_add_nvlist(spa->spa_load_info, 4512 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 4513 } 4514 4515 fnvlist_free(enabled_feat); 4516 fnvlist_free(unsup_feat); 4517 4518 if (!missing_feat_read) { 4519 fnvlist_add_boolean(spa->spa_load_info, 4520 ZPOOL_CONFIG_CAN_RDONLY); 4521 } 4522 4523 /* 4524 * If the state is SPA_LOAD_TRYIMPORT, our objective is 4525 * twofold: to determine whether the pool is available for 4526 * import in read-write mode and (if it is not) whether the 4527 * pool is available for import in read-only mode. If the pool 4528 * is available for import in read-write mode, it is displayed 4529 * as available in userland; if it is not available for import 4530 * in read-only mode, it is displayed as unavailable in 4531 * userland. If the pool is available for import in read-only 4532 * mode but not read-write mode, it is displayed as unavailable 4533 * in userland with a special note that the pool is actually 4534 * available for open in read-only mode. 4535 * 4536 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 4537 * missing a feature for write, we must first determine whether 4538 * the pool can be opened read-only before returning to 4539 * userland in order to know whether to display the 4540 * abovementioned note. 4541 */ 4542 if (missing_feat_read || (*missing_feat_writep && 4543 spa_writeable(spa))) { 4544 spa_load_failed(spa, "pool uses unsupported features"); 4545 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 4546 ENOTSUP)); 4547 } 4548 4549 /* 4550 * Load refcounts for ZFS features from disk into an in-memory 4551 * cache during SPA initialization. 4552 */ 4553 for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { 4554 uint64_t refcount; 4555 4556 error = feature_get_refcount_from_disk(spa, 4557 &spa_feature_table[i], &refcount); 4558 if (error == 0) { 4559 spa->spa_feat_refcount_cache[i] = refcount; 4560 } else if (error == ENOTSUP) { 4561 spa->spa_feat_refcount_cache[i] = 4562 SPA_FEATURE_DISABLED; 4563 } else { 4564 spa_load_failed(spa, "error getting refcount " 4565 "for feature %s [error=%d]", 4566 spa_feature_table[i].fi_guid, error); 4567 return (spa_vdev_err(rvd, 4568 VDEV_AUX_CORRUPT_DATA, EIO)); 4569 } 4570 } 4571 } 4572 4573 if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { 4574 if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, 4575 &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0) 4576 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4577 } 4578 4579 /* 4580 * Encryption was added before bookmark_v2, even though bookmark_v2 4581 * is now a dependency. If this pool has encryption enabled without 4582 * bookmark_v2, trigger an errata message. 4583 */ 4584 if (spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) && 4585 !spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) { 4586 spa->spa_errata = ZPOOL_ERRATA_ZOL_8308_ENCRYPTION; 4587 } 4588 4589 return (0); 4590 } 4591 4592 static int 4593 spa_ld_load_special_directories(spa_t *spa) 4594 { 4595 int error = 0; 4596 vdev_t *rvd = spa->spa_root_vdev; 4597 4598 spa->spa_is_initializing = B_TRUE; 4599 error = dsl_pool_open(spa->spa_dsl_pool); 4600 spa->spa_is_initializing = B_FALSE; 4601 if (error != 0) { 4602 spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error); 4603 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4604 } 4605 4606 return (0); 4607 } 4608 4609 static int 4610 spa_ld_get_props(spa_t *spa) 4611 { 4612 int error = 0; 4613 uint64_t obj; 4614 vdev_t *rvd = spa->spa_root_vdev; 4615 4616 /* Grab the checksum salt from the MOS. */ 4617 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 4618 DMU_POOL_CHECKSUM_SALT, 1, 4619 sizeof (spa->spa_cksum_salt.zcs_bytes), 4620 spa->spa_cksum_salt.zcs_bytes); 4621 if (error == ENOENT) { 4622 /* Generate a new salt for subsequent use */ 4623 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 4624 sizeof (spa->spa_cksum_salt.zcs_bytes)); 4625 } else if (error != 0) { 4626 spa_load_failed(spa, "unable to retrieve checksum salt from " 4627 "MOS [error=%d]", error); 4628 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4629 } 4630 4631 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0) 4632 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4633 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 4634 if (error != 0) { 4635 spa_load_failed(spa, "error opening deferred-frees bpobj " 4636 "[error=%d]", error); 4637 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4638 } 4639 4640 /* 4641 * Load the bit that tells us to use the new accounting function 4642 * (raid-z deflation). If we have an older pool, this will not 4643 * be present. 4644 */ 4645 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE); 4646 if (error != 0 && error != ENOENT) 4647 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4648 4649 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 4650 &spa->spa_creation_version, B_FALSE); 4651 if (error != 0 && error != ENOENT) 4652 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4653 4654 /* 4655 * Load the persistent error log. If we have an older pool, this will 4656 * not be present. 4657 */ 4658 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last, 4659 B_FALSE); 4660 if (error != 0 && error != ENOENT) 4661 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4662 4663 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 4664 &spa->spa_errlog_scrub, B_FALSE); 4665 if (error != 0 && error != ENOENT) 4666 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4667 4668 /* 4669 * Load the livelist deletion field. If a livelist is queued for 4670 * deletion, indicate that in the spa 4671 */ 4672 error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES, 4673 &spa->spa_livelists_to_delete, B_FALSE); 4674 if (error != 0 && error != ENOENT) 4675 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4676 4677 /* 4678 * Load the history object. If we have an older pool, this 4679 * will not be present. 4680 */ 4681 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE); 4682 if (error != 0 && error != ENOENT) 4683 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4684 4685 /* 4686 * Load the per-vdev ZAP map. If we have an older pool, this will not 4687 * be present; in this case, defer its creation to a later time to 4688 * avoid dirtying the MOS this early / out of sync context. See 4689 * spa_sync_config_object. 4690 */ 4691 4692 /* The sentinel is only available in the MOS config. */ 4693 nvlist_t *mos_config; 4694 if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { 4695 spa_load_failed(spa, "unable to retrieve MOS config"); 4696 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4697 } 4698 4699 error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, 4700 &spa->spa_all_vdev_zaps, B_FALSE); 4701 4702 if (error == ENOENT) { 4703 VERIFY(!nvlist_exists(mos_config, 4704 ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 4705 spa->spa_avz_action = AVZ_ACTION_INITIALIZE; 4706 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 4707 } else if (error != 0) { 4708 nvlist_free(mos_config); 4709 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4710 } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { 4711 /* 4712 * An older version of ZFS overwrote the sentinel value, so 4713 * we have orphaned per-vdev ZAPs in the MOS. Defer their 4714 * destruction to later; see spa_sync_config_object. 4715 */ 4716 spa->spa_avz_action = AVZ_ACTION_DESTROY; 4717 /* 4718 * We're assuming that no vdevs have had their ZAPs created 4719 * before this. Better be sure of it. 4720 */ 4721 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 4722 } 4723 nvlist_free(mos_config); 4724 4725 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 4726 4727 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object, 4728 B_FALSE); 4729 if (error && error != ENOENT) 4730 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4731 4732 if (error == 0) { 4733 uint64_t autoreplace = 0; 4734 4735 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 4736 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 4737 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 4738 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 4739 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 4740 spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost); 4741 spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim); 4742 spa->spa_autoreplace = (autoreplace != 0); 4743 } 4744 4745 /* 4746 * If we are importing a pool with missing top-level vdevs, 4747 * we enforce that the pool doesn't panic or get suspended on 4748 * error since the likelihood of missing data is extremely high. 4749 */ 4750 if (spa->spa_missing_tvds > 0 && 4751 spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE && 4752 spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 4753 spa_load_note(spa, "forcing failmode to 'continue' " 4754 "as some top level vdevs are missing"); 4755 spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE; 4756 } 4757 4758 return (0); 4759 } 4760 4761 static int 4762 spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type) 4763 { 4764 int error = 0; 4765 vdev_t *rvd = spa->spa_root_vdev; 4766 4767 /* 4768 * If we're assembling the pool from the split-off vdevs of 4769 * an existing pool, we don't want to attach the spares & cache 4770 * devices. 4771 */ 4772 4773 /* 4774 * Load any hot spares for this pool. 4775 */ 4776 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object, 4777 B_FALSE); 4778 if (error != 0 && error != ENOENT) 4779 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4780 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 4781 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 4782 if (load_nvlist(spa, spa->spa_spares.sav_object, 4783 &spa->spa_spares.sav_config) != 0) { 4784 spa_load_failed(spa, "error loading spares nvlist"); 4785 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4786 } 4787 4788 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4789 spa_load_spares(spa); 4790 spa_config_exit(spa, SCL_ALL, FTAG); 4791 } else if (error == 0) { 4792 spa->spa_spares.sav_sync = B_TRUE; 4793 } 4794 4795 /* 4796 * Load any level 2 ARC devices for this pool. 4797 */ 4798 error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 4799 &spa->spa_l2cache.sav_object, B_FALSE); 4800 if (error != 0 && error != ENOENT) 4801 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4802 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 4803 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 4804 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 4805 &spa->spa_l2cache.sav_config) != 0) { 4806 spa_load_failed(spa, "error loading l2cache nvlist"); 4807 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4808 } 4809 4810 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4811 spa_load_l2cache(spa); 4812 spa_config_exit(spa, SCL_ALL, FTAG); 4813 } else if (error == 0) { 4814 spa->spa_l2cache.sav_sync = B_TRUE; 4815 } 4816 4817 return (0); 4818 } 4819 4820 static int 4821 spa_ld_load_vdev_metadata(spa_t *spa) 4822 { 4823 int error = 0; 4824 vdev_t *rvd = spa->spa_root_vdev; 4825 4826 /* 4827 * If the 'multihost' property is set, then never allow a pool to 4828 * be imported when the system hostid is zero. The exception to 4829 * this rule is zdb which is always allowed to access pools. 4830 */ 4831 if (spa_multihost(spa) && spa_get_hostid(spa) == 0 && 4832 (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) { 4833 fnvlist_add_uint64(spa->spa_load_info, 4834 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); 4835 return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); 4836 } 4837 4838 /* 4839 * If the 'autoreplace' property is set, then post a resource notifying 4840 * the ZFS DE that it should not issue any faults for unopenable 4841 * devices. We also iterate over the vdevs, and post a sysevent for any 4842 * unopenable vdevs so that the normal autoreplace handler can take 4843 * over. 4844 */ 4845 if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 4846 spa_check_removed(spa->spa_root_vdev); 4847 /* 4848 * For the import case, this is done in spa_import(), because 4849 * at this point we're using the spare definitions from 4850 * the MOS config, not necessarily from the userland config. 4851 */ 4852 if (spa->spa_load_state != SPA_LOAD_IMPORT) { 4853 spa_aux_check_removed(&spa->spa_spares); 4854 spa_aux_check_removed(&spa->spa_l2cache); 4855 } 4856 } 4857 4858 /* 4859 * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc. 4860 */ 4861 error = vdev_load(rvd); 4862 if (error != 0) { 4863 spa_load_failed(spa, "vdev_load failed [error=%d]", error); 4864 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4865 } 4866 4867 error = spa_ld_log_spacemaps(spa); 4868 if (error != 0) { 4869 spa_load_failed(spa, "spa_ld_log_spacemaps failed [error=%d]", 4870 error); 4871 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4872 } 4873 4874 /* 4875 * Propagate the leaf DTLs we just loaded all the way up the vdev tree. 4876 */ 4877 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4878 vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE); 4879 spa_config_exit(spa, SCL_ALL, FTAG); 4880 4881 return (0); 4882 } 4883 4884 static int 4885 spa_ld_load_dedup_tables(spa_t *spa) 4886 { 4887 int error = 0; 4888 vdev_t *rvd = spa->spa_root_vdev; 4889 4890 error = ddt_load(spa); 4891 if (error != 0) { 4892 spa_load_failed(spa, "ddt_load failed [error=%d]", error); 4893 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4894 } 4895 4896 return (0); 4897 } 4898 4899 static int 4900 spa_ld_load_brt(spa_t *spa) 4901 { 4902 int error = 0; 4903 vdev_t *rvd = spa->spa_root_vdev; 4904 4905 error = brt_load(spa); 4906 if (error != 0) { 4907 spa_load_failed(spa, "brt_load failed [error=%d]", error); 4908 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4909 } 4910 4911 return (0); 4912 } 4913 4914 static int 4915 spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, const char **ereport) 4916 { 4917 vdev_t *rvd = spa->spa_root_vdev; 4918 4919 if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) { 4920 boolean_t missing = spa_check_logs(spa); 4921 if (missing) { 4922 if (spa->spa_missing_tvds != 0) { 4923 spa_load_note(spa, "spa_check_logs failed " 4924 "so dropping the logs"); 4925 } else { 4926 *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 4927 spa_load_failed(spa, "spa_check_logs failed"); 4928 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, 4929 ENXIO)); 4930 } 4931 } 4932 } 4933 4934 return (0); 4935 } 4936 4937 static int 4938 spa_ld_verify_pool_data(spa_t *spa) 4939 { 4940 int error = 0; 4941 vdev_t *rvd = spa->spa_root_vdev; 4942 4943 /* 4944 * We've successfully opened the pool, verify that we're ready 4945 * to start pushing transactions. 4946 */ 4947 if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 4948 error = spa_load_verify(spa); 4949 if (error != 0) { 4950 spa_load_failed(spa, "spa_load_verify failed " 4951 "[error=%d]", error); 4952 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 4953 error)); 4954 } 4955 } 4956 4957 return (0); 4958 } 4959 4960 static void 4961 spa_ld_claim_log_blocks(spa_t *spa) 4962 { 4963 dmu_tx_t *tx; 4964 dsl_pool_t *dp = spa_get_dsl(spa); 4965 4966 /* 4967 * Claim log blocks that haven't been committed yet. 4968 * This must all happen in a single txg. 4969 * Note: spa_claim_max_txg is updated by spa_claim_notify(), 4970 * invoked from zil_claim_log_block()'s i/o done callback. 4971 * Price of rollback is that we abandon the log. 4972 */ 4973 spa->spa_claiming = B_TRUE; 4974 4975 tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); 4976 (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 4977 zil_claim, tx, DS_FIND_CHILDREN); 4978 dmu_tx_commit(tx); 4979 4980 spa->spa_claiming = B_FALSE; 4981 4982 spa_set_log_state(spa, SPA_LOG_GOOD); 4983 } 4984 4985 static void 4986 spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg, 4987 boolean_t update_config_cache) 4988 { 4989 vdev_t *rvd = spa->spa_root_vdev; 4990 int need_update = B_FALSE; 4991 4992 /* 4993 * If the config cache is stale, or we have uninitialized 4994 * metaslabs (see spa_vdev_add()), then update the config. 4995 * 4996 * If this is a verbatim import, trust the current 4997 * in-core spa_config and update the disk labels. 4998 */ 4999 if (update_config_cache || config_cache_txg != spa->spa_config_txg || 5000 spa->spa_load_state == SPA_LOAD_IMPORT || 5001 spa->spa_load_state == SPA_LOAD_RECOVER || 5002 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 5003 need_update = B_TRUE; 5004 5005 for (int c = 0; c < rvd->vdev_children; c++) 5006 if (rvd->vdev_child[c]->vdev_ms_array == 0) 5007 need_update = B_TRUE; 5008 5009 /* 5010 * Update the config cache asynchronously in case we're the 5011 * root pool, in which case the config cache isn't writable yet. 5012 */ 5013 if (need_update) 5014 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 5015 } 5016 5017 static void 5018 spa_ld_prepare_for_reload(spa_t *spa) 5019 { 5020 spa_mode_t mode = spa->spa_mode; 5021 int async_suspended = spa->spa_async_suspended; 5022 5023 spa_unload(spa); 5024 spa_deactivate(spa); 5025 spa_activate(spa, mode); 5026 5027 /* 5028 * We save the value of spa_async_suspended as it gets reset to 0 by 5029 * spa_unload(). We want to restore it back to the original value before 5030 * returning as we might be calling spa_async_resume() later. 5031 */ 5032 spa->spa_async_suspended = async_suspended; 5033 } 5034 5035 static int 5036 spa_ld_read_checkpoint_txg(spa_t *spa) 5037 { 5038 uberblock_t checkpoint; 5039 int error = 0; 5040 5041 ASSERT0(spa->spa_checkpoint_txg); 5042 ASSERT(MUTEX_HELD(&spa_namespace_lock) || 5043 spa->spa_load_thread == curthread); 5044 5045 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 5046 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), 5047 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); 5048 5049 if (error == ENOENT) 5050 return (0); 5051 5052 if (error != 0) 5053 return (error); 5054 5055 ASSERT3U(checkpoint.ub_txg, !=, 0); 5056 ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0); 5057 ASSERT3U(checkpoint.ub_timestamp, !=, 0); 5058 spa->spa_checkpoint_txg = checkpoint.ub_txg; 5059 spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; 5060 5061 return (0); 5062 } 5063 5064 static int 5065 spa_ld_mos_init(spa_t *spa, spa_import_type_t type) 5066 { 5067 int error = 0; 5068 5069 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5070 ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); 5071 5072 /* 5073 * Never trust the config that is provided unless we are assembling 5074 * a pool following a split. 5075 * This means don't trust blkptrs and the vdev tree in general. This 5076 * also effectively puts the spa in read-only mode since 5077 * spa_writeable() checks for spa_trust_config to be true. 5078 * We will later load a trusted config from the MOS. 5079 */ 5080 if (type != SPA_IMPORT_ASSEMBLE) 5081 spa->spa_trust_config = B_FALSE; 5082 5083 /* 5084 * Parse the config provided to create a vdev tree. 5085 */ 5086 error = spa_ld_parse_config(spa, type); 5087 if (error != 0) 5088 return (error); 5089 5090 spa_import_progress_add(spa); 5091 5092 /* 5093 * Now that we have the vdev tree, try to open each vdev. This involves 5094 * opening the underlying physical device, retrieving its geometry and 5095 * probing the vdev with a dummy I/O. The state of each vdev will be set 5096 * based on the success of those operations. After this we'll be ready 5097 * to read from the vdevs. 5098 */ 5099 error = spa_ld_open_vdevs(spa); 5100 if (error != 0) 5101 return (error); 5102 5103 /* 5104 * Read the label of each vdev and make sure that the GUIDs stored 5105 * there match the GUIDs in the config provided. 5106 * If we're assembling a new pool that's been split off from an 5107 * existing pool, the labels haven't yet been updated so we skip 5108 * validation for now. 5109 */ 5110 if (type != SPA_IMPORT_ASSEMBLE) { 5111 error = spa_ld_validate_vdevs(spa); 5112 if (error != 0) 5113 return (error); 5114 } 5115 5116 /* 5117 * Read all vdev labels to find the best uberblock (i.e. latest, 5118 * unless spa_load_max_txg is set) and store it in spa_uberblock. We 5119 * get the list of features required to read blkptrs in the MOS from 5120 * the vdev label with the best uberblock and verify that our version 5121 * of zfs supports them all. 5122 */ 5123 error = spa_ld_select_uberblock(spa, type); 5124 if (error != 0) 5125 return (error); 5126 5127 /* 5128 * Pass that uberblock to the dsl_pool layer which will open the root 5129 * blkptr. This blkptr points to the latest version of the MOS and will 5130 * allow us to read its contents. 5131 */ 5132 error = spa_ld_open_rootbp(spa); 5133 if (error != 0) 5134 return (error); 5135 5136 return (0); 5137 } 5138 5139 static int 5140 spa_ld_checkpoint_rewind(spa_t *spa) 5141 { 5142 uberblock_t checkpoint; 5143 int error = 0; 5144 5145 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5146 ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 5147 5148 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 5149 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), 5150 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); 5151 5152 if (error != 0) { 5153 spa_load_failed(spa, "unable to retrieve checkpointed " 5154 "uberblock from the MOS config [error=%d]", error); 5155 5156 if (error == ENOENT) 5157 error = ZFS_ERR_NO_CHECKPOINT; 5158 5159 return (error); 5160 } 5161 5162 ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg); 5163 ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg); 5164 5165 /* 5166 * We need to update the txg and timestamp of the checkpointed 5167 * uberblock to be higher than the latest one. This ensures that 5168 * the checkpointed uberblock is selected if we were to close and 5169 * reopen the pool right after we've written it in the vdev labels. 5170 * (also see block comment in vdev_uberblock_compare) 5171 */ 5172 checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1; 5173 checkpoint.ub_timestamp = gethrestime_sec(); 5174 5175 /* 5176 * Set current uberblock to be the checkpointed uberblock. 5177 */ 5178 spa->spa_uberblock = checkpoint; 5179 5180 /* 5181 * If we are doing a normal rewind, then the pool is open for 5182 * writing and we sync the "updated" checkpointed uberblock to 5183 * disk. Once this is done, we've basically rewound the whole 5184 * pool and there is no way back. 5185 * 5186 * There are cases when we don't want to attempt and sync the 5187 * checkpointed uberblock to disk because we are opening a 5188 * pool as read-only. Specifically, verifying the checkpointed 5189 * state with zdb, and importing the checkpointed state to get 5190 * a "preview" of its content. 5191 */ 5192 if (spa_writeable(spa)) { 5193 vdev_t *rvd = spa->spa_root_vdev; 5194 5195 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5196 vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; 5197 int svdcount = 0; 5198 int children = rvd->vdev_children; 5199 int c0 = random_in_range(children); 5200 5201 for (int c = 0; c < children; c++) { 5202 vdev_t *vd = rvd->vdev_child[(c0 + c) % children]; 5203 5204 /* Stop when revisiting the first vdev */ 5205 if (c > 0 && svd[0] == vd) 5206 break; 5207 5208 if (vd->vdev_ms_array == 0 || vd->vdev_islog || 5209 !vdev_is_concrete(vd)) 5210 continue; 5211 5212 svd[svdcount++] = vd; 5213 if (svdcount == SPA_SYNC_MIN_VDEVS) 5214 break; 5215 } 5216 error = vdev_config_sync(svd, svdcount, spa->spa_first_txg); 5217 if (error == 0) 5218 spa->spa_last_synced_guid = rvd->vdev_guid; 5219 spa_config_exit(spa, SCL_ALL, FTAG); 5220 5221 if (error != 0) { 5222 spa_load_failed(spa, "failed to write checkpointed " 5223 "uberblock to the vdev labels [error=%d]", error); 5224 return (error); 5225 } 5226 } 5227 5228 return (0); 5229 } 5230 5231 static int 5232 spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type, 5233 boolean_t *update_config_cache) 5234 { 5235 int error; 5236 5237 /* 5238 * Parse the config for pool, open and validate vdevs, 5239 * select an uberblock, and use that uberblock to open 5240 * the MOS. 5241 */ 5242 error = spa_ld_mos_init(spa, type); 5243 if (error != 0) 5244 return (error); 5245 5246 /* 5247 * Retrieve the trusted config stored in the MOS and use it to create 5248 * a new, exact version of the vdev tree, then reopen all vdevs. 5249 */ 5250 error = spa_ld_trusted_config(spa, type, B_FALSE); 5251 if (error == EAGAIN) { 5252 if (update_config_cache != NULL) 5253 *update_config_cache = B_TRUE; 5254 5255 /* 5256 * Redo the loading process with the trusted config if it is 5257 * too different from the untrusted config. 5258 */ 5259 spa_ld_prepare_for_reload(spa); 5260 spa_load_note(spa, "RELOADING"); 5261 error = spa_ld_mos_init(spa, type); 5262 if (error != 0) 5263 return (error); 5264 5265 error = spa_ld_trusted_config(spa, type, B_TRUE); 5266 if (error != 0) 5267 return (error); 5268 5269 } else if (error != 0) { 5270 return (error); 5271 } 5272 5273 return (0); 5274 } 5275 5276 /* 5277 * Load an existing storage pool, using the config provided. This config 5278 * describes which vdevs are part of the pool and is later validated against 5279 * partial configs present in each vdev's label and an entire copy of the 5280 * config stored in the MOS. 5281 */ 5282 static int 5283 spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) 5284 { 5285 int error = 0; 5286 boolean_t missing_feat_write = B_FALSE; 5287 boolean_t checkpoint_rewind = 5288 (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 5289 boolean_t update_config_cache = B_FALSE; 5290 hrtime_t load_start = gethrtime(); 5291 5292 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 5293 ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); 5294 5295 spa_load_note(spa, "LOADING"); 5296 5297 error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache); 5298 if (error != 0) 5299 return (error); 5300 5301 /* 5302 * If we are rewinding to the checkpoint then we need to repeat 5303 * everything we've done so far in this function but this time 5304 * selecting the checkpointed uberblock and using that to open 5305 * the MOS. 5306 */ 5307 if (checkpoint_rewind) { 5308 /* 5309 * If we are rewinding to the checkpoint update config cache 5310 * anyway. 5311 */ 5312 update_config_cache = B_TRUE; 5313 5314 /* 5315 * Extract the checkpointed uberblock from the current MOS 5316 * and use this as the pool's uberblock from now on. If the 5317 * pool is imported as writeable we also write the checkpoint 5318 * uberblock to the labels, making the rewind permanent. 5319 */ 5320 error = spa_ld_checkpoint_rewind(spa); 5321 if (error != 0) 5322 return (error); 5323 5324 /* 5325 * Redo the loading process again with the 5326 * checkpointed uberblock. 5327 */ 5328 spa_ld_prepare_for_reload(spa); 5329 spa_load_note(spa, "LOADING checkpointed uberblock"); 5330 error = spa_ld_mos_with_trusted_config(spa, type, NULL); 5331 if (error != 0) 5332 return (error); 5333 } 5334 5335 /* 5336 * Drop the namespace lock for the rest of the function. 5337 */ 5338 spa->spa_load_thread = curthread; 5339 mutex_exit(&spa_namespace_lock); 5340 5341 /* 5342 * Retrieve the checkpoint txg if the pool has a checkpoint. 5343 */ 5344 spa_import_progress_set_notes(spa, "Loading checkpoint txg"); 5345 error = spa_ld_read_checkpoint_txg(spa); 5346 if (error != 0) 5347 goto fail; 5348 5349 /* 5350 * Retrieve the mapping of indirect vdevs. Those vdevs were removed 5351 * from the pool and their contents were re-mapped to other vdevs. Note 5352 * that everything that we read before this step must have been 5353 * rewritten on concrete vdevs after the last device removal was 5354 * initiated. Otherwise we could be reading from indirect vdevs before 5355 * we have loaded their mappings. 5356 */ 5357 spa_import_progress_set_notes(spa, "Loading indirect vdev metadata"); 5358 error = spa_ld_open_indirect_vdev_metadata(spa); 5359 if (error != 0) 5360 goto fail; 5361 5362 /* 5363 * Retrieve the full list of active features from the MOS and check if 5364 * they are all supported. 5365 */ 5366 spa_import_progress_set_notes(spa, "Checking feature flags"); 5367 error = spa_ld_check_features(spa, &missing_feat_write); 5368 if (error != 0) 5369 goto fail; 5370 5371 /* 5372 * Load several special directories from the MOS needed by the dsl_pool 5373 * layer. 5374 */ 5375 spa_import_progress_set_notes(spa, "Loading special MOS directories"); 5376 error = spa_ld_load_special_directories(spa); 5377 if (error != 0) 5378 goto fail; 5379 5380 /* 5381 * Retrieve pool properties from the MOS. 5382 */ 5383 spa_import_progress_set_notes(spa, "Loading properties"); 5384 error = spa_ld_get_props(spa); 5385 if (error != 0) 5386 goto fail; 5387 5388 /* 5389 * Retrieve the list of auxiliary devices - cache devices and spares - 5390 * and open them. 5391 */ 5392 spa_import_progress_set_notes(spa, "Loading AUX vdevs"); 5393 error = spa_ld_open_aux_vdevs(spa, type); 5394 if (error != 0) 5395 goto fail; 5396 5397 /* 5398 * Load the metadata for all vdevs. Also check if unopenable devices 5399 * should be autoreplaced. 5400 */ 5401 spa_import_progress_set_notes(spa, "Loading vdev metadata"); 5402 error = spa_ld_load_vdev_metadata(spa); 5403 if (error != 0) 5404 goto fail; 5405 5406 spa_import_progress_set_notes(spa, "Loading dedup tables"); 5407 error = spa_ld_load_dedup_tables(spa); 5408 if (error != 0) 5409 goto fail; 5410 5411 spa_import_progress_set_notes(spa, "Loading BRT"); 5412 error = spa_ld_load_brt(spa); 5413 if (error != 0) 5414 goto fail; 5415 5416 /* 5417 * Verify the logs now to make sure we don't have any unexpected errors 5418 * when we claim log blocks later. 5419 */ 5420 spa_import_progress_set_notes(spa, "Verifying Log Devices"); 5421 error = spa_ld_verify_logs(spa, type, ereport); 5422 if (error != 0) 5423 goto fail; 5424 5425 if (missing_feat_write) { 5426 ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT); 5427 5428 /* 5429 * At this point, we know that we can open the pool in 5430 * read-only mode but not read-write mode. We now have enough 5431 * information and can return to userland. 5432 */ 5433 error = spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, 5434 ENOTSUP); 5435 goto fail; 5436 } 5437 5438 /* 5439 * Traverse the last txgs to make sure the pool was left off in a safe 5440 * state. When performing an extreme rewind, we verify the whole pool, 5441 * which can take a very long time. 5442 */ 5443 spa_import_progress_set_notes(spa, "Verifying pool data"); 5444 error = spa_ld_verify_pool_data(spa); 5445 if (error != 0) 5446 goto fail; 5447 5448 /* 5449 * Calculate the deflated space for the pool. This must be done before 5450 * we write anything to the pool because we'd need to update the space 5451 * accounting using the deflated sizes. 5452 */ 5453 spa_import_progress_set_notes(spa, "Calculating deflated space"); 5454 spa_update_dspace(spa); 5455 5456 /* 5457 * We have now retrieved all the information we needed to open the 5458 * pool. If we are importing the pool in read-write mode, a few 5459 * additional steps must be performed to finish the import. 5460 */ 5461 spa_import_progress_set_notes(spa, "Starting import"); 5462 if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER || 5463 spa->spa_load_max_txg == UINT64_MAX)) { 5464 uint64_t config_cache_txg = spa->spa_config_txg; 5465 5466 ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); 5467 5468 /* 5469 * Before we do any zio_write's, complete the raidz expansion 5470 * scratch space copying, if necessary. 5471 */ 5472 if (RRSS_GET_STATE(&spa->spa_uberblock) == RRSS_SCRATCH_VALID) 5473 vdev_raidz_reflow_copy_scratch(spa); 5474 5475 /* 5476 * In case of a checkpoint rewind, log the original txg 5477 * of the checkpointed uberblock. 5478 */ 5479 if (checkpoint_rewind) { 5480 spa_history_log_internal(spa, "checkpoint rewind", 5481 NULL, "rewound state to txg=%llu", 5482 (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg); 5483 } 5484 5485 spa_import_progress_set_notes(spa, "Claiming ZIL blocks"); 5486 /* 5487 * Traverse the ZIL and claim all blocks. 5488 */ 5489 spa_ld_claim_log_blocks(spa); 5490 5491 /* 5492 * Kick-off the syncing thread. 5493 */ 5494 spa->spa_sync_on = B_TRUE; 5495 txg_sync_start(spa->spa_dsl_pool); 5496 mmp_thread_start(spa); 5497 5498 /* 5499 * Wait for all claims to sync. We sync up to the highest 5500 * claimed log block birth time so that claimed log blocks 5501 * don't appear to be from the future. spa_claim_max_txg 5502 * will have been set for us by ZIL traversal operations 5503 * performed above. 5504 */ 5505 spa_import_progress_set_notes(spa, "Syncing ZIL claims"); 5506 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 5507 5508 /* 5509 * Check if we need to request an update of the config. On the 5510 * next sync, we would update the config stored in vdev labels 5511 * and the cachefile (by default /etc/zfs/zpool.cache). 5512 */ 5513 spa_import_progress_set_notes(spa, "Updating configs"); 5514 spa_ld_check_for_config_update(spa, config_cache_txg, 5515 update_config_cache); 5516 5517 /* 5518 * Check if a rebuild was in progress and if so resume it. 5519 * Then check all DTLs to see if anything needs resilvering. 5520 * The resilver will be deferred if a rebuild was started. 5521 */ 5522 spa_import_progress_set_notes(spa, "Starting resilvers"); 5523 if (vdev_rebuild_active(spa->spa_root_vdev)) { 5524 vdev_rebuild_restart(spa); 5525 } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 5526 vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 5527 spa_async_request(spa, SPA_ASYNC_RESILVER); 5528 } 5529 5530 /* 5531 * Log the fact that we booted up (so that we can detect if 5532 * we rebooted in the middle of an operation). 5533 */ 5534 spa_history_log_version(spa, "open", NULL); 5535 5536 spa_import_progress_set_notes(spa, 5537 "Restarting device removals"); 5538 spa_restart_removal(spa); 5539 spa_spawn_aux_threads(spa); 5540 5541 /* 5542 * Delete any inconsistent datasets. 5543 * 5544 * Note: 5545 * Since we may be issuing deletes for clones here, 5546 * we make sure to do so after we've spawned all the 5547 * auxiliary threads above (from which the livelist 5548 * deletion zthr is part of). 5549 */ 5550 spa_import_progress_set_notes(spa, 5551 "Cleaning up inconsistent objsets"); 5552 (void) dmu_objset_find(spa_name(spa), 5553 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 5554 5555 /* 5556 * Clean up any stale temporary dataset userrefs. 5557 */ 5558 spa_import_progress_set_notes(spa, 5559 "Cleaning up temporary userrefs"); 5560 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 5561 5562 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5563 spa_import_progress_set_notes(spa, "Restarting initialize"); 5564 vdev_initialize_restart(spa->spa_root_vdev); 5565 spa_import_progress_set_notes(spa, "Restarting TRIM"); 5566 vdev_trim_restart(spa->spa_root_vdev); 5567 vdev_autotrim_restart(spa); 5568 spa_config_exit(spa, SCL_CONFIG, FTAG); 5569 spa_import_progress_set_notes(spa, "Finished importing"); 5570 } 5571 zio_handle_import_delay(spa, gethrtime() - load_start); 5572 5573 spa_import_progress_remove(spa_guid(spa)); 5574 spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); 5575 5576 spa_load_note(spa, "LOADED"); 5577 fail: 5578 mutex_enter(&spa_namespace_lock); 5579 spa->spa_load_thread = NULL; 5580 cv_broadcast(&spa_namespace_cv); 5581 5582 return (error); 5583 5584 } 5585 5586 static int 5587 spa_load_retry(spa_t *spa, spa_load_state_t state) 5588 { 5589 spa_mode_t mode = spa->spa_mode; 5590 5591 spa_unload(spa); 5592 spa_deactivate(spa); 5593 5594 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; 5595 5596 spa_activate(spa, mode); 5597 spa_async_suspend(spa); 5598 5599 spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu", 5600 (u_longlong_t)spa->spa_load_max_txg); 5601 5602 return (spa_load(spa, state, SPA_IMPORT_EXISTING)); 5603 } 5604 5605 /* 5606 * If spa_load() fails this function will try loading prior txg's. If 5607 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool 5608 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this 5609 * function will not rewind the pool and will return the same error as 5610 * spa_load(). 5611 */ 5612 static int 5613 spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request, 5614 int rewind_flags) 5615 { 5616 nvlist_t *loadinfo = NULL; 5617 nvlist_t *config = NULL; 5618 int load_error, rewind_error; 5619 uint64_t safe_rewind_txg; 5620 uint64_t min_txg; 5621 5622 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 5623 spa->spa_load_max_txg = spa->spa_load_txg; 5624 spa_set_log_state(spa, SPA_LOG_CLEAR); 5625 } else { 5626 spa->spa_load_max_txg = max_request; 5627 if (max_request != UINT64_MAX) 5628 spa->spa_extreme_rewind = B_TRUE; 5629 } 5630 5631 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING); 5632 if (load_error == 0) 5633 return (0); 5634 if (load_error == ZFS_ERR_NO_CHECKPOINT) { 5635 /* 5636 * When attempting checkpoint-rewind on a pool with no 5637 * checkpoint, we should not attempt to load uberblocks 5638 * from previous txgs when spa_load fails. 5639 */ 5640 ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 5641 spa_import_progress_remove(spa_guid(spa)); 5642 return (load_error); 5643 } 5644 5645 if (spa->spa_root_vdev != NULL) 5646 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 5647 5648 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 5649 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 5650 5651 if (rewind_flags & ZPOOL_NEVER_REWIND) { 5652 nvlist_free(config); 5653 spa_import_progress_remove(spa_guid(spa)); 5654 return (load_error); 5655 } 5656 5657 if (state == SPA_LOAD_RECOVER) { 5658 /* Price of rolling back is discarding txgs, including log */ 5659 spa_set_log_state(spa, SPA_LOG_CLEAR); 5660 } else { 5661 /* 5662 * If we aren't rolling back save the load info from our first 5663 * import attempt so that we can restore it after attempting 5664 * to rewind. 5665 */ 5666 loadinfo = spa->spa_load_info; 5667 spa->spa_load_info = fnvlist_alloc(); 5668 } 5669 5670 spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 5671 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 5672 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 5673 TXG_INITIAL : safe_rewind_txg; 5674 5675 /* 5676 * Continue as long as we're finding errors, we're still within 5677 * the acceptable rewind range, and we're still finding uberblocks 5678 */ 5679 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 5680 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 5681 if (spa->spa_load_max_txg < safe_rewind_txg) 5682 spa->spa_extreme_rewind = B_TRUE; 5683 rewind_error = spa_load_retry(spa, state); 5684 } 5685 5686 spa->spa_extreme_rewind = B_FALSE; 5687 spa->spa_load_max_txg = UINT64_MAX; 5688 5689 if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 5690 spa_config_set(spa, config); 5691 else 5692 nvlist_free(config); 5693 5694 if (state == SPA_LOAD_RECOVER) { 5695 ASSERT3P(loadinfo, ==, NULL); 5696 spa_import_progress_remove(spa_guid(spa)); 5697 return (rewind_error); 5698 } else { 5699 /* Store the rewind info as part of the initial load info */ 5700 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, 5701 spa->spa_load_info); 5702 5703 /* Restore the initial load info */ 5704 fnvlist_free(spa->spa_load_info); 5705 spa->spa_load_info = loadinfo; 5706 5707 spa_import_progress_remove(spa_guid(spa)); 5708 return (load_error); 5709 } 5710 } 5711 5712 /* 5713 * Pool Open/Import 5714 * 5715 * The import case is identical to an open except that the configuration is sent 5716 * down from userland, instead of grabbed from the configuration cache. For the 5717 * case of an open, the pool configuration will exist in the 5718 * POOL_STATE_UNINITIALIZED state. 5719 * 5720 * The stats information (gen/count/ustats) is used to gather vdev statistics at 5721 * the same time open the pool, without having to keep around the spa_t in some 5722 * ambiguous state. 5723 */ 5724 static int 5725 spa_open_common(const char *pool, spa_t **spapp, const void *tag, 5726 nvlist_t *nvpolicy, nvlist_t **config) 5727 { 5728 spa_t *spa; 5729 spa_load_state_t state = SPA_LOAD_OPEN; 5730 int error; 5731 int locked = B_FALSE; 5732 int firstopen = B_FALSE; 5733 5734 *spapp = NULL; 5735 5736 /* 5737 * As disgusting as this is, we need to support recursive calls to this 5738 * function because dsl_dir_open() is called during spa_load(), and ends 5739 * up calling spa_open() again. The real fix is to figure out how to 5740 * avoid dsl_dir_open() calling this in the first place. 5741 */ 5742 if (MUTEX_NOT_HELD(&spa_namespace_lock)) { 5743 mutex_enter(&spa_namespace_lock); 5744 locked = B_TRUE; 5745 } 5746 5747 if ((spa = spa_lookup(pool)) == NULL) { 5748 if (locked) 5749 mutex_exit(&spa_namespace_lock); 5750 return (SET_ERROR(ENOENT)); 5751 } 5752 5753 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 5754 zpool_load_policy_t policy; 5755 5756 firstopen = B_TRUE; 5757 5758 zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config, 5759 &policy); 5760 if (policy.zlp_rewind & ZPOOL_DO_REWIND) 5761 state = SPA_LOAD_RECOVER; 5762 5763 spa_activate(spa, spa_mode_global); 5764 5765 if (state != SPA_LOAD_RECOVER) 5766 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 5767 spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; 5768 5769 zfs_dbgmsg("spa_open_common: opening %s", pool); 5770 error = spa_load_best(spa, state, policy.zlp_txg, 5771 policy.zlp_rewind); 5772 5773 if (error == EBADF) { 5774 /* 5775 * If vdev_validate() returns failure (indicated by 5776 * EBADF), it indicates that one of the vdevs indicates 5777 * that the pool has been exported or destroyed. If 5778 * this is the case, the config cache is out of sync and 5779 * we should remove the pool from the namespace. 5780 */ 5781 spa_unload(spa); 5782 spa_deactivate(spa); 5783 spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE); 5784 spa_remove(spa); 5785 if (locked) 5786 mutex_exit(&spa_namespace_lock); 5787 return (SET_ERROR(ENOENT)); 5788 } 5789 5790 if (error) { 5791 /* 5792 * We can't open the pool, but we still have useful 5793 * information: the state of each vdev after the 5794 * attempted vdev_open(). Return this to the user. 5795 */ 5796 if (config != NULL && spa->spa_config) { 5797 *config = fnvlist_dup(spa->spa_config); 5798 fnvlist_add_nvlist(*config, 5799 ZPOOL_CONFIG_LOAD_INFO, 5800 spa->spa_load_info); 5801 } 5802 spa_unload(spa); 5803 spa_deactivate(spa); 5804 spa->spa_last_open_failed = error; 5805 if (locked) 5806 mutex_exit(&spa_namespace_lock); 5807 *spapp = NULL; 5808 return (error); 5809 } 5810 } 5811 5812 spa_open_ref(spa, tag); 5813 5814 if (config != NULL) 5815 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 5816 5817 /* 5818 * If we've recovered the pool, pass back any information we 5819 * gathered while doing the load. 5820 */ 5821 if (state == SPA_LOAD_RECOVER && config != NULL) { 5822 fnvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 5823 spa->spa_load_info); 5824 } 5825 5826 if (locked) { 5827 spa->spa_last_open_failed = 0; 5828 spa->spa_last_ubsync_txg = 0; 5829 spa->spa_load_txg = 0; 5830 mutex_exit(&spa_namespace_lock); 5831 } 5832 5833 if (firstopen) 5834 zvol_create_minors_recursive(spa_name(spa)); 5835 5836 *spapp = spa; 5837 5838 return (0); 5839 } 5840 5841 int 5842 spa_open_rewind(const char *name, spa_t **spapp, const void *tag, 5843 nvlist_t *policy, nvlist_t **config) 5844 { 5845 return (spa_open_common(name, spapp, tag, policy, config)); 5846 } 5847 5848 int 5849 spa_open(const char *name, spa_t **spapp, const void *tag) 5850 { 5851 return (spa_open_common(name, spapp, tag, NULL, NULL)); 5852 } 5853 5854 /* 5855 * Lookup the given spa_t, incrementing the inject count in the process, 5856 * preventing it from being exported or destroyed. 5857 */ 5858 spa_t * 5859 spa_inject_addref(char *name) 5860 { 5861 spa_t *spa; 5862 5863 mutex_enter(&spa_namespace_lock); 5864 if ((spa = spa_lookup(name)) == NULL) { 5865 mutex_exit(&spa_namespace_lock); 5866 return (NULL); 5867 } 5868 spa->spa_inject_ref++; 5869 mutex_exit(&spa_namespace_lock); 5870 5871 return (spa); 5872 } 5873 5874 void 5875 spa_inject_delref(spa_t *spa) 5876 { 5877 mutex_enter(&spa_namespace_lock); 5878 spa->spa_inject_ref--; 5879 mutex_exit(&spa_namespace_lock); 5880 } 5881 5882 /* 5883 * Add spares device information to the nvlist. 5884 */ 5885 static void 5886 spa_add_spares(spa_t *spa, nvlist_t *config) 5887 { 5888 nvlist_t **spares; 5889 uint_t i, nspares; 5890 nvlist_t *nvroot; 5891 uint64_t guid; 5892 vdev_stat_t *vs; 5893 uint_t vsc; 5894 uint64_t pool; 5895 5896 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 5897 5898 if (spa->spa_spares.sav_count == 0) 5899 return; 5900 5901 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 5902 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 5903 ZPOOL_CONFIG_SPARES, &spares, &nspares)); 5904 if (nspares != 0) { 5905 fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 5906 (const nvlist_t * const *)spares, nspares); 5907 VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 5908 &spares, &nspares)); 5909 5910 /* 5911 * Go through and find any spares which have since been 5912 * repurposed as an active spare. If this is the case, update 5913 * their status appropriately. 5914 */ 5915 for (i = 0; i < nspares; i++) { 5916 guid = fnvlist_lookup_uint64(spares[i], 5917 ZPOOL_CONFIG_GUID); 5918 VERIFY0(nvlist_lookup_uint64_array(spares[i], 5919 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)); 5920 if (spa_spare_exists(guid, &pool, NULL) && 5921 pool != 0ULL) { 5922 vs->vs_state = VDEV_STATE_CANT_OPEN; 5923 vs->vs_aux = VDEV_AUX_SPARED; 5924 } else { 5925 vs->vs_state = 5926 spa->spa_spares.sav_vdevs[i]->vdev_state; 5927 } 5928 } 5929 } 5930 } 5931 5932 /* 5933 * Add l2cache device information to the nvlist, including vdev stats. 5934 */ 5935 static void 5936 spa_add_l2cache(spa_t *spa, nvlist_t *config) 5937 { 5938 nvlist_t **l2cache; 5939 uint_t i, j, nl2cache; 5940 nvlist_t *nvroot; 5941 uint64_t guid; 5942 vdev_t *vd; 5943 vdev_stat_t *vs; 5944 uint_t vsc; 5945 5946 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 5947 5948 if (spa->spa_l2cache.sav_count == 0) 5949 return; 5950 5951 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 5952 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 5953 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); 5954 if (nl2cache != 0) { 5955 fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 5956 (const nvlist_t * const *)l2cache, nl2cache); 5957 VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 5958 &l2cache, &nl2cache)); 5959 5960 /* 5961 * Update level 2 cache device stats. 5962 */ 5963 5964 for (i = 0; i < nl2cache; i++) { 5965 guid = fnvlist_lookup_uint64(l2cache[i], 5966 ZPOOL_CONFIG_GUID); 5967 5968 vd = NULL; 5969 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 5970 if (guid == 5971 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 5972 vd = spa->spa_l2cache.sav_vdevs[j]; 5973 break; 5974 } 5975 } 5976 ASSERT(vd != NULL); 5977 5978 VERIFY0(nvlist_lookup_uint64_array(l2cache[i], 5979 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)); 5980 vdev_get_stats(vd, vs); 5981 vdev_config_generate_stats(vd, l2cache[i]); 5982 5983 } 5984 } 5985 } 5986 5987 static void 5988 spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features) 5989 { 5990 zap_cursor_t zc; 5991 zap_attribute_t za; 5992 5993 if (spa->spa_feat_for_read_obj != 0) { 5994 for (zap_cursor_init(&zc, spa->spa_meta_objset, 5995 spa->spa_feat_for_read_obj); 5996 zap_cursor_retrieve(&zc, &za) == 0; 5997 zap_cursor_advance(&zc)) { 5998 ASSERT(za.za_integer_length == sizeof (uint64_t) && 5999 za.za_num_integers == 1); 6000 VERIFY0(nvlist_add_uint64(features, za.za_name, 6001 za.za_first_integer)); 6002 } 6003 zap_cursor_fini(&zc); 6004 } 6005 6006 if (spa->spa_feat_for_write_obj != 0) { 6007 for (zap_cursor_init(&zc, spa->spa_meta_objset, 6008 spa->spa_feat_for_write_obj); 6009 zap_cursor_retrieve(&zc, &za) == 0; 6010 zap_cursor_advance(&zc)) { 6011 ASSERT(za.za_integer_length == sizeof (uint64_t) && 6012 za.za_num_integers == 1); 6013 VERIFY0(nvlist_add_uint64(features, za.za_name, 6014 za.za_first_integer)); 6015 } 6016 zap_cursor_fini(&zc); 6017 } 6018 } 6019 6020 static void 6021 spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features) 6022 { 6023 int i; 6024 6025 for (i = 0; i < SPA_FEATURES; i++) { 6026 zfeature_info_t feature = spa_feature_table[i]; 6027 uint64_t refcount; 6028 6029 if (feature_get_refcount(spa, &feature, &refcount) != 0) 6030 continue; 6031 6032 VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount)); 6033 } 6034 } 6035 6036 /* 6037 * Store a list of pool features and their reference counts in the 6038 * config. 6039 * 6040 * The first time this is called on a spa, allocate a new nvlist, fetch 6041 * the pool features and reference counts from disk, then save the list 6042 * in the spa. In subsequent calls on the same spa use the saved nvlist 6043 * and refresh its values from the cached reference counts. This 6044 * ensures we don't block here on I/O on a suspended pool so 'zpool 6045 * clear' can resume the pool. 6046 */ 6047 static void 6048 spa_add_feature_stats(spa_t *spa, nvlist_t *config) 6049 { 6050 nvlist_t *features; 6051 6052 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 6053 6054 mutex_enter(&spa->spa_feat_stats_lock); 6055 features = spa->spa_feat_stats; 6056 6057 if (features != NULL) { 6058 spa_feature_stats_from_cache(spa, features); 6059 } else { 6060 VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP)); 6061 spa->spa_feat_stats = features; 6062 spa_feature_stats_from_disk(spa, features); 6063 } 6064 6065 VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 6066 features)); 6067 6068 mutex_exit(&spa->spa_feat_stats_lock); 6069 } 6070 6071 int 6072 spa_get_stats(const char *name, nvlist_t **config, 6073 char *altroot, size_t buflen) 6074 { 6075 int error; 6076 spa_t *spa; 6077 6078 *config = NULL; 6079 error = spa_open_common(name, &spa, FTAG, NULL, config); 6080 6081 if (spa != NULL) { 6082 /* 6083 * This still leaves a window of inconsistency where the spares 6084 * or l2cache devices could change and the config would be 6085 * self-inconsistent. 6086 */ 6087 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6088 6089 if (*config != NULL) { 6090 uint64_t loadtimes[2]; 6091 6092 loadtimes[0] = spa->spa_loaded_ts.tv_sec; 6093 loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 6094 fnvlist_add_uint64_array(*config, 6095 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2); 6096 6097 fnvlist_add_uint64(*config, 6098 ZPOOL_CONFIG_ERRCOUNT, 6099 spa_approx_errlog_size(spa)); 6100 6101 if (spa_suspended(spa)) { 6102 fnvlist_add_uint64(*config, 6103 ZPOOL_CONFIG_SUSPENDED, 6104 spa->spa_failmode); 6105 fnvlist_add_uint64(*config, 6106 ZPOOL_CONFIG_SUSPENDED_REASON, 6107 spa->spa_suspended); 6108 } 6109 6110 spa_add_spares(spa, *config); 6111 spa_add_l2cache(spa, *config); 6112 spa_add_feature_stats(spa, *config); 6113 } 6114 } 6115 6116 /* 6117 * We want to get the alternate root even for faulted pools, so we cheat 6118 * and call spa_lookup() directly. 6119 */ 6120 if (altroot) { 6121 if (spa == NULL) { 6122 mutex_enter(&spa_namespace_lock); 6123 spa = spa_lookup(name); 6124 if (spa) 6125 spa_altroot(spa, altroot, buflen); 6126 else 6127 altroot[0] = '\0'; 6128 spa = NULL; 6129 mutex_exit(&spa_namespace_lock); 6130 } else { 6131 spa_altroot(spa, altroot, buflen); 6132 } 6133 } 6134 6135 if (spa != NULL) { 6136 spa_config_exit(spa, SCL_CONFIG, FTAG); 6137 spa_close(spa, FTAG); 6138 } 6139 6140 return (error); 6141 } 6142 6143 /* 6144 * Validate that the auxiliary device array is well formed. We must have an 6145 * array of nvlists, each which describes a valid leaf vdev. If this is an 6146 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 6147 * specified, as long as they are well-formed. 6148 */ 6149 static int 6150 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 6151 spa_aux_vdev_t *sav, const char *config, uint64_t version, 6152 vdev_labeltype_t label) 6153 { 6154 nvlist_t **dev; 6155 uint_t i, ndev; 6156 vdev_t *vd; 6157 int error; 6158 6159 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 6160 6161 /* 6162 * It's acceptable to have no devs specified. 6163 */ 6164 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 6165 return (0); 6166 6167 if (ndev == 0) 6168 return (SET_ERROR(EINVAL)); 6169 6170 /* 6171 * Make sure the pool is formatted with a version that supports this 6172 * device type. 6173 */ 6174 if (spa_version(spa) < version) 6175 return (SET_ERROR(ENOTSUP)); 6176 6177 /* 6178 * Set the pending device list so we correctly handle device in-use 6179 * checking. 6180 */ 6181 sav->sav_pending = dev; 6182 sav->sav_npending = ndev; 6183 6184 for (i = 0; i < ndev; i++) { 6185 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 6186 mode)) != 0) 6187 goto out; 6188 6189 if (!vd->vdev_ops->vdev_op_leaf) { 6190 vdev_free(vd); 6191 error = SET_ERROR(EINVAL); 6192 goto out; 6193 } 6194 6195 vd->vdev_top = vd; 6196 6197 if ((error = vdev_open(vd)) == 0 && 6198 (error = vdev_label_init(vd, crtxg, label)) == 0) { 6199 fnvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 6200 vd->vdev_guid); 6201 } 6202 6203 vdev_free(vd); 6204 6205 if (error && 6206 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 6207 goto out; 6208 else 6209 error = 0; 6210 } 6211 6212 out: 6213 sav->sav_pending = NULL; 6214 sav->sav_npending = 0; 6215 return (error); 6216 } 6217 6218 static int 6219 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 6220 { 6221 int error; 6222 6223 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 6224 6225 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 6226 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 6227 VDEV_LABEL_SPARE)) != 0) { 6228 return (error); 6229 } 6230 6231 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 6232 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 6233 VDEV_LABEL_L2CACHE)); 6234 } 6235 6236 static void 6237 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 6238 const char *config) 6239 { 6240 int i; 6241 6242 if (sav->sav_config != NULL) { 6243 nvlist_t **olddevs; 6244 uint_t oldndevs; 6245 nvlist_t **newdevs; 6246 6247 /* 6248 * Generate new dev list by concatenating with the 6249 * current dev list. 6250 */ 6251 VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, config, 6252 &olddevs, &oldndevs)); 6253 6254 newdevs = kmem_alloc(sizeof (void *) * 6255 (ndevs + oldndevs), KM_SLEEP); 6256 for (i = 0; i < oldndevs; i++) 6257 newdevs[i] = fnvlist_dup(olddevs[i]); 6258 for (i = 0; i < ndevs; i++) 6259 newdevs[i + oldndevs] = fnvlist_dup(devs[i]); 6260 6261 fnvlist_remove(sav->sav_config, config); 6262 6263 fnvlist_add_nvlist_array(sav->sav_config, config, 6264 (const nvlist_t * const *)newdevs, ndevs + oldndevs); 6265 for (i = 0; i < oldndevs + ndevs; i++) 6266 nvlist_free(newdevs[i]); 6267 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 6268 } else { 6269 /* 6270 * Generate a new dev list. 6271 */ 6272 sav->sav_config = fnvlist_alloc(); 6273 fnvlist_add_nvlist_array(sav->sav_config, config, 6274 (const nvlist_t * const *)devs, ndevs); 6275 } 6276 } 6277 6278 /* 6279 * Stop and drop level 2 ARC devices 6280 */ 6281 void 6282 spa_l2cache_drop(spa_t *spa) 6283 { 6284 vdev_t *vd; 6285 int i; 6286 spa_aux_vdev_t *sav = &spa->spa_l2cache; 6287 6288 for (i = 0; i < sav->sav_count; i++) { 6289 uint64_t pool; 6290 6291 vd = sav->sav_vdevs[i]; 6292 ASSERT(vd != NULL); 6293 6294 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 6295 pool != 0ULL && l2arc_vdev_present(vd)) 6296 l2arc_remove_vdev(vd); 6297 } 6298 } 6299 6300 /* 6301 * Verify encryption parameters for spa creation. If we are encrypting, we must 6302 * have the encryption feature flag enabled. 6303 */ 6304 static int 6305 spa_create_check_encryption_params(dsl_crypto_params_t *dcp, 6306 boolean_t has_encryption) 6307 { 6308 if (dcp->cp_crypt != ZIO_CRYPT_OFF && 6309 dcp->cp_crypt != ZIO_CRYPT_INHERIT && 6310 !has_encryption) 6311 return (SET_ERROR(ENOTSUP)); 6312 6313 return (dmu_objset_create_crypt_check(NULL, dcp, NULL)); 6314 } 6315 6316 /* 6317 * Pool Creation 6318 */ 6319 int 6320 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 6321 nvlist_t *zplprops, dsl_crypto_params_t *dcp) 6322 { 6323 spa_t *spa; 6324 const char *altroot = NULL; 6325 vdev_t *rvd; 6326 dsl_pool_t *dp; 6327 dmu_tx_t *tx; 6328 int error = 0; 6329 uint64_t txg = TXG_INITIAL; 6330 nvlist_t **spares, **l2cache; 6331 uint_t nspares, nl2cache; 6332 uint64_t version, obj, ndraid = 0; 6333 boolean_t has_features; 6334 boolean_t has_encryption; 6335 boolean_t has_allocclass; 6336 spa_feature_t feat; 6337 const char *feat_name; 6338 const char *poolname; 6339 nvlist_t *nvl; 6340 6341 if (props == NULL || 6342 nvlist_lookup_string(props, 6343 zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0) 6344 poolname = (char *)pool; 6345 6346 /* 6347 * If this pool already exists, return failure. 6348 */ 6349 mutex_enter(&spa_namespace_lock); 6350 if (spa_lookup(poolname) != NULL) { 6351 mutex_exit(&spa_namespace_lock); 6352 return (SET_ERROR(EEXIST)); 6353 } 6354 6355 /* 6356 * Allocate a new spa_t structure. 6357 */ 6358 nvl = fnvlist_alloc(); 6359 fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool); 6360 (void) nvlist_lookup_string(props, 6361 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 6362 spa = spa_add(poolname, nvl, altroot); 6363 fnvlist_free(nvl); 6364 spa_activate(spa, spa_mode_global); 6365 6366 if (props && (error = spa_prop_validate(spa, props))) { 6367 spa_deactivate(spa); 6368 spa_remove(spa); 6369 mutex_exit(&spa_namespace_lock); 6370 return (error); 6371 } 6372 6373 /* 6374 * Temporary pool names should never be written to disk. 6375 */ 6376 if (poolname != pool) 6377 spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME; 6378 6379 has_features = B_FALSE; 6380 has_encryption = B_FALSE; 6381 has_allocclass = B_FALSE; 6382 for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 6383 elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 6384 if (zpool_prop_feature(nvpair_name(elem))) { 6385 has_features = B_TRUE; 6386 6387 feat_name = strchr(nvpair_name(elem), '@') + 1; 6388 VERIFY0(zfeature_lookup_name(feat_name, &feat)); 6389 if (feat == SPA_FEATURE_ENCRYPTION) 6390 has_encryption = B_TRUE; 6391 if (feat == SPA_FEATURE_ALLOCATION_CLASSES) 6392 has_allocclass = B_TRUE; 6393 } 6394 } 6395 6396 /* verify encryption params, if they were provided */ 6397 if (dcp != NULL) { 6398 error = spa_create_check_encryption_params(dcp, has_encryption); 6399 if (error != 0) { 6400 spa_deactivate(spa); 6401 spa_remove(spa); 6402 mutex_exit(&spa_namespace_lock); 6403 return (error); 6404 } 6405 } 6406 if (!has_allocclass && zfs_special_devs(nvroot, NULL)) { 6407 spa_deactivate(spa); 6408 spa_remove(spa); 6409 mutex_exit(&spa_namespace_lock); 6410 return (ENOTSUP); 6411 } 6412 6413 if (has_features || nvlist_lookup_uint64(props, 6414 zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 6415 version = SPA_VERSION; 6416 } 6417 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 6418 6419 spa->spa_first_txg = txg; 6420 spa->spa_uberblock.ub_txg = txg - 1; 6421 spa->spa_uberblock.ub_version = version; 6422 spa->spa_ubsync = spa->spa_uberblock; 6423 spa->spa_load_state = SPA_LOAD_CREATE; 6424 spa->spa_removing_phys.sr_state = DSS_NONE; 6425 spa->spa_removing_phys.sr_removing_vdev = -1; 6426 spa->spa_removing_phys.sr_prev_indirect_vdev = -1; 6427 spa->spa_indirect_vdevs_loaded = B_TRUE; 6428 6429 /* 6430 * Create "The Godfather" zio to hold all async IOs 6431 */ 6432 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 6433 KM_SLEEP); 6434 for (int i = 0; i < max_ncpus; i++) { 6435 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 6436 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 6437 ZIO_FLAG_GODFATHER); 6438 } 6439 6440 /* 6441 * Create the root vdev. 6442 */ 6443 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6444 6445 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 6446 6447 ASSERT(error != 0 || rvd != NULL); 6448 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 6449 6450 if (error == 0 && !zfs_allocatable_devs(nvroot)) 6451 error = SET_ERROR(EINVAL); 6452 6453 if (error == 0 && 6454 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 6455 (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 && 6456 (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { 6457 /* 6458 * instantiate the metaslab groups (this will dirty the vdevs) 6459 * we can no longer error exit past this point 6460 */ 6461 for (int c = 0; error == 0 && c < rvd->vdev_children; c++) { 6462 vdev_t *vd = rvd->vdev_child[c]; 6463 6464 vdev_metaslab_set_size(vd); 6465 vdev_expand(vd, txg); 6466 } 6467 } 6468 6469 spa_config_exit(spa, SCL_ALL, FTAG); 6470 6471 if (error != 0) { 6472 spa_unload(spa); 6473 spa_deactivate(spa); 6474 spa_remove(spa); 6475 mutex_exit(&spa_namespace_lock); 6476 return (error); 6477 } 6478 6479 /* 6480 * Get the list of spares, if specified. 6481 */ 6482 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 6483 &spares, &nspares) == 0) { 6484 spa->spa_spares.sav_config = fnvlist_alloc(); 6485 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 6486 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, 6487 nspares); 6488 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6489 spa_load_spares(spa); 6490 spa_config_exit(spa, SCL_ALL, FTAG); 6491 spa->spa_spares.sav_sync = B_TRUE; 6492 } 6493 6494 /* 6495 * Get the list of level 2 cache devices, if specified. 6496 */ 6497 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 6498 &l2cache, &nl2cache) == 0) { 6499 VERIFY0(nvlist_alloc(&spa->spa_l2cache.sav_config, 6500 NV_UNIQUE_NAME, KM_SLEEP)); 6501 fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 6502 ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, 6503 nl2cache); 6504 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6505 spa_load_l2cache(spa); 6506 spa_config_exit(spa, SCL_ALL, FTAG); 6507 spa->spa_l2cache.sav_sync = B_TRUE; 6508 } 6509 6510 spa->spa_is_initializing = B_TRUE; 6511 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, dcp, txg); 6512 spa->spa_is_initializing = B_FALSE; 6513 6514 /* 6515 * Create DDTs (dedup tables). 6516 */ 6517 ddt_create(spa); 6518 /* 6519 * Create BRT table and BRT table object. 6520 */ 6521 brt_create(spa); 6522 6523 spa_update_dspace(spa); 6524 6525 tx = dmu_tx_create_assigned(dp, txg); 6526 6527 /* 6528 * Create the pool's history object. 6529 */ 6530 if (version >= SPA_VERSION_ZPOOL_HISTORY && !spa->spa_history) 6531 spa_history_create_obj(spa, tx); 6532 6533 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE); 6534 spa_history_log_version(spa, "create", tx); 6535 6536 /* 6537 * Create the pool config object. 6538 */ 6539 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 6540 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 6541 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 6542 6543 if (zap_add(spa->spa_meta_objset, 6544 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 6545 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 6546 cmn_err(CE_PANIC, "failed to add pool config"); 6547 } 6548 6549 if (zap_add(spa->spa_meta_objset, 6550 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 6551 sizeof (uint64_t), 1, &version, tx) != 0) { 6552 cmn_err(CE_PANIC, "failed to add pool version"); 6553 } 6554 6555 /* Newly created pools with the right version are always deflated. */ 6556 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 6557 spa->spa_deflate = TRUE; 6558 if (zap_add(spa->spa_meta_objset, 6559 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 6560 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 6561 cmn_err(CE_PANIC, "failed to add deflate"); 6562 } 6563 } 6564 6565 /* 6566 * Create the deferred-free bpobj. Turn off compression 6567 * because sync-to-convergence takes longer if the blocksize 6568 * keeps changing. 6569 */ 6570 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 6571 dmu_object_set_compress(spa->spa_meta_objset, obj, 6572 ZIO_COMPRESS_OFF, tx); 6573 if (zap_add(spa->spa_meta_objset, 6574 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 6575 sizeof (uint64_t), 1, &obj, tx) != 0) { 6576 cmn_err(CE_PANIC, "failed to add bpobj"); 6577 } 6578 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 6579 spa->spa_meta_objset, obj)); 6580 6581 /* 6582 * Generate some random noise for salted checksums to operate on. 6583 */ 6584 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 6585 sizeof (spa->spa_cksum_salt.zcs_bytes)); 6586 6587 /* 6588 * Set pool properties. 6589 */ 6590 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 6591 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 6592 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 6593 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 6594 spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST); 6595 spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM); 6596 6597 if (props != NULL) { 6598 spa_configfile_set(spa, props, B_FALSE); 6599 spa_sync_props(props, tx); 6600 } 6601 6602 for (int i = 0; i < ndraid; i++) 6603 spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); 6604 6605 dmu_tx_commit(tx); 6606 6607 spa->spa_sync_on = B_TRUE; 6608 txg_sync_start(dp); 6609 mmp_thread_start(spa); 6610 txg_wait_synced(dp, txg); 6611 6612 spa_spawn_aux_threads(spa); 6613 6614 spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE); 6615 6616 /* 6617 * Don't count references from objsets that are already closed 6618 * and are making their way through the eviction process. 6619 */ 6620 spa_evicting_os_wait(spa); 6621 spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); 6622 spa->spa_load_state = SPA_LOAD_NONE; 6623 6624 spa_import_os(spa); 6625 6626 mutex_exit(&spa_namespace_lock); 6627 6628 return (0); 6629 } 6630 6631 /* 6632 * Import a non-root pool into the system. 6633 */ 6634 int 6635 spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 6636 { 6637 spa_t *spa; 6638 const char *altroot = NULL; 6639 spa_load_state_t state = SPA_LOAD_IMPORT; 6640 zpool_load_policy_t policy; 6641 spa_mode_t mode = spa_mode_global; 6642 uint64_t readonly = B_FALSE; 6643 int error; 6644 nvlist_t *nvroot; 6645 nvlist_t **spares, **l2cache; 6646 uint_t nspares, nl2cache; 6647 6648 /* 6649 * If a pool with this name exists, return failure. 6650 */ 6651 mutex_enter(&spa_namespace_lock); 6652 if (spa_lookup(pool) != NULL) { 6653 mutex_exit(&spa_namespace_lock); 6654 return (SET_ERROR(EEXIST)); 6655 } 6656 6657 /* 6658 * Create and initialize the spa structure. 6659 */ 6660 (void) nvlist_lookup_string(props, 6661 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 6662 (void) nvlist_lookup_uint64(props, 6663 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 6664 if (readonly) 6665 mode = SPA_MODE_READ; 6666 spa = spa_add(pool, config, altroot); 6667 spa->spa_import_flags = flags; 6668 6669 /* 6670 * Verbatim import - Take a pool and insert it into the namespace 6671 * as if it had been loaded at boot. 6672 */ 6673 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 6674 if (props != NULL) 6675 spa_configfile_set(spa, props, B_FALSE); 6676 6677 spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE); 6678 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); 6679 zfs_dbgmsg("spa_import: verbatim import of %s", pool); 6680 mutex_exit(&spa_namespace_lock); 6681 return (0); 6682 } 6683 6684 spa_activate(spa, mode); 6685 6686 /* 6687 * Don't start async tasks until we know everything is healthy. 6688 */ 6689 spa_async_suspend(spa); 6690 6691 zpool_get_load_policy(config, &policy); 6692 if (policy.zlp_rewind & ZPOOL_DO_REWIND) 6693 state = SPA_LOAD_RECOVER; 6694 6695 spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT; 6696 6697 if (state != SPA_LOAD_RECOVER) { 6698 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 6699 zfs_dbgmsg("spa_import: importing %s", pool); 6700 } else { 6701 zfs_dbgmsg("spa_import: importing %s, max_txg=%lld " 6702 "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg); 6703 } 6704 error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind); 6705 6706 /* 6707 * Propagate anything learned while loading the pool and pass it 6708 * back to caller (i.e. rewind info, missing devices, etc). 6709 */ 6710 fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info); 6711 6712 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6713 /* 6714 * Toss any existing sparelist, as it doesn't have any validity 6715 * anymore, and conflicts with spa_has_spare(). 6716 */ 6717 if (spa->spa_spares.sav_config) { 6718 nvlist_free(spa->spa_spares.sav_config); 6719 spa->spa_spares.sav_config = NULL; 6720 spa_load_spares(spa); 6721 } 6722 if (spa->spa_l2cache.sav_config) { 6723 nvlist_free(spa->spa_l2cache.sav_config); 6724 spa->spa_l2cache.sav_config = NULL; 6725 spa_load_l2cache(spa); 6726 } 6727 6728 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 6729 spa_config_exit(spa, SCL_ALL, FTAG); 6730 6731 if (props != NULL) 6732 spa_configfile_set(spa, props, B_FALSE); 6733 6734 if (error != 0 || (props && spa_writeable(spa) && 6735 (error = spa_prop_set(spa, props)))) { 6736 spa_unload(spa); 6737 spa_deactivate(spa); 6738 spa_remove(spa); 6739 mutex_exit(&spa_namespace_lock); 6740 return (error); 6741 } 6742 6743 spa_async_resume(spa); 6744 6745 /* 6746 * Override any spares and level 2 cache devices as specified by 6747 * the user, as these may have correct device names/devids, etc. 6748 */ 6749 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 6750 &spares, &nspares) == 0) { 6751 if (spa->spa_spares.sav_config) 6752 fnvlist_remove(spa->spa_spares.sav_config, 6753 ZPOOL_CONFIG_SPARES); 6754 else 6755 spa->spa_spares.sav_config = fnvlist_alloc(); 6756 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 6757 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, 6758 nspares); 6759 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6760 spa_load_spares(spa); 6761 spa_config_exit(spa, SCL_ALL, FTAG); 6762 spa->spa_spares.sav_sync = B_TRUE; 6763 } 6764 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 6765 &l2cache, &nl2cache) == 0) { 6766 if (spa->spa_l2cache.sav_config) 6767 fnvlist_remove(spa->spa_l2cache.sav_config, 6768 ZPOOL_CONFIG_L2CACHE); 6769 else 6770 spa->spa_l2cache.sav_config = fnvlist_alloc(); 6771 fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 6772 ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, 6773 nl2cache); 6774 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6775 spa_load_l2cache(spa); 6776 spa_config_exit(spa, SCL_ALL, FTAG); 6777 spa->spa_l2cache.sav_sync = B_TRUE; 6778 } 6779 6780 /* 6781 * Check for any removed devices. 6782 */ 6783 if (spa->spa_autoreplace) { 6784 spa_aux_check_removed(&spa->spa_spares); 6785 spa_aux_check_removed(&spa->spa_l2cache); 6786 } 6787 6788 if (spa_writeable(spa)) { 6789 /* 6790 * Update the config cache to include the newly-imported pool. 6791 */ 6792 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 6793 } 6794 6795 /* 6796 * It's possible that the pool was expanded while it was exported. 6797 * We kick off an async task to handle this for us. 6798 */ 6799 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 6800 6801 spa_history_log_version(spa, "import", NULL); 6802 6803 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); 6804 6805 mutex_exit(&spa_namespace_lock); 6806 6807 zvol_create_minors_recursive(pool); 6808 6809 spa_import_os(spa); 6810 6811 return (0); 6812 } 6813 6814 nvlist_t * 6815 spa_tryimport(nvlist_t *tryconfig) 6816 { 6817 nvlist_t *config = NULL; 6818 const char *poolname, *cachefile; 6819 spa_t *spa; 6820 uint64_t state; 6821 int error; 6822 zpool_load_policy_t policy; 6823 6824 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 6825 return (NULL); 6826 6827 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 6828 return (NULL); 6829 6830 /* 6831 * Create and initialize the spa structure. 6832 */ 6833 char *name = kmem_alloc(MAXPATHLEN, KM_SLEEP); 6834 (void) snprintf(name, MAXPATHLEN, "%s-%llx-%s", 6835 TRYIMPORT_NAME, (u_longlong_t)curthread, poolname); 6836 6837 mutex_enter(&spa_namespace_lock); 6838 spa = spa_add(name, tryconfig, NULL); 6839 spa_activate(spa, SPA_MODE_READ); 6840 kmem_free(name, MAXPATHLEN); 6841 6842 /* 6843 * Rewind pool if a max txg was provided. 6844 */ 6845 zpool_get_load_policy(spa->spa_config, &policy); 6846 if (policy.zlp_txg != UINT64_MAX) { 6847 spa->spa_load_max_txg = policy.zlp_txg; 6848 spa->spa_extreme_rewind = B_TRUE; 6849 zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld", 6850 poolname, (longlong_t)policy.zlp_txg); 6851 } else { 6852 zfs_dbgmsg("spa_tryimport: importing %s", poolname); 6853 } 6854 6855 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile) 6856 == 0) { 6857 zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile); 6858 spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; 6859 } else { 6860 spa->spa_config_source = SPA_CONFIG_SRC_SCAN; 6861 } 6862 6863 /* 6864 * spa_import() relies on a pool config fetched by spa_try_import() 6865 * for spare/cache devices. Import flags are not passed to 6866 * spa_tryimport(), which makes it return early due to a missing log 6867 * device and missing retrieving the cache device and spare eventually. 6868 * Passing ZFS_IMPORT_MISSING_LOG to spa_tryimport() makes it fetch 6869 * the correct configuration regardless of the missing log device. 6870 */ 6871 spa->spa_import_flags |= ZFS_IMPORT_MISSING_LOG; 6872 6873 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING); 6874 6875 /* 6876 * If 'tryconfig' was at least parsable, return the current config. 6877 */ 6878 if (spa->spa_root_vdev != NULL) { 6879 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 6880 fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, poolname); 6881 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state); 6882 fnvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 6883 spa->spa_uberblock.ub_timestamp); 6884 fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 6885 spa->spa_load_info); 6886 fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, 6887 spa->spa_errata); 6888 6889 /* 6890 * If the bootfs property exists on this pool then we 6891 * copy it out so that external consumers can tell which 6892 * pools are bootable. 6893 */ 6894 if ((!error || error == EEXIST) && spa->spa_bootfs) { 6895 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 6896 6897 /* 6898 * We have to play games with the name since the 6899 * pool was opened as TRYIMPORT_NAME. 6900 */ 6901 if (dsl_dsobj_to_dsname(spa_name(spa), 6902 spa->spa_bootfs, tmpname) == 0) { 6903 char *cp; 6904 char *dsname; 6905 6906 dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 6907 6908 cp = strchr(tmpname, '/'); 6909 if (cp == NULL) { 6910 (void) strlcpy(dsname, tmpname, 6911 MAXPATHLEN); 6912 } else { 6913 (void) snprintf(dsname, MAXPATHLEN, 6914 "%s/%s", poolname, ++cp); 6915 } 6916 fnvlist_add_string(config, ZPOOL_CONFIG_BOOTFS, 6917 dsname); 6918 kmem_free(dsname, MAXPATHLEN); 6919 } 6920 kmem_free(tmpname, MAXPATHLEN); 6921 } 6922 6923 /* 6924 * Add the list of hot spares and level 2 cache devices. 6925 */ 6926 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6927 spa_add_spares(spa, config); 6928 spa_add_l2cache(spa, config); 6929 spa_config_exit(spa, SCL_CONFIG, FTAG); 6930 } 6931 6932 spa_unload(spa); 6933 spa_deactivate(spa); 6934 spa_remove(spa); 6935 mutex_exit(&spa_namespace_lock); 6936 6937 return (config); 6938 } 6939 6940 /* 6941 * Pool export/destroy 6942 * 6943 * The act of destroying or exporting a pool is very simple. We make sure there 6944 * is no more pending I/O and any references to the pool are gone. Then, we 6945 * update the pool state and sync all the labels to disk, removing the 6946 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 6947 * we don't sync the labels or remove the configuration cache. 6948 */ 6949 static int 6950 spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, 6951 boolean_t force, boolean_t hardforce) 6952 { 6953 int error; 6954 spa_t *spa; 6955 hrtime_t export_start = gethrtime(); 6956 6957 if (oldconfig) 6958 *oldconfig = NULL; 6959 6960 if (!(spa_mode_global & SPA_MODE_WRITE)) 6961 return (SET_ERROR(EROFS)); 6962 6963 mutex_enter(&spa_namespace_lock); 6964 if ((spa = spa_lookup(pool)) == NULL) { 6965 mutex_exit(&spa_namespace_lock); 6966 return (SET_ERROR(ENOENT)); 6967 } 6968 6969 if (spa->spa_is_exporting) { 6970 /* the pool is being exported by another thread */ 6971 mutex_exit(&spa_namespace_lock); 6972 return (SET_ERROR(ZFS_ERR_EXPORT_IN_PROGRESS)); 6973 } 6974 spa->spa_is_exporting = B_TRUE; 6975 6976 /* 6977 * Put a hold on the pool, drop the namespace lock, stop async tasks, 6978 * reacquire the namespace lock, and see if we can export. 6979 */ 6980 spa_open_ref(spa, FTAG); 6981 mutex_exit(&spa_namespace_lock); 6982 spa_async_suspend(spa); 6983 if (spa->spa_zvol_taskq) { 6984 zvol_remove_minors(spa, spa_name(spa), B_TRUE); 6985 taskq_wait(spa->spa_zvol_taskq); 6986 } 6987 mutex_enter(&spa_namespace_lock); 6988 spa_close(spa, FTAG); 6989 6990 if (spa->spa_state == POOL_STATE_UNINITIALIZED) 6991 goto export_spa; 6992 /* 6993 * The pool will be in core if it's openable, in which case we can 6994 * modify its state. Objsets may be open only because they're dirty, 6995 * so we have to force it to sync before checking spa_refcnt. 6996 */ 6997 if (spa->spa_sync_on) { 6998 txg_wait_synced(spa->spa_dsl_pool, 0); 6999 spa_evicting_os_wait(spa); 7000 } 7001 7002 /* 7003 * A pool cannot be exported or destroyed if there are active 7004 * references. If we are resetting a pool, allow references by 7005 * fault injection handlers. 7006 */ 7007 if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0)) { 7008 error = SET_ERROR(EBUSY); 7009 goto fail; 7010 } 7011 7012 if (spa->spa_sync_on) { 7013 vdev_t *rvd = spa->spa_root_vdev; 7014 /* 7015 * A pool cannot be exported if it has an active shared spare. 7016 * This is to prevent other pools stealing the active spare 7017 * from an exported pool. At user's own will, such pool can 7018 * be forcedly exported. 7019 */ 7020 if (!force && new_state == POOL_STATE_EXPORTED && 7021 spa_has_active_shared_spare(spa)) { 7022 error = SET_ERROR(EXDEV); 7023 goto fail; 7024 } 7025 7026 /* 7027 * We're about to export or destroy this pool. Make sure 7028 * we stop all initialization and trim activity here before 7029 * we set the spa_final_txg. This will ensure that all 7030 * dirty data resulting from the initialization is 7031 * committed to disk before we unload the pool. 7032 */ 7033 vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE); 7034 vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE); 7035 vdev_autotrim_stop_all(spa); 7036 vdev_rebuild_stop_all(spa); 7037 7038 /* 7039 * We want this to be reflected on every label, 7040 * so mark them all dirty. spa_unload() will do the 7041 * final sync that pushes these changes out. 7042 */ 7043 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 7044 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 7045 spa->spa_state = new_state; 7046 vdev_config_dirty(rvd); 7047 spa_config_exit(spa, SCL_ALL, FTAG); 7048 } 7049 7050 /* 7051 * If the log space map feature is enabled and the pool is 7052 * getting exported (but not destroyed), we want to spend some 7053 * time flushing as many metaslabs as we can in an attempt to 7054 * destroy log space maps and save import time. This has to be 7055 * done before we set the spa_final_txg, otherwise 7056 * spa_sync() -> spa_flush_metaslabs() may dirty the final TXGs. 7057 * spa_should_flush_logs_on_unload() should be called after 7058 * spa_state has been set to the new_state. 7059 */ 7060 if (spa_should_flush_logs_on_unload(spa)) 7061 spa_unload_log_sm_flush_all(spa); 7062 7063 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 7064 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 7065 spa->spa_final_txg = spa_last_synced_txg(spa) + 7066 TXG_DEFER_SIZE + 1; 7067 spa_config_exit(spa, SCL_ALL, FTAG); 7068 } 7069 } 7070 7071 export_spa: 7072 spa_export_os(spa); 7073 7074 if (new_state == POOL_STATE_DESTROYED) 7075 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY); 7076 else if (new_state == POOL_STATE_EXPORTED) 7077 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_EXPORT); 7078 7079 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 7080 spa_unload(spa); 7081 spa_deactivate(spa); 7082 } 7083 7084 if (oldconfig && spa->spa_config) 7085 *oldconfig = fnvlist_dup(spa->spa_config); 7086 7087 if (new_state != POOL_STATE_UNINITIALIZED) { 7088 if (!hardforce) 7089 spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE); 7090 spa_remove(spa); 7091 } else { 7092 /* 7093 * If spa_remove() is not called for this spa_t and 7094 * there is any possibility that it can be reused, 7095 * we make sure to reset the exporting flag. 7096 */ 7097 spa->spa_is_exporting = B_FALSE; 7098 } 7099 7100 if (new_state == POOL_STATE_EXPORTED) 7101 zio_handle_export_delay(spa, gethrtime() - export_start); 7102 7103 mutex_exit(&spa_namespace_lock); 7104 return (0); 7105 7106 fail: 7107 spa->spa_is_exporting = B_FALSE; 7108 spa_async_resume(spa); 7109 mutex_exit(&spa_namespace_lock); 7110 return (error); 7111 } 7112 7113 /* 7114 * Destroy a storage pool. 7115 */ 7116 int 7117 spa_destroy(const char *pool) 7118 { 7119 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 7120 B_FALSE, B_FALSE)); 7121 } 7122 7123 /* 7124 * Export a storage pool. 7125 */ 7126 int 7127 spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force, 7128 boolean_t hardforce) 7129 { 7130 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 7131 force, hardforce)); 7132 } 7133 7134 /* 7135 * Similar to spa_export(), this unloads the spa_t without actually removing it 7136 * from the namespace in any way. 7137 */ 7138 int 7139 spa_reset(const char *pool) 7140 { 7141 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 7142 B_FALSE, B_FALSE)); 7143 } 7144 7145 /* 7146 * ========================================================================== 7147 * Device manipulation 7148 * ========================================================================== 7149 */ 7150 7151 /* 7152 * This is called as a synctask to increment the draid feature flag 7153 */ 7154 static void 7155 spa_draid_feature_incr(void *arg, dmu_tx_t *tx) 7156 { 7157 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 7158 int draid = (int)(uintptr_t)arg; 7159 7160 for (int c = 0; c < draid; c++) 7161 spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); 7162 } 7163 7164 /* 7165 * Add a device to a storage pool. 7166 */ 7167 int 7168 spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t check_ashift) 7169 { 7170 uint64_t txg, ndraid = 0; 7171 int error; 7172 vdev_t *rvd = spa->spa_root_vdev; 7173 vdev_t *vd, *tvd; 7174 nvlist_t **spares, **l2cache; 7175 uint_t nspares, nl2cache; 7176 7177 ASSERT(spa_writeable(spa)); 7178 7179 txg = spa_vdev_enter(spa); 7180 7181 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 7182 VDEV_ALLOC_ADD)) != 0) 7183 return (spa_vdev_exit(spa, NULL, txg, error)); 7184 7185 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 7186 7187 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 7188 &nspares) != 0) 7189 nspares = 0; 7190 7191 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 7192 &nl2cache) != 0) 7193 nl2cache = 0; 7194 7195 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 7196 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 7197 7198 if (vd->vdev_children != 0 && 7199 (error = vdev_create(vd, txg, B_FALSE)) != 0) { 7200 return (spa_vdev_exit(spa, vd, txg, error)); 7201 } 7202 7203 /* 7204 * The virtual dRAID spares must be added after vdev tree is created 7205 * and the vdev guids are generated. The guid of their associated 7206 * dRAID is stored in the config and used when opening the spare. 7207 */ 7208 if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid, 7209 rvd->vdev_children)) == 0) { 7210 if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot, 7211 ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) 7212 nspares = 0; 7213 } else { 7214 return (spa_vdev_exit(spa, vd, txg, error)); 7215 } 7216 7217 /* 7218 * We must validate the spares and l2cache devices after checking the 7219 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 7220 */ 7221 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 7222 return (spa_vdev_exit(spa, vd, txg, error)); 7223 7224 /* 7225 * If we are in the middle of a device removal, we can only add 7226 * devices which match the existing devices in the pool. 7227 * If we are in the middle of a removal, or have some indirect 7228 * vdevs, we can not add raidz or dRAID top levels. 7229 */ 7230 if (spa->spa_vdev_removal != NULL || 7231 spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { 7232 for (int c = 0; c < vd->vdev_children; c++) { 7233 tvd = vd->vdev_child[c]; 7234 if (spa->spa_vdev_removal != NULL && 7235 tvd->vdev_ashift != spa->spa_max_ashift) { 7236 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 7237 } 7238 /* Fail if top level vdev is raidz or a dRAID */ 7239 if (vdev_get_nparity(tvd) != 0) 7240 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 7241 7242 /* 7243 * Need the top level mirror to be 7244 * a mirror of leaf vdevs only 7245 */ 7246 if (tvd->vdev_ops == &vdev_mirror_ops) { 7247 for (uint64_t cid = 0; 7248 cid < tvd->vdev_children; cid++) { 7249 vdev_t *cvd = tvd->vdev_child[cid]; 7250 if (!cvd->vdev_ops->vdev_op_leaf) { 7251 return (spa_vdev_exit(spa, vd, 7252 txg, EINVAL)); 7253 } 7254 } 7255 } 7256 } 7257 } 7258 7259 if (check_ashift && spa->spa_max_ashift == spa->spa_min_ashift) { 7260 for (int c = 0; c < vd->vdev_children; c++) { 7261 tvd = vd->vdev_child[c]; 7262 if (tvd->vdev_ashift != spa->spa_max_ashift) { 7263 return (spa_vdev_exit(spa, vd, txg, 7264 ZFS_ERR_ASHIFT_MISMATCH)); 7265 } 7266 } 7267 } 7268 7269 for (int c = 0; c < vd->vdev_children; c++) { 7270 tvd = vd->vdev_child[c]; 7271 vdev_remove_child(vd, tvd); 7272 tvd->vdev_id = rvd->vdev_children; 7273 vdev_add_child(rvd, tvd); 7274 vdev_config_dirty(tvd); 7275 } 7276 7277 if (nspares != 0) { 7278 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 7279 ZPOOL_CONFIG_SPARES); 7280 spa_load_spares(spa); 7281 spa->spa_spares.sav_sync = B_TRUE; 7282 } 7283 7284 if (nl2cache != 0) { 7285 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 7286 ZPOOL_CONFIG_L2CACHE); 7287 spa_load_l2cache(spa); 7288 spa->spa_l2cache.sav_sync = B_TRUE; 7289 } 7290 7291 /* 7292 * We can't increment a feature while holding spa_vdev so we 7293 * have to do it in a synctask. 7294 */ 7295 if (ndraid != 0) { 7296 dmu_tx_t *tx; 7297 7298 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 7299 dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr, 7300 (void *)(uintptr_t)ndraid, tx); 7301 dmu_tx_commit(tx); 7302 } 7303 7304 /* 7305 * We have to be careful when adding new vdevs to an existing pool. 7306 * If other threads start allocating from these vdevs before we 7307 * sync the config cache, and we lose power, then upon reboot we may 7308 * fail to open the pool because there are DVAs that the config cache 7309 * can't translate. Therefore, we first add the vdevs without 7310 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 7311 * and then let spa_config_update() initialize the new metaslabs. 7312 * 7313 * spa_load() checks for added-but-not-initialized vdevs, so that 7314 * if we lose power at any point in this sequence, the remaining 7315 * steps will be completed the next time we load the pool. 7316 */ 7317 (void) spa_vdev_exit(spa, vd, txg, 0); 7318 7319 mutex_enter(&spa_namespace_lock); 7320 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 7321 spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD); 7322 mutex_exit(&spa_namespace_lock); 7323 7324 return (0); 7325 } 7326 7327 /* 7328 * Attach a device to a vdev specified by its guid. The vdev type can be 7329 * a mirror, a raidz, or a leaf device that is also a top-level (e.g. a 7330 * single device). When the vdev is a single device, a mirror vdev will be 7331 * automatically inserted. 7332 * 7333 * If 'replacing' is specified, the new device is intended to replace the 7334 * existing device; in this case the two devices are made into their own 7335 * mirror using the 'replacing' vdev, which is functionally identical to 7336 * the mirror vdev (it actually reuses all the same ops) but has a few 7337 * extra rules: you can't attach to it after it's been created, and upon 7338 * completion of resilvering, the first disk (the one being replaced) 7339 * is automatically detached. 7340 * 7341 * If 'rebuild' is specified, then sequential reconstruction (a.ka. rebuild) 7342 * should be performed instead of traditional healing reconstruction. From 7343 * an administrators perspective these are both resilver operations. 7344 */ 7345 int 7346 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, 7347 int rebuild) 7348 { 7349 uint64_t txg, dtl_max_txg; 7350 vdev_t *rvd = spa->spa_root_vdev; 7351 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 7352 vdev_ops_t *pvops; 7353 char *oldvdpath, *newvdpath; 7354 int newvd_isspare = B_FALSE; 7355 int error; 7356 7357 ASSERT(spa_writeable(spa)); 7358 7359 txg = spa_vdev_enter(spa); 7360 7361 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 7362 7363 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 7364 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 7365 error = (spa_has_checkpoint(spa)) ? 7366 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 7367 return (spa_vdev_exit(spa, NULL, txg, error)); 7368 } 7369 7370 if (rebuild) { 7371 if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) 7372 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7373 7374 if (dsl_scan_resilvering(spa_get_dsl(spa)) || 7375 dsl_scan_resilver_scheduled(spa_get_dsl(spa))) { 7376 return (spa_vdev_exit(spa, NULL, txg, 7377 ZFS_ERR_RESILVER_IN_PROGRESS)); 7378 } 7379 } else { 7380 if (vdev_rebuild_active(rvd)) 7381 return (spa_vdev_exit(spa, NULL, txg, 7382 ZFS_ERR_REBUILD_IN_PROGRESS)); 7383 } 7384 7385 if (spa->spa_vdev_removal != NULL) { 7386 return (spa_vdev_exit(spa, NULL, txg, 7387 ZFS_ERR_DEVRM_IN_PROGRESS)); 7388 } 7389 7390 if (oldvd == NULL) 7391 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 7392 7393 boolean_t raidz = oldvd->vdev_ops == &vdev_raidz_ops; 7394 7395 if (raidz) { 7396 if (!spa_feature_is_enabled(spa, SPA_FEATURE_RAIDZ_EXPANSION)) 7397 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7398 7399 /* 7400 * Can't expand a raidz while prior expand is in progress. 7401 */ 7402 if (spa->spa_raidz_expand != NULL) { 7403 return (spa_vdev_exit(spa, NULL, txg, 7404 ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS)); 7405 } 7406 } else if (!oldvd->vdev_ops->vdev_op_leaf) { 7407 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7408 } 7409 7410 if (raidz) 7411 pvd = oldvd; 7412 else 7413 pvd = oldvd->vdev_parent; 7414 7415 if (spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 7416 VDEV_ALLOC_ATTACH) != 0) 7417 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 7418 7419 if (newrootvd->vdev_children != 1) 7420 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 7421 7422 newvd = newrootvd->vdev_child[0]; 7423 7424 if (!newvd->vdev_ops->vdev_op_leaf) 7425 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 7426 7427 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 7428 return (spa_vdev_exit(spa, newrootvd, txg, error)); 7429 7430 /* 7431 * log, dedup and special vdevs should not be replaced by spares. 7432 */ 7433 if ((oldvd->vdev_top->vdev_alloc_bias != VDEV_BIAS_NONE || 7434 oldvd->vdev_top->vdev_islog) && newvd->vdev_isspare) { 7435 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7436 } 7437 7438 /* 7439 * A dRAID spare can only replace a child of its parent dRAID vdev. 7440 */ 7441 if (newvd->vdev_ops == &vdev_draid_spare_ops && 7442 oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) { 7443 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7444 } 7445 7446 if (rebuild) { 7447 /* 7448 * For rebuilds, the top vdev must support reconstruction 7449 * using only space maps. This means the only allowable 7450 * vdevs types are the root vdev, a mirror, or dRAID. 7451 */ 7452 tvd = pvd; 7453 if (pvd->vdev_top != NULL) 7454 tvd = pvd->vdev_top; 7455 7456 if (tvd->vdev_ops != &vdev_mirror_ops && 7457 tvd->vdev_ops != &vdev_root_ops && 7458 tvd->vdev_ops != &vdev_draid_ops) { 7459 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7460 } 7461 } 7462 7463 if (!replacing) { 7464 /* 7465 * For attach, the only allowable parent is a mirror or 7466 * the root vdev. A raidz vdev can be attached to, but 7467 * you cannot attach to a raidz child. 7468 */ 7469 if (pvd->vdev_ops != &vdev_mirror_ops && 7470 pvd->vdev_ops != &vdev_root_ops && 7471 !raidz) 7472 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7473 7474 pvops = &vdev_mirror_ops; 7475 } else { 7476 /* 7477 * Active hot spares can only be replaced by inactive hot 7478 * spares. 7479 */ 7480 if (pvd->vdev_ops == &vdev_spare_ops && 7481 oldvd->vdev_isspare && 7482 !spa_has_spare(spa, newvd->vdev_guid)) 7483 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7484 7485 /* 7486 * If the source is a hot spare, and the parent isn't already a 7487 * spare, then we want to create a new hot spare. Otherwise, we 7488 * want to create a replacing vdev. The user is not allowed to 7489 * attach to a spared vdev child unless the 'isspare' state is 7490 * the same (spare replaces spare, non-spare replaces 7491 * non-spare). 7492 */ 7493 if (pvd->vdev_ops == &vdev_replacing_ops && 7494 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 7495 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7496 } else if (pvd->vdev_ops == &vdev_spare_ops && 7497 newvd->vdev_isspare != oldvd->vdev_isspare) { 7498 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7499 } 7500 7501 if (newvd->vdev_isspare) 7502 pvops = &vdev_spare_ops; 7503 else 7504 pvops = &vdev_replacing_ops; 7505 } 7506 7507 /* 7508 * Make sure the new device is big enough. 7509 */ 7510 vdev_t *min_vdev = raidz ? oldvd->vdev_child[0] : oldvd; 7511 if (newvd->vdev_asize < vdev_get_min_asize(min_vdev)) 7512 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 7513 7514 /* 7515 * The new device cannot have a higher alignment requirement 7516 * than the top-level vdev. 7517 */ 7518 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 7519 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7520 7521 /* 7522 * RAIDZ-expansion-specific checks. 7523 */ 7524 if (raidz) { 7525 if (vdev_raidz_attach_check(newvd) != 0) 7526 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 7527 7528 /* 7529 * Fail early if a child is not healthy or being replaced 7530 */ 7531 for (int i = 0; i < oldvd->vdev_children; i++) { 7532 if (vdev_is_dead(oldvd->vdev_child[i]) || 7533 !oldvd->vdev_child[i]->vdev_ops->vdev_op_leaf) { 7534 return (spa_vdev_exit(spa, newrootvd, txg, 7535 ENXIO)); 7536 } 7537 /* Also fail if reserved boot area is in-use */ 7538 if (vdev_check_boot_reserve(spa, oldvd->vdev_child[i]) 7539 != 0) { 7540 return (spa_vdev_exit(spa, newrootvd, txg, 7541 EADDRINUSE)); 7542 } 7543 } 7544 } 7545 7546 if (raidz) { 7547 /* 7548 * Note: oldvdpath is freed by spa_strfree(), but 7549 * kmem_asprintf() is freed by kmem_strfree(), so we have to 7550 * move it to a spa_strdup-ed string. 7551 */ 7552 char *tmp = kmem_asprintf("raidz%u-%u", 7553 (uint_t)vdev_get_nparity(oldvd), (uint_t)oldvd->vdev_id); 7554 oldvdpath = spa_strdup(tmp); 7555 kmem_strfree(tmp); 7556 } else { 7557 oldvdpath = spa_strdup(oldvd->vdev_path); 7558 } 7559 newvdpath = spa_strdup(newvd->vdev_path); 7560 7561 /* 7562 * If this is an in-place replacement, update oldvd's path and devid 7563 * to make it distinguishable from newvd, and unopenable from now on. 7564 */ 7565 if (strcmp(oldvdpath, newvdpath) == 0) { 7566 spa_strfree(oldvd->vdev_path); 7567 oldvd->vdev_path = kmem_alloc(strlen(newvdpath) + 5, 7568 KM_SLEEP); 7569 (void) sprintf(oldvd->vdev_path, "%s/old", 7570 newvdpath); 7571 if (oldvd->vdev_devid != NULL) { 7572 spa_strfree(oldvd->vdev_devid); 7573 oldvd->vdev_devid = NULL; 7574 } 7575 spa_strfree(oldvdpath); 7576 oldvdpath = spa_strdup(oldvd->vdev_path); 7577 } 7578 7579 /* 7580 * If the parent is not a mirror, or if we're replacing, insert the new 7581 * mirror/replacing/spare vdev above oldvd. 7582 */ 7583 if (!raidz && pvd->vdev_ops != pvops) { 7584 pvd = vdev_add_parent(oldvd, pvops); 7585 ASSERT(pvd->vdev_ops == pvops); 7586 ASSERT(oldvd->vdev_parent == pvd); 7587 } 7588 7589 ASSERT(pvd->vdev_top->vdev_parent == rvd); 7590 7591 /* 7592 * Extract the new device from its root and add it to pvd. 7593 */ 7594 vdev_remove_child(newrootvd, newvd); 7595 newvd->vdev_id = pvd->vdev_children; 7596 newvd->vdev_crtxg = oldvd->vdev_crtxg; 7597 vdev_add_child(pvd, newvd); 7598 7599 /* 7600 * Reevaluate the parent vdev state. 7601 */ 7602 vdev_propagate_state(pvd); 7603 7604 tvd = newvd->vdev_top; 7605 ASSERT(pvd->vdev_top == tvd); 7606 ASSERT(tvd->vdev_parent == rvd); 7607 7608 vdev_config_dirty(tvd); 7609 7610 /* 7611 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 7612 * for any dmu_sync-ed blocks. It will propagate upward when 7613 * spa_vdev_exit() calls vdev_dtl_reassess(). 7614 */ 7615 dtl_max_txg = txg + TXG_CONCURRENT_STATES; 7616 7617 if (raidz) { 7618 /* 7619 * Wait for the youngest allocations and frees to sync, 7620 * and then wait for the deferral of those frees to finish. 7621 */ 7622 spa_vdev_config_exit(spa, NULL, 7623 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 7624 7625 vdev_initialize_stop_all(tvd, VDEV_INITIALIZE_ACTIVE); 7626 vdev_trim_stop_all(tvd, VDEV_TRIM_ACTIVE); 7627 vdev_autotrim_stop_wait(tvd); 7628 7629 dtl_max_txg = spa_vdev_config_enter(spa); 7630 7631 tvd->vdev_rz_expanding = B_TRUE; 7632 7633 vdev_dirty_leaves(tvd, VDD_DTL, dtl_max_txg); 7634 vdev_config_dirty(tvd); 7635 7636 dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, 7637 dtl_max_txg); 7638 dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_raidz_attach_sync, 7639 newvd, tx); 7640 dmu_tx_commit(tx); 7641 } else { 7642 vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, 7643 dtl_max_txg - TXG_INITIAL); 7644 7645 if (newvd->vdev_isspare) { 7646 spa_spare_activate(newvd); 7647 spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); 7648 } 7649 7650 newvd_isspare = newvd->vdev_isspare; 7651 7652 /* 7653 * Mark newvd's DTL dirty in this txg. 7654 */ 7655 vdev_dirty(tvd, VDD_DTL, newvd, txg); 7656 7657 /* 7658 * Schedule the resilver or rebuild to restart in the future. 7659 * We do this to ensure that dmu_sync-ed blocks have been 7660 * stitched into the respective datasets. 7661 */ 7662 if (rebuild) { 7663 newvd->vdev_rebuild_txg = txg; 7664 7665 vdev_rebuild(tvd); 7666 } else { 7667 newvd->vdev_resilver_txg = txg; 7668 7669 if (dsl_scan_resilvering(spa_get_dsl(spa)) && 7670 spa_feature_is_enabled(spa, 7671 SPA_FEATURE_RESILVER_DEFER)) { 7672 vdev_defer_resilver(newvd); 7673 } else { 7674 dsl_scan_restart_resilver(spa->spa_dsl_pool, 7675 dtl_max_txg); 7676 } 7677 } 7678 } 7679 7680 if (spa->spa_bootfs) 7681 spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); 7682 7683 spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH); 7684 7685 /* 7686 * Commit the config 7687 */ 7688 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 7689 7690 spa_history_log_internal(spa, "vdev attach", NULL, 7691 "%s vdev=%s %s vdev=%s", 7692 replacing && newvd_isspare ? "spare in" : 7693 replacing ? "replace" : "attach", newvdpath, 7694 replacing ? "for" : "to", oldvdpath); 7695 7696 spa_strfree(oldvdpath); 7697 spa_strfree(newvdpath); 7698 7699 return (0); 7700 } 7701 7702 /* 7703 * Detach a device from a mirror or replacing vdev. 7704 * 7705 * If 'replace_done' is specified, only detach if the parent 7706 * is a replacing or a spare vdev. 7707 */ 7708 int 7709 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 7710 { 7711 uint64_t txg; 7712 int error; 7713 vdev_t *rvd __maybe_unused = spa->spa_root_vdev; 7714 vdev_t *vd, *pvd, *cvd, *tvd; 7715 boolean_t unspare = B_FALSE; 7716 uint64_t unspare_guid = 0; 7717 char *vdpath; 7718 7719 ASSERT(spa_writeable(spa)); 7720 7721 txg = spa_vdev_detach_enter(spa, guid); 7722 7723 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 7724 7725 /* 7726 * Besides being called directly from the userland through the 7727 * ioctl interface, spa_vdev_detach() can be potentially called 7728 * at the end of spa_vdev_resilver_done(). 7729 * 7730 * In the regular case, when we have a checkpoint this shouldn't 7731 * happen as we never empty the DTLs of a vdev during the scrub 7732 * [see comment in dsl_scan_done()]. Thus spa_vdev_resilvering_done() 7733 * should never get here when we have a checkpoint. 7734 * 7735 * That said, even in a case when we checkpoint the pool exactly 7736 * as spa_vdev_resilver_done() calls this function everything 7737 * should be fine as the resilver will return right away. 7738 */ 7739 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 7740 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 7741 error = (spa_has_checkpoint(spa)) ? 7742 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 7743 return (spa_vdev_exit(spa, NULL, txg, error)); 7744 } 7745 7746 if (vd == NULL) 7747 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 7748 7749 if (!vd->vdev_ops->vdev_op_leaf) 7750 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7751 7752 pvd = vd->vdev_parent; 7753 7754 /* 7755 * If the parent/child relationship is not as expected, don't do it. 7756 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 7757 * vdev that's replacing B with C. The user's intent in replacing 7758 * is to go from M(A,B) to M(A,C). If the user decides to cancel 7759 * the replace by detaching C, the expected behavior is to end up 7760 * M(A,B). But suppose that right after deciding to detach C, 7761 * the replacement of B completes. We would have M(A,C), and then 7762 * ask to detach C, which would leave us with just A -- not what 7763 * the user wanted. To prevent this, we make sure that the 7764 * parent/child relationship hasn't changed -- in this example, 7765 * that C's parent is still the replacing vdev R. 7766 */ 7767 if (pvd->vdev_guid != pguid && pguid != 0) 7768 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 7769 7770 /* 7771 * Only 'replacing' or 'spare' vdevs can be replaced. 7772 */ 7773 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 7774 pvd->vdev_ops != &vdev_spare_ops) 7775 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7776 7777 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 7778 spa_version(spa) >= SPA_VERSION_SPARES); 7779 7780 /* 7781 * Only mirror, replacing, and spare vdevs support detach. 7782 */ 7783 if (pvd->vdev_ops != &vdev_replacing_ops && 7784 pvd->vdev_ops != &vdev_mirror_ops && 7785 pvd->vdev_ops != &vdev_spare_ops) 7786 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7787 7788 /* 7789 * If this device has the only valid copy of some data, 7790 * we cannot safely detach it. 7791 */ 7792 if (vdev_dtl_required(vd)) 7793 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 7794 7795 ASSERT(pvd->vdev_children >= 2); 7796 7797 /* 7798 * If we are detaching the second disk from a replacing vdev, then 7799 * check to see if we changed the original vdev's path to have "/old" 7800 * at the end in spa_vdev_attach(). If so, undo that change now. 7801 */ 7802 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 7803 vd->vdev_path != NULL) { 7804 size_t len = strlen(vd->vdev_path); 7805 7806 for (int c = 0; c < pvd->vdev_children; c++) { 7807 cvd = pvd->vdev_child[c]; 7808 7809 if (cvd == vd || cvd->vdev_path == NULL) 7810 continue; 7811 7812 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 7813 strcmp(cvd->vdev_path + len, "/old") == 0) { 7814 spa_strfree(cvd->vdev_path); 7815 cvd->vdev_path = spa_strdup(vd->vdev_path); 7816 break; 7817 } 7818 } 7819 } 7820 7821 /* 7822 * If we are detaching the original disk from a normal spare, then it 7823 * implies that the spare should become a real disk, and be removed 7824 * from the active spare list for the pool. dRAID spares on the 7825 * other hand are coupled to the pool and thus should never be removed 7826 * from the spares list. 7827 */ 7828 if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) { 7829 vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1]; 7830 7831 if (last_cvd->vdev_isspare && 7832 last_cvd->vdev_ops != &vdev_draid_spare_ops) { 7833 unspare = B_TRUE; 7834 } 7835 } 7836 7837 /* 7838 * Erase the disk labels so the disk can be used for other things. 7839 * This must be done after all other error cases are handled, 7840 * but before we disembowel vd (so we can still do I/O to it). 7841 * But if we can't do it, don't treat the error as fatal -- 7842 * it may be that the unwritability of the disk is the reason 7843 * it's being detached! 7844 */ 7845 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 7846 7847 /* 7848 * Remove vd from its parent and compact the parent's children. 7849 */ 7850 vdev_remove_child(pvd, vd); 7851 vdev_compact_children(pvd); 7852 7853 /* 7854 * Remember one of the remaining children so we can get tvd below. 7855 */ 7856 cvd = pvd->vdev_child[pvd->vdev_children - 1]; 7857 7858 /* 7859 * If we need to remove the remaining child from the list of hot spares, 7860 * do it now, marking the vdev as no longer a spare in the process. 7861 * We must do this before vdev_remove_parent(), because that can 7862 * change the GUID if it creates a new toplevel GUID. For a similar 7863 * reason, we must remove the spare now, in the same txg as the detach; 7864 * otherwise someone could attach a new sibling, change the GUID, and 7865 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 7866 */ 7867 if (unspare) { 7868 ASSERT(cvd->vdev_isspare); 7869 spa_spare_remove(cvd); 7870 unspare_guid = cvd->vdev_guid; 7871 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 7872 cvd->vdev_unspare = B_TRUE; 7873 } 7874 7875 /* 7876 * If the parent mirror/replacing vdev only has one child, 7877 * the parent is no longer needed. Remove it from the tree. 7878 */ 7879 if (pvd->vdev_children == 1) { 7880 if (pvd->vdev_ops == &vdev_spare_ops) 7881 cvd->vdev_unspare = B_FALSE; 7882 vdev_remove_parent(cvd); 7883 } 7884 7885 /* 7886 * We don't set tvd until now because the parent we just removed 7887 * may have been the previous top-level vdev. 7888 */ 7889 tvd = cvd->vdev_top; 7890 ASSERT(tvd->vdev_parent == rvd); 7891 7892 /* 7893 * Reevaluate the parent vdev state. 7894 */ 7895 vdev_propagate_state(cvd); 7896 7897 /* 7898 * If the 'autoexpand' property is set on the pool then automatically 7899 * try to expand the size of the pool. For example if the device we 7900 * just detached was smaller than the others, it may be possible to 7901 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 7902 * first so that we can obtain the updated sizes of the leaf vdevs. 7903 */ 7904 if (spa->spa_autoexpand) { 7905 vdev_reopen(tvd); 7906 vdev_expand(tvd, txg); 7907 } 7908 7909 vdev_config_dirty(tvd); 7910 7911 /* 7912 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 7913 * vd->vdev_detached is set and free vd's DTL object in syncing context. 7914 * But first make sure we're not on any *other* txg's DTL list, to 7915 * prevent vd from being accessed after it's freed. 7916 */ 7917 vdpath = spa_strdup(vd->vdev_path ? vd->vdev_path : "none"); 7918 for (int t = 0; t < TXG_SIZE; t++) 7919 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 7920 vd->vdev_detached = B_TRUE; 7921 vdev_dirty(tvd, VDD_DTL, vd, txg); 7922 7923 spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE); 7924 spa_notify_waiters(spa); 7925 7926 /* hang on to the spa before we release the lock */ 7927 spa_open_ref(spa, FTAG); 7928 7929 error = spa_vdev_exit(spa, vd, txg, 0); 7930 7931 spa_history_log_internal(spa, "detach", NULL, 7932 "vdev=%s", vdpath); 7933 spa_strfree(vdpath); 7934 7935 /* 7936 * If this was the removal of the original device in a hot spare vdev, 7937 * then we want to go through and remove the device from the hot spare 7938 * list of every other pool. 7939 */ 7940 if (unspare) { 7941 spa_t *altspa = NULL; 7942 7943 mutex_enter(&spa_namespace_lock); 7944 while ((altspa = spa_next(altspa)) != NULL) { 7945 if (altspa->spa_state != POOL_STATE_ACTIVE || 7946 altspa == spa) 7947 continue; 7948 7949 spa_open_ref(altspa, FTAG); 7950 mutex_exit(&spa_namespace_lock); 7951 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 7952 mutex_enter(&spa_namespace_lock); 7953 spa_close(altspa, FTAG); 7954 } 7955 mutex_exit(&spa_namespace_lock); 7956 7957 /* search the rest of the vdevs for spares to remove */ 7958 spa_vdev_resilver_done(spa); 7959 } 7960 7961 /* all done with the spa; OK to release */ 7962 mutex_enter(&spa_namespace_lock); 7963 spa_close(spa, FTAG); 7964 mutex_exit(&spa_namespace_lock); 7965 7966 return (error); 7967 } 7968 7969 static int 7970 spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, 7971 list_t *vd_list) 7972 { 7973 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 7974 7975 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 7976 7977 /* Look up vdev and ensure it's a leaf. */ 7978 vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); 7979 if (vd == NULL || vd->vdev_detached) { 7980 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7981 return (SET_ERROR(ENODEV)); 7982 } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { 7983 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7984 return (SET_ERROR(EINVAL)); 7985 } else if (!vdev_writeable(vd)) { 7986 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7987 return (SET_ERROR(EROFS)); 7988 } 7989 mutex_enter(&vd->vdev_initialize_lock); 7990 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7991 7992 /* 7993 * When we activate an initialize action we check to see 7994 * if the vdev_initialize_thread is NULL. We do this instead 7995 * of using the vdev_initialize_state since there might be 7996 * a previous initialization process which has completed but 7997 * the thread is not exited. 7998 */ 7999 if (cmd_type == POOL_INITIALIZE_START && 8000 (vd->vdev_initialize_thread != NULL || 8001 vd->vdev_top->vdev_removing || vd->vdev_top->vdev_rz_expanding)) { 8002 mutex_exit(&vd->vdev_initialize_lock); 8003 return (SET_ERROR(EBUSY)); 8004 } else if (cmd_type == POOL_INITIALIZE_CANCEL && 8005 (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE && 8006 vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) { 8007 mutex_exit(&vd->vdev_initialize_lock); 8008 return (SET_ERROR(ESRCH)); 8009 } else if (cmd_type == POOL_INITIALIZE_SUSPEND && 8010 vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) { 8011 mutex_exit(&vd->vdev_initialize_lock); 8012 return (SET_ERROR(ESRCH)); 8013 } else if (cmd_type == POOL_INITIALIZE_UNINIT && 8014 vd->vdev_initialize_thread != NULL) { 8015 mutex_exit(&vd->vdev_initialize_lock); 8016 return (SET_ERROR(EBUSY)); 8017 } 8018 8019 switch (cmd_type) { 8020 case POOL_INITIALIZE_START: 8021 vdev_initialize(vd); 8022 break; 8023 case POOL_INITIALIZE_CANCEL: 8024 vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, vd_list); 8025 break; 8026 case POOL_INITIALIZE_SUSPEND: 8027 vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list); 8028 break; 8029 case POOL_INITIALIZE_UNINIT: 8030 vdev_uninitialize(vd); 8031 break; 8032 default: 8033 panic("invalid cmd_type %llu", (unsigned long long)cmd_type); 8034 } 8035 mutex_exit(&vd->vdev_initialize_lock); 8036 8037 return (0); 8038 } 8039 8040 int 8041 spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, 8042 nvlist_t *vdev_errlist) 8043 { 8044 int total_errors = 0; 8045 list_t vd_list; 8046 8047 list_create(&vd_list, sizeof (vdev_t), 8048 offsetof(vdev_t, vdev_initialize_node)); 8049 8050 /* 8051 * We hold the namespace lock through the whole function 8052 * to prevent any changes to the pool while we're starting or 8053 * stopping initialization. The config and state locks are held so that 8054 * we can properly assess the vdev state before we commit to 8055 * the initializing operation. 8056 */ 8057 mutex_enter(&spa_namespace_lock); 8058 8059 for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); 8060 pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { 8061 uint64_t vdev_guid = fnvpair_value_uint64(pair); 8062 8063 int error = spa_vdev_initialize_impl(spa, vdev_guid, cmd_type, 8064 &vd_list); 8065 if (error != 0) { 8066 char guid_as_str[MAXNAMELEN]; 8067 8068 (void) snprintf(guid_as_str, sizeof (guid_as_str), 8069 "%llu", (unsigned long long)vdev_guid); 8070 fnvlist_add_int64(vdev_errlist, guid_as_str, error); 8071 total_errors++; 8072 } 8073 } 8074 8075 /* Wait for all initialize threads to stop. */ 8076 vdev_initialize_stop_wait(spa, &vd_list); 8077 8078 /* Sync out the initializing state */ 8079 txg_wait_synced(spa->spa_dsl_pool, 0); 8080 mutex_exit(&spa_namespace_lock); 8081 8082 list_destroy(&vd_list); 8083 8084 return (total_errors); 8085 } 8086 8087 static int 8088 spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, 8089 uint64_t rate, boolean_t partial, boolean_t secure, list_t *vd_list) 8090 { 8091 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 8092 8093 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 8094 8095 /* Look up vdev and ensure it's a leaf. */ 8096 vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); 8097 if (vd == NULL || vd->vdev_detached) { 8098 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8099 return (SET_ERROR(ENODEV)); 8100 } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { 8101 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8102 return (SET_ERROR(EINVAL)); 8103 } else if (!vdev_writeable(vd)) { 8104 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8105 return (SET_ERROR(EROFS)); 8106 } else if (!vd->vdev_has_trim) { 8107 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8108 return (SET_ERROR(EOPNOTSUPP)); 8109 } else if (secure && !vd->vdev_has_securetrim) { 8110 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8111 return (SET_ERROR(EOPNOTSUPP)); 8112 } 8113 mutex_enter(&vd->vdev_trim_lock); 8114 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 8115 8116 /* 8117 * When we activate a TRIM action we check to see if the 8118 * vdev_trim_thread is NULL. We do this instead of using the 8119 * vdev_trim_state since there might be a previous TRIM process 8120 * which has completed but the thread is not exited. 8121 */ 8122 if (cmd_type == POOL_TRIM_START && 8123 (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing || 8124 vd->vdev_top->vdev_rz_expanding)) { 8125 mutex_exit(&vd->vdev_trim_lock); 8126 return (SET_ERROR(EBUSY)); 8127 } else if (cmd_type == POOL_TRIM_CANCEL && 8128 (vd->vdev_trim_state != VDEV_TRIM_ACTIVE && 8129 vd->vdev_trim_state != VDEV_TRIM_SUSPENDED)) { 8130 mutex_exit(&vd->vdev_trim_lock); 8131 return (SET_ERROR(ESRCH)); 8132 } else if (cmd_type == POOL_TRIM_SUSPEND && 8133 vd->vdev_trim_state != VDEV_TRIM_ACTIVE) { 8134 mutex_exit(&vd->vdev_trim_lock); 8135 return (SET_ERROR(ESRCH)); 8136 } 8137 8138 switch (cmd_type) { 8139 case POOL_TRIM_START: 8140 vdev_trim(vd, rate, partial, secure); 8141 break; 8142 case POOL_TRIM_CANCEL: 8143 vdev_trim_stop(vd, VDEV_TRIM_CANCELED, vd_list); 8144 break; 8145 case POOL_TRIM_SUSPEND: 8146 vdev_trim_stop(vd, VDEV_TRIM_SUSPENDED, vd_list); 8147 break; 8148 default: 8149 panic("invalid cmd_type %llu", (unsigned long long)cmd_type); 8150 } 8151 mutex_exit(&vd->vdev_trim_lock); 8152 8153 return (0); 8154 } 8155 8156 /* 8157 * Initiates a manual TRIM for the requested vdevs. This kicks off individual 8158 * TRIM threads for each child vdev. These threads pass over all of the free 8159 * space in the vdev's metaslabs and issues TRIM commands for that space. 8160 */ 8161 int 8162 spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate, 8163 boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist) 8164 { 8165 int total_errors = 0; 8166 list_t vd_list; 8167 8168 list_create(&vd_list, sizeof (vdev_t), 8169 offsetof(vdev_t, vdev_trim_node)); 8170 8171 /* 8172 * We hold the namespace lock through the whole function 8173 * to prevent any changes to the pool while we're starting or 8174 * stopping TRIM. The config and state locks are held so that 8175 * we can properly assess the vdev state before we commit to 8176 * the TRIM operation. 8177 */ 8178 mutex_enter(&spa_namespace_lock); 8179 8180 for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); 8181 pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { 8182 uint64_t vdev_guid = fnvpair_value_uint64(pair); 8183 8184 int error = spa_vdev_trim_impl(spa, vdev_guid, cmd_type, 8185 rate, partial, secure, &vd_list); 8186 if (error != 0) { 8187 char guid_as_str[MAXNAMELEN]; 8188 8189 (void) snprintf(guid_as_str, sizeof (guid_as_str), 8190 "%llu", (unsigned long long)vdev_guid); 8191 fnvlist_add_int64(vdev_errlist, guid_as_str, error); 8192 total_errors++; 8193 } 8194 } 8195 8196 /* Wait for all TRIM threads to stop. */ 8197 vdev_trim_stop_wait(spa, &vd_list); 8198 8199 /* Sync out the TRIM state */ 8200 txg_wait_synced(spa->spa_dsl_pool, 0); 8201 mutex_exit(&spa_namespace_lock); 8202 8203 list_destroy(&vd_list); 8204 8205 return (total_errors); 8206 } 8207 8208 /* 8209 * Split a set of devices from their mirrors, and create a new pool from them. 8210 */ 8211 int 8212 spa_vdev_split_mirror(spa_t *spa, const char *newname, nvlist_t *config, 8213 nvlist_t *props, boolean_t exp) 8214 { 8215 int error = 0; 8216 uint64_t txg, *glist; 8217 spa_t *newspa; 8218 uint_t c, children, lastlog; 8219 nvlist_t **child, *nvl, *tmp; 8220 dmu_tx_t *tx; 8221 const char *altroot = NULL; 8222 vdev_t *rvd, **vml = NULL; /* vdev modify list */ 8223 boolean_t activate_slog; 8224 8225 ASSERT(spa_writeable(spa)); 8226 8227 txg = spa_vdev_enter(spa); 8228 8229 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 8230 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 8231 error = (spa_has_checkpoint(spa)) ? 8232 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 8233 return (spa_vdev_exit(spa, NULL, txg, error)); 8234 } 8235 8236 /* clear the log and flush everything up to now */ 8237 activate_slog = spa_passivate_log(spa); 8238 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 8239 error = spa_reset_logs(spa); 8240 txg = spa_vdev_config_enter(spa); 8241 8242 if (activate_slog) 8243 spa_activate_log(spa); 8244 8245 if (error != 0) 8246 return (spa_vdev_exit(spa, NULL, txg, error)); 8247 8248 /* check new spa name before going any further */ 8249 if (spa_lookup(newname) != NULL) 8250 return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 8251 8252 /* 8253 * scan through all the children to ensure they're all mirrors 8254 */ 8255 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 8256 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 8257 &children) != 0) 8258 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 8259 8260 /* first, check to ensure we've got the right child count */ 8261 rvd = spa->spa_root_vdev; 8262 lastlog = 0; 8263 for (c = 0; c < rvd->vdev_children; c++) { 8264 vdev_t *vd = rvd->vdev_child[c]; 8265 8266 /* don't count the holes & logs as children */ 8267 if (vd->vdev_islog || (vd->vdev_ops != &vdev_indirect_ops && 8268 !vdev_is_concrete(vd))) { 8269 if (lastlog == 0) 8270 lastlog = c; 8271 continue; 8272 } 8273 8274 lastlog = 0; 8275 } 8276 if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) 8277 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 8278 8279 /* next, ensure no spare or cache devices are part of the split */ 8280 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 8281 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 8282 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 8283 8284 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 8285 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 8286 8287 /* then, loop over each vdev and validate it */ 8288 for (c = 0; c < children; c++) { 8289 uint64_t is_hole = 0; 8290 8291 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 8292 &is_hole); 8293 8294 if (is_hole != 0) { 8295 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 8296 spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 8297 continue; 8298 } else { 8299 error = SET_ERROR(EINVAL); 8300 break; 8301 } 8302 } 8303 8304 /* deal with indirect vdevs */ 8305 if (spa->spa_root_vdev->vdev_child[c]->vdev_ops == 8306 &vdev_indirect_ops) 8307 continue; 8308 8309 /* which disk is going to be split? */ 8310 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 8311 &glist[c]) != 0) { 8312 error = SET_ERROR(EINVAL); 8313 break; 8314 } 8315 8316 /* look it up in the spa */ 8317 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 8318 if (vml[c] == NULL) { 8319 error = SET_ERROR(ENODEV); 8320 break; 8321 } 8322 8323 /* make sure there's nothing stopping the split */ 8324 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 8325 vml[c]->vdev_islog || 8326 !vdev_is_concrete(vml[c]) || 8327 vml[c]->vdev_isspare || 8328 vml[c]->vdev_isl2cache || 8329 !vdev_writeable(vml[c]) || 8330 vml[c]->vdev_children != 0 || 8331 vml[c]->vdev_state != VDEV_STATE_HEALTHY || 8332 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 8333 error = SET_ERROR(EINVAL); 8334 break; 8335 } 8336 8337 if (vdev_dtl_required(vml[c]) || 8338 vdev_resilver_needed(vml[c], NULL, NULL)) { 8339 error = SET_ERROR(EBUSY); 8340 break; 8341 } 8342 8343 /* we need certain info from the top level */ 8344 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 8345 vml[c]->vdev_top->vdev_ms_array); 8346 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 8347 vml[c]->vdev_top->vdev_ms_shift); 8348 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 8349 vml[c]->vdev_top->vdev_asize); 8350 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 8351 vml[c]->vdev_top->vdev_ashift); 8352 8353 /* transfer per-vdev ZAPs */ 8354 ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); 8355 VERIFY0(nvlist_add_uint64(child[c], 8356 ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); 8357 8358 ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); 8359 VERIFY0(nvlist_add_uint64(child[c], 8360 ZPOOL_CONFIG_VDEV_TOP_ZAP, 8361 vml[c]->vdev_parent->vdev_top_zap)); 8362 } 8363 8364 if (error != 0) { 8365 kmem_free(vml, children * sizeof (vdev_t *)); 8366 kmem_free(glist, children * sizeof (uint64_t)); 8367 return (spa_vdev_exit(spa, NULL, txg, error)); 8368 } 8369 8370 /* stop writers from using the disks */ 8371 for (c = 0; c < children; c++) { 8372 if (vml[c] != NULL) 8373 vml[c]->vdev_offline = B_TRUE; 8374 } 8375 vdev_reopen(spa->spa_root_vdev); 8376 8377 /* 8378 * Temporarily record the splitting vdevs in the spa config. This 8379 * will disappear once the config is regenerated. 8380 */ 8381 nvl = fnvlist_alloc(); 8382 fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, glist, children); 8383 kmem_free(glist, children * sizeof (uint64_t)); 8384 8385 mutex_enter(&spa->spa_props_lock); 8386 fnvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, nvl); 8387 mutex_exit(&spa->spa_props_lock); 8388 spa->spa_config_splitting = nvl; 8389 vdev_config_dirty(spa->spa_root_vdev); 8390 8391 /* configure and create the new pool */ 8392 fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname); 8393 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 8394 exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE); 8395 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)); 8396 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg); 8397 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 8398 spa_generate_guid(NULL)); 8399 VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 8400 (void) nvlist_lookup_string(props, 8401 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 8402 8403 /* add the new pool to the namespace */ 8404 newspa = spa_add(newname, config, altroot); 8405 newspa->spa_avz_action = AVZ_ACTION_REBUILD; 8406 newspa->spa_config_txg = spa->spa_config_txg; 8407 spa_set_log_state(newspa, SPA_LOG_CLEAR); 8408 8409 /* release the spa config lock, retaining the namespace lock */ 8410 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 8411 8412 if (zio_injection_enabled) 8413 zio_handle_panic_injection(spa, FTAG, 1); 8414 8415 spa_activate(newspa, spa_mode_global); 8416 spa_async_suspend(newspa); 8417 8418 /* 8419 * Temporarily stop the initializing and TRIM activity. We set the 8420 * state to ACTIVE so that we know to resume initializing or TRIM 8421 * once the split has completed. 8422 */ 8423 list_t vd_initialize_list; 8424 list_create(&vd_initialize_list, sizeof (vdev_t), 8425 offsetof(vdev_t, vdev_initialize_node)); 8426 8427 list_t vd_trim_list; 8428 list_create(&vd_trim_list, sizeof (vdev_t), 8429 offsetof(vdev_t, vdev_trim_node)); 8430 8431 for (c = 0; c < children; c++) { 8432 if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { 8433 mutex_enter(&vml[c]->vdev_initialize_lock); 8434 vdev_initialize_stop(vml[c], 8435 VDEV_INITIALIZE_ACTIVE, &vd_initialize_list); 8436 mutex_exit(&vml[c]->vdev_initialize_lock); 8437 8438 mutex_enter(&vml[c]->vdev_trim_lock); 8439 vdev_trim_stop(vml[c], VDEV_TRIM_ACTIVE, &vd_trim_list); 8440 mutex_exit(&vml[c]->vdev_trim_lock); 8441 } 8442 } 8443 8444 vdev_initialize_stop_wait(spa, &vd_initialize_list); 8445 vdev_trim_stop_wait(spa, &vd_trim_list); 8446 8447 list_destroy(&vd_initialize_list); 8448 list_destroy(&vd_trim_list); 8449 8450 newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT; 8451 newspa->spa_is_splitting = B_TRUE; 8452 8453 /* create the new pool from the disks of the original pool */ 8454 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE); 8455 if (error) 8456 goto out; 8457 8458 /* if that worked, generate a real config for the new pool */ 8459 if (newspa->spa_root_vdev != NULL) { 8460 newspa->spa_config_splitting = fnvlist_alloc(); 8461 fnvlist_add_uint64(newspa->spa_config_splitting, 8462 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)); 8463 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 8464 B_TRUE)); 8465 } 8466 8467 /* set the props */ 8468 if (props != NULL) { 8469 spa_configfile_set(newspa, props, B_FALSE); 8470 error = spa_prop_set(newspa, props); 8471 if (error) 8472 goto out; 8473 } 8474 8475 /* flush everything */ 8476 txg = spa_vdev_config_enter(newspa); 8477 vdev_config_dirty(newspa->spa_root_vdev); 8478 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 8479 8480 if (zio_injection_enabled) 8481 zio_handle_panic_injection(spa, FTAG, 2); 8482 8483 spa_async_resume(newspa); 8484 8485 /* finally, update the original pool's config */ 8486 txg = spa_vdev_config_enter(spa); 8487 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 8488 error = dmu_tx_assign(tx, TXG_WAIT); 8489 if (error != 0) 8490 dmu_tx_abort(tx); 8491 for (c = 0; c < children; c++) { 8492 if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { 8493 vdev_t *tvd = vml[c]->vdev_top; 8494 8495 /* 8496 * Need to be sure the detachable VDEV is not 8497 * on any *other* txg's DTL list to prevent it 8498 * from being accessed after it's freed. 8499 */ 8500 for (int t = 0; t < TXG_SIZE; t++) { 8501 (void) txg_list_remove_this( 8502 &tvd->vdev_dtl_list, vml[c], t); 8503 } 8504 8505 vdev_split(vml[c]); 8506 if (error == 0) 8507 spa_history_log_internal(spa, "detach", tx, 8508 "vdev=%s", vml[c]->vdev_path); 8509 8510 vdev_free(vml[c]); 8511 } 8512 } 8513 spa->spa_avz_action = AVZ_ACTION_REBUILD; 8514 vdev_config_dirty(spa->spa_root_vdev); 8515 spa->spa_config_splitting = NULL; 8516 nvlist_free(nvl); 8517 if (error == 0) 8518 dmu_tx_commit(tx); 8519 (void) spa_vdev_exit(spa, NULL, txg, 0); 8520 8521 if (zio_injection_enabled) 8522 zio_handle_panic_injection(spa, FTAG, 3); 8523 8524 /* split is complete; log a history record */ 8525 spa_history_log_internal(newspa, "split", NULL, 8526 "from pool %s", spa_name(spa)); 8527 8528 newspa->spa_is_splitting = B_FALSE; 8529 kmem_free(vml, children * sizeof (vdev_t *)); 8530 8531 /* if we're not going to mount the filesystems in userland, export */ 8532 if (exp) 8533 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 8534 B_FALSE, B_FALSE); 8535 8536 return (error); 8537 8538 out: 8539 spa_unload(newspa); 8540 spa_deactivate(newspa); 8541 spa_remove(newspa); 8542 8543 txg = spa_vdev_config_enter(spa); 8544 8545 /* re-online all offlined disks */ 8546 for (c = 0; c < children; c++) { 8547 if (vml[c] != NULL) 8548 vml[c]->vdev_offline = B_FALSE; 8549 } 8550 8551 /* restart initializing or trimming disks as necessary */ 8552 spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); 8553 spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); 8554 spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); 8555 8556 vdev_reopen(spa->spa_root_vdev); 8557 8558 nvlist_free(spa->spa_config_splitting); 8559 spa->spa_config_splitting = NULL; 8560 (void) spa_vdev_exit(spa, NULL, txg, error); 8561 8562 kmem_free(vml, children * sizeof (vdev_t *)); 8563 return (error); 8564 } 8565 8566 /* 8567 * Find any device that's done replacing, or a vdev marked 'unspare' that's 8568 * currently spared, so we can detach it. 8569 */ 8570 static vdev_t * 8571 spa_vdev_resilver_done_hunt(vdev_t *vd) 8572 { 8573 vdev_t *newvd, *oldvd; 8574 8575 for (int c = 0; c < vd->vdev_children; c++) { 8576 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 8577 if (oldvd != NULL) 8578 return (oldvd); 8579 } 8580 8581 /* 8582 * Check for a completed replacement. We always consider the first 8583 * vdev in the list to be the oldest vdev, and the last one to be 8584 * the newest (see spa_vdev_attach() for how that works). In 8585 * the case where the newest vdev is faulted, we will not automatically 8586 * remove it after a resilver completes. This is OK as it will require 8587 * user intervention to determine which disk the admin wishes to keep. 8588 */ 8589 if (vd->vdev_ops == &vdev_replacing_ops) { 8590 ASSERT(vd->vdev_children > 1); 8591 8592 newvd = vd->vdev_child[vd->vdev_children - 1]; 8593 oldvd = vd->vdev_child[0]; 8594 8595 if (vdev_dtl_empty(newvd, DTL_MISSING) && 8596 vdev_dtl_empty(newvd, DTL_OUTAGE) && 8597 !vdev_dtl_required(oldvd)) 8598 return (oldvd); 8599 } 8600 8601 /* 8602 * Check for a completed resilver with the 'unspare' flag set. 8603 * Also potentially update faulted state. 8604 */ 8605 if (vd->vdev_ops == &vdev_spare_ops) { 8606 vdev_t *first = vd->vdev_child[0]; 8607 vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 8608 8609 if (last->vdev_unspare) { 8610 oldvd = first; 8611 newvd = last; 8612 } else if (first->vdev_unspare) { 8613 oldvd = last; 8614 newvd = first; 8615 } else { 8616 oldvd = NULL; 8617 } 8618 8619 if (oldvd != NULL && 8620 vdev_dtl_empty(newvd, DTL_MISSING) && 8621 vdev_dtl_empty(newvd, DTL_OUTAGE) && 8622 !vdev_dtl_required(oldvd)) 8623 return (oldvd); 8624 8625 vdev_propagate_state(vd); 8626 8627 /* 8628 * If there are more than two spares attached to a disk, 8629 * and those spares are not required, then we want to 8630 * attempt to free them up now so that they can be used 8631 * by other pools. Once we're back down to a single 8632 * disk+spare, we stop removing them. 8633 */ 8634 if (vd->vdev_children > 2) { 8635 newvd = vd->vdev_child[1]; 8636 8637 if (newvd->vdev_isspare && last->vdev_isspare && 8638 vdev_dtl_empty(last, DTL_MISSING) && 8639 vdev_dtl_empty(last, DTL_OUTAGE) && 8640 !vdev_dtl_required(newvd)) 8641 return (newvd); 8642 } 8643 } 8644 8645 return (NULL); 8646 } 8647 8648 static void 8649 spa_vdev_resilver_done(spa_t *spa) 8650 { 8651 vdev_t *vd, *pvd, *ppvd; 8652 uint64_t guid, sguid, pguid, ppguid; 8653 8654 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 8655 8656 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 8657 pvd = vd->vdev_parent; 8658 ppvd = pvd->vdev_parent; 8659 guid = vd->vdev_guid; 8660 pguid = pvd->vdev_guid; 8661 ppguid = ppvd->vdev_guid; 8662 sguid = 0; 8663 /* 8664 * If we have just finished replacing a hot spared device, then 8665 * we need to detach the parent's first child (the original hot 8666 * spare) as well. 8667 */ 8668 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 8669 ppvd->vdev_children == 2) { 8670 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 8671 sguid = ppvd->vdev_child[1]->vdev_guid; 8672 } 8673 ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); 8674 8675 spa_config_exit(spa, SCL_ALL, FTAG); 8676 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 8677 return; 8678 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 8679 return; 8680 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 8681 } 8682 8683 spa_config_exit(spa, SCL_ALL, FTAG); 8684 8685 /* 8686 * If a detach was not performed above replace waiters will not have 8687 * been notified. In which case we must do so now. 8688 */ 8689 spa_notify_waiters(spa); 8690 } 8691 8692 /* 8693 * Update the stored path or FRU for this vdev. 8694 */ 8695 static int 8696 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 8697 boolean_t ispath) 8698 { 8699 vdev_t *vd; 8700 boolean_t sync = B_FALSE; 8701 8702 ASSERT(spa_writeable(spa)); 8703 8704 spa_vdev_state_enter(spa, SCL_ALL); 8705 8706 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 8707 return (spa_vdev_state_exit(spa, NULL, ENOENT)); 8708 8709 if (!vd->vdev_ops->vdev_op_leaf) 8710 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 8711 8712 if (ispath) { 8713 if (strcmp(value, vd->vdev_path) != 0) { 8714 spa_strfree(vd->vdev_path); 8715 vd->vdev_path = spa_strdup(value); 8716 sync = B_TRUE; 8717 } 8718 } else { 8719 if (vd->vdev_fru == NULL) { 8720 vd->vdev_fru = spa_strdup(value); 8721 sync = B_TRUE; 8722 } else if (strcmp(value, vd->vdev_fru) != 0) { 8723 spa_strfree(vd->vdev_fru); 8724 vd->vdev_fru = spa_strdup(value); 8725 sync = B_TRUE; 8726 } 8727 } 8728 8729 return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); 8730 } 8731 8732 int 8733 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 8734 { 8735 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 8736 } 8737 8738 int 8739 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 8740 { 8741 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 8742 } 8743 8744 /* 8745 * ========================================================================== 8746 * SPA Scanning 8747 * ========================================================================== 8748 */ 8749 int 8750 spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd) 8751 { 8752 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 8753 8754 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 8755 return (SET_ERROR(EBUSY)); 8756 8757 return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd)); 8758 } 8759 8760 int 8761 spa_scan_stop(spa_t *spa) 8762 { 8763 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 8764 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 8765 return (SET_ERROR(EBUSY)); 8766 8767 return (dsl_scan_cancel(spa->spa_dsl_pool)); 8768 } 8769 8770 int 8771 spa_scan(spa_t *spa, pool_scan_func_t func) 8772 { 8773 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 8774 8775 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 8776 return (SET_ERROR(ENOTSUP)); 8777 8778 if (func == POOL_SCAN_RESILVER && 8779 !spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) 8780 return (SET_ERROR(ENOTSUP)); 8781 8782 /* 8783 * If a resilver was requested, but there is no DTL on a 8784 * writeable leaf device, we have nothing to do. 8785 */ 8786 if (func == POOL_SCAN_RESILVER && 8787 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 8788 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 8789 return (0); 8790 } 8791 8792 if (func == POOL_SCAN_ERRORSCRUB && 8793 !spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) 8794 return (SET_ERROR(ENOTSUP)); 8795 8796 return (dsl_scan(spa->spa_dsl_pool, func)); 8797 } 8798 8799 /* 8800 * ========================================================================== 8801 * SPA async task processing 8802 * ========================================================================== 8803 */ 8804 8805 static void 8806 spa_async_remove(spa_t *spa, vdev_t *vd) 8807 { 8808 if (vd->vdev_remove_wanted) { 8809 vd->vdev_remove_wanted = B_FALSE; 8810 vd->vdev_delayed_close = B_FALSE; 8811 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 8812 8813 /* 8814 * We want to clear the stats, but we don't want to do a full 8815 * vdev_clear() as that will cause us to throw away 8816 * degraded/faulted state as well as attempt to reopen the 8817 * device, all of which is a waste. 8818 */ 8819 vd->vdev_stat.vs_read_errors = 0; 8820 vd->vdev_stat.vs_write_errors = 0; 8821 vd->vdev_stat.vs_checksum_errors = 0; 8822 8823 vdev_state_dirty(vd->vdev_top); 8824 8825 /* Tell userspace that the vdev is gone. */ 8826 zfs_post_remove(spa, vd); 8827 } 8828 8829 for (int c = 0; c < vd->vdev_children; c++) 8830 spa_async_remove(spa, vd->vdev_child[c]); 8831 } 8832 8833 static void 8834 spa_async_fault_vdev(spa_t *spa, vdev_t *vd) 8835 { 8836 if (vd->vdev_fault_wanted) { 8837 vd->vdev_fault_wanted = B_FALSE; 8838 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, 8839 VDEV_AUX_ERR_EXCEEDED); 8840 } 8841 8842 for (int c = 0; c < vd->vdev_children; c++) 8843 spa_async_fault_vdev(spa, vd->vdev_child[c]); 8844 } 8845 8846 static void 8847 spa_async_autoexpand(spa_t *spa, vdev_t *vd) 8848 { 8849 if (!spa->spa_autoexpand) 8850 return; 8851 8852 for (int c = 0; c < vd->vdev_children; c++) { 8853 vdev_t *cvd = vd->vdev_child[c]; 8854 spa_async_autoexpand(spa, cvd); 8855 } 8856 8857 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 8858 return; 8859 8860 spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_AUTOEXPAND); 8861 } 8862 8863 static __attribute__((noreturn)) void 8864 spa_async_thread(void *arg) 8865 { 8866 spa_t *spa = (spa_t *)arg; 8867 dsl_pool_t *dp = spa->spa_dsl_pool; 8868 int tasks; 8869 8870 ASSERT(spa->spa_sync_on); 8871 8872 mutex_enter(&spa->spa_async_lock); 8873 tasks = spa->spa_async_tasks; 8874 spa->spa_async_tasks = 0; 8875 mutex_exit(&spa->spa_async_lock); 8876 8877 /* 8878 * See if the config needs to be updated. 8879 */ 8880 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 8881 uint64_t old_space, new_space; 8882 8883 mutex_enter(&spa_namespace_lock); 8884 old_space = metaslab_class_get_space(spa_normal_class(spa)); 8885 old_space += metaslab_class_get_space(spa_special_class(spa)); 8886 old_space += metaslab_class_get_space(spa_dedup_class(spa)); 8887 old_space += metaslab_class_get_space( 8888 spa_embedded_log_class(spa)); 8889 8890 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 8891 8892 new_space = metaslab_class_get_space(spa_normal_class(spa)); 8893 new_space += metaslab_class_get_space(spa_special_class(spa)); 8894 new_space += metaslab_class_get_space(spa_dedup_class(spa)); 8895 new_space += metaslab_class_get_space( 8896 spa_embedded_log_class(spa)); 8897 mutex_exit(&spa_namespace_lock); 8898 8899 /* 8900 * If the pool grew as a result of the config update, 8901 * then log an internal history event. 8902 */ 8903 if (new_space != old_space) { 8904 spa_history_log_internal(spa, "vdev online", NULL, 8905 "pool '%s' size: %llu(+%llu)", 8906 spa_name(spa), (u_longlong_t)new_space, 8907 (u_longlong_t)(new_space - old_space)); 8908 } 8909 } 8910 8911 /* 8912 * See if any devices need to be marked REMOVED. 8913 */ 8914 if (tasks & SPA_ASYNC_REMOVE) { 8915 spa_vdev_state_enter(spa, SCL_NONE); 8916 spa_async_remove(spa, spa->spa_root_vdev); 8917 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 8918 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 8919 for (int i = 0; i < spa->spa_spares.sav_count; i++) 8920 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 8921 (void) spa_vdev_state_exit(spa, NULL, 0); 8922 } 8923 8924 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 8925 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8926 spa_async_autoexpand(spa, spa->spa_root_vdev); 8927 spa_config_exit(spa, SCL_CONFIG, FTAG); 8928 } 8929 8930 /* 8931 * See if any devices need to be marked faulted. 8932 */ 8933 if (tasks & SPA_ASYNC_FAULT_VDEV) { 8934 spa_vdev_state_enter(spa, SCL_NONE); 8935 spa_async_fault_vdev(spa, spa->spa_root_vdev); 8936 (void) spa_vdev_state_exit(spa, NULL, 0); 8937 } 8938 8939 /* 8940 * If any devices are done replacing, detach them. 8941 */ 8942 if (tasks & SPA_ASYNC_RESILVER_DONE || 8943 tasks & SPA_ASYNC_REBUILD_DONE || 8944 tasks & SPA_ASYNC_DETACH_SPARE) { 8945 spa_vdev_resilver_done(spa); 8946 } 8947 8948 /* 8949 * Kick off a resilver. 8950 */ 8951 if (tasks & SPA_ASYNC_RESILVER && 8952 !vdev_rebuild_active(spa->spa_root_vdev) && 8953 (!dsl_scan_resilvering(dp) || 8954 !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))) 8955 dsl_scan_restart_resilver(dp, 0); 8956 8957 if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { 8958 mutex_enter(&spa_namespace_lock); 8959 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8960 vdev_initialize_restart(spa->spa_root_vdev); 8961 spa_config_exit(spa, SCL_CONFIG, FTAG); 8962 mutex_exit(&spa_namespace_lock); 8963 } 8964 8965 if (tasks & SPA_ASYNC_TRIM_RESTART) { 8966 mutex_enter(&spa_namespace_lock); 8967 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8968 vdev_trim_restart(spa->spa_root_vdev); 8969 spa_config_exit(spa, SCL_CONFIG, FTAG); 8970 mutex_exit(&spa_namespace_lock); 8971 } 8972 8973 if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) { 8974 mutex_enter(&spa_namespace_lock); 8975 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8976 vdev_autotrim_restart(spa); 8977 spa_config_exit(spa, SCL_CONFIG, FTAG); 8978 mutex_exit(&spa_namespace_lock); 8979 } 8980 8981 /* 8982 * Kick off L2 cache whole device TRIM. 8983 */ 8984 if (tasks & SPA_ASYNC_L2CACHE_TRIM) { 8985 mutex_enter(&spa_namespace_lock); 8986 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8987 vdev_trim_l2arc(spa); 8988 spa_config_exit(spa, SCL_CONFIG, FTAG); 8989 mutex_exit(&spa_namespace_lock); 8990 } 8991 8992 /* 8993 * Kick off L2 cache rebuilding. 8994 */ 8995 if (tasks & SPA_ASYNC_L2CACHE_REBUILD) { 8996 mutex_enter(&spa_namespace_lock); 8997 spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER); 8998 l2arc_spa_rebuild_start(spa); 8999 spa_config_exit(spa, SCL_L2ARC, FTAG); 9000 mutex_exit(&spa_namespace_lock); 9001 } 9002 9003 /* 9004 * Let the world know that we're done. 9005 */ 9006 mutex_enter(&spa->spa_async_lock); 9007 spa->spa_async_thread = NULL; 9008 cv_broadcast(&spa->spa_async_cv); 9009 mutex_exit(&spa->spa_async_lock); 9010 thread_exit(); 9011 } 9012 9013 void 9014 spa_async_suspend(spa_t *spa) 9015 { 9016 mutex_enter(&spa->spa_async_lock); 9017 spa->spa_async_suspended++; 9018 while (spa->spa_async_thread != NULL) 9019 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 9020 mutex_exit(&spa->spa_async_lock); 9021 9022 spa_vdev_remove_suspend(spa); 9023 9024 zthr_t *condense_thread = spa->spa_condense_zthr; 9025 if (condense_thread != NULL) 9026 zthr_cancel(condense_thread); 9027 9028 zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr; 9029 if (raidz_expand_thread != NULL) 9030 zthr_cancel(raidz_expand_thread); 9031 9032 zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; 9033 if (discard_thread != NULL) 9034 zthr_cancel(discard_thread); 9035 9036 zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; 9037 if (ll_delete_thread != NULL) 9038 zthr_cancel(ll_delete_thread); 9039 9040 zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; 9041 if (ll_condense_thread != NULL) 9042 zthr_cancel(ll_condense_thread); 9043 } 9044 9045 void 9046 spa_async_resume(spa_t *spa) 9047 { 9048 mutex_enter(&spa->spa_async_lock); 9049 ASSERT(spa->spa_async_suspended != 0); 9050 spa->spa_async_suspended--; 9051 mutex_exit(&spa->spa_async_lock); 9052 spa_restart_removal(spa); 9053 9054 zthr_t *condense_thread = spa->spa_condense_zthr; 9055 if (condense_thread != NULL) 9056 zthr_resume(condense_thread); 9057 9058 zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr; 9059 if (raidz_expand_thread != NULL) 9060 zthr_resume(raidz_expand_thread); 9061 9062 zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; 9063 if (discard_thread != NULL) 9064 zthr_resume(discard_thread); 9065 9066 zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; 9067 if (ll_delete_thread != NULL) 9068 zthr_resume(ll_delete_thread); 9069 9070 zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; 9071 if (ll_condense_thread != NULL) 9072 zthr_resume(ll_condense_thread); 9073 } 9074 9075 static boolean_t 9076 spa_async_tasks_pending(spa_t *spa) 9077 { 9078 uint_t non_config_tasks; 9079 uint_t config_task; 9080 boolean_t config_task_suspended; 9081 9082 non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE; 9083 config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; 9084 if (spa->spa_ccw_fail_time == 0) { 9085 config_task_suspended = B_FALSE; 9086 } else { 9087 config_task_suspended = 9088 (gethrtime() - spa->spa_ccw_fail_time) < 9089 ((hrtime_t)zfs_ccw_retry_interval * NANOSEC); 9090 } 9091 9092 return (non_config_tasks || (config_task && !config_task_suspended)); 9093 } 9094 9095 static void 9096 spa_async_dispatch(spa_t *spa) 9097 { 9098 mutex_enter(&spa->spa_async_lock); 9099 if (spa_async_tasks_pending(spa) && 9100 !spa->spa_async_suspended && 9101 spa->spa_async_thread == NULL) 9102 spa->spa_async_thread = thread_create(NULL, 0, 9103 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 9104 mutex_exit(&spa->spa_async_lock); 9105 } 9106 9107 void 9108 spa_async_request(spa_t *spa, int task) 9109 { 9110 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 9111 mutex_enter(&spa->spa_async_lock); 9112 spa->spa_async_tasks |= task; 9113 mutex_exit(&spa->spa_async_lock); 9114 } 9115 9116 int 9117 spa_async_tasks(spa_t *spa) 9118 { 9119 return (spa->spa_async_tasks); 9120 } 9121 9122 /* 9123 * ========================================================================== 9124 * SPA syncing routines 9125 * ========================================================================== 9126 */ 9127 9128 9129 static int 9130 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 9131 dmu_tx_t *tx) 9132 { 9133 bpobj_t *bpo = arg; 9134 bpobj_enqueue(bpo, bp, bp_freed, tx); 9135 return (0); 9136 } 9137 9138 int 9139 bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 9140 { 9141 return (bpobj_enqueue_cb(arg, bp, B_FALSE, tx)); 9142 } 9143 9144 int 9145 bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 9146 { 9147 return (bpobj_enqueue_cb(arg, bp, B_TRUE, tx)); 9148 } 9149 9150 static int 9151 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 9152 { 9153 zio_t *pio = arg; 9154 9155 zio_nowait(zio_free_sync(pio, pio->io_spa, dmu_tx_get_txg(tx), bp, 9156 pio->io_flags)); 9157 return (0); 9158 } 9159 9160 static int 9161 bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 9162 dmu_tx_t *tx) 9163 { 9164 ASSERT(!bp_freed); 9165 return (spa_free_sync_cb(arg, bp, tx)); 9166 } 9167 9168 /* 9169 * Note: this simple function is not inlined to make it easier to dtrace the 9170 * amount of time spent syncing frees. 9171 */ 9172 static void 9173 spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) 9174 { 9175 zio_t *zio = zio_root(spa, NULL, NULL, 0); 9176 bplist_iterate(bpl, spa_free_sync_cb, zio, tx); 9177 VERIFY(zio_wait(zio) == 0); 9178 } 9179 9180 /* 9181 * Note: this simple function is not inlined to make it easier to dtrace the 9182 * amount of time spent syncing deferred frees. 9183 */ 9184 static void 9185 spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) 9186 { 9187 if (spa_sync_pass(spa) != 1) 9188 return; 9189 9190 /* 9191 * Note: 9192 * If the log space map feature is active, we stop deferring 9193 * frees to the next TXG and therefore running this function 9194 * would be considered a no-op as spa_deferred_bpobj should 9195 * not have any entries. 9196 * 9197 * That said we run this function anyway (instead of returning 9198 * immediately) for the edge-case scenario where we just 9199 * activated the log space map feature in this TXG but we have 9200 * deferred frees from the previous TXG. 9201 */ 9202 zio_t *zio = zio_root(spa, NULL, NULL, 0); 9203 VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, 9204 bpobj_spa_free_sync_cb, zio, tx), ==, 0); 9205 VERIFY0(zio_wait(zio)); 9206 } 9207 9208 static void 9209 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 9210 { 9211 char *packed = NULL; 9212 size_t bufsize; 9213 size_t nvsize = 0; 9214 dmu_buf_t *db; 9215 9216 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 9217 9218 /* 9219 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 9220 * information. This avoids the dmu_buf_will_dirty() path and 9221 * saves us a pre-read to get data we don't actually care about. 9222 */ 9223 bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); 9224 packed = vmem_alloc(bufsize, KM_SLEEP); 9225 9226 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 9227 KM_SLEEP) == 0); 9228 memset(packed + nvsize, 0, bufsize - nvsize); 9229 9230 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 9231 9232 vmem_free(packed, bufsize); 9233 9234 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 9235 dmu_buf_will_dirty(db, tx); 9236 *(uint64_t *)db->db_data = nvsize; 9237 dmu_buf_rele(db, FTAG); 9238 } 9239 9240 static void 9241 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 9242 const char *config, const char *entry) 9243 { 9244 nvlist_t *nvroot; 9245 nvlist_t **list; 9246 int i; 9247 9248 if (!sav->sav_sync) 9249 return; 9250 9251 /* 9252 * Update the MOS nvlist describing the list of available devices. 9253 * spa_validate_aux() will have already made sure this nvlist is 9254 * valid and the vdevs are labeled appropriately. 9255 */ 9256 if (sav->sav_object == 0) { 9257 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 9258 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 9259 sizeof (uint64_t), tx); 9260 VERIFY(zap_update(spa->spa_meta_objset, 9261 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 9262 &sav->sav_object, tx) == 0); 9263 } 9264 9265 nvroot = fnvlist_alloc(); 9266 if (sav->sav_count == 0) { 9267 fnvlist_add_nvlist_array(nvroot, config, 9268 (const nvlist_t * const *)NULL, 0); 9269 } else { 9270 list = kmem_alloc(sav->sav_count*sizeof (void *), KM_SLEEP); 9271 for (i = 0; i < sav->sav_count; i++) 9272 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 9273 B_FALSE, VDEV_CONFIG_L2CACHE); 9274 fnvlist_add_nvlist_array(nvroot, config, 9275 (const nvlist_t * const *)list, sav->sav_count); 9276 for (i = 0; i < sav->sav_count; i++) 9277 nvlist_free(list[i]); 9278 kmem_free(list, sav->sav_count * sizeof (void *)); 9279 } 9280 9281 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 9282 nvlist_free(nvroot); 9283 9284 sav->sav_sync = B_FALSE; 9285 } 9286 9287 /* 9288 * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t. 9289 * The all-vdev ZAP must be empty. 9290 */ 9291 static void 9292 spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) 9293 { 9294 spa_t *spa = vd->vdev_spa; 9295 9296 if (vd->vdev_root_zap != 0 && 9297 spa_feature_is_active(spa, SPA_FEATURE_AVZ_V2)) { 9298 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 9299 vd->vdev_root_zap, tx)); 9300 } 9301 if (vd->vdev_top_zap != 0) { 9302 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 9303 vd->vdev_top_zap, tx)); 9304 } 9305 if (vd->vdev_leaf_zap != 0) { 9306 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 9307 vd->vdev_leaf_zap, tx)); 9308 } 9309 for (uint64_t i = 0; i < vd->vdev_children; i++) { 9310 spa_avz_build(vd->vdev_child[i], avz, tx); 9311 } 9312 } 9313 9314 static void 9315 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 9316 { 9317 nvlist_t *config; 9318 9319 /* 9320 * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS, 9321 * its config may not be dirty but we still need to build per-vdev ZAPs. 9322 * Similarly, if the pool is being assembled (e.g. after a split), we 9323 * need to rebuild the AVZ although the config may not be dirty. 9324 */ 9325 if (list_is_empty(&spa->spa_config_dirty_list) && 9326 spa->spa_avz_action == AVZ_ACTION_NONE) 9327 return; 9328 9329 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 9330 9331 ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE || 9332 spa->spa_avz_action == AVZ_ACTION_INITIALIZE || 9333 spa->spa_all_vdev_zaps != 0); 9334 9335 if (spa->spa_avz_action == AVZ_ACTION_REBUILD) { 9336 /* Make and build the new AVZ */ 9337 uint64_t new_avz = zap_create(spa->spa_meta_objset, 9338 DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); 9339 spa_avz_build(spa->spa_root_vdev, new_avz, tx); 9340 9341 /* Diff old AVZ with new one */ 9342 zap_cursor_t zc; 9343 zap_attribute_t za; 9344 9345 for (zap_cursor_init(&zc, spa->spa_meta_objset, 9346 spa->spa_all_vdev_zaps); 9347 zap_cursor_retrieve(&zc, &za) == 0; 9348 zap_cursor_advance(&zc)) { 9349 uint64_t vdzap = za.za_first_integer; 9350 if (zap_lookup_int(spa->spa_meta_objset, new_avz, 9351 vdzap) == ENOENT) { 9352 /* 9353 * ZAP is listed in old AVZ but not in new one; 9354 * destroy it 9355 */ 9356 VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap, 9357 tx)); 9358 } 9359 } 9360 9361 zap_cursor_fini(&zc); 9362 9363 /* Destroy the old AVZ */ 9364 VERIFY0(zap_destroy(spa->spa_meta_objset, 9365 spa->spa_all_vdev_zaps, tx)); 9366 9367 /* Replace the old AVZ in the dir obj with the new one */ 9368 VERIFY0(zap_update(spa->spa_meta_objset, 9369 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, 9370 sizeof (new_avz), 1, &new_avz, tx)); 9371 9372 spa->spa_all_vdev_zaps = new_avz; 9373 } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) { 9374 zap_cursor_t zc; 9375 zap_attribute_t za; 9376 9377 /* Walk through the AVZ and destroy all listed ZAPs */ 9378 for (zap_cursor_init(&zc, spa->spa_meta_objset, 9379 spa->spa_all_vdev_zaps); 9380 zap_cursor_retrieve(&zc, &za) == 0; 9381 zap_cursor_advance(&zc)) { 9382 uint64_t zap = za.za_first_integer; 9383 VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx)); 9384 } 9385 9386 zap_cursor_fini(&zc); 9387 9388 /* Destroy and unlink the AVZ itself */ 9389 VERIFY0(zap_destroy(spa->spa_meta_objset, 9390 spa->spa_all_vdev_zaps, tx)); 9391 VERIFY0(zap_remove(spa->spa_meta_objset, 9392 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx)); 9393 spa->spa_all_vdev_zaps = 0; 9394 } 9395 9396 if (spa->spa_all_vdev_zaps == 0) { 9397 spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset, 9398 DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, 9399 DMU_POOL_VDEV_ZAP_MAP, tx); 9400 } 9401 spa->spa_avz_action = AVZ_ACTION_NONE; 9402 9403 /* Create ZAPs for vdevs that don't have them. */ 9404 vdev_construct_zaps(spa->spa_root_vdev, tx); 9405 9406 config = spa_config_generate(spa, spa->spa_root_vdev, 9407 dmu_tx_get_txg(tx), B_FALSE); 9408 9409 /* 9410 * If we're upgrading the spa version then make sure that 9411 * the config object gets updated with the correct version. 9412 */ 9413 if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) 9414 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 9415 spa->spa_uberblock.ub_version); 9416 9417 spa_config_exit(spa, SCL_STATE, FTAG); 9418 9419 nvlist_free(spa->spa_config_syncing); 9420 spa->spa_config_syncing = config; 9421 9422 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 9423 } 9424 9425 static void 9426 spa_sync_version(void *arg, dmu_tx_t *tx) 9427 { 9428 uint64_t *versionp = arg; 9429 uint64_t version = *versionp; 9430 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 9431 9432 /* 9433 * Setting the version is special cased when first creating the pool. 9434 */ 9435 ASSERT(tx->tx_txg != TXG_INITIAL); 9436 9437 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 9438 ASSERT(version >= spa_version(spa)); 9439 9440 spa->spa_uberblock.ub_version = version; 9441 vdev_config_dirty(spa->spa_root_vdev); 9442 spa_history_log_internal(spa, "set", tx, "version=%lld", 9443 (longlong_t)version); 9444 } 9445 9446 /* 9447 * Set zpool properties. 9448 */ 9449 static void 9450 spa_sync_props(void *arg, dmu_tx_t *tx) 9451 { 9452 nvlist_t *nvp = arg; 9453 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 9454 objset_t *mos = spa->spa_meta_objset; 9455 nvpair_t *elem = NULL; 9456 9457 mutex_enter(&spa->spa_props_lock); 9458 9459 while ((elem = nvlist_next_nvpair(nvp, elem))) { 9460 uint64_t intval; 9461 const char *strval, *fname; 9462 zpool_prop_t prop; 9463 const char *propname; 9464 const char *elemname = nvpair_name(elem); 9465 zprop_type_t proptype; 9466 spa_feature_t fid; 9467 9468 switch (prop = zpool_name_to_prop(elemname)) { 9469 case ZPOOL_PROP_VERSION: 9470 intval = fnvpair_value_uint64(elem); 9471 /* 9472 * The version is synced separately before other 9473 * properties and should be correct by now. 9474 */ 9475 ASSERT3U(spa_version(spa), >=, intval); 9476 break; 9477 9478 case ZPOOL_PROP_ALTROOT: 9479 /* 9480 * 'altroot' is a non-persistent property. It should 9481 * have been set temporarily at creation or import time. 9482 */ 9483 ASSERT(spa->spa_root != NULL); 9484 break; 9485 9486 case ZPOOL_PROP_READONLY: 9487 case ZPOOL_PROP_CACHEFILE: 9488 /* 9489 * 'readonly' and 'cachefile' are also non-persistent 9490 * properties. 9491 */ 9492 break; 9493 case ZPOOL_PROP_COMMENT: 9494 strval = fnvpair_value_string(elem); 9495 if (spa->spa_comment != NULL) 9496 spa_strfree(spa->spa_comment); 9497 spa->spa_comment = spa_strdup(strval); 9498 /* 9499 * We need to dirty the configuration on all the vdevs 9500 * so that their labels get updated. We also need to 9501 * update the cache file to keep it in sync with the 9502 * MOS version. It's unnecessary to do this for pool 9503 * creation since the vdev's configuration has already 9504 * been dirtied. 9505 */ 9506 if (tx->tx_txg != TXG_INITIAL) { 9507 vdev_config_dirty(spa->spa_root_vdev); 9508 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 9509 } 9510 spa_history_log_internal(spa, "set", tx, 9511 "%s=%s", elemname, strval); 9512 break; 9513 case ZPOOL_PROP_COMPATIBILITY: 9514 strval = fnvpair_value_string(elem); 9515 if (spa->spa_compatibility != NULL) 9516 spa_strfree(spa->spa_compatibility); 9517 spa->spa_compatibility = spa_strdup(strval); 9518 /* 9519 * Dirty the configuration on vdevs as above. 9520 */ 9521 if (tx->tx_txg != TXG_INITIAL) { 9522 vdev_config_dirty(spa->spa_root_vdev); 9523 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 9524 } 9525 9526 spa_history_log_internal(spa, "set", tx, 9527 "%s=%s", nvpair_name(elem), strval); 9528 break; 9529 9530 case ZPOOL_PROP_INVAL: 9531 if (zpool_prop_feature(elemname)) { 9532 fname = strchr(elemname, '@') + 1; 9533 VERIFY0(zfeature_lookup_name(fname, &fid)); 9534 9535 spa_feature_enable(spa, fid, tx); 9536 spa_history_log_internal(spa, "set", tx, 9537 "%s=enabled", elemname); 9538 break; 9539 } else if (!zfs_prop_user(elemname)) { 9540 ASSERT(zpool_prop_feature(elemname)); 9541 break; 9542 } 9543 zfs_fallthrough; 9544 default: 9545 /* 9546 * Set pool property values in the poolprops mos object. 9547 */ 9548 if (spa->spa_pool_props_object == 0) { 9549 spa->spa_pool_props_object = 9550 zap_create_link(mos, DMU_OT_POOL_PROPS, 9551 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 9552 tx); 9553 } 9554 9555 /* normalize the property name */ 9556 if (prop == ZPOOL_PROP_INVAL) { 9557 propname = elemname; 9558 proptype = PROP_TYPE_STRING; 9559 } else { 9560 propname = zpool_prop_to_name(prop); 9561 proptype = zpool_prop_get_type(prop); 9562 } 9563 9564 if (nvpair_type(elem) == DATA_TYPE_STRING) { 9565 ASSERT(proptype == PROP_TYPE_STRING); 9566 strval = fnvpair_value_string(elem); 9567 VERIFY0(zap_update(mos, 9568 spa->spa_pool_props_object, propname, 9569 1, strlen(strval) + 1, strval, tx)); 9570 spa_history_log_internal(spa, "set", tx, 9571 "%s=%s", elemname, strval); 9572 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 9573 intval = fnvpair_value_uint64(elem); 9574 9575 if (proptype == PROP_TYPE_INDEX) { 9576 const char *unused; 9577 VERIFY0(zpool_prop_index_to_string( 9578 prop, intval, &unused)); 9579 } 9580 VERIFY0(zap_update(mos, 9581 spa->spa_pool_props_object, propname, 9582 8, 1, &intval, tx)); 9583 spa_history_log_internal(spa, "set", tx, 9584 "%s=%lld", elemname, 9585 (longlong_t)intval); 9586 9587 switch (prop) { 9588 case ZPOOL_PROP_DELEGATION: 9589 spa->spa_delegation = intval; 9590 break; 9591 case ZPOOL_PROP_BOOTFS: 9592 spa->spa_bootfs = intval; 9593 break; 9594 case ZPOOL_PROP_FAILUREMODE: 9595 spa->spa_failmode = intval; 9596 break; 9597 case ZPOOL_PROP_AUTOTRIM: 9598 spa->spa_autotrim = intval; 9599 spa_async_request(spa, 9600 SPA_ASYNC_AUTOTRIM_RESTART); 9601 break; 9602 case ZPOOL_PROP_AUTOEXPAND: 9603 spa->spa_autoexpand = intval; 9604 if (tx->tx_txg != TXG_INITIAL) 9605 spa_async_request(spa, 9606 SPA_ASYNC_AUTOEXPAND); 9607 break; 9608 case ZPOOL_PROP_MULTIHOST: 9609 spa->spa_multihost = intval; 9610 break; 9611 default: 9612 break; 9613 } 9614 } else { 9615 ASSERT(0); /* not allowed */ 9616 } 9617 } 9618 9619 } 9620 9621 mutex_exit(&spa->spa_props_lock); 9622 } 9623 9624 /* 9625 * Perform one-time upgrade on-disk changes. spa_version() does not 9626 * reflect the new version this txg, so there must be no changes this 9627 * txg to anything that the upgrade code depends on after it executes. 9628 * Therefore this must be called after dsl_pool_sync() does the sync 9629 * tasks. 9630 */ 9631 static void 9632 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) 9633 { 9634 if (spa_sync_pass(spa) != 1) 9635 return; 9636 9637 dsl_pool_t *dp = spa->spa_dsl_pool; 9638 rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); 9639 9640 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 9641 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 9642 dsl_pool_create_origin(dp, tx); 9643 9644 /* Keeping the origin open increases spa_minref */ 9645 spa->spa_minref += 3; 9646 } 9647 9648 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 9649 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 9650 dsl_pool_upgrade_clones(dp, tx); 9651 } 9652 9653 if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && 9654 spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { 9655 dsl_pool_upgrade_dir_clones(dp, tx); 9656 9657 /* Keeping the freedir open increases spa_minref */ 9658 spa->spa_minref += 3; 9659 } 9660 9661 if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && 9662 spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 9663 spa_feature_create_zap_objects(spa, tx); 9664 } 9665 9666 /* 9667 * LZ4_COMPRESS feature's behaviour was changed to activate_on_enable 9668 * when possibility to use lz4 compression for metadata was added 9669 * Old pools that have this feature enabled must be upgraded to have 9670 * this feature active 9671 */ 9672 if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 9673 boolean_t lz4_en = spa_feature_is_enabled(spa, 9674 SPA_FEATURE_LZ4_COMPRESS); 9675 boolean_t lz4_ac = spa_feature_is_active(spa, 9676 SPA_FEATURE_LZ4_COMPRESS); 9677 9678 if (lz4_en && !lz4_ac) 9679 spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); 9680 } 9681 9682 /* 9683 * If we haven't written the salt, do so now. Note that the 9684 * feature may not be activated yet, but that's fine since 9685 * the presence of this ZAP entry is backwards compatible. 9686 */ 9687 if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 9688 DMU_POOL_CHECKSUM_SALT) == ENOENT) { 9689 VERIFY0(zap_add(spa->spa_meta_objset, 9690 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, 9691 sizeof (spa->spa_cksum_salt.zcs_bytes), 9692 spa->spa_cksum_salt.zcs_bytes, tx)); 9693 } 9694 9695 rrw_exit(&dp->dp_config_rwlock, FTAG); 9696 } 9697 9698 static void 9699 vdev_indirect_state_sync_verify(vdev_t *vd) 9700 { 9701 vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping; 9702 vdev_indirect_births_t *vib __maybe_unused = vd->vdev_indirect_births; 9703 9704 if (vd->vdev_ops == &vdev_indirect_ops) { 9705 ASSERT(vim != NULL); 9706 ASSERT(vib != NULL); 9707 } 9708 9709 uint64_t obsolete_sm_object = 0; 9710 ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); 9711 if (obsolete_sm_object != 0) { 9712 ASSERT(vd->vdev_obsolete_sm != NULL); 9713 ASSERT(vd->vdev_removing || 9714 vd->vdev_ops == &vdev_indirect_ops); 9715 ASSERT(vdev_indirect_mapping_num_entries(vim) > 0); 9716 ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0); 9717 ASSERT3U(obsolete_sm_object, ==, 9718 space_map_object(vd->vdev_obsolete_sm)); 9719 ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=, 9720 space_map_allocated(vd->vdev_obsolete_sm)); 9721 } 9722 ASSERT(vd->vdev_obsolete_segments != NULL); 9723 9724 /* 9725 * Since frees / remaps to an indirect vdev can only 9726 * happen in syncing context, the obsolete segments 9727 * tree must be empty when we start syncing. 9728 */ 9729 ASSERT0(range_tree_space(vd->vdev_obsolete_segments)); 9730 } 9731 9732 /* 9733 * Set the top-level vdev's max queue depth. Evaluate each top-level's 9734 * async write queue depth in case it changed. The max queue depth will 9735 * not change in the middle of syncing out this txg. 9736 */ 9737 static void 9738 spa_sync_adjust_vdev_max_queue_depth(spa_t *spa) 9739 { 9740 ASSERT(spa_writeable(spa)); 9741 9742 vdev_t *rvd = spa->spa_root_vdev; 9743 uint32_t max_queue_depth = zfs_vdev_async_write_max_active * 9744 zfs_vdev_queue_depth_pct / 100; 9745 metaslab_class_t *normal = spa_normal_class(spa); 9746 metaslab_class_t *special = spa_special_class(spa); 9747 metaslab_class_t *dedup = spa_dedup_class(spa); 9748 9749 uint64_t slots_per_allocator = 0; 9750 for (int c = 0; c < rvd->vdev_children; c++) { 9751 vdev_t *tvd = rvd->vdev_child[c]; 9752 9753 metaslab_group_t *mg = tvd->vdev_mg; 9754 if (mg == NULL || !metaslab_group_initialized(mg)) 9755 continue; 9756 9757 metaslab_class_t *mc = mg->mg_class; 9758 if (mc != normal && mc != special && mc != dedup) 9759 continue; 9760 9761 /* 9762 * It is safe to do a lock-free check here because only async 9763 * allocations look at mg_max_alloc_queue_depth, and async 9764 * allocations all happen from spa_sync(). 9765 */ 9766 for (int i = 0; i < mg->mg_allocators; i++) { 9767 ASSERT0(zfs_refcount_count( 9768 &(mg->mg_allocator[i].mga_alloc_queue_depth))); 9769 } 9770 mg->mg_max_alloc_queue_depth = max_queue_depth; 9771 9772 for (int i = 0; i < mg->mg_allocators; i++) { 9773 mg->mg_allocator[i].mga_cur_max_alloc_queue_depth = 9774 zfs_vdev_def_queue_depth; 9775 } 9776 slots_per_allocator += zfs_vdev_def_queue_depth; 9777 } 9778 9779 for (int i = 0; i < spa->spa_alloc_count; i++) { 9780 ASSERT0(zfs_refcount_count(&normal->mc_allocator[i]. 9781 mca_alloc_slots)); 9782 ASSERT0(zfs_refcount_count(&special->mc_allocator[i]. 9783 mca_alloc_slots)); 9784 ASSERT0(zfs_refcount_count(&dedup->mc_allocator[i]. 9785 mca_alloc_slots)); 9786 normal->mc_allocator[i].mca_alloc_max_slots = 9787 slots_per_allocator; 9788 special->mc_allocator[i].mca_alloc_max_slots = 9789 slots_per_allocator; 9790 dedup->mc_allocator[i].mca_alloc_max_slots = 9791 slots_per_allocator; 9792 } 9793 normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; 9794 special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; 9795 dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; 9796 } 9797 9798 static void 9799 spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx) 9800 { 9801 ASSERT(spa_writeable(spa)); 9802 9803 vdev_t *rvd = spa->spa_root_vdev; 9804 for (int c = 0; c < rvd->vdev_children; c++) { 9805 vdev_t *vd = rvd->vdev_child[c]; 9806 vdev_indirect_state_sync_verify(vd); 9807 9808 if (vdev_indirect_should_condense(vd)) { 9809 spa_condense_indirect_start_sync(vd, tx); 9810 break; 9811 } 9812 } 9813 } 9814 9815 static void 9816 spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) 9817 { 9818 objset_t *mos = spa->spa_meta_objset; 9819 dsl_pool_t *dp = spa->spa_dsl_pool; 9820 uint64_t txg = tx->tx_txg; 9821 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; 9822 9823 do { 9824 int pass = ++spa->spa_sync_pass; 9825 9826 spa_sync_config_object(spa, tx); 9827 spa_sync_aux_dev(spa, &spa->spa_spares, tx, 9828 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 9829 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 9830 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 9831 spa_errlog_sync(spa, txg); 9832 dsl_pool_sync(dp, txg); 9833 9834 if (pass < zfs_sync_pass_deferred_free || 9835 spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { 9836 /* 9837 * If the log space map feature is active we don't 9838 * care about deferred frees and the deferred bpobj 9839 * as the log space map should effectively have the 9840 * same results (i.e. appending only to one object). 9841 */ 9842 spa_sync_frees(spa, free_bpl, tx); 9843 } else { 9844 /* 9845 * We can not defer frees in pass 1, because 9846 * we sync the deferred frees later in pass 1. 9847 */ 9848 ASSERT3U(pass, >, 1); 9849 bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb, 9850 &spa->spa_deferred_bpobj, tx); 9851 } 9852 9853 brt_sync(spa, txg); 9854 ddt_sync(spa, txg); 9855 dsl_scan_sync(dp, tx); 9856 dsl_errorscrub_sync(dp, tx); 9857 svr_sync(spa, tx); 9858 spa_sync_upgrades(spa, tx); 9859 9860 spa_flush_metaslabs(spa, tx); 9861 9862 vdev_t *vd = NULL; 9863 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 9864 != NULL) 9865 vdev_sync(vd, txg); 9866 9867 if (pass == 1) { 9868 /* 9869 * dsl_pool_sync() -> dp_sync_tasks may have dirtied 9870 * the config. If that happens, this txg should not 9871 * be a no-op. So we must sync the config to the MOS 9872 * before checking for no-op. 9873 * 9874 * Note that when the config is dirty, it will 9875 * be written to the MOS (i.e. the MOS will be 9876 * dirtied) every time we call spa_sync_config_object() 9877 * in this txg. Therefore we can't call this after 9878 * dsl_pool_sync() every pass, because it would 9879 * prevent us from converging, since we'd dirty 9880 * the MOS every pass. 9881 * 9882 * Sync tasks can only be processed in pass 1, so 9883 * there's no need to do this in later passes. 9884 */ 9885 spa_sync_config_object(spa, tx); 9886 } 9887 9888 /* 9889 * Note: We need to check if the MOS is dirty because we could 9890 * have marked the MOS dirty without updating the uberblock 9891 * (e.g. if we have sync tasks but no dirty user data). We need 9892 * to check the uberblock's rootbp because it is updated if we 9893 * have synced out dirty data (though in this case the MOS will 9894 * most likely also be dirty due to second order effects, we 9895 * don't want to rely on that here). 9896 */ 9897 if (pass == 1 && 9898 BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp) < txg && 9899 !dmu_objset_is_dirty(mos, txg)) { 9900 /* 9901 * Nothing changed on the first pass, therefore this 9902 * TXG is a no-op. Avoid syncing deferred frees, so 9903 * that we can keep this TXG as a no-op. 9904 */ 9905 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 9906 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 9907 ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); 9908 ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg)); 9909 break; 9910 } 9911 9912 spa_sync_deferred_frees(spa, tx); 9913 } while (dmu_objset_is_dirty(mos, txg)); 9914 } 9915 9916 /* 9917 * Rewrite the vdev configuration (which includes the uberblock) to 9918 * commit the transaction group. 9919 * 9920 * If there are no dirty vdevs, we sync the uberblock to a few random 9921 * top-level vdevs that are known to be visible in the config cache 9922 * (see spa_vdev_add() for a complete description). If there *are* dirty 9923 * vdevs, sync the uberblock to all vdevs. 9924 */ 9925 static void 9926 spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx) 9927 { 9928 vdev_t *rvd = spa->spa_root_vdev; 9929 uint64_t txg = tx->tx_txg; 9930 9931 for (;;) { 9932 int error = 0; 9933 9934 /* 9935 * We hold SCL_STATE to prevent vdev open/close/etc. 9936 * while we're attempting to write the vdev labels. 9937 */ 9938 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 9939 9940 if (list_is_empty(&spa->spa_config_dirty_list)) { 9941 vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; 9942 int svdcount = 0; 9943 int children = rvd->vdev_children; 9944 int c0 = random_in_range(children); 9945 9946 for (int c = 0; c < children; c++) { 9947 vdev_t *vd = 9948 rvd->vdev_child[(c0 + c) % children]; 9949 9950 /* Stop when revisiting the first vdev */ 9951 if (c > 0 && svd[0] == vd) 9952 break; 9953 9954 if (vd->vdev_ms_array == 0 || 9955 vd->vdev_islog || 9956 !vdev_is_concrete(vd)) 9957 continue; 9958 9959 svd[svdcount++] = vd; 9960 if (svdcount == SPA_SYNC_MIN_VDEVS) 9961 break; 9962 } 9963 error = vdev_config_sync(svd, svdcount, txg); 9964 } else { 9965 error = vdev_config_sync(rvd->vdev_child, 9966 rvd->vdev_children, txg); 9967 } 9968 9969 if (error == 0) 9970 spa->spa_last_synced_guid = rvd->vdev_guid; 9971 9972 spa_config_exit(spa, SCL_STATE, FTAG); 9973 9974 if (error == 0) 9975 break; 9976 zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR); 9977 zio_resume_wait(spa); 9978 } 9979 } 9980 9981 /* 9982 * Sync the specified transaction group. New blocks may be dirtied as 9983 * part of the process, so we iterate until it converges. 9984 */ 9985 void 9986 spa_sync(spa_t *spa, uint64_t txg) 9987 { 9988 vdev_t *vd = NULL; 9989 9990 VERIFY(spa_writeable(spa)); 9991 9992 /* 9993 * Wait for i/os issued in open context that need to complete 9994 * before this txg syncs. 9995 */ 9996 (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]); 9997 spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, 9998 ZIO_FLAG_CANFAIL); 9999 10000 /* 10001 * Now that there can be no more cloning in this transaction group, 10002 * but we are still before issuing frees, we can process pending BRT 10003 * updates. 10004 */ 10005 brt_pending_apply(spa, txg); 10006 10007 /* 10008 * Lock out configuration changes. 10009 */ 10010 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 10011 10012 spa->spa_syncing_txg = txg; 10013 spa->spa_sync_pass = 0; 10014 10015 for (int i = 0; i < spa->spa_alloc_count; i++) { 10016 mutex_enter(&spa->spa_allocs[i].spaa_lock); 10017 VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree)); 10018 mutex_exit(&spa->spa_allocs[i].spaa_lock); 10019 } 10020 10021 /* 10022 * If there are any pending vdev state changes, convert them 10023 * into config changes that go out with this transaction group. 10024 */ 10025 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 10026 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 10027 /* Avoid holding the write lock unless actually necessary */ 10028 if (vd->vdev_aux == NULL) { 10029 vdev_state_clean(vd); 10030 vdev_config_dirty(vd); 10031 continue; 10032 } 10033 /* 10034 * We need the write lock here because, for aux vdevs, 10035 * calling vdev_config_dirty() modifies sav_config. 10036 * This is ugly and will become unnecessary when we 10037 * eliminate the aux vdev wart by integrating all vdevs 10038 * into the root vdev tree. 10039 */ 10040 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 10041 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 10042 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 10043 vdev_state_clean(vd); 10044 vdev_config_dirty(vd); 10045 } 10046 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 10047 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 10048 } 10049 spa_config_exit(spa, SCL_STATE, FTAG); 10050 10051 dsl_pool_t *dp = spa->spa_dsl_pool; 10052 dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); 10053 10054 spa->spa_sync_starttime = gethrtime(); 10055 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); 10056 spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, 10057 spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + 10058 NSEC_TO_TICK(spa->spa_deadman_synctime)); 10059 10060 /* 10061 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 10062 * set spa_deflate if we have no raid-z vdevs. 10063 */ 10064 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 10065 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 10066 vdev_t *rvd = spa->spa_root_vdev; 10067 10068 int i; 10069 for (i = 0; i < rvd->vdev_children; i++) { 10070 vd = rvd->vdev_child[i]; 10071 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 10072 break; 10073 } 10074 if (i == rvd->vdev_children) { 10075 spa->spa_deflate = TRUE; 10076 VERIFY0(zap_add(spa->spa_meta_objset, 10077 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 10078 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 10079 } 10080 } 10081 10082 spa_sync_adjust_vdev_max_queue_depth(spa); 10083 10084 spa_sync_condense_indirect(spa, tx); 10085 10086 spa_sync_iterate_to_convergence(spa, tx); 10087 10088 #ifdef ZFS_DEBUG 10089 if (!list_is_empty(&spa->spa_config_dirty_list)) { 10090 /* 10091 * Make sure that the number of ZAPs for all the vdevs matches 10092 * the number of ZAPs in the per-vdev ZAP list. This only gets 10093 * called if the config is dirty; otherwise there may be 10094 * outstanding AVZ operations that weren't completed in 10095 * spa_sync_config_object. 10096 */ 10097 uint64_t all_vdev_zap_entry_count; 10098 ASSERT0(zap_count(spa->spa_meta_objset, 10099 spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); 10100 ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, 10101 all_vdev_zap_entry_count); 10102 } 10103 #endif 10104 10105 if (spa->spa_vdev_removal != NULL) { 10106 ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]); 10107 } 10108 10109 spa_sync_rewrite_vdev_config(spa, tx); 10110 dmu_tx_commit(tx); 10111 10112 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); 10113 spa->spa_deadman_tqid = 0; 10114 10115 /* 10116 * Clear the dirty config list. 10117 */ 10118 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 10119 vdev_config_clean(vd); 10120 10121 /* 10122 * Now that the new config has synced transactionally, 10123 * let it become visible to the config cache. 10124 */ 10125 if (spa->spa_config_syncing != NULL) { 10126 spa_config_set(spa, spa->spa_config_syncing); 10127 spa->spa_config_txg = txg; 10128 spa->spa_config_syncing = NULL; 10129 } 10130 10131 dsl_pool_sync_done(dp, txg); 10132 10133 for (int i = 0; i < spa->spa_alloc_count; i++) { 10134 mutex_enter(&spa->spa_allocs[i].spaa_lock); 10135 VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree)); 10136 mutex_exit(&spa->spa_allocs[i].spaa_lock); 10137 } 10138 10139 /* 10140 * Update usable space statistics. 10141 */ 10142 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 10143 != NULL) 10144 vdev_sync_done(vd, txg); 10145 10146 metaslab_class_evict_old(spa->spa_normal_class, txg); 10147 metaslab_class_evict_old(spa->spa_log_class, txg); 10148 10149 spa_sync_close_syncing_log_sm(spa); 10150 10151 spa_update_dspace(spa); 10152 10153 if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) 10154 vdev_autotrim_kick(spa); 10155 10156 /* 10157 * It had better be the case that we didn't dirty anything 10158 * since vdev_config_sync(). 10159 */ 10160 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 10161 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 10162 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 10163 10164 while (zfs_pause_spa_sync) 10165 delay(1); 10166 10167 spa->spa_sync_pass = 0; 10168 10169 /* 10170 * Update the last synced uberblock here. We want to do this at 10171 * the end of spa_sync() so that consumers of spa_last_synced_txg() 10172 * will be guaranteed that all the processing associated with 10173 * that txg has been completed. 10174 */ 10175 spa->spa_ubsync = spa->spa_uberblock; 10176 spa_config_exit(spa, SCL_CONFIG, FTAG); 10177 10178 spa_handle_ignored_writes(spa); 10179 10180 /* 10181 * If any async tasks have been requested, kick them off. 10182 */ 10183 spa_async_dispatch(spa); 10184 } 10185 10186 /* 10187 * Sync all pools. We don't want to hold the namespace lock across these 10188 * operations, so we take a reference on the spa_t and drop the lock during the 10189 * sync. 10190 */ 10191 void 10192 spa_sync_allpools(void) 10193 { 10194 spa_t *spa = NULL; 10195 mutex_enter(&spa_namespace_lock); 10196 while ((spa = spa_next(spa)) != NULL) { 10197 if (spa_state(spa) != POOL_STATE_ACTIVE || 10198 !spa_writeable(spa) || spa_suspended(spa)) 10199 continue; 10200 spa_open_ref(spa, FTAG); 10201 mutex_exit(&spa_namespace_lock); 10202 txg_wait_synced(spa_get_dsl(spa), 0); 10203 mutex_enter(&spa_namespace_lock); 10204 spa_close(spa, FTAG); 10205 } 10206 mutex_exit(&spa_namespace_lock); 10207 } 10208 10209 taskq_t * 10210 spa_sync_tq_create(spa_t *spa, const char *name) 10211 { 10212 kthread_t **kthreads; 10213 10214 ASSERT(spa->spa_sync_tq == NULL); 10215 ASSERT3S(spa->spa_alloc_count, <=, boot_ncpus); 10216 10217 /* 10218 * - do not allow more allocators than cpus. 10219 * - there may be more cpus than allocators. 10220 * - do not allow more sync taskq threads than allocators or cpus. 10221 */ 10222 int nthreads = spa->spa_alloc_count; 10223 spa->spa_syncthreads = kmem_zalloc(sizeof (spa_syncthread_info_t) * 10224 nthreads, KM_SLEEP); 10225 10226 spa->spa_sync_tq = taskq_create_synced(name, nthreads, minclsyspri, 10227 nthreads, INT_MAX, TASKQ_PREPOPULATE, &kthreads); 10228 VERIFY(spa->spa_sync_tq != NULL); 10229 VERIFY(kthreads != NULL); 10230 10231 spa_syncthread_info_t *ti = spa->spa_syncthreads; 10232 for (int i = 0; i < nthreads; i++, ti++) { 10233 ti->sti_thread = kthreads[i]; 10234 ti->sti_allocator = i; 10235 } 10236 10237 kmem_free(kthreads, sizeof (*kthreads) * nthreads); 10238 return (spa->spa_sync_tq); 10239 } 10240 10241 void 10242 spa_sync_tq_destroy(spa_t *spa) 10243 { 10244 ASSERT(spa->spa_sync_tq != NULL); 10245 10246 taskq_wait(spa->spa_sync_tq); 10247 taskq_destroy(spa->spa_sync_tq); 10248 kmem_free(spa->spa_syncthreads, 10249 sizeof (spa_syncthread_info_t) * spa->spa_alloc_count); 10250 spa->spa_sync_tq = NULL; 10251 } 10252 10253 uint_t 10254 spa_acq_allocator(spa_t *spa) 10255 { 10256 int i; 10257 10258 if (spa->spa_alloc_count == 1) 10259 return (0); 10260 10261 mutex_enter(&spa->spa_allocs_use->sau_lock); 10262 uint_t r = spa->spa_allocs_use->sau_rotor; 10263 do { 10264 if (++r == spa->spa_alloc_count) 10265 r = 0; 10266 } while (spa->spa_allocs_use->sau_inuse[r]); 10267 spa->spa_allocs_use->sau_inuse[r] = B_TRUE; 10268 spa->spa_allocs_use->sau_rotor = r; 10269 mutex_exit(&spa->spa_allocs_use->sau_lock); 10270 10271 spa_syncthread_info_t *ti = spa->spa_syncthreads; 10272 for (i = 0; i < spa->spa_alloc_count; i++, ti++) { 10273 if (ti->sti_thread == curthread) { 10274 ti->sti_allocator = r; 10275 break; 10276 } 10277 } 10278 ASSERT3S(i, <, spa->spa_alloc_count); 10279 return (r); 10280 } 10281 10282 void 10283 spa_rel_allocator(spa_t *spa, uint_t allocator) 10284 { 10285 if (spa->spa_alloc_count > 1) 10286 spa->spa_allocs_use->sau_inuse[allocator] = B_FALSE; 10287 } 10288 10289 void 10290 spa_select_allocator(zio_t *zio) 10291 { 10292 zbookmark_phys_t *bm = &zio->io_bookmark; 10293 spa_t *spa = zio->io_spa; 10294 10295 ASSERT(zio->io_type == ZIO_TYPE_WRITE); 10296 10297 /* 10298 * A gang block (for example) may have inherited its parent's 10299 * allocator, in which case there is nothing further to do here. 10300 */ 10301 if (ZIO_HAS_ALLOCATOR(zio)) 10302 return; 10303 10304 ASSERT(spa != NULL); 10305 ASSERT(bm != NULL); 10306 10307 /* 10308 * First try to use an allocator assigned to the syncthread, and set 10309 * the corresponding write issue taskq for the allocator. 10310 * Note, we must have an open pool to do this. 10311 */ 10312 if (spa->spa_sync_tq != NULL) { 10313 spa_syncthread_info_t *ti = spa->spa_syncthreads; 10314 for (int i = 0; i < spa->spa_alloc_count; i++, ti++) { 10315 if (ti->sti_thread == curthread) { 10316 zio->io_allocator = ti->sti_allocator; 10317 return; 10318 } 10319 } 10320 } 10321 10322 /* 10323 * We want to try to use as many allocators as possible to help improve 10324 * performance, but we also want logically adjacent IOs to be physically 10325 * adjacent to improve sequential read performance. We chunk each object 10326 * into 2^20 block regions, and then hash based on the objset, object, 10327 * level, and region to accomplish both of these goals. 10328 */ 10329 uint64_t hv = cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level, 10330 bm->zb_blkid >> 20); 10331 10332 zio->io_allocator = (uint_t)hv % spa->spa_alloc_count; 10333 } 10334 10335 /* 10336 * ========================================================================== 10337 * Miscellaneous routines 10338 * ========================================================================== 10339 */ 10340 10341 /* 10342 * Remove all pools in the system. 10343 */ 10344 void 10345 spa_evict_all(void) 10346 { 10347 spa_t *spa; 10348 10349 /* 10350 * Remove all cached state. All pools should be closed now, 10351 * so every spa in the AVL tree should be unreferenced. 10352 */ 10353 mutex_enter(&spa_namespace_lock); 10354 while ((spa = spa_next(NULL)) != NULL) { 10355 /* 10356 * Stop async tasks. The async thread may need to detach 10357 * a device that's been replaced, which requires grabbing 10358 * spa_namespace_lock, so we must drop it here. 10359 */ 10360 spa_open_ref(spa, FTAG); 10361 mutex_exit(&spa_namespace_lock); 10362 spa_async_suspend(spa); 10363 mutex_enter(&spa_namespace_lock); 10364 spa_close(spa, FTAG); 10365 10366 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 10367 spa_unload(spa); 10368 spa_deactivate(spa); 10369 } 10370 spa_remove(spa); 10371 } 10372 mutex_exit(&spa_namespace_lock); 10373 } 10374 10375 vdev_t * 10376 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 10377 { 10378 vdev_t *vd; 10379 int i; 10380 10381 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 10382 return (vd); 10383 10384 if (aux) { 10385 for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 10386 vd = spa->spa_l2cache.sav_vdevs[i]; 10387 if (vd->vdev_guid == guid) 10388 return (vd); 10389 } 10390 10391 for (i = 0; i < spa->spa_spares.sav_count; i++) { 10392 vd = spa->spa_spares.sav_vdevs[i]; 10393 if (vd->vdev_guid == guid) 10394 return (vd); 10395 } 10396 } 10397 10398 return (NULL); 10399 } 10400 10401 void 10402 spa_upgrade(spa_t *spa, uint64_t version) 10403 { 10404 ASSERT(spa_writeable(spa)); 10405 10406 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 10407 10408 /* 10409 * This should only be called for a non-faulted pool, and since a 10410 * future version would result in an unopenable pool, this shouldn't be 10411 * possible. 10412 */ 10413 ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); 10414 ASSERT3U(version, >=, spa->spa_uberblock.ub_version); 10415 10416 spa->spa_uberblock.ub_version = version; 10417 vdev_config_dirty(spa->spa_root_vdev); 10418 10419 spa_config_exit(spa, SCL_ALL, FTAG); 10420 10421 txg_wait_synced(spa_get_dsl(spa), 0); 10422 } 10423 10424 static boolean_t 10425 spa_has_aux_vdev(spa_t *spa, uint64_t guid, spa_aux_vdev_t *sav) 10426 { 10427 (void) spa; 10428 int i; 10429 uint64_t vdev_guid; 10430 10431 for (i = 0; i < sav->sav_count; i++) 10432 if (sav->sav_vdevs[i]->vdev_guid == guid) 10433 return (B_TRUE); 10434 10435 for (i = 0; i < sav->sav_npending; i++) { 10436 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 10437 &vdev_guid) == 0 && vdev_guid == guid) 10438 return (B_TRUE); 10439 } 10440 10441 return (B_FALSE); 10442 } 10443 10444 boolean_t 10445 spa_has_l2cache(spa_t *spa, uint64_t guid) 10446 { 10447 return (spa_has_aux_vdev(spa, guid, &spa->spa_l2cache)); 10448 } 10449 10450 boolean_t 10451 spa_has_spare(spa_t *spa, uint64_t guid) 10452 { 10453 return (spa_has_aux_vdev(spa, guid, &spa->spa_spares)); 10454 } 10455 10456 /* 10457 * Check if a pool has an active shared spare device. 10458 * Note: reference count of an active spare is 2, as a spare and as a replace 10459 */ 10460 static boolean_t 10461 spa_has_active_shared_spare(spa_t *spa) 10462 { 10463 int i, refcnt; 10464 uint64_t pool; 10465 spa_aux_vdev_t *sav = &spa->spa_spares; 10466 10467 for (i = 0; i < sav->sav_count; i++) { 10468 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 10469 &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 10470 refcnt > 2) 10471 return (B_TRUE); 10472 } 10473 10474 return (B_FALSE); 10475 } 10476 10477 uint64_t 10478 spa_total_metaslabs(spa_t *spa) 10479 { 10480 vdev_t *rvd = spa->spa_root_vdev; 10481 10482 uint64_t m = 0; 10483 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 10484 vdev_t *vd = rvd->vdev_child[c]; 10485 if (!vdev_is_concrete(vd)) 10486 continue; 10487 m += vd->vdev_ms_count; 10488 } 10489 return (m); 10490 } 10491 10492 /* 10493 * Notify any waiting threads that some activity has switched from being in- 10494 * progress to not-in-progress so that the thread can wake up and determine 10495 * whether it is finished waiting. 10496 */ 10497 void 10498 spa_notify_waiters(spa_t *spa) 10499 { 10500 /* 10501 * Acquiring spa_activities_lock here prevents the cv_broadcast from 10502 * happening between the waiting thread's check and cv_wait. 10503 */ 10504 mutex_enter(&spa->spa_activities_lock); 10505 cv_broadcast(&spa->spa_activities_cv); 10506 mutex_exit(&spa->spa_activities_lock); 10507 } 10508 10509 /* 10510 * Notify any waiting threads that the pool is exporting, and then block until 10511 * they are finished using the spa_t. 10512 */ 10513 void 10514 spa_wake_waiters(spa_t *spa) 10515 { 10516 mutex_enter(&spa->spa_activities_lock); 10517 spa->spa_waiters_cancel = B_TRUE; 10518 cv_broadcast(&spa->spa_activities_cv); 10519 while (spa->spa_waiters != 0) 10520 cv_wait(&spa->spa_waiters_cv, &spa->spa_activities_lock); 10521 spa->spa_waiters_cancel = B_FALSE; 10522 mutex_exit(&spa->spa_activities_lock); 10523 } 10524 10525 /* Whether the vdev or any of its descendants are being initialized/trimmed. */ 10526 static boolean_t 10527 spa_vdev_activity_in_progress_impl(vdev_t *vd, zpool_wait_activity_t activity) 10528 { 10529 spa_t *spa = vd->vdev_spa; 10530 10531 ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER)); 10532 ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); 10533 ASSERT(activity == ZPOOL_WAIT_INITIALIZE || 10534 activity == ZPOOL_WAIT_TRIM); 10535 10536 kmutex_t *lock = activity == ZPOOL_WAIT_INITIALIZE ? 10537 &vd->vdev_initialize_lock : &vd->vdev_trim_lock; 10538 10539 mutex_exit(&spa->spa_activities_lock); 10540 mutex_enter(lock); 10541 mutex_enter(&spa->spa_activities_lock); 10542 10543 boolean_t in_progress = (activity == ZPOOL_WAIT_INITIALIZE) ? 10544 (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) : 10545 (vd->vdev_trim_state == VDEV_TRIM_ACTIVE); 10546 mutex_exit(lock); 10547 10548 if (in_progress) 10549 return (B_TRUE); 10550 10551 for (int i = 0; i < vd->vdev_children; i++) { 10552 if (spa_vdev_activity_in_progress_impl(vd->vdev_child[i], 10553 activity)) 10554 return (B_TRUE); 10555 } 10556 10557 return (B_FALSE); 10558 } 10559 10560 /* 10561 * If use_guid is true, this checks whether the vdev specified by guid is 10562 * being initialized/trimmed. Otherwise, it checks whether any vdev in the pool 10563 * is being initialized/trimmed. The caller must hold the config lock and 10564 * spa_activities_lock. 10565 */ 10566 static int 10567 spa_vdev_activity_in_progress(spa_t *spa, boolean_t use_guid, uint64_t guid, 10568 zpool_wait_activity_t activity, boolean_t *in_progress) 10569 { 10570 mutex_exit(&spa->spa_activities_lock); 10571 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 10572 mutex_enter(&spa->spa_activities_lock); 10573 10574 vdev_t *vd; 10575 if (use_guid) { 10576 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 10577 if (vd == NULL || !vd->vdev_ops->vdev_op_leaf) { 10578 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 10579 return (EINVAL); 10580 } 10581 } else { 10582 vd = spa->spa_root_vdev; 10583 } 10584 10585 *in_progress = spa_vdev_activity_in_progress_impl(vd, activity); 10586 10587 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 10588 return (0); 10589 } 10590 10591 /* 10592 * Locking for waiting threads 10593 * --------------------------- 10594 * 10595 * Waiting threads need a way to check whether a given activity is in progress, 10596 * and then, if it is, wait for it to complete. Each activity will have some 10597 * in-memory representation of the relevant on-disk state which can be used to 10598 * determine whether or not the activity is in progress. The in-memory state and 10599 * the locking used to protect it will be different for each activity, and may 10600 * not be suitable for use with a cvar (e.g., some state is protected by the 10601 * config lock). To allow waiting threads to wait without any races, another 10602 * lock, spa_activities_lock, is used. 10603 * 10604 * When the state is checked, both the activity-specific lock (if there is one) 10605 * and spa_activities_lock are held. In some cases, the activity-specific lock 10606 * is acquired explicitly (e.g. the config lock). In others, the locking is 10607 * internal to some check (e.g. bpobj_is_empty). After checking, the waiting 10608 * thread releases the activity-specific lock and, if the activity is in 10609 * progress, then cv_waits using spa_activities_lock. 10610 * 10611 * The waiting thread is woken when another thread, one completing some 10612 * activity, updates the state of the activity and then calls 10613 * spa_notify_waiters, which will cv_broadcast. This 'completing' thread only 10614 * needs to hold its activity-specific lock when updating the state, and this 10615 * lock can (but doesn't have to) be dropped before calling spa_notify_waiters. 10616 * 10617 * Because spa_notify_waiters acquires spa_activities_lock before broadcasting, 10618 * and because it is held when the waiting thread checks the state of the 10619 * activity, it can never be the case that the completing thread both updates 10620 * the activity state and cv_broadcasts in between the waiting thread's check 10621 * and cv_wait. Thus, a waiting thread can never miss a wakeup. 10622 * 10623 * In order to prevent deadlock, when the waiting thread does its check, in some 10624 * cases it will temporarily drop spa_activities_lock in order to acquire the 10625 * activity-specific lock. The order in which spa_activities_lock and the 10626 * activity specific lock are acquired in the waiting thread is determined by 10627 * the order in which they are acquired in the completing thread; if the 10628 * completing thread calls spa_notify_waiters with the activity-specific lock 10629 * held, then the waiting thread must also acquire the activity-specific lock 10630 * first. 10631 */ 10632 10633 static int 10634 spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, 10635 boolean_t use_tag, uint64_t tag, boolean_t *in_progress) 10636 { 10637 int error = 0; 10638 10639 ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); 10640 10641 switch (activity) { 10642 case ZPOOL_WAIT_CKPT_DISCARD: 10643 *in_progress = 10644 (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT) && 10645 zap_contains(spa_meta_objset(spa), 10646 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT) == 10647 ENOENT); 10648 break; 10649 case ZPOOL_WAIT_FREE: 10650 *in_progress = ((spa_version(spa) >= SPA_VERSION_DEADLISTS && 10651 !bpobj_is_empty(&spa->spa_dsl_pool->dp_free_bpobj)) || 10652 spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY) || 10653 spa_livelist_delete_check(spa)); 10654 break; 10655 case ZPOOL_WAIT_INITIALIZE: 10656 case ZPOOL_WAIT_TRIM: 10657 error = spa_vdev_activity_in_progress(spa, use_tag, tag, 10658 activity, in_progress); 10659 break; 10660 case ZPOOL_WAIT_REPLACE: 10661 mutex_exit(&spa->spa_activities_lock); 10662 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 10663 mutex_enter(&spa->spa_activities_lock); 10664 10665 *in_progress = vdev_replace_in_progress(spa->spa_root_vdev); 10666 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 10667 break; 10668 case ZPOOL_WAIT_REMOVE: 10669 *in_progress = (spa->spa_removing_phys.sr_state == 10670 DSS_SCANNING); 10671 break; 10672 case ZPOOL_WAIT_RESILVER: 10673 *in_progress = vdev_rebuild_active(spa->spa_root_vdev); 10674 if (*in_progress) 10675 break; 10676 zfs_fallthrough; 10677 case ZPOOL_WAIT_SCRUB: 10678 { 10679 boolean_t scanning, paused, is_scrub; 10680 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; 10681 10682 is_scrub = (scn->scn_phys.scn_func == POOL_SCAN_SCRUB); 10683 scanning = (scn->scn_phys.scn_state == DSS_SCANNING); 10684 paused = dsl_scan_is_paused_scrub(scn); 10685 *in_progress = (scanning && !paused && 10686 is_scrub == (activity == ZPOOL_WAIT_SCRUB)); 10687 break; 10688 } 10689 case ZPOOL_WAIT_RAIDZ_EXPAND: 10690 { 10691 vdev_raidz_expand_t *vre = spa->spa_raidz_expand; 10692 *in_progress = (vre != NULL && vre->vre_state == DSS_SCANNING); 10693 break; 10694 } 10695 default: 10696 panic("unrecognized value for activity %d", activity); 10697 } 10698 10699 return (error); 10700 } 10701 10702 static int 10703 spa_wait_common(const char *pool, zpool_wait_activity_t activity, 10704 boolean_t use_tag, uint64_t tag, boolean_t *waited) 10705 { 10706 /* 10707 * The tag is used to distinguish between instances of an activity. 10708 * 'initialize' and 'trim' are the only activities that we use this for. 10709 * The other activities can only have a single instance in progress in a 10710 * pool at one time, making the tag unnecessary. 10711 * 10712 * There can be multiple devices being replaced at once, but since they 10713 * all finish once resilvering finishes, we don't bother keeping track 10714 * of them individually, we just wait for them all to finish. 10715 */ 10716 if (use_tag && activity != ZPOOL_WAIT_INITIALIZE && 10717 activity != ZPOOL_WAIT_TRIM) 10718 return (EINVAL); 10719 10720 if (activity < 0 || activity >= ZPOOL_WAIT_NUM_ACTIVITIES) 10721 return (EINVAL); 10722 10723 spa_t *spa; 10724 int error = spa_open(pool, &spa, FTAG); 10725 if (error != 0) 10726 return (error); 10727 10728 /* 10729 * Increment the spa's waiter count so that we can call spa_close and 10730 * still ensure that the spa_t doesn't get freed before this thread is 10731 * finished with it when the pool is exported. We want to call spa_close 10732 * before we start waiting because otherwise the additional ref would 10733 * prevent the pool from being exported or destroyed throughout the 10734 * potentially long wait. 10735 */ 10736 mutex_enter(&spa->spa_activities_lock); 10737 spa->spa_waiters++; 10738 spa_close(spa, FTAG); 10739 10740 *waited = B_FALSE; 10741 for (;;) { 10742 boolean_t in_progress; 10743 error = spa_activity_in_progress(spa, activity, use_tag, tag, 10744 &in_progress); 10745 10746 if (error || !in_progress || spa->spa_waiters_cancel) 10747 break; 10748 10749 *waited = B_TRUE; 10750 10751 if (cv_wait_sig(&spa->spa_activities_cv, 10752 &spa->spa_activities_lock) == 0) { 10753 error = EINTR; 10754 break; 10755 } 10756 } 10757 10758 spa->spa_waiters--; 10759 cv_signal(&spa->spa_waiters_cv); 10760 mutex_exit(&spa->spa_activities_lock); 10761 10762 return (error); 10763 } 10764 10765 /* 10766 * Wait for a particular instance of the specified activity to complete, where 10767 * the instance is identified by 'tag' 10768 */ 10769 int 10770 spa_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag, 10771 boolean_t *waited) 10772 { 10773 return (spa_wait_common(pool, activity, B_TRUE, tag, waited)); 10774 } 10775 10776 /* 10777 * Wait for all instances of the specified activity complete 10778 */ 10779 int 10780 spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited) 10781 { 10782 10783 return (spa_wait_common(pool, activity, B_FALSE, 0, waited)); 10784 } 10785 10786 sysevent_t * 10787 spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) 10788 { 10789 sysevent_t *ev = NULL; 10790 #ifdef _KERNEL 10791 nvlist_t *resource; 10792 10793 resource = zfs_event_create(spa, vd, FM_SYSEVENT_CLASS, name, hist_nvl); 10794 if (resource) { 10795 ev = kmem_alloc(sizeof (sysevent_t), KM_SLEEP); 10796 ev->resource = resource; 10797 } 10798 #else 10799 (void) spa, (void) vd, (void) hist_nvl, (void) name; 10800 #endif 10801 return (ev); 10802 } 10803 10804 void 10805 spa_event_post(sysevent_t *ev) 10806 { 10807 #ifdef _KERNEL 10808 if (ev) { 10809 zfs_zevent_post(ev->resource, NULL, zfs_zevent_post_cb); 10810 kmem_free(ev, sizeof (*ev)); 10811 } 10812 #else 10813 (void) ev; 10814 #endif 10815 } 10816 10817 /* 10818 * Post a zevent corresponding to the given sysevent. The 'name' must be one 10819 * of the event definitions in sys/sysevent/eventdefs.h. The payload will be 10820 * filled in from the spa and (optionally) the vdev. This doesn't do anything 10821 * in the userland libzpool, as we don't want consumers to misinterpret ztest 10822 * or zdb as real changes. 10823 */ 10824 void 10825 spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) 10826 { 10827 spa_event_post(spa_event_create(spa, vd, hist_nvl, name)); 10828 } 10829 10830 /* state manipulation functions */ 10831 EXPORT_SYMBOL(spa_open); 10832 EXPORT_SYMBOL(spa_open_rewind); 10833 EXPORT_SYMBOL(spa_get_stats); 10834 EXPORT_SYMBOL(spa_create); 10835 EXPORT_SYMBOL(spa_import); 10836 EXPORT_SYMBOL(spa_tryimport); 10837 EXPORT_SYMBOL(spa_destroy); 10838 EXPORT_SYMBOL(spa_export); 10839 EXPORT_SYMBOL(spa_reset); 10840 EXPORT_SYMBOL(spa_async_request); 10841 EXPORT_SYMBOL(spa_async_suspend); 10842 EXPORT_SYMBOL(spa_async_resume); 10843 EXPORT_SYMBOL(spa_inject_addref); 10844 EXPORT_SYMBOL(spa_inject_delref); 10845 EXPORT_SYMBOL(spa_scan_stat_init); 10846 EXPORT_SYMBOL(spa_scan_get_stats); 10847 10848 /* device manipulation */ 10849 EXPORT_SYMBOL(spa_vdev_add); 10850 EXPORT_SYMBOL(spa_vdev_attach); 10851 EXPORT_SYMBOL(spa_vdev_detach); 10852 EXPORT_SYMBOL(spa_vdev_setpath); 10853 EXPORT_SYMBOL(spa_vdev_setfru); 10854 EXPORT_SYMBOL(spa_vdev_split_mirror); 10855 10856 /* spare statech is global across all pools) */ 10857 EXPORT_SYMBOL(spa_spare_add); 10858 EXPORT_SYMBOL(spa_spare_remove); 10859 EXPORT_SYMBOL(spa_spare_exists); 10860 EXPORT_SYMBOL(spa_spare_activate); 10861 10862 /* L2ARC statech is global across all pools) */ 10863 EXPORT_SYMBOL(spa_l2cache_add); 10864 EXPORT_SYMBOL(spa_l2cache_remove); 10865 EXPORT_SYMBOL(spa_l2cache_exists); 10866 EXPORT_SYMBOL(spa_l2cache_activate); 10867 EXPORT_SYMBOL(spa_l2cache_drop); 10868 10869 /* scanning */ 10870 EXPORT_SYMBOL(spa_scan); 10871 EXPORT_SYMBOL(spa_scan_stop); 10872 10873 /* spa syncing */ 10874 EXPORT_SYMBOL(spa_sync); /* only for DMU use */ 10875 EXPORT_SYMBOL(spa_sync_allpools); 10876 10877 /* properties */ 10878 EXPORT_SYMBOL(spa_prop_set); 10879 EXPORT_SYMBOL(spa_prop_get); 10880 EXPORT_SYMBOL(spa_prop_clear_bootfs); 10881 10882 /* asynchronous event notification */ 10883 EXPORT_SYMBOL(spa_event_notify); 10884 10885 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_pct, UINT, ZMOD_RW, 10886 "Percentage of CPUs to run a metaslab preload taskq"); 10887 10888 /* BEGIN CSTYLED */ 10889 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, UINT, ZMOD_RW, 10890 "log2 fraction of arc that can be used by inflight I/Os when " 10891 "verifying pool during import"); 10892 /* END CSTYLED */ 10893 10894 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_metadata, INT, ZMOD_RW, 10895 "Set to traverse metadata on pool import"); 10896 10897 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW, 10898 "Set to traverse data on pool import"); 10899 10900 ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW, 10901 "Print vdev tree to zfs_dbgmsg during pool import"); 10902 10903 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RW, 10904 "Percentage of CPUs to run an IO worker thread"); 10905 10906 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RW, 10907 "Number of threads per IO worker taskqueue"); 10908 10909 /* BEGIN CSTYLED */ 10910 ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, U64, ZMOD_RW, 10911 "Allow importing pool with up to this number of missing top-level " 10912 "vdevs (in read-only mode)"); 10913 /* END CSTYLED */ 10914 10915 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT, 10916 ZMOD_RW, "Set the livelist condense zthr to pause"); 10917 10918 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT, 10919 ZMOD_RW, "Set the livelist condense synctask to pause"); 10920 10921 /* BEGIN CSTYLED */ 10922 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_cancel, 10923 INT, ZMOD_RW, 10924 "Whether livelist condensing was canceled in the synctask"); 10925 10926 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel, 10927 INT, ZMOD_RW, 10928 "Whether livelist condensing was canceled in the zthr function"); 10929 10930 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, 10931 ZMOD_RW, 10932 "Whether extra ALLOC blkptrs were added to a livelist entry while it " 10933 "was being condensed"); 10934 10935 #ifdef _KERNEL 10936 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read, 10937 spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RW, 10938 "Configure IO queues for read IO"); 10939 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write, 10940 spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RW, 10941 "Configure IO queues for write IO"); 10942 #endif 10943 /* END CSTYLED */ 10944 10945 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_write_tpq, UINT, ZMOD_RW, 10946 "Number of CPUs per write issue taskq"); 10947