1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2011, 2020 by Delphix. All rights reserved. 25 * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved. 26 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 27 * Copyright 2013 Saso Kiselkov. All rights reserved. 28 * Copyright (c) 2014 Integros [integros.com] 29 * Copyright 2016 Toomas Soome <tsoome@me.com> 30 * Copyright (c) 2016 Actifio, Inc. All rights reserved. 31 * Copyright 2018 Joyent, Inc. 32 * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. 33 * Copyright 2017 Joyent, Inc. 34 * Copyright (c) 2017, Intel Corporation. 35 * Copyright (c) 2021, Colm Buckley <colm@tuatha.org> 36 */ 37 38 /* 39 * SPA: Storage Pool Allocator 40 * 41 * This file contains all the routines used when modifying on-disk SPA state. 42 * This includes opening, importing, destroying, exporting a pool, and syncing a 43 * pool. 44 */ 45 46 #include <sys/zfs_context.h> 47 #include <sys/fm/fs/zfs.h> 48 #include <sys/spa_impl.h> 49 #include <sys/zio.h> 50 #include <sys/zio_checksum.h> 51 #include <sys/dmu.h> 52 #include <sys/dmu_tx.h> 53 #include <sys/zap.h> 54 #include <sys/zil.h> 55 #include <sys/ddt.h> 56 #include <sys/vdev_impl.h> 57 #include <sys/vdev_removal.h> 58 #include <sys/vdev_indirect_mapping.h> 59 #include <sys/vdev_indirect_births.h> 60 #include <sys/vdev_initialize.h> 61 #include <sys/vdev_rebuild.h> 62 #include <sys/vdev_trim.h> 63 #include <sys/vdev_disk.h> 64 #include <sys/vdev_draid.h> 65 #include <sys/metaslab.h> 66 #include <sys/metaslab_impl.h> 67 #include <sys/mmp.h> 68 #include <sys/uberblock_impl.h> 69 #include <sys/txg.h> 70 #include <sys/avl.h> 71 #include <sys/bpobj.h> 72 #include <sys/dmu_traverse.h> 73 #include <sys/dmu_objset.h> 74 #include <sys/unique.h> 75 #include <sys/dsl_pool.h> 76 #include <sys/dsl_dataset.h> 77 #include <sys/dsl_dir.h> 78 #include <sys/dsl_prop.h> 79 #include <sys/dsl_synctask.h> 80 #include <sys/fs/zfs.h> 81 #include <sys/arc.h> 82 #include <sys/callb.h> 83 #include <sys/systeminfo.h> 84 #include <sys/spa_boot.h> 85 #include <sys/zfs_ioctl.h> 86 #include <sys/dsl_scan.h> 87 #include <sys/zfeature.h> 88 #include <sys/dsl_destroy.h> 89 #include <sys/zvol.h> 90 91 #ifdef _KERNEL 92 #include <sys/fm/protocol.h> 93 #include <sys/fm/util.h> 94 #include <sys/callb.h> 95 #include <sys/zone.h> 96 #include <sys/vmsystm.h> 97 #endif /* _KERNEL */ 98 99 #include "zfs_prop.h" 100 #include "zfs_comutil.h" 101 102 /* 103 * The interval, in seconds, at which failed configuration cache file writes 104 * should be retried. 105 */ 106 int zfs_ccw_retry_interval = 300; 107 108 typedef enum zti_modes { 109 ZTI_MODE_FIXED, /* value is # of threads (min 1) */ 110 ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */ 111 ZTI_MODE_SCALE, /* Taskqs scale with CPUs. */ 112 ZTI_MODE_NULL, /* don't create a taskq */ 113 ZTI_NMODES 114 } zti_modes_t; 115 116 #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } 117 #define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 } 118 #define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 } 119 #define ZTI_SCALE { ZTI_MODE_SCALE, 0, 1 } 120 #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } 121 122 #define ZTI_N(n) ZTI_P(n, 1) 123 #define ZTI_ONE ZTI_N(1) 124 125 typedef struct zio_taskq_info { 126 zti_modes_t zti_mode; 127 uint_t zti_value; 128 uint_t zti_count; 129 } zio_taskq_info_t; 130 131 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { 132 "iss", "iss_h", "int", "int_h" 133 }; 134 135 /* 136 * This table defines the taskq settings for each ZFS I/O type. When 137 * initializing a pool, we use this table to create an appropriately sized 138 * taskq. Some operations are low volume and therefore have a small, static 139 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE 140 * macros. Other operations process a large amount of data; the ZTI_BATCH 141 * macro causes us to create a taskq oriented for throughput. Some operations 142 * are so high frequency and short-lived that the taskq itself can become a 143 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an 144 * additional degree of parallelism specified by the number of threads per- 145 * taskq and the number of taskqs; when dispatching an event in this case, the 146 * particular taskq is chosen at random. ZTI_SCALE is similar to ZTI_BATCH, 147 * but with number of taskqs also scaling with number of CPUs. 148 * 149 * The different taskq priorities are to handle the different contexts (issue 150 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that 151 * need to be handled with minimum delay. 152 */ 153 static const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { 154 /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ 155 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ 156 { ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */ 157 { ZTI_BATCH, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */ 158 { ZTI_SCALE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ 159 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ 160 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ 161 { ZTI_N(4), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* TRIM */ 162 }; 163 164 static void spa_sync_version(void *arg, dmu_tx_t *tx); 165 static void spa_sync_props(void *arg, dmu_tx_t *tx); 166 static boolean_t spa_has_active_shared_spare(spa_t *spa); 167 static int spa_load_impl(spa_t *spa, spa_import_type_t type, 168 const char **ereport); 169 static void spa_vdev_resilver_done(spa_t *spa); 170 171 static uint_t zio_taskq_batch_pct = 80; /* 1 thread per cpu in pset */ 172 static uint_t zio_taskq_batch_tpq; /* threads per taskq */ 173 static const boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ 174 static const uint_t zio_taskq_basedc = 80; /* base duty cycle */ 175 176 static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */ 177 178 /* 179 * Report any spa_load_verify errors found, but do not fail spa_load. 180 * This is used by zdb to analyze non-idle pools. 181 */ 182 boolean_t spa_load_verify_dryrun = B_FALSE; 183 184 /* 185 * Allow read spacemaps in case of readonly import (spa_mode == SPA_MODE_READ). 186 * This is used by zdb for spacemaps verification. 187 */ 188 boolean_t spa_mode_readable_spacemaps = B_FALSE; 189 190 /* 191 * This (illegal) pool name is used when temporarily importing a spa_t in order 192 * to get the vdev stats associated with the imported devices. 193 */ 194 #define TRYIMPORT_NAME "$import" 195 196 /* 197 * For debugging purposes: print out vdev tree during pool import. 198 */ 199 static int spa_load_print_vdev_tree = B_FALSE; 200 201 /* 202 * A non-zero value for zfs_max_missing_tvds means that we allow importing 203 * pools with missing top-level vdevs. This is strictly intended for advanced 204 * pool recovery cases since missing data is almost inevitable. Pools with 205 * missing devices can only be imported read-only for safety reasons, and their 206 * fail-mode will be automatically set to "continue". 207 * 208 * With 1 missing vdev we should be able to import the pool and mount all 209 * datasets. User data that was not modified after the missing device has been 210 * added should be recoverable. This means that snapshots created prior to the 211 * addition of that device should be completely intact. 212 * 213 * With 2 missing vdevs, some datasets may fail to mount since there are 214 * dataset statistics that are stored as regular metadata. Some data might be 215 * recoverable if those vdevs were added recently. 216 * 217 * With 3 or more missing vdevs, the pool is severely damaged and MOS entries 218 * may be missing entirely. Chances of data recovery are very low. Note that 219 * there are also risks of performing an inadvertent rewind as we might be 220 * missing all the vdevs with the latest uberblocks. 221 */ 222 unsigned long zfs_max_missing_tvds = 0; 223 224 /* 225 * The parameters below are similar to zfs_max_missing_tvds but are only 226 * intended for a preliminary open of the pool with an untrusted config which 227 * might be incomplete or out-dated. 228 * 229 * We are more tolerant for pools opened from a cachefile since we could have 230 * an out-dated cachefile where a device removal was not registered. 231 * We could have set the limit arbitrarily high but in the case where devices 232 * are really missing we would want to return the proper error codes; we chose 233 * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available 234 * and we get a chance to retrieve the trusted config. 235 */ 236 uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1; 237 238 /* 239 * In the case where config was assembled by scanning device paths (/dev/dsks 240 * by default) we are less tolerant since all the existing devices should have 241 * been detected and we want spa_load to return the right error codes. 242 */ 243 uint64_t zfs_max_missing_tvds_scan = 0; 244 245 /* 246 * Debugging aid that pauses spa_sync() towards the end. 247 */ 248 static const boolean_t zfs_pause_spa_sync = B_FALSE; 249 250 /* 251 * Variables to indicate the livelist condense zthr func should wait at certain 252 * points for the livelist to be removed - used to test condense/destroy races 253 */ 254 static int zfs_livelist_condense_zthr_pause = 0; 255 static int zfs_livelist_condense_sync_pause = 0; 256 257 /* 258 * Variables to track whether or not condense cancellation has been 259 * triggered in testing. 260 */ 261 static int zfs_livelist_condense_sync_cancel = 0; 262 static int zfs_livelist_condense_zthr_cancel = 0; 263 264 /* 265 * Variable to track whether or not extra ALLOC blkptrs were added to a 266 * livelist entry while it was being condensed (caused by the way we track 267 * remapped blkptrs in dbuf_remap_impl) 268 */ 269 static int zfs_livelist_condense_new_alloc = 0; 270 271 /* 272 * ========================================================================== 273 * SPA properties routines 274 * ========================================================================== 275 */ 276 277 /* 278 * Add a (source=src, propname=propval) list to an nvlist. 279 */ 280 static void 281 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, const char *strval, 282 uint64_t intval, zprop_source_t src) 283 { 284 const char *propname = zpool_prop_to_name(prop); 285 nvlist_t *propval; 286 287 propval = fnvlist_alloc(); 288 fnvlist_add_uint64(propval, ZPROP_SOURCE, src); 289 290 if (strval != NULL) 291 fnvlist_add_string(propval, ZPROP_VALUE, strval); 292 else 293 fnvlist_add_uint64(propval, ZPROP_VALUE, intval); 294 295 fnvlist_add_nvlist(nvl, propname, propval); 296 nvlist_free(propval); 297 } 298 299 /* 300 * Get property values from the spa configuration. 301 */ 302 static void 303 spa_prop_get_config(spa_t *spa, nvlist_t **nvp) 304 { 305 vdev_t *rvd = spa->spa_root_vdev; 306 dsl_pool_t *pool = spa->spa_dsl_pool; 307 uint64_t size, alloc, cap, version; 308 const zprop_source_t src = ZPROP_SRC_NONE; 309 spa_config_dirent_t *dp; 310 metaslab_class_t *mc = spa_normal_class(spa); 311 312 ASSERT(MUTEX_HELD(&spa->spa_props_lock)); 313 314 if (rvd != NULL) { 315 alloc = metaslab_class_get_alloc(mc); 316 alloc += metaslab_class_get_alloc(spa_special_class(spa)); 317 alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); 318 alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa)); 319 320 size = metaslab_class_get_space(mc); 321 size += metaslab_class_get_space(spa_special_class(spa)); 322 size += metaslab_class_get_space(spa_dedup_class(spa)); 323 size += metaslab_class_get_space(spa_embedded_log_class(spa)); 324 325 spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); 326 spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); 327 spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); 328 spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, 329 size - alloc, src); 330 spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL, 331 spa->spa_checkpoint_info.sci_dspace, src); 332 333 spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL, 334 metaslab_class_fragmentation(mc), src); 335 spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, 336 metaslab_class_expandable_space(mc), src); 337 spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, 338 (spa_mode(spa) == SPA_MODE_READ), src); 339 340 cap = (size == 0) ? 0 : (alloc * 100 / size); 341 spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); 342 343 spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, 344 ddt_get_pool_dedup_ratio(spa), src); 345 346 spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, 347 rvd->vdev_state, src); 348 349 version = spa_version(spa); 350 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) { 351 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, 352 version, ZPROP_SRC_DEFAULT); 353 } else { 354 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, 355 version, ZPROP_SRC_LOCAL); 356 } 357 spa_prop_add_list(*nvp, ZPOOL_PROP_LOAD_GUID, 358 NULL, spa_load_guid(spa), src); 359 } 360 361 if (pool != NULL) { 362 /* 363 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, 364 * when opening pools before this version freedir will be NULL. 365 */ 366 if (pool->dp_free_dir != NULL) { 367 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, 368 dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes, 369 src); 370 } else { 371 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, 372 NULL, 0, src); 373 } 374 375 if (pool->dp_leak_dir != NULL) { 376 spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, 377 dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes, 378 src); 379 } else { 380 spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, 381 NULL, 0, src); 382 } 383 } 384 385 spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); 386 387 if (spa->spa_comment != NULL) { 388 spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, 389 0, ZPROP_SRC_LOCAL); 390 } 391 392 if (spa->spa_compatibility != NULL) { 393 spa_prop_add_list(*nvp, ZPOOL_PROP_COMPATIBILITY, 394 spa->spa_compatibility, 0, ZPROP_SRC_LOCAL); 395 } 396 397 if (spa->spa_root != NULL) 398 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 399 0, ZPROP_SRC_LOCAL); 400 401 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { 402 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, 403 MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE); 404 } else { 405 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, 406 SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); 407 } 408 409 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { 410 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, 411 DNODE_MAX_SIZE, ZPROP_SRC_NONE); 412 } else { 413 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, 414 DNODE_MIN_SIZE, ZPROP_SRC_NONE); 415 } 416 417 if ((dp = list_head(&spa->spa_config_list)) != NULL) { 418 if (dp->scd_path == NULL) { 419 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 420 "none", 0, ZPROP_SRC_LOCAL); 421 } else if (strcmp(dp->scd_path, spa_config_path) != 0) { 422 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 423 dp->scd_path, 0, ZPROP_SRC_LOCAL); 424 } 425 } 426 } 427 428 /* 429 * Get zpool property values. 430 */ 431 int 432 spa_prop_get(spa_t *spa, nvlist_t **nvp) 433 { 434 objset_t *mos = spa->spa_meta_objset; 435 zap_cursor_t zc; 436 zap_attribute_t za; 437 dsl_pool_t *dp; 438 int err; 439 440 err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP); 441 if (err) 442 return (err); 443 444 dp = spa_get_dsl(spa); 445 dsl_pool_config_enter(dp, FTAG); 446 mutex_enter(&spa->spa_props_lock); 447 448 /* 449 * Get properties from the spa config. 450 */ 451 spa_prop_get_config(spa, nvp); 452 453 /* If no pool property object, no more prop to get. */ 454 if (mos == NULL || spa->spa_pool_props_object == 0) 455 goto out; 456 457 /* 458 * Get properties from the MOS pool property object. 459 */ 460 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 461 (err = zap_cursor_retrieve(&zc, &za)) == 0; 462 zap_cursor_advance(&zc)) { 463 uint64_t intval = 0; 464 char *strval = NULL; 465 zprop_source_t src = ZPROP_SRC_DEFAULT; 466 zpool_prop_t prop; 467 468 if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL) 469 continue; 470 471 switch (za.za_integer_length) { 472 case 8: 473 /* integer property */ 474 if (za.za_first_integer != 475 zpool_prop_default_numeric(prop)) 476 src = ZPROP_SRC_LOCAL; 477 478 if (prop == ZPOOL_PROP_BOOTFS) { 479 dsl_dataset_t *ds = NULL; 480 481 err = dsl_dataset_hold_obj(dp, 482 za.za_first_integer, FTAG, &ds); 483 if (err != 0) 484 break; 485 486 strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, 487 KM_SLEEP); 488 dsl_dataset_name(ds, strval); 489 dsl_dataset_rele(ds, FTAG); 490 } else { 491 strval = NULL; 492 intval = za.za_first_integer; 493 } 494 495 spa_prop_add_list(*nvp, prop, strval, intval, src); 496 497 if (strval != NULL) 498 kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN); 499 500 break; 501 502 case 1: 503 /* string property */ 504 strval = kmem_alloc(za.za_num_integers, KM_SLEEP); 505 err = zap_lookup(mos, spa->spa_pool_props_object, 506 za.za_name, 1, za.za_num_integers, strval); 507 if (err) { 508 kmem_free(strval, za.za_num_integers); 509 break; 510 } 511 spa_prop_add_list(*nvp, prop, strval, 0, src); 512 kmem_free(strval, za.za_num_integers); 513 break; 514 515 default: 516 break; 517 } 518 } 519 zap_cursor_fini(&zc); 520 out: 521 mutex_exit(&spa->spa_props_lock); 522 dsl_pool_config_exit(dp, FTAG); 523 if (err && err != ENOENT) { 524 nvlist_free(*nvp); 525 *nvp = NULL; 526 return (err); 527 } 528 529 return (0); 530 } 531 532 /* 533 * Validate the given pool properties nvlist and modify the list 534 * for the property values to be set. 535 */ 536 static int 537 spa_prop_validate(spa_t *spa, nvlist_t *props) 538 { 539 nvpair_t *elem; 540 int error = 0, reset_bootfs = 0; 541 uint64_t objnum = 0; 542 boolean_t has_feature = B_FALSE; 543 544 elem = NULL; 545 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 546 uint64_t intval; 547 char *strval, *slash, *check, *fname; 548 const char *propname = nvpair_name(elem); 549 zpool_prop_t prop = zpool_name_to_prop(propname); 550 551 switch (prop) { 552 case ZPOOL_PROP_INVAL: 553 if (!zpool_prop_feature(propname)) { 554 error = SET_ERROR(EINVAL); 555 break; 556 } 557 558 /* 559 * Sanitize the input. 560 */ 561 if (nvpair_type(elem) != DATA_TYPE_UINT64) { 562 error = SET_ERROR(EINVAL); 563 break; 564 } 565 566 if (nvpair_value_uint64(elem, &intval) != 0) { 567 error = SET_ERROR(EINVAL); 568 break; 569 } 570 571 if (intval != 0) { 572 error = SET_ERROR(EINVAL); 573 break; 574 } 575 576 fname = strchr(propname, '@') + 1; 577 if (zfeature_lookup_name(fname, NULL) != 0) { 578 error = SET_ERROR(EINVAL); 579 break; 580 } 581 582 has_feature = B_TRUE; 583 break; 584 585 case ZPOOL_PROP_VERSION: 586 error = nvpair_value_uint64(elem, &intval); 587 if (!error && 588 (intval < spa_version(spa) || 589 intval > SPA_VERSION_BEFORE_FEATURES || 590 has_feature)) 591 error = SET_ERROR(EINVAL); 592 break; 593 594 case ZPOOL_PROP_DELEGATION: 595 case ZPOOL_PROP_AUTOREPLACE: 596 case ZPOOL_PROP_LISTSNAPS: 597 case ZPOOL_PROP_AUTOEXPAND: 598 case ZPOOL_PROP_AUTOTRIM: 599 error = nvpair_value_uint64(elem, &intval); 600 if (!error && intval > 1) 601 error = SET_ERROR(EINVAL); 602 break; 603 604 case ZPOOL_PROP_MULTIHOST: 605 error = nvpair_value_uint64(elem, &intval); 606 if (!error && intval > 1) 607 error = SET_ERROR(EINVAL); 608 609 if (!error) { 610 uint32_t hostid = zone_get_hostid(NULL); 611 if (hostid) 612 spa->spa_hostid = hostid; 613 else 614 error = SET_ERROR(ENOTSUP); 615 } 616 617 break; 618 619 case ZPOOL_PROP_BOOTFS: 620 /* 621 * If the pool version is less than SPA_VERSION_BOOTFS, 622 * or the pool is still being created (version == 0), 623 * the bootfs property cannot be set. 624 */ 625 if (spa_version(spa) < SPA_VERSION_BOOTFS) { 626 error = SET_ERROR(ENOTSUP); 627 break; 628 } 629 630 /* 631 * Make sure the vdev config is bootable 632 */ 633 if (!vdev_is_bootable(spa->spa_root_vdev)) { 634 error = SET_ERROR(ENOTSUP); 635 break; 636 } 637 638 reset_bootfs = 1; 639 640 error = nvpair_value_string(elem, &strval); 641 642 if (!error) { 643 objset_t *os; 644 645 if (strval == NULL || strval[0] == '\0') { 646 objnum = zpool_prop_default_numeric( 647 ZPOOL_PROP_BOOTFS); 648 break; 649 } 650 651 error = dmu_objset_hold(strval, FTAG, &os); 652 if (error != 0) 653 break; 654 655 /* Must be ZPL. */ 656 if (dmu_objset_type(os) != DMU_OST_ZFS) { 657 error = SET_ERROR(ENOTSUP); 658 } else { 659 objnum = dmu_objset_id(os); 660 } 661 dmu_objset_rele(os, FTAG); 662 } 663 break; 664 665 case ZPOOL_PROP_FAILUREMODE: 666 error = nvpair_value_uint64(elem, &intval); 667 if (!error && intval > ZIO_FAILURE_MODE_PANIC) 668 error = SET_ERROR(EINVAL); 669 670 /* 671 * This is a special case which only occurs when 672 * the pool has completely failed. This allows 673 * the user to change the in-core failmode property 674 * without syncing it out to disk (I/Os might 675 * currently be blocked). We do this by returning 676 * EIO to the caller (spa_prop_set) to trick it 677 * into thinking we encountered a property validation 678 * error. 679 */ 680 if (!error && spa_suspended(spa)) { 681 spa->spa_failmode = intval; 682 error = SET_ERROR(EIO); 683 } 684 break; 685 686 case ZPOOL_PROP_CACHEFILE: 687 if ((error = nvpair_value_string(elem, &strval)) != 0) 688 break; 689 690 if (strval[0] == '\0') 691 break; 692 693 if (strcmp(strval, "none") == 0) 694 break; 695 696 if (strval[0] != '/') { 697 error = SET_ERROR(EINVAL); 698 break; 699 } 700 701 slash = strrchr(strval, '/'); 702 ASSERT(slash != NULL); 703 704 if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 705 strcmp(slash, "/..") == 0) 706 error = SET_ERROR(EINVAL); 707 break; 708 709 case ZPOOL_PROP_COMMENT: 710 if ((error = nvpair_value_string(elem, &strval)) != 0) 711 break; 712 for (check = strval; *check != '\0'; check++) { 713 if (!isprint(*check)) { 714 error = SET_ERROR(EINVAL); 715 break; 716 } 717 } 718 if (strlen(strval) > ZPROP_MAX_COMMENT) 719 error = SET_ERROR(E2BIG); 720 break; 721 722 default: 723 break; 724 } 725 726 if (error) 727 break; 728 } 729 730 (void) nvlist_remove_all(props, 731 zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO)); 732 733 if (!error && reset_bootfs) { 734 error = nvlist_remove(props, 735 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 736 737 if (!error) { 738 error = nvlist_add_uint64(props, 739 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 740 } 741 } 742 743 return (error); 744 } 745 746 void 747 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) 748 { 749 char *cachefile; 750 spa_config_dirent_t *dp; 751 752 if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), 753 &cachefile) != 0) 754 return; 755 756 dp = kmem_alloc(sizeof (spa_config_dirent_t), 757 KM_SLEEP); 758 759 if (cachefile[0] == '\0') 760 dp->scd_path = spa_strdup(spa_config_path); 761 else if (strcmp(cachefile, "none") == 0) 762 dp->scd_path = NULL; 763 else 764 dp->scd_path = spa_strdup(cachefile); 765 766 list_insert_head(&spa->spa_config_list, dp); 767 if (need_sync) 768 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 769 } 770 771 int 772 spa_prop_set(spa_t *spa, nvlist_t *nvp) 773 { 774 int error; 775 nvpair_t *elem = NULL; 776 boolean_t need_sync = B_FALSE; 777 778 if ((error = spa_prop_validate(spa, nvp)) != 0) 779 return (error); 780 781 while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { 782 zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); 783 784 if (prop == ZPOOL_PROP_CACHEFILE || 785 prop == ZPOOL_PROP_ALTROOT || 786 prop == ZPOOL_PROP_READONLY) 787 continue; 788 789 if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) { 790 uint64_t ver = 0; 791 792 if (prop == ZPOOL_PROP_VERSION) { 793 VERIFY(nvpair_value_uint64(elem, &ver) == 0); 794 } else { 795 ASSERT(zpool_prop_feature(nvpair_name(elem))); 796 ver = SPA_VERSION_FEATURES; 797 need_sync = B_TRUE; 798 } 799 800 /* Save time if the version is already set. */ 801 if (ver == spa_version(spa)) 802 continue; 803 804 /* 805 * In addition to the pool directory object, we might 806 * create the pool properties object, the features for 807 * read object, the features for write object, or the 808 * feature descriptions object. 809 */ 810 error = dsl_sync_task(spa->spa_name, NULL, 811 spa_sync_version, &ver, 812 6, ZFS_SPACE_CHECK_RESERVED); 813 if (error) 814 return (error); 815 continue; 816 } 817 818 need_sync = B_TRUE; 819 break; 820 } 821 822 if (need_sync) { 823 return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, 824 nvp, 6, ZFS_SPACE_CHECK_RESERVED)); 825 } 826 827 return (0); 828 } 829 830 /* 831 * If the bootfs property value is dsobj, clear it. 832 */ 833 void 834 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 835 { 836 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 837 VERIFY(zap_remove(spa->spa_meta_objset, 838 spa->spa_pool_props_object, 839 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 840 spa->spa_bootfs = 0; 841 } 842 } 843 844 static int 845 spa_change_guid_check(void *arg, dmu_tx_t *tx) 846 { 847 uint64_t *newguid __maybe_unused = arg; 848 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 849 vdev_t *rvd = spa->spa_root_vdev; 850 uint64_t vdev_state; 851 852 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 853 int error = (spa_has_checkpoint(spa)) ? 854 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 855 return (SET_ERROR(error)); 856 } 857 858 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 859 vdev_state = rvd->vdev_state; 860 spa_config_exit(spa, SCL_STATE, FTAG); 861 862 if (vdev_state != VDEV_STATE_HEALTHY) 863 return (SET_ERROR(ENXIO)); 864 865 ASSERT3U(spa_guid(spa), !=, *newguid); 866 867 return (0); 868 } 869 870 static void 871 spa_change_guid_sync(void *arg, dmu_tx_t *tx) 872 { 873 uint64_t *newguid = arg; 874 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 875 uint64_t oldguid; 876 vdev_t *rvd = spa->spa_root_vdev; 877 878 oldguid = spa_guid(spa); 879 880 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 881 rvd->vdev_guid = *newguid; 882 rvd->vdev_guid_sum += (*newguid - oldguid); 883 vdev_config_dirty(rvd); 884 spa_config_exit(spa, SCL_STATE, FTAG); 885 886 spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", 887 (u_longlong_t)oldguid, (u_longlong_t)*newguid); 888 } 889 890 /* 891 * Change the GUID for the pool. This is done so that we can later 892 * re-import a pool built from a clone of our own vdevs. We will modify 893 * the root vdev's guid, our own pool guid, and then mark all of our 894 * vdevs dirty. Note that we must make sure that all our vdevs are 895 * online when we do this, or else any vdevs that weren't present 896 * would be orphaned from our pool. We are also going to issue a 897 * sysevent to update any watchers. 898 */ 899 int 900 spa_change_guid(spa_t *spa) 901 { 902 int error; 903 uint64_t guid; 904 905 mutex_enter(&spa->spa_vdev_top_lock); 906 mutex_enter(&spa_namespace_lock); 907 guid = spa_generate_guid(NULL); 908 909 error = dsl_sync_task(spa->spa_name, spa_change_guid_check, 910 spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); 911 912 if (error == 0) { 913 spa_write_cachefile(spa, B_FALSE, B_TRUE); 914 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); 915 } 916 917 mutex_exit(&spa_namespace_lock); 918 mutex_exit(&spa->spa_vdev_top_lock); 919 920 return (error); 921 } 922 923 /* 924 * ========================================================================== 925 * SPA state manipulation (open/create/destroy/import/export) 926 * ========================================================================== 927 */ 928 929 static int 930 spa_error_entry_compare(const void *a, const void *b) 931 { 932 const spa_error_entry_t *sa = (const spa_error_entry_t *)a; 933 const spa_error_entry_t *sb = (const spa_error_entry_t *)b; 934 int ret; 935 936 ret = memcmp(&sa->se_bookmark, &sb->se_bookmark, 937 sizeof (zbookmark_phys_t)); 938 939 return (TREE_ISIGN(ret)); 940 } 941 942 /* 943 * Utility function which retrieves copies of the current logs and 944 * re-initializes them in the process. 945 */ 946 void 947 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 948 { 949 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 950 951 memcpy(last, &spa->spa_errlist_last, sizeof (avl_tree_t)); 952 memcpy(scrub, &spa->spa_errlist_scrub, sizeof (avl_tree_t)); 953 954 avl_create(&spa->spa_errlist_scrub, 955 spa_error_entry_compare, sizeof (spa_error_entry_t), 956 offsetof(spa_error_entry_t, se_avl)); 957 avl_create(&spa->spa_errlist_last, 958 spa_error_entry_compare, sizeof (spa_error_entry_t), 959 offsetof(spa_error_entry_t, se_avl)); 960 } 961 962 static void 963 spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) 964 { 965 const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; 966 enum zti_modes mode = ztip->zti_mode; 967 uint_t value = ztip->zti_value; 968 uint_t count = ztip->zti_count; 969 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 970 uint_t cpus, flags = TASKQ_DYNAMIC; 971 boolean_t batch = B_FALSE; 972 973 switch (mode) { 974 case ZTI_MODE_FIXED: 975 ASSERT3U(value, >, 0); 976 break; 977 978 case ZTI_MODE_BATCH: 979 batch = B_TRUE; 980 flags |= TASKQ_THREADS_CPU_PCT; 981 value = MIN(zio_taskq_batch_pct, 100); 982 break; 983 984 case ZTI_MODE_SCALE: 985 flags |= TASKQ_THREADS_CPU_PCT; 986 /* 987 * We want more taskqs to reduce lock contention, but we want 988 * less for better request ordering and CPU utilization. 989 */ 990 cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100); 991 if (zio_taskq_batch_tpq > 0) { 992 count = MAX(1, (cpus + zio_taskq_batch_tpq / 2) / 993 zio_taskq_batch_tpq); 994 } else { 995 /* 996 * Prefer 6 threads per taskq, but no more taskqs 997 * than threads in them on large systems. For 80%: 998 * 999 * taskq taskq total 1000 * cpus taskqs percent threads threads 1001 * ------- ------- ------- ------- ------- 1002 * 1 1 80% 1 1 1003 * 2 1 80% 1 1 1004 * 4 1 80% 3 3 1005 * 8 2 40% 3 6 1006 * 16 3 27% 4 12 1007 * 32 5 16% 5 25 1008 * 64 7 11% 7 49 1009 * 128 10 8% 10 100 1010 * 256 14 6% 15 210 1011 */ 1012 count = 1 + cpus / 6; 1013 while (count * count > cpus) 1014 count--; 1015 } 1016 /* Limit each taskq within 100% to not trigger assertion. */ 1017 count = MAX(count, (zio_taskq_batch_pct + 99) / 100); 1018 value = (zio_taskq_batch_pct + count / 2) / count; 1019 break; 1020 1021 case ZTI_MODE_NULL: 1022 tqs->stqs_count = 0; 1023 tqs->stqs_taskq = NULL; 1024 return; 1025 1026 default: 1027 panic("unrecognized mode for %s_%s taskq (%u:%u) in " 1028 "spa_activate()", 1029 zio_type_name[t], zio_taskq_types[q], mode, value); 1030 break; 1031 } 1032 1033 ASSERT3U(count, >, 0); 1034 tqs->stqs_count = count; 1035 tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); 1036 1037 for (uint_t i = 0; i < count; i++) { 1038 taskq_t *tq; 1039 char name[32]; 1040 1041 if (count > 1) 1042 (void) snprintf(name, sizeof (name), "%s_%s_%u", 1043 zio_type_name[t], zio_taskq_types[q], i); 1044 else 1045 (void) snprintf(name, sizeof (name), "%s_%s", 1046 zio_type_name[t], zio_taskq_types[q]); 1047 1048 if (zio_taskq_sysdc && spa->spa_proc != &p0) { 1049 if (batch) 1050 flags |= TASKQ_DC_BATCH; 1051 1052 (void) zio_taskq_basedc; 1053 tq = taskq_create_sysdc(name, value, 50, INT_MAX, 1054 spa->spa_proc, zio_taskq_basedc, flags); 1055 } else { 1056 pri_t pri = maxclsyspri; 1057 /* 1058 * The write issue taskq can be extremely CPU 1059 * intensive. Run it at slightly less important 1060 * priority than the other taskqs. 1061 * 1062 * Under Linux and FreeBSD this means incrementing 1063 * the priority value as opposed to platforms like 1064 * illumos where it should be decremented. 1065 * 1066 * On FreeBSD, if priorities divided by four (RQ_PPQ) 1067 * are equal then a difference between them is 1068 * insignificant. 1069 */ 1070 if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) { 1071 #if defined(__linux__) 1072 pri++; 1073 #elif defined(__FreeBSD__) 1074 pri += 4; 1075 #else 1076 #error "unknown OS" 1077 #endif 1078 } 1079 tq = taskq_create_proc(name, value, pri, 50, 1080 INT_MAX, spa->spa_proc, flags); 1081 } 1082 1083 tqs->stqs_taskq[i] = tq; 1084 } 1085 } 1086 1087 static void 1088 spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) 1089 { 1090 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1091 1092 if (tqs->stqs_taskq == NULL) { 1093 ASSERT3U(tqs->stqs_count, ==, 0); 1094 return; 1095 } 1096 1097 for (uint_t i = 0; i < tqs->stqs_count; i++) { 1098 ASSERT3P(tqs->stqs_taskq[i], !=, NULL); 1099 taskq_destroy(tqs->stqs_taskq[i]); 1100 } 1101 1102 kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *)); 1103 tqs->stqs_taskq = NULL; 1104 } 1105 1106 /* 1107 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. 1108 * Note that a type may have multiple discrete taskqs to avoid lock contention 1109 * on the taskq itself. In that case we choose which taskq at random by using 1110 * the low bits of gethrtime(). 1111 */ 1112 void 1113 spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, 1114 task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent) 1115 { 1116 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1117 taskq_t *tq; 1118 1119 ASSERT3P(tqs->stqs_taskq, !=, NULL); 1120 ASSERT3U(tqs->stqs_count, !=, 0); 1121 1122 if (tqs->stqs_count == 1) { 1123 tq = tqs->stqs_taskq[0]; 1124 } else { 1125 tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; 1126 } 1127 1128 taskq_dispatch_ent(tq, func, arg, flags, ent); 1129 } 1130 1131 /* 1132 * Same as spa_taskq_dispatch_ent() but block on the task until completion. 1133 */ 1134 void 1135 spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q, 1136 task_func_t *func, void *arg, uint_t flags) 1137 { 1138 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1139 taskq_t *tq; 1140 taskqid_t id; 1141 1142 ASSERT3P(tqs->stqs_taskq, !=, NULL); 1143 ASSERT3U(tqs->stqs_count, !=, 0); 1144 1145 if (tqs->stqs_count == 1) { 1146 tq = tqs->stqs_taskq[0]; 1147 } else { 1148 tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; 1149 } 1150 1151 id = taskq_dispatch(tq, func, arg, flags); 1152 if (id) 1153 taskq_wait_id(tq, id); 1154 } 1155 1156 static void 1157 spa_create_zio_taskqs(spa_t *spa) 1158 { 1159 for (int t = 0; t < ZIO_TYPES; t++) { 1160 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 1161 spa_taskqs_init(spa, t, q); 1162 } 1163 } 1164 } 1165 1166 /* 1167 * Disabled until spa_thread() can be adapted for Linux. 1168 */ 1169 #undef HAVE_SPA_THREAD 1170 1171 #if defined(_KERNEL) && defined(HAVE_SPA_THREAD) 1172 static void 1173 spa_thread(void *arg) 1174 { 1175 psetid_t zio_taskq_psrset_bind = PS_NONE; 1176 callb_cpr_t cprinfo; 1177 1178 spa_t *spa = arg; 1179 user_t *pu = PTOU(curproc); 1180 1181 CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, 1182 spa->spa_name); 1183 1184 ASSERT(curproc != &p0); 1185 (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), 1186 "zpool-%s", spa->spa_name); 1187 (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); 1188 1189 /* bind this thread to the requested psrset */ 1190 if (zio_taskq_psrset_bind != PS_NONE) { 1191 pool_lock(); 1192 mutex_enter(&cpu_lock); 1193 mutex_enter(&pidlock); 1194 mutex_enter(&curproc->p_lock); 1195 1196 if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, 1197 0, NULL, NULL) == 0) { 1198 curthread->t_bind_pset = zio_taskq_psrset_bind; 1199 } else { 1200 cmn_err(CE_WARN, 1201 "Couldn't bind process for zfs pool \"%s\" to " 1202 "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); 1203 } 1204 1205 mutex_exit(&curproc->p_lock); 1206 mutex_exit(&pidlock); 1207 mutex_exit(&cpu_lock); 1208 pool_unlock(); 1209 } 1210 1211 if (zio_taskq_sysdc) { 1212 sysdc_thread_enter(curthread, 100, 0); 1213 } 1214 1215 spa->spa_proc = curproc; 1216 spa->spa_did = curthread->t_did; 1217 1218 spa_create_zio_taskqs(spa); 1219 1220 mutex_enter(&spa->spa_proc_lock); 1221 ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); 1222 1223 spa->spa_proc_state = SPA_PROC_ACTIVE; 1224 cv_broadcast(&spa->spa_proc_cv); 1225 1226 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1227 while (spa->spa_proc_state == SPA_PROC_ACTIVE) 1228 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1229 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); 1230 1231 ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); 1232 spa->spa_proc_state = SPA_PROC_GONE; 1233 spa->spa_proc = &p0; 1234 cv_broadcast(&spa->spa_proc_cv); 1235 CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ 1236 1237 mutex_enter(&curproc->p_lock); 1238 lwp_exit(); 1239 } 1240 #endif 1241 1242 /* 1243 * Activate an uninitialized pool. 1244 */ 1245 static void 1246 spa_activate(spa_t *spa, spa_mode_t mode) 1247 { 1248 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 1249 1250 spa->spa_state = POOL_STATE_ACTIVE; 1251 spa->spa_mode = mode; 1252 spa->spa_read_spacemaps = spa_mode_readable_spacemaps; 1253 1254 spa->spa_normal_class = metaslab_class_create(spa, &zfs_metaslab_ops); 1255 spa->spa_log_class = metaslab_class_create(spa, &zfs_metaslab_ops); 1256 spa->spa_embedded_log_class = 1257 metaslab_class_create(spa, &zfs_metaslab_ops); 1258 spa->spa_special_class = metaslab_class_create(spa, &zfs_metaslab_ops); 1259 spa->spa_dedup_class = metaslab_class_create(spa, &zfs_metaslab_ops); 1260 1261 /* Try to create a covering process */ 1262 mutex_enter(&spa->spa_proc_lock); 1263 ASSERT(spa->spa_proc_state == SPA_PROC_NONE); 1264 ASSERT(spa->spa_proc == &p0); 1265 spa->spa_did = 0; 1266 1267 (void) spa_create_process; 1268 #ifdef HAVE_SPA_THREAD 1269 /* Only create a process if we're going to be around a while. */ 1270 if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { 1271 if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, 1272 NULL, 0) == 0) { 1273 spa->spa_proc_state = SPA_PROC_CREATED; 1274 while (spa->spa_proc_state == SPA_PROC_CREATED) { 1275 cv_wait(&spa->spa_proc_cv, 1276 &spa->spa_proc_lock); 1277 } 1278 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1279 ASSERT(spa->spa_proc != &p0); 1280 ASSERT(spa->spa_did != 0); 1281 } else { 1282 #ifdef _KERNEL 1283 cmn_err(CE_WARN, 1284 "Couldn't create process for zfs pool \"%s\"\n", 1285 spa->spa_name); 1286 #endif 1287 } 1288 } 1289 #endif /* HAVE_SPA_THREAD */ 1290 mutex_exit(&spa->spa_proc_lock); 1291 1292 /* If we didn't create a process, we need to create our taskqs. */ 1293 if (spa->spa_proc == &p0) { 1294 spa_create_zio_taskqs(spa); 1295 } 1296 1297 for (size_t i = 0; i < TXG_SIZE; i++) { 1298 spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, 1299 ZIO_FLAG_CANFAIL); 1300 } 1301 1302 list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), 1303 offsetof(vdev_t, vdev_config_dirty_node)); 1304 list_create(&spa->spa_evicting_os_list, sizeof (objset_t), 1305 offsetof(objset_t, os_evicting_node)); 1306 list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), 1307 offsetof(vdev_t, vdev_state_dirty_node)); 1308 1309 txg_list_create(&spa->spa_vdev_txg_list, spa, 1310 offsetof(struct vdev, vdev_txg_node)); 1311 1312 avl_create(&spa->spa_errlist_scrub, 1313 spa_error_entry_compare, sizeof (spa_error_entry_t), 1314 offsetof(spa_error_entry_t, se_avl)); 1315 avl_create(&spa->spa_errlist_last, 1316 spa_error_entry_compare, sizeof (spa_error_entry_t), 1317 offsetof(spa_error_entry_t, se_avl)); 1318 1319 spa_activate_os(spa); 1320 1321 spa_keystore_init(&spa->spa_keystore); 1322 1323 /* 1324 * This taskq is used to perform zvol-minor-related tasks 1325 * asynchronously. This has several advantages, including easy 1326 * resolution of various deadlocks. 1327 * 1328 * The taskq must be single threaded to ensure tasks are always 1329 * processed in the order in which they were dispatched. 1330 * 1331 * A taskq per pool allows one to keep the pools independent. 1332 * This way if one pool is suspended, it will not impact another. 1333 * 1334 * The preferred location to dispatch a zvol minor task is a sync 1335 * task. In this context, there is easy access to the spa_t and minimal 1336 * error handling is required because the sync task must succeed. 1337 */ 1338 spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri, 1339 1, INT_MAX, 0); 1340 1341 /* 1342 * Taskq dedicated to prefetcher threads: this is used to prevent the 1343 * pool traverse code from monopolizing the global (and limited) 1344 * system_taskq by inappropriately scheduling long running tasks on it. 1345 */ 1346 spa->spa_prefetch_taskq = taskq_create("z_prefetch", 100, 1347 defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); 1348 1349 /* 1350 * The taskq to upgrade datasets in this pool. Currently used by 1351 * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA. 1352 */ 1353 spa->spa_upgrade_taskq = taskq_create("z_upgrade", 100, 1354 defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); 1355 } 1356 1357 /* 1358 * Opposite of spa_activate(). 1359 */ 1360 static void 1361 spa_deactivate(spa_t *spa) 1362 { 1363 ASSERT(spa->spa_sync_on == B_FALSE); 1364 ASSERT(spa->spa_dsl_pool == NULL); 1365 ASSERT(spa->spa_root_vdev == NULL); 1366 ASSERT(spa->spa_async_zio_root == NULL); 1367 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 1368 1369 spa_evicting_os_wait(spa); 1370 1371 if (spa->spa_zvol_taskq) { 1372 taskq_destroy(spa->spa_zvol_taskq); 1373 spa->spa_zvol_taskq = NULL; 1374 } 1375 1376 if (spa->spa_prefetch_taskq) { 1377 taskq_destroy(spa->spa_prefetch_taskq); 1378 spa->spa_prefetch_taskq = NULL; 1379 } 1380 1381 if (spa->spa_upgrade_taskq) { 1382 taskq_destroy(spa->spa_upgrade_taskq); 1383 spa->spa_upgrade_taskq = NULL; 1384 } 1385 1386 txg_list_destroy(&spa->spa_vdev_txg_list); 1387 1388 list_destroy(&spa->spa_config_dirty_list); 1389 list_destroy(&spa->spa_evicting_os_list); 1390 list_destroy(&spa->spa_state_dirty_list); 1391 1392 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); 1393 1394 for (int t = 0; t < ZIO_TYPES; t++) { 1395 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 1396 spa_taskqs_fini(spa, t, q); 1397 } 1398 } 1399 1400 for (size_t i = 0; i < TXG_SIZE; i++) { 1401 ASSERT3P(spa->spa_txg_zio[i], !=, NULL); 1402 VERIFY0(zio_wait(spa->spa_txg_zio[i])); 1403 spa->spa_txg_zio[i] = NULL; 1404 } 1405 1406 metaslab_class_destroy(spa->spa_normal_class); 1407 spa->spa_normal_class = NULL; 1408 1409 metaslab_class_destroy(spa->spa_log_class); 1410 spa->spa_log_class = NULL; 1411 1412 metaslab_class_destroy(spa->spa_embedded_log_class); 1413 spa->spa_embedded_log_class = NULL; 1414 1415 metaslab_class_destroy(spa->spa_special_class); 1416 spa->spa_special_class = NULL; 1417 1418 metaslab_class_destroy(spa->spa_dedup_class); 1419 spa->spa_dedup_class = NULL; 1420 1421 /* 1422 * If this was part of an import or the open otherwise failed, we may 1423 * still have errors left in the queues. Empty them just in case. 1424 */ 1425 spa_errlog_drain(spa); 1426 avl_destroy(&spa->spa_errlist_scrub); 1427 avl_destroy(&spa->spa_errlist_last); 1428 1429 spa_keystore_fini(&spa->spa_keystore); 1430 1431 spa->spa_state = POOL_STATE_UNINITIALIZED; 1432 1433 mutex_enter(&spa->spa_proc_lock); 1434 if (spa->spa_proc_state != SPA_PROC_NONE) { 1435 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1436 spa->spa_proc_state = SPA_PROC_DEACTIVATE; 1437 cv_broadcast(&spa->spa_proc_cv); 1438 while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { 1439 ASSERT(spa->spa_proc != &p0); 1440 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1441 } 1442 ASSERT(spa->spa_proc_state == SPA_PROC_GONE); 1443 spa->spa_proc_state = SPA_PROC_NONE; 1444 } 1445 ASSERT(spa->spa_proc == &p0); 1446 mutex_exit(&spa->spa_proc_lock); 1447 1448 /* 1449 * We want to make sure spa_thread() has actually exited the ZFS 1450 * module, so that the module can't be unloaded out from underneath 1451 * it. 1452 */ 1453 if (spa->spa_did != 0) { 1454 thread_join(spa->spa_did); 1455 spa->spa_did = 0; 1456 } 1457 1458 spa_deactivate_os(spa); 1459 1460 } 1461 1462 /* 1463 * Verify a pool configuration, and construct the vdev tree appropriately. This 1464 * will create all the necessary vdevs in the appropriate layout, with each vdev 1465 * in the CLOSED state. This will prep the pool before open/creation/import. 1466 * All vdev validation is done by the vdev_alloc() routine. 1467 */ 1468 int 1469 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 1470 uint_t id, int atype) 1471 { 1472 nvlist_t **child; 1473 uint_t children; 1474 int error; 1475 1476 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 1477 return (error); 1478 1479 if ((*vdp)->vdev_ops->vdev_op_leaf) 1480 return (0); 1481 1482 error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 1483 &child, &children); 1484 1485 if (error == ENOENT) 1486 return (0); 1487 1488 if (error) { 1489 vdev_free(*vdp); 1490 *vdp = NULL; 1491 return (SET_ERROR(EINVAL)); 1492 } 1493 1494 for (int c = 0; c < children; c++) { 1495 vdev_t *vd; 1496 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 1497 atype)) != 0) { 1498 vdev_free(*vdp); 1499 *vdp = NULL; 1500 return (error); 1501 } 1502 } 1503 1504 ASSERT(*vdp != NULL); 1505 1506 return (0); 1507 } 1508 1509 static boolean_t 1510 spa_should_flush_logs_on_unload(spa_t *spa) 1511 { 1512 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) 1513 return (B_FALSE); 1514 1515 if (!spa_writeable(spa)) 1516 return (B_FALSE); 1517 1518 if (!spa->spa_sync_on) 1519 return (B_FALSE); 1520 1521 if (spa_state(spa) != POOL_STATE_EXPORTED) 1522 return (B_FALSE); 1523 1524 if (zfs_keep_log_spacemaps_at_export) 1525 return (B_FALSE); 1526 1527 return (B_TRUE); 1528 } 1529 1530 /* 1531 * Opens a transaction that will set the flag that will instruct 1532 * spa_sync to attempt to flush all the metaslabs for that txg. 1533 */ 1534 static void 1535 spa_unload_log_sm_flush_all(spa_t *spa) 1536 { 1537 dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 1538 VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); 1539 1540 ASSERT3U(spa->spa_log_flushall_txg, ==, 0); 1541 spa->spa_log_flushall_txg = dmu_tx_get_txg(tx); 1542 1543 dmu_tx_commit(tx); 1544 txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg); 1545 } 1546 1547 static void 1548 spa_unload_log_sm_metadata(spa_t *spa) 1549 { 1550 void *cookie = NULL; 1551 spa_log_sm_t *sls; 1552 while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg, 1553 &cookie)) != NULL) { 1554 VERIFY0(sls->sls_mscount); 1555 kmem_free(sls, sizeof (spa_log_sm_t)); 1556 } 1557 1558 for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); 1559 e != NULL; e = list_head(&spa->spa_log_summary)) { 1560 VERIFY0(e->lse_mscount); 1561 list_remove(&spa->spa_log_summary, e); 1562 kmem_free(e, sizeof (log_summary_entry_t)); 1563 } 1564 1565 spa->spa_unflushed_stats.sus_nblocks = 0; 1566 spa->spa_unflushed_stats.sus_memused = 0; 1567 spa->spa_unflushed_stats.sus_blocklimit = 0; 1568 } 1569 1570 static void 1571 spa_destroy_aux_threads(spa_t *spa) 1572 { 1573 if (spa->spa_condense_zthr != NULL) { 1574 zthr_destroy(spa->spa_condense_zthr); 1575 spa->spa_condense_zthr = NULL; 1576 } 1577 if (spa->spa_checkpoint_discard_zthr != NULL) { 1578 zthr_destroy(spa->spa_checkpoint_discard_zthr); 1579 spa->spa_checkpoint_discard_zthr = NULL; 1580 } 1581 if (spa->spa_livelist_delete_zthr != NULL) { 1582 zthr_destroy(spa->spa_livelist_delete_zthr); 1583 spa->spa_livelist_delete_zthr = NULL; 1584 } 1585 if (spa->spa_livelist_condense_zthr != NULL) { 1586 zthr_destroy(spa->spa_livelist_condense_zthr); 1587 spa->spa_livelist_condense_zthr = NULL; 1588 } 1589 } 1590 1591 /* 1592 * Opposite of spa_load(). 1593 */ 1594 static void 1595 spa_unload(spa_t *spa) 1596 { 1597 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1598 ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED); 1599 1600 spa_import_progress_remove(spa_guid(spa)); 1601 spa_load_note(spa, "UNLOADING"); 1602 1603 spa_wake_waiters(spa); 1604 1605 /* 1606 * If we have set the spa_final_txg, we have already performed the 1607 * tasks below in spa_export_common(). We should not redo it here since 1608 * we delay the final TXGs beyond what spa_final_txg is set at. 1609 */ 1610 if (spa->spa_final_txg == UINT64_MAX) { 1611 /* 1612 * If the log space map feature is enabled and the pool is 1613 * getting exported (but not destroyed), we want to spend some 1614 * time flushing as many metaslabs as we can in an attempt to 1615 * destroy log space maps and save import time. 1616 */ 1617 if (spa_should_flush_logs_on_unload(spa)) 1618 spa_unload_log_sm_flush_all(spa); 1619 1620 /* 1621 * Stop async tasks. 1622 */ 1623 spa_async_suspend(spa); 1624 1625 if (spa->spa_root_vdev) { 1626 vdev_t *root_vdev = spa->spa_root_vdev; 1627 vdev_initialize_stop_all(root_vdev, 1628 VDEV_INITIALIZE_ACTIVE); 1629 vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE); 1630 vdev_autotrim_stop_all(spa); 1631 vdev_rebuild_stop_all(spa); 1632 } 1633 } 1634 1635 /* 1636 * Stop syncing. 1637 */ 1638 if (spa->spa_sync_on) { 1639 txg_sync_stop(spa->spa_dsl_pool); 1640 spa->spa_sync_on = B_FALSE; 1641 } 1642 1643 /* 1644 * This ensures that there is no async metaslab prefetching 1645 * while we attempt to unload the spa. 1646 */ 1647 if (spa->spa_root_vdev != NULL) { 1648 for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { 1649 vdev_t *vc = spa->spa_root_vdev->vdev_child[c]; 1650 if (vc->vdev_mg != NULL) 1651 taskq_wait(vc->vdev_mg->mg_taskq); 1652 } 1653 } 1654 1655 if (spa->spa_mmp.mmp_thread) 1656 mmp_thread_stop(spa); 1657 1658 /* 1659 * Wait for any outstanding async I/O to complete. 1660 */ 1661 if (spa->spa_async_zio_root != NULL) { 1662 for (int i = 0; i < max_ncpus; i++) 1663 (void) zio_wait(spa->spa_async_zio_root[i]); 1664 kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *)); 1665 spa->spa_async_zio_root = NULL; 1666 } 1667 1668 if (spa->spa_vdev_removal != NULL) { 1669 spa_vdev_removal_destroy(spa->spa_vdev_removal); 1670 spa->spa_vdev_removal = NULL; 1671 } 1672 1673 spa_destroy_aux_threads(spa); 1674 1675 spa_condense_fini(spa); 1676 1677 bpobj_close(&spa->spa_deferred_bpobj); 1678 1679 spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); 1680 1681 /* 1682 * Close all vdevs. 1683 */ 1684 if (spa->spa_root_vdev) 1685 vdev_free(spa->spa_root_vdev); 1686 ASSERT(spa->spa_root_vdev == NULL); 1687 1688 /* 1689 * Close the dsl pool. 1690 */ 1691 if (spa->spa_dsl_pool) { 1692 dsl_pool_close(spa->spa_dsl_pool); 1693 spa->spa_dsl_pool = NULL; 1694 spa->spa_meta_objset = NULL; 1695 } 1696 1697 ddt_unload(spa); 1698 spa_unload_log_sm_metadata(spa); 1699 1700 /* 1701 * Drop and purge level 2 cache 1702 */ 1703 spa_l2cache_drop(spa); 1704 1705 for (int i = 0; i < spa->spa_spares.sav_count; i++) 1706 vdev_free(spa->spa_spares.sav_vdevs[i]); 1707 if (spa->spa_spares.sav_vdevs) { 1708 kmem_free(spa->spa_spares.sav_vdevs, 1709 spa->spa_spares.sav_count * sizeof (void *)); 1710 spa->spa_spares.sav_vdevs = NULL; 1711 } 1712 if (spa->spa_spares.sav_config) { 1713 nvlist_free(spa->spa_spares.sav_config); 1714 spa->spa_spares.sav_config = NULL; 1715 } 1716 spa->spa_spares.sav_count = 0; 1717 1718 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { 1719 vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); 1720 vdev_free(spa->spa_l2cache.sav_vdevs[i]); 1721 } 1722 if (spa->spa_l2cache.sav_vdevs) { 1723 kmem_free(spa->spa_l2cache.sav_vdevs, 1724 spa->spa_l2cache.sav_count * sizeof (void *)); 1725 spa->spa_l2cache.sav_vdevs = NULL; 1726 } 1727 if (spa->spa_l2cache.sav_config) { 1728 nvlist_free(spa->spa_l2cache.sav_config); 1729 spa->spa_l2cache.sav_config = NULL; 1730 } 1731 spa->spa_l2cache.sav_count = 0; 1732 1733 spa->spa_async_suspended = 0; 1734 1735 spa->spa_indirect_vdevs_loaded = B_FALSE; 1736 1737 if (spa->spa_comment != NULL) { 1738 spa_strfree(spa->spa_comment); 1739 spa->spa_comment = NULL; 1740 } 1741 if (spa->spa_compatibility != NULL) { 1742 spa_strfree(spa->spa_compatibility); 1743 spa->spa_compatibility = NULL; 1744 } 1745 1746 spa_config_exit(spa, SCL_ALL, spa); 1747 } 1748 1749 /* 1750 * Load (or re-load) the current list of vdevs describing the active spares for 1751 * this pool. When this is called, we have some form of basic information in 1752 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and 1753 * then re-generate a more complete list including status information. 1754 */ 1755 void 1756 spa_load_spares(spa_t *spa) 1757 { 1758 nvlist_t **spares; 1759 uint_t nspares; 1760 int i; 1761 vdev_t *vd, *tvd; 1762 1763 #ifndef _KERNEL 1764 /* 1765 * zdb opens both the current state of the pool and the 1766 * checkpointed state (if present), with a different spa_t. 1767 * 1768 * As spare vdevs are shared among open pools, we skip loading 1769 * them when we load the checkpointed state of the pool. 1770 */ 1771 if (!spa_writeable(spa)) 1772 return; 1773 #endif 1774 1775 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1776 1777 /* 1778 * First, close and free any existing spare vdevs. 1779 */ 1780 for (i = 0; i < spa->spa_spares.sav_count; i++) { 1781 vd = spa->spa_spares.sav_vdevs[i]; 1782 1783 /* Undo the call to spa_activate() below */ 1784 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1785 B_FALSE)) != NULL && tvd->vdev_isspare) 1786 spa_spare_remove(tvd); 1787 vdev_close(vd); 1788 vdev_free(vd); 1789 } 1790 1791 if (spa->spa_spares.sav_vdevs) 1792 kmem_free(spa->spa_spares.sav_vdevs, 1793 spa->spa_spares.sav_count * sizeof (void *)); 1794 1795 if (spa->spa_spares.sav_config == NULL) 1796 nspares = 0; 1797 else 1798 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 1799 ZPOOL_CONFIG_SPARES, &spares, &nspares)); 1800 1801 spa->spa_spares.sav_count = (int)nspares; 1802 spa->spa_spares.sav_vdevs = NULL; 1803 1804 if (nspares == 0) 1805 return; 1806 1807 /* 1808 * Construct the array of vdevs, opening them to get status in the 1809 * process. For each spare, there is potentially two different vdev_t 1810 * structures associated with it: one in the list of spares (used only 1811 * for basic validation purposes) and one in the active vdev 1812 * configuration (if it's spared in). During this phase we open and 1813 * validate each vdev on the spare list. If the vdev also exists in the 1814 * active configuration, then we also mark this vdev as an active spare. 1815 */ 1816 spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *), 1817 KM_SLEEP); 1818 for (i = 0; i < spa->spa_spares.sav_count; i++) { 1819 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 1820 VDEV_ALLOC_SPARE) == 0); 1821 ASSERT(vd != NULL); 1822 1823 spa->spa_spares.sav_vdevs[i] = vd; 1824 1825 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1826 B_FALSE)) != NULL) { 1827 if (!tvd->vdev_isspare) 1828 spa_spare_add(tvd); 1829 1830 /* 1831 * We only mark the spare active if we were successfully 1832 * able to load the vdev. Otherwise, importing a pool 1833 * with a bad active spare would result in strange 1834 * behavior, because multiple pool would think the spare 1835 * is actively in use. 1836 * 1837 * There is a vulnerability here to an equally bizarre 1838 * circumstance, where a dead active spare is later 1839 * brought back to life (onlined or otherwise). Given 1840 * the rarity of this scenario, and the extra complexity 1841 * it adds, we ignore the possibility. 1842 */ 1843 if (!vdev_is_dead(tvd)) 1844 spa_spare_activate(tvd); 1845 } 1846 1847 vd->vdev_top = vd; 1848 vd->vdev_aux = &spa->spa_spares; 1849 1850 if (vdev_open(vd) != 0) 1851 continue; 1852 1853 if (vdev_validate_aux(vd) == 0) 1854 spa_spare_add(vd); 1855 } 1856 1857 /* 1858 * Recompute the stashed list of spares, with status information 1859 * this time. 1860 */ 1861 fnvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES); 1862 1863 spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), 1864 KM_SLEEP); 1865 for (i = 0; i < spa->spa_spares.sav_count; i++) 1866 spares[i] = vdev_config_generate(spa, 1867 spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); 1868 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 1869 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, 1870 spa->spa_spares.sav_count); 1871 for (i = 0; i < spa->spa_spares.sav_count; i++) 1872 nvlist_free(spares[i]); 1873 kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); 1874 } 1875 1876 /* 1877 * Load (or re-load) the current list of vdevs describing the active l2cache for 1878 * this pool. When this is called, we have some form of basic information in 1879 * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and 1880 * then re-generate a more complete list including status information. 1881 * Devices which are already active have their details maintained, and are 1882 * not re-opened. 1883 */ 1884 void 1885 spa_load_l2cache(spa_t *spa) 1886 { 1887 nvlist_t **l2cache = NULL; 1888 uint_t nl2cache; 1889 int i, j, oldnvdevs; 1890 uint64_t guid; 1891 vdev_t *vd, **oldvdevs, **newvdevs; 1892 spa_aux_vdev_t *sav = &spa->spa_l2cache; 1893 1894 #ifndef _KERNEL 1895 /* 1896 * zdb opens both the current state of the pool and the 1897 * checkpointed state (if present), with a different spa_t. 1898 * 1899 * As L2 caches are part of the ARC which is shared among open 1900 * pools, we skip loading them when we load the checkpointed 1901 * state of the pool. 1902 */ 1903 if (!spa_writeable(spa)) 1904 return; 1905 #endif 1906 1907 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1908 1909 oldvdevs = sav->sav_vdevs; 1910 oldnvdevs = sav->sav_count; 1911 sav->sav_vdevs = NULL; 1912 sav->sav_count = 0; 1913 1914 if (sav->sav_config == NULL) { 1915 nl2cache = 0; 1916 newvdevs = NULL; 1917 goto out; 1918 } 1919 1920 VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, 1921 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); 1922 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 1923 1924 /* 1925 * Process new nvlist of vdevs. 1926 */ 1927 for (i = 0; i < nl2cache; i++) { 1928 guid = fnvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID); 1929 1930 newvdevs[i] = NULL; 1931 for (j = 0; j < oldnvdevs; j++) { 1932 vd = oldvdevs[j]; 1933 if (vd != NULL && guid == vd->vdev_guid) { 1934 /* 1935 * Retain previous vdev for add/remove ops. 1936 */ 1937 newvdevs[i] = vd; 1938 oldvdevs[j] = NULL; 1939 break; 1940 } 1941 } 1942 1943 if (newvdevs[i] == NULL) { 1944 /* 1945 * Create new vdev 1946 */ 1947 VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 1948 VDEV_ALLOC_L2CACHE) == 0); 1949 ASSERT(vd != NULL); 1950 newvdevs[i] = vd; 1951 1952 /* 1953 * Commit this vdev as an l2cache device, 1954 * even if it fails to open. 1955 */ 1956 spa_l2cache_add(vd); 1957 1958 vd->vdev_top = vd; 1959 vd->vdev_aux = sav; 1960 1961 spa_l2cache_activate(vd); 1962 1963 if (vdev_open(vd) != 0) 1964 continue; 1965 1966 (void) vdev_validate_aux(vd); 1967 1968 if (!vdev_is_dead(vd)) 1969 l2arc_add_vdev(spa, vd); 1970 1971 /* 1972 * Upon cache device addition to a pool or pool 1973 * creation with a cache device or if the header 1974 * of the device is invalid we issue an async 1975 * TRIM command for the whole device which will 1976 * execute if l2arc_trim_ahead > 0. 1977 */ 1978 spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); 1979 } 1980 } 1981 1982 sav->sav_vdevs = newvdevs; 1983 sav->sav_count = (int)nl2cache; 1984 1985 /* 1986 * Recompute the stashed list of l2cache devices, with status 1987 * information this time. 1988 */ 1989 fnvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE); 1990 1991 if (sav->sav_count > 0) 1992 l2cache = kmem_alloc(sav->sav_count * sizeof (void *), 1993 KM_SLEEP); 1994 for (i = 0; i < sav->sav_count; i++) 1995 l2cache[i] = vdev_config_generate(spa, 1996 sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); 1997 fnvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 1998 (const nvlist_t * const *)l2cache, sav->sav_count); 1999 2000 out: 2001 /* 2002 * Purge vdevs that were dropped 2003 */ 2004 for (i = 0; i < oldnvdevs; i++) { 2005 uint64_t pool; 2006 2007 vd = oldvdevs[i]; 2008 if (vd != NULL) { 2009 ASSERT(vd->vdev_isl2cache); 2010 2011 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 2012 pool != 0ULL && l2arc_vdev_present(vd)) 2013 l2arc_remove_vdev(vd); 2014 vdev_clear_stats(vd); 2015 vdev_free(vd); 2016 } 2017 } 2018 2019 if (oldvdevs) 2020 kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 2021 2022 for (i = 0; i < sav->sav_count; i++) 2023 nvlist_free(l2cache[i]); 2024 if (sav->sav_count) 2025 kmem_free(l2cache, sav->sav_count * sizeof (void *)); 2026 } 2027 2028 static int 2029 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 2030 { 2031 dmu_buf_t *db; 2032 char *packed = NULL; 2033 size_t nvsize = 0; 2034 int error; 2035 *value = NULL; 2036 2037 error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db); 2038 if (error) 2039 return (error); 2040 2041 nvsize = *(uint64_t *)db->db_data; 2042 dmu_buf_rele(db, FTAG); 2043 2044 packed = vmem_alloc(nvsize, KM_SLEEP); 2045 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, 2046 DMU_READ_PREFETCH); 2047 if (error == 0) 2048 error = nvlist_unpack(packed, nvsize, value, 0); 2049 vmem_free(packed, nvsize); 2050 2051 return (error); 2052 } 2053 2054 /* 2055 * Concrete top-level vdevs that are not missing and are not logs. At every 2056 * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds. 2057 */ 2058 static uint64_t 2059 spa_healthy_core_tvds(spa_t *spa) 2060 { 2061 vdev_t *rvd = spa->spa_root_vdev; 2062 uint64_t tvds = 0; 2063 2064 for (uint64_t i = 0; i < rvd->vdev_children; i++) { 2065 vdev_t *vd = rvd->vdev_child[i]; 2066 if (vd->vdev_islog) 2067 continue; 2068 if (vdev_is_concrete(vd) && !vdev_is_dead(vd)) 2069 tvds++; 2070 } 2071 2072 return (tvds); 2073 } 2074 2075 /* 2076 * Checks to see if the given vdev could not be opened, in which case we post a 2077 * sysevent to notify the autoreplace code that the device has been removed. 2078 */ 2079 static void 2080 spa_check_removed(vdev_t *vd) 2081 { 2082 for (uint64_t c = 0; c < vd->vdev_children; c++) 2083 spa_check_removed(vd->vdev_child[c]); 2084 2085 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && 2086 vdev_is_concrete(vd)) { 2087 zfs_post_autoreplace(vd->vdev_spa, vd); 2088 spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK); 2089 } 2090 } 2091 2092 static int 2093 spa_check_for_missing_logs(spa_t *spa) 2094 { 2095 vdev_t *rvd = spa->spa_root_vdev; 2096 2097 /* 2098 * If we're doing a normal import, then build up any additional 2099 * diagnostic information about missing log devices. 2100 * We'll pass this up to the user for further processing. 2101 */ 2102 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { 2103 nvlist_t **child, *nv; 2104 uint64_t idx = 0; 2105 2106 child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t *), 2107 KM_SLEEP); 2108 nv = fnvlist_alloc(); 2109 2110 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 2111 vdev_t *tvd = rvd->vdev_child[c]; 2112 2113 /* 2114 * We consider a device as missing only if it failed 2115 * to open (i.e. offline or faulted is not considered 2116 * as missing). 2117 */ 2118 if (tvd->vdev_islog && 2119 tvd->vdev_state == VDEV_STATE_CANT_OPEN) { 2120 child[idx++] = vdev_config_generate(spa, tvd, 2121 B_FALSE, VDEV_CONFIG_MISSING); 2122 } 2123 } 2124 2125 if (idx > 0) { 2126 fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 2127 (const nvlist_t * const *)child, idx); 2128 fnvlist_add_nvlist(spa->spa_load_info, 2129 ZPOOL_CONFIG_MISSING_DEVICES, nv); 2130 2131 for (uint64_t i = 0; i < idx; i++) 2132 nvlist_free(child[i]); 2133 } 2134 nvlist_free(nv); 2135 kmem_free(child, rvd->vdev_children * sizeof (char **)); 2136 2137 if (idx > 0) { 2138 spa_load_failed(spa, "some log devices are missing"); 2139 vdev_dbgmsg_print_tree(rvd, 2); 2140 return (SET_ERROR(ENXIO)); 2141 } 2142 } else { 2143 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 2144 vdev_t *tvd = rvd->vdev_child[c]; 2145 2146 if (tvd->vdev_islog && 2147 tvd->vdev_state == VDEV_STATE_CANT_OPEN) { 2148 spa_set_log_state(spa, SPA_LOG_CLEAR); 2149 spa_load_note(spa, "some log devices are " 2150 "missing, ZIL is dropped."); 2151 vdev_dbgmsg_print_tree(rvd, 2); 2152 break; 2153 } 2154 } 2155 } 2156 2157 return (0); 2158 } 2159 2160 /* 2161 * Check for missing log devices 2162 */ 2163 static boolean_t 2164 spa_check_logs(spa_t *spa) 2165 { 2166 boolean_t rv = B_FALSE; 2167 dsl_pool_t *dp = spa_get_dsl(spa); 2168 2169 switch (spa->spa_log_state) { 2170 default: 2171 break; 2172 case SPA_LOG_MISSING: 2173 /* need to recheck in case slog has been restored */ 2174 case SPA_LOG_UNKNOWN: 2175 rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 2176 zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0); 2177 if (rv) 2178 spa_set_log_state(spa, SPA_LOG_MISSING); 2179 break; 2180 } 2181 return (rv); 2182 } 2183 2184 /* 2185 * Passivate any log vdevs (note, does not apply to embedded log metaslabs). 2186 */ 2187 static boolean_t 2188 spa_passivate_log(spa_t *spa) 2189 { 2190 vdev_t *rvd = spa->spa_root_vdev; 2191 boolean_t slog_found = B_FALSE; 2192 2193 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 2194 2195 for (int c = 0; c < rvd->vdev_children; c++) { 2196 vdev_t *tvd = rvd->vdev_child[c]; 2197 2198 if (tvd->vdev_islog) { 2199 ASSERT3P(tvd->vdev_log_mg, ==, NULL); 2200 metaslab_group_passivate(tvd->vdev_mg); 2201 slog_found = B_TRUE; 2202 } 2203 } 2204 2205 return (slog_found); 2206 } 2207 2208 /* 2209 * Activate any log vdevs (note, does not apply to embedded log metaslabs). 2210 */ 2211 static void 2212 spa_activate_log(spa_t *spa) 2213 { 2214 vdev_t *rvd = spa->spa_root_vdev; 2215 2216 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 2217 2218 for (int c = 0; c < rvd->vdev_children; c++) { 2219 vdev_t *tvd = rvd->vdev_child[c]; 2220 2221 if (tvd->vdev_islog) { 2222 ASSERT3P(tvd->vdev_log_mg, ==, NULL); 2223 metaslab_group_activate(tvd->vdev_mg); 2224 } 2225 } 2226 } 2227 2228 int 2229 spa_reset_logs(spa_t *spa) 2230 { 2231 int error; 2232 2233 error = dmu_objset_find(spa_name(spa), zil_reset, 2234 NULL, DS_FIND_CHILDREN); 2235 if (error == 0) { 2236 /* 2237 * We successfully offlined the log device, sync out the 2238 * current txg so that the "stubby" block can be removed 2239 * by zil_sync(). 2240 */ 2241 txg_wait_synced(spa->spa_dsl_pool, 0); 2242 } 2243 return (error); 2244 } 2245 2246 static void 2247 spa_aux_check_removed(spa_aux_vdev_t *sav) 2248 { 2249 for (int i = 0; i < sav->sav_count; i++) 2250 spa_check_removed(sav->sav_vdevs[i]); 2251 } 2252 2253 void 2254 spa_claim_notify(zio_t *zio) 2255 { 2256 spa_t *spa = zio->io_spa; 2257 2258 if (zio->io_error) 2259 return; 2260 2261 mutex_enter(&spa->spa_props_lock); /* any mutex will do */ 2262 if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) 2263 spa->spa_claim_max_txg = zio->io_bp->blk_birth; 2264 mutex_exit(&spa->spa_props_lock); 2265 } 2266 2267 typedef struct spa_load_error { 2268 boolean_t sle_verify_data; 2269 uint64_t sle_meta_count; 2270 uint64_t sle_data_count; 2271 } spa_load_error_t; 2272 2273 static void 2274 spa_load_verify_done(zio_t *zio) 2275 { 2276 blkptr_t *bp = zio->io_bp; 2277 spa_load_error_t *sle = zio->io_private; 2278 dmu_object_type_t type = BP_GET_TYPE(bp); 2279 int error = zio->io_error; 2280 spa_t *spa = zio->io_spa; 2281 2282 abd_free(zio->io_abd); 2283 if (error) { 2284 if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && 2285 type != DMU_OT_INTENT_LOG) 2286 atomic_inc_64(&sle->sle_meta_count); 2287 else 2288 atomic_inc_64(&sle->sle_data_count); 2289 } 2290 2291 mutex_enter(&spa->spa_scrub_lock); 2292 spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp); 2293 cv_broadcast(&spa->spa_scrub_io_cv); 2294 mutex_exit(&spa->spa_scrub_lock); 2295 } 2296 2297 /* 2298 * Maximum number of inflight bytes is the log2 fraction of the arc size. 2299 * By default, we set it to 1/16th of the arc. 2300 */ 2301 static int spa_load_verify_shift = 4; 2302 static int spa_load_verify_metadata = B_TRUE; 2303 static int spa_load_verify_data = B_TRUE; 2304 2305 static int 2306 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 2307 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 2308 { 2309 zio_t *rio = arg; 2310 spa_load_error_t *sle = rio->io_private; 2311 2312 (void) zilog, (void) dnp; 2313 2314 /* 2315 * Note: normally this routine will not be called if 2316 * spa_load_verify_metadata is not set. However, it may be useful 2317 * to manually set the flag after the traversal has begun. 2318 */ 2319 if (!spa_load_verify_metadata) 2320 return (0); 2321 2322 /* 2323 * Sanity check the block pointer in order to detect obvious damage 2324 * before using the contents in subsequent checks or in zio_read(). 2325 * When damaged consider it to be a metadata error since we cannot 2326 * trust the BP_GET_TYPE and BP_GET_LEVEL values. 2327 */ 2328 if (!zfs_blkptr_verify(spa, bp, B_FALSE, BLK_VERIFY_LOG)) { 2329 atomic_inc_64(&sle->sle_meta_count); 2330 return (0); 2331 } 2332 2333 if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || 2334 BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp)) 2335 return (0); 2336 2337 if (!BP_IS_METADATA(bp) && 2338 (!spa_load_verify_data || !sle->sle_verify_data)) 2339 return (0); 2340 2341 uint64_t maxinflight_bytes = 2342 arc_target_bytes() >> spa_load_verify_shift; 2343 size_t size = BP_GET_PSIZE(bp); 2344 2345 mutex_enter(&spa->spa_scrub_lock); 2346 while (spa->spa_load_verify_bytes >= maxinflight_bytes) 2347 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2348 spa->spa_load_verify_bytes += size; 2349 mutex_exit(&spa->spa_scrub_lock); 2350 2351 zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size, 2352 spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, 2353 ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | 2354 ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); 2355 return (0); 2356 } 2357 2358 static int 2359 verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) 2360 { 2361 (void) dp, (void) arg; 2362 2363 if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN) 2364 return (SET_ERROR(ENAMETOOLONG)); 2365 2366 return (0); 2367 } 2368 2369 static int 2370 spa_load_verify(spa_t *spa) 2371 { 2372 zio_t *rio; 2373 spa_load_error_t sle = { 0 }; 2374 zpool_load_policy_t policy; 2375 boolean_t verify_ok = B_FALSE; 2376 int error = 0; 2377 2378 zpool_get_load_policy(spa->spa_config, &policy); 2379 2380 if (policy.zlp_rewind & ZPOOL_NEVER_REWIND || 2381 policy.zlp_maxmeta == UINT64_MAX) 2382 return (0); 2383 2384 dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); 2385 error = dmu_objset_find_dp(spa->spa_dsl_pool, 2386 spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL, 2387 DS_FIND_CHILDREN); 2388 dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); 2389 if (error != 0) 2390 return (error); 2391 2392 /* 2393 * Verify data only if we are rewinding or error limit was set. 2394 * Otherwise nothing except dbgmsg care about it to waste time. 2395 */ 2396 sle.sle_verify_data = (policy.zlp_rewind & ZPOOL_REWIND_MASK) || 2397 (policy.zlp_maxdata < UINT64_MAX); 2398 2399 rio = zio_root(spa, NULL, &sle, 2400 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 2401 2402 if (spa_load_verify_metadata) { 2403 if (spa->spa_extreme_rewind) { 2404 spa_load_note(spa, "performing a complete scan of the " 2405 "pool since extreme rewind is on. This may take " 2406 "a very long time.\n (spa_load_verify_data=%u, " 2407 "spa_load_verify_metadata=%u)", 2408 spa_load_verify_data, spa_load_verify_metadata); 2409 } 2410 2411 error = traverse_pool(spa, spa->spa_verify_min_txg, 2412 TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | 2413 TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio); 2414 } 2415 2416 (void) zio_wait(rio); 2417 ASSERT0(spa->spa_load_verify_bytes); 2418 2419 spa->spa_load_meta_errors = sle.sle_meta_count; 2420 spa->spa_load_data_errors = sle.sle_data_count; 2421 2422 if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) { 2423 spa_load_note(spa, "spa_load_verify found %llu metadata errors " 2424 "and %llu data errors", (u_longlong_t)sle.sle_meta_count, 2425 (u_longlong_t)sle.sle_data_count); 2426 } 2427 2428 if (spa_load_verify_dryrun || 2429 (!error && sle.sle_meta_count <= policy.zlp_maxmeta && 2430 sle.sle_data_count <= policy.zlp_maxdata)) { 2431 int64_t loss = 0; 2432 2433 verify_ok = B_TRUE; 2434 spa->spa_load_txg = spa->spa_uberblock.ub_txg; 2435 spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; 2436 2437 loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; 2438 fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_TIME, 2439 spa->spa_load_txg_ts); 2440 fnvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME, 2441 loss); 2442 fnvlist_add_uint64(spa->spa_load_info, 2443 ZPOOL_CONFIG_LOAD_META_ERRORS, sle.sle_meta_count); 2444 fnvlist_add_uint64(spa->spa_load_info, 2445 ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count); 2446 } else { 2447 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; 2448 } 2449 2450 if (spa_load_verify_dryrun) 2451 return (0); 2452 2453 if (error) { 2454 if (error != ENXIO && error != EIO) 2455 error = SET_ERROR(EIO); 2456 return (error); 2457 } 2458 2459 return (verify_ok ? 0 : EIO); 2460 } 2461 2462 /* 2463 * Find a value in the pool props object. 2464 */ 2465 static void 2466 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 2467 { 2468 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 2469 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 2470 } 2471 2472 /* 2473 * Find a value in the pool directory object. 2474 */ 2475 static int 2476 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent) 2477 { 2478 int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2479 name, sizeof (uint64_t), 1, val); 2480 2481 if (error != 0 && (error != ENOENT || log_enoent)) { 2482 spa_load_failed(spa, "couldn't get '%s' value in MOS directory " 2483 "[error=%d]", name, error); 2484 } 2485 2486 return (error); 2487 } 2488 2489 static int 2490 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 2491 { 2492 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 2493 return (SET_ERROR(err)); 2494 } 2495 2496 boolean_t 2497 spa_livelist_delete_check(spa_t *spa) 2498 { 2499 return (spa->spa_livelists_to_delete != 0); 2500 } 2501 2502 static boolean_t 2503 spa_livelist_delete_cb_check(void *arg, zthr_t *z) 2504 { 2505 (void) z; 2506 spa_t *spa = arg; 2507 return (spa_livelist_delete_check(spa)); 2508 } 2509 2510 static int 2511 delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 2512 { 2513 spa_t *spa = arg; 2514 zio_free(spa, tx->tx_txg, bp); 2515 dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, 2516 -bp_get_dsize_sync(spa, bp), 2517 -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); 2518 return (0); 2519 } 2520 2521 static int 2522 dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp) 2523 { 2524 int err; 2525 zap_cursor_t zc; 2526 zap_attribute_t za; 2527 zap_cursor_init(&zc, os, zap_obj); 2528 err = zap_cursor_retrieve(&zc, &za); 2529 zap_cursor_fini(&zc); 2530 if (err == 0) 2531 *llp = za.za_first_integer; 2532 return (err); 2533 } 2534 2535 /* 2536 * Components of livelist deletion that must be performed in syncing 2537 * context: freeing block pointers and updating the pool-wide data 2538 * structures to indicate how much work is left to do 2539 */ 2540 typedef struct sublist_delete_arg { 2541 spa_t *spa; 2542 dsl_deadlist_t *ll; 2543 uint64_t key; 2544 bplist_t *to_free; 2545 } sublist_delete_arg_t; 2546 2547 static void 2548 sublist_delete_sync(void *arg, dmu_tx_t *tx) 2549 { 2550 sublist_delete_arg_t *sda = arg; 2551 spa_t *spa = sda->spa; 2552 dsl_deadlist_t *ll = sda->ll; 2553 uint64_t key = sda->key; 2554 bplist_t *to_free = sda->to_free; 2555 2556 bplist_iterate(to_free, delete_blkptr_cb, spa, tx); 2557 dsl_deadlist_remove_entry(ll, key, tx); 2558 } 2559 2560 typedef struct livelist_delete_arg { 2561 spa_t *spa; 2562 uint64_t ll_obj; 2563 uint64_t zap_obj; 2564 } livelist_delete_arg_t; 2565 2566 static void 2567 livelist_delete_sync(void *arg, dmu_tx_t *tx) 2568 { 2569 livelist_delete_arg_t *lda = arg; 2570 spa_t *spa = lda->spa; 2571 uint64_t ll_obj = lda->ll_obj; 2572 uint64_t zap_obj = lda->zap_obj; 2573 objset_t *mos = spa->spa_meta_objset; 2574 uint64_t count; 2575 2576 /* free the livelist and decrement the feature count */ 2577 VERIFY0(zap_remove_int(mos, zap_obj, ll_obj, tx)); 2578 dsl_deadlist_free(mos, ll_obj, tx); 2579 spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx); 2580 VERIFY0(zap_count(mos, zap_obj, &count)); 2581 if (count == 0) { 2582 /* no more livelists to delete */ 2583 VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, 2584 DMU_POOL_DELETED_CLONES, tx)); 2585 VERIFY0(zap_destroy(mos, zap_obj, tx)); 2586 spa->spa_livelists_to_delete = 0; 2587 spa_notify_waiters(spa); 2588 } 2589 } 2590 2591 /* 2592 * Load in the value for the livelist to be removed and open it. Then, 2593 * load its first sublist and determine which block pointers should actually 2594 * be freed. Then, call a synctask which performs the actual frees and updates 2595 * the pool-wide livelist data. 2596 */ 2597 static void 2598 spa_livelist_delete_cb(void *arg, zthr_t *z) 2599 { 2600 spa_t *spa = arg; 2601 uint64_t ll_obj = 0, count; 2602 objset_t *mos = spa->spa_meta_objset; 2603 uint64_t zap_obj = spa->spa_livelists_to_delete; 2604 /* 2605 * Determine the next livelist to delete. This function should only 2606 * be called if there is at least one deleted clone. 2607 */ 2608 VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj)); 2609 VERIFY0(zap_count(mos, ll_obj, &count)); 2610 if (count > 0) { 2611 dsl_deadlist_t *ll; 2612 dsl_deadlist_entry_t *dle; 2613 bplist_t to_free; 2614 ll = kmem_zalloc(sizeof (dsl_deadlist_t), KM_SLEEP); 2615 dsl_deadlist_open(ll, mos, ll_obj); 2616 dle = dsl_deadlist_first(ll); 2617 ASSERT3P(dle, !=, NULL); 2618 bplist_create(&to_free); 2619 int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free, 2620 z, NULL); 2621 if (err == 0) { 2622 sublist_delete_arg_t sync_arg = { 2623 .spa = spa, 2624 .ll = ll, 2625 .key = dle->dle_mintxg, 2626 .to_free = &to_free 2627 }; 2628 zfs_dbgmsg("deleting sublist (id %llu) from" 2629 " livelist %llu, %lld remaining", 2630 (u_longlong_t)dle->dle_bpobj.bpo_object, 2631 (u_longlong_t)ll_obj, (longlong_t)count - 1); 2632 VERIFY0(dsl_sync_task(spa_name(spa), NULL, 2633 sublist_delete_sync, &sync_arg, 0, 2634 ZFS_SPACE_CHECK_DESTROY)); 2635 } else { 2636 VERIFY3U(err, ==, EINTR); 2637 } 2638 bplist_clear(&to_free); 2639 bplist_destroy(&to_free); 2640 dsl_deadlist_close(ll); 2641 kmem_free(ll, sizeof (dsl_deadlist_t)); 2642 } else { 2643 livelist_delete_arg_t sync_arg = { 2644 .spa = spa, 2645 .ll_obj = ll_obj, 2646 .zap_obj = zap_obj 2647 }; 2648 zfs_dbgmsg("deletion of livelist %llu completed", 2649 (u_longlong_t)ll_obj); 2650 VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync, 2651 &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY)); 2652 } 2653 } 2654 2655 static void 2656 spa_start_livelist_destroy_thread(spa_t *spa) 2657 { 2658 ASSERT3P(spa->spa_livelist_delete_zthr, ==, NULL); 2659 spa->spa_livelist_delete_zthr = 2660 zthr_create("z_livelist_destroy", 2661 spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa, 2662 minclsyspri); 2663 } 2664 2665 typedef struct livelist_new_arg { 2666 bplist_t *allocs; 2667 bplist_t *frees; 2668 } livelist_new_arg_t; 2669 2670 static int 2671 livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 2672 dmu_tx_t *tx) 2673 { 2674 ASSERT(tx == NULL); 2675 livelist_new_arg_t *lna = arg; 2676 if (bp_freed) { 2677 bplist_append(lna->frees, bp); 2678 } else { 2679 bplist_append(lna->allocs, bp); 2680 zfs_livelist_condense_new_alloc++; 2681 } 2682 return (0); 2683 } 2684 2685 typedef struct livelist_condense_arg { 2686 spa_t *spa; 2687 bplist_t to_keep; 2688 uint64_t first_size; 2689 uint64_t next_size; 2690 } livelist_condense_arg_t; 2691 2692 static void 2693 spa_livelist_condense_sync(void *arg, dmu_tx_t *tx) 2694 { 2695 livelist_condense_arg_t *lca = arg; 2696 spa_t *spa = lca->spa; 2697 bplist_t new_frees; 2698 dsl_dataset_t *ds = spa->spa_to_condense.ds; 2699 2700 /* Have we been cancelled? */ 2701 if (spa->spa_to_condense.cancelled) { 2702 zfs_livelist_condense_sync_cancel++; 2703 goto out; 2704 } 2705 2706 dsl_deadlist_entry_t *first = spa->spa_to_condense.first; 2707 dsl_deadlist_entry_t *next = spa->spa_to_condense.next; 2708 dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist; 2709 2710 /* 2711 * It's possible that the livelist was changed while the zthr was 2712 * running. Therefore, we need to check for new blkptrs in the two 2713 * entries being condensed and continue to track them in the livelist. 2714 * Because of the way we handle remapped blkptrs (see dbuf_remap_impl), 2715 * it's possible that the newly added blkptrs are FREEs or ALLOCs so 2716 * we need to sort them into two different bplists. 2717 */ 2718 uint64_t first_obj = first->dle_bpobj.bpo_object; 2719 uint64_t next_obj = next->dle_bpobj.bpo_object; 2720 uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs; 2721 uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs; 2722 2723 bplist_create(&new_frees); 2724 livelist_new_arg_t new_bps = { 2725 .allocs = &lca->to_keep, 2726 .frees = &new_frees, 2727 }; 2728 2729 if (cur_first_size > lca->first_size) { 2730 VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj, 2731 livelist_track_new_cb, &new_bps, lca->first_size)); 2732 } 2733 if (cur_next_size > lca->next_size) { 2734 VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj, 2735 livelist_track_new_cb, &new_bps, lca->next_size)); 2736 } 2737 2738 dsl_deadlist_clear_entry(first, ll, tx); 2739 ASSERT(bpobj_is_empty(&first->dle_bpobj)); 2740 dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx); 2741 2742 bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx); 2743 bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx); 2744 bplist_destroy(&new_frees); 2745 2746 char dsname[ZFS_MAX_DATASET_NAME_LEN]; 2747 dsl_dataset_name(ds, dsname); 2748 zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu " 2749 "(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu " 2750 "(%llu blkptrs)", (u_longlong_t)tx->tx_txg, dsname, 2751 (u_longlong_t)ds->ds_object, (u_longlong_t)first_obj, 2752 (u_longlong_t)cur_first_size, (u_longlong_t)next_obj, 2753 (u_longlong_t)cur_next_size, 2754 (u_longlong_t)first->dle_bpobj.bpo_object, 2755 (u_longlong_t)first->dle_bpobj.bpo_phys->bpo_num_blkptrs); 2756 out: 2757 dmu_buf_rele(ds->ds_dbuf, spa); 2758 spa->spa_to_condense.ds = NULL; 2759 bplist_clear(&lca->to_keep); 2760 bplist_destroy(&lca->to_keep); 2761 kmem_free(lca, sizeof (livelist_condense_arg_t)); 2762 spa->spa_to_condense.syncing = B_FALSE; 2763 } 2764 2765 static void 2766 spa_livelist_condense_cb(void *arg, zthr_t *t) 2767 { 2768 while (zfs_livelist_condense_zthr_pause && 2769 !(zthr_has_waiters(t) || zthr_iscancelled(t))) 2770 delay(1); 2771 2772 spa_t *spa = arg; 2773 dsl_deadlist_entry_t *first = spa->spa_to_condense.first; 2774 dsl_deadlist_entry_t *next = spa->spa_to_condense.next; 2775 uint64_t first_size, next_size; 2776 2777 livelist_condense_arg_t *lca = 2778 kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP); 2779 bplist_create(&lca->to_keep); 2780 2781 /* 2782 * Process the livelists (matching FREEs and ALLOCs) in open context 2783 * so we have minimal work in syncing context to condense. 2784 * 2785 * We save bpobj sizes (first_size and next_size) to use later in 2786 * syncing context to determine if entries were added to these sublists 2787 * while in open context. This is possible because the clone is still 2788 * active and open for normal writes and we want to make sure the new, 2789 * unprocessed blockpointers are inserted into the livelist normally. 2790 * 2791 * Note that dsl_process_sub_livelist() both stores the size number of 2792 * blockpointers and iterates over them while the bpobj's lock held, so 2793 * the sizes returned to us are consistent which what was actually 2794 * processed. 2795 */ 2796 int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t, 2797 &first_size); 2798 if (err == 0) 2799 err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep, 2800 t, &next_size); 2801 2802 if (err == 0) { 2803 while (zfs_livelist_condense_sync_pause && 2804 !(zthr_has_waiters(t) || zthr_iscancelled(t))) 2805 delay(1); 2806 2807 dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 2808 dmu_tx_mark_netfree(tx); 2809 dmu_tx_hold_space(tx, 1); 2810 err = dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE); 2811 if (err == 0) { 2812 /* 2813 * Prevent the condense zthr restarting before 2814 * the synctask completes. 2815 */ 2816 spa->spa_to_condense.syncing = B_TRUE; 2817 lca->spa = spa; 2818 lca->first_size = first_size; 2819 lca->next_size = next_size; 2820 dsl_sync_task_nowait(spa_get_dsl(spa), 2821 spa_livelist_condense_sync, lca, tx); 2822 dmu_tx_commit(tx); 2823 return; 2824 } 2825 } 2826 /* 2827 * Condensing can not continue: either it was externally stopped or 2828 * we were unable to assign to a tx because the pool has run out of 2829 * space. In the second case, we'll just end up trying to condense 2830 * again in a later txg. 2831 */ 2832 ASSERT(err != 0); 2833 bplist_clear(&lca->to_keep); 2834 bplist_destroy(&lca->to_keep); 2835 kmem_free(lca, sizeof (livelist_condense_arg_t)); 2836 dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa); 2837 spa->spa_to_condense.ds = NULL; 2838 if (err == EINTR) 2839 zfs_livelist_condense_zthr_cancel++; 2840 } 2841 2842 /* 2843 * Check that there is something to condense but that a condense is not 2844 * already in progress and that condensing has not been cancelled. 2845 */ 2846 static boolean_t 2847 spa_livelist_condense_cb_check(void *arg, zthr_t *z) 2848 { 2849 (void) z; 2850 spa_t *spa = arg; 2851 if ((spa->spa_to_condense.ds != NULL) && 2852 (spa->spa_to_condense.syncing == B_FALSE) && 2853 (spa->spa_to_condense.cancelled == B_FALSE)) { 2854 return (B_TRUE); 2855 } 2856 return (B_FALSE); 2857 } 2858 2859 static void 2860 spa_start_livelist_condensing_thread(spa_t *spa) 2861 { 2862 spa->spa_to_condense.ds = NULL; 2863 spa->spa_to_condense.first = NULL; 2864 spa->spa_to_condense.next = NULL; 2865 spa->spa_to_condense.syncing = B_FALSE; 2866 spa->spa_to_condense.cancelled = B_FALSE; 2867 2868 ASSERT3P(spa->spa_livelist_condense_zthr, ==, NULL); 2869 spa->spa_livelist_condense_zthr = 2870 zthr_create("z_livelist_condense", 2871 spa_livelist_condense_cb_check, 2872 spa_livelist_condense_cb, spa, minclsyspri); 2873 } 2874 2875 static void 2876 spa_spawn_aux_threads(spa_t *spa) 2877 { 2878 ASSERT(spa_writeable(spa)); 2879 2880 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 2881 2882 spa_start_indirect_condensing_thread(spa); 2883 spa_start_livelist_destroy_thread(spa); 2884 spa_start_livelist_condensing_thread(spa); 2885 2886 ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL); 2887 spa->spa_checkpoint_discard_zthr = 2888 zthr_create("z_checkpoint_discard", 2889 spa_checkpoint_discard_thread_check, 2890 spa_checkpoint_discard_thread, spa, minclsyspri); 2891 } 2892 2893 /* 2894 * Fix up config after a partly-completed split. This is done with the 2895 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 2896 * pool have that entry in their config, but only the splitting one contains 2897 * a list of all the guids of the vdevs that are being split off. 2898 * 2899 * This function determines what to do with that list: either rejoin 2900 * all the disks to the pool, or complete the splitting process. To attempt 2901 * the rejoin, each disk that is offlined is marked online again, and 2902 * we do a reopen() call. If the vdev label for every disk that was 2903 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 2904 * then we call vdev_split() on each disk, and complete the split. 2905 * 2906 * Otherwise we leave the config alone, with all the vdevs in place in 2907 * the original pool. 2908 */ 2909 static void 2910 spa_try_repair(spa_t *spa, nvlist_t *config) 2911 { 2912 uint_t extracted; 2913 uint64_t *glist; 2914 uint_t i, gcount; 2915 nvlist_t *nvl; 2916 vdev_t **vd; 2917 boolean_t attempt_reopen; 2918 2919 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 2920 return; 2921 2922 /* check that the config is complete */ 2923 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 2924 &glist, &gcount) != 0) 2925 return; 2926 2927 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 2928 2929 /* attempt to online all the vdevs & validate */ 2930 attempt_reopen = B_TRUE; 2931 for (i = 0; i < gcount; i++) { 2932 if (glist[i] == 0) /* vdev is hole */ 2933 continue; 2934 2935 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 2936 if (vd[i] == NULL) { 2937 /* 2938 * Don't bother attempting to reopen the disks; 2939 * just do the split. 2940 */ 2941 attempt_reopen = B_FALSE; 2942 } else { 2943 /* attempt to re-online it */ 2944 vd[i]->vdev_offline = B_FALSE; 2945 } 2946 } 2947 2948 if (attempt_reopen) { 2949 vdev_reopen(spa->spa_root_vdev); 2950 2951 /* check each device to see what state it's in */ 2952 for (extracted = 0, i = 0; i < gcount; i++) { 2953 if (vd[i] != NULL && 2954 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 2955 break; 2956 ++extracted; 2957 } 2958 } 2959 2960 /* 2961 * If every disk has been moved to the new pool, or if we never 2962 * even attempted to look at them, then we split them off for 2963 * good. 2964 */ 2965 if (!attempt_reopen || gcount == extracted) { 2966 for (i = 0; i < gcount; i++) 2967 if (vd[i] != NULL) 2968 vdev_split(vd[i]); 2969 vdev_reopen(spa->spa_root_vdev); 2970 } 2971 2972 kmem_free(vd, gcount * sizeof (vdev_t *)); 2973 } 2974 2975 static int 2976 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) 2977 { 2978 const char *ereport = FM_EREPORT_ZFS_POOL; 2979 int error; 2980 2981 spa->spa_load_state = state; 2982 (void) spa_import_progress_set_state(spa_guid(spa), 2983 spa_load_state(spa)); 2984 2985 gethrestime(&spa->spa_loaded_ts); 2986 error = spa_load_impl(spa, type, &ereport); 2987 2988 /* 2989 * Don't count references from objsets that are already closed 2990 * and are making their way through the eviction process. 2991 */ 2992 spa_evicting_os_wait(spa); 2993 spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); 2994 if (error) { 2995 if (error != EEXIST) { 2996 spa->spa_loaded_ts.tv_sec = 0; 2997 spa->spa_loaded_ts.tv_nsec = 0; 2998 } 2999 if (error != EBADF) { 3000 (void) zfs_ereport_post(ereport, spa, 3001 NULL, NULL, NULL, 0); 3002 } 3003 } 3004 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 3005 spa->spa_ena = 0; 3006 3007 (void) spa_import_progress_set_state(spa_guid(spa), 3008 spa_load_state(spa)); 3009 3010 return (error); 3011 } 3012 3013 #ifdef ZFS_DEBUG 3014 /* 3015 * Count the number of per-vdev ZAPs associated with all of the vdevs in the 3016 * vdev tree rooted in the given vd, and ensure that each ZAP is present in the 3017 * spa's per-vdev ZAP list. 3018 */ 3019 static uint64_t 3020 vdev_count_verify_zaps(vdev_t *vd) 3021 { 3022 spa_t *spa = vd->vdev_spa; 3023 uint64_t total = 0; 3024 3025 if (vd->vdev_top_zap != 0) { 3026 total++; 3027 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 3028 spa->spa_all_vdev_zaps, vd->vdev_top_zap)); 3029 } 3030 if (vd->vdev_leaf_zap != 0) { 3031 total++; 3032 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 3033 spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); 3034 } 3035 3036 for (uint64_t i = 0; i < vd->vdev_children; i++) { 3037 total += vdev_count_verify_zaps(vd->vdev_child[i]); 3038 } 3039 3040 return (total); 3041 } 3042 #else 3043 #define vdev_count_verify_zaps(vd) ((void) sizeof (vd), 0) 3044 #endif 3045 3046 /* 3047 * Determine whether the activity check is required. 3048 */ 3049 static boolean_t 3050 spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label, 3051 nvlist_t *config) 3052 { 3053 uint64_t state = 0; 3054 uint64_t hostid = 0; 3055 uint64_t tryconfig_txg = 0; 3056 uint64_t tryconfig_timestamp = 0; 3057 uint16_t tryconfig_mmp_seq = 0; 3058 nvlist_t *nvinfo; 3059 3060 if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { 3061 nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); 3062 (void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG, 3063 &tryconfig_txg); 3064 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 3065 &tryconfig_timestamp); 3066 (void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ, 3067 &tryconfig_mmp_seq); 3068 } 3069 3070 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state); 3071 3072 /* 3073 * Disable the MMP activity check - This is used by zdb which 3074 * is intended to be used on potentially active pools. 3075 */ 3076 if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) 3077 return (B_FALSE); 3078 3079 /* 3080 * Skip the activity check when the MMP feature is disabled. 3081 */ 3082 if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0) 3083 return (B_FALSE); 3084 3085 /* 3086 * If the tryconfig_ values are nonzero, they are the results of an 3087 * earlier tryimport. If they all match the uberblock we just found, 3088 * then the pool has not changed and we return false so we do not test 3089 * a second time. 3090 */ 3091 if (tryconfig_txg && tryconfig_txg == ub->ub_txg && 3092 tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp && 3093 tryconfig_mmp_seq && tryconfig_mmp_seq == 3094 (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) 3095 return (B_FALSE); 3096 3097 /* 3098 * Allow the activity check to be skipped when importing the pool 3099 * on the same host which last imported it. Since the hostid from 3100 * configuration may be stale use the one read from the label. 3101 */ 3102 if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID)) 3103 hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID); 3104 3105 if (hostid == spa_get_hostid(spa)) 3106 return (B_FALSE); 3107 3108 /* 3109 * Skip the activity test when the pool was cleanly exported. 3110 */ 3111 if (state != POOL_STATE_ACTIVE) 3112 return (B_FALSE); 3113 3114 return (B_TRUE); 3115 } 3116 3117 /* 3118 * Nanoseconds the activity check must watch for changes on-disk. 3119 */ 3120 static uint64_t 3121 spa_activity_check_duration(spa_t *spa, uberblock_t *ub) 3122 { 3123 uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1); 3124 uint64_t multihost_interval = MSEC2NSEC( 3125 MMP_INTERVAL_OK(zfs_multihost_interval)); 3126 uint64_t import_delay = MAX(NANOSEC, import_intervals * 3127 multihost_interval); 3128 3129 /* 3130 * Local tunables determine a minimum duration except for the case 3131 * where we know when the remote host will suspend the pool if MMP 3132 * writes do not land. 3133 * 3134 * See Big Theory comment at the top of mmp.c for the reasoning behind 3135 * these cases and times. 3136 */ 3137 3138 ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100); 3139 3140 if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && 3141 MMP_FAIL_INT(ub) > 0) { 3142 3143 /* MMP on remote host will suspend pool after failed writes */ 3144 import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) * 3145 MMP_IMPORT_SAFETY_FACTOR / 100; 3146 3147 zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp " 3148 "mmp_fails=%llu ub_mmp mmp_interval=%llu " 3149 "import_intervals=%llu", (u_longlong_t)import_delay, 3150 (u_longlong_t)MMP_FAIL_INT(ub), 3151 (u_longlong_t)MMP_INTERVAL(ub), 3152 (u_longlong_t)import_intervals); 3153 3154 } else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && 3155 MMP_FAIL_INT(ub) == 0) { 3156 3157 /* MMP on remote host will never suspend pool */ 3158 import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) + 3159 ub->ub_mmp_delay) * import_intervals); 3160 3161 zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp " 3162 "mmp_interval=%llu ub_mmp_delay=%llu " 3163 "import_intervals=%llu", (u_longlong_t)import_delay, 3164 (u_longlong_t)MMP_INTERVAL(ub), 3165 (u_longlong_t)ub->ub_mmp_delay, 3166 (u_longlong_t)import_intervals); 3167 3168 } else if (MMP_VALID(ub)) { 3169 /* 3170 * zfs-0.7 compatibility case 3171 */ 3172 3173 import_delay = MAX(import_delay, (multihost_interval + 3174 ub->ub_mmp_delay) * import_intervals); 3175 3176 zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu " 3177 "import_intervals=%llu leaves=%u", 3178 (u_longlong_t)import_delay, 3179 (u_longlong_t)ub->ub_mmp_delay, 3180 (u_longlong_t)import_intervals, 3181 vdev_count_leaves(spa)); 3182 } else { 3183 /* Using local tunings is the only reasonable option */ 3184 zfs_dbgmsg("pool last imported on non-MMP aware " 3185 "host using import_delay=%llu multihost_interval=%llu " 3186 "import_intervals=%llu", (u_longlong_t)import_delay, 3187 (u_longlong_t)multihost_interval, 3188 (u_longlong_t)import_intervals); 3189 } 3190 3191 return (import_delay); 3192 } 3193 3194 /* 3195 * Perform the import activity check. If the user canceled the import or 3196 * we detected activity then fail. 3197 */ 3198 static int 3199 spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) 3200 { 3201 uint64_t txg = ub->ub_txg; 3202 uint64_t timestamp = ub->ub_timestamp; 3203 uint64_t mmp_config = ub->ub_mmp_config; 3204 uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0; 3205 uint64_t import_delay; 3206 hrtime_t import_expire; 3207 nvlist_t *mmp_label = NULL; 3208 vdev_t *rvd = spa->spa_root_vdev; 3209 kcondvar_t cv; 3210 kmutex_t mtx; 3211 int error = 0; 3212 3213 cv_init(&cv, NULL, CV_DEFAULT, NULL); 3214 mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL); 3215 mutex_enter(&mtx); 3216 3217 /* 3218 * If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed 3219 * during the earlier tryimport. If the txg recorded there is 0 then 3220 * the pool is known to be active on another host. 3221 * 3222 * Otherwise, the pool might be in use on another host. Check for 3223 * changes in the uberblocks on disk if necessary. 3224 */ 3225 if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { 3226 nvlist_t *nvinfo = fnvlist_lookup_nvlist(config, 3227 ZPOOL_CONFIG_LOAD_INFO); 3228 3229 if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) && 3230 fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) { 3231 vdev_uberblock_load(rvd, ub, &mmp_label); 3232 error = SET_ERROR(EREMOTEIO); 3233 goto out; 3234 } 3235 } 3236 3237 import_delay = spa_activity_check_duration(spa, ub); 3238 3239 /* Add a small random factor in case of simultaneous imports (0-25%) */ 3240 import_delay += import_delay * random_in_range(250) / 1000; 3241 3242 import_expire = gethrtime() + import_delay; 3243 3244 while (gethrtime() < import_expire) { 3245 (void) spa_import_progress_set_mmp_check(spa_guid(spa), 3246 NSEC2SEC(import_expire - gethrtime())); 3247 3248 vdev_uberblock_load(rvd, ub, &mmp_label); 3249 3250 if (txg != ub->ub_txg || timestamp != ub->ub_timestamp || 3251 mmp_seq != (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) { 3252 zfs_dbgmsg("multihost activity detected " 3253 "txg %llu ub_txg %llu " 3254 "timestamp %llu ub_timestamp %llu " 3255 "mmp_config %#llx ub_mmp_config %#llx", 3256 (u_longlong_t)txg, (u_longlong_t)ub->ub_txg, 3257 (u_longlong_t)timestamp, 3258 (u_longlong_t)ub->ub_timestamp, 3259 (u_longlong_t)mmp_config, 3260 (u_longlong_t)ub->ub_mmp_config); 3261 3262 error = SET_ERROR(EREMOTEIO); 3263 break; 3264 } 3265 3266 if (mmp_label) { 3267 nvlist_free(mmp_label); 3268 mmp_label = NULL; 3269 } 3270 3271 error = cv_timedwait_sig(&cv, &mtx, ddi_get_lbolt() + hz); 3272 if (error != -1) { 3273 error = SET_ERROR(EINTR); 3274 break; 3275 } 3276 error = 0; 3277 } 3278 3279 out: 3280 mutex_exit(&mtx); 3281 mutex_destroy(&mtx); 3282 cv_destroy(&cv); 3283 3284 /* 3285 * If the pool is determined to be active store the status in the 3286 * spa->spa_load_info nvlist. If the remote hostname or hostid are 3287 * available from configuration read from disk store them as well. 3288 * This allows 'zpool import' to generate a more useful message. 3289 * 3290 * ZPOOL_CONFIG_MMP_STATE - observed pool status (mandatory) 3291 * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool 3292 * ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool 3293 */ 3294 if (error == EREMOTEIO) { 3295 const char *hostname = "<unknown>"; 3296 uint64_t hostid = 0; 3297 3298 if (mmp_label) { 3299 if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) { 3300 hostname = fnvlist_lookup_string(mmp_label, 3301 ZPOOL_CONFIG_HOSTNAME); 3302 fnvlist_add_string(spa->spa_load_info, 3303 ZPOOL_CONFIG_MMP_HOSTNAME, hostname); 3304 } 3305 3306 if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) { 3307 hostid = fnvlist_lookup_uint64(mmp_label, 3308 ZPOOL_CONFIG_HOSTID); 3309 fnvlist_add_uint64(spa->spa_load_info, 3310 ZPOOL_CONFIG_MMP_HOSTID, hostid); 3311 } 3312 } 3313 3314 fnvlist_add_uint64(spa->spa_load_info, 3315 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE); 3316 fnvlist_add_uint64(spa->spa_load_info, 3317 ZPOOL_CONFIG_MMP_TXG, 0); 3318 3319 error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO); 3320 } 3321 3322 if (mmp_label) 3323 nvlist_free(mmp_label); 3324 3325 return (error); 3326 } 3327 3328 static int 3329 spa_verify_host(spa_t *spa, nvlist_t *mos_config) 3330 { 3331 uint64_t hostid; 3332 char *hostname; 3333 uint64_t myhostid = 0; 3334 3335 if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config, 3336 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 3337 hostname = fnvlist_lookup_string(mos_config, 3338 ZPOOL_CONFIG_HOSTNAME); 3339 3340 myhostid = zone_get_hostid(NULL); 3341 3342 if (hostid != 0 && myhostid != 0 && hostid != myhostid) { 3343 cmn_err(CE_WARN, "pool '%s' could not be " 3344 "loaded as it was last accessed by " 3345 "another system (host: %s hostid: 0x%llx). " 3346 "See: https://openzfs.github.io/openzfs-docs/msg/" 3347 "ZFS-8000-EY", 3348 spa_name(spa), hostname, (u_longlong_t)hostid); 3349 spa_load_failed(spa, "hostid verification failed: pool " 3350 "last accessed by host: %s (hostid: 0x%llx)", 3351 hostname, (u_longlong_t)hostid); 3352 return (SET_ERROR(EBADF)); 3353 } 3354 } 3355 3356 return (0); 3357 } 3358 3359 static int 3360 spa_ld_parse_config(spa_t *spa, spa_import_type_t type) 3361 { 3362 int error = 0; 3363 nvlist_t *nvtree, *nvl, *config = spa->spa_config; 3364 int parse; 3365 vdev_t *rvd; 3366 uint64_t pool_guid; 3367 char *comment; 3368 char *compatibility; 3369 3370 /* 3371 * Versioning wasn't explicitly added to the label until later, so if 3372 * it's not present treat it as the initial version. 3373 */ 3374 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 3375 &spa->spa_ubsync.ub_version) != 0) 3376 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 3377 3378 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 3379 spa_load_failed(spa, "invalid config provided: '%s' missing", 3380 ZPOOL_CONFIG_POOL_GUID); 3381 return (SET_ERROR(EINVAL)); 3382 } 3383 3384 /* 3385 * If we are doing an import, ensure that the pool is not already 3386 * imported by checking if its pool guid already exists in the 3387 * spa namespace. 3388 * 3389 * The only case that we allow an already imported pool to be 3390 * imported again, is when the pool is checkpointed and we want to 3391 * look at its checkpointed state from userland tools like zdb. 3392 */ 3393 #ifdef _KERNEL 3394 if ((spa->spa_load_state == SPA_LOAD_IMPORT || 3395 spa->spa_load_state == SPA_LOAD_TRYIMPORT) && 3396 spa_guid_exists(pool_guid, 0)) { 3397 #else 3398 if ((spa->spa_load_state == SPA_LOAD_IMPORT || 3399 spa->spa_load_state == SPA_LOAD_TRYIMPORT) && 3400 spa_guid_exists(pool_guid, 0) && 3401 !spa_importing_readonly_checkpoint(spa)) { 3402 #endif 3403 spa_load_failed(spa, "a pool with guid %llu is already open", 3404 (u_longlong_t)pool_guid); 3405 return (SET_ERROR(EEXIST)); 3406 } 3407 3408 spa->spa_config_guid = pool_guid; 3409 3410 nvlist_free(spa->spa_load_info); 3411 spa->spa_load_info = fnvlist_alloc(); 3412 3413 ASSERT(spa->spa_comment == NULL); 3414 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 3415 spa->spa_comment = spa_strdup(comment); 3416 3417 ASSERT(spa->spa_compatibility == NULL); 3418 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMPATIBILITY, 3419 &compatibility) == 0) 3420 spa->spa_compatibility = spa_strdup(compatibility); 3421 3422 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 3423 &spa->spa_config_txg); 3424 3425 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0) 3426 spa->spa_config_splitting = fnvlist_dup(nvl); 3427 3428 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) { 3429 spa_load_failed(spa, "invalid config provided: '%s' missing", 3430 ZPOOL_CONFIG_VDEV_TREE); 3431 return (SET_ERROR(EINVAL)); 3432 } 3433 3434 /* 3435 * Create "The Godfather" zio to hold all async IOs 3436 */ 3437 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 3438 KM_SLEEP); 3439 for (int i = 0; i < max_ncpus; i++) { 3440 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 3441 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 3442 ZIO_FLAG_GODFATHER); 3443 } 3444 3445 /* 3446 * Parse the configuration into a vdev tree. We explicitly set the 3447 * value that will be returned by spa_version() since parsing the 3448 * configuration requires knowing the version number. 3449 */ 3450 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3451 parse = (type == SPA_IMPORT_EXISTING ? 3452 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 3453 error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse); 3454 spa_config_exit(spa, SCL_ALL, FTAG); 3455 3456 if (error != 0) { 3457 spa_load_failed(spa, "unable to parse config [error=%d]", 3458 error); 3459 return (error); 3460 } 3461 3462 ASSERT(spa->spa_root_vdev == rvd); 3463 ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); 3464 ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); 3465 3466 if (type != SPA_IMPORT_ASSEMBLE) { 3467 ASSERT(spa_guid(spa) == pool_guid); 3468 } 3469 3470 return (0); 3471 } 3472 3473 /* 3474 * Recursively open all vdevs in the vdev tree. This function is called twice: 3475 * first with the untrusted config, then with the trusted config. 3476 */ 3477 static int 3478 spa_ld_open_vdevs(spa_t *spa) 3479 { 3480 int error = 0; 3481 3482 /* 3483 * spa_missing_tvds_allowed defines how many top-level vdevs can be 3484 * missing/unopenable for the root vdev to be still considered openable. 3485 */ 3486 if (spa->spa_trust_config) { 3487 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds; 3488 } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) { 3489 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile; 3490 } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) { 3491 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan; 3492 } else { 3493 spa->spa_missing_tvds_allowed = 0; 3494 } 3495 3496 spa->spa_missing_tvds_allowed = 3497 MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed); 3498 3499 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3500 error = vdev_open(spa->spa_root_vdev); 3501 spa_config_exit(spa, SCL_ALL, FTAG); 3502 3503 if (spa->spa_missing_tvds != 0) { 3504 spa_load_note(spa, "vdev tree has %lld missing top-level " 3505 "vdevs.", (u_longlong_t)spa->spa_missing_tvds); 3506 if (spa->spa_trust_config && (spa->spa_mode & SPA_MODE_WRITE)) { 3507 /* 3508 * Although theoretically we could allow users to open 3509 * incomplete pools in RW mode, we'd need to add a lot 3510 * of extra logic (e.g. adjust pool space to account 3511 * for missing vdevs). 3512 * This limitation also prevents users from accidentally 3513 * opening the pool in RW mode during data recovery and 3514 * damaging it further. 3515 */ 3516 spa_load_note(spa, "pools with missing top-level " 3517 "vdevs can only be opened in read-only mode."); 3518 error = SET_ERROR(ENXIO); 3519 } else { 3520 spa_load_note(spa, "current settings allow for maximum " 3521 "%lld missing top-level vdevs at this stage.", 3522 (u_longlong_t)spa->spa_missing_tvds_allowed); 3523 } 3524 } 3525 if (error != 0) { 3526 spa_load_failed(spa, "unable to open vdev tree [error=%d]", 3527 error); 3528 } 3529 if (spa->spa_missing_tvds != 0 || error != 0) 3530 vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2); 3531 3532 return (error); 3533 } 3534 3535 /* 3536 * We need to validate the vdev labels against the configuration that 3537 * we have in hand. This function is called twice: first with an untrusted 3538 * config, then with a trusted config. The validation is more strict when the 3539 * config is trusted. 3540 */ 3541 static int 3542 spa_ld_validate_vdevs(spa_t *spa) 3543 { 3544 int error = 0; 3545 vdev_t *rvd = spa->spa_root_vdev; 3546 3547 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3548 error = vdev_validate(rvd); 3549 spa_config_exit(spa, SCL_ALL, FTAG); 3550 3551 if (error != 0) { 3552 spa_load_failed(spa, "vdev_validate failed [error=%d]", error); 3553 return (error); 3554 } 3555 3556 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 3557 spa_load_failed(spa, "cannot open vdev tree after invalidating " 3558 "some vdevs"); 3559 vdev_dbgmsg_print_tree(rvd, 2); 3560 return (SET_ERROR(ENXIO)); 3561 } 3562 3563 return (0); 3564 } 3565 3566 static void 3567 spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub) 3568 { 3569 spa->spa_state = POOL_STATE_ACTIVE; 3570 spa->spa_ubsync = spa->spa_uberblock; 3571 spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 3572 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 3573 spa->spa_first_txg = spa->spa_last_ubsync_txg ? 3574 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 3575 spa->spa_claim_max_txg = spa->spa_first_txg; 3576 spa->spa_prev_software_version = ub->ub_software_version; 3577 } 3578 3579 static int 3580 spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) 3581 { 3582 vdev_t *rvd = spa->spa_root_vdev; 3583 nvlist_t *label; 3584 uberblock_t *ub = &spa->spa_uberblock; 3585 boolean_t activity_check = B_FALSE; 3586 3587 /* 3588 * If we are opening the checkpointed state of the pool by 3589 * rewinding to it, at this point we will have written the 3590 * checkpointed uberblock to the vdev labels, so searching 3591 * the labels will find the right uberblock. However, if 3592 * we are opening the checkpointed state read-only, we have 3593 * not modified the labels. Therefore, we must ignore the 3594 * labels and continue using the spa_uberblock that was set 3595 * by spa_ld_checkpoint_rewind. 3596 * 3597 * Note that it would be fine to ignore the labels when 3598 * rewinding (opening writeable) as well. However, if we 3599 * crash just after writing the labels, we will end up 3600 * searching the labels. Doing so in the common case means 3601 * that this code path gets exercised normally, rather than 3602 * just in the edge case. 3603 */ 3604 if (ub->ub_checkpoint_txg != 0 && 3605 spa_importing_readonly_checkpoint(spa)) { 3606 spa_ld_select_uberblock_done(spa, ub); 3607 return (0); 3608 } 3609 3610 /* 3611 * Find the best uberblock. 3612 */ 3613 vdev_uberblock_load(rvd, ub, &label); 3614 3615 /* 3616 * If we weren't able to find a single valid uberblock, return failure. 3617 */ 3618 if (ub->ub_txg == 0) { 3619 nvlist_free(label); 3620 spa_load_failed(spa, "no valid uberblock found"); 3621 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 3622 } 3623 3624 if (spa->spa_load_max_txg != UINT64_MAX) { 3625 (void) spa_import_progress_set_max_txg(spa_guid(spa), 3626 (u_longlong_t)spa->spa_load_max_txg); 3627 } 3628 spa_load_note(spa, "using uberblock with txg=%llu", 3629 (u_longlong_t)ub->ub_txg); 3630 3631 3632 /* 3633 * For pools which have the multihost property on determine if the 3634 * pool is truly inactive and can be safely imported. Prevent 3635 * hosts which don't have a hostid set from importing the pool. 3636 */ 3637 activity_check = spa_activity_check_required(spa, ub, label, 3638 spa->spa_config); 3639 if (activity_check) { 3640 if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay && 3641 spa_get_hostid(spa) == 0) { 3642 nvlist_free(label); 3643 fnvlist_add_uint64(spa->spa_load_info, 3644 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); 3645 return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); 3646 } 3647 3648 int error = spa_activity_check(spa, ub, spa->spa_config); 3649 if (error) { 3650 nvlist_free(label); 3651 return (error); 3652 } 3653 3654 fnvlist_add_uint64(spa->spa_load_info, 3655 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE); 3656 fnvlist_add_uint64(spa->spa_load_info, 3657 ZPOOL_CONFIG_MMP_TXG, ub->ub_txg); 3658 fnvlist_add_uint16(spa->spa_load_info, 3659 ZPOOL_CONFIG_MMP_SEQ, 3660 (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)); 3661 } 3662 3663 /* 3664 * If the pool has an unsupported version we can't open it. 3665 */ 3666 if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 3667 nvlist_free(label); 3668 spa_load_failed(spa, "version %llu is not supported", 3669 (u_longlong_t)ub->ub_version); 3670 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 3671 } 3672 3673 if (ub->ub_version >= SPA_VERSION_FEATURES) { 3674 nvlist_t *features; 3675 3676 /* 3677 * If we weren't able to find what's necessary for reading the 3678 * MOS in the label, return failure. 3679 */ 3680 if (label == NULL) { 3681 spa_load_failed(spa, "label config unavailable"); 3682 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 3683 ENXIO)); 3684 } 3685 3686 if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ, 3687 &features) != 0) { 3688 nvlist_free(label); 3689 spa_load_failed(spa, "invalid label: '%s' missing", 3690 ZPOOL_CONFIG_FEATURES_FOR_READ); 3691 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 3692 ENXIO)); 3693 } 3694 3695 /* 3696 * Update our in-core representation with the definitive values 3697 * from the label. 3698 */ 3699 nvlist_free(spa->spa_label_features); 3700 spa->spa_label_features = fnvlist_dup(features); 3701 } 3702 3703 nvlist_free(label); 3704 3705 /* 3706 * Look through entries in the label nvlist's features_for_read. If 3707 * there is a feature listed there which we don't understand then we 3708 * cannot open a pool. 3709 */ 3710 if (ub->ub_version >= SPA_VERSION_FEATURES) { 3711 nvlist_t *unsup_feat; 3712 3713 unsup_feat = fnvlist_alloc(); 3714 3715 for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, 3716 NULL); nvp != NULL; 3717 nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { 3718 if (!zfeature_is_supported(nvpair_name(nvp))) { 3719 fnvlist_add_string(unsup_feat, 3720 nvpair_name(nvp), ""); 3721 } 3722 } 3723 3724 if (!nvlist_empty(unsup_feat)) { 3725 fnvlist_add_nvlist(spa->spa_load_info, 3726 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 3727 nvlist_free(unsup_feat); 3728 spa_load_failed(spa, "some features are unsupported"); 3729 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 3730 ENOTSUP)); 3731 } 3732 3733 nvlist_free(unsup_feat); 3734 } 3735 3736 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 3737 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3738 spa_try_repair(spa, spa->spa_config); 3739 spa_config_exit(spa, SCL_ALL, FTAG); 3740 nvlist_free(spa->spa_config_splitting); 3741 spa->spa_config_splitting = NULL; 3742 } 3743 3744 /* 3745 * Initialize internal SPA structures. 3746 */ 3747 spa_ld_select_uberblock_done(spa, ub); 3748 3749 return (0); 3750 } 3751 3752 static int 3753 spa_ld_open_rootbp(spa_t *spa) 3754 { 3755 int error = 0; 3756 vdev_t *rvd = spa->spa_root_vdev; 3757 3758 error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 3759 if (error != 0) { 3760 spa_load_failed(spa, "unable to open rootbp in dsl_pool_init " 3761 "[error=%d]", error); 3762 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3763 } 3764 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 3765 3766 return (0); 3767 } 3768 3769 static int 3770 spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, 3771 boolean_t reloading) 3772 { 3773 vdev_t *mrvd, *rvd = spa->spa_root_vdev; 3774 nvlist_t *nv, *mos_config, *policy; 3775 int error = 0, copy_error; 3776 uint64_t healthy_tvds, healthy_tvds_mos; 3777 uint64_t mos_config_txg; 3778 3779 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE) 3780 != 0) 3781 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3782 3783 /* 3784 * If we're assembling a pool from a split, the config provided is 3785 * already trusted so there is nothing to do. 3786 */ 3787 if (type == SPA_IMPORT_ASSEMBLE) 3788 return (0); 3789 3790 healthy_tvds = spa_healthy_core_tvds(spa); 3791 3792 if (load_nvlist(spa, spa->spa_config_object, &mos_config) 3793 != 0) { 3794 spa_load_failed(spa, "unable to retrieve MOS config"); 3795 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3796 } 3797 3798 /* 3799 * If we are doing an open, pool owner wasn't verified yet, thus do 3800 * the verification here. 3801 */ 3802 if (spa->spa_load_state == SPA_LOAD_OPEN) { 3803 error = spa_verify_host(spa, mos_config); 3804 if (error != 0) { 3805 nvlist_free(mos_config); 3806 return (error); 3807 } 3808 } 3809 3810 nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE); 3811 3812 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3813 3814 /* 3815 * Build a new vdev tree from the trusted config 3816 */ 3817 error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD); 3818 if (error != 0) { 3819 nvlist_free(mos_config); 3820 spa_config_exit(spa, SCL_ALL, FTAG); 3821 spa_load_failed(spa, "spa_config_parse failed [error=%d]", 3822 error); 3823 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 3824 } 3825 3826 /* 3827 * Vdev paths in the MOS may be obsolete. If the untrusted config was 3828 * obtained by scanning /dev/dsk, then it will have the right vdev 3829 * paths. We update the trusted MOS config with this information. 3830 * We first try to copy the paths with vdev_copy_path_strict, which 3831 * succeeds only when both configs have exactly the same vdev tree. 3832 * If that fails, we fall back to a more flexible method that has a 3833 * best effort policy. 3834 */ 3835 copy_error = vdev_copy_path_strict(rvd, mrvd); 3836 if (copy_error != 0 || spa_load_print_vdev_tree) { 3837 spa_load_note(spa, "provided vdev tree:"); 3838 vdev_dbgmsg_print_tree(rvd, 2); 3839 spa_load_note(spa, "MOS vdev tree:"); 3840 vdev_dbgmsg_print_tree(mrvd, 2); 3841 } 3842 if (copy_error != 0) { 3843 spa_load_note(spa, "vdev_copy_path_strict failed, falling " 3844 "back to vdev_copy_path_relaxed"); 3845 vdev_copy_path_relaxed(rvd, mrvd); 3846 } 3847 3848 vdev_close(rvd); 3849 vdev_free(rvd); 3850 spa->spa_root_vdev = mrvd; 3851 rvd = mrvd; 3852 spa_config_exit(spa, SCL_ALL, FTAG); 3853 3854 /* 3855 * We will use spa_config if we decide to reload the spa or if spa_load 3856 * fails and we rewind. We must thus regenerate the config using the 3857 * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to 3858 * pass settings on how to load the pool and is not stored in the MOS. 3859 * We copy it over to our new, trusted config. 3860 */ 3861 mos_config_txg = fnvlist_lookup_uint64(mos_config, 3862 ZPOOL_CONFIG_POOL_TXG); 3863 nvlist_free(mos_config); 3864 mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE); 3865 if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY, 3866 &policy) == 0) 3867 fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy); 3868 spa_config_set(spa, mos_config); 3869 spa->spa_config_source = SPA_CONFIG_SRC_MOS; 3870 3871 /* 3872 * Now that we got the config from the MOS, we should be more strict 3873 * in checking blkptrs and can make assumptions about the consistency 3874 * of the vdev tree. spa_trust_config must be set to true before opening 3875 * vdevs in order for them to be writeable. 3876 */ 3877 spa->spa_trust_config = B_TRUE; 3878 3879 /* 3880 * Open and validate the new vdev tree 3881 */ 3882 error = spa_ld_open_vdevs(spa); 3883 if (error != 0) 3884 return (error); 3885 3886 error = spa_ld_validate_vdevs(spa); 3887 if (error != 0) 3888 return (error); 3889 3890 if (copy_error != 0 || spa_load_print_vdev_tree) { 3891 spa_load_note(spa, "final vdev tree:"); 3892 vdev_dbgmsg_print_tree(rvd, 2); 3893 } 3894 3895 if (spa->spa_load_state != SPA_LOAD_TRYIMPORT && 3896 !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) { 3897 /* 3898 * Sanity check to make sure that we are indeed loading the 3899 * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds 3900 * in the config provided and they happened to be the only ones 3901 * to have the latest uberblock, we could involuntarily perform 3902 * an extreme rewind. 3903 */ 3904 healthy_tvds_mos = spa_healthy_core_tvds(spa); 3905 if (healthy_tvds_mos - healthy_tvds >= 3906 SPA_SYNC_MIN_VDEVS) { 3907 spa_load_note(spa, "config provided misses too many " 3908 "top-level vdevs compared to MOS (%lld vs %lld). ", 3909 (u_longlong_t)healthy_tvds, 3910 (u_longlong_t)healthy_tvds_mos); 3911 spa_load_note(spa, "vdev tree:"); 3912 vdev_dbgmsg_print_tree(rvd, 2); 3913 if (reloading) { 3914 spa_load_failed(spa, "config was already " 3915 "provided from MOS. Aborting."); 3916 return (spa_vdev_err(rvd, 3917 VDEV_AUX_CORRUPT_DATA, EIO)); 3918 } 3919 spa_load_note(spa, "spa must be reloaded using MOS " 3920 "config"); 3921 return (SET_ERROR(EAGAIN)); 3922 } 3923 } 3924 3925 error = spa_check_for_missing_logs(spa); 3926 if (error != 0) 3927 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 3928 3929 if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) { 3930 spa_load_failed(spa, "uberblock guid sum doesn't match MOS " 3931 "guid sum (%llu != %llu)", 3932 (u_longlong_t)spa->spa_uberblock.ub_guid_sum, 3933 (u_longlong_t)rvd->vdev_guid_sum); 3934 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 3935 ENXIO)); 3936 } 3937 3938 return (0); 3939 } 3940 3941 static int 3942 spa_ld_open_indirect_vdev_metadata(spa_t *spa) 3943 { 3944 int error = 0; 3945 vdev_t *rvd = spa->spa_root_vdev; 3946 3947 /* 3948 * Everything that we read before spa_remove_init() must be stored 3949 * on concreted vdevs. Therefore we do this as early as possible. 3950 */ 3951 error = spa_remove_init(spa); 3952 if (error != 0) { 3953 spa_load_failed(spa, "spa_remove_init failed [error=%d]", 3954 error); 3955 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3956 } 3957 3958 /* 3959 * Retrieve information needed to condense indirect vdev mappings. 3960 */ 3961 error = spa_condense_init(spa); 3962 if (error != 0) { 3963 spa_load_failed(spa, "spa_condense_init failed [error=%d]", 3964 error); 3965 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 3966 } 3967 3968 return (0); 3969 } 3970 3971 static int 3972 spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep) 3973 { 3974 int error = 0; 3975 vdev_t *rvd = spa->spa_root_vdev; 3976 3977 if (spa_version(spa) >= SPA_VERSION_FEATURES) { 3978 boolean_t missing_feat_read = B_FALSE; 3979 nvlist_t *unsup_feat, *enabled_feat; 3980 3981 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 3982 &spa->spa_feat_for_read_obj, B_TRUE) != 0) { 3983 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3984 } 3985 3986 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 3987 &spa->spa_feat_for_write_obj, B_TRUE) != 0) { 3988 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3989 } 3990 3991 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 3992 &spa->spa_feat_desc_obj, B_TRUE) != 0) { 3993 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3994 } 3995 3996 enabled_feat = fnvlist_alloc(); 3997 unsup_feat = fnvlist_alloc(); 3998 3999 if (!spa_features_check(spa, B_FALSE, 4000 unsup_feat, enabled_feat)) 4001 missing_feat_read = B_TRUE; 4002 4003 if (spa_writeable(spa) || 4004 spa->spa_load_state == SPA_LOAD_TRYIMPORT) { 4005 if (!spa_features_check(spa, B_TRUE, 4006 unsup_feat, enabled_feat)) { 4007 *missing_feat_writep = B_TRUE; 4008 } 4009 } 4010 4011 fnvlist_add_nvlist(spa->spa_load_info, 4012 ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 4013 4014 if (!nvlist_empty(unsup_feat)) { 4015 fnvlist_add_nvlist(spa->spa_load_info, 4016 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 4017 } 4018 4019 fnvlist_free(enabled_feat); 4020 fnvlist_free(unsup_feat); 4021 4022 if (!missing_feat_read) { 4023 fnvlist_add_boolean(spa->spa_load_info, 4024 ZPOOL_CONFIG_CAN_RDONLY); 4025 } 4026 4027 /* 4028 * If the state is SPA_LOAD_TRYIMPORT, our objective is 4029 * twofold: to determine whether the pool is available for 4030 * import in read-write mode and (if it is not) whether the 4031 * pool is available for import in read-only mode. If the pool 4032 * is available for import in read-write mode, it is displayed 4033 * as available in userland; if it is not available for import 4034 * in read-only mode, it is displayed as unavailable in 4035 * userland. If the pool is available for import in read-only 4036 * mode but not read-write mode, it is displayed as unavailable 4037 * in userland with a special note that the pool is actually 4038 * available for open in read-only mode. 4039 * 4040 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 4041 * missing a feature for write, we must first determine whether 4042 * the pool can be opened read-only before returning to 4043 * userland in order to know whether to display the 4044 * abovementioned note. 4045 */ 4046 if (missing_feat_read || (*missing_feat_writep && 4047 spa_writeable(spa))) { 4048 spa_load_failed(spa, "pool uses unsupported features"); 4049 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 4050 ENOTSUP)); 4051 } 4052 4053 /* 4054 * Load refcounts for ZFS features from disk into an in-memory 4055 * cache during SPA initialization. 4056 */ 4057 for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { 4058 uint64_t refcount; 4059 4060 error = feature_get_refcount_from_disk(spa, 4061 &spa_feature_table[i], &refcount); 4062 if (error == 0) { 4063 spa->spa_feat_refcount_cache[i] = refcount; 4064 } else if (error == ENOTSUP) { 4065 spa->spa_feat_refcount_cache[i] = 4066 SPA_FEATURE_DISABLED; 4067 } else { 4068 spa_load_failed(spa, "error getting refcount " 4069 "for feature %s [error=%d]", 4070 spa_feature_table[i].fi_guid, error); 4071 return (spa_vdev_err(rvd, 4072 VDEV_AUX_CORRUPT_DATA, EIO)); 4073 } 4074 } 4075 } 4076 4077 if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { 4078 if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, 4079 &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0) 4080 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4081 } 4082 4083 /* 4084 * Encryption was added before bookmark_v2, even though bookmark_v2 4085 * is now a dependency. If this pool has encryption enabled without 4086 * bookmark_v2, trigger an errata message. 4087 */ 4088 if (spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) && 4089 !spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) { 4090 spa->spa_errata = ZPOOL_ERRATA_ZOL_8308_ENCRYPTION; 4091 } 4092 4093 return (0); 4094 } 4095 4096 static int 4097 spa_ld_load_special_directories(spa_t *spa) 4098 { 4099 int error = 0; 4100 vdev_t *rvd = spa->spa_root_vdev; 4101 4102 spa->spa_is_initializing = B_TRUE; 4103 error = dsl_pool_open(spa->spa_dsl_pool); 4104 spa->spa_is_initializing = B_FALSE; 4105 if (error != 0) { 4106 spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error); 4107 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4108 } 4109 4110 return (0); 4111 } 4112 4113 static int 4114 spa_ld_get_props(spa_t *spa) 4115 { 4116 int error = 0; 4117 uint64_t obj; 4118 vdev_t *rvd = spa->spa_root_vdev; 4119 4120 /* Grab the checksum salt from the MOS. */ 4121 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 4122 DMU_POOL_CHECKSUM_SALT, 1, 4123 sizeof (spa->spa_cksum_salt.zcs_bytes), 4124 spa->spa_cksum_salt.zcs_bytes); 4125 if (error == ENOENT) { 4126 /* Generate a new salt for subsequent use */ 4127 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 4128 sizeof (spa->spa_cksum_salt.zcs_bytes)); 4129 } else if (error != 0) { 4130 spa_load_failed(spa, "unable to retrieve checksum salt from " 4131 "MOS [error=%d]", error); 4132 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4133 } 4134 4135 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0) 4136 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4137 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 4138 if (error != 0) { 4139 spa_load_failed(spa, "error opening deferred-frees bpobj " 4140 "[error=%d]", error); 4141 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4142 } 4143 4144 /* 4145 * Load the bit that tells us to use the new accounting function 4146 * (raid-z deflation). If we have an older pool, this will not 4147 * be present. 4148 */ 4149 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE); 4150 if (error != 0 && error != ENOENT) 4151 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4152 4153 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 4154 &spa->spa_creation_version, B_FALSE); 4155 if (error != 0 && error != ENOENT) 4156 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4157 4158 /* 4159 * Load the persistent error log. If we have an older pool, this will 4160 * not be present. 4161 */ 4162 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last, 4163 B_FALSE); 4164 if (error != 0 && error != ENOENT) 4165 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4166 4167 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 4168 &spa->spa_errlog_scrub, B_FALSE); 4169 if (error != 0 && error != ENOENT) 4170 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4171 4172 /* 4173 * Load the livelist deletion field. If a livelist is queued for 4174 * deletion, indicate that in the spa 4175 */ 4176 error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES, 4177 &spa->spa_livelists_to_delete, B_FALSE); 4178 if (error != 0 && error != ENOENT) 4179 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4180 4181 /* 4182 * Load the history object. If we have an older pool, this 4183 * will not be present. 4184 */ 4185 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE); 4186 if (error != 0 && error != ENOENT) 4187 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4188 4189 /* 4190 * Load the per-vdev ZAP map. If we have an older pool, this will not 4191 * be present; in this case, defer its creation to a later time to 4192 * avoid dirtying the MOS this early / out of sync context. See 4193 * spa_sync_config_object. 4194 */ 4195 4196 /* The sentinel is only available in the MOS config. */ 4197 nvlist_t *mos_config; 4198 if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { 4199 spa_load_failed(spa, "unable to retrieve MOS config"); 4200 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4201 } 4202 4203 error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, 4204 &spa->spa_all_vdev_zaps, B_FALSE); 4205 4206 if (error == ENOENT) { 4207 VERIFY(!nvlist_exists(mos_config, 4208 ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 4209 spa->spa_avz_action = AVZ_ACTION_INITIALIZE; 4210 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 4211 } else if (error != 0) { 4212 nvlist_free(mos_config); 4213 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4214 } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { 4215 /* 4216 * An older version of ZFS overwrote the sentinel value, so 4217 * we have orphaned per-vdev ZAPs in the MOS. Defer their 4218 * destruction to later; see spa_sync_config_object. 4219 */ 4220 spa->spa_avz_action = AVZ_ACTION_DESTROY; 4221 /* 4222 * We're assuming that no vdevs have had their ZAPs created 4223 * before this. Better be sure of it. 4224 */ 4225 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 4226 } 4227 nvlist_free(mos_config); 4228 4229 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 4230 4231 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object, 4232 B_FALSE); 4233 if (error && error != ENOENT) 4234 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4235 4236 if (error == 0) { 4237 uint64_t autoreplace = 0; 4238 4239 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 4240 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 4241 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 4242 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 4243 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 4244 spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost); 4245 spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim); 4246 spa->spa_autoreplace = (autoreplace != 0); 4247 } 4248 4249 /* 4250 * If we are importing a pool with missing top-level vdevs, 4251 * we enforce that the pool doesn't panic or get suspended on 4252 * error since the likelihood of missing data is extremely high. 4253 */ 4254 if (spa->spa_missing_tvds > 0 && 4255 spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE && 4256 spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 4257 spa_load_note(spa, "forcing failmode to 'continue' " 4258 "as some top level vdevs are missing"); 4259 spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE; 4260 } 4261 4262 return (0); 4263 } 4264 4265 static int 4266 spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type) 4267 { 4268 int error = 0; 4269 vdev_t *rvd = spa->spa_root_vdev; 4270 4271 /* 4272 * If we're assembling the pool from the split-off vdevs of 4273 * an existing pool, we don't want to attach the spares & cache 4274 * devices. 4275 */ 4276 4277 /* 4278 * Load any hot spares for this pool. 4279 */ 4280 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object, 4281 B_FALSE); 4282 if (error != 0 && error != ENOENT) 4283 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4284 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 4285 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 4286 if (load_nvlist(spa, spa->spa_spares.sav_object, 4287 &spa->spa_spares.sav_config) != 0) { 4288 spa_load_failed(spa, "error loading spares nvlist"); 4289 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4290 } 4291 4292 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4293 spa_load_spares(spa); 4294 spa_config_exit(spa, SCL_ALL, FTAG); 4295 } else if (error == 0) { 4296 spa->spa_spares.sav_sync = B_TRUE; 4297 } 4298 4299 /* 4300 * Load any level 2 ARC devices for this pool. 4301 */ 4302 error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 4303 &spa->spa_l2cache.sav_object, B_FALSE); 4304 if (error != 0 && error != ENOENT) 4305 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4306 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 4307 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 4308 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 4309 &spa->spa_l2cache.sav_config) != 0) { 4310 spa_load_failed(spa, "error loading l2cache nvlist"); 4311 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4312 } 4313 4314 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4315 spa_load_l2cache(spa); 4316 spa_config_exit(spa, SCL_ALL, FTAG); 4317 } else if (error == 0) { 4318 spa->spa_l2cache.sav_sync = B_TRUE; 4319 } 4320 4321 return (0); 4322 } 4323 4324 static int 4325 spa_ld_load_vdev_metadata(spa_t *spa) 4326 { 4327 int error = 0; 4328 vdev_t *rvd = spa->spa_root_vdev; 4329 4330 /* 4331 * If the 'multihost' property is set, then never allow a pool to 4332 * be imported when the system hostid is zero. The exception to 4333 * this rule is zdb which is always allowed to access pools. 4334 */ 4335 if (spa_multihost(spa) && spa_get_hostid(spa) == 0 && 4336 (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) { 4337 fnvlist_add_uint64(spa->spa_load_info, 4338 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); 4339 return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); 4340 } 4341 4342 /* 4343 * If the 'autoreplace' property is set, then post a resource notifying 4344 * the ZFS DE that it should not issue any faults for unopenable 4345 * devices. We also iterate over the vdevs, and post a sysevent for any 4346 * unopenable vdevs so that the normal autoreplace handler can take 4347 * over. 4348 */ 4349 if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 4350 spa_check_removed(spa->spa_root_vdev); 4351 /* 4352 * For the import case, this is done in spa_import(), because 4353 * at this point we're using the spare definitions from 4354 * the MOS config, not necessarily from the userland config. 4355 */ 4356 if (spa->spa_load_state != SPA_LOAD_IMPORT) { 4357 spa_aux_check_removed(&spa->spa_spares); 4358 spa_aux_check_removed(&spa->spa_l2cache); 4359 } 4360 } 4361 4362 /* 4363 * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc. 4364 */ 4365 error = vdev_load(rvd); 4366 if (error != 0) { 4367 spa_load_failed(spa, "vdev_load failed [error=%d]", error); 4368 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4369 } 4370 4371 error = spa_ld_log_spacemaps(spa); 4372 if (error != 0) { 4373 spa_load_failed(spa, "spa_ld_log_spacemaps failed [error=%d]", 4374 error); 4375 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4376 } 4377 4378 /* 4379 * Propagate the leaf DTLs we just loaded all the way up the vdev tree. 4380 */ 4381 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4382 vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE); 4383 spa_config_exit(spa, SCL_ALL, FTAG); 4384 4385 return (0); 4386 } 4387 4388 static int 4389 spa_ld_load_dedup_tables(spa_t *spa) 4390 { 4391 int error = 0; 4392 vdev_t *rvd = spa->spa_root_vdev; 4393 4394 error = ddt_load(spa); 4395 if (error != 0) { 4396 spa_load_failed(spa, "ddt_load failed [error=%d]", error); 4397 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4398 } 4399 4400 return (0); 4401 } 4402 4403 static int 4404 spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, const char **ereport) 4405 { 4406 vdev_t *rvd = spa->spa_root_vdev; 4407 4408 if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) { 4409 boolean_t missing = spa_check_logs(spa); 4410 if (missing) { 4411 if (spa->spa_missing_tvds != 0) { 4412 spa_load_note(spa, "spa_check_logs failed " 4413 "so dropping the logs"); 4414 } else { 4415 *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 4416 spa_load_failed(spa, "spa_check_logs failed"); 4417 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, 4418 ENXIO)); 4419 } 4420 } 4421 } 4422 4423 return (0); 4424 } 4425 4426 static int 4427 spa_ld_verify_pool_data(spa_t *spa) 4428 { 4429 int error = 0; 4430 vdev_t *rvd = spa->spa_root_vdev; 4431 4432 /* 4433 * We've successfully opened the pool, verify that we're ready 4434 * to start pushing transactions. 4435 */ 4436 if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 4437 error = spa_load_verify(spa); 4438 if (error != 0) { 4439 spa_load_failed(spa, "spa_load_verify failed " 4440 "[error=%d]", error); 4441 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 4442 error)); 4443 } 4444 } 4445 4446 return (0); 4447 } 4448 4449 static void 4450 spa_ld_claim_log_blocks(spa_t *spa) 4451 { 4452 dmu_tx_t *tx; 4453 dsl_pool_t *dp = spa_get_dsl(spa); 4454 4455 /* 4456 * Claim log blocks that haven't been committed yet. 4457 * This must all happen in a single txg. 4458 * Note: spa_claim_max_txg is updated by spa_claim_notify(), 4459 * invoked from zil_claim_log_block()'s i/o done callback. 4460 * Price of rollback is that we abandon the log. 4461 */ 4462 spa->spa_claiming = B_TRUE; 4463 4464 tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); 4465 (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 4466 zil_claim, tx, DS_FIND_CHILDREN); 4467 dmu_tx_commit(tx); 4468 4469 spa->spa_claiming = B_FALSE; 4470 4471 spa_set_log_state(spa, SPA_LOG_GOOD); 4472 } 4473 4474 static void 4475 spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg, 4476 boolean_t update_config_cache) 4477 { 4478 vdev_t *rvd = spa->spa_root_vdev; 4479 int need_update = B_FALSE; 4480 4481 /* 4482 * If the config cache is stale, or we have uninitialized 4483 * metaslabs (see spa_vdev_add()), then update the config. 4484 * 4485 * If this is a verbatim import, trust the current 4486 * in-core spa_config and update the disk labels. 4487 */ 4488 if (update_config_cache || config_cache_txg != spa->spa_config_txg || 4489 spa->spa_load_state == SPA_LOAD_IMPORT || 4490 spa->spa_load_state == SPA_LOAD_RECOVER || 4491 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 4492 need_update = B_TRUE; 4493 4494 for (int c = 0; c < rvd->vdev_children; c++) 4495 if (rvd->vdev_child[c]->vdev_ms_array == 0) 4496 need_update = B_TRUE; 4497 4498 /* 4499 * Update the config cache asynchronously in case we're the 4500 * root pool, in which case the config cache isn't writable yet. 4501 */ 4502 if (need_update) 4503 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 4504 } 4505 4506 static void 4507 spa_ld_prepare_for_reload(spa_t *spa) 4508 { 4509 spa_mode_t mode = spa->spa_mode; 4510 int async_suspended = spa->spa_async_suspended; 4511 4512 spa_unload(spa); 4513 spa_deactivate(spa); 4514 spa_activate(spa, mode); 4515 4516 /* 4517 * We save the value of spa_async_suspended as it gets reset to 0 by 4518 * spa_unload(). We want to restore it back to the original value before 4519 * returning as we might be calling spa_async_resume() later. 4520 */ 4521 spa->spa_async_suspended = async_suspended; 4522 } 4523 4524 static int 4525 spa_ld_read_checkpoint_txg(spa_t *spa) 4526 { 4527 uberblock_t checkpoint; 4528 int error = 0; 4529 4530 ASSERT0(spa->spa_checkpoint_txg); 4531 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4532 4533 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 4534 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), 4535 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); 4536 4537 if (error == ENOENT) 4538 return (0); 4539 4540 if (error != 0) 4541 return (error); 4542 4543 ASSERT3U(checkpoint.ub_txg, !=, 0); 4544 ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0); 4545 ASSERT3U(checkpoint.ub_timestamp, !=, 0); 4546 spa->spa_checkpoint_txg = checkpoint.ub_txg; 4547 spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; 4548 4549 return (0); 4550 } 4551 4552 static int 4553 spa_ld_mos_init(spa_t *spa, spa_import_type_t type) 4554 { 4555 int error = 0; 4556 4557 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4558 ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); 4559 4560 /* 4561 * Never trust the config that is provided unless we are assembling 4562 * a pool following a split. 4563 * This means don't trust blkptrs and the vdev tree in general. This 4564 * also effectively puts the spa in read-only mode since 4565 * spa_writeable() checks for spa_trust_config to be true. 4566 * We will later load a trusted config from the MOS. 4567 */ 4568 if (type != SPA_IMPORT_ASSEMBLE) 4569 spa->spa_trust_config = B_FALSE; 4570 4571 /* 4572 * Parse the config provided to create a vdev tree. 4573 */ 4574 error = spa_ld_parse_config(spa, type); 4575 if (error != 0) 4576 return (error); 4577 4578 spa_import_progress_add(spa); 4579 4580 /* 4581 * Now that we have the vdev tree, try to open each vdev. This involves 4582 * opening the underlying physical device, retrieving its geometry and 4583 * probing the vdev with a dummy I/O. The state of each vdev will be set 4584 * based on the success of those operations. After this we'll be ready 4585 * to read from the vdevs. 4586 */ 4587 error = spa_ld_open_vdevs(spa); 4588 if (error != 0) 4589 return (error); 4590 4591 /* 4592 * Read the label of each vdev and make sure that the GUIDs stored 4593 * there match the GUIDs in the config provided. 4594 * If we're assembling a new pool that's been split off from an 4595 * existing pool, the labels haven't yet been updated so we skip 4596 * validation for now. 4597 */ 4598 if (type != SPA_IMPORT_ASSEMBLE) { 4599 error = spa_ld_validate_vdevs(spa); 4600 if (error != 0) 4601 return (error); 4602 } 4603 4604 /* 4605 * Read all vdev labels to find the best uberblock (i.e. latest, 4606 * unless spa_load_max_txg is set) and store it in spa_uberblock. We 4607 * get the list of features required to read blkptrs in the MOS from 4608 * the vdev label with the best uberblock and verify that our version 4609 * of zfs supports them all. 4610 */ 4611 error = spa_ld_select_uberblock(spa, type); 4612 if (error != 0) 4613 return (error); 4614 4615 /* 4616 * Pass that uberblock to the dsl_pool layer which will open the root 4617 * blkptr. This blkptr points to the latest version of the MOS and will 4618 * allow us to read its contents. 4619 */ 4620 error = spa_ld_open_rootbp(spa); 4621 if (error != 0) 4622 return (error); 4623 4624 return (0); 4625 } 4626 4627 static int 4628 spa_ld_checkpoint_rewind(spa_t *spa) 4629 { 4630 uberblock_t checkpoint; 4631 int error = 0; 4632 4633 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4634 ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 4635 4636 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 4637 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), 4638 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); 4639 4640 if (error != 0) { 4641 spa_load_failed(spa, "unable to retrieve checkpointed " 4642 "uberblock from the MOS config [error=%d]", error); 4643 4644 if (error == ENOENT) 4645 error = ZFS_ERR_NO_CHECKPOINT; 4646 4647 return (error); 4648 } 4649 4650 ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg); 4651 ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg); 4652 4653 /* 4654 * We need to update the txg and timestamp of the checkpointed 4655 * uberblock to be higher than the latest one. This ensures that 4656 * the checkpointed uberblock is selected if we were to close and 4657 * reopen the pool right after we've written it in the vdev labels. 4658 * (also see block comment in vdev_uberblock_compare) 4659 */ 4660 checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1; 4661 checkpoint.ub_timestamp = gethrestime_sec(); 4662 4663 /* 4664 * Set current uberblock to be the checkpointed uberblock. 4665 */ 4666 spa->spa_uberblock = checkpoint; 4667 4668 /* 4669 * If we are doing a normal rewind, then the pool is open for 4670 * writing and we sync the "updated" checkpointed uberblock to 4671 * disk. Once this is done, we've basically rewound the whole 4672 * pool and there is no way back. 4673 * 4674 * There are cases when we don't want to attempt and sync the 4675 * checkpointed uberblock to disk because we are opening a 4676 * pool as read-only. Specifically, verifying the checkpointed 4677 * state with zdb, and importing the checkpointed state to get 4678 * a "preview" of its content. 4679 */ 4680 if (spa_writeable(spa)) { 4681 vdev_t *rvd = spa->spa_root_vdev; 4682 4683 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4684 vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; 4685 int svdcount = 0; 4686 int children = rvd->vdev_children; 4687 int c0 = random_in_range(children); 4688 4689 for (int c = 0; c < children; c++) { 4690 vdev_t *vd = rvd->vdev_child[(c0 + c) % children]; 4691 4692 /* Stop when revisiting the first vdev */ 4693 if (c > 0 && svd[0] == vd) 4694 break; 4695 4696 if (vd->vdev_ms_array == 0 || vd->vdev_islog || 4697 !vdev_is_concrete(vd)) 4698 continue; 4699 4700 svd[svdcount++] = vd; 4701 if (svdcount == SPA_SYNC_MIN_VDEVS) 4702 break; 4703 } 4704 error = vdev_config_sync(svd, svdcount, spa->spa_first_txg); 4705 if (error == 0) 4706 spa->spa_last_synced_guid = rvd->vdev_guid; 4707 spa_config_exit(spa, SCL_ALL, FTAG); 4708 4709 if (error != 0) { 4710 spa_load_failed(spa, "failed to write checkpointed " 4711 "uberblock to the vdev labels [error=%d]", error); 4712 return (error); 4713 } 4714 } 4715 4716 return (0); 4717 } 4718 4719 static int 4720 spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type, 4721 boolean_t *update_config_cache) 4722 { 4723 int error; 4724 4725 /* 4726 * Parse the config for pool, open and validate vdevs, 4727 * select an uberblock, and use that uberblock to open 4728 * the MOS. 4729 */ 4730 error = spa_ld_mos_init(spa, type); 4731 if (error != 0) 4732 return (error); 4733 4734 /* 4735 * Retrieve the trusted config stored in the MOS and use it to create 4736 * a new, exact version of the vdev tree, then reopen all vdevs. 4737 */ 4738 error = spa_ld_trusted_config(spa, type, B_FALSE); 4739 if (error == EAGAIN) { 4740 if (update_config_cache != NULL) 4741 *update_config_cache = B_TRUE; 4742 4743 /* 4744 * Redo the loading process with the trusted config if it is 4745 * too different from the untrusted config. 4746 */ 4747 spa_ld_prepare_for_reload(spa); 4748 spa_load_note(spa, "RELOADING"); 4749 error = spa_ld_mos_init(spa, type); 4750 if (error != 0) 4751 return (error); 4752 4753 error = spa_ld_trusted_config(spa, type, B_TRUE); 4754 if (error != 0) 4755 return (error); 4756 4757 } else if (error != 0) { 4758 return (error); 4759 } 4760 4761 return (0); 4762 } 4763 4764 /* 4765 * Load an existing storage pool, using the config provided. This config 4766 * describes which vdevs are part of the pool and is later validated against 4767 * partial configs present in each vdev's label and an entire copy of the 4768 * config stored in the MOS. 4769 */ 4770 static int 4771 spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport) 4772 { 4773 int error = 0; 4774 boolean_t missing_feat_write = B_FALSE; 4775 boolean_t checkpoint_rewind = 4776 (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 4777 boolean_t update_config_cache = B_FALSE; 4778 4779 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4780 ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); 4781 4782 spa_load_note(spa, "LOADING"); 4783 4784 error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache); 4785 if (error != 0) 4786 return (error); 4787 4788 /* 4789 * If we are rewinding to the checkpoint then we need to repeat 4790 * everything we've done so far in this function but this time 4791 * selecting the checkpointed uberblock and using that to open 4792 * the MOS. 4793 */ 4794 if (checkpoint_rewind) { 4795 /* 4796 * If we are rewinding to the checkpoint update config cache 4797 * anyway. 4798 */ 4799 update_config_cache = B_TRUE; 4800 4801 /* 4802 * Extract the checkpointed uberblock from the current MOS 4803 * and use this as the pool's uberblock from now on. If the 4804 * pool is imported as writeable we also write the checkpoint 4805 * uberblock to the labels, making the rewind permanent. 4806 */ 4807 error = spa_ld_checkpoint_rewind(spa); 4808 if (error != 0) 4809 return (error); 4810 4811 /* 4812 * Redo the loading process again with the 4813 * checkpointed uberblock. 4814 */ 4815 spa_ld_prepare_for_reload(spa); 4816 spa_load_note(spa, "LOADING checkpointed uberblock"); 4817 error = spa_ld_mos_with_trusted_config(spa, type, NULL); 4818 if (error != 0) 4819 return (error); 4820 } 4821 4822 /* 4823 * Retrieve the checkpoint txg if the pool has a checkpoint. 4824 */ 4825 error = spa_ld_read_checkpoint_txg(spa); 4826 if (error != 0) 4827 return (error); 4828 4829 /* 4830 * Retrieve the mapping of indirect vdevs. Those vdevs were removed 4831 * from the pool and their contents were re-mapped to other vdevs. Note 4832 * that everything that we read before this step must have been 4833 * rewritten on concrete vdevs after the last device removal was 4834 * initiated. Otherwise we could be reading from indirect vdevs before 4835 * we have loaded their mappings. 4836 */ 4837 error = spa_ld_open_indirect_vdev_metadata(spa); 4838 if (error != 0) 4839 return (error); 4840 4841 /* 4842 * Retrieve the full list of active features from the MOS and check if 4843 * they are all supported. 4844 */ 4845 error = spa_ld_check_features(spa, &missing_feat_write); 4846 if (error != 0) 4847 return (error); 4848 4849 /* 4850 * Load several special directories from the MOS needed by the dsl_pool 4851 * layer. 4852 */ 4853 error = spa_ld_load_special_directories(spa); 4854 if (error != 0) 4855 return (error); 4856 4857 /* 4858 * Retrieve pool properties from the MOS. 4859 */ 4860 error = spa_ld_get_props(spa); 4861 if (error != 0) 4862 return (error); 4863 4864 /* 4865 * Retrieve the list of auxiliary devices - cache devices and spares - 4866 * and open them. 4867 */ 4868 error = spa_ld_open_aux_vdevs(spa, type); 4869 if (error != 0) 4870 return (error); 4871 4872 /* 4873 * Load the metadata for all vdevs. Also check if unopenable devices 4874 * should be autoreplaced. 4875 */ 4876 error = spa_ld_load_vdev_metadata(spa); 4877 if (error != 0) 4878 return (error); 4879 4880 error = spa_ld_load_dedup_tables(spa); 4881 if (error != 0) 4882 return (error); 4883 4884 /* 4885 * Verify the logs now to make sure we don't have any unexpected errors 4886 * when we claim log blocks later. 4887 */ 4888 error = spa_ld_verify_logs(spa, type, ereport); 4889 if (error != 0) 4890 return (error); 4891 4892 if (missing_feat_write) { 4893 ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT); 4894 4895 /* 4896 * At this point, we know that we can open the pool in 4897 * read-only mode but not read-write mode. We now have enough 4898 * information and can return to userland. 4899 */ 4900 return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, 4901 ENOTSUP)); 4902 } 4903 4904 /* 4905 * Traverse the last txgs to make sure the pool was left off in a safe 4906 * state. When performing an extreme rewind, we verify the whole pool, 4907 * which can take a very long time. 4908 */ 4909 error = spa_ld_verify_pool_data(spa); 4910 if (error != 0) 4911 return (error); 4912 4913 /* 4914 * Calculate the deflated space for the pool. This must be done before 4915 * we write anything to the pool because we'd need to update the space 4916 * accounting using the deflated sizes. 4917 */ 4918 spa_update_dspace(spa); 4919 4920 /* 4921 * We have now retrieved all the information we needed to open the 4922 * pool. If we are importing the pool in read-write mode, a few 4923 * additional steps must be performed to finish the import. 4924 */ 4925 if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER || 4926 spa->spa_load_max_txg == UINT64_MAX)) { 4927 uint64_t config_cache_txg = spa->spa_config_txg; 4928 4929 ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); 4930 4931 /* 4932 * In case of a checkpoint rewind, log the original txg 4933 * of the checkpointed uberblock. 4934 */ 4935 if (checkpoint_rewind) { 4936 spa_history_log_internal(spa, "checkpoint rewind", 4937 NULL, "rewound state to txg=%llu", 4938 (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg); 4939 } 4940 4941 /* 4942 * Traverse the ZIL and claim all blocks. 4943 */ 4944 spa_ld_claim_log_blocks(spa); 4945 4946 /* 4947 * Kick-off the syncing thread. 4948 */ 4949 spa->spa_sync_on = B_TRUE; 4950 txg_sync_start(spa->spa_dsl_pool); 4951 mmp_thread_start(spa); 4952 4953 /* 4954 * Wait for all claims to sync. We sync up to the highest 4955 * claimed log block birth time so that claimed log blocks 4956 * don't appear to be from the future. spa_claim_max_txg 4957 * will have been set for us by ZIL traversal operations 4958 * performed above. 4959 */ 4960 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 4961 4962 /* 4963 * Check if we need to request an update of the config. On the 4964 * next sync, we would update the config stored in vdev labels 4965 * and the cachefile (by default /etc/zfs/zpool.cache). 4966 */ 4967 spa_ld_check_for_config_update(spa, config_cache_txg, 4968 update_config_cache); 4969 4970 /* 4971 * Check if a rebuild was in progress and if so resume it. 4972 * Then check all DTLs to see if anything needs resilvering. 4973 * The resilver will be deferred if a rebuild was started. 4974 */ 4975 if (vdev_rebuild_active(spa->spa_root_vdev)) { 4976 vdev_rebuild_restart(spa); 4977 } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 4978 vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 4979 spa_async_request(spa, SPA_ASYNC_RESILVER); 4980 } 4981 4982 /* 4983 * Log the fact that we booted up (so that we can detect if 4984 * we rebooted in the middle of an operation). 4985 */ 4986 spa_history_log_version(spa, "open", NULL); 4987 4988 spa_restart_removal(spa); 4989 spa_spawn_aux_threads(spa); 4990 4991 /* 4992 * Delete any inconsistent datasets. 4993 * 4994 * Note: 4995 * Since we may be issuing deletes for clones here, 4996 * we make sure to do so after we've spawned all the 4997 * auxiliary threads above (from which the livelist 4998 * deletion zthr is part of). 4999 */ 5000 (void) dmu_objset_find(spa_name(spa), 5001 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 5002 5003 /* 5004 * Clean up any stale temporary dataset userrefs. 5005 */ 5006 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 5007 5008 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5009 vdev_initialize_restart(spa->spa_root_vdev); 5010 vdev_trim_restart(spa->spa_root_vdev); 5011 vdev_autotrim_restart(spa); 5012 spa_config_exit(spa, SCL_CONFIG, FTAG); 5013 } 5014 5015 spa_import_progress_remove(spa_guid(spa)); 5016 spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); 5017 5018 spa_load_note(spa, "LOADED"); 5019 5020 return (0); 5021 } 5022 5023 static int 5024 spa_load_retry(spa_t *spa, spa_load_state_t state) 5025 { 5026 spa_mode_t mode = spa->spa_mode; 5027 5028 spa_unload(spa); 5029 spa_deactivate(spa); 5030 5031 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; 5032 5033 spa_activate(spa, mode); 5034 spa_async_suspend(spa); 5035 5036 spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu", 5037 (u_longlong_t)spa->spa_load_max_txg); 5038 5039 return (spa_load(spa, state, SPA_IMPORT_EXISTING)); 5040 } 5041 5042 /* 5043 * If spa_load() fails this function will try loading prior txg's. If 5044 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool 5045 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this 5046 * function will not rewind the pool and will return the same error as 5047 * spa_load(). 5048 */ 5049 static int 5050 spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request, 5051 int rewind_flags) 5052 { 5053 nvlist_t *loadinfo = NULL; 5054 nvlist_t *config = NULL; 5055 int load_error, rewind_error; 5056 uint64_t safe_rewind_txg; 5057 uint64_t min_txg; 5058 5059 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 5060 spa->spa_load_max_txg = spa->spa_load_txg; 5061 spa_set_log_state(spa, SPA_LOG_CLEAR); 5062 } else { 5063 spa->spa_load_max_txg = max_request; 5064 if (max_request != UINT64_MAX) 5065 spa->spa_extreme_rewind = B_TRUE; 5066 } 5067 5068 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING); 5069 if (load_error == 0) 5070 return (0); 5071 if (load_error == ZFS_ERR_NO_CHECKPOINT) { 5072 /* 5073 * When attempting checkpoint-rewind on a pool with no 5074 * checkpoint, we should not attempt to load uberblocks 5075 * from previous txgs when spa_load fails. 5076 */ 5077 ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 5078 spa_import_progress_remove(spa_guid(spa)); 5079 return (load_error); 5080 } 5081 5082 if (spa->spa_root_vdev != NULL) 5083 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 5084 5085 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 5086 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 5087 5088 if (rewind_flags & ZPOOL_NEVER_REWIND) { 5089 nvlist_free(config); 5090 spa_import_progress_remove(spa_guid(spa)); 5091 return (load_error); 5092 } 5093 5094 if (state == SPA_LOAD_RECOVER) { 5095 /* Price of rolling back is discarding txgs, including log */ 5096 spa_set_log_state(spa, SPA_LOG_CLEAR); 5097 } else { 5098 /* 5099 * If we aren't rolling back save the load info from our first 5100 * import attempt so that we can restore it after attempting 5101 * to rewind. 5102 */ 5103 loadinfo = spa->spa_load_info; 5104 spa->spa_load_info = fnvlist_alloc(); 5105 } 5106 5107 spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 5108 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 5109 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 5110 TXG_INITIAL : safe_rewind_txg; 5111 5112 /* 5113 * Continue as long as we're finding errors, we're still within 5114 * the acceptable rewind range, and we're still finding uberblocks 5115 */ 5116 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 5117 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 5118 if (spa->spa_load_max_txg < safe_rewind_txg) 5119 spa->spa_extreme_rewind = B_TRUE; 5120 rewind_error = spa_load_retry(spa, state); 5121 } 5122 5123 spa->spa_extreme_rewind = B_FALSE; 5124 spa->spa_load_max_txg = UINT64_MAX; 5125 5126 if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 5127 spa_config_set(spa, config); 5128 else 5129 nvlist_free(config); 5130 5131 if (state == SPA_LOAD_RECOVER) { 5132 ASSERT3P(loadinfo, ==, NULL); 5133 spa_import_progress_remove(spa_guid(spa)); 5134 return (rewind_error); 5135 } else { 5136 /* Store the rewind info as part of the initial load info */ 5137 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, 5138 spa->spa_load_info); 5139 5140 /* Restore the initial load info */ 5141 fnvlist_free(spa->spa_load_info); 5142 spa->spa_load_info = loadinfo; 5143 5144 spa_import_progress_remove(spa_guid(spa)); 5145 return (load_error); 5146 } 5147 } 5148 5149 /* 5150 * Pool Open/Import 5151 * 5152 * The import case is identical to an open except that the configuration is sent 5153 * down from userland, instead of grabbed from the configuration cache. For the 5154 * case of an open, the pool configuration will exist in the 5155 * POOL_STATE_UNINITIALIZED state. 5156 * 5157 * The stats information (gen/count/ustats) is used to gather vdev statistics at 5158 * the same time open the pool, without having to keep around the spa_t in some 5159 * ambiguous state. 5160 */ 5161 static int 5162 spa_open_common(const char *pool, spa_t **spapp, const void *tag, 5163 nvlist_t *nvpolicy, nvlist_t **config) 5164 { 5165 spa_t *spa; 5166 spa_load_state_t state = SPA_LOAD_OPEN; 5167 int error; 5168 int locked = B_FALSE; 5169 int firstopen = B_FALSE; 5170 5171 *spapp = NULL; 5172 5173 /* 5174 * As disgusting as this is, we need to support recursive calls to this 5175 * function because dsl_dir_open() is called during spa_load(), and ends 5176 * up calling spa_open() again. The real fix is to figure out how to 5177 * avoid dsl_dir_open() calling this in the first place. 5178 */ 5179 if (MUTEX_NOT_HELD(&spa_namespace_lock)) { 5180 mutex_enter(&spa_namespace_lock); 5181 locked = B_TRUE; 5182 } 5183 5184 if ((spa = spa_lookup(pool)) == NULL) { 5185 if (locked) 5186 mutex_exit(&spa_namespace_lock); 5187 return (SET_ERROR(ENOENT)); 5188 } 5189 5190 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 5191 zpool_load_policy_t policy; 5192 5193 firstopen = B_TRUE; 5194 5195 zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config, 5196 &policy); 5197 if (policy.zlp_rewind & ZPOOL_DO_REWIND) 5198 state = SPA_LOAD_RECOVER; 5199 5200 spa_activate(spa, spa_mode_global); 5201 5202 if (state != SPA_LOAD_RECOVER) 5203 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 5204 spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; 5205 5206 zfs_dbgmsg("spa_open_common: opening %s", pool); 5207 error = spa_load_best(spa, state, policy.zlp_txg, 5208 policy.zlp_rewind); 5209 5210 if (error == EBADF) { 5211 /* 5212 * If vdev_validate() returns failure (indicated by 5213 * EBADF), it indicates that one of the vdevs indicates 5214 * that the pool has been exported or destroyed. If 5215 * this is the case, the config cache is out of sync and 5216 * we should remove the pool from the namespace. 5217 */ 5218 spa_unload(spa); 5219 spa_deactivate(spa); 5220 spa_write_cachefile(spa, B_TRUE, B_TRUE); 5221 spa_remove(spa); 5222 if (locked) 5223 mutex_exit(&spa_namespace_lock); 5224 return (SET_ERROR(ENOENT)); 5225 } 5226 5227 if (error) { 5228 /* 5229 * We can't open the pool, but we still have useful 5230 * information: the state of each vdev after the 5231 * attempted vdev_open(). Return this to the user. 5232 */ 5233 if (config != NULL && spa->spa_config) { 5234 *config = fnvlist_dup(spa->spa_config); 5235 fnvlist_add_nvlist(*config, 5236 ZPOOL_CONFIG_LOAD_INFO, 5237 spa->spa_load_info); 5238 } 5239 spa_unload(spa); 5240 spa_deactivate(spa); 5241 spa->spa_last_open_failed = error; 5242 if (locked) 5243 mutex_exit(&spa_namespace_lock); 5244 *spapp = NULL; 5245 return (error); 5246 } 5247 } 5248 5249 spa_open_ref(spa, tag); 5250 5251 if (config != NULL) 5252 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 5253 5254 /* 5255 * If we've recovered the pool, pass back any information we 5256 * gathered while doing the load. 5257 */ 5258 if (state == SPA_LOAD_RECOVER) { 5259 fnvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 5260 spa->spa_load_info); 5261 } 5262 5263 if (locked) { 5264 spa->spa_last_open_failed = 0; 5265 spa->spa_last_ubsync_txg = 0; 5266 spa->spa_load_txg = 0; 5267 mutex_exit(&spa_namespace_lock); 5268 } 5269 5270 if (firstopen) 5271 zvol_create_minors_recursive(spa_name(spa)); 5272 5273 *spapp = spa; 5274 5275 return (0); 5276 } 5277 5278 int 5279 spa_open_rewind(const char *name, spa_t **spapp, const void *tag, 5280 nvlist_t *policy, nvlist_t **config) 5281 { 5282 return (spa_open_common(name, spapp, tag, policy, config)); 5283 } 5284 5285 int 5286 spa_open(const char *name, spa_t **spapp, const void *tag) 5287 { 5288 return (spa_open_common(name, spapp, tag, NULL, NULL)); 5289 } 5290 5291 /* 5292 * Lookup the given spa_t, incrementing the inject count in the process, 5293 * preventing it from being exported or destroyed. 5294 */ 5295 spa_t * 5296 spa_inject_addref(char *name) 5297 { 5298 spa_t *spa; 5299 5300 mutex_enter(&spa_namespace_lock); 5301 if ((spa = spa_lookup(name)) == NULL) { 5302 mutex_exit(&spa_namespace_lock); 5303 return (NULL); 5304 } 5305 spa->spa_inject_ref++; 5306 mutex_exit(&spa_namespace_lock); 5307 5308 return (spa); 5309 } 5310 5311 void 5312 spa_inject_delref(spa_t *spa) 5313 { 5314 mutex_enter(&spa_namespace_lock); 5315 spa->spa_inject_ref--; 5316 mutex_exit(&spa_namespace_lock); 5317 } 5318 5319 /* 5320 * Add spares device information to the nvlist. 5321 */ 5322 static void 5323 spa_add_spares(spa_t *spa, nvlist_t *config) 5324 { 5325 nvlist_t **spares; 5326 uint_t i, nspares; 5327 nvlist_t *nvroot; 5328 uint64_t guid; 5329 vdev_stat_t *vs; 5330 uint_t vsc; 5331 uint64_t pool; 5332 5333 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 5334 5335 if (spa->spa_spares.sav_count == 0) 5336 return; 5337 5338 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 5339 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 5340 ZPOOL_CONFIG_SPARES, &spares, &nspares)); 5341 if (nspares != 0) { 5342 fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 5343 (const nvlist_t * const *)spares, nspares); 5344 VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 5345 &spares, &nspares)); 5346 5347 /* 5348 * Go through and find any spares which have since been 5349 * repurposed as an active spare. If this is the case, update 5350 * their status appropriately. 5351 */ 5352 for (i = 0; i < nspares; i++) { 5353 guid = fnvlist_lookup_uint64(spares[i], 5354 ZPOOL_CONFIG_GUID); 5355 if (spa_spare_exists(guid, &pool, NULL) && 5356 pool != 0ULL) { 5357 VERIFY0(nvlist_lookup_uint64_array(spares[i], 5358 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, 5359 &vsc)); 5360 vs->vs_state = VDEV_STATE_CANT_OPEN; 5361 vs->vs_aux = VDEV_AUX_SPARED; 5362 } 5363 } 5364 } 5365 } 5366 5367 /* 5368 * Add l2cache device information to the nvlist, including vdev stats. 5369 */ 5370 static void 5371 spa_add_l2cache(spa_t *spa, nvlist_t *config) 5372 { 5373 nvlist_t **l2cache; 5374 uint_t i, j, nl2cache; 5375 nvlist_t *nvroot; 5376 uint64_t guid; 5377 vdev_t *vd; 5378 vdev_stat_t *vs; 5379 uint_t vsc; 5380 5381 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 5382 5383 if (spa->spa_l2cache.sav_count == 0) 5384 return; 5385 5386 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 5387 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 5388 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); 5389 if (nl2cache != 0) { 5390 fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 5391 (const nvlist_t * const *)l2cache, nl2cache); 5392 VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 5393 &l2cache, &nl2cache)); 5394 5395 /* 5396 * Update level 2 cache device stats. 5397 */ 5398 5399 for (i = 0; i < nl2cache; i++) { 5400 guid = fnvlist_lookup_uint64(l2cache[i], 5401 ZPOOL_CONFIG_GUID); 5402 5403 vd = NULL; 5404 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 5405 if (guid == 5406 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 5407 vd = spa->spa_l2cache.sav_vdevs[j]; 5408 break; 5409 } 5410 } 5411 ASSERT(vd != NULL); 5412 5413 VERIFY0(nvlist_lookup_uint64_array(l2cache[i], 5414 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)); 5415 vdev_get_stats(vd, vs); 5416 vdev_config_generate_stats(vd, l2cache[i]); 5417 5418 } 5419 } 5420 } 5421 5422 static void 5423 spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features) 5424 { 5425 zap_cursor_t zc; 5426 zap_attribute_t za; 5427 5428 if (spa->spa_feat_for_read_obj != 0) { 5429 for (zap_cursor_init(&zc, spa->spa_meta_objset, 5430 spa->spa_feat_for_read_obj); 5431 zap_cursor_retrieve(&zc, &za) == 0; 5432 zap_cursor_advance(&zc)) { 5433 ASSERT(za.za_integer_length == sizeof (uint64_t) && 5434 za.za_num_integers == 1); 5435 VERIFY0(nvlist_add_uint64(features, za.za_name, 5436 za.za_first_integer)); 5437 } 5438 zap_cursor_fini(&zc); 5439 } 5440 5441 if (spa->spa_feat_for_write_obj != 0) { 5442 for (zap_cursor_init(&zc, spa->spa_meta_objset, 5443 spa->spa_feat_for_write_obj); 5444 zap_cursor_retrieve(&zc, &za) == 0; 5445 zap_cursor_advance(&zc)) { 5446 ASSERT(za.za_integer_length == sizeof (uint64_t) && 5447 za.za_num_integers == 1); 5448 VERIFY0(nvlist_add_uint64(features, za.za_name, 5449 za.za_first_integer)); 5450 } 5451 zap_cursor_fini(&zc); 5452 } 5453 } 5454 5455 static void 5456 spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features) 5457 { 5458 int i; 5459 5460 for (i = 0; i < SPA_FEATURES; i++) { 5461 zfeature_info_t feature = spa_feature_table[i]; 5462 uint64_t refcount; 5463 5464 if (feature_get_refcount(spa, &feature, &refcount) != 0) 5465 continue; 5466 5467 VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount)); 5468 } 5469 } 5470 5471 /* 5472 * Store a list of pool features and their reference counts in the 5473 * config. 5474 * 5475 * The first time this is called on a spa, allocate a new nvlist, fetch 5476 * the pool features and reference counts from disk, then save the list 5477 * in the spa. In subsequent calls on the same spa use the saved nvlist 5478 * and refresh its values from the cached reference counts. This 5479 * ensures we don't block here on I/O on a suspended pool so 'zpool 5480 * clear' can resume the pool. 5481 */ 5482 static void 5483 spa_add_feature_stats(spa_t *spa, nvlist_t *config) 5484 { 5485 nvlist_t *features; 5486 5487 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 5488 5489 mutex_enter(&spa->spa_feat_stats_lock); 5490 features = spa->spa_feat_stats; 5491 5492 if (features != NULL) { 5493 spa_feature_stats_from_cache(spa, features); 5494 } else { 5495 VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP)); 5496 spa->spa_feat_stats = features; 5497 spa_feature_stats_from_disk(spa, features); 5498 } 5499 5500 VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 5501 features)); 5502 5503 mutex_exit(&spa->spa_feat_stats_lock); 5504 } 5505 5506 int 5507 spa_get_stats(const char *name, nvlist_t **config, 5508 char *altroot, size_t buflen) 5509 { 5510 int error; 5511 spa_t *spa; 5512 5513 *config = NULL; 5514 error = spa_open_common(name, &spa, FTAG, NULL, config); 5515 5516 if (spa != NULL) { 5517 /* 5518 * This still leaves a window of inconsistency where the spares 5519 * or l2cache devices could change and the config would be 5520 * self-inconsistent. 5521 */ 5522 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5523 5524 if (*config != NULL) { 5525 uint64_t loadtimes[2]; 5526 5527 loadtimes[0] = spa->spa_loaded_ts.tv_sec; 5528 loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 5529 fnvlist_add_uint64_array(*config, 5530 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2); 5531 5532 fnvlist_add_uint64(*config, 5533 ZPOOL_CONFIG_ERRCOUNT, 5534 spa_get_errlog_size(spa)); 5535 5536 if (spa_suspended(spa)) { 5537 fnvlist_add_uint64(*config, 5538 ZPOOL_CONFIG_SUSPENDED, 5539 spa->spa_failmode); 5540 fnvlist_add_uint64(*config, 5541 ZPOOL_CONFIG_SUSPENDED_REASON, 5542 spa->spa_suspended); 5543 } 5544 5545 spa_add_spares(spa, *config); 5546 spa_add_l2cache(spa, *config); 5547 spa_add_feature_stats(spa, *config); 5548 } 5549 } 5550 5551 /* 5552 * We want to get the alternate root even for faulted pools, so we cheat 5553 * and call spa_lookup() directly. 5554 */ 5555 if (altroot) { 5556 if (spa == NULL) { 5557 mutex_enter(&spa_namespace_lock); 5558 spa = spa_lookup(name); 5559 if (spa) 5560 spa_altroot(spa, altroot, buflen); 5561 else 5562 altroot[0] = '\0'; 5563 spa = NULL; 5564 mutex_exit(&spa_namespace_lock); 5565 } else { 5566 spa_altroot(spa, altroot, buflen); 5567 } 5568 } 5569 5570 if (spa != NULL) { 5571 spa_config_exit(spa, SCL_CONFIG, FTAG); 5572 spa_close(spa, FTAG); 5573 } 5574 5575 return (error); 5576 } 5577 5578 /* 5579 * Validate that the auxiliary device array is well formed. We must have an 5580 * array of nvlists, each which describes a valid leaf vdev. If this is an 5581 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 5582 * specified, as long as they are well-formed. 5583 */ 5584 static int 5585 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 5586 spa_aux_vdev_t *sav, const char *config, uint64_t version, 5587 vdev_labeltype_t label) 5588 { 5589 nvlist_t **dev; 5590 uint_t i, ndev; 5591 vdev_t *vd; 5592 int error; 5593 5594 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 5595 5596 /* 5597 * It's acceptable to have no devs specified. 5598 */ 5599 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 5600 return (0); 5601 5602 if (ndev == 0) 5603 return (SET_ERROR(EINVAL)); 5604 5605 /* 5606 * Make sure the pool is formatted with a version that supports this 5607 * device type. 5608 */ 5609 if (spa_version(spa) < version) 5610 return (SET_ERROR(ENOTSUP)); 5611 5612 /* 5613 * Set the pending device list so we correctly handle device in-use 5614 * checking. 5615 */ 5616 sav->sav_pending = dev; 5617 sav->sav_npending = ndev; 5618 5619 for (i = 0; i < ndev; i++) { 5620 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 5621 mode)) != 0) 5622 goto out; 5623 5624 if (!vd->vdev_ops->vdev_op_leaf) { 5625 vdev_free(vd); 5626 error = SET_ERROR(EINVAL); 5627 goto out; 5628 } 5629 5630 vd->vdev_top = vd; 5631 5632 if ((error = vdev_open(vd)) == 0 && 5633 (error = vdev_label_init(vd, crtxg, label)) == 0) { 5634 fnvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 5635 vd->vdev_guid); 5636 } 5637 5638 vdev_free(vd); 5639 5640 if (error && 5641 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 5642 goto out; 5643 else 5644 error = 0; 5645 } 5646 5647 out: 5648 sav->sav_pending = NULL; 5649 sav->sav_npending = 0; 5650 return (error); 5651 } 5652 5653 static int 5654 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 5655 { 5656 int error; 5657 5658 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 5659 5660 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 5661 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 5662 VDEV_LABEL_SPARE)) != 0) { 5663 return (error); 5664 } 5665 5666 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 5667 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 5668 VDEV_LABEL_L2CACHE)); 5669 } 5670 5671 static void 5672 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 5673 const char *config) 5674 { 5675 int i; 5676 5677 if (sav->sav_config != NULL) { 5678 nvlist_t **olddevs; 5679 uint_t oldndevs; 5680 nvlist_t **newdevs; 5681 5682 /* 5683 * Generate new dev list by concatenating with the 5684 * current dev list. 5685 */ 5686 VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, config, 5687 &olddevs, &oldndevs)); 5688 5689 newdevs = kmem_alloc(sizeof (void *) * 5690 (ndevs + oldndevs), KM_SLEEP); 5691 for (i = 0; i < oldndevs; i++) 5692 newdevs[i] = fnvlist_dup(olddevs[i]); 5693 for (i = 0; i < ndevs; i++) 5694 newdevs[i + oldndevs] = fnvlist_dup(devs[i]); 5695 5696 fnvlist_remove(sav->sav_config, config); 5697 5698 fnvlist_add_nvlist_array(sav->sav_config, config, 5699 (const nvlist_t * const *)newdevs, ndevs + oldndevs); 5700 for (i = 0; i < oldndevs + ndevs; i++) 5701 nvlist_free(newdevs[i]); 5702 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 5703 } else { 5704 /* 5705 * Generate a new dev list. 5706 */ 5707 sav->sav_config = fnvlist_alloc(); 5708 fnvlist_add_nvlist_array(sav->sav_config, config, 5709 (const nvlist_t * const *)devs, ndevs); 5710 } 5711 } 5712 5713 /* 5714 * Stop and drop level 2 ARC devices 5715 */ 5716 void 5717 spa_l2cache_drop(spa_t *spa) 5718 { 5719 vdev_t *vd; 5720 int i; 5721 spa_aux_vdev_t *sav = &spa->spa_l2cache; 5722 5723 for (i = 0; i < sav->sav_count; i++) { 5724 uint64_t pool; 5725 5726 vd = sav->sav_vdevs[i]; 5727 ASSERT(vd != NULL); 5728 5729 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 5730 pool != 0ULL && l2arc_vdev_present(vd)) 5731 l2arc_remove_vdev(vd); 5732 } 5733 } 5734 5735 /* 5736 * Verify encryption parameters for spa creation. If we are encrypting, we must 5737 * have the encryption feature flag enabled. 5738 */ 5739 static int 5740 spa_create_check_encryption_params(dsl_crypto_params_t *dcp, 5741 boolean_t has_encryption) 5742 { 5743 if (dcp->cp_crypt != ZIO_CRYPT_OFF && 5744 dcp->cp_crypt != ZIO_CRYPT_INHERIT && 5745 !has_encryption) 5746 return (SET_ERROR(ENOTSUP)); 5747 5748 return (dmu_objset_create_crypt_check(NULL, dcp, NULL)); 5749 } 5750 5751 /* 5752 * Pool Creation 5753 */ 5754 int 5755 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 5756 nvlist_t *zplprops, dsl_crypto_params_t *dcp) 5757 { 5758 spa_t *spa; 5759 char *altroot = NULL; 5760 vdev_t *rvd; 5761 dsl_pool_t *dp; 5762 dmu_tx_t *tx; 5763 int error = 0; 5764 uint64_t txg = TXG_INITIAL; 5765 nvlist_t **spares, **l2cache; 5766 uint_t nspares, nl2cache; 5767 uint64_t version, obj, ndraid = 0; 5768 boolean_t has_features; 5769 boolean_t has_encryption; 5770 boolean_t has_allocclass; 5771 spa_feature_t feat; 5772 char *feat_name; 5773 char *poolname; 5774 nvlist_t *nvl; 5775 5776 if (props == NULL || 5777 nvlist_lookup_string(props, "tname", &poolname) != 0) 5778 poolname = (char *)pool; 5779 5780 /* 5781 * If this pool already exists, return failure. 5782 */ 5783 mutex_enter(&spa_namespace_lock); 5784 if (spa_lookup(poolname) != NULL) { 5785 mutex_exit(&spa_namespace_lock); 5786 return (SET_ERROR(EEXIST)); 5787 } 5788 5789 /* 5790 * Allocate a new spa_t structure. 5791 */ 5792 nvl = fnvlist_alloc(); 5793 fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool); 5794 (void) nvlist_lookup_string(props, 5795 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 5796 spa = spa_add(poolname, nvl, altroot); 5797 fnvlist_free(nvl); 5798 spa_activate(spa, spa_mode_global); 5799 5800 if (props && (error = spa_prop_validate(spa, props))) { 5801 spa_deactivate(spa); 5802 spa_remove(spa); 5803 mutex_exit(&spa_namespace_lock); 5804 return (error); 5805 } 5806 5807 /* 5808 * Temporary pool names should never be written to disk. 5809 */ 5810 if (poolname != pool) 5811 spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME; 5812 5813 has_features = B_FALSE; 5814 has_encryption = B_FALSE; 5815 has_allocclass = B_FALSE; 5816 for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 5817 elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 5818 if (zpool_prop_feature(nvpair_name(elem))) { 5819 has_features = B_TRUE; 5820 5821 feat_name = strchr(nvpair_name(elem), '@') + 1; 5822 VERIFY0(zfeature_lookup_name(feat_name, &feat)); 5823 if (feat == SPA_FEATURE_ENCRYPTION) 5824 has_encryption = B_TRUE; 5825 if (feat == SPA_FEATURE_ALLOCATION_CLASSES) 5826 has_allocclass = B_TRUE; 5827 } 5828 } 5829 5830 /* verify encryption params, if they were provided */ 5831 if (dcp != NULL) { 5832 error = spa_create_check_encryption_params(dcp, has_encryption); 5833 if (error != 0) { 5834 spa_deactivate(spa); 5835 spa_remove(spa); 5836 mutex_exit(&spa_namespace_lock); 5837 return (error); 5838 } 5839 } 5840 if (!has_allocclass && zfs_special_devs(nvroot, NULL)) { 5841 spa_deactivate(spa); 5842 spa_remove(spa); 5843 mutex_exit(&spa_namespace_lock); 5844 return (ENOTSUP); 5845 } 5846 5847 if (has_features || nvlist_lookup_uint64(props, 5848 zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 5849 version = SPA_VERSION; 5850 } 5851 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 5852 5853 spa->spa_first_txg = txg; 5854 spa->spa_uberblock.ub_txg = txg - 1; 5855 spa->spa_uberblock.ub_version = version; 5856 spa->spa_ubsync = spa->spa_uberblock; 5857 spa->spa_load_state = SPA_LOAD_CREATE; 5858 spa->spa_removing_phys.sr_state = DSS_NONE; 5859 spa->spa_removing_phys.sr_removing_vdev = -1; 5860 spa->spa_removing_phys.sr_prev_indirect_vdev = -1; 5861 spa->spa_indirect_vdevs_loaded = B_TRUE; 5862 5863 /* 5864 * Create "The Godfather" zio to hold all async IOs 5865 */ 5866 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 5867 KM_SLEEP); 5868 for (int i = 0; i < max_ncpus; i++) { 5869 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 5870 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 5871 ZIO_FLAG_GODFATHER); 5872 } 5873 5874 /* 5875 * Create the root vdev. 5876 */ 5877 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5878 5879 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 5880 5881 ASSERT(error != 0 || rvd != NULL); 5882 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 5883 5884 if (error == 0 && !zfs_allocatable_devs(nvroot)) 5885 error = SET_ERROR(EINVAL); 5886 5887 if (error == 0 && 5888 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 5889 (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 && 5890 (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { 5891 /* 5892 * instantiate the metaslab groups (this will dirty the vdevs) 5893 * we can no longer error exit past this point 5894 */ 5895 for (int c = 0; error == 0 && c < rvd->vdev_children; c++) { 5896 vdev_t *vd = rvd->vdev_child[c]; 5897 5898 vdev_metaslab_set_size(vd); 5899 vdev_expand(vd, txg); 5900 } 5901 } 5902 5903 spa_config_exit(spa, SCL_ALL, FTAG); 5904 5905 if (error != 0) { 5906 spa_unload(spa); 5907 spa_deactivate(spa); 5908 spa_remove(spa); 5909 mutex_exit(&spa_namespace_lock); 5910 return (error); 5911 } 5912 5913 /* 5914 * Get the list of spares, if specified. 5915 */ 5916 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 5917 &spares, &nspares) == 0) { 5918 spa->spa_spares.sav_config = fnvlist_alloc(); 5919 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 5920 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, 5921 nspares); 5922 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5923 spa_load_spares(spa); 5924 spa_config_exit(spa, SCL_ALL, FTAG); 5925 spa->spa_spares.sav_sync = B_TRUE; 5926 } 5927 5928 /* 5929 * Get the list of level 2 cache devices, if specified. 5930 */ 5931 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 5932 &l2cache, &nl2cache) == 0) { 5933 VERIFY0(nvlist_alloc(&spa->spa_l2cache.sav_config, 5934 NV_UNIQUE_NAME, KM_SLEEP)); 5935 fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 5936 ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, 5937 nl2cache); 5938 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5939 spa_load_l2cache(spa); 5940 spa_config_exit(spa, SCL_ALL, FTAG); 5941 spa->spa_l2cache.sav_sync = B_TRUE; 5942 } 5943 5944 spa->spa_is_initializing = B_TRUE; 5945 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, dcp, txg); 5946 spa->spa_is_initializing = B_FALSE; 5947 5948 /* 5949 * Create DDTs (dedup tables). 5950 */ 5951 ddt_create(spa); 5952 5953 spa_update_dspace(spa); 5954 5955 tx = dmu_tx_create_assigned(dp, txg); 5956 5957 /* 5958 * Create the pool's history object. 5959 */ 5960 if (version >= SPA_VERSION_ZPOOL_HISTORY && !spa->spa_history) 5961 spa_history_create_obj(spa, tx); 5962 5963 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE); 5964 spa_history_log_version(spa, "create", tx); 5965 5966 /* 5967 * Create the pool config object. 5968 */ 5969 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 5970 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 5971 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 5972 5973 if (zap_add(spa->spa_meta_objset, 5974 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 5975 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 5976 cmn_err(CE_PANIC, "failed to add pool config"); 5977 } 5978 5979 if (zap_add(spa->spa_meta_objset, 5980 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 5981 sizeof (uint64_t), 1, &version, tx) != 0) { 5982 cmn_err(CE_PANIC, "failed to add pool version"); 5983 } 5984 5985 /* Newly created pools with the right version are always deflated. */ 5986 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 5987 spa->spa_deflate = TRUE; 5988 if (zap_add(spa->spa_meta_objset, 5989 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 5990 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 5991 cmn_err(CE_PANIC, "failed to add deflate"); 5992 } 5993 } 5994 5995 /* 5996 * Create the deferred-free bpobj. Turn off compression 5997 * because sync-to-convergence takes longer if the blocksize 5998 * keeps changing. 5999 */ 6000 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 6001 dmu_object_set_compress(spa->spa_meta_objset, obj, 6002 ZIO_COMPRESS_OFF, tx); 6003 if (zap_add(spa->spa_meta_objset, 6004 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 6005 sizeof (uint64_t), 1, &obj, tx) != 0) { 6006 cmn_err(CE_PANIC, "failed to add bpobj"); 6007 } 6008 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 6009 spa->spa_meta_objset, obj)); 6010 6011 /* 6012 * Generate some random noise for salted checksums to operate on. 6013 */ 6014 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 6015 sizeof (spa->spa_cksum_salt.zcs_bytes)); 6016 6017 /* 6018 * Set pool properties. 6019 */ 6020 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 6021 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 6022 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 6023 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 6024 spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST); 6025 spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM); 6026 6027 if (props != NULL) { 6028 spa_configfile_set(spa, props, B_FALSE); 6029 spa_sync_props(props, tx); 6030 } 6031 6032 for (int i = 0; i < ndraid; i++) 6033 spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); 6034 6035 dmu_tx_commit(tx); 6036 6037 spa->spa_sync_on = B_TRUE; 6038 txg_sync_start(dp); 6039 mmp_thread_start(spa); 6040 txg_wait_synced(dp, txg); 6041 6042 spa_spawn_aux_threads(spa); 6043 6044 spa_write_cachefile(spa, B_FALSE, B_TRUE); 6045 6046 /* 6047 * Don't count references from objsets that are already closed 6048 * and are making their way through the eviction process. 6049 */ 6050 spa_evicting_os_wait(spa); 6051 spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); 6052 spa->spa_load_state = SPA_LOAD_NONE; 6053 6054 spa_import_os(spa); 6055 6056 mutex_exit(&spa_namespace_lock); 6057 6058 return (0); 6059 } 6060 6061 /* 6062 * Import a non-root pool into the system. 6063 */ 6064 int 6065 spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 6066 { 6067 spa_t *spa; 6068 char *altroot = NULL; 6069 spa_load_state_t state = SPA_LOAD_IMPORT; 6070 zpool_load_policy_t policy; 6071 spa_mode_t mode = spa_mode_global; 6072 uint64_t readonly = B_FALSE; 6073 int error; 6074 nvlist_t *nvroot; 6075 nvlist_t **spares, **l2cache; 6076 uint_t nspares, nl2cache; 6077 6078 /* 6079 * If a pool with this name exists, return failure. 6080 */ 6081 mutex_enter(&spa_namespace_lock); 6082 if (spa_lookup(pool) != NULL) { 6083 mutex_exit(&spa_namespace_lock); 6084 return (SET_ERROR(EEXIST)); 6085 } 6086 6087 /* 6088 * Create and initialize the spa structure. 6089 */ 6090 (void) nvlist_lookup_string(props, 6091 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 6092 (void) nvlist_lookup_uint64(props, 6093 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 6094 if (readonly) 6095 mode = SPA_MODE_READ; 6096 spa = spa_add(pool, config, altroot); 6097 spa->spa_import_flags = flags; 6098 6099 /* 6100 * Verbatim import - Take a pool and insert it into the namespace 6101 * as if it had been loaded at boot. 6102 */ 6103 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 6104 if (props != NULL) 6105 spa_configfile_set(spa, props, B_FALSE); 6106 6107 spa_write_cachefile(spa, B_FALSE, B_TRUE); 6108 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); 6109 zfs_dbgmsg("spa_import: verbatim import of %s", pool); 6110 mutex_exit(&spa_namespace_lock); 6111 return (0); 6112 } 6113 6114 spa_activate(spa, mode); 6115 6116 /* 6117 * Don't start async tasks until we know everything is healthy. 6118 */ 6119 spa_async_suspend(spa); 6120 6121 zpool_get_load_policy(config, &policy); 6122 if (policy.zlp_rewind & ZPOOL_DO_REWIND) 6123 state = SPA_LOAD_RECOVER; 6124 6125 spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT; 6126 6127 if (state != SPA_LOAD_RECOVER) { 6128 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 6129 zfs_dbgmsg("spa_import: importing %s", pool); 6130 } else { 6131 zfs_dbgmsg("spa_import: importing %s, max_txg=%lld " 6132 "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg); 6133 } 6134 error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind); 6135 6136 /* 6137 * Propagate anything learned while loading the pool and pass it 6138 * back to caller (i.e. rewind info, missing devices, etc). 6139 */ 6140 fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info); 6141 6142 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6143 /* 6144 * Toss any existing sparelist, as it doesn't have any validity 6145 * anymore, and conflicts with spa_has_spare(). 6146 */ 6147 if (spa->spa_spares.sav_config) { 6148 nvlist_free(spa->spa_spares.sav_config); 6149 spa->spa_spares.sav_config = NULL; 6150 spa_load_spares(spa); 6151 } 6152 if (spa->spa_l2cache.sav_config) { 6153 nvlist_free(spa->spa_l2cache.sav_config); 6154 spa->spa_l2cache.sav_config = NULL; 6155 spa_load_l2cache(spa); 6156 } 6157 6158 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 6159 spa_config_exit(spa, SCL_ALL, FTAG); 6160 6161 if (props != NULL) 6162 spa_configfile_set(spa, props, B_FALSE); 6163 6164 if (error != 0 || (props && spa_writeable(spa) && 6165 (error = spa_prop_set(spa, props)))) { 6166 spa_unload(spa); 6167 spa_deactivate(spa); 6168 spa_remove(spa); 6169 mutex_exit(&spa_namespace_lock); 6170 return (error); 6171 } 6172 6173 spa_async_resume(spa); 6174 6175 /* 6176 * Override any spares and level 2 cache devices as specified by 6177 * the user, as these may have correct device names/devids, etc. 6178 */ 6179 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 6180 &spares, &nspares) == 0) { 6181 if (spa->spa_spares.sav_config) 6182 fnvlist_remove(spa->spa_spares.sav_config, 6183 ZPOOL_CONFIG_SPARES); 6184 else 6185 spa->spa_spares.sav_config = fnvlist_alloc(); 6186 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 6187 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, 6188 nspares); 6189 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6190 spa_load_spares(spa); 6191 spa_config_exit(spa, SCL_ALL, FTAG); 6192 spa->spa_spares.sav_sync = B_TRUE; 6193 } 6194 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 6195 &l2cache, &nl2cache) == 0) { 6196 if (spa->spa_l2cache.sav_config) 6197 fnvlist_remove(spa->spa_l2cache.sav_config, 6198 ZPOOL_CONFIG_L2CACHE); 6199 else 6200 spa->spa_l2cache.sav_config = fnvlist_alloc(); 6201 fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 6202 ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, 6203 nl2cache); 6204 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6205 spa_load_l2cache(spa); 6206 spa_config_exit(spa, SCL_ALL, FTAG); 6207 spa->spa_l2cache.sav_sync = B_TRUE; 6208 } 6209 6210 /* 6211 * Check for any removed devices. 6212 */ 6213 if (spa->spa_autoreplace) { 6214 spa_aux_check_removed(&spa->spa_spares); 6215 spa_aux_check_removed(&spa->spa_l2cache); 6216 } 6217 6218 if (spa_writeable(spa)) { 6219 /* 6220 * Update the config cache to include the newly-imported pool. 6221 */ 6222 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 6223 } 6224 6225 /* 6226 * It's possible that the pool was expanded while it was exported. 6227 * We kick off an async task to handle this for us. 6228 */ 6229 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 6230 6231 spa_history_log_version(spa, "import", NULL); 6232 6233 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); 6234 6235 mutex_exit(&spa_namespace_lock); 6236 6237 zvol_create_minors_recursive(pool); 6238 6239 spa_import_os(spa); 6240 6241 return (0); 6242 } 6243 6244 nvlist_t * 6245 spa_tryimport(nvlist_t *tryconfig) 6246 { 6247 nvlist_t *config = NULL; 6248 char *poolname, *cachefile; 6249 spa_t *spa; 6250 uint64_t state; 6251 int error; 6252 zpool_load_policy_t policy; 6253 6254 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 6255 return (NULL); 6256 6257 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 6258 return (NULL); 6259 6260 /* 6261 * Create and initialize the spa structure. 6262 */ 6263 mutex_enter(&spa_namespace_lock); 6264 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 6265 spa_activate(spa, SPA_MODE_READ); 6266 6267 /* 6268 * Rewind pool if a max txg was provided. 6269 */ 6270 zpool_get_load_policy(spa->spa_config, &policy); 6271 if (policy.zlp_txg != UINT64_MAX) { 6272 spa->spa_load_max_txg = policy.zlp_txg; 6273 spa->spa_extreme_rewind = B_TRUE; 6274 zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld", 6275 poolname, (longlong_t)policy.zlp_txg); 6276 } else { 6277 zfs_dbgmsg("spa_tryimport: importing %s", poolname); 6278 } 6279 6280 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile) 6281 == 0) { 6282 zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile); 6283 spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; 6284 } else { 6285 spa->spa_config_source = SPA_CONFIG_SRC_SCAN; 6286 } 6287 6288 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING); 6289 6290 /* 6291 * If 'tryconfig' was at least parsable, return the current config. 6292 */ 6293 if (spa->spa_root_vdev != NULL) { 6294 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 6295 fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, poolname); 6296 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state); 6297 fnvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 6298 spa->spa_uberblock.ub_timestamp); 6299 fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 6300 spa->spa_load_info); 6301 fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, 6302 spa->spa_errata); 6303 6304 /* 6305 * If the bootfs property exists on this pool then we 6306 * copy it out so that external consumers can tell which 6307 * pools are bootable. 6308 */ 6309 if ((!error || error == EEXIST) && spa->spa_bootfs) { 6310 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 6311 6312 /* 6313 * We have to play games with the name since the 6314 * pool was opened as TRYIMPORT_NAME. 6315 */ 6316 if (dsl_dsobj_to_dsname(spa_name(spa), 6317 spa->spa_bootfs, tmpname) == 0) { 6318 char *cp; 6319 char *dsname; 6320 6321 dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 6322 6323 cp = strchr(tmpname, '/'); 6324 if (cp == NULL) { 6325 (void) strlcpy(dsname, tmpname, 6326 MAXPATHLEN); 6327 } else { 6328 (void) snprintf(dsname, MAXPATHLEN, 6329 "%s/%s", poolname, ++cp); 6330 } 6331 fnvlist_add_string(config, ZPOOL_CONFIG_BOOTFS, 6332 dsname); 6333 kmem_free(dsname, MAXPATHLEN); 6334 } 6335 kmem_free(tmpname, MAXPATHLEN); 6336 } 6337 6338 /* 6339 * Add the list of hot spares and level 2 cache devices. 6340 */ 6341 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6342 spa_add_spares(spa, config); 6343 spa_add_l2cache(spa, config); 6344 spa_config_exit(spa, SCL_CONFIG, FTAG); 6345 } 6346 6347 spa_unload(spa); 6348 spa_deactivate(spa); 6349 spa_remove(spa); 6350 mutex_exit(&spa_namespace_lock); 6351 6352 return (config); 6353 } 6354 6355 /* 6356 * Pool export/destroy 6357 * 6358 * The act of destroying or exporting a pool is very simple. We make sure there 6359 * is no more pending I/O and any references to the pool are gone. Then, we 6360 * update the pool state and sync all the labels to disk, removing the 6361 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 6362 * we don't sync the labels or remove the configuration cache. 6363 */ 6364 static int 6365 spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, 6366 boolean_t force, boolean_t hardforce) 6367 { 6368 int error; 6369 spa_t *spa; 6370 6371 if (oldconfig) 6372 *oldconfig = NULL; 6373 6374 if (!(spa_mode_global & SPA_MODE_WRITE)) 6375 return (SET_ERROR(EROFS)); 6376 6377 mutex_enter(&spa_namespace_lock); 6378 if ((spa = spa_lookup(pool)) == NULL) { 6379 mutex_exit(&spa_namespace_lock); 6380 return (SET_ERROR(ENOENT)); 6381 } 6382 6383 if (spa->spa_is_exporting) { 6384 /* the pool is being exported by another thread */ 6385 mutex_exit(&spa_namespace_lock); 6386 return (SET_ERROR(ZFS_ERR_EXPORT_IN_PROGRESS)); 6387 } 6388 spa->spa_is_exporting = B_TRUE; 6389 6390 /* 6391 * Put a hold on the pool, drop the namespace lock, stop async tasks, 6392 * reacquire the namespace lock, and see if we can export. 6393 */ 6394 spa_open_ref(spa, FTAG); 6395 mutex_exit(&spa_namespace_lock); 6396 spa_async_suspend(spa); 6397 if (spa->spa_zvol_taskq) { 6398 zvol_remove_minors(spa, spa_name(spa), B_TRUE); 6399 taskq_wait(spa->spa_zvol_taskq); 6400 } 6401 mutex_enter(&spa_namespace_lock); 6402 spa_close(spa, FTAG); 6403 6404 if (spa->spa_state == POOL_STATE_UNINITIALIZED) 6405 goto export_spa; 6406 /* 6407 * The pool will be in core if it's openable, in which case we can 6408 * modify its state. Objsets may be open only because they're dirty, 6409 * so we have to force it to sync before checking spa_refcnt. 6410 */ 6411 if (spa->spa_sync_on) { 6412 txg_wait_synced(spa->spa_dsl_pool, 0); 6413 spa_evicting_os_wait(spa); 6414 } 6415 6416 /* 6417 * A pool cannot be exported or destroyed if there are active 6418 * references. If we are resetting a pool, allow references by 6419 * fault injection handlers. 6420 */ 6421 if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0)) { 6422 error = SET_ERROR(EBUSY); 6423 goto fail; 6424 } 6425 6426 if (spa->spa_sync_on) { 6427 /* 6428 * A pool cannot be exported if it has an active shared spare. 6429 * This is to prevent other pools stealing the active spare 6430 * from an exported pool. At user's own will, such pool can 6431 * be forcedly exported. 6432 */ 6433 if (!force && new_state == POOL_STATE_EXPORTED && 6434 spa_has_active_shared_spare(spa)) { 6435 error = SET_ERROR(EXDEV); 6436 goto fail; 6437 } 6438 6439 /* 6440 * We're about to export or destroy this pool. Make sure 6441 * we stop all initialization and trim activity here before 6442 * we set the spa_final_txg. This will ensure that all 6443 * dirty data resulting from the initialization is 6444 * committed to disk before we unload the pool. 6445 */ 6446 if (spa->spa_root_vdev != NULL) { 6447 vdev_t *rvd = spa->spa_root_vdev; 6448 vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE); 6449 vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE); 6450 vdev_autotrim_stop_all(spa); 6451 vdev_rebuild_stop_all(spa); 6452 } 6453 6454 /* 6455 * We want this to be reflected on every label, 6456 * so mark them all dirty. spa_unload() will do the 6457 * final sync that pushes these changes out. 6458 */ 6459 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 6460 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6461 spa->spa_state = new_state; 6462 vdev_config_dirty(spa->spa_root_vdev); 6463 spa_config_exit(spa, SCL_ALL, FTAG); 6464 } 6465 6466 /* 6467 * If the log space map feature is enabled and the pool is 6468 * getting exported (but not destroyed), we want to spend some 6469 * time flushing as many metaslabs as we can in an attempt to 6470 * destroy log space maps and save import time. This has to be 6471 * done before we set the spa_final_txg, otherwise 6472 * spa_sync() -> spa_flush_metaslabs() may dirty the final TXGs. 6473 * spa_should_flush_logs_on_unload() should be called after 6474 * spa_state has been set to the new_state. 6475 */ 6476 if (spa_should_flush_logs_on_unload(spa)) 6477 spa_unload_log_sm_flush_all(spa); 6478 6479 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 6480 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6481 spa->spa_final_txg = spa_last_synced_txg(spa) + 6482 TXG_DEFER_SIZE + 1; 6483 spa_config_exit(spa, SCL_ALL, FTAG); 6484 } 6485 } 6486 6487 export_spa: 6488 spa_export_os(spa); 6489 6490 if (new_state == POOL_STATE_DESTROYED) 6491 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY); 6492 else if (new_state == POOL_STATE_EXPORTED) 6493 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_EXPORT); 6494 6495 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 6496 spa_unload(spa); 6497 spa_deactivate(spa); 6498 } 6499 6500 if (oldconfig && spa->spa_config) 6501 *oldconfig = fnvlist_dup(spa->spa_config); 6502 6503 if (new_state != POOL_STATE_UNINITIALIZED) { 6504 if (!hardforce) 6505 spa_write_cachefile(spa, B_TRUE, B_TRUE); 6506 spa_remove(spa); 6507 } else { 6508 /* 6509 * If spa_remove() is not called for this spa_t and 6510 * there is any possibility that it can be reused, 6511 * we make sure to reset the exporting flag. 6512 */ 6513 spa->spa_is_exporting = B_FALSE; 6514 } 6515 6516 mutex_exit(&spa_namespace_lock); 6517 return (0); 6518 6519 fail: 6520 spa->spa_is_exporting = B_FALSE; 6521 spa_async_resume(spa); 6522 mutex_exit(&spa_namespace_lock); 6523 return (error); 6524 } 6525 6526 /* 6527 * Destroy a storage pool. 6528 */ 6529 int 6530 spa_destroy(const char *pool) 6531 { 6532 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 6533 B_FALSE, B_FALSE)); 6534 } 6535 6536 /* 6537 * Export a storage pool. 6538 */ 6539 int 6540 spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force, 6541 boolean_t hardforce) 6542 { 6543 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 6544 force, hardforce)); 6545 } 6546 6547 /* 6548 * Similar to spa_export(), this unloads the spa_t without actually removing it 6549 * from the namespace in any way. 6550 */ 6551 int 6552 spa_reset(const char *pool) 6553 { 6554 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 6555 B_FALSE, B_FALSE)); 6556 } 6557 6558 /* 6559 * ========================================================================== 6560 * Device manipulation 6561 * ========================================================================== 6562 */ 6563 6564 /* 6565 * This is called as a synctask to increment the draid feature flag 6566 */ 6567 static void 6568 spa_draid_feature_incr(void *arg, dmu_tx_t *tx) 6569 { 6570 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6571 int draid = (int)(uintptr_t)arg; 6572 6573 for (int c = 0; c < draid; c++) 6574 spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); 6575 } 6576 6577 /* 6578 * Add a device to a storage pool. 6579 */ 6580 int 6581 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 6582 { 6583 uint64_t txg, ndraid = 0; 6584 int error; 6585 vdev_t *rvd = spa->spa_root_vdev; 6586 vdev_t *vd, *tvd; 6587 nvlist_t **spares, **l2cache; 6588 uint_t nspares, nl2cache; 6589 6590 ASSERT(spa_writeable(spa)); 6591 6592 txg = spa_vdev_enter(spa); 6593 6594 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 6595 VDEV_ALLOC_ADD)) != 0) 6596 return (spa_vdev_exit(spa, NULL, txg, error)); 6597 6598 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 6599 6600 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 6601 &nspares) != 0) 6602 nspares = 0; 6603 6604 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 6605 &nl2cache) != 0) 6606 nl2cache = 0; 6607 6608 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 6609 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 6610 6611 if (vd->vdev_children != 0 && 6612 (error = vdev_create(vd, txg, B_FALSE)) != 0) { 6613 return (spa_vdev_exit(spa, vd, txg, error)); 6614 } 6615 6616 /* 6617 * The virtual dRAID spares must be added after vdev tree is created 6618 * and the vdev guids are generated. The guid of their associated 6619 * dRAID is stored in the config and used when opening the spare. 6620 */ 6621 if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid, 6622 rvd->vdev_children)) == 0) { 6623 if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot, 6624 ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) 6625 nspares = 0; 6626 } else { 6627 return (spa_vdev_exit(spa, vd, txg, error)); 6628 } 6629 6630 /* 6631 * We must validate the spares and l2cache devices after checking the 6632 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 6633 */ 6634 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 6635 return (spa_vdev_exit(spa, vd, txg, error)); 6636 6637 /* 6638 * If we are in the middle of a device removal, we can only add 6639 * devices which match the existing devices in the pool. 6640 * If we are in the middle of a removal, or have some indirect 6641 * vdevs, we can not add raidz or dRAID top levels. 6642 */ 6643 if (spa->spa_vdev_removal != NULL || 6644 spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { 6645 for (int c = 0; c < vd->vdev_children; c++) { 6646 tvd = vd->vdev_child[c]; 6647 if (spa->spa_vdev_removal != NULL && 6648 tvd->vdev_ashift != spa->spa_max_ashift) { 6649 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 6650 } 6651 /* Fail if top level vdev is raidz or a dRAID */ 6652 if (vdev_get_nparity(tvd) != 0) 6653 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 6654 6655 /* 6656 * Need the top level mirror to be 6657 * a mirror of leaf vdevs only 6658 */ 6659 if (tvd->vdev_ops == &vdev_mirror_ops) { 6660 for (uint64_t cid = 0; 6661 cid < tvd->vdev_children; cid++) { 6662 vdev_t *cvd = tvd->vdev_child[cid]; 6663 if (!cvd->vdev_ops->vdev_op_leaf) { 6664 return (spa_vdev_exit(spa, vd, 6665 txg, EINVAL)); 6666 } 6667 } 6668 } 6669 } 6670 } 6671 6672 for (int c = 0; c < vd->vdev_children; c++) { 6673 tvd = vd->vdev_child[c]; 6674 vdev_remove_child(vd, tvd); 6675 tvd->vdev_id = rvd->vdev_children; 6676 vdev_add_child(rvd, tvd); 6677 vdev_config_dirty(tvd); 6678 } 6679 6680 if (nspares != 0) { 6681 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 6682 ZPOOL_CONFIG_SPARES); 6683 spa_load_spares(spa); 6684 spa->spa_spares.sav_sync = B_TRUE; 6685 } 6686 6687 if (nl2cache != 0) { 6688 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 6689 ZPOOL_CONFIG_L2CACHE); 6690 spa_load_l2cache(spa); 6691 spa->spa_l2cache.sav_sync = B_TRUE; 6692 } 6693 6694 /* 6695 * We can't increment a feature while holding spa_vdev so we 6696 * have to do it in a synctask. 6697 */ 6698 if (ndraid != 0) { 6699 dmu_tx_t *tx; 6700 6701 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 6702 dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr, 6703 (void *)(uintptr_t)ndraid, tx); 6704 dmu_tx_commit(tx); 6705 } 6706 6707 /* 6708 * We have to be careful when adding new vdevs to an existing pool. 6709 * If other threads start allocating from these vdevs before we 6710 * sync the config cache, and we lose power, then upon reboot we may 6711 * fail to open the pool because there are DVAs that the config cache 6712 * can't translate. Therefore, we first add the vdevs without 6713 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 6714 * and then let spa_config_update() initialize the new metaslabs. 6715 * 6716 * spa_load() checks for added-but-not-initialized vdevs, so that 6717 * if we lose power at any point in this sequence, the remaining 6718 * steps will be completed the next time we load the pool. 6719 */ 6720 (void) spa_vdev_exit(spa, vd, txg, 0); 6721 6722 mutex_enter(&spa_namespace_lock); 6723 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 6724 spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD); 6725 mutex_exit(&spa_namespace_lock); 6726 6727 return (0); 6728 } 6729 6730 /* 6731 * Attach a device to a mirror. The arguments are the path to any device 6732 * in the mirror, and the nvroot for the new device. If the path specifies 6733 * a device that is not mirrored, we automatically insert the mirror vdev. 6734 * 6735 * If 'replacing' is specified, the new device is intended to replace the 6736 * existing device; in this case the two devices are made into their own 6737 * mirror using the 'replacing' vdev, which is functionally identical to 6738 * the mirror vdev (it actually reuses all the same ops) but has a few 6739 * extra rules: you can't attach to it after it's been created, and upon 6740 * completion of resilvering, the first disk (the one being replaced) 6741 * is automatically detached. 6742 * 6743 * If 'rebuild' is specified, then sequential reconstruction (a.ka. rebuild) 6744 * should be performed instead of traditional healing reconstruction. From 6745 * an administrators perspective these are both resilver operations. 6746 */ 6747 int 6748 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, 6749 int rebuild) 6750 { 6751 uint64_t txg, dtl_max_txg; 6752 vdev_t *rvd = spa->spa_root_vdev; 6753 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 6754 vdev_ops_t *pvops; 6755 char *oldvdpath, *newvdpath; 6756 int newvd_isspare; 6757 int error; 6758 6759 ASSERT(spa_writeable(spa)); 6760 6761 txg = spa_vdev_enter(spa); 6762 6763 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 6764 6765 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 6766 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 6767 error = (spa_has_checkpoint(spa)) ? 6768 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 6769 return (spa_vdev_exit(spa, NULL, txg, error)); 6770 } 6771 6772 if (rebuild) { 6773 if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) 6774 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 6775 6776 if (dsl_scan_resilvering(spa_get_dsl(spa))) 6777 return (spa_vdev_exit(spa, NULL, txg, 6778 ZFS_ERR_RESILVER_IN_PROGRESS)); 6779 } else { 6780 if (vdev_rebuild_active(rvd)) 6781 return (spa_vdev_exit(spa, NULL, txg, 6782 ZFS_ERR_REBUILD_IN_PROGRESS)); 6783 } 6784 6785 if (spa->spa_vdev_removal != NULL) 6786 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 6787 6788 if (oldvd == NULL) 6789 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 6790 6791 if (!oldvd->vdev_ops->vdev_op_leaf) 6792 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 6793 6794 pvd = oldvd->vdev_parent; 6795 6796 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 6797 VDEV_ALLOC_ATTACH)) != 0) 6798 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 6799 6800 if (newrootvd->vdev_children != 1) 6801 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 6802 6803 newvd = newrootvd->vdev_child[0]; 6804 6805 if (!newvd->vdev_ops->vdev_op_leaf) 6806 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 6807 6808 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 6809 return (spa_vdev_exit(spa, newrootvd, txg, error)); 6810 6811 /* 6812 * Spares can't replace logs 6813 */ 6814 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 6815 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6816 6817 /* 6818 * A dRAID spare can only replace a child of its parent dRAID vdev. 6819 */ 6820 if (newvd->vdev_ops == &vdev_draid_spare_ops && 6821 oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) { 6822 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6823 } 6824 6825 if (rebuild) { 6826 /* 6827 * For rebuilds, the top vdev must support reconstruction 6828 * using only space maps. This means the only allowable 6829 * vdevs types are the root vdev, a mirror, or dRAID. 6830 */ 6831 tvd = pvd; 6832 if (pvd->vdev_top != NULL) 6833 tvd = pvd->vdev_top; 6834 6835 if (tvd->vdev_ops != &vdev_mirror_ops && 6836 tvd->vdev_ops != &vdev_root_ops && 6837 tvd->vdev_ops != &vdev_draid_ops) { 6838 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6839 } 6840 } 6841 6842 if (!replacing) { 6843 /* 6844 * For attach, the only allowable parent is a mirror or the root 6845 * vdev. 6846 */ 6847 if (pvd->vdev_ops != &vdev_mirror_ops && 6848 pvd->vdev_ops != &vdev_root_ops) 6849 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6850 6851 pvops = &vdev_mirror_ops; 6852 } else { 6853 /* 6854 * Active hot spares can only be replaced by inactive hot 6855 * spares. 6856 */ 6857 if (pvd->vdev_ops == &vdev_spare_ops && 6858 oldvd->vdev_isspare && 6859 !spa_has_spare(spa, newvd->vdev_guid)) 6860 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6861 6862 /* 6863 * If the source is a hot spare, and the parent isn't already a 6864 * spare, then we want to create a new hot spare. Otherwise, we 6865 * want to create a replacing vdev. The user is not allowed to 6866 * attach to a spared vdev child unless the 'isspare' state is 6867 * the same (spare replaces spare, non-spare replaces 6868 * non-spare). 6869 */ 6870 if (pvd->vdev_ops == &vdev_replacing_ops && 6871 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 6872 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6873 } else if (pvd->vdev_ops == &vdev_spare_ops && 6874 newvd->vdev_isspare != oldvd->vdev_isspare) { 6875 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6876 } 6877 6878 if (newvd->vdev_isspare) 6879 pvops = &vdev_spare_ops; 6880 else 6881 pvops = &vdev_replacing_ops; 6882 } 6883 6884 /* 6885 * Make sure the new device is big enough. 6886 */ 6887 if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 6888 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 6889 6890 /* 6891 * The new device cannot have a higher alignment requirement 6892 * than the top-level vdev. 6893 */ 6894 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 6895 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6896 6897 /* 6898 * If this is an in-place replacement, update oldvd's path and devid 6899 * to make it distinguishable from newvd, and unopenable from now on. 6900 */ 6901 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 6902 spa_strfree(oldvd->vdev_path); 6903 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 6904 KM_SLEEP); 6905 (void) snprintf(oldvd->vdev_path, strlen(newvd->vdev_path) + 5, 6906 "%s/%s", newvd->vdev_path, "old"); 6907 if (oldvd->vdev_devid != NULL) { 6908 spa_strfree(oldvd->vdev_devid); 6909 oldvd->vdev_devid = NULL; 6910 } 6911 } 6912 6913 /* 6914 * If the parent is not a mirror, or if we're replacing, insert the new 6915 * mirror/replacing/spare vdev above oldvd. 6916 */ 6917 if (pvd->vdev_ops != pvops) 6918 pvd = vdev_add_parent(oldvd, pvops); 6919 6920 ASSERT(pvd->vdev_top->vdev_parent == rvd); 6921 ASSERT(pvd->vdev_ops == pvops); 6922 ASSERT(oldvd->vdev_parent == pvd); 6923 6924 /* 6925 * Extract the new device from its root and add it to pvd. 6926 */ 6927 vdev_remove_child(newrootvd, newvd); 6928 newvd->vdev_id = pvd->vdev_children; 6929 newvd->vdev_crtxg = oldvd->vdev_crtxg; 6930 vdev_add_child(pvd, newvd); 6931 6932 /* 6933 * Reevaluate the parent vdev state. 6934 */ 6935 vdev_propagate_state(pvd); 6936 6937 tvd = newvd->vdev_top; 6938 ASSERT(pvd->vdev_top == tvd); 6939 ASSERT(tvd->vdev_parent == rvd); 6940 6941 vdev_config_dirty(tvd); 6942 6943 /* 6944 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 6945 * for any dmu_sync-ed blocks. It will propagate upward when 6946 * spa_vdev_exit() calls vdev_dtl_reassess(). 6947 */ 6948 dtl_max_txg = txg + TXG_CONCURRENT_STATES; 6949 6950 vdev_dtl_dirty(newvd, DTL_MISSING, 6951 TXG_INITIAL, dtl_max_txg - TXG_INITIAL); 6952 6953 if (newvd->vdev_isspare) { 6954 spa_spare_activate(newvd); 6955 spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); 6956 } 6957 6958 oldvdpath = spa_strdup(oldvd->vdev_path); 6959 newvdpath = spa_strdup(newvd->vdev_path); 6960 newvd_isspare = newvd->vdev_isspare; 6961 6962 /* 6963 * Mark newvd's DTL dirty in this txg. 6964 */ 6965 vdev_dirty(tvd, VDD_DTL, newvd, txg); 6966 6967 /* 6968 * Schedule the resilver or rebuild to restart in the future. We do 6969 * this to ensure that dmu_sync-ed blocks have been stitched into the 6970 * respective datasets. 6971 */ 6972 if (rebuild) { 6973 newvd->vdev_rebuild_txg = txg; 6974 6975 vdev_rebuild(tvd); 6976 } else { 6977 newvd->vdev_resilver_txg = txg; 6978 6979 if (dsl_scan_resilvering(spa_get_dsl(spa)) && 6980 spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) { 6981 vdev_defer_resilver(newvd); 6982 } else { 6983 dsl_scan_restart_resilver(spa->spa_dsl_pool, 6984 dtl_max_txg); 6985 } 6986 } 6987 6988 if (spa->spa_bootfs) 6989 spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); 6990 6991 spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH); 6992 6993 /* 6994 * Commit the config 6995 */ 6996 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 6997 6998 spa_history_log_internal(spa, "vdev attach", NULL, 6999 "%s vdev=%s %s vdev=%s", 7000 replacing && newvd_isspare ? "spare in" : 7001 replacing ? "replace" : "attach", newvdpath, 7002 replacing ? "for" : "to", oldvdpath); 7003 7004 spa_strfree(oldvdpath); 7005 spa_strfree(newvdpath); 7006 7007 return (0); 7008 } 7009 7010 /* 7011 * Detach a device from a mirror or replacing vdev. 7012 * 7013 * If 'replace_done' is specified, only detach if the parent 7014 * is a replacing vdev. 7015 */ 7016 int 7017 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 7018 { 7019 uint64_t txg; 7020 int error; 7021 vdev_t *rvd __maybe_unused = spa->spa_root_vdev; 7022 vdev_t *vd, *pvd, *cvd, *tvd; 7023 boolean_t unspare = B_FALSE; 7024 uint64_t unspare_guid = 0; 7025 char *vdpath; 7026 7027 ASSERT(spa_writeable(spa)); 7028 7029 txg = spa_vdev_detach_enter(spa, guid); 7030 7031 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 7032 7033 /* 7034 * Besides being called directly from the userland through the 7035 * ioctl interface, spa_vdev_detach() can be potentially called 7036 * at the end of spa_vdev_resilver_done(). 7037 * 7038 * In the regular case, when we have a checkpoint this shouldn't 7039 * happen as we never empty the DTLs of a vdev during the scrub 7040 * [see comment in dsl_scan_done()]. Thus spa_vdev_resilvering_done() 7041 * should never get here when we have a checkpoint. 7042 * 7043 * That said, even in a case when we checkpoint the pool exactly 7044 * as spa_vdev_resilver_done() calls this function everything 7045 * should be fine as the resilver will return right away. 7046 */ 7047 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 7048 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 7049 error = (spa_has_checkpoint(spa)) ? 7050 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 7051 return (spa_vdev_exit(spa, NULL, txg, error)); 7052 } 7053 7054 if (vd == NULL) 7055 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 7056 7057 if (!vd->vdev_ops->vdev_op_leaf) 7058 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7059 7060 pvd = vd->vdev_parent; 7061 7062 /* 7063 * If the parent/child relationship is not as expected, don't do it. 7064 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 7065 * vdev that's replacing B with C. The user's intent in replacing 7066 * is to go from M(A,B) to M(A,C). If the user decides to cancel 7067 * the replace by detaching C, the expected behavior is to end up 7068 * M(A,B). But suppose that right after deciding to detach C, 7069 * the replacement of B completes. We would have M(A,C), and then 7070 * ask to detach C, which would leave us with just A -- not what 7071 * the user wanted. To prevent this, we make sure that the 7072 * parent/child relationship hasn't changed -- in this example, 7073 * that C's parent is still the replacing vdev R. 7074 */ 7075 if (pvd->vdev_guid != pguid && pguid != 0) 7076 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 7077 7078 /* 7079 * Only 'replacing' or 'spare' vdevs can be replaced. 7080 */ 7081 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 7082 pvd->vdev_ops != &vdev_spare_ops) 7083 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7084 7085 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 7086 spa_version(spa) >= SPA_VERSION_SPARES); 7087 7088 /* 7089 * Only mirror, replacing, and spare vdevs support detach. 7090 */ 7091 if (pvd->vdev_ops != &vdev_replacing_ops && 7092 pvd->vdev_ops != &vdev_mirror_ops && 7093 pvd->vdev_ops != &vdev_spare_ops) 7094 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7095 7096 /* 7097 * If this device has the only valid copy of some data, 7098 * we cannot safely detach it. 7099 */ 7100 if (vdev_dtl_required(vd)) 7101 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 7102 7103 ASSERT(pvd->vdev_children >= 2); 7104 7105 /* 7106 * If we are detaching the second disk from a replacing vdev, then 7107 * check to see if we changed the original vdev's path to have "/old" 7108 * at the end in spa_vdev_attach(). If so, undo that change now. 7109 */ 7110 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 7111 vd->vdev_path != NULL) { 7112 size_t len = strlen(vd->vdev_path); 7113 7114 for (int c = 0; c < pvd->vdev_children; c++) { 7115 cvd = pvd->vdev_child[c]; 7116 7117 if (cvd == vd || cvd->vdev_path == NULL) 7118 continue; 7119 7120 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 7121 strcmp(cvd->vdev_path + len, "/old") == 0) { 7122 spa_strfree(cvd->vdev_path); 7123 cvd->vdev_path = spa_strdup(vd->vdev_path); 7124 break; 7125 } 7126 } 7127 } 7128 7129 /* 7130 * If we are detaching the original disk from a normal spare, then it 7131 * implies that the spare should become a real disk, and be removed 7132 * from the active spare list for the pool. dRAID spares on the 7133 * other hand are coupled to the pool and thus should never be removed 7134 * from the spares list. 7135 */ 7136 if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) { 7137 vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1]; 7138 7139 if (last_cvd->vdev_isspare && 7140 last_cvd->vdev_ops != &vdev_draid_spare_ops) { 7141 unspare = B_TRUE; 7142 } 7143 } 7144 7145 /* 7146 * Erase the disk labels so the disk can be used for other things. 7147 * This must be done after all other error cases are handled, 7148 * but before we disembowel vd (so we can still do I/O to it). 7149 * But if we can't do it, don't treat the error as fatal -- 7150 * it may be that the unwritability of the disk is the reason 7151 * it's being detached! 7152 */ 7153 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 7154 7155 /* 7156 * Remove vd from its parent and compact the parent's children. 7157 */ 7158 vdev_remove_child(pvd, vd); 7159 vdev_compact_children(pvd); 7160 7161 /* 7162 * Remember one of the remaining children so we can get tvd below. 7163 */ 7164 cvd = pvd->vdev_child[pvd->vdev_children - 1]; 7165 7166 /* 7167 * If we need to remove the remaining child from the list of hot spares, 7168 * do it now, marking the vdev as no longer a spare in the process. 7169 * We must do this before vdev_remove_parent(), because that can 7170 * change the GUID if it creates a new toplevel GUID. For a similar 7171 * reason, we must remove the spare now, in the same txg as the detach; 7172 * otherwise someone could attach a new sibling, change the GUID, and 7173 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 7174 */ 7175 if (unspare) { 7176 ASSERT(cvd->vdev_isspare); 7177 spa_spare_remove(cvd); 7178 unspare_guid = cvd->vdev_guid; 7179 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 7180 cvd->vdev_unspare = B_TRUE; 7181 } 7182 7183 /* 7184 * If the parent mirror/replacing vdev only has one child, 7185 * the parent is no longer needed. Remove it from the tree. 7186 */ 7187 if (pvd->vdev_children == 1) { 7188 if (pvd->vdev_ops == &vdev_spare_ops) 7189 cvd->vdev_unspare = B_FALSE; 7190 vdev_remove_parent(cvd); 7191 } 7192 7193 /* 7194 * We don't set tvd until now because the parent we just removed 7195 * may have been the previous top-level vdev. 7196 */ 7197 tvd = cvd->vdev_top; 7198 ASSERT(tvd->vdev_parent == rvd); 7199 7200 /* 7201 * Reevaluate the parent vdev state. 7202 */ 7203 vdev_propagate_state(cvd); 7204 7205 /* 7206 * If the 'autoexpand' property is set on the pool then automatically 7207 * try to expand the size of the pool. For example if the device we 7208 * just detached was smaller than the others, it may be possible to 7209 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 7210 * first so that we can obtain the updated sizes of the leaf vdevs. 7211 */ 7212 if (spa->spa_autoexpand) { 7213 vdev_reopen(tvd); 7214 vdev_expand(tvd, txg); 7215 } 7216 7217 vdev_config_dirty(tvd); 7218 7219 /* 7220 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 7221 * vd->vdev_detached is set and free vd's DTL object in syncing context. 7222 * But first make sure we're not on any *other* txg's DTL list, to 7223 * prevent vd from being accessed after it's freed. 7224 */ 7225 vdpath = spa_strdup(vd->vdev_path ? vd->vdev_path : "none"); 7226 for (int t = 0; t < TXG_SIZE; t++) 7227 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 7228 vd->vdev_detached = B_TRUE; 7229 vdev_dirty(tvd, VDD_DTL, vd, txg); 7230 7231 spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE); 7232 spa_notify_waiters(spa); 7233 7234 /* hang on to the spa before we release the lock */ 7235 spa_open_ref(spa, FTAG); 7236 7237 error = spa_vdev_exit(spa, vd, txg, 0); 7238 7239 spa_history_log_internal(spa, "detach", NULL, 7240 "vdev=%s", vdpath); 7241 spa_strfree(vdpath); 7242 7243 /* 7244 * If this was the removal of the original device in a hot spare vdev, 7245 * then we want to go through and remove the device from the hot spare 7246 * list of every other pool. 7247 */ 7248 if (unspare) { 7249 spa_t *altspa = NULL; 7250 7251 mutex_enter(&spa_namespace_lock); 7252 while ((altspa = spa_next(altspa)) != NULL) { 7253 if (altspa->spa_state != POOL_STATE_ACTIVE || 7254 altspa == spa) 7255 continue; 7256 7257 spa_open_ref(altspa, FTAG); 7258 mutex_exit(&spa_namespace_lock); 7259 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 7260 mutex_enter(&spa_namespace_lock); 7261 spa_close(altspa, FTAG); 7262 } 7263 mutex_exit(&spa_namespace_lock); 7264 7265 /* search the rest of the vdevs for spares to remove */ 7266 spa_vdev_resilver_done(spa); 7267 } 7268 7269 /* all done with the spa; OK to release */ 7270 mutex_enter(&spa_namespace_lock); 7271 spa_close(spa, FTAG); 7272 mutex_exit(&spa_namespace_lock); 7273 7274 return (error); 7275 } 7276 7277 static int 7278 spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, 7279 list_t *vd_list) 7280 { 7281 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 7282 7283 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 7284 7285 /* Look up vdev and ensure it's a leaf. */ 7286 vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); 7287 if (vd == NULL || vd->vdev_detached) { 7288 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7289 return (SET_ERROR(ENODEV)); 7290 } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { 7291 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7292 return (SET_ERROR(EINVAL)); 7293 } else if (!vdev_writeable(vd)) { 7294 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7295 return (SET_ERROR(EROFS)); 7296 } 7297 mutex_enter(&vd->vdev_initialize_lock); 7298 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7299 7300 /* 7301 * When we activate an initialize action we check to see 7302 * if the vdev_initialize_thread is NULL. We do this instead 7303 * of using the vdev_initialize_state since there might be 7304 * a previous initialization process which has completed but 7305 * the thread is not exited. 7306 */ 7307 if (cmd_type == POOL_INITIALIZE_START && 7308 (vd->vdev_initialize_thread != NULL || 7309 vd->vdev_top->vdev_removing)) { 7310 mutex_exit(&vd->vdev_initialize_lock); 7311 return (SET_ERROR(EBUSY)); 7312 } else if (cmd_type == POOL_INITIALIZE_CANCEL && 7313 (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE && 7314 vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) { 7315 mutex_exit(&vd->vdev_initialize_lock); 7316 return (SET_ERROR(ESRCH)); 7317 } else if (cmd_type == POOL_INITIALIZE_SUSPEND && 7318 vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) { 7319 mutex_exit(&vd->vdev_initialize_lock); 7320 return (SET_ERROR(ESRCH)); 7321 } 7322 7323 switch (cmd_type) { 7324 case POOL_INITIALIZE_START: 7325 vdev_initialize(vd); 7326 break; 7327 case POOL_INITIALIZE_CANCEL: 7328 vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, vd_list); 7329 break; 7330 case POOL_INITIALIZE_SUSPEND: 7331 vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list); 7332 break; 7333 default: 7334 panic("invalid cmd_type %llu", (unsigned long long)cmd_type); 7335 } 7336 mutex_exit(&vd->vdev_initialize_lock); 7337 7338 return (0); 7339 } 7340 7341 int 7342 spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, 7343 nvlist_t *vdev_errlist) 7344 { 7345 int total_errors = 0; 7346 list_t vd_list; 7347 7348 list_create(&vd_list, sizeof (vdev_t), 7349 offsetof(vdev_t, vdev_initialize_node)); 7350 7351 /* 7352 * We hold the namespace lock through the whole function 7353 * to prevent any changes to the pool while we're starting or 7354 * stopping initialization. The config and state locks are held so that 7355 * we can properly assess the vdev state before we commit to 7356 * the initializing operation. 7357 */ 7358 mutex_enter(&spa_namespace_lock); 7359 7360 for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); 7361 pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { 7362 uint64_t vdev_guid = fnvpair_value_uint64(pair); 7363 7364 int error = spa_vdev_initialize_impl(spa, vdev_guid, cmd_type, 7365 &vd_list); 7366 if (error != 0) { 7367 char guid_as_str[MAXNAMELEN]; 7368 7369 (void) snprintf(guid_as_str, sizeof (guid_as_str), 7370 "%llu", (unsigned long long)vdev_guid); 7371 fnvlist_add_int64(vdev_errlist, guid_as_str, error); 7372 total_errors++; 7373 } 7374 } 7375 7376 /* Wait for all initialize threads to stop. */ 7377 vdev_initialize_stop_wait(spa, &vd_list); 7378 7379 /* Sync out the initializing state */ 7380 txg_wait_synced(spa->spa_dsl_pool, 0); 7381 mutex_exit(&spa_namespace_lock); 7382 7383 list_destroy(&vd_list); 7384 7385 return (total_errors); 7386 } 7387 7388 static int 7389 spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, 7390 uint64_t rate, boolean_t partial, boolean_t secure, list_t *vd_list) 7391 { 7392 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 7393 7394 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 7395 7396 /* Look up vdev and ensure it's a leaf. */ 7397 vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); 7398 if (vd == NULL || vd->vdev_detached) { 7399 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7400 return (SET_ERROR(ENODEV)); 7401 } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { 7402 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7403 return (SET_ERROR(EINVAL)); 7404 } else if (!vdev_writeable(vd)) { 7405 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7406 return (SET_ERROR(EROFS)); 7407 } else if (!vd->vdev_has_trim) { 7408 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7409 return (SET_ERROR(EOPNOTSUPP)); 7410 } else if (secure && !vd->vdev_has_securetrim) { 7411 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7412 return (SET_ERROR(EOPNOTSUPP)); 7413 } 7414 mutex_enter(&vd->vdev_trim_lock); 7415 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7416 7417 /* 7418 * When we activate a TRIM action we check to see if the 7419 * vdev_trim_thread is NULL. We do this instead of using the 7420 * vdev_trim_state since there might be a previous TRIM process 7421 * which has completed but the thread is not exited. 7422 */ 7423 if (cmd_type == POOL_TRIM_START && 7424 (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing)) { 7425 mutex_exit(&vd->vdev_trim_lock); 7426 return (SET_ERROR(EBUSY)); 7427 } else if (cmd_type == POOL_TRIM_CANCEL && 7428 (vd->vdev_trim_state != VDEV_TRIM_ACTIVE && 7429 vd->vdev_trim_state != VDEV_TRIM_SUSPENDED)) { 7430 mutex_exit(&vd->vdev_trim_lock); 7431 return (SET_ERROR(ESRCH)); 7432 } else if (cmd_type == POOL_TRIM_SUSPEND && 7433 vd->vdev_trim_state != VDEV_TRIM_ACTIVE) { 7434 mutex_exit(&vd->vdev_trim_lock); 7435 return (SET_ERROR(ESRCH)); 7436 } 7437 7438 switch (cmd_type) { 7439 case POOL_TRIM_START: 7440 vdev_trim(vd, rate, partial, secure); 7441 break; 7442 case POOL_TRIM_CANCEL: 7443 vdev_trim_stop(vd, VDEV_TRIM_CANCELED, vd_list); 7444 break; 7445 case POOL_TRIM_SUSPEND: 7446 vdev_trim_stop(vd, VDEV_TRIM_SUSPENDED, vd_list); 7447 break; 7448 default: 7449 panic("invalid cmd_type %llu", (unsigned long long)cmd_type); 7450 } 7451 mutex_exit(&vd->vdev_trim_lock); 7452 7453 return (0); 7454 } 7455 7456 /* 7457 * Initiates a manual TRIM for the requested vdevs. This kicks off individual 7458 * TRIM threads for each child vdev. These threads pass over all of the free 7459 * space in the vdev's metaslabs and issues TRIM commands for that space. 7460 */ 7461 int 7462 spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate, 7463 boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist) 7464 { 7465 int total_errors = 0; 7466 list_t vd_list; 7467 7468 list_create(&vd_list, sizeof (vdev_t), 7469 offsetof(vdev_t, vdev_trim_node)); 7470 7471 /* 7472 * We hold the namespace lock through the whole function 7473 * to prevent any changes to the pool while we're starting or 7474 * stopping TRIM. The config and state locks are held so that 7475 * we can properly assess the vdev state before we commit to 7476 * the TRIM operation. 7477 */ 7478 mutex_enter(&spa_namespace_lock); 7479 7480 for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); 7481 pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { 7482 uint64_t vdev_guid = fnvpair_value_uint64(pair); 7483 7484 int error = spa_vdev_trim_impl(spa, vdev_guid, cmd_type, 7485 rate, partial, secure, &vd_list); 7486 if (error != 0) { 7487 char guid_as_str[MAXNAMELEN]; 7488 7489 (void) snprintf(guid_as_str, sizeof (guid_as_str), 7490 "%llu", (unsigned long long)vdev_guid); 7491 fnvlist_add_int64(vdev_errlist, guid_as_str, error); 7492 total_errors++; 7493 } 7494 } 7495 7496 /* Wait for all TRIM threads to stop. */ 7497 vdev_trim_stop_wait(spa, &vd_list); 7498 7499 /* Sync out the TRIM state */ 7500 txg_wait_synced(spa->spa_dsl_pool, 0); 7501 mutex_exit(&spa_namespace_lock); 7502 7503 list_destroy(&vd_list); 7504 7505 return (total_errors); 7506 } 7507 7508 /* 7509 * Split a set of devices from their mirrors, and create a new pool from them. 7510 */ 7511 int 7512 spa_vdev_split_mirror(spa_t *spa, const char *newname, nvlist_t *config, 7513 nvlist_t *props, boolean_t exp) 7514 { 7515 int error = 0; 7516 uint64_t txg, *glist; 7517 spa_t *newspa; 7518 uint_t c, children, lastlog; 7519 nvlist_t **child, *nvl, *tmp; 7520 dmu_tx_t *tx; 7521 char *altroot = NULL; 7522 vdev_t *rvd, **vml = NULL; /* vdev modify list */ 7523 boolean_t activate_slog; 7524 7525 ASSERT(spa_writeable(spa)); 7526 7527 txg = spa_vdev_enter(spa); 7528 7529 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 7530 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 7531 error = (spa_has_checkpoint(spa)) ? 7532 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 7533 return (spa_vdev_exit(spa, NULL, txg, error)); 7534 } 7535 7536 /* clear the log and flush everything up to now */ 7537 activate_slog = spa_passivate_log(spa); 7538 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 7539 error = spa_reset_logs(spa); 7540 txg = spa_vdev_config_enter(spa); 7541 7542 if (activate_slog) 7543 spa_activate_log(spa); 7544 7545 if (error != 0) 7546 return (spa_vdev_exit(spa, NULL, txg, error)); 7547 7548 /* check new spa name before going any further */ 7549 if (spa_lookup(newname) != NULL) 7550 return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 7551 7552 /* 7553 * scan through all the children to ensure they're all mirrors 7554 */ 7555 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 7556 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 7557 &children) != 0) 7558 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 7559 7560 /* first, check to ensure we've got the right child count */ 7561 rvd = spa->spa_root_vdev; 7562 lastlog = 0; 7563 for (c = 0; c < rvd->vdev_children; c++) { 7564 vdev_t *vd = rvd->vdev_child[c]; 7565 7566 /* don't count the holes & logs as children */ 7567 if (vd->vdev_islog || (vd->vdev_ops != &vdev_indirect_ops && 7568 !vdev_is_concrete(vd))) { 7569 if (lastlog == 0) 7570 lastlog = c; 7571 continue; 7572 } 7573 7574 lastlog = 0; 7575 } 7576 if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) 7577 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 7578 7579 /* next, ensure no spare or cache devices are part of the split */ 7580 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 7581 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 7582 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 7583 7584 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 7585 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 7586 7587 /* then, loop over each vdev and validate it */ 7588 for (c = 0; c < children; c++) { 7589 uint64_t is_hole = 0; 7590 7591 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 7592 &is_hole); 7593 7594 if (is_hole != 0) { 7595 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 7596 spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 7597 continue; 7598 } else { 7599 error = SET_ERROR(EINVAL); 7600 break; 7601 } 7602 } 7603 7604 /* deal with indirect vdevs */ 7605 if (spa->spa_root_vdev->vdev_child[c]->vdev_ops == 7606 &vdev_indirect_ops) 7607 continue; 7608 7609 /* which disk is going to be split? */ 7610 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 7611 &glist[c]) != 0) { 7612 error = SET_ERROR(EINVAL); 7613 break; 7614 } 7615 7616 /* look it up in the spa */ 7617 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 7618 if (vml[c] == NULL) { 7619 error = SET_ERROR(ENODEV); 7620 break; 7621 } 7622 7623 /* make sure there's nothing stopping the split */ 7624 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 7625 vml[c]->vdev_islog || 7626 !vdev_is_concrete(vml[c]) || 7627 vml[c]->vdev_isspare || 7628 vml[c]->vdev_isl2cache || 7629 !vdev_writeable(vml[c]) || 7630 vml[c]->vdev_children != 0 || 7631 vml[c]->vdev_state != VDEV_STATE_HEALTHY || 7632 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 7633 error = SET_ERROR(EINVAL); 7634 break; 7635 } 7636 7637 if (vdev_dtl_required(vml[c]) || 7638 vdev_resilver_needed(vml[c], NULL, NULL)) { 7639 error = SET_ERROR(EBUSY); 7640 break; 7641 } 7642 7643 /* we need certain info from the top level */ 7644 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 7645 vml[c]->vdev_top->vdev_ms_array); 7646 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 7647 vml[c]->vdev_top->vdev_ms_shift); 7648 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 7649 vml[c]->vdev_top->vdev_asize); 7650 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 7651 vml[c]->vdev_top->vdev_ashift); 7652 7653 /* transfer per-vdev ZAPs */ 7654 ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); 7655 VERIFY0(nvlist_add_uint64(child[c], 7656 ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); 7657 7658 ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); 7659 VERIFY0(nvlist_add_uint64(child[c], 7660 ZPOOL_CONFIG_VDEV_TOP_ZAP, 7661 vml[c]->vdev_parent->vdev_top_zap)); 7662 } 7663 7664 if (error != 0) { 7665 kmem_free(vml, children * sizeof (vdev_t *)); 7666 kmem_free(glist, children * sizeof (uint64_t)); 7667 return (spa_vdev_exit(spa, NULL, txg, error)); 7668 } 7669 7670 /* stop writers from using the disks */ 7671 for (c = 0; c < children; c++) { 7672 if (vml[c] != NULL) 7673 vml[c]->vdev_offline = B_TRUE; 7674 } 7675 vdev_reopen(spa->spa_root_vdev); 7676 7677 /* 7678 * Temporarily record the splitting vdevs in the spa config. This 7679 * will disappear once the config is regenerated. 7680 */ 7681 nvl = fnvlist_alloc(); 7682 fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, glist, children); 7683 kmem_free(glist, children * sizeof (uint64_t)); 7684 7685 mutex_enter(&spa->spa_props_lock); 7686 fnvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, nvl); 7687 mutex_exit(&spa->spa_props_lock); 7688 spa->spa_config_splitting = nvl; 7689 vdev_config_dirty(spa->spa_root_vdev); 7690 7691 /* configure and create the new pool */ 7692 fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname); 7693 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 7694 exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE); 7695 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)); 7696 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg); 7697 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 7698 spa_generate_guid(NULL)); 7699 VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 7700 (void) nvlist_lookup_string(props, 7701 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 7702 7703 /* add the new pool to the namespace */ 7704 newspa = spa_add(newname, config, altroot); 7705 newspa->spa_avz_action = AVZ_ACTION_REBUILD; 7706 newspa->spa_config_txg = spa->spa_config_txg; 7707 spa_set_log_state(newspa, SPA_LOG_CLEAR); 7708 7709 /* release the spa config lock, retaining the namespace lock */ 7710 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 7711 7712 if (zio_injection_enabled) 7713 zio_handle_panic_injection(spa, FTAG, 1); 7714 7715 spa_activate(newspa, spa_mode_global); 7716 spa_async_suspend(newspa); 7717 7718 /* 7719 * Temporarily stop the initializing and TRIM activity. We set the 7720 * state to ACTIVE so that we know to resume initializing or TRIM 7721 * once the split has completed. 7722 */ 7723 list_t vd_initialize_list; 7724 list_create(&vd_initialize_list, sizeof (vdev_t), 7725 offsetof(vdev_t, vdev_initialize_node)); 7726 7727 list_t vd_trim_list; 7728 list_create(&vd_trim_list, sizeof (vdev_t), 7729 offsetof(vdev_t, vdev_trim_node)); 7730 7731 for (c = 0; c < children; c++) { 7732 if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { 7733 mutex_enter(&vml[c]->vdev_initialize_lock); 7734 vdev_initialize_stop(vml[c], 7735 VDEV_INITIALIZE_ACTIVE, &vd_initialize_list); 7736 mutex_exit(&vml[c]->vdev_initialize_lock); 7737 7738 mutex_enter(&vml[c]->vdev_trim_lock); 7739 vdev_trim_stop(vml[c], VDEV_TRIM_ACTIVE, &vd_trim_list); 7740 mutex_exit(&vml[c]->vdev_trim_lock); 7741 } 7742 } 7743 7744 vdev_initialize_stop_wait(spa, &vd_initialize_list); 7745 vdev_trim_stop_wait(spa, &vd_trim_list); 7746 7747 list_destroy(&vd_initialize_list); 7748 list_destroy(&vd_trim_list); 7749 7750 newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT; 7751 newspa->spa_is_splitting = B_TRUE; 7752 7753 /* create the new pool from the disks of the original pool */ 7754 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE); 7755 if (error) 7756 goto out; 7757 7758 /* if that worked, generate a real config for the new pool */ 7759 if (newspa->spa_root_vdev != NULL) { 7760 newspa->spa_config_splitting = fnvlist_alloc(); 7761 fnvlist_add_uint64(newspa->spa_config_splitting, 7762 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)); 7763 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 7764 B_TRUE)); 7765 } 7766 7767 /* set the props */ 7768 if (props != NULL) { 7769 spa_configfile_set(newspa, props, B_FALSE); 7770 error = spa_prop_set(newspa, props); 7771 if (error) 7772 goto out; 7773 } 7774 7775 /* flush everything */ 7776 txg = spa_vdev_config_enter(newspa); 7777 vdev_config_dirty(newspa->spa_root_vdev); 7778 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 7779 7780 if (zio_injection_enabled) 7781 zio_handle_panic_injection(spa, FTAG, 2); 7782 7783 spa_async_resume(newspa); 7784 7785 /* finally, update the original pool's config */ 7786 txg = spa_vdev_config_enter(spa); 7787 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 7788 error = dmu_tx_assign(tx, TXG_WAIT); 7789 if (error != 0) 7790 dmu_tx_abort(tx); 7791 for (c = 0; c < children; c++) { 7792 if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { 7793 vdev_t *tvd = vml[c]->vdev_top; 7794 7795 /* 7796 * Need to be sure the detachable VDEV is not 7797 * on any *other* txg's DTL list to prevent it 7798 * from being accessed after it's freed. 7799 */ 7800 for (int t = 0; t < TXG_SIZE; t++) { 7801 (void) txg_list_remove_this( 7802 &tvd->vdev_dtl_list, vml[c], t); 7803 } 7804 7805 vdev_split(vml[c]); 7806 if (error == 0) 7807 spa_history_log_internal(spa, "detach", tx, 7808 "vdev=%s", vml[c]->vdev_path); 7809 7810 vdev_free(vml[c]); 7811 } 7812 } 7813 spa->spa_avz_action = AVZ_ACTION_REBUILD; 7814 vdev_config_dirty(spa->spa_root_vdev); 7815 spa->spa_config_splitting = NULL; 7816 nvlist_free(nvl); 7817 if (error == 0) 7818 dmu_tx_commit(tx); 7819 (void) spa_vdev_exit(spa, NULL, txg, 0); 7820 7821 if (zio_injection_enabled) 7822 zio_handle_panic_injection(spa, FTAG, 3); 7823 7824 /* split is complete; log a history record */ 7825 spa_history_log_internal(newspa, "split", NULL, 7826 "from pool %s", spa_name(spa)); 7827 7828 newspa->spa_is_splitting = B_FALSE; 7829 kmem_free(vml, children * sizeof (vdev_t *)); 7830 7831 /* if we're not going to mount the filesystems in userland, export */ 7832 if (exp) 7833 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 7834 B_FALSE, B_FALSE); 7835 7836 return (error); 7837 7838 out: 7839 spa_unload(newspa); 7840 spa_deactivate(newspa); 7841 spa_remove(newspa); 7842 7843 txg = spa_vdev_config_enter(spa); 7844 7845 /* re-online all offlined disks */ 7846 for (c = 0; c < children; c++) { 7847 if (vml[c] != NULL) 7848 vml[c]->vdev_offline = B_FALSE; 7849 } 7850 7851 /* restart initializing or trimming disks as necessary */ 7852 spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); 7853 spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); 7854 spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); 7855 7856 vdev_reopen(spa->spa_root_vdev); 7857 7858 nvlist_free(spa->spa_config_splitting); 7859 spa->spa_config_splitting = NULL; 7860 (void) spa_vdev_exit(spa, NULL, txg, error); 7861 7862 kmem_free(vml, children * sizeof (vdev_t *)); 7863 return (error); 7864 } 7865 7866 /* 7867 * Find any device that's done replacing, or a vdev marked 'unspare' that's 7868 * currently spared, so we can detach it. 7869 */ 7870 static vdev_t * 7871 spa_vdev_resilver_done_hunt(vdev_t *vd) 7872 { 7873 vdev_t *newvd, *oldvd; 7874 7875 for (int c = 0; c < vd->vdev_children; c++) { 7876 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 7877 if (oldvd != NULL) 7878 return (oldvd); 7879 } 7880 7881 /* 7882 * Check for a completed replacement. We always consider the first 7883 * vdev in the list to be the oldest vdev, and the last one to be 7884 * the newest (see spa_vdev_attach() for how that works). In 7885 * the case where the newest vdev is faulted, we will not automatically 7886 * remove it after a resilver completes. This is OK as it will require 7887 * user intervention to determine which disk the admin wishes to keep. 7888 */ 7889 if (vd->vdev_ops == &vdev_replacing_ops) { 7890 ASSERT(vd->vdev_children > 1); 7891 7892 newvd = vd->vdev_child[vd->vdev_children - 1]; 7893 oldvd = vd->vdev_child[0]; 7894 7895 if (vdev_dtl_empty(newvd, DTL_MISSING) && 7896 vdev_dtl_empty(newvd, DTL_OUTAGE) && 7897 !vdev_dtl_required(oldvd)) 7898 return (oldvd); 7899 } 7900 7901 /* 7902 * Check for a completed resilver with the 'unspare' flag set. 7903 * Also potentially update faulted state. 7904 */ 7905 if (vd->vdev_ops == &vdev_spare_ops) { 7906 vdev_t *first = vd->vdev_child[0]; 7907 vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 7908 7909 if (last->vdev_unspare) { 7910 oldvd = first; 7911 newvd = last; 7912 } else if (first->vdev_unspare) { 7913 oldvd = last; 7914 newvd = first; 7915 } else { 7916 oldvd = NULL; 7917 } 7918 7919 if (oldvd != NULL && 7920 vdev_dtl_empty(newvd, DTL_MISSING) && 7921 vdev_dtl_empty(newvd, DTL_OUTAGE) && 7922 !vdev_dtl_required(oldvd)) 7923 return (oldvd); 7924 7925 vdev_propagate_state(vd); 7926 7927 /* 7928 * If there are more than two spares attached to a disk, 7929 * and those spares are not required, then we want to 7930 * attempt to free them up now so that they can be used 7931 * by other pools. Once we're back down to a single 7932 * disk+spare, we stop removing them. 7933 */ 7934 if (vd->vdev_children > 2) { 7935 newvd = vd->vdev_child[1]; 7936 7937 if (newvd->vdev_isspare && last->vdev_isspare && 7938 vdev_dtl_empty(last, DTL_MISSING) && 7939 vdev_dtl_empty(last, DTL_OUTAGE) && 7940 !vdev_dtl_required(newvd)) 7941 return (newvd); 7942 } 7943 } 7944 7945 return (NULL); 7946 } 7947 7948 static void 7949 spa_vdev_resilver_done(spa_t *spa) 7950 { 7951 vdev_t *vd, *pvd, *ppvd; 7952 uint64_t guid, sguid, pguid, ppguid; 7953 7954 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 7955 7956 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 7957 pvd = vd->vdev_parent; 7958 ppvd = pvd->vdev_parent; 7959 guid = vd->vdev_guid; 7960 pguid = pvd->vdev_guid; 7961 ppguid = ppvd->vdev_guid; 7962 sguid = 0; 7963 /* 7964 * If we have just finished replacing a hot spared device, then 7965 * we need to detach the parent's first child (the original hot 7966 * spare) as well. 7967 */ 7968 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 7969 ppvd->vdev_children == 2) { 7970 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 7971 sguid = ppvd->vdev_child[1]->vdev_guid; 7972 } 7973 ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); 7974 7975 spa_config_exit(spa, SCL_ALL, FTAG); 7976 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 7977 return; 7978 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 7979 return; 7980 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 7981 } 7982 7983 spa_config_exit(spa, SCL_ALL, FTAG); 7984 7985 /* 7986 * If a detach was not performed above replace waiters will not have 7987 * been notified. In which case we must do so now. 7988 */ 7989 spa_notify_waiters(spa); 7990 } 7991 7992 /* 7993 * Update the stored path or FRU for this vdev. 7994 */ 7995 static int 7996 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 7997 boolean_t ispath) 7998 { 7999 vdev_t *vd; 8000 boolean_t sync = B_FALSE; 8001 8002 ASSERT(spa_writeable(spa)); 8003 8004 spa_vdev_state_enter(spa, SCL_ALL); 8005 8006 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 8007 return (spa_vdev_state_exit(spa, NULL, ENOENT)); 8008 8009 if (!vd->vdev_ops->vdev_op_leaf) 8010 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 8011 8012 if (ispath) { 8013 if (strcmp(value, vd->vdev_path) != 0) { 8014 spa_strfree(vd->vdev_path); 8015 vd->vdev_path = spa_strdup(value); 8016 sync = B_TRUE; 8017 } 8018 } else { 8019 if (vd->vdev_fru == NULL) { 8020 vd->vdev_fru = spa_strdup(value); 8021 sync = B_TRUE; 8022 } else if (strcmp(value, vd->vdev_fru) != 0) { 8023 spa_strfree(vd->vdev_fru); 8024 vd->vdev_fru = spa_strdup(value); 8025 sync = B_TRUE; 8026 } 8027 } 8028 8029 return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); 8030 } 8031 8032 int 8033 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 8034 { 8035 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 8036 } 8037 8038 int 8039 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 8040 { 8041 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 8042 } 8043 8044 /* 8045 * ========================================================================== 8046 * SPA Scanning 8047 * ========================================================================== 8048 */ 8049 int 8050 spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd) 8051 { 8052 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 8053 8054 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 8055 return (SET_ERROR(EBUSY)); 8056 8057 return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd)); 8058 } 8059 8060 int 8061 spa_scan_stop(spa_t *spa) 8062 { 8063 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 8064 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 8065 return (SET_ERROR(EBUSY)); 8066 return (dsl_scan_cancel(spa->spa_dsl_pool)); 8067 } 8068 8069 int 8070 spa_scan(spa_t *spa, pool_scan_func_t func) 8071 { 8072 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 8073 8074 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 8075 return (SET_ERROR(ENOTSUP)); 8076 8077 if (func == POOL_SCAN_RESILVER && 8078 !spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) 8079 return (SET_ERROR(ENOTSUP)); 8080 8081 /* 8082 * If a resilver was requested, but there is no DTL on a 8083 * writeable leaf device, we have nothing to do. 8084 */ 8085 if (func == POOL_SCAN_RESILVER && 8086 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 8087 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 8088 return (0); 8089 } 8090 8091 return (dsl_scan(spa->spa_dsl_pool, func)); 8092 } 8093 8094 /* 8095 * ========================================================================== 8096 * SPA async task processing 8097 * ========================================================================== 8098 */ 8099 8100 static void 8101 spa_async_remove(spa_t *spa, vdev_t *vd) 8102 { 8103 if (vd->vdev_remove_wanted) { 8104 vd->vdev_remove_wanted = B_FALSE; 8105 vd->vdev_delayed_close = B_FALSE; 8106 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 8107 8108 /* 8109 * We want to clear the stats, but we don't want to do a full 8110 * vdev_clear() as that will cause us to throw away 8111 * degraded/faulted state as well as attempt to reopen the 8112 * device, all of which is a waste. 8113 */ 8114 vd->vdev_stat.vs_read_errors = 0; 8115 vd->vdev_stat.vs_write_errors = 0; 8116 vd->vdev_stat.vs_checksum_errors = 0; 8117 8118 vdev_state_dirty(vd->vdev_top); 8119 8120 /* Tell userspace that the vdev is gone. */ 8121 zfs_post_remove(spa, vd); 8122 } 8123 8124 for (int c = 0; c < vd->vdev_children; c++) 8125 spa_async_remove(spa, vd->vdev_child[c]); 8126 } 8127 8128 static void 8129 spa_async_probe(spa_t *spa, vdev_t *vd) 8130 { 8131 if (vd->vdev_probe_wanted) { 8132 vd->vdev_probe_wanted = B_FALSE; 8133 vdev_reopen(vd); /* vdev_open() does the actual probe */ 8134 } 8135 8136 for (int c = 0; c < vd->vdev_children; c++) 8137 spa_async_probe(spa, vd->vdev_child[c]); 8138 } 8139 8140 static void 8141 spa_async_autoexpand(spa_t *spa, vdev_t *vd) 8142 { 8143 if (!spa->spa_autoexpand) 8144 return; 8145 8146 for (int c = 0; c < vd->vdev_children; c++) { 8147 vdev_t *cvd = vd->vdev_child[c]; 8148 spa_async_autoexpand(spa, cvd); 8149 } 8150 8151 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 8152 return; 8153 8154 spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_AUTOEXPAND); 8155 } 8156 8157 static __attribute__((noreturn)) void 8158 spa_async_thread(void *arg) 8159 { 8160 spa_t *spa = (spa_t *)arg; 8161 dsl_pool_t *dp = spa->spa_dsl_pool; 8162 int tasks; 8163 8164 ASSERT(spa->spa_sync_on); 8165 8166 mutex_enter(&spa->spa_async_lock); 8167 tasks = spa->spa_async_tasks; 8168 spa->spa_async_tasks = 0; 8169 mutex_exit(&spa->spa_async_lock); 8170 8171 /* 8172 * See if the config needs to be updated. 8173 */ 8174 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 8175 uint64_t old_space, new_space; 8176 8177 mutex_enter(&spa_namespace_lock); 8178 old_space = metaslab_class_get_space(spa_normal_class(spa)); 8179 old_space += metaslab_class_get_space(spa_special_class(spa)); 8180 old_space += metaslab_class_get_space(spa_dedup_class(spa)); 8181 old_space += metaslab_class_get_space( 8182 spa_embedded_log_class(spa)); 8183 8184 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 8185 8186 new_space = metaslab_class_get_space(spa_normal_class(spa)); 8187 new_space += metaslab_class_get_space(spa_special_class(spa)); 8188 new_space += metaslab_class_get_space(spa_dedup_class(spa)); 8189 new_space += metaslab_class_get_space( 8190 spa_embedded_log_class(spa)); 8191 mutex_exit(&spa_namespace_lock); 8192 8193 /* 8194 * If the pool grew as a result of the config update, 8195 * then log an internal history event. 8196 */ 8197 if (new_space != old_space) { 8198 spa_history_log_internal(spa, "vdev online", NULL, 8199 "pool '%s' size: %llu(+%llu)", 8200 spa_name(spa), (u_longlong_t)new_space, 8201 (u_longlong_t)(new_space - old_space)); 8202 } 8203 } 8204 8205 /* 8206 * See if any devices need to be marked REMOVED. 8207 */ 8208 if (tasks & SPA_ASYNC_REMOVE) { 8209 spa_vdev_state_enter(spa, SCL_NONE); 8210 spa_async_remove(spa, spa->spa_root_vdev); 8211 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 8212 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 8213 for (int i = 0; i < spa->spa_spares.sav_count; i++) 8214 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 8215 (void) spa_vdev_state_exit(spa, NULL, 0); 8216 } 8217 8218 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 8219 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8220 spa_async_autoexpand(spa, spa->spa_root_vdev); 8221 spa_config_exit(spa, SCL_CONFIG, FTAG); 8222 } 8223 8224 /* 8225 * See if any devices need to be probed. 8226 */ 8227 if (tasks & SPA_ASYNC_PROBE) { 8228 spa_vdev_state_enter(spa, SCL_NONE); 8229 spa_async_probe(spa, spa->spa_root_vdev); 8230 (void) spa_vdev_state_exit(spa, NULL, 0); 8231 } 8232 8233 /* 8234 * If any devices are done replacing, detach them. 8235 */ 8236 if (tasks & SPA_ASYNC_RESILVER_DONE || 8237 tasks & SPA_ASYNC_REBUILD_DONE) { 8238 spa_vdev_resilver_done(spa); 8239 } 8240 8241 /* 8242 * Kick off a resilver. 8243 */ 8244 if (tasks & SPA_ASYNC_RESILVER && 8245 !vdev_rebuild_active(spa->spa_root_vdev) && 8246 (!dsl_scan_resilvering(dp) || 8247 !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))) 8248 dsl_scan_restart_resilver(dp, 0); 8249 8250 if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { 8251 mutex_enter(&spa_namespace_lock); 8252 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8253 vdev_initialize_restart(spa->spa_root_vdev); 8254 spa_config_exit(spa, SCL_CONFIG, FTAG); 8255 mutex_exit(&spa_namespace_lock); 8256 } 8257 8258 if (tasks & SPA_ASYNC_TRIM_RESTART) { 8259 mutex_enter(&spa_namespace_lock); 8260 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8261 vdev_trim_restart(spa->spa_root_vdev); 8262 spa_config_exit(spa, SCL_CONFIG, FTAG); 8263 mutex_exit(&spa_namespace_lock); 8264 } 8265 8266 if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) { 8267 mutex_enter(&spa_namespace_lock); 8268 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8269 vdev_autotrim_restart(spa); 8270 spa_config_exit(spa, SCL_CONFIG, FTAG); 8271 mutex_exit(&spa_namespace_lock); 8272 } 8273 8274 /* 8275 * Kick off L2 cache whole device TRIM. 8276 */ 8277 if (tasks & SPA_ASYNC_L2CACHE_TRIM) { 8278 mutex_enter(&spa_namespace_lock); 8279 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8280 vdev_trim_l2arc(spa); 8281 spa_config_exit(spa, SCL_CONFIG, FTAG); 8282 mutex_exit(&spa_namespace_lock); 8283 } 8284 8285 /* 8286 * Kick off L2 cache rebuilding. 8287 */ 8288 if (tasks & SPA_ASYNC_L2CACHE_REBUILD) { 8289 mutex_enter(&spa_namespace_lock); 8290 spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER); 8291 l2arc_spa_rebuild_start(spa); 8292 spa_config_exit(spa, SCL_L2ARC, FTAG); 8293 mutex_exit(&spa_namespace_lock); 8294 } 8295 8296 /* 8297 * Let the world know that we're done. 8298 */ 8299 mutex_enter(&spa->spa_async_lock); 8300 spa->spa_async_thread = NULL; 8301 cv_broadcast(&spa->spa_async_cv); 8302 mutex_exit(&spa->spa_async_lock); 8303 thread_exit(); 8304 } 8305 8306 void 8307 spa_async_suspend(spa_t *spa) 8308 { 8309 mutex_enter(&spa->spa_async_lock); 8310 spa->spa_async_suspended++; 8311 while (spa->spa_async_thread != NULL) 8312 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 8313 mutex_exit(&spa->spa_async_lock); 8314 8315 spa_vdev_remove_suspend(spa); 8316 8317 zthr_t *condense_thread = spa->spa_condense_zthr; 8318 if (condense_thread != NULL) 8319 zthr_cancel(condense_thread); 8320 8321 zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; 8322 if (discard_thread != NULL) 8323 zthr_cancel(discard_thread); 8324 8325 zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; 8326 if (ll_delete_thread != NULL) 8327 zthr_cancel(ll_delete_thread); 8328 8329 zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; 8330 if (ll_condense_thread != NULL) 8331 zthr_cancel(ll_condense_thread); 8332 } 8333 8334 void 8335 spa_async_resume(spa_t *spa) 8336 { 8337 mutex_enter(&spa->spa_async_lock); 8338 ASSERT(spa->spa_async_suspended != 0); 8339 spa->spa_async_suspended--; 8340 mutex_exit(&spa->spa_async_lock); 8341 spa_restart_removal(spa); 8342 8343 zthr_t *condense_thread = spa->spa_condense_zthr; 8344 if (condense_thread != NULL) 8345 zthr_resume(condense_thread); 8346 8347 zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; 8348 if (discard_thread != NULL) 8349 zthr_resume(discard_thread); 8350 8351 zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; 8352 if (ll_delete_thread != NULL) 8353 zthr_resume(ll_delete_thread); 8354 8355 zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; 8356 if (ll_condense_thread != NULL) 8357 zthr_resume(ll_condense_thread); 8358 } 8359 8360 static boolean_t 8361 spa_async_tasks_pending(spa_t *spa) 8362 { 8363 uint_t non_config_tasks; 8364 uint_t config_task; 8365 boolean_t config_task_suspended; 8366 8367 non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE; 8368 config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; 8369 if (spa->spa_ccw_fail_time == 0) { 8370 config_task_suspended = B_FALSE; 8371 } else { 8372 config_task_suspended = 8373 (gethrtime() - spa->spa_ccw_fail_time) < 8374 ((hrtime_t)zfs_ccw_retry_interval * NANOSEC); 8375 } 8376 8377 return (non_config_tasks || (config_task && !config_task_suspended)); 8378 } 8379 8380 static void 8381 spa_async_dispatch(spa_t *spa) 8382 { 8383 mutex_enter(&spa->spa_async_lock); 8384 if (spa_async_tasks_pending(spa) && 8385 !spa->spa_async_suspended && 8386 spa->spa_async_thread == NULL) 8387 spa->spa_async_thread = thread_create(NULL, 0, 8388 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 8389 mutex_exit(&spa->spa_async_lock); 8390 } 8391 8392 void 8393 spa_async_request(spa_t *spa, int task) 8394 { 8395 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 8396 mutex_enter(&spa->spa_async_lock); 8397 spa->spa_async_tasks |= task; 8398 mutex_exit(&spa->spa_async_lock); 8399 } 8400 8401 int 8402 spa_async_tasks(spa_t *spa) 8403 { 8404 return (spa->spa_async_tasks); 8405 } 8406 8407 /* 8408 * ========================================================================== 8409 * SPA syncing routines 8410 * ========================================================================== 8411 */ 8412 8413 8414 static int 8415 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 8416 dmu_tx_t *tx) 8417 { 8418 bpobj_t *bpo = arg; 8419 bpobj_enqueue(bpo, bp, bp_freed, tx); 8420 return (0); 8421 } 8422 8423 int 8424 bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 8425 { 8426 return (bpobj_enqueue_cb(arg, bp, B_FALSE, tx)); 8427 } 8428 8429 int 8430 bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 8431 { 8432 return (bpobj_enqueue_cb(arg, bp, B_TRUE, tx)); 8433 } 8434 8435 static int 8436 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 8437 { 8438 zio_t *pio = arg; 8439 8440 zio_nowait(zio_free_sync(pio, pio->io_spa, dmu_tx_get_txg(tx), bp, 8441 pio->io_flags)); 8442 return (0); 8443 } 8444 8445 static int 8446 bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 8447 dmu_tx_t *tx) 8448 { 8449 ASSERT(!bp_freed); 8450 return (spa_free_sync_cb(arg, bp, tx)); 8451 } 8452 8453 /* 8454 * Note: this simple function is not inlined to make it easier to dtrace the 8455 * amount of time spent syncing frees. 8456 */ 8457 static void 8458 spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) 8459 { 8460 zio_t *zio = zio_root(spa, NULL, NULL, 0); 8461 bplist_iterate(bpl, spa_free_sync_cb, zio, tx); 8462 VERIFY(zio_wait(zio) == 0); 8463 } 8464 8465 /* 8466 * Note: this simple function is not inlined to make it easier to dtrace the 8467 * amount of time spent syncing deferred frees. 8468 */ 8469 static void 8470 spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) 8471 { 8472 if (spa_sync_pass(spa) != 1) 8473 return; 8474 8475 /* 8476 * Note: 8477 * If the log space map feature is active, we stop deferring 8478 * frees to the next TXG and therefore running this function 8479 * would be considered a no-op as spa_deferred_bpobj should 8480 * not have any entries. 8481 * 8482 * That said we run this function anyway (instead of returning 8483 * immediately) for the edge-case scenario where we just 8484 * activated the log space map feature in this TXG but we have 8485 * deferred frees from the previous TXG. 8486 */ 8487 zio_t *zio = zio_root(spa, NULL, NULL, 0); 8488 VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, 8489 bpobj_spa_free_sync_cb, zio, tx), ==, 0); 8490 VERIFY0(zio_wait(zio)); 8491 } 8492 8493 static void 8494 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 8495 { 8496 char *packed = NULL; 8497 size_t bufsize; 8498 size_t nvsize = 0; 8499 dmu_buf_t *db; 8500 8501 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 8502 8503 /* 8504 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 8505 * information. This avoids the dmu_buf_will_dirty() path and 8506 * saves us a pre-read to get data we don't actually care about. 8507 */ 8508 bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); 8509 packed = vmem_alloc(bufsize, KM_SLEEP); 8510 8511 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 8512 KM_SLEEP) == 0); 8513 memset(packed + nvsize, 0, bufsize - nvsize); 8514 8515 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 8516 8517 vmem_free(packed, bufsize); 8518 8519 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 8520 dmu_buf_will_dirty(db, tx); 8521 *(uint64_t *)db->db_data = nvsize; 8522 dmu_buf_rele(db, FTAG); 8523 } 8524 8525 static void 8526 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 8527 const char *config, const char *entry) 8528 { 8529 nvlist_t *nvroot; 8530 nvlist_t **list; 8531 int i; 8532 8533 if (!sav->sav_sync) 8534 return; 8535 8536 /* 8537 * Update the MOS nvlist describing the list of available devices. 8538 * spa_validate_aux() will have already made sure this nvlist is 8539 * valid and the vdevs are labeled appropriately. 8540 */ 8541 if (sav->sav_object == 0) { 8542 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 8543 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 8544 sizeof (uint64_t), tx); 8545 VERIFY(zap_update(spa->spa_meta_objset, 8546 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 8547 &sav->sav_object, tx) == 0); 8548 } 8549 8550 nvroot = fnvlist_alloc(); 8551 if (sav->sav_count == 0) { 8552 fnvlist_add_nvlist_array(nvroot, config, 8553 (const nvlist_t * const *)NULL, 0); 8554 } else { 8555 list = kmem_alloc(sav->sav_count*sizeof (void *), KM_SLEEP); 8556 for (i = 0; i < sav->sav_count; i++) 8557 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 8558 B_FALSE, VDEV_CONFIG_L2CACHE); 8559 fnvlist_add_nvlist_array(nvroot, config, 8560 (const nvlist_t * const *)list, sav->sav_count); 8561 for (i = 0; i < sav->sav_count; i++) 8562 nvlist_free(list[i]); 8563 kmem_free(list, sav->sav_count * sizeof (void *)); 8564 } 8565 8566 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 8567 nvlist_free(nvroot); 8568 8569 sav->sav_sync = B_FALSE; 8570 } 8571 8572 /* 8573 * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t. 8574 * The all-vdev ZAP must be empty. 8575 */ 8576 static void 8577 spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) 8578 { 8579 spa_t *spa = vd->vdev_spa; 8580 8581 if (vd->vdev_top_zap != 0) { 8582 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 8583 vd->vdev_top_zap, tx)); 8584 } 8585 if (vd->vdev_leaf_zap != 0) { 8586 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 8587 vd->vdev_leaf_zap, tx)); 8588 } 8589 for (uint64_t i = 0; i < vd->vdev_children; i++) { 8590 spa_avz_build(vd->vdev_child[i], avz, tx); 8591 } 8592 } 8593 8594 static void 8595 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 8596 { 8597 nvlist_t *config; 8598 8599 /* 8600 * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS, 8601 * its config may not be dirty but we still need to build per-vdev ZAPs. 8602 * Similarly, if the pool is being assembled (e.g. after a split), we 8603 * need to rebuild the AVZ although the config may not be dirty. 8604 */ 8605 if (list_is_empty(&spa->spa_config_dirty_list) && 8606 spa->spa_avz_action == AVZ_ACTION_NONE) 8607 return; 8608 8609 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 8610 8611 ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE || 8612 spa->spa_avz_action == AVZ_ACTION_INITIALIZE || 8613 spa->spa_all_vdev_zaps != 0); 8614 8615 if (spa->spa_avz_action == AVZ_ACTION_REBUILD) { 8616 /* Make and build the new AVZ */ 8617 uint64_t new_avz = zap_create(spa->spa_meta_objset, 8618 DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); 8619 spa_avz_build(spa->spa_root_vdev, new_avz, tx); 8620 8621 /* Diff old AVZ with new one */ 8622 zap_cursor_t zc; 8623 zap_attribute_t za; 8624 8625 for (zap_cursor_init(&zc, spa->spa_meta_objset, 8626 spa->spa_all_vdev_zaps); 8627 zap_cursor_retrieve(&zc, &za) == 0; 8628 zap_cursor_advance(&zc)) { 8629 uint64_t vdzap = za.za_first_integer; 8630 if (zap_lookup_int(spa->spa_meta_objset, new_avz, 8631 vdzap) == ENOENT) { 8632 /* 8633 * ZAP is listed in old AVZ but not in new one; 8634 * destroy it 8635 */ 8636 VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap, 8637 tx)); 8638 } 8639 } 8640 8641 zap_cursor_fini(&zc); 8642 8643 /* Destroy the old AVZ */ 8644 VERIFY0(zap_destroy(spa->spa_meta_objset, 8645 spa->spa_all_vdev_zaps, tx)); 8646 8647 /* Replace the old AVZ in the dir obj with the new one */ 8648 VERIFY0(zap_update(spa->spa_meta_objset, 8649 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, 8650 sizeof (new_avz), 1, &new_avz, tx)); 8651 8652 spa->spa_all_vdev_zaps = new_avz; 8653 } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) { 8654 zap_cursor_t zc; 8655 zap_attribute_t za; 8656 8657 /* Walk through the AVZ and destroy all listed ZAPs */ 8658 for (zap_cursor_init(&zc, spa->spa_meta_objset, 8659 spa->spa_all_vdev_zaps); 8660 zap_cursor_retrieve(&zc, &za) == 0; 8661 zap_cursor_advance(&zc)) { 8662 uint64_t zap = za.za_first_integer; 8663 VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx)); 8664 } 8665 8666 zap_cursor_fini(&zc); 8667 8668 /* Destroy and unlink the AVZ itself */ 8669 VERIFY0(zap_destroy(spa->spa_meta_objset, 8670 spa->spa_all_vdev_zaps, tx)); 8671 VERIFY0(zap_remove(spa->spa_meta_objset, 8672 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx)); 8673 spa->spa_all_vdev_zaps = 0; 8674 } 8675 8676 if (spa->spa_all_vdev_zaps == 0) { 8677 spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset, 8678 DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, 8679 DMU_POOL_VDEV_ZAP_MAP, tx); 8680 } 8681 spa->spa_avz_action = AVZ_ACTION_NONE; 8682 8683 /* Create ZAPs for vdevs that don't have them. */ 8684 vdev_construct_zaps(spa->spa_root_vdev, tx); 8685 8686 config = spa_config_generate(spa, spa->spa_root_vdev, 8687 dmu_tx_get_txg(tx), B_FALSE); 8688 8689 /* 8690 * If we're upgrading the spa version then make sure that 8691 * the config object gets updated with the correct version. 8692 */ 8693 if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) 8694 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 8695 spa->spa_uberblock.ub_version); 8696 8697 spa_config_exit(spa, SCL_STATE, FTAG); 8698 8699 nvlist_free(spa->spa_config_syncing); 8700 spa->spa_config_syncing = config; 8701 8702 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 8703 } 8704 8705 static void 8706 spa_sync_version(void *arg, dmu_tx_t *tx) 8707 { 8708 uint64_t *versionp = arg; 8709 uint64_t version = *versionp; 8710 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 8711 8712 /* 8713 * Setting the version is special cased when first creating the pool. 8714 */ 8715 ASSERT(tx->tx_txg != TXG_INITIAL); 8716 8717 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 8718 ASSERT(version >= spa_version(spa)); 8719 8720 spa->spa_uberblock.ub_version = version; 8721 vdev_config_dirty(spa->spa_root_vdev); 8722 spa_history_log_internal(spa, "set", tx, "version=%lld", 8723 (longlong_t)version); 8724 } 8725 8726 /* 8727 * Set zpool properties. 8728 */ 8729 static void 8730 spa_sync_props(void *arg, dmu_tx_t *tx) 8731 { 8732 nvlist_t *nvp = arg; 8733 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 8734 objset_t *mos = spa->spa_meta_objset; 8735 nvpair_t *elem = NULL; 8736 8737 mutex_enter(&spa->spa_props_lock); 8738 8739 while ((elem = nvlist_next_nvpair(nvp, elem))) { 8740 uint64_t intval; 8741 char *strval, *fname; 8742 zpool_prop_t prop; 8743 const char *propname; 8744 zprop_type_t proptype; 8745 spa_feature_t fid; 8746 8747 switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 8748 case ZPOOL_PROP_INVAL: 8749 /* 8750 * We checked this earlier in spa_prop_validate(). 8751 */ 8752 ASSERT(zpool_prop_feature(nvpair_name(elem))); 8753 8754 fname = strchr(nvpair_name(elem), '@') + 1; 8755 VERIFY0(zfeature_lookup_name(fname, &fid)); 8756 8757 spa_feature_enable(spa, fid, tx); 8758 spa_history_log_internal(spa, "set", tx, 8759 "%s=enabled", nvpair_name(elem)); 8760 break; 8761 8762 case ZPOOL_PROP_VERSION: 8763 intval = fnvpair_value_uint64(elem); 8764 /* 8765 * The version is synced separately before other 8766 * properties and should be correct by now. 8767 */ 8768 ASSERT3U(spa_version(spa), >=, intval); 8769 break; 8770 8771 case ZPOOL_PROP_ALTROOT: 8772 /* 8773 * 'altroot' is a non-persistent property. It should 8774 * have been set temporarily at creation or import time. 8775 */ 8776 ASSERT(spa->spa_root != NULL); 8777 break; 8778 8779 case ZPOOL_PROP_READONLY: 8780 case ZPOOL_PROP_CACHEFILE: 8781 /* 8782 * 'readonly' and 'cachefile' are also non-persistent 8783 * properties. 8784 */ 8785 break; 8786 case ZPOOL_PROP_COMMENT: 8787 strval = fnvpair_value_string(elem); 8788 if (spa->spa_comment != NULL) 8789 spa_strfree(spa->spa_comment); 8790 spa->spa_comment = spa_strdup(strval); 8791 /* 8792 * We need to dirty the configuration on all the vdevs 8793 * so that their labels get updated. We also need to 8794 * update the cache file to keep it in sync with the 8795 * MOS version. It's unnecessary to do this for pool 8796 * creation since the vdev's configuration has already 8797 * been dirtied. 8798 */ 8799 if (tx->tx_txg != TXG_INITIAL) { 8800 vdev_config_dirty(spa->spa_root_vdev); 8801 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 8802 } 8803 spa_history_log_internal(spa, "set", tx, 8804 "%s=%s", nvpair_name(elem), strval); 8805 break; 8806 case ZPOOL_PROP_COMPATIBILITY: 8807 strval = fnvpair_value_string(elem); 8808 if (spa->spa_compatibility != NULL) 8809 spa_strfree(spa->spa_compatibility); 8810 spa->spa_compatibility = spa_strdup(strval); 8811 /* 8812 * Dirty the configuration on vdevs as above. 8813 */ 8814 if (tx->tx_txg != TXG_INITIAL) { 8815 vdev_config_dirty(spa->spa_root_vdev); 8816 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 8817 } 8818 8819 spa_history_log_internal(spa, "set", tx, 8820 "%s=%s", nvpair_name(elem), strval); 8821 break; 8822 8823 default: 8824 /* 8825 * Set pool property values in the poolprops mos object. 8826 */ 8827 if (spa->spa_pool_props_object == 0) { 8828 spa->spa_pool_props_object = 8829 zap_create_link(mos, DMU_OT_POOL_PROPS, 8830 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 8831 tx); 8832 } 8833 8834 /* normalize the property name */ 8835 propname = zpool_prop_to_name(prop); 8836 proptype = zpool_prop_get_type(prop); 8837 8838 if (nvpair_type(elem) == DATA_TYPE_STRING) { 8839 ASSERT(proptype == PROP_TYPE_STRING); 8840 strval = fnvpair_value_string(elem); 8841 VERIFY0(zap_update(mos, 8842 spa->spa_pool_props_object, propname, 8843 1, strlen(strval) + 1, strval, tx)); 8844 spa_history_log_internal(spa, "set", tx, 8845 "%s=%s", nvpair_name(elem), strval); 8846 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 8847 intval = fnvpair_value_uint64(elem); 8848 8849 if (proptype == PROP_TYPE_INDEX) { 8850 const char *unused; 8851 VERIFY0(zpool_prop_index_to_string( 8852 prop, intval, &unused)); 8853 } 8854 VERIFY0(zap_update(mos, 8855 spa->spa_pool_props_object, propname, 8856 8, 1, &intval, tx)); 8857 spa_history_log_internal(spa, "set", tx, 8858 "%s=%lld", nvpair_name(elem), 8859 (longlong_t)intval); 8860 } else { 8861 ASSERT(0); /* not allowed */ 8862 } 8863 8864 switch (prop) { 8865 case ZPOOL_PROP_DELEGATION: 8866 spa->spa_delegation = intval; 8867 break; 8868 case ZPOOL_PROP_BOOTFS: 8869 spa->spa_bootfs = intval; 8870 break; 8871 case ZPOOL_PROP_FAILUREMODE: 8872 spa->spa_failmode = intval; 8873 break; 8874 case ZPOOL_PROP_AUTOTRIM: 8875 spa->spa_autotrim = intval; 8876 spa_async_request(spa, 8877 SPA_ASYNC_AUTOTRIM_RESTART); 8878 break; 8879 case ZPOOL_PROP_AUTOEXPAND: 8880 spa->spa_autoexpand = intval; 8881 if (tx->tx_txg != TXG_INITIAL) 8882 spa_async_request(spa, 8883 SPA_ASYNC_AUTOEXPAND); 8884 break; 8885 case ZPOOL_PROP_MULTIHOST: 8886 spa->spa_multihost = intval; 8887 break; 8888 default: 8889 break; 8890 } 8891 } 8892 8893 } 8894 8895 mutex_exit(&spa->spa_props_lock); 8896 } 8897 8898 /* 8899 * Perform one-time upgrade on-disk changes. spa_version() does not 8900 * reflect the new version this txg, so there must be no changes this 8901 * txg to anything that the upgrade code depends on after it executes. 8902 * Therefore this must be called after dsl_pool_sync() does the sync 8903 * tasks. 8904 */ 8905 static void 8906 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) 8907 { 8908 if (spa_sync_pass(spa) != 1) 8909 return; 8910 8911 dsl_pool_t *dp = spa->spa_dsl_pool; 8912 rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); 8913 8914 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 8915 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 8916 dsl_pool_create_origin(dp, tx); 8917 8918 /* Keeping the origin open increases spa_minref */ 8919 spa->spa_minref += 3; 8920 } 8921 8922 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 8923 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 8924 dsl_pool_upgrade_clones(dp, tx); 8925 } 8926 8927 if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && 8928 spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { 8929 dsl_pool_upgrade_dir_clones(dp, tx); 8930 8931 /* Keeping the freedir open increases spa_minref */ 8932 spa->spa_minref += 3; 8933 } 8934 8935 if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && 8936 spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 8937 spa_feature_create_zap_objects(spa, tx); 8938 } 8939 8940 /* 8941 * LZ4_COMPRESS feature's behaviour was changed to activate_on_enable 8942 * when possibility to use lz4 compression for metadata was added 8943 * Old pools that have this feature enabled must be upgraded to have 8944 * this feature active 8945 */ 8946 if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 8947 boolean_t lz4_en = spa_feature_is_enabled(spa, 8948 SPA_FEATURE_LZ4_COMPRESS); 8949 boolean_t lz4_ac = spa_feature_is_active(spa, 8950 SPA_FEATURE_LZ4_COMPRESS); 8951 8952 if (lz4_en && !lz4_ac) 8953 spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); 8954 } 8955 8956 /* 8957 * If we haven't written the salt, do so now. Note that the 8958 * feature may not be activated yet, but that's fine since 8959 * the presence of this ZAP entry is backwards compatible. 8960 */ 8961 if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 8962 DMU_POOL_CHECKSUM_SALT) == ENOENT) { 8963 VERIFY0(zap_add(spa->spa_meta_objset, 8964 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, 8965 sizeof (spa->spa_cksum_salt.zcs_bytes), 8966 spa->spa_cksum_salt.zcs_bytes, tx)); 8967 } 8968 8969 rrw_exit(&dp->dp_config_rwlock, FTAG); 8970 } 8971 8972 static void 8973 vdev_indirect_state_sync_verify(vdev_t *vd) 8974 { 8975 vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping; 8976 vdev_indirect_births_t *vib __maybe_unused = vd->vdev_indirect_births; 8977 8978 if (vd->vdev_ops == &vdev_indirect_ops) { 8979 ASSERT(vim != NULL); 8980 ASSERT(vib != NULL); 8981 } 8982 8983 uint64_t obsolete_sm_object = 0; 8984 ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); 8985 if (obsolete_sm_object != 0) { 8986 ASSERT(vd->vdev_obsolete_sm != NULL); 8987 ASSERT(vd->vdev_removing || 8988 vd->vdev_ops == &vdev_indirect_ops); 8989 ASSERT(vdev_indirect_mapping_num_entries(vim) > 0); 8990 ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0); 8991 ASSERT3U(obsolete_sm_object, ==, 8992 space_map_object(vd->vdev_obsolete_sm)); 8993 ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=, 8994 space_map_allocated(vd->vdev_obsolete_sm)); 8995 } 8996 ASSERT(vd->vdev_obsolete_segments != NULL); 8997 8998 /* 8999 * Since frees / remaps to an indirect vdev can only 9000 * happen in syncing context, the obsolete segments 9001 * tree must be empty when we start syncing. 9002 */ 9003 ASSERT0(range_tree_space(vd->vdev_obsolete_segments)); 9004 } 9005 9006 /* 9007 * Set the top-level vdev's max queue depth. Evaluate each top-level's 9008 * async write queue depth in case it changed. The max queue depth will 9009 * not change in the middle of syncing out this txg. 9010 */ 9011 static void 9012 spa_sync_adjust_vdev_max_queue_depth(spa_t *spa) 9013 { 9014 ASSERT(spa_writeable(spa)); 9015 9016 vdev_t *rvd = spa->spa_root_vdev; 9017 uint32_t max_queue_depth = zfs_vdev_async_write_max_active * 9018 zfs_vdev_queue_depth_pct / 100; 9019 metaslab_class_t *normal = spa_normal_class(spa); 9020 metaslab_class_t *special = spa_special_class(spa); 9021 metaslab_class_t *dedup = spa_dedup_class(spa); 9022 9023 uint64_t slots_per_allocator = 0; 9024 for (int c = 0; c < rvd->vdev_children; c++) { 9025 vdev_t *tvd = rvd->vdev_child[c]; 9026 9027 metaslab_group_t *mg = tvd->vdev_mg; 9028 if (mg == NULL || !metaslab_group_initialized(mg)) 9029 continue; 9030 9031 metaslab_class_t *mc = mg->mg_class; 9032 if (mc != normal && mc != special && mc != dedup) 9033 continue; 9034 9035 /* 9036 * It is safe to do a lock-free check here because only async 9037 * allocations look at mg_max_alloc_queue_depth, and async 9038 * allocations all happen from spa_sync(). 9039 */ 9040 for (int i = 0; i < mg->mg_allocators; i++) { 9041 ASSERT0(zfs_refcount_count( 9042 &(mg->mg_allocator[i].mga_alloc_queue_depth))); 9043 } 9044 mg->mg_max_alloc_queue_depth = max_queue_depth; 9045 9046 for (int i = 0; i < mg->mg_allocators; i++) { 9047 mg->mg_allocator[i].mga_cur_max_alloc_queue_depth = 9048 zfs_vdev_def_queue_depth; 9049 } 9050 slots_per_allocator += zfs_vdev_def_queue_depth; 9051 } 9052 9053 for (int i = 0; i < spa->spa_alloc_count; i++) { 9054 ASSERT0(zfs_refcount_count(&normal->mc_allocator[i]. 9055 mca_alloc_slots)); 9056 ASSERT0(zfs_refcount_count(&special->mc_allocator[i]. 9057 mca_alloc_slots)); 9058 ASSERT0(zfs_refcount_count(&dedup->mc_allocator[i]. 9059 mca_alloc_slots)); 9060 normal->mc_allocator[i].mca_alloc_max_slots = 9061 slots_per_allocator; 9062 special->mc_allocator[i].mca_alloc_max_slots = 9063 slots_per_allocator; 9064 dedup->mc_allocator[i].mca_alloc_max_slots = 9065 slots_per_allocator; 9066 } 9067 normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; 9068 special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; 9069 dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; 9070 } 9071 9072 static void 9073 spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx) 9074 { 9075 ASSERT(spa_writeable(spa)); 9076 9077 vdev_t *rvd = spa->spa_root_vdev; 9078 for (int c = 0; c < rvd->vdev_children; c++) { 9079 vdev_t *vd = rvd->vdev_child[c]; 9080 vdev_indirect_state_sync_verify(vd); 9081 9082 if (vdev_indirect_should_condense(vd)) { 9083 spa_condense_indirect_start_sync(vd, tx); 9084 break; 9085 } 9086 } 9087 } 9088 9089 static void 9090 spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) 9091 { 9092 objset_t *mos = spa->spa_meta_objset; 9093 dsl_pool_t *dp = spa->spa_dsl_pool; 9094 uint64_t txg = tx->tx_txg; 9095 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; 9096 9097 do { 9098 int pass = ++spa->spa_sync_pass; 9099 9100 spa_sync_config_object(spa, tx); 9101 spa_sync_aux_dev(spa, &spa->spa_spares, tx, 9102 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 9103 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 9104 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 9105 spa_errlog_sync(spa, txg); 9106 dsl_pool_sync(dp, txg); 9107 9108 if (pass < zfs_sync_pass_deferred_free || 9109 spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { 9110 /* 9111 * If the log space map feature is active we don't 9112 * care about deferred frees and the deferred bpobj 9113 * as the log space map should effectively have the 9114 * same results (i.e. appending only to one object). 9115 */ 9116 spa_sync_frees(spa, free_bpl, tx); 9117 } else { 9118 /* 9119 * We can not defer frees in pass 1, because 9120 * we sync the deferred frees later in pass 1. 9121 */ 9122 ASSERT3U(pass, >, 1); 9123 bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb, 9124 &spa->spa_deferred_bpobj, tx); 9125 } 9126 9127 ddt_sync(spa, txg); 9128 dsl_scan_sync(dp, tx); 9129 svr_sync(spa, tx); 9130 spa_sync_upgrades(spa, tx); 9131 9132 spa_flush_metaslabs(spa, tx); 9133 9134 vdev_t *vd = NULL; 9135 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 9136 != NULL) 9137 vdev_sync(vd, txg); 9138 9139 /* 9140 * Note: We need to check if the MOS is dirty because we could 9141 * have marked the MOS dirty without updating the uberblock 9142 * (e.g. if we have sync tasks but no dirty user data). We need 9143 * to check the uberblock's rootbp because it is updated if we 9144 * have synced out dirty data (though in this case the MOS will 9145 * most likely also be dirty due to second order effects, we 9146 * don't want to rely on that here). 9147 */ 9148 if (pass == 1 && 9149 spa->spa_uberblock.ub_rootbp.blk_birth < txg && 9150 !dmu_objset_is_dirty(mos, txg)) { 9151 /* 9152 * Nothing changed on the first pass, therefore this 9153 * TXG is a no-op. Avoid syncing deferred frees, so 9154 * that we can keep this TXG as a no-op. 9155 */ 9156 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 9157 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 9158 ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); 9159 ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg)); 9160 break; 9161 } 9162 9163 spa_sync_deferred_frees(spa, tx); 9164 } while (dmu_objset_is_dirty(mos, txg)); 9165 } 9166 9167 /* 9168 * Rewrite the vdev configuration (which includes the uberblock) to 9169 * commit the transaction group. 9170 * 9171 * If there are no dirty vdevs, we sync the uberblock to a few random 9172 * top-level vdevs that are known to be visible in the config cache 9173 * (see spa_vdev_add() for a complete description). If there *are* dirty 9174 * vdevs, sync the uberblock to all vdevs. 9175 */ 9176 static void 9177 spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx) 9178 { 9179 vdev_t *rvd = spa->spa_root_vdev; 9180 uint64_t txg = tx->tx_txg; 9181 9182 for (;;) { 9183 int error = 0; 9184 9185 /* 9186 * We hold SCL_STATE to prevent vdev open/close/etc. 9187 * while we're attempting to write the vdev labels. 9188 */ 9189 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 9190 9191 if (list_is_empty(&spa->spa_config_dirty_list)) { 9192 vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; 9193 int svdcount = 0; 9194 int children = rvd->vdev_children; 9195 int c0 = random_in_range(children); 9196 9197 for (int c = 0; c < children; c++) { 9198 vdev_t *vd = 9199 rvd->vdev_child[(c0 + c) % children]; 9200 9201 /* Stop when revisiting the first vdev */ 9202 if (c > 0 && svd[0] == vd) 9203 break; 9204 9205 if (vd->vdev_ms_array == 0 || 9206 vd->vdev_islog || 9207 !vdev_is_concrete(vd)) 9208 continue; 9209 9210 svd[svdcount++] = vd; 9211 if (svdcount == SPA_SYNC_MIN_VDEVS) 9212 break; 9213 } 9214 error = vdev_config_sync(svd, svdcount, txg); 9215 } else { 9216 error = vdev_config_sync(rvd->vdev_child, 9217 rvd->vdev_children, txg); 9218 } 9219 9220 if (error == 0) 9221 spa->spa_last_synced_guid = rvd->vdev_guid; 9222 9223 spa_config_exit(spa, SCL_STATE, FTAG); 9224 9225 if (error == 0) 9226 break; 9227 zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR); 9228 zio_resume_wait(spa); 9229 } 9230 } 9231 9232 /* 9233 * Sync the specified transaction group. New blocks may be dirtied as 9234 * part of the process, so we iterate until it converges. 9235 */ 9236 void 9237 spa_sync(spa_t *spa, uint64_t txg) 9238 { 9239 vdev_t *vd = NULL; 9240 9241 VERIFY(spa_writeable(spa)); 9242 9243 /* 9244 * Wait for i/os issued in open context that need to complete 9245 * before this txg syncs. 9246 */ 9247 (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]); 9248 spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, 9249 ZIO_FLAG_CANFAIL); 9250 9251 /* 9252 * Lock out configuration changes. 9253 */ 9254 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 9255 9256 spa->spa_syncing_txg = txg; 9257 spa->spa_sync_pass = 0; 9258 9259 for (int i = 0; i < spa->spa_alloc_count; i++) { 9260 mutex_enter(&spa->spa_allocs[i].spaa_lock); 9261 VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree)); 9262 mutex_exit(&spa->spa_allocs[i].spaa_lock); 9263 } 9264 9265 /* 9266 * If there are any pending vdev state changes, convert them 9267 * into config changes that go out with this transaction group. 9268 */ 9269 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 9270 while (list_head(&spa->spa_state_dirty_list) != NULL) { 9271 /* 9272 * We need the write lock here because, for aux vdevs, 9273 * calling vdev_config_dirty() modifies sav_config. 9274 * This is ugly and will become unnecessary when we 9275 * eliminate the aux vdev wart by integrating all vdevs 9276 * into the root vdev tree. 9277 */ 9278 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 9279 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 9280 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 9281 vdev_state_clean(vd); 9282 vdev_config_dirty(vd); 9283 } 9284 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 9285 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 9286 } 9287 spa_config_exit(spa, SCL_STATE, FTAG); 9288 9289 dsl_pool_t *dp = spa->spa_dsl_pool; 9290 dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); 9291 9292 spa->spa_sync_starttime = gethrtime(); 9293 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); 9294 spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, 9295 spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + 9296 NSEC_TO_TICK(spa->spa_deadman_synctime)); 9297 9298 /* 9299 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 9300 * set spa_deflate if we have no raid-z vdevs. 9301 */ 9302 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 9303 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 9304 vdev_t *rvd = spa->spa_root_vdev; 9305 9306 int i; 9307 for (i = 0; i < rvd->vdev_children; i++) { 9308 vd = rvd->vdev_child[i]; 9309 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 9310 break; 9311 } 9312 if (i == rvd->vdev_children) { 9313 spa->spa_deflate = TRUE; 9314 VERIFY0(zap_add(spa->spa_meta_objset, 9315 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 9316 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 9317 } 9318 } 9319 9320 spa_sync_adjust_vdev_max_queue_depth(spa); 9321 9322 spa_sync_condense_indirect(spa, tx); 9323 9324 spa_sync_iterate_to_convergence(spa, tx); 9325 9326 #ifdef ZFS_DEBUG 9327 if (!list_is_empty(&spa->spa_config_dirty_list)) { 9328 /* 9329 * Make sure that the number of ZAPs for all the vdevs matches 9330 * the number of ZAPs in the per-vdev ZAP list. This only gets 9331 * called if the config is dirty; otherwise there may be 9332 * outstanding AVZ operations that weren't completed in 9333 * spa_sync_config_object. 9334 */ 9335 uint64_t all_vdev_zap_entry_count; 9336 ASSERT0(zap_count(spa->spa_meta_objset, 9337 spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); 9338 ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, 9339 all_vdev_zap_entry_count); 9340 } 9341 #endif 9342 9343 if (spa->spa_vdev_removal != NULL) { 9344 ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]); 9345 } 9346 9347 spa_sync_rewrite_vdev_config(spa, tx); 9348 dmu_tx_commit(tx); 9349 9350 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); 9351 spa->spa_deadman_tqid = 0; 9352 9353 /* 9354 * Clear the dirty config list. 9355 */ 9356 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 9357 vdev_config_clean(vd); 9358 9359 /* 9360 * Now that the new config has synced transactionally, 9361 * let it become visible to the config cache. 9362 */ 9363 if (spa->spa_config_syncing != NULL) { 9364 spa_config_set(spa, spa->spa_config_syncing); 9365 spa->spa_config_txg = txg; 9366 spa->spa_config_syncing = NULL; 9367 } 9368 9369 dsl_pool_sync_done(dp, txg); 9370 9371 for (int i = 0; i < spa->spa_alloc_count; i++) { 9372 mutex_enter(&spa->spa_allocs[i].spaa_lock); 9373 VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree)); 9374 mutex_exit(&spa->spa_allocs[i].spaa_lock); 9375 } 9376 9377 /* 9378 * Update usable space statistics. 9379 */ 9380 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 9381 != NULL) 9382 vdev_sync_done(vd, txg); 9383 9384 metaslab_class_evict_old(spa->spa_normal_class, txg); 9385 metaslab_class_evict_old(spa->spa_log_class, txg); 9386 9387 spa_sync_close_syncing_log_sm(spa); 9388 9389 spa_update_dspace(spa); 9390 9391 /* 9392 * It had better be the case that we didn't dirty anything 9393 * since vdev_config_sync(). 9394 */ 9395 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 9396 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 9397 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 9398 9399 while (zfs_pause_spa_sync) 9400 delay(1); 9401 9402 spa->spa_sync_pass = 0; 9403 9404 /* 9405 * Update the last synced uberblock here. We want to do this at 9406 * the end of spa_sync() so that consumers of spa_last_synced_txg() 9407 * will be guaranteed that all the processing associated with 9408 * that txg has been completed. 9409 */ 9410 spa->spa_ubsync = spa->spa_uberblock; 9411 spa_config_exit(spa, SCL_CONFIG, FTAG); 9412 9413 spa_handle_ignored_writes(spa); 9414 9415 /* 9416 * If any async tasks have been requested, kick them off. 9417 */ 9418 spa_async_dispatch(spa); 9419 } 9420 9421 /* 9422 * Sync all pools. We don't want to hold the namespace lock across these 9423 * operations, so we take a reference on the spa_t and drop the lock during the 9424 * sync. 9425 */ 9426 void 9427 spa_sync_allpools(void) 9428 { 9429 spa_t *spa = NULL; 9430 mutex_enter(&spa_namespace_lock); 9431 while ((spa = spa_next(spa)) != NULL) { 9432 if (spa_state(spa) != POOL_STATE_ACTIVE || 9433 !spa_writeable(spa) || spa_suspended(spa)) 9434 continue; 9435 spa_open_ref(spa, FTAG); 9436 mutex_exit(&spa_namespace_lock); 9437 txg_wait_synced(spa_get_dsl(spa), 0); 9438 mutex_enter(&spa_namespace_lock); 9439 spa_close(spa, FTAG); 9440 } 9441 mutex_exit(&spa_namespace_lock); 9442 } 9443 9444 /* 9445 * ========================================================================== 9446 * Miscellaneous routines 9447 * ========================================================================== 9448 */ 9449 9450 /* 9451 * Remove all pools in the system. 9452 */ 9453 void 9454 spa_evict_all(void) 9455 { 9456 spa_t *spa; 9457 9458 /* 9459 * Remove all cached state. All pools should be closed now, 9460 * so every spa in the AVL tree should be unreferenced. 9461 */ 9462 mutex_enter(&spa_namespace_lock); 9463 while ((spa = spa_next(NULL)) != NULL) { 9464 /* 9465 * Stop async tasks. The async thread may need to detach 9466 * a device that's been replaced, which requires grabbing 9467 * spa_namespace_lock, so we must drop it here. 9468 */ 9469 spa_open_ref(spa, FTAG); 9470 mutex_exit(&spa_namespace_lock); 9471 spa_async_suspend(spa); 9472 mutex_enter(&spa_namespace_lock); 9473 spa_close(spa, FTAG); 9474 9475 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 9476 spa_unload(spa); 9477 spa_deactivate(spa); 9478 } 9479 spa_remove(spa); 9480 } 9481 mutex_exit(&spa_namespace_lock); 9482 } 9483 9484 vdev_t * 9485 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 9486 { 9487 vdev_t *vd; 9488 int i; 9489 9490 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 9491 return (vd); 9492 9493 if (aux) { 9494 for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 9495 vd = spa->spa_l2cache.sav_vdevs[i]; 9496 if (vd->vdev_guid == guid) 9497 return (vd); 9498 } 9499 9500 for (i = 0; i < spa->spa_spares.sav_count; i++) { 9501 vd = spa->spa_spares.sav_vdevs[i]; 9502 if (vd->vdev_guid == guid) 9503 return (vd); 9504 } 9505 } 9506 9507 return (NULL); 9508 } 9509 9510 void 9511 spa_upgrade(spa_t *spa, uint64_t version) 9512 { 9513 ASSERT(spa_writeable(spa)); 9514 9515 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 9516 9517 /* 9518 * This should only be called for a non-faulted pool, and since a 9519 * future version would result in an unopenable pool, this shouldn't be 9520 * possible. 9521 */ 9522 ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); 9523 ASSERT3U(version, >=, spa->spa_uberblock.ub_version); 9524 9525 spa->spa_uberblock.ub_version = version; 9526 vdev_config_dirty(spa->spa_root_vdev); 9527 9528 spa_config_exit(spa, SCL_ALL, FTAG); 9529 9530 txg_wait_synced(spa_get_dsl(spa), 0); 9531 } 9532 9533 static boolean_t 9534 spa_has_aux_vdev(spa_t *spa, uint64_t guid, spa_aux_vdev_t *sav) 9535 { 9536 (void) spa; 9537 int i; 9538 uint64_t vdev_guid; 9539 9540 for (i = 0; i < sav->sav_count; i++) 9541 if (sav->sav_vdevs[i]->vdev_guid == guid) 9542 return (B_TRUE); 9543 9544 for (i = 0; i < sav->sav_npending; i++) { 9545 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 9546 &vdev_guid) == 0 && vdev_guid == guid) 9547 return (B_TRUE); 9548 } 9549 9550 return (B_FALSE); 9551 } 9552 9553 boolean_t 9554 spa_has_l2cache(spa_t *spa, uint64_t guid) 9555 { 9556 return (spa_has_aux_vdev(spa, guid, &spa->spa_l2cache)); 9557 } 9558 9559 boolean_t 9560 spa_has_spare(spa_t *spa, uint64_t guid) 9561 { 9562 return (spa_has_aux_vdev(spa, guid, &spa->spa_spares)); 9563 } 9564 9565 /* 9566 * Check if a pool has an active shared spare device. 9567 * Note: reference count of an active spare is 2, as a spare and as a replace 9568 */ 9569 static boolean_t 9570 spa_has_active_shared_spare(spa_t *spa) 9571 { 9572 int i, refcnt; 9573 uint64_t pool; 9574 spa_aux_vdev_t *sav = &spa->spa_spares; 9575 9576 for (i = 0; i < sav->sav_count; i++) { 9577 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 9578 &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 9579 refcnt > 2) 9580 return (B_TRUE); 9581 } 9582 9583 return (B_FALSE); 9584 } 9585 9586 uint64_t 9587 spa_total_metaslabs(spa_t *spa) 9588 { 9589 vdev_t *rvd = spa->spa_root_vdev; 9590 9591 uint64_t m = 0; 9592 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 9593 vdev_t *vd = rvd->vdev_child[c]; 9594 if (!vdev_is_concrete(vd)) 9595 continue; 9596 m += vd->vdev_ms_count; 9597 } 9598 return (m); 9599 } 9600 9601 /* 9602 * Notify any waiting threads that some activity has switched from being in- 9603 * progress to not-in-progress so that the thread can wake up and determine 9604 * whether it is finished waiting. 9605 */ 9606 void 9607 spa_notify_waiters(spa_t *spa) 9608 { 9609 /* 9610 * Acquiring spa_activities_lock here prevents the cv_broadcast from 9611 * happening between the waiting thread's check and cv_wait. 9612 */ 9613 mutex_enter(&spa->spa_activities_lock); 9614 cv_broadcast(&spa->spa_activities_cv); 9615 mutex_exit(&spa->spa_activities_lock); 9616 } 9617 9618 /* 9619 * Notify any waiting threads that the pool is exporting, and then block until 9620 * they are finished using the spa_t. 9621 */ 9622 void 9623 spa_wake_waiters(spa_t *spa) 9624 { 9625 mutex_enter(&spa->spa_activities_lock); 9626 spa->spa_waiters_cancel = B_TRUE; 9627 cv_broadcast(&spa->spa_activities_cv); 9628 while (spa->spa_waiters != 0) 9629 cv_wait(&spa->spa_waiters_cv, &spa->spa_activities_lock); 9630 spa->spa_waiters_cancel = B_FALSE; 9631 mutex_exit(&spa->spa_activities_lock); 9632 } 9633 9634 /* Whether the vdev or any of its descendants are being initialized/trimmed. */ 9635 static boolean_t 9636 spa_vdev_activity_in_progress_impl(vdev_t *vd, zpool_wait_activity_t activity) 9637 { 9638 spa_t *spa = vd->vdev_spa; 9639 9640 ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER)); 9641 ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); 9642 ASSERT(activity == ZPOOL_WAIT_INITIALIZE || 9643 activity == ZPOOL_WAIT_TRIM); 9644 9645 kmutex_t *lock = activity == ZPOOL_WAIT_INITIALIZE ? 9646 &vd->vdev_initialize_lock : &vd->vdev_trim_lock; 9647 9648 mutex_exit(&spa->spa_activities_lock); 9649 mutex_enter(lock); 9650 mutex_enter(&spa->spa_activities_lock); 9651 9652 boolean_t in_progress = (activity == ZPOOL_WAIT_INITIALIZE) ? 9653 (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) : 9654 (vd->vdev_trim_state == VDEV_TRIM_ACTIVE); 9655 mutex_exit(lock); 9656 9657 if (in_progress) 9658 return (B_TRUE); 9659 9660 for (int i = 0; i < vd->vdev_children; i++) { 9661 if (spa_vdev_activity_in_progress_impl(vd->vdev_child[i], 9662 activity)) 9663 return (B_TRUE); 9664 } 9665 9666 return (B_FALSE); 9667 } 9668 9669 /* 9670 * If use_guid is true, this checks whether the vdev specified by guid is 9671 * being initialized/trimmed. Otherwise, it checks whether any vdev in the pool 9672 * is being initialized/trimmed. The caller must hold the config lock and 9673 * spa_activities_lock. 9674 */ 9675 static int 9676 spa_vdev_activity_in_progress(spa_t *spa, boolean_t use_guid, uint64_t guid, 9677 zpool_wait_activity_t activity, boolean_t *in_progress) 9678 { 9679 mutex_exit(&spa->spa_activities_lock); 9680 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 9681 mutex_enter(&spa->spa_activities_lock); 9682 9683 vdev_t *vd; 9684 if (use_guid) { 9685 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 9686 if (vd == NULL || !vd->vdev_ops->vdev_op_leaf) { 9687 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 9688 return (EINVAL); 9689 } 9690 } else { 9691 vd = spa->spa_root_vdev; 9692 } 9693 9694 *in_progress = spa_vdev_activity_in_progress_impl(vd, activity); 9695 9696 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 9697 return (0); 9698 } 9699 9700 /* 9701 * Locking for waiting threads 9702 * --------------------------- 9703 * 9704 * Waiting threads need a way to check whether a given activity is in progress, 9705 * and then, if it is, wait for it to complete. Each activity will have some 9706 * in-memory representation of the relevant on-disk state which can be used to 9707 * determine whether or not the activity is in progress. The in-memory state and 9708 * the locking used to protect it will be different for each activity, and may 9709 * not be suitable for use with a cvar (e.g., some state is protected by the 9710 * config lock). To allow waiting threads to wait without any races, another 9711 * lock, spa_activities_lock, is used. 9712 * 9713 * When the state is checked, both the activity-specific lock (if there is one) 9714 * and spa_activities_lock are held. In some cases, the activity-specific lock 9715 * is acquired explicitly (e.g. the config lock). In others, the locking is 9716 * internal to some check (e.g. bpobj_is_empty). After checking, the waiting 9717 * thread releases the activity-specific lock and, if the activity is in 9718 * progress, then cv_waits using spa_activities_lock. 9719 * 9720 * The waiting thread is woken when another thread, one completing some 9721 * activity, updates the state of the activity and then calls 9722 * spa_notify_waiters, which will cv_broadcast. This 'completing' thread only 9723 * needs to hold its activity-specific lock when updating the state, and this 9724 * lock can (but doesn't have to) be dropped before calling spa_notify_waiters. 9725 * 9726 * Because spa_notify_waiters acquires spa_activities_lock before broadcasting, 9727 * and because it is held when the waiting thread checks the state of the 9728 * activity, it can never be the case that the completing thread both updates 9729 * the activity state and cv_broadcasts in between the waiting thread's check 9730 * and cv_wait. Thus, a waiting thread can never miss a wakeup. 9731 * 9732 * In order to prevent deadlock, when the waiting thread does its check, in some 9733 * cases it will temporarily drop spa_activities_lock in order to acquire the 9734 * activity-specific lock. The order in which spa_activities_lock and the 9735 * activity specific lock are acquired in the waiting thread is determined by 9736 * the order in which they are acquired in the completing thread; if the 9737 * completing thread calls spa_notify_waiters with the activity-specific lock 9738 * held, then the waiting thread must also acquire the activity-specific lock 9739 * first. 9740 */ 9741 9742 static int 9743 spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, 9744 boolean_t use_tag, uint64_t tag, boolean_t *in_progress) 9745 { 9746 int error = 0; 9747 9748 ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); 9749 9750 switch (activity) { 9751 case ZPOOL_WAIT_CKPT_DISCARD: 9752 *in_progress = 9753 (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT) && 9754 zap_contains(spa_meta_objset(spa), 9755 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT) == 9756 ENOENT); 9757 break; 9758 case ZPOOL_WAIT_FREE: 9759 *in_progress = ((spa_version(spa) >= SPA_VERSION_DEADLISTS && 9760 !bpobj_is_empty(&spa->spa_dsl_pool->dp_free_bpobj)) || 9761 spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY) || 9762 spa_livelist_delete_check(spa)); 9763 break; 9764 case ZPOOL_WAIT_INITIALIZE: 9765 case ZPOOL_WAIT_TRIM: 9766 error = spa_vdev_activity_in_progress(spa, use_tag, tag, 9767 activity, in_progress); 9768 break; 9769 case ZPOOL_WAIT_REPLACE: 9770 mutex_exit(&spa->spa_activities_lock); 9771 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 9772 mutex_enter(&spa->spa_activities_lock); 9773 9774 *in_progress = vdev_replace_in_progress(spa->spa_root_vdev); 9775 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 9776 break; 9777 case ZPOOL_WAIT_REMOVE: 9778 *in_progress = (spa->spa_removing_phys.sr_state == 9779 DSS_SCANNING); 9780 break; 9781 case ZPOOL_WAIT_RESILVER: 9782 if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev))) 9783 break; 9784 zfs_fallthrough; 9785 case ZPOOL_WAIT_SCRUB: 9786 { 9787 boolean_t scanning, paused, is_scrub; 9788 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; 9789 9790 is_scrub = (scn->scn_phys.scn_func == POOL_SCAN_SCRUB); 9791 scanning = (scn->scn_phys.scn_state == DSS_SCANNING); 9792 paused = dsl_scan_is_paused_scrub(scn); 9793 *in_progress = (scanning && !paused && 9794 is_scrub == (activity == ZPOOL_WAIT_SCRUB)); 9795 break; 9796 } 9797 default: 9798 panic("unrecognized value for activity %d", activity); 9799 } 9800 9801 return (error); 9802 } 9803 9804 static int 9805 spa_wait_common(const char *pool, zpool_wait_activity_t activity, 9806 boolean_t use_tag, uint64_t tag, boolean_t *waited) 9807 { 9808 /* 9809 * The tag is used to distinguish between instances of an activity. 9810 * 'initialize' and 'trim' are the only activities that we use this for. 9811 * The other activities can only have a single instance in progress in a 9812 * pool at one time, making the tag unnecessary. 9813 * 9814 * There can be multiple devices being replaced at once, but since they 9815 * all finish once resilvering finishes, we don't bother keeping track 9816 * of them individually, we just wait for them all to finish. 9817 */ 9818 if (use_tag && activity != ZPOOL_WAIT_INITIALIZE && 9819 activity != ZPOOL_WAIT_TRIM) 9820 return (EINVAL); 9821 9822 if (activity < 0 || activity >= ZPOOL_WAIT_NUM_ACTIVITIES) 9823 return (EINVAL); 9824 9825 spa_t *spa; 9826 int error = spa_open(pool, &spa, FTAG); 9827 if (error != 0) 9828 return (error); 9829 9830 /* 9831 * Increment the spa's waiter count so that we can call spa_close and 9832 * still ensure that the spa_t doesn't get freed before this thread is 9833 * finished with it when the pool is exported. We want to call spa_close 9834 * before we start waiting because otherwise the additional ref would 9835 * prevent the pool from being exported or destroyed throughout the 9836 * potentially long wait. 9837 */ 9838 mutex_enter(&spa->spa_activities_lock); 9839 spa->spa_waiters++; 9840 spa_close(spa, FTAG); 9841 9842 *waited = B_FALSE; 9843 for (;;) { 9844 boolean_t in_progress; 9845 error = spa_activity_in_progress(spa, activity, use_tag, tag, 9846 &in_progress); 9847 9848 if (error || !in_progress || spa->spa_waiters_cancel) 9849 break; 9850 9851 *waited = B_TRUE; 9852 9853 if (cv_wait_sig(&spa->spa_activities_cv, 9854 &spa->spa_activities_lock) == 0) { 9855 error = EINTR; 9856 break; 9857 } 9858 } 9859 9860 spa->spa_waiters--; 9861 cv_signal(&spa->spa_waiters_cv); 9862 mutex_exit(&spa->spa_activities_lock); 9863 9864 return (error); 9865 } 9866 9867 /* 9868 * Wait for a particular instance of the specified activity to complete, where 9869 * the instance is identified by 'tag' 9870 */ 9871 int 9872 spa_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag, 9873 boolean_t *waited) 9874 { 9875 return (spa_wait_common(pool, activity, B_TRUE, tag, waited)); 9876 } 9877 9878 /* 9879 * Wait for all instances of the specified activity complete 9880 */ 9881 int 9882 spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited) 9883 { 9884 9885 return (spa_wait_common(pool, activity, B_FALSE, 0, waited)); 9886 } 9887 9888 sysevent_t * 9889 spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) 9890 { 9891 sysevent_t *ev = NULL; 9892 #ifdef _KERNEL 9893 nvlist_t *resource; 9894 9895 resource = zfs_event_create(spa, vd, FM_SYSEVENT_CLASS, name, hist_nvl); 9896 if (resource) { 9897 ev = kmem_alloc(sizeof (sysevent_t), KM_SLEEP); 9898 ev->resource = resource; 9899 } 9900 #else 9901 (void) spa, (void) vd, (void) hist_nvl, (void) name; 9902 #endif 9903 return (ev); 9904 } 9905 9906 void 9907 spa_event_post(sysevent_t *ev) 9908 { 9909 #ifdef _KERNEL 9910 if (ev) { 9911 zfs_zevent_post(ev->resource, NULL, zfs_zevent_post_cb); 9912 kmem_free(ev, sizeof (*ev)); 9913 } 9914 #else 9915 (void) ev; 9916 #endif 9917 } 9918 9919 /* 9920 * Post a zevent corresponding to the given sysevent. The 'name' must be one 9921 * of the event definitions in sys/sysevent/eventdefs.h. The payload will be 9922 * filled in from the spa and (optionally) the vdev. This doesn't do anything 9923 * in the userland libzpool, as we don't want consumers to misinterpret ztest 9924 * or zdb as real changes. 9925 */ 9926 void 9927 spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) 9928 { 9929 spa_event_post(spa_event_create(spa, vd, hist_nvl, name)); 9930 } 9931 9932 /* state manipulation functions */ 9933 EXPORT_SYMBOL(spa_open); 9934 EXPORT_SYMBOL(spa_open_rewind); 9935 EXPORT_SYMBOL(spa_get_stats); 9936 EXPORT_SYMBOL(spa_create); 9937 EXPORT_SYMBOL(spa_import); 9938 EXPORT_SYMBOL(spa_tryimport); 9939 EXPORT_SYMBOL(spa_destroy); 9940 EXPORT_SYMBOL(spa_export); 9941 EXPORT_SYMBOL(spa_reset); 9942 EXPORT_SYMBOL(spa_async_request); 9943 EXPORT_SYMBOL(spa_async_suspend); 9944 EXPORT_SYMBOL(spa_async_resume); 9945 EXPORT_SYMBOL(spa_inject_addref); 9946 EXPORT_SYMBOL(spa_inject_delref); 9947 EXPORT_SYMBOL(spa_scan_stat_init); 9948 EXPORT_SYMBOL(spa_scan_get_stats); 9949 9950 /* device manipulation */ 9951 EXPORT_SYMBOL(spa_vdev_add); 9952 EXPORT_SYMBOL(spa_vdev_attach); 9953 EXPORT_SYMBOL(spa_vdev_detach); 9954 EXPORT_SYMBOL(spa_vdev_setpath); 9955 EXPORT_SYMBOL(spa_vdev_setfru); 9956 EXPORT_SYMBOL(spa_vdev_split_mirror); 9957 9958 /* spare statech is global across all pools) */ 9959 EXPORT_SYMBOL(spa_spare_add); 9960 EXPORT_SYMBOL(spa_spare_remove); 9961 EXPORT_SYMBOL(spa_spare_exists); 9962 EXPORT_SYMBOL(spa_spare_activate); 9963 9964 /* L2ARC statech is global across all pools) */ 9965 EXPORT_SYMBOL(spa_l2cache_add); 9966 EXPORT_SYMBOL(spa_l2cache_remove); 9967 EXPORT_SYMBOL(spa_l2cache_exists); 9968 EXPORT_SYMBOL(spa_l2cache_activate); 9969 EXPORT_SYMBOL(spa_l2cache_drop); 9970 9971 /* scanning */ 9972 EXPORT_SYMBOL(spa_scan); 9973 EXPORT_SYMBOL(spa_scan_stop); 9974 9975 /* spa syncing */ 9976 EXPORT_SYMBOL(spa_sync); /* only for DMU use */ 9977 EXPORT_SYMBOL(spa_sync_allpools); 9978 9979 /* properties */ 9980 EXPORT_SYMBOL(spa_prop_set); 9981 EXPORT_SYMBOL(spa_prop_get); 9982 EXPORT_SYMBOL(spa_prop_clear_bootfs); 9983 9984 /* asynchronous event notification */ 9985 EXPORT_SYMBOL(spa_event_notify); 9986 9987 /* BEGIN CSTYLED */ 9988 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, INT, ZMOD_RW, 9989 "log2 fraction of arc that can be used by inflight I/Os when " 9990 "verifying pool during import"); 9991 /* END CSTYLED */ 9992 9993 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_metadata, INT, ZMOD_RW, 9994 "Set to traverse metadata on pool import"); 9995 9996 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW, 9997 "Set to traverse data on pool import"); 9998 9999 ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW, 10000 "Print vdev tree to zfs_dbgmsg during pool import"); 10001 10002 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD, 10003 "Percentage of CPUs to run an IO worker thread"); 10004 10005 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RD, 10006 "Number of threads per IO worker taskqueue"); 10007 10008 /* BEGIN CSTYLED */ 10009 ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, ULONG, ZMOD_RW, 10010 "Allow importing pool with up to this number of missing top-level " 10011 "vdevs (in read-only mode)"); 10012 /* END CSTYLED */ 10013 10014 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT, 10015 ZMOD_RW, "Set the livelist condense zthr to pause"); 10016 10017 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT, 10018 ZMOD_RW, "Set the livelist condense synctask to pause"); 10019 10020 /* BEGIN CSTYLED */ 10021 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_cancel, 10022 INT, ZMOD_RW, 10023 "Whether livelist condensing was canceled in the synctask"); 10024 10025 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel, 10026 INT, ZMOD_RW, 10027 "Whether livelist condensing was canceled in the zthr function"); 10028 10029 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, 10030 ZMOD_RW, 10031 "Whether extra ALLOC blkptrs were added to a livelist entry while it " 10032 "was being condensed"); 10033 /* END CSTYLED */ 10034