1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2011, 2020 by Delphix. All rights reserved. 25 * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved. 26 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 27 * Copyright 2013 Saso Kiselkov. All rights reserved. 28 * Copyright (c) 2014 Integros [integros.com] 29 * Copyright 2016 Toomas Soome <tsoome@me.com> 30 * Copyright (c) 2016 Actifio, Inc. All rights reserved. 31 * Copyright 2018 Joyent, Inc. 32 * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. 33 * Copyright 2017 Joyent, Inc. 34 * Copyright (c) 2017, Intel Corporation. 35 * Copyright (c) 2021, Colm Buckley <colm@tuatha.org> 36 */ 37 38 /* 39 * SPA: Storage Pool Allocator 40 * 41 * This file contains all the routines used when modifying on-disk SPA state. 42 * This includes opening, importing, destroying, exporting a pool, and syncing a 43 * pool. 44 */ 45 46 #include <sys/zfs_context.h> 47 #include <sys/fm/fs/zfs.h> 48 #include <sys/spa_impl.h> 49 #include <sys/zio.h> 50 #include <sys/zio_checksum.h> 51 #include <sys/dmu.h> 52 #include <sys/dmu_tx.h> 53 #include <sys/zap.h> 54 #include <sys/zil.h> 55 #include <sys/ddt.h> 56 #include <sys/vdev_impl.h> 57 #include <sys/vdev_removal.h> 58 #include <sys/vdev_indirect_mapping.h> 59 #include <sys/vdev_indirect_births.h> 60 #include <sys/vdev_initialize.h> 61 #include <sys/vdev_rebuild.h> 62 #include <sys/vdev_trim.h> 63 #include <sys/vdev_disk.h> 64 #include <sys/vdev_draid.h> 65 #include <sys/metaslab.h> 66 #include <sys/metaslab_impl.h> 67 #include <sys/mmp.h> 68 #include <sys/uberblock_impl.h> 69 #include <sys/txg.h> 70 #include <sys/avl.h> 71 #include <sys/bpobj.h> 72 #include <sys/dmu_traverse.h> 73 #include <sys/dmu_objset.h> 74 #include <sys/unique.h> 75 #include <sys/dsl_pool.h> 76 #include <sys/dsl_dataset.h> 77 #include <sys/dsl_dir.h> 78 #include <sys/dsl_prop.h> 79 #include <sys/dsl_synctask.h> 80 #include <sys/fs/zfs.h> 81 #include <sys/arc.h> 82 #include <sys/callb.h> 83 #include <sys/systeminfo.h> 84 #include <sys/spa_boot.h> 85 #include <sys/zfs_ioctl.h> 86 #include <sys/dsl_scan.h> 87 #include <sys/zfeature.h> 88 #include <sys/dsl_destroy.h> 89 #include <sys/zvol.h> 90 91 #ifdef _KERNEL 92 #include <sys/fm/protocol.h> 93 #include <sys/fm/util.h> 94 #include <sys/callb.h> 95 #include <sys/zone.h> 96 #include <sys/vmsystm.h> 97 #endif /* _KERNEL */ 98 99 #include "zfs_prop.h" 100 #include "zfs_comutil.h" 101 102 /* 103 * The interval, in seconds, at which failed configuration cache file writes 104 * should be retried. 
105 */ 106 int zfs_ccw_retry_interval = 300; 107 108 typedef enum zti_modes { 109 ZTI_MODE_FIXED, /* value is # of threads (min 1) */ 110 ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */ 111 ZTI_MODE_SCALE, /* Taskqs scale with CPUs. */ 112 ZTI_MODE_NULL, /* don't create a taskq */ 113 ZTI_NMODES 114 } zti_modes_t; 115 116 #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } 117 #define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 } 118 #define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 } 119 #define ZTI_SCALE { ZTI_MODE_SCALE, 0, 1 } 120 #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } 121 122 #define ZTI_N(n) ZTI_P(n, 1) 123 #define ZTI_ONE ZTI_N(1) 124 125 typedef struct zio_taskq_info { 126 zti_modes_t zti_mode; 127 uint_t zti_value; 128 uint_t zti_count; 129 } zio_taskq_info_t; 130 131 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { 132 "iss", "iss_h", "int", "int_h" 133 }; 134 135 /* 136 * This table defines the taskq settings for each ZFS I/O type. When 137 * initializing a pool, we use this table to create an appropriately sized 138 * taskq. Some operations are low volume and therefore have a small, static 139 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE 140 * macros. Other operations process a large amount of data; the ZTI_BATCH 141 * macro causes us to create a taskq oriented for throughput. Some operations 142 * are so high frequency and short-lived that the taskq itself can become a 143 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an 144 * additional degree of parallelism specified by the number of threads per- 145 * taskq and the number of taskqs; when dispatching an event in this case, the 146 * particular taskq is chosen at random. ZTI_SCALE is similar to ZTI_BATCH, 147 * but with number of taskqs also scaling with number of CPUs. 148 * 149 * The different taskq priorities are to handle the different contexts (issue 150 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that 151 * need to be handled with minimum delay. 152 */ 153 static const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { 154 /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ 155 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ 156 { ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */ 157 { ZTI_BATCH, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */ 158 { ZTI_SCALE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ 159 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ 160 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ 161 { ZTI_N(4), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* TRIM */ 162 }; 163 164 static void spa_sync_version(void *arg, dmu_tx_t *tx); 165 static void spa_sync_props(void *arg, dmu_tx_t *tx); 166 static boolean_t spa_has_active_shared_spare(spa_t *spa); 167 static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport); 168 static void spa_vdev_resilver_done(spa_t *spa); 169 170 static uint_t zio_taskq_batch_pct = 80; /* 1 thread per cpu in pset */ 171 static uint_t zio_taskq_batch_tpq; /* threads per taskq */ 172 static const boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ 173 static const uint_t zio_taskq_basedc = 80; /* base duty cycle */ 174 175 static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */ 176 177 /* 178 * Report any spa_load_verify errors found, but do not fail spa_load. 179 * This is used by zdb to analyze non-idle pools. 
180 */ 181 boolean_t spa_load_verify_dryrun = B_FALSE; 182 183 /* 184 * Allow read spacemaps in case of readonly import (spa_mode == SPA_MODE_READ). 185 * This is used by zdb for spacemaps verification. 186 */ 187 boolean_t spa_mode_readable_spacemaps = B_FALSE; 188 189 /* 190 * This (illegal) pool name is used when temporarily importing a spa_t in order 191 * to get the vdev stats associated with the imported devices. 192 */ 193 #define TRYIMPORT_NAME "$import" 194 195 /* 196 * For debugging purposes: print out vdev tree during pool import. 197 */ 198 static int spa_load_print_vdev_tree = B_FALSE; 199 200 /* 201 * A non-zero value for zfs_max_missing_tvds means that we allow importing 202 * pools with missing top-level vdevs. This is strictly intended for advanced 203 * pool recovery cases since missing data is almost inevitable. Pools with 204 * missing devices can only be imported read-only for safety reasons, and their 205 * fail-mode will be automatically set to "continue". 206 * 207 * With 1 missing vdev we should be able to import the pool and mount all 208 * datasets. User data that was not modified after the missing device has been 209 * added should be recoverable. This means that snapshots created prior to the 210 * addition of that device should be completely intact. 211 * 212 * With 2 missing vdevs, some datasets may fail to mount since there are 213 * dataset statistics that are stored as regular metadata. Some data might be 214 * recoverable if those vdevs were added recently. 215 * 216 * With 3 or more missing vdevs, the pool is severely damaged and MOS entries 217 * may be missing entirely. Chances of data recovery are very low. Note that 218 * there are also risks of performing an inadvertent rewind as we might be 219 * missing all the vdevs with the latest uberblocks. 220 */ 221 unsigned long zfs_max_missing_tvds = 0; 222 223 /* 224 * The parameters below are similar to zfs_max_missing_tvds but are only 225 * intended for a preliminary open of the pool with an untrusted config which 226 * might be incomplete or out-dated. 227 * 228 * We are more tolerant for pools opened from a cachefile since we could have 229 * an out-dated cachefile where a device removal was not registered. 230 * We could have set the limit arbitrarily high but in the case where devices 231 * are really missing we would want to return the proper error codes; we chose 232 * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available 233 * and we get a chance to retrieve the trusted config. 234 */ 235 uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1; 236 237 /* 238 * In the case where config was assembled by scanning device paths (/dev/dsks 239 * by default) we are less tolerant since all the existing devices should have 240 * been detected and we want spa_load to return the right error codes. 241 */ 242 uint64_t zfs_max_missing_tvds_scan = 0; 243 244 /* 245 * Debugging aid that pauses spa_sync() towards the end. 246 */ 247 static const boolean_t zfs_pause_spa_sync = B_FALSE; 248 249 /* 250 * Variables to indicate the livelist condense zthr func should wait at certain 251 * points for the livelist to be removed - used to test condense/destroy races 252 */ 253 static int zfs_livelist_condense_zthr_pause = 0; 254 static int zfs_livelist_condense_sync_pause = 0; 255 256 /* 257 * Variables to track whether or not condense cancellation has been 258 * triggered in testing. 
259 */ 260 static int zfs_livelist_condense_sync_cancel = 0; 261 static int zfs_livelist_condense_zthr_cancel = 0; 262 263 /* 264 * Variable to track whether or not extra ALLOC blkptrs were added to a 265 * livelist entry while it was being condensed (caused by the way we track 266 * remapped blkptrs in dbuf_remap_impl) 267 */ 268 static int zfs_livelist_condense_new_alloc = 0; 269 270 /* 271 * ========================================================================== 272 * SPA properties routines 273 * ========================================================================== 274 */ 275 276 /* 277 * Add a (source=src, propname=propval) list to an nvlist. 278 */ 279 static void 280 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, 281 uint64_t intval, zprop_source_t src) 282 { 283 const char *propname = zpool_prop_to_name(prop); 284 nvlist_t *propval; 285 286 propval = fnvlist_alloc(); 287 fnvlist_add_uint64(propval, ZPROP_SOURCE, src); 288 289 if (strval != NULL) 290 fnvlist_add_string(propval, ZPROP_VALUE, strval); 291 else 292 fnvlist_add_uint64(propval, ZPROP_VALUE, intval); 293 294 fnvlist_add_nvlist(nvl, propname, propval); 295 nvlist_free(propval); 296 } 297 298 /* 299 * Get property values from the spa configuration. 300 */ 301 static void 302 spa_prop_get_config(spa_t *spa, nvlist_t **nvp) 303 { 304 vdev_t *rvd = spa->spa_root_vdev; 305 dsl_pool_t *pool = spa->spa_dsl_pool; 306 uint64_t size, alloc, cap, version; 307 const zprop_source_t src = ZPROP_SRC_NONE; 308 spa_config_dirent_t *dp; 309 metaslab_class_t *mc = spa_normal_class(spa); 310 311 ASSERT(MUTEX_HELD(&spa->spa_props_lock)); 312 313 if (rvd != NULL) { 314 alloc = metaslab_class_get_alloc(mc); 315 alloc += metaslab_class_get_alloc(spa_special_class(spa)); 316 alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); 317 alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa)); 318 319 size = metaslab_class_get_space(mc); 320 size += metaslab_class_get_space(spa_special_class(spa)); 321 size += metaslab_class_get_space(spa_dedup_class(spa)); 322 size += metaslab_class_get_space(spa_embedded_log_class(spa)); 323 324 spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); 325 spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); 326 spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); 327 spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, 328 size - alloc, src); 329 spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL, 330 spa->spa_checkpoint_info.sci_dspace, src); 331 332 spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL, 333 metaslab_class_fragmentation(mc), src); 334 spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, 335 metaslab_class_expandable_space(mc), src); 336 spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, 337 (spa_mode(spa) == SPA_MODE_READ), src); 338 339 cap = (size == 0) ? 
0 : (alloc * 100 / size); 340 spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); 341 342 spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, 343 ddt_get_pool_dedup_ratio(spa), src); 344 345 spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, 346 rvd->vdev_state, src); 347 348 version = spa_version(spa); 349 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) { 350 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, 351 version, ZPROP_SRC_DEFAULT); 352 } else { 353 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, 354 version, ZPROP_SRC_LOCAL); 355 } 356 spa_prop_add_list(*nvp, ZPOOL_PROP_LOAD_GUID, 357 NULL, spa_load_guid(spa), src); 358 } 359 360 if (pool != NULL) { 361 /* 362 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS; 363 * when opening pools that predate this version, freedir will be NULL. 364 */ 365 if (pool->dp_free_dir != NULL) { 366 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, 367 dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes, 368 src); 369 } else { 370 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, 371 NULL, 0, src); 372 } 373 374 if (pool->dp_leak_dir != NULL) { 375 spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, 376 dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes, 377 src); 378 } else { 379 spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, 380 NULL, 0, src); 381 } 382 } 383 384 spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); 385 386 if (spa->spa_comment != NULL) { 387 spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, 388 0, ZPROP_SRC_LOCAL); 389 } 390 391 if (spa->spa_compatibility != NULL) { 392 spa_prop_add_list(*nvp, ZPOOL_PROP_COMPATIBILITY, 393 spa->spa_compatibility, 0, ZPROP_SRC_LOCAL); 394 } 395 396 if (spa->spa_root != NULL) 397 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 398 0, ZPROP_SRC_LOCAL); 399 400 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { 401 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, 402 MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE); 403 } else { 404 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, 405 SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); 406 } 407 408 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { 409 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, 410 DNODE_MAX_SIZE, ZPROP_SRC_NONE); 411 } else { 412 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, 413 DNODE_MIN_SIZE, ZPROP_SRC_NONE); 414 } 415 416 if ((dp = list_head(&spa->spa_config_list)) != NULL) { 417 if (dp->scd_path == NULL) { 418 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 419 "none", 0, ZPROP_SRC_LOCAL); 420 } else if (strcmp(dp->scd_path, spa_config_path) != 0) { 421 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 422 dp->scd_path, 0, ZPROP_SRC_LOCAL); 423 } 424 } 425 } 426 427 /* 428 * Get zpool property values. 429 */ 430 int 431 spa_prop_get(spa_t *spa, nvlist_t **nvp) 432 { 433 objset_t *mos = spa->spa_meta_objset; 434 zap_cursor_t zc; 435 zap_attribute_t za; 436 dsl_pool_t *dp; 437 int err; 438 439 err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP); 440 if (err) 441 return (err); 442 443 dp = spa_get_dsl(spa); 444 dsl_pool_config_enter(dp, FTAG); 445 mutex_enter(&spa->spa_props_lock); 446 447 /* 448 * Get properties from the spa config. 449 */ 450 spa_prop_get_config(spa, nvp); 451 452 /* If there is no pool property object, there are no more props to get. */ 453 if (mos == NULL || spa->spa_pool_props_object == 0) 454 goto out; 455 456 /* 457 * Get properties from the MOS pool property object.
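 *
 * Illustrative sketch of the result: each property, whether taken from the
 * config above or from the MOS object here, is added by spa_prop_add_list()
 * as a nested nvlist keyed by the property name, holding a ZPROP_SOURCE and
 * a ZPROP_VALUE pair (the values below are made up):
 *
 *	"capacity" -> { ZPROP_SOURCE = ZPROP_SRC_NONE,  ZPROP_VALUE = 42 }
 *	"bootfs"   -> { ZPROP_SOURCE = ZPROP_SRC_LOCAL, ZPROP_VALUE = "rpool/ROOT" }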
458 */ 459 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 460 (err = zap_cursor_retrieve(&zc, &za)) == 0; 461 zap_cursor_advance(&zc)) { 462 uint64_t intval = 0; 463 char *strval = NULL; 464 zprop_source_t src = ZPROP_SRC_DEFAULT; 465 zpool_prop_t prop; 466 467 if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL) 468 continue; 469 470 switch (za.za_integer_length) { 471 case 8: 472 /* integer property */ 473 if (za.za_first_integer != 474 zpool_prop_default_numeric(prop)) 475 src = ZPROP_SRC_LOCAL; 476 477 if (prop == ZPOOL_PROP_BOOTFS) { 478 dsl_dataset_t *ds = NULL; 479 480 err = dsl_dataset_hold_obj(dp, 481 za.za_first_integer, FTAG, &ds); 482 if (err != 0) 483 break; 484 485 strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, 486 KM_SLEEP); 487 dsl_dataset_name(ds, strval); 488 dsl_dataset_rele(ds, FTAG); 489 } else { 490 strval = NULL; 491 intval = za.za_first_integer; 492 } 493 494 spa_prop_add_list(*nvp, prop, strval, intval, src); 495 496 if (strval != NULL) 497 kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN); 498 499 break; 500 501 case 1: 502 /* string property */ 503 strval = kmem_alloc(za.za_num_integers, KM_SLEEP); 504 err = zap_lookup(mos, spa->spa_pool_props_object, 505 za.za_name, 1, za.za_num_integers, strval); 506 if (err) { 507 kmem_free(strval, za.za_num_integers); 508 break; 509 } 510 spa_prop_add_list(*nvp, prop, strval, 0, src); 511 kmem_free(strval, za.za_num_integers); 512 break; 513 514 default: 515 break; 516 } 517 } 518 zap_cursor_fini(&zc); 519 out: 520 mutex_exit(&spa->spa_props_lock); 521 dsl_pool_config_exit(dp, FTAG); 522 if (err && err != ENOENT) { 523 nvlist_free(*nvp); 524 *nvp = NULL; 525 return (err); 526 } 527 528 return (0); 529 } 530 531 /* 532 * Validate the given pool properties nvlist and modify the list 533 * for the property values to be set. 534 */ 535 static int 536 spa_prop_validate(spa_t *spa, nvlist_t *props) 537 { 538 nvpair_t *elem; 539 int error = 0, reset_bootfs = 0; 540 uint64_t objnum = 0; 541 boolean_t has_feature = B_FALSE; 542 543 elem = NULL; 544 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 545 uint64_t intval; 546 char *strval, *slash, *check, *fname; 547 const char *propname = nvpair_name(elem); 548 zpool_prop_t prop = zpool_name_to_prop(propname); 549 550 switch (prop) { 551 case ZPOOL_PROP_INVAL: 552 if (!zpool_prop_feature(propname)) { 553 error = SET_ERROR(EINVAL); 554 break; 555 } 556 557 /* 558 * Sanitize the input. 
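 *
 * For example (hypothetical request), enabling a feature arrives here as
 * the pair ("feature@async_destroy", 0): the value must be a uint64 equal
 * to 0 (the "enable" request) and the name after the '@' must be a feature
 * known to zfeature_lookup_name().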
559 */ 560 if (nvpair_type(elem) != DATA_TYPE_UINT64) { 561 error = SET_ERROR(EINVAL); 562 break; 563 } 564 565 if (nvpair_value_uint64(elem, &intval) != 0) { 566 error = SET_ERROR(EINVAL); 567 break; 568 } 569 570 if (intval != 0) { 571 error = SET_ERROR(EINVAL); 572 break; 573 } 574 575 fname = strchr(propname, '@') + 1; 576 if (zfeature_lookup_name(fname, NULL) != 0) { 577 error = SET_ERROR(EINVAL); 578 break; 579 } 580 581 has_feature = B_TRUE; 582 break; 583 584 case ZPOOL_PROP_VERSION: 585 error = nvpair_value_uint64(elem, &intval); 586 if (!error && 587 (intval < spa_version(spa) || 588 intval > SPA_VERSION_BEFORE_FEATURES || 589 has_feature)) 590 error = SET_ERROR(EINVAL); 591 break; 592 593 case ZPOOL_PROP_DELEGATION: 594 case ZPOOL_PROP_AUTOREPLACE: 595 case ZPOOL_PROP_LISTSNAPS: 596 case ZPOOL_PROP_AUTOEXPAND: 597 case ZPOOL_PROP_AUTOTRIM: 598 error = nvpair_value_uint64(elem, &intval); 599 if (!error && intval > 1) 600 error = SET_ERROR(EINVAL); 601 break; 602 603 case ZPOOL_PROP_MULTIHOST: 604 error = nvpair_value_uint64(elem, &intval); 605 if (!error && intval > 1) 606 error = SET_ERROR(EINVAL); 607 608 if (!error) { 609 uint32_t hostid = zone_get_hostid(NULL); 610 if (hostid) 611 spa->spa_hostid = hostid; 612 else 613 error = SET_ERROR(ENOTSUP); 614 } 615 616 break; 617 618 case ZPOOL_PROP_BOOTFS: 619 /* 620 * If the pool version is less than SPA_VERSION_BOOTFS, 621 * or the pool is still being created (version == 0), 622 * the bootfs property cannot be set. 623 */ 624 if (spa_version(spa) < SPA_VERSION_BOOTFS) { 625 error = SET_ERROR(ENOTSUP); 626 break; 627 } 628 629 /* 630 * Make sure the vdev config is bootable 631 */ 632 if (!vdev_is_bootable(spa->spa_root_vdev)) { 633 error = SET_ERROR(ENOTSUP); 634 break; 635 } 636 637 reset_bootfs = 1; 638 639 error = nvpair_value_string(elem, &strval); 640 641 if (!error) { 642 objset_t *os; 643 644 if (strval == NULL || strval[0] == '\0') { 645 objnum = zpool_prop_default_numeric( 646 ZPOOL_PROP_BOOTFS); 647 break; 648 } 649 650 error = dmu_objset_hold(strval, FTAG, &os); 651 if (error != 0) 652 break; 653 654 /* Must be ZPL. */ 655 if (dmu_objset_type(os) != DMU_OST_ZFS) { 656 error = SET_ERROR(ENOTSUP); 657 } else { 658 objnum = dmu_objset_id(os); 659 } 660 dmu_objset_rele(os, FTAG); 661 } 662 break; 663 664 case ZPOOL_PROP_FAILUREMODE: 665 error = nvpair_value_uint64(elem, &intval); 666 if (!error && intval > ZIO_FAILURE_MODE_PANIC) 667 error = SET_ERROR(EINVAL); 668 669 /* 670 * This is a special case which only occurs when 671 * the pool has completely failed. This allows 672 * the user to change the in-core failmode property 673 * without syncing it out to disk (I/Os might 674 * currently be blocked). We do this by returning 675 * EIO to the caller (spa_prop_set) to trick it 676 * into thinking we encountered a property validation 677 * error. 
678 */ 679 if (!error && spa_suspended(spa)) { 680 spa->spa_failmode = intval; 681 error = SET_ERROR(EIO); 682 } 683 break; 684 685 case ZPOOL_PROP_CACHEFILE: 686 if ((error = nvpair_value_string(elem, &strval)) != 0) 687 break; 688 689 if (strval[0] == '\0') 690 break; 691 692 if (strcmp(strval, "none") == 0) 693 break; 694 695 if (strval[0] != '/') { 696 error = SET_ERROR(EINVAL); 697 break; 698 } 699 700 slash = strrchr(strval, '/'); 701 ASSERT(slash != NULL); 702 703 if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 704 strcmp(slash, "/..") == 0) 705 error = SET_ERROR(EINVAL); 706 break; 707 708 case ZPOOL_PROP_COMMENT: 709 if ((error = nvpair_value_string(elem, &strval)) != 0) 710 break; 711 for (check = strval; *check != '\0'; check++) { 712 if (!isprint(*check)) { 713 error = SET_ERROR(EINVAL); 714 break; 715 } 716 } 717 if (strlen(strval) > ZPROP_MAX_COMMENT) 718 error = SET_ERROR(E2BIG); 719 break; 720 721 default: 722 break; 723 } 724 725 if (error) 726 break; 727 } 728 729 (void) nvlist_remove_all(props, 730 zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO)); 731 732 if (!error && reset_bootfs) { 733 error = nvlist_remove(props, 734 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 735 736 if (!error) { 737 error = nvlist_add_uint64(props, 738 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 739 } 740 } 741 742 return (error); 743 } 744 745 void 746 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) 747 { 748 char *cachefile; 749 spa_config_dirent_t *dp; 750 751 if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), 752 &cachefile) != 0) 753 return; 754 755 dp = kmem_alloc(sizeof (spa_config_dirent_t), 756 KM_SLEEP); 757 758 if (cachefile[0] == '\0') 759 dp->scd_path = spa_strdup(spa_config_path); 760 else if (strcmp(cachefile, "none") == 0) 761 dp->scd_path = NULL; 762 else 763 dp->scd_path = spa_strdup(cachefile); 764 765 list_insert_head(&spa->spa_config_list, dp); 766 if (need_sync) 767 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 768 } 769 770 int 771 spa_prop_set(spa_t *spa, nvlist_t *nvp) 772 { 773 int error; 774 nvpair_t *elem = NULL; 775 boolean_t need_sync = B_FALSE; 776 777 if ((error = spa_prop_validate(spa, nvp)) != 0) 778 return (error); 779 780 while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { 781 zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); 782 783 if (prop == ZPOOL_PROP_CACHEFILE || 784 prop == ZPOOL_PROP_ALTROOT || 785 prop == ZPOOL_PROP_READONLY) 786 continue; 787 788 if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) { 789 uint64_t ver = 0; 790 791 if (prop == ZPOOL_PROP_VERSION) { 792 VERIFY(nvpair_value_uint64(elem, &ver) == 0); 793 } else { 794 ASSERT(zpool_prop_feature(nvpair_name(elem))); 795 ver = SPA_VERSION_FEATURES; 796 need_sync = B_TRUE; 797 } 798 799 /* Save time if the version is already set. */ 800 if (ver == spa_version(spa)) 801 continue; 802 803 /* 804 * In addition to the pool directory object, we might 805 * create the pool properties object, the features for 806 * read object, the features for write object, or the 807 * feature descriptions object. 
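 *
 * (That object count is, roughly, why the dsl_sync_task() call below
 * passes 6 as its blocks_modified estimate for the
 * ZFS_SPACE_CHECK_RESERVED space check.)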
808 */ 809 error = dsl_sync_task(spa->spa_name, NULL, 810 spa_sync_version, &ver, 811 6, ZFS_SPACE_CHECK_RESERVED); 812 if (error) 813 return (error); 814 continue; 815 } 816 817 need_sync = B_TRUE; 818 break; 819 } 820 821 if (need_sync) { 822 return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, 823 nvp, 6, ZFS_SPACE_CHECK_RESERVED)); 824 } 825 826 return (0); 827 } 828 829 /* 830 * If the bootfs property value is dsobj, clear it. 831 */ 832 void 833 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 834 { 835 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 836 VERIFY(zap_remove(spa->spa_meta_objset, 837 spa->spa_pool_props_object, 838 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 839 spa->spa_bootfs = 0; 840 } 841 } 842 843 static int 844 spa_change_guid_check(void *arg, dmu_tx_t *tx) 845 { 846 uint64_t *newguid __maybe_unused = arg; 847 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 848 vdev_t *rvd = spa->spa_root_vdev; 849 uint64_t vdev_state; 850 851 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 852 int error = (spa_has_checkpoint(spa)) ? 853 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 854 return (SET_ERROR(error)); 855 } 856 857 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 858 vdev_state = rvd->vdev_state; 859 spa_config_exit(spa, SCL_STATE, FTAG); 860 861 if (vdev_state != VDEV_STATE_HEALTHY) 862 return (SET_ERROR(ENXIO)); 863 864 ASSERT3U(spa_guid(spa), !=, *newguid); 865 866 return (0); 867 } 868 869 static void 870 spa_change_guid_sync(void *arg, dmu_tx_t *tx) 871 { 872 uint64_t *newguid = arg; 873 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 874 uint64_t oldguid; 875 vdev_t *rvd = spa->spa_root_vdev; 876 877 oldguid = spa_guid(spa); 878 879 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 880 rvd->vdev_guid = *newguid; 881 rvd->vdev_guid_sum += (*newguid - oldguid); 882 vdev_config_dirty(rvd); 883 spa_config_exit(spa, SCL_STATE, FTAG); 884 885 spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", 886 (u_longlong_t)oldguid, (u_longlong_t)*newguid); 887 } 888 889 /* 890 * Change the GUID for the pool. This is done so that we can later 891 * re-import a pool built from a clone of our own vdevs. We will modify 892 * the root vdev's guid, our own pool guid, and then mark all of our 893 * vdevs dirty. Note that we must make sure that all our vdevs are 894 * online when we do this, or else any vdevs that weren't present 895 * would be orphaned from our pool. We are also going to issue a 896 * sysevent to update any watchers. 
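 *
 * From userland this is normally reached via "zpool reguid <pool>",
 * which ends up here through the ZFS_IOC_POOL_REGUID ioctl.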
897 */ 898 int 899 spa_change_guid(spa_t *spa) 900 { 901 int error; 902 uint64_t guid; 903 904 mutex_enter(&spa->spa_vdev_top_lock); 905 mutex_enter(&spa_namespace_lock); 906 guid = spa_generate_guid(NULL); 907 908 error = dsl_sync_task(spa->spa_name, spa_change_guid_check, 909 spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); 910 911 if (error == 0) { 912 spa_write_cachefile(spa, B_FALSE, B_TRUE); 913 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); 914 } 915 916 mutex_exit(&spa_namespace_lock); 917 mutex_exit(&spa->spa_vdev_top_lock); 918 919 return (error); 920 } 921 922 /* 923 * ========================================================================== 924 * SPA state manipulation (open/create/destroy/import/export) 925 * ========================================================================== 926 */ 927 928 static int 929 spa_error_entry_compare(const void *a, const void *b) 930 { 931 const spa_error_entry_t *sa = (const spa_error_entry_t *)a; 932 const spa_error_entry_t *sb = (const spa_error_entry_t *)b; 933 int ret; 934 935 ret = memcmp(&sa->se_bookmark, &sb->se_bookmark, 936 sizeof (zbookmark_phys_t)); 937 938 return (TREE_ISIGN(ret)); 939 } 940 941 /* 942 * Utility function which retrieves copies of the current logs and 943 * re-initializes them in the process. 944 */ 945 void 946 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 947 { 948 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 949 950 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 951 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 952 953 avl_create(&spa->spa_errlist_scrub, 954 spa_error_entry_compare, sizeof (spa_error_entry_t), 955 offsetof(spa_error_entry_t, se_avl)); 956 avl_create(&spa->spa_errlist_last, 957 spa_error_entry_compare, sizeof (spa_error_entry_t), 958 offsetof(spa_error_entry_t, se_avl)); 959 } 960 961 static void 962 spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) 963 { 964 const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; 965 enum zti_modes mode = ztip->zti_mode; 966 uint_t value = ztip->zti_value; 967 uint_t count = ztip->zti_count; 968 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 969 uint_t cpus, flags = TASKQ_DYNAMIC; 970 boolean_t batch = B_FALSE; 971 972 switch (mode) { 973 case ZTI_MODE_FIXED: 974 ASSERT3U(value, >, 0); 975 break; 976 977 case ZTI_MODE_BATCH: 978 batch = B_TRUE; 979 flags |= TASKQ_THREADS_CPU_PCT; 980 value = MIN(zio_taskq_batch_pct, 100); 981 break; 982 983 case ZTI_MODE_SCALE: 984 flags |= TASKQ_THREADS_CPU_PCT; 985 /* 986 * We want more taskqs to reduce lock contention, but we want 987 * fewer for better request ordering and CPU utilization. 988 */ 989 cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100); 990 if (zio_taskq_batch_tpq > 0) { 991 count = MAX(1, (cpus + zio_taskq_batch_tpq / 2) / 992 zio_taskq_batch_tpq); 993 } else { 994 /* 995 * Prefer 6 threads per taskq, but no more taskqs 996 * than threads in them on large systems. For 80%: 997 * 998 * taskq taskq total 999 * cpus taskqs percent threads threads 1000 * ------- ------- ------- ------- ------- 1001 * 1 1 80% 1 1 1002 * 2 1 80% 1 1 1003 * 4 1 80% 3 3 1004 * 8 2 40% 3 6 1005 * 16 3 27% 4 12 1006 * 32 5 16% 5 25 1007 * 64 7 11% 7 49 1008 * 128 10 8% 10 100 1009 * 256 14 6% 15 210 1010 */ 1011 count = 1 + cpus / 6; 1012 while (count * count > cpus) 1013 count--; 1014 } 1015 /* Limit each taskq to at most 100% of the CPUs so we do not trigger an assertion.
*/ 1016 count = MAX(count, (zio_taskq_batch_pct + 99) / 100); 1017 value = (zio_taskq_batch_pct + count / 2) / count; 1018 break; 1019 1020 case ZTI_MODE_NULL: 1021 tqs->stqs_count = 0; 1022 tqs->stqs_taskq = NULL; 1023 return; 1024 1025 default: 1026 panic("unrecognized mode for %s_%s taskq (%u:%u) in " 1027 "spa_activate()", 1028 zio_type_name[t], zio_taskq_types[q], mode, value); 1029 break; 1030 } 1031 1032 ASSERT3U(count, >, 0); 1033 tqs->stqs_count = count; 1034 tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); 1035 1036 for (uint_t i = 0; i < count; i++) { 1037 taskq_t *tq; 1038 char name[32]; 1039 1040 if (count > 1) 1041 (void) snprintf(name, sizeof (name), "%s_%s_%u", 1042 zio_type_name[t], zio_taskq_types[q], i); 1043 else 1044 (void) snprintf(name, sizeof (name), "%s_%s", 1045 zio_type_name[t], zio_taskq_types[q]); 1046 1047 if (zio_taskq_sysdc && spa->spa_proc != &p0) { 1048 if (batch) 1049 flags |= TASKQ_DC_BATCH; 1050 1051 (void) zio_taskq_basedc; 1052 tq = taskq_create_sysdc(name, value, 50, INT_MAX, 1053 spa->spa_proc, zio_taskq_basedc, flags); 1054 } else { 1055 pri_t pri = maxclsyspri; 1056 /* 1057 * The write issue taskq can be extremely CPU 1058 * intensive. Run it at slightly less important 1059 * priority than the other taskqs. 1060 * 1061 * Under Linux and FreeBSD this means incrementing 1062 * the priority value as opposed to platforms like 1063 * illumos where it should be decremented. 1064 * 1065 * On FreeBSD, if priorities divided by four (RQ_PPQ) 1066 * are equal then a difference between them is 1067 * insignificant. 1068 */ 1069 if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) { 1070 #if defined(__linux__) 1071 pri++; 1072 #elif defined(__FreeBSD__) 1073 pri += 4; 1074 #else 1075 #error "unknown OS" 1076 #endif 1077 } 1078 tq = taskq_create_proc(name, value, pri, 50, 1079 INT_MAX, spa->spa_proc, flags); 1080 } 1081 1082 tqs->stqs_taskq[i] = tq; 1083 } 1084 } 1085 1086 static void 1087 spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) 1088 { 1089 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1090 1091 if (tqs->stqs_taskq == NULL) { 1092 ASSERT3U(tqs->stqs_count, ==, 0); 1093 return; 1094 } 1095 1096 for (uint_t i = 0; i < tqs->stqs_count; i++) { 1097 ASSERT3P(tqs->stqs_taskq[i], !=, NULL); 1098 taskq_destroy(tqs->stqs_taskq[i]); 1099 } 1100 1101 kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *)); 1102 tqs->stqs_taskq = NULL; 1103 } 1104 1105 /* 1106 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. 1107 * Note that a type may have multiple discrete taskqs to avoid lock contention 1108 * on the taskq itself. In that case we choose which taskq at random by using 1109 * the low bits of gethrtime(). 1110 */ 1111 void 1112 spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, 1113 task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent) 1114 { 1115 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1116 taskq_t *tq; 1117 1118 ASSERT3P(tqs->stqs_taskq, !=, NULL); 1119 ASSERT3U(tqs->stqs_count, !=, 0); 1120 1121 if (tqs->stqs_count == 1) { 1122 tq = tqs->stqs_taskq[0]; 1123 } else { 1124 tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; 1125 } 1126 1127 taskq_dispatch_ent(tq, func, arg, flags, ent); 1128 } 1129 1130 /* 1131 * Same as spa_taskq_dispatch_ent() but block on the task until completion. 
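 *
 * Illustrative call shape (the task function and argument below are
 * hypothetical):
 *
 *	spa_taskq_dispatch_sync(spa, ZIO_TYPE_FREE, ZIO_TASKQ_ISSUE,
 *	    my_task_func, my_arg, TQ_SLEEP);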
1132 */ 1133 void 1134 spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q, 1135 task_func_t *func, void *arg, uint_t flags) 1136 { 1137 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1138 taskq_t *tq; 1139 taskqid_t id; 1140 1141 ASSERT3P(tqs->stqs_taskq, !=, NULL); 1142 ASSERT3U(tqs->stqs_count, !=, 0); 1143 1144 if (tqs->stqs_count == 1) { 1145 tq = tqs->stqs_taskq[0]; 1146 } else { 1147 tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; 1148 } 1149 1150 id = taskq_dispatch(tq, func, arg, flags); 1151 if (id) 1152 taskq_wait_id(tq, id); 1153 } 1154 1155 static void 1156 spa_create_zio_taskqs(spa_t *spa) 1157 { 1158 for (int t = 0; t < ZIO_TYPES; t++) { 1159 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 1160 spa_taskqs_init(spa, t, q); 1161 } 1162 } 1163 } 1164 1165 /* 1166 * Disabled until spa_thread() can be adapted for Linux. 1167 */ 1168 #undef HAVE_SPA_THREAD 1169 1170 #if defined(_KERNEL) && defined(HAVE_SPA_THREAD) 1171 static void 1172 spa_thread(void *arg) 1173 { 1174 psetid_t zio_taskq_psrset_bind = PS_NONE; 1175 callb_cpr_t cprinfo; 1176 1177 spa_t *spa = arg; 1178 user_t *pu = PTOU(curproc); 1179 1180 CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, 1181 spa->spa_name); 1182 1183 ASSERT(curproc != &p0); 1184 (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), 1185 "zpool-%s", spa->spa_name); 1186 (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); 1187 1188 /* bind this thread to the requested psrset */ 1189 if (zio_taskq_psrset_bind != PS_NONE) { 1190 pool_lock(); 1191 mutex_enter(&cpu_lock); 1192 mutex_enter(&pidlock); 1193 mutex_enter(&curproc->p_lock); 1194 1195 if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, 1196 0, NULL, NULL) == 0) { 1197 curthread->t_bind_pset = zio_taskq_psrset_bind; 1198 } else { 1199 cmn_err(CE_WARN, 1200 "Couldn't bind process for zfs pool \"%s\" to " 1201 "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); 1202 } 1203 1204 mutex_exit(&curproc->p_lock); 1205 mutex_exit(&pidlock); 1206 mutex_exit(&cpu_lock); 1207 pool_unlock(); 1208 } 1209 1210 if (zio_taskq_sysdc) { 1211 sysdc_thread_enter(curthread, 100, 0); 1212 } 1213 1214 spa->spa_proc = curproc; 1215 spa->spa_did = curthread->t_did; 1216 1217 spa_create_zio_taskqs(spa); 1218 1219 mutex_enter(&spa->spa_proc_lock); 1220 ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); 1221 1222 spa->spa_proc_state = SPA_PROC_ACTIVE; 1223 cv_broadcast(&spa->spa_proc_cv); 1224 1225 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1226 while (spa->spa_proc_state == SPA_PROC_ACTIVE) 1227 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1228 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); 1229 1230 ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); 1231 spa->spa_proc_state = SPA_PROC_GONE; 1232 spa->spa_proc = &p0; 1233 cv_broadcast(&spa->spa_proc_cv); 1234 CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ 1235 1236 mutex_enter(&curproc->p_lock); 1237 lwp_exit(); 1238 } 1239 #endif 1240 1241 /* 1242 * Activate an uninitialized pool. 
1243 */ 1244 static void 1245 spa_activate(spa_t *spa, spa_mode_t mode) 1246 { 1247 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 1248 1249 spa->spa_state = POOL_STATE_ACTIVE; 1250 spa->spa_mode = mode; 1251 spa->spa_read_spacemaps = spa_mode_readable_spacemaps; 1252 1253 spa->spa_normal_class = metaslab_class_create(spa, &zfs_metaslab_ops); 1254 spa->spa_log_class = metaslab_class_create(spa, &zfs_metaslab_ops); 1255 spa->spa_embedded_log_class = 1256 metaslab_class_create(spa, &zfs_metaslab_ops); 1257 spa->spa_special_class = metaslab_class_create(spa, &zfs_metaslab_ops); 1258 spa->spa_dedup_class = metaslab_class_create(spa, &zfs_metaslab_ops); 1259 1260 /* Try to create a covering process */ 1261 mutex_enter(&spa->spa_proc_lock); 1262 ASSERT(spa->spa_proc_state == SPA_PROC_NONE); 1263 ASSERT(spa->spa_proc == &p0); 1264 spa->spa_did = 0; 1265 1266 (void) spa_create_process; 1267 #ifdef HAVE_SPA_THREAD 1268 /* Only create a process if we're going to be around a while. */ 1269 if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { 1270 if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, 1271 NULL, 0) == 0) { 1272 spa->spa_proc_state = SPA_PROC_CREATED; 1273 while (spa->spa_proc_state == SPA_PROC_CREATED) { 1274 cv_wait(&spa->spa_proc_cv, 1275 &spa->spa_proc_lock); 1276 } 1277 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1278 ASSERT(spa->spa_proc != &p0); 1279 ASSERT(spa->spa_did != 0); 1280 } else { 1281 #ifdef _KERNEL 1282 cmn_err(CE_WARN, 1283 "Couldn't create process for zfs pool \"%s\"\n", 1284 spa->spa_name); 1285 #endif 1286 } 1287 } 1288 #endif /* HAVE_SPA_THREAD */ 1289 mutex_exit(&spa->spa_proc_lock); 1290 1291 /* If we didn't create a process, we need to create our taskqs. */ 1292 if (spa->spa_proc == &p0) { 1293 spa_create_zio_taskqs(spa); 1294 } 1295 1296 for (size_t i = 0; i < TXG_SIZE; i++) { 1297 spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, 1298 ZIO_FLAG_CANFAIL); 1299 } 1300 1301 list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), 1302 offsetof(vdev_t, vdev_config_dirty_node)); 1303 list_create(&spa->spa_evicting_os_list, sizeof (objset_t), 1304 offsetof(objset_t, os_evicting_node)); 1305 list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), 1306 offsetof(vdev_t, vdev_state_dirty_node)); 1307 1308 txg_list_create(&spa->spa_vdev_txg_list, spa, 1309 offsetof(struct vdev, vdev_txg_node)); 1310 1311 avl_create(&spa->spa_errlist_scrub, 1312 spa_error_entry_compare, sizeof (spa_error_entry_t), 1313 offsetof(spa_error_entry_t, se_avl)); 1314 avl_create(&spa->spa_errlist_last, 1315 spa_error_entry_compare, sizeof (spa_error_entry_t), 1316 offsetof(spa_error_entry_t, se_avl)); 1317 1318 spa_keystore_init(&spa->spa_keystore); 1319 1320 /* 1321 * This taskq is used to perform zvol-minor-related tasks 1322 * asynchronously. This has several advantages, including easy 1323 * resolution of various deadlocks. 1324 * 1325 * The taskq must be single threaded to ensure tasks are always 1326 * processed in the order in which they were dispatched. 1327 * 1328 * A taskq per pool allows one to keep the pools independent. 1329 * This way if one pool is suspended, it will not impact another. 1330 * 1331 * The preferred location to dispatch a zvol minor task is a sync 1332 * task. In this context, there is easy access to the spa_t and minimal 1333 * error handling is required because the sync task must succeed. 
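 *
 * (The zvol code dispatches its minor create/remove/rename tasks onto
 * this taskq; see the spa_zvol_taskq dispatches in zvol.c.)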
1334 */ 1335 spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri, 1336 1, INT_MAX, 0); 1337 1338 /* 1339 * Taskq dedicated to prefetcher threads: this is used to prevent the 1340 * pool traverse code from monopolizing the global (and limited) 1341 * system_taskq by inappropriately scheduling long running tasks on it. 1342 */ 1343 spa->spa_prefetch_taskq = taskq_create("z_prefetch", 100, 1344 defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); 1345 1346 /* 1347 * The taskq to upgrade datasets in this pool. Currently used by 1348 * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA. 1349 */ 1350 spa->spa_upgrade_taskq = taskq_create("z_upgrade", 100, 1351 defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); 1352 } 1353 1354 /* 1355 * Opposite of spa_activate(). 1356 */ 1357 static void 1358 spa_deactivate(spa_t *spa) 1359 { 1360 ASSERT(spa->spa_sync_on == B_FALSE); 1361 ASSERT(spa->spa_dsl_pool == NULL); 1362 ASSERT(spa->spa_root_vdev == NULL); 1363 ASSERT(spa->spa_async_zio_root == NULL); 1364 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 1365 1366 spa_evicting_os_wait(spa); 1367 1368 if (spa->spa_zvol_taskq) { 1369 taskq_destroy(spa->spa_zvol_taskq); 1370 spa->spa_zvol_taskq = NULL; 1371 } 1372 1373 if (spa->spa_prefetch_taskq) { 1374 taskq_destroy(spa->spa_prefetch_taskq); 1375 spa->spa_prefetch_taskq = NULL; 1376 } 1377 1378 if (spa->spa_upgrade_taskq) { 1379 taskq_destroy(spa->spa_upgrade_taskq); 1380 spa->spa_upgrade_taskq = NULL; 1381 } 1382 1383 txg_list_destroy(&spa->spa_vdev_txg_list); 1384 1385 list_destroy(&spa->spa_config_dirty_list); 1386 list_destroy(&spa->spa_evicting_os_list); 1387 list_destroy(&spa->spa_state_dirty_list); 1388 1389 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); 1390 1391 for (int t = 0; t < ZIO_TYPES; t++) { 1392 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 1393 spa_taskqs_fini(spa, t, q); 1394 } 1395 } 1396 1397 for (size_t i = 0; i < TXG_SIZE; i++) { 1398 ASSERT3P(spa->spa_txg_zio[i], !=, NULL); 1399 VERIFY0(zio_wait(spa->spa_txg_zio[i])); 1400 spa->spa_txg_zio[i] = NULL; 1401 } 1402 1403 metaslab_class_destroy(spa->spa_normal_class); 1404 spa->spa_normal_class = NULL; 1405 1406 metaslab_class_destroy(spa->spa_log_class); 1407 spa->spa_log_class = NULL; 1408 1409 metaslab_class_destroy(spa->spa_embedded_log_class); 1410 spa->spa_embedded_log_class = NULL; 1411 1412 metaslab_class_destroy(spa->spa_special_class); 1413 spa->spa_special_class = NULL; 1414 1415 metaslab_class_destroy(spa->spa_dedup_class); 1416 spa->spa_dedup_class = NULL; 1417 1418 /* 1419 * If this was part of an import or the open otherwise failed, we may 1420 * still have errors left in the queues. Empty them just in case. 
1421 */ 1422 spa_errlog_drain(spa); 1423 avl_destroy(&spa->spa_errlist_scrub); 1424 avl_destroy(&spa->spa_errlist_last); 1425 1426 spa_keystore_fini(&spa->spa_keystore); 1427 1428 spa->spa_state = POOL_STATE_UNINITIALIZED; 1429 1430 mutex_enter(&spa->spa_proc_lock); 1431 if (spa->spa_proc_state != SPA_PROC_NONE) { 1432 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1433 spa->spa_proc_state = SPA_PROC_DEACTIVATE; 1434 cv_broadcast(&spa->spa_proc_cv); 1435 while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { 1436 ASSERT(spa->spa_proc != &p0); 1437 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1438 } 1439 ASSERT(spa->spa_proc_state == SPA_PROC_GONE); 1440 spa->spa_proc_state = SPA_PROC_NONE; 1441 } 1442 ASSERT(spa->spa_proc == &p0); 1443 mutex_exit(&spa->spa_proc_lock); 1444 1445 /* 1446 * We want to make sure spa_thread() has actually exited the ZFS 1447 * module, so that the module can't be unloaded out from underneath 1448 * it. 1449 */ 1450 if (spa->spa_did != 0) { 1451 thread_join(spa->spa_did); 1452 spa->spa_did = 0; 1453 } 1454 } 1455 1456 /* 1457 * Verify a pool configuration, and construct the vdev tree appropriately. This 1458 * will create all the necessary vdevs in the appropriate layout, with each vdev 1459 * in the CLOSED state. This will prep the pool before open/creation/import. 1460 * All vdev validation is done by the vdev_alloc() routine. 1461 */ 1462 int 1463 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 1464 uint_t id, int atype) 1465 { 1466 nvlist_t **child; 1467 uint_t children; 1468 int error; 1469 1470 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 1471 return (error); 1472 1473 if ((*vdp)->vdev_ops->vdev_op_leaf) 1474 return (0); 1475 1476 error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 1477 &child, &children); 1478 1479 if (error == ENOENT) 1480 return (0); 1481 1482 if (error) { 1483 vdev_free(*vdp); 1484 *vdp = NULL; 1485 return (SET_ERROR(EINVAL)); 1486 } 1487 1488 for (int c = 0; c < children; c++) { 1489 vdev_t *vd; 1490 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 1491 atype)) != 0) { 1492 vdev_free(*vdp); 1493 *vdp = NULL; 1494 return (error); 1495 } 1496 } 1497 1498 ASSERT(*vdp != NULL); 1499 1500 return (0); 1501 } 1502 1503 static boolean_t 1504 spa_should_flush_logs_on_unload(spa_t *spa) 1505 { 1506 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) 1507 return (B_FALSE); 1508 1509 if (!spa_writeable(spa)) 1510 return (B_FALSE); 1511 1512 if (!spa->spa_sync_on) 1513 return (B_FALSE); 1514 1515 if (spa_state(spa) != POOL_STATE_EXPORTED) 1516 return (B_FALSE); 1517 1518 if (zfs_keep_log_spacemaps_at_export) 1519 return (B_FALSE); 1520 1521 return (B_TRUE); 1522 } 1523 1524 /* 1525 * Opens a transaction that will set the flag that will instruct 1526 * spa_sync to attempt to flush all the metaslabs for that txg. 
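 *
 * (The flag in question is spa->spa_log_flushall_txg, set below; the log
 * spacemap flushing code consults it during spa_sync().)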
1527 */ 1528 static void 1529 spa_unload_log_sm_flush_all(spa_t *spa) 1530 { 1531 dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 1532 VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); 1533 1534 ASSERT3U(spa->spa_log_flushall_txg, ==, 0); 1535 spa->spa_log_flushall_txg = dmu_tx_get_txg(tx); 1536 1537 dmu_tx_commit(tx); 1538 txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg); 1539 } 1540 1541 static void 1542 spa_unload_log_sm_metadata(spa_t *spa) 1543 { 1544 void *cookie = NULL; 1545 spa_log_sm_t *sls; 1546 while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg, 1547 &cookie)) != NULL) { 1548 VERIFY0(sls->sls_mscount); 1549 kmem_free(sls, sizeof (spa_log_sm_t)); 1550 } 1551 1552 for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); 1553 e != NULL; e = list_head(&spa->spa_log_summary)) { 1554 VERIFY0(e->lse_mscount); 1555 list_remove(&spa->spa_log_summary, e); 1556 kmem_free(e, sizeof (log_summary_entry_t)); 1557 } 1558 1559 spa->spa_unflushed_stats.sus_nblocks = 0; 1560 spa->spa_unflushed_stats.sus_memused = 0; 1561 spa->spa_unflushed_stats.sus_blocklimit = 0; 1562 } 1563 1564 static void 1565 spa_destroy_aux_threads(spa_t *spa) 1566 { 1567 if (spa->spa_condense_zthr != NULL) { 1568 zthr_destroy(spa->spa_condense_zthr); 1569 spa->spa_condense_zthr = NULL; 1570 } 1571 if (spa->spa_checkpoint_discard_zthr != NULL) { 1572 zthr_destroy(spa->spa_checkpoint_discard_zthr); 1573 spa->spa_checkpoint_discard_zthr = NULL; 1574 } 1575 if (spa->spa_livelist_delete_zthr != NULL) { 1576 zthr_destroy(spa->spa_livelist_delete_zthr); 1577 spa->spa_livelist_delete_zthr = NULL; 1578 } 1579 if (spa->spa_livelist_condense_zthr != NULL) { 1580 zthr_destroy(spa->spa_livelist_condense_zthr); 1581 spa->spa_livelist_condense_zthr = NULL; 1582 } 1583 } 1584 1585 /* 1586 * Opposite of spa_load(). 1587 */ 1588 static void 1589 spa_unload(spa_t *spa) 1590 { 1591 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1592 ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED); 1593 1594 spa_import_progress_remove(spa_guid(spa)); 1595 spa_load_note(spa, "UNLOADING"); 1596 1597 spa_wake_waiters(spa); 1598 1599 /* 1600 * If the log space map feature is enabled and the pool is getting 1601 * exported (but not destroyed), we want to spend some time flushing 1602 * as many metaslabs as we can in an attempt to destroy log space 1603 * maps and save import time. 1604 */ 1605 if (spa_should_flush_logs_on_unload(spa)) 1606 spa_unload_log_sm_flush_all(spa); 1607 1608 /* 1609 * Stop async tasks. 1610 */ 1611 spa_async_suspend(spa); 1612 1613 if (spa->spa_root_vdev) { 1614 vdev_t *root_vdev = spa->spa_root_vdev; 1615 vdev_initialize_stop_all(root_vdev, VDEV_INITIALIZE_ACTIVE); 1616 vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE); 1617 vdev_autotrim_stop_all(spa); 1618 vdev_rebuild_stop_all(spa); 1619 } 1620 1621 /* 1622 * Stop syncing. 1623 */ 1624 if (spa->spa_sync_on) { 1625 txg_sync_stop(spa->spa_dsl_pool); 1626 spa->spa_sync_on = B_FALSE; 1627 } 1628 1629 /* 1630 * This ensures that there is no async metaslab prefetching 1631 * while we attempt to unload the spa. 1632 */ 1633 if (spa->spa_root_vdev != NULL) { 1634 for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { 1635 vdev_t *vc = spa->spa_root_vdev->vdev_child[c]; 1636 if (vc->vdev_mg != NULL) 1637 taskq_wait(vc->vdev_mg->mg_taskq); 1638 } 1639 } 1640 1641 if (spa->spa_mmp.mmp_thread) 1642 mmp_thread_stop(spa); 1643 1644 /* 1645 * Wait for any outstanding async I/O to complete. 
1646 */ 1647 if (spa->spa_async_zio_root != NULL) { 1648 for (int i = 0; i < max_ncpus; i++) 1649 (void) zio_wait(spa->spa_async_zio_root[i]); 1650 kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *)); 1651 spa->spa_async_zio_root = NULL; 1652 } 1653 1654 if (spa->spa_vdev_removal != NULL) { 1655 spa_vdev_removal_destroy(spa->spa_vdev_removal); 1656 spa->spa_vdev_removal = NULL; 1657 } 1658 1659 spa_destroy_aux_threads(spa); 1660 1661 spa_condense_fini(spa); 1662 1663 bpobj_close(&spa->spa_deferred_bpobj); 1664 1665 spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); 1666 1667 /* 1668 * Close all vdevs. 1669 */ 1670 if (spa->spa_root_vdev) 1671 vdev_free(spa->spa_root_vdev); 1672 ASSERT(spa->spa_root_vdev == NULL); 1673 1674 /* 1675 * Close the dsl pool. 1676 */ 1677 if (spa->spa_dsl_pool) { 1678 dsl_pool_close(spa->spa_dsl_pool); 1679 spa->spa_dsl_pool = NULL; 1680 spa->spa_meta_objset = NULL; 1681 } 1682 1683 ddt_unload(spa); 1684 spa_unload_log_sm_metadata(spa); 1685 1686 /* 1687 * Drop and purge level 2 cache 1688 */ 1689 spa_l2cache_drop(spa); 1690 1691 for (int i = 0; i < spa->spa_spares.sav_count; i++) 1692 vdev_free(spa->spa_spares.sav_vdevs[i]); 1693 if (spa->spa_spares.sav_vdevs) { 1694 kmem_free(spa->spa_spares.sav_vdevs, 1695 spa->spa_spares.sav_count * sizeof (void *)); 1696 spa->spa_spares.sav_vdevs = NULL; 1697 } 1698 if (spa->spa_spares.sav_config) { 1699 nvlist_free(spa->spa_spares.sav_config); 1700 spa->spa_spares.sav_config = NULL; 1701 } 1702 spa->spa_spares.sav_count = 0; 1703 1704 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { 1705 vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); 1706 vdev_free(spa->spa_l2cache.sav_vdevs[i]); 1707 } 1708 if (spa->spa_l2cache.sav_vdevs) { 1709 kmem_free(spa->spa_l2cache.sav_vdevs, 1710 spa->spa_l2cache.sav_count * sizeof (void *)); 1711 spa->spa_l2cache.sav_vdevs = NULL; 1712 } 1713 if (spa->spa_l2cache.sav_config) { 1714 nvlist_free(spa->spa_l2cache.sav_config); 1715 spa->spa_l2cache.sav_config = NULL; 1716 } 1717 spa->spa_l2cache.sav_count = 0; 1718 1719 spa->spa_async_suspended = 0; 1720 1721 spa->spa_indirect_vdevs_loaded = B_FALSE; 1722 1723 if (spa->spa_comment != NULL) { 1724 spa_strfree(spa->spa_comment); 1725 spa->spa_comment = NULL; 1726 } 1727 if (spa->spa_compatibility != NULL) { 1728 spa_strfree(spa->spa_compatibility); 1729 spa->spa_compatibility = NULL; 1730 } 1731 1732 spa_config_exit(spa, SCL_ALL, spa); 1733 } 1734 1735 /* 1736 * Load (or re-load) the current list of vdevs describing the active spares for 1737 * this pool. When this is called, we have some form of basic information in 1738 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and 1739 * then re-generate a more complete list including status information. 1740 */ 1741 void 1742 spa_load_spares(spa_t *spa) 1743 { 1744 nvlist_t **spares; 1745 uint_t nspares; 1746 int i; 1747 vdev_t *vd, *tvd; 1748 1749 #ifndef _KERNEL 1750 /* 1751 * zdb opens both the current state of the pool and the 1752 * checkpointed state (if present), with a different spa_t. 1753 * 1754 * As spare vdevs are shared among open pools, we skip loading 1755 * them when we load the checkpointed state of the pool. 1756 */ 1757 if (!spa_writeable(spa)) 1758 return; 1759 #endif 1760 1761 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1762 1763 /* 1764 * First, close and free any existing spare vdevs. 
1765 */ 1766 for (i = 0; i < spa->spa_spares.sav_count; i++) { 1767 vd = spa->spa_spares.sav_vdevs[i]; 1768 1769 /* Undo the call to spa_activate() below */ 1770 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1771 B_FALSE)) != NULL && tvd->vdev_isspare) 1772 spa_spare_remove(tvd); 1773 vdev_close(vd); 1774 vdev_free(vd); 1775 } 1776 1777 if (spa->spa_spares.sav_vdevs) 1778 kmem_free(spa->spa_spares.sav_vdevs, 1779 spa->spa_spares.sav_count * sizeof (void *)); 1780 1781 if (spa->spa_spares.sav_config == NULL) 1782 nspares = 0; 1783 else 1784 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 1785 ZPOOL_CONFIG_SPARES, &spares, &nspares)); 1786 1787 spa->spa_spares.sav_count = (int)nspares; 1788 spa->spa_spares.sav_vdevs = NULL; 1789 1790 if (nspares == 0) 1791 return; 1792 1793 /* 1794 * Construct the array of vdevs, opening them to get status in the 1795 * process. For each spare, there are potentially two different vdev_t 1796 * structures associated with it: one in the list of spares (used only 1797 * for basic validation purposes) and one in the active vdev 1798 * configuration (if it's spared in). During this phase we open and 1799 * validate each vdev on the spare list. If the vdev also exists in the 1800 * active configuration, then we also mark this vdev as an active spare. 1801 */ 1802 spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *), 1803 KM_SLEEP); 1804 for (i = 0; i < spa->spa_spares.sav_count; i++) { 1805 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 1806 VDEV_ALLOC_SPARE) == 0); 1807 ASSERT(vd != NULL); 1808 1809 spa->spa_spares.sav_vdevs[i] = vd; 1810 1811 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1812 B_FALSE)) != NULL) { 1813 if (!tvd->vdev_isspare) 1814 spa_spare_add(tvd); 1815 1816 /* 1817 * We only mark the spare active if we were successfully 1818 * able to load the vdev. Otherwise, importing a pool 1819 * with a bad active spare would result in strange 1820 * behavior, because multiple pools would think the spare 1821 * is actively in use. 1822 * 1823 * There is a vulnerability here to an equally bizarre 1824 * circumstance, where a dead active spare is later 1825 * brought back to life (onlined or otherwise). Given 1826 * the rarity of this scenario, and the extra complexity 1827 * it adds, we ignore the possibility. 1828 */ 1829 if (!vdev_is_dead(tvd)) 1830 spa_spare_activate(tvd); 1831 } 1832 1833 vd->vdev_top = vd; 1834 vd->vdev_aux = &spa->spa_spares; 1835 1836 if (vdev_open(vd) != 0) 1837 continue; 1838 1839 if (vdev_validate_aux(vd) == 0) 1840 spa_spare_add(vd); 1841 } 1842 1843 /* 1844 * Recompute the stashed list of spares, with status information 1845 * this time. 1846 */ 1847 fnvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES); 1848 1849 spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), 1850 KM_SLEEP); 1851 for (i = 0; i < spa->spa_spares.sav_count; i++) 1852 spares[i] = vdev_config_generate(spa, 1853 spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); 1854 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 1855 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, 1856 spa->spa_spares.sav_count); 1857 for (i = 0; i < spa->spa_spares.sav_count; i++) 1858 nvlist_free(spares[i]); 1859 kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); 1860 } 1861 1862 /* 1863 * Load (or re-load) the current list of vdevs describing the active l2cache for 1864 * this pool. When this is called, we have some form of basic information in 'spa_l2cache.sav_config'.
We parse this into vdevs, try to open them, and 1866 * then re-generate a more complete list including status information. 1867 * Devices which are already active have their details maintained, and are 1868 * not re-opened. 1869 */ 1870 void 1871 spa_load_l2cache(spa_t *spa) 1872 { 1873 nvlist_t **l2cache = NULL; 1874 uint_t nl2cache; 1875 int i, j, oldnvdevs; 1876 uint64_t guid; 1877 vdev_t *vd, **oldvdevs, **newvdevs; 1878 spa_aux_vdev_t *sav = &spa->spa_l2cache; 1879 1880 #ifndef _KERNEL 1881 /* 1882 * zdb opens both the current state of the pool and the 1883 * checkpointed state (if present), with a different spa_t. 1884 * 1885 * As L2 caches are part of the ARC which is shared among open 1886 * pools, we skip loading them when we load the checkpointed 1887 * state of the pool. 1888 */ 1889 if (!spa_writeable(spa)) 1890 return; 1891 #endif 1892 1893 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1894 1895 oldvdevs = sav->sav_vdevs; 1896 oldnvdevs = sav->sav_count; 1897 sav->sav_vdevs = NULL; 1898 sav->sav_count = 0; 1899 1900 if (sav->sav_config == NULL) { 1901 nl2cache = 0; 1902 newvdevs = NULL; 1903 goto out; 1904 } 1905 1906 VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, 1907 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); 1908 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 1909 1910 /* 1911 * Process new nvlist of vdevs. 1912 */ 1913 for (i = 0; i < nl2cache; i++) { 1914 guid = fnvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID); 1915 1916 newvdevs[i] = NULL; 1917 for (j = 0; j < oldnvdevs; j++) { 1918 vd = oldvdevs[j]; 1919 if (vd != NULL && guid == vd->vdev_guid) { 1920 /* 1921 * Retain previous vdev for add/remove ops. 1922 */ 1923 newvdevs[i] = vd; 1924 oldvdevs[j] = NULL; 1925 break; 1926 } 1927 } 1928 1929 if (newvdevs[i] == NULL) { 1930 /* 1931 * Create new vdev 1932 */ 1933 VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 1934 VDEV_ALLOC_L2CACHE) == 0); 1935 ASSERT(vd != NULL); 1936 newvdevs[i] = vd; 1937 1938 /* 1939 * Commit this vdev as an l2cache device, 1940 * even if it fails to open. 1941 */ 1942 spa_l2cache_add(vd); 1943 1944 vd->vdev_top = vd; 1945 vd->vdev_aux = sav; 1946 1947 spa_l2cache_activate(vd); 1948 1949 if (vdev_open(vd) != 0) 1950 continue; 1951 1952 (void) vdev_validate_aux(vd); 1953 1954 if (!vdev_is_dead(vd)) 1955 l2arc_add_vdev(spa, vd); 1956 1957 /* 1958 * Upon cache device addition to a pool or pool 1959 * creation with a cache device or if the header 1960 * of the device is invalid we issue an async 1961 * TRIM command for the whole device which will 1962 * execute if l2arc_trim_ahead > 0. 1963 */ 1964 spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); 1965 } 1966 } 1967 1968 sav->sav_vdevs = newvdevs; 1969 sav->sav_count = (int)nl2cache; 1970 1971 /* 1972 * Recompute the stashed list of l2cache devices, with status 1973 * information this time. 
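 * Each cache device's entry is regenerated with vdev_config_generate(...,
 * VDEV_CONFIG_L2CACHE), so the stored nvlist reflects the device state
 * observed above.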
1974 */ 1975 fnvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE); 1976 1977 if (sav->sav_count > 0) 1978 l2cache = kmem_alloc(sav->sav_count * sizeof (void *), 1979 KM_SLEEP); 1980 for (i = 0; i < sav->sav_count; i++) 1981 l2cache[i] = vdev_config_generate(spa, 1982 sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); 1983 fnvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, 1984 (const nvlist_t * const *)l2cache, sav->sav_count); 1985 1986 out: 1987 /* 1988 * Purge vdevs that were dropped 1989 */ 1990 for (i = 0; i < oldnvdevs; i++) { 1991 uint64_t pool; 1992 1993 vd = oldvdevs[i]; 1994 if (vd != NULL) { 1995 ASSERT(vd->vdev_isl2cache); 1996 1997 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 1998 pool != 0ULL && l2arc_vdev_present(vd)) 1999 l2arc_remove_vdev(vd); 2000 vdev_clear_stats(vd); 2001 vdev_free(vd); 2002 } 2003 } 2004 2005 if (oldvdevs) 2006 kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 2007 2008 for (i = 0; i < sav->sav_count; i++) 2009 nvlist_free(l2cache[i]); 2010 if (sav->sav_count) 2011 kmem_free(l2cache, sav->sav_count * sizeof (void *)); 2012 } 2013 2014 static int 2015 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 2016 { 2017 dmu_buf_t *db; 2018 char *packed = NULL; 2019 size_t nvsize = 0; 2020 int error; 2021 *value = NULL; 2022 2023 error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db); 2024 if (error) 2025 return (error); 2026 2027 nvsize = *(uint64_t *)db->db_data; 2028 dmu_buf_rele(db, FTAG); 2029 2030 packed = vmem_alloc(nvsize, KM_SLEEP); 2031 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, 2032 DMU_READ_PREFETCH); 2033 if (error == 0) 2034 error = nvlist_unpack(packed, nvsize, value, 0); 2035 vmem_free(packed, nvsize); 2036 2037 return (error); 2038 } 2039 2040 /* 2041 * Concrete top-level vdevs that are not missing and are not logs. At every 2042 * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds. 2043 */ 2044 static uint64_t 2045 spa_healthy_core_tvds(spa_t *spa) 2046 { 2047 vdev_t *rvd = spa->spa_root_vdev; 2048 uint64_t tvds = 0; 2049 2050 for (uint64_t i = 0; i < rvd->vdev_children; i++) { 2051 vdev_t *vd = rvd->vdev_child[i]; 2052 if (vd->vdev_islog) 2053 continue; 2054 if (vdev_is_concrete(vd) && !vdev_is_dead(vd)) 2055 tvds++; 2056 } 2057 2058 return (tvds); 2059 } 2060 2061 /* 2062 * Checks to see if the given vdev could not be opened, in which case we post a 2063 * sysevent to notify the autoreplace code that the device has been removed. 2064 */ 2065 static void 2066 spa_check_removed(vdev_t *vd) 2067 { 2068 for (uint64_t c = 0; c < vd->vdev_children; c++) 2069 spa_check_removed(vd->vdev_child[c]); 2070 2071 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && 2072 vdev_is_concrete(vd)) { 2073 zfs_post_autoreplace(vd->vdev_spa, vd); 2074 spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK); 2075 } 2076 } 2077 2078 static int 2079 spa_check_for_missing_logs(spa_t *spa) 2080 { 2081 vdev_t *rvd = spa->spa_root_vdev; 2082 2083 /* 2084 * If we're doing a normal import, then build up any additional 2085 * diagnostic information about missing log devices. 2086 * We'll pass this up to the user for further processing. 
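 * The resulting list is attached to spa_load_info as
 * ZPOOL_CONFIG_MISSING_DEVICES, so userland (e.g. 'zpool import') can report
 * exactly which log vdevs are missing.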
2087 */ 2088 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { 2089 nvlist_t **child, *nv; 2090 uint64_t idx = 0; 2091 2092 child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t *), 2093 KM_SLEEP); 2094 nv = fnvlist_alloc(); 2095 2096 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 2097 vdev_t *tvd = rvd->vdev_child[c]; 2098 2099 /* 2100 * We consider a device as missing only if it failed 2101 * to open (i.e. offline or faulted is not considered 2102 * as missing). 2103 */ 2104 if (tvd->vdev_islog && 2105 tvd->vdev_state == VDEV_STATE_CANT_OPEN) { 2106 child[idx++] = vdev_config_generate(spa, tvd, 2107 B_FALSE, VDEV_CONFIG_MISSING); 2108 } 2109 } 2110 2111 if (idx > 0) { 2112 fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 2113 (const nvlist_t * const *)child, idx); 2114 fnvlist_add_nvlist(spa->spa_load_info, 2115 ZPOOL_CONFIG_MISSING_DEVICES, nv); 2116 2117 for (uint64_t i = 0; i < idx; i++) 2118 nvlist_free(child[i]); 2119 } 2120 nvlist_free(nv); 2121 kmem_free(child, rvd->vdev_children * sizeof (char **)); 2122 2123 if (idx > 0) { 2124 spa_load_failed(spa, "some log devices are missing"); 2125 vdev_dbgmsg_print_tree(rvd, 2); 2126 return (SET_ERROR(ENXIO)); 2127 } 2128 } else { 2129 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 2130 vdev_t *tvd = rvd->vdev_child[c]; 2131 2132 if (tvd->vdev_islog && 2133 tvd->vdev_state == VDEV_STATE_CANT_OPEN) { 2134 spa_set_log_state(spa, SPA_LOG_CLEAR); 2135 spa_load_note(spa, "some log devices are " 2136 "missing, ZIL is dropped."); 2137 vdev_dbgmsg_print_tree(rvd, 2); 2138 break; 2139 } 2140 } 2141 } 2142 2143 return (0); 2144 } 2145 2146 /* 2147 * Check for missing log devices 2148 */ 2149 static boolean_t 2150 spa_check_logs(spa_t *spa) 2151 { 2152 boolean_t rv = B_FALSE; 2153 dsl_pool_t *dp = spa_get_dsl(spa); 2154 2155 switch (spa->spa_log_state) { 2156 default: 2157 break; 2158 case SPA_LOG_MISSING: 2159 /* need to recheck in case slog has been restored */ 2160 case SPA_LOG_UNKNOWN: 2161 rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 2162 zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0); 2163 if (rv) 2164 spa_set_log_state(spa, SPA_LOG_MISSING); 2165 break; 2166 } 2167 return (rv); 2168 } 2169 2170 /* 2171 * Passivate any log vdevs (note, does not apply to embedded log metaslabs). 2172 */ 2173 static boolean_t 2174 spa_passivate_log(spa_t *spa) 2175 { 2176 vdev_t *rvd = spa->spa_root_vdev; 2177 boolean_t slog_found = B_FALSE; 2178 2179 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 2180 2181 for (int c = 0; c < rvd->vdev_children; c++) { 2182 vdev_t *tvd = rvd->vdev_child[c]; 2183 2184 if (tvd->vdev_islog) { 2185 ASSERT3P(tvd->vdev_log_mg, ==, NULL); 2186 metaslab_group_passivate(tvd->vdev_mg); 2187 slog_found = B_TRUE; 2188 } 2189 } 2190 2191 return (slog_found); 2192 } 2193 2194 /* 2195 * Activate any log vdevs (note, does not apply to embedded log metaslabs). 
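 * This is the inverse of spa_passivate_log() above: the metaslab group of
 * every top-level log vdev is activated again.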
2196 */ 2197 static void 2198 spa_activate_log(spa_t *spa) 2199 { 2200 vdev_t *rvd = spa->spa_root_vdev; 2201 2202 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 2203 2204 for (int c = 0; c < rvd->vdev_children; c++) { 2205 vdev_t *tvd = rvd->vdev_child[c]; 2206 2207 if (tvd->vdev_islog) { 2208 ASSERT3P(tvd->vdev_log_mg, ==, NULL); 2209 metaslab_group_activate(tvd->vdev_mg); 2210 } 2211 } 2212 } 2213 2214 int 2215 spa_reset_logs(spa_t *spa) 2216 { 2217 int error; 2218 2219 error = dmu_objset_find(spa_name(spa), zil_reset, 2220 NULL, DS_FIND_CHILDREN); 2221 if (error == 0) { 2222 /* 2223 * We successfully offlined the log device, sync out the 2224 * current txg so that the "stubby" block can be removed 2225 * by zil_sync(). 2226 */ 2227 txg_wait_synced(spa->spa_dsl_pool, 0); 2228 } 2229 return (error); 2230 } 2231 2232 static void 2233 spa_aux_check_removed(spa_aux_vdev_t *sav) 2234 { 2235 for (int i = 0; i < sav->sav_count; i++) 2236 spa_check_removed(sav->sav_vdevs[i]); 2237 } 2238 2239 void 2240 spa_claim_notify(zio_t *zio) 2241 { 2242 spa_t *spa = zio->io_spa; 2243 2244 if (zio->io_error) 2245 return; 2246 2247 mutex_enter(&spa->spa_props_lock); /* any mutex will do */ 2248 if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) 2249 spa->spa_claim_max_txg = zio->io_bp->blk_birth; 2250 mutex_exit(&spa->spa_props_lock); 2251 } 2252 2253 typedef struct spa_load_error { 2254 uint64_t sle_meta_count; 2255 uint64_t sle_data_count; 2256 } spa_load_error_t; 2257 2258 static void 2259 spa_load_verify_done(zio_t *zio) 2260 { 2261 blkptr_t *bp = zio->io_bp; 2262 spa_load_error_t *sle = zio->io_private; 2263 dmu_object_type_t type = BP_GET_TYPE(bp); 2264 int error = zio->io_error; 2265 spa_t *spa = zio->io_spa; 2266 2267 abd_free(zio->io_abd); 2268 if (error) { 2269 if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && 2270 type != DMU_OT_INTENT_LOG) 2271 atomic_inc_64(&sle->sle_meta_count); 2272 else 2273 atomic_inc_64(&sle->sle_data_count); 2274 } 2275 2276 mutex_enter(&spa->spa_scrub_lock); 2277 spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp); 2278 cv_broadcast(&spa->spa_scrub_io_cv); 2279 mutex_exit(&spa->spa_scrub_lock); 2280 } 2281 2282 /* 2283 * Maximum number of inflight bytes is the log2 fraction of the arc size. 2284 * By default, we set it to 1/16th of the arc. 2285 */ 2286 static int spa_load_verify_shift = 4; 2287 static int spa_load_verify_metadata = B_TRUE; 2288 static int spa_load_verify_data = B_TRUE; 2289 2290 static int 2291 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 2292 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 2293 { 2294 (void) zilog, (void) dnp; 2295 2296 if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || 2297 BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp)) 2298 return (0); 2299 /* 2300 * Note: normally this routine will not be called if 2301 * spa_load_verify_metadata is not set. However, it may be useful 2302 * to manually set the flag after the traversal has begun. 
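 * The in-flight limit computed below is arc_target_bytes() >>
 * spa_load_verify_shift; e.g. with a 16 GiB ARC target and the default shift
 * of 4, roughly 1 GiB of verification reads may be outstanding at once.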
2303 */ 2304 if (!spa_load_verify_metadata) 2305 return (0); 2306 if (!BP_IS_METADATA(bp) && !spa_load_verify_data) 2307 return (0); 2308 2309 uint64_t maxinflight_bytes = 2310 arc_target_bytes() >> spa_load_verify_shift; 2311 zio_t *rio = arg; 2312 size_t size = BP_GET_PSIZE(bp); 2313 2314 mutex_enter(&spa->spa_scrub_lock); 2315 while (spa->spa_load_verify_bytes >= maxinflight_bytes) 2316 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2317 spa->spa_load_verify_bytes += size; 2318 mutex_exit(&spa->spa_scrub_lock); 2319 2320 zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size, 2321 spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, 2322 ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | 2323 ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); 2324 return (0); 2325 } 2326 2327 static int 2328 verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) 2329 { 2330 (void) dp, (void) arg; 2331 2332 if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN) 2333 return (SET_ERROR(ENAMETOOLONG)); 2334 2335 return (0); 2336 } 2337 2338 static int 2339 spa_load_verify(spa_t *spa) 2340 { 2341 zio_t *rio; 2342 spa_load_error_t sle = { 0 }; 2343 zpool_load_policy_t policy; 2344 boolean_t verify_ok = B_FALSE; 2345 int error = 0; 2346 2347 zpool_get_load_policy(spa->spa_config, &policy); 2348 2349 if (policy.zlp_rewind & ZPOOL_NEVER_REWIND) 2350 return (0); 2351 2352 dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); 2353 error = dmu_objset_find_dp(spa->spa_dsl_pool, 2354 spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL, 2355 DS_FIND_CHILDREN); 2356 dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); 2357 if (error != 0) 2358 return (error); 2359 2360 rio = zio_root(spa, NULL, &sle, 2361 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 2362 2363 if (spa_load_verify_metadata) { 2364 if (spa->spa_extreme_rewind) { 2365 spa_load_note(spa, "performing a complete scan of the " 2366 "pool since extreme rewind is on. 
This may take " 2367 "a very long time.\n (spa_load_verify_data=%u, " 2368 "spa_load_verify_metadata=%u)", 2369 spa_load_verify_data, spa_load_verify_metadata); 2370 } 2371 2372 error = traverse_pool(spa, spa->spa_verify_min_txg, 2373 TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | 2374 TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio); 2375 } 2376 2377 (void) zio_wait(rio); 2378 ASSERT0(spa->spa_load_verify_bytes); 2379 2380 spa->spa_load_meta_errors = sle.sle_meta_count; 2381 spa->spa_load_data_errors = sle.sle_data_count; 2382 2383 if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) { 2384 spa_load_note(spa, "spa_load_verify found %llu metadata errors " 2385 "and %llu data errors", (u_longlong_t)sle.sle_meta_count, 2386 (u_longlong_t)sle.sle_data_count); 2387 } 2388 2389 if (spa_load_verify_dryrun || 2390 (!error && sle.sle_meta_count <= policy.zlp_maxmeta && 2391 sle.sle_data_count <= policy.zlp_maxdata)) { 2392 int64_t loss = 0; 2393 2394 verify_ok = B_TRUE; 2395 spa->spa_load_txg = spa->spa_uberblock.ub_txg; 2396 spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; 2397 2398 loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; 2399 fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_TIME, 2400 spa->spa_load_txg_ts); 2401 fnvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME, 2402 loss); 2403 fnvlist_add_uint64(spa->spa_load_info, 2404 ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count); 2405 } else { 2406 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; 2407 } 2408 2409 if (spa_load_verify_dryrun) 2410 return (0); 2411 2412 if (error) { 2413 if (error != ENXIO && error != EIO) 2414 error = SET_ERROR(EIO); 2415 return (error); 2416 } 2417 2418 return (verify_ok ? 0 : EIO); 2419 } 2420 2421 /* 2422 * Find a value in the pool props object. 2423 */ 2424 static void 2425 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 2426 { 2427 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 2428 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 2429 } 2430 2431 /* 2432 * Find a value in the pool directory object. 
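 * The directory is the DMU_POOL_DIRECTORY_OBJECT ZAP in the MOS. A minimal
 * usage sketch, mirroring callers later in this file:
 *
 *     uint64_t obj;
 *     nvlist_t *nv;
 *     if (spa_dir_prop(spa, DMU_POOL_CONFIG, &obj, B_TRUE) == 0)
 *             (void) load_nvlist(spa, obj, &nv);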
2433 */ 2434 static int 2435 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent) 2436 { 2437 int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2438 name, sizeof (uint64_t), 1, val); 2439 2440 if (error != 0 && (error != ENOENT || log_enoent)) { 2441 spa_load_failed(spa, "couldn't get '%s' value in MOS directory " 2442 "[error=%d]", name, error); 2443 } 2444 2445 return (error); 2446 } 2447 2448 static int 2449 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 2450 { 2451 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 2452 return (SET_ERROR(err)); 2453 } 2454 2455 boolean_t 2456 spa_livelist_delete_check(spa_t *spa) 2457 { 2458 return (spa->spa_livelists_to_delete != 0); 2459 } 2460 2461 static boolean_t 2462 spa_livelist_delete_cb_check(void *arg, zthr_t *z) 2463 { 2464 (void) z; 2465 spa_t *spa = arg; 2466 return (spa_livelist_delete_check(spa)); 2467 } 2468 2469 static int 2470 delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 2471 { 2472 spa_t *spa = arg; 2473 zio_free(spa, tx->tx_txg, bp); 2474 dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, 2475 -bp_get_dsize_sync(spa, bp), 2476 -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); 2477 return (0); 2478 } 2479 2480 static int 2481 dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp) 2482 { 2483 int err; 2484 zap_cursor_t zc; 2485 zap_attribute_t za; 2486 zap_cursor_init(&zc, os, zap_obj); 2487 err = zap_cursor_retrieve(&zc, &za); 2488 zap_cursor_fini(&zc); 2489 if (err == 0) 2490 *llp = za.za_first_integer; 2491 return (err); 2492 } 2493 2494 /* 2495 * Components of livelist deletion that must be performed in syncing 2496 * context: freeing block pointers and updating the pool-wide data 2497 * structures to indicate how much work is left to do 2498 */ 2499 typedef struct sublist_delete_arg { 2500 spa_t *spa; 2501 dsl_deadlist_t *ll; 2502 uint64_t key; 2503 bplist_t *to_free; 2504 } sublist_delete_arg_t; 2505 2506 static void 2507 sublist_delete_sync(void *arg, dmu_tx_t *tx) 2508 { 2509 sublist_delete_arg_t *sda = arg; 2510 spa_t *spa = sda->spa; 2511 dsl_deadlist_t *ll = sda->ll; 2512 uint64_t key = sda->key; 2513 bplist_t *to_free = sda->to_free; 2514 2515 bplist_iterate(to_free, delete_blkptr_cb, spa, tx); 2516 dsl_deadlist_remove_entry(ll, key, tx); 2517 } 2518 2519 typedef struct livelist_delete_arg { 2520 spa_t *spa; 2521 uint64_t ll_obj; 2522 uint64_t zap_obj; 2523 } livelist_delete_arg_t; 2524 2525 static void 2526 livelist_delete_sync(void *arg, dmu_tx_t *tx) 2527 { 2528 livelist_delete_arg_t *lda = arg; 2529 spa_t *spa = lda->spa; 2530 uint64_t ll_obj = lda->ll_obj; 2531 uint64_t zap_obj = lda->zap_obj; 2532 objset_t *mos = spa->spa_meta_objset; 2533 uint64_t count; 2534 2535 /* free the livelist and decrement the feature count */ 2536 VERIFY0(zap_remove_int(mos, zap_obj, ll_obj, tx)); 2537 dsl_deadlist_free(mos, ll_obj, tx); 2538 spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx); 2539 VERIFY0(zap_count(mos, zap_obj, &count)); 2540 if (count == 0) { 2541 /* no more livelists to delete */ 2542 VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, 2543 DMU_POOL_DELETED_CLONES, tx)); 2544 VERIFY0(zap_destroy(mos, zap_obj, tx)); 2545 spa->spa_livelists_to_delete = 0; 2546 spa_notify_waiters(spa); 2547 } 2548 } 2549 2550 /* 2551 * Load in the value for the livelist to be removed and open it. Then, 2552 * load its first sublist and determine which block pointers should actually 2553 * be freed. 
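 * (A sublist records both ALLOCs and FREEs for the clone; matching pairs
 * cancel out, leaving only the block pointers that still need to be freed.)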
Then, call a synctask which performs the actual frees and updates 2554 * the pool-wide livelist data. 2555 */ 2556 static void 2557 spa_livelist_delete_cb(void *arg, zthr_t *z) 2558 { 2559 spa_t *spa = arg; 2560 uint64_t ll_obj = 0, count; 2561 objset_t *mos = spa->spa_meta_objset; 2562 uint64_t zap_obj = spa->spa_livelists_to_delete; 2563 /* 2564 * Determine the next livelist to delete. This function should only 2565 * be called if there is at least one deleted clone. 2566 */ 2567 VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj)); 2568 VERIFY0(zap_count(mos, ll_obj, &count)); 2569 if (count > 0) { 2570 dsl_deadlist_t *ll; 2571 dsl_deadlist_entry_t *dle; 2572 bplist_t to_free; 2573 ll = kmem_zalloc(sizeof (dsl_deadlist_t), KM_SLEEP); 2574 dsl_deadlist_open(ll, mos, ll_obj); 2575 dle = dsl_deadlist_first(ll); 2576 ASSERT3P(dle, !=, NULL); 2577 bplist_create(&to_free); 2578 int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free, 2579 z, NULL); 2580 if (err == 0) { 2581 sublist_delete_arg_t sync_arg = { 2582 .spa = spa, 2583 .ll = ll, 2584 .key = dle->dle_mintxg, 2585 .to_free = &to_free 2586 }; 2587 zfs_dbgmsg("deleting sublist (id %llu) from" 2588 " livelist %llu, %lld remaining", 2589 (u_longlong_t)dle->dle_bpobj.bpo_object, 2590 (u_longlong_t)ll_obj, (longlong_t)count - 1); 2591 VERIFY0(dsl_sync_task(spa_name(spa), NULL, 2592 sublist_delete_sync, &sync_arg, 0, 2593 ZFS_SPACE_CHECK_DESTROY)); 2594 } else { 2595 VERIFY3U(err, ==, EINTR); 2596 } 2597 bplist_clear(&to_free); 2598 bplist_destroy(&to_free); 2599 dsl_deadlist_close(ll); 2600 kmem_free(ll, sizeof (dsl_deadlist_t)); 2601 } else { 2602 livelist_delete_arg_t sync_arg = { 2603 .spa = spa, 2604 .ll_obj = ll_obj, 2605 .zap_obj = zap_obj 2606 }; 2607 zfs_dbgmsg("deletion of livelist %llu completed", 2608 (u_longlong_t)ll_obj); 2609 VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync, 2610 &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY)); 2611 } 2612 } 2613 2614 static void 2615 spa_start_livelist_destroy_thread(spa_t *spa) 2616 { 2617 ASSERT3P(spa->spa_livelist_delete_zthr, ==, NULL); 2618 spa->spa_livelist_delete_zthr = 2619 zthr_create("z_livelist_destroy", 2620 spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa, 2621 minclsyspri); 2622 } 2623 2624 typedef struct livelist_new_arg { 2625 bplist_t *allocs; 2626 bplist_t *frees; 2627 } livelist_new_arg_t; 2628 2629 static int 2630 livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 2631 dmu_tx_t *tx) 2632 { 2633 ASSERT(tx == NULL); 2634 livelist_new_arg_t *lna = arg; 2635 if (bp_freed) { 2636 bplist_append(lna->frees, bp); 2637 } else { 2638 bplist_append(lna->allocs, bp); 2639 zfs_livelist_condense_new_alloc++; 2640 } 2641 return (0); 2642 } 2643 2644 typedef struct livelist_condense_arg { 2645 spa_t *spa; 2646 bplist_t to_keep; 2647 uint64_t first_size; 2648 uint64_t next_size; 2649 } livelist_condense_arg_t; 2650 2651 static void 2652 spa_livelist_condense_sync(void *arg, dmu_tx_t *tx) 2653 { 2654 livelist_condense_arg_t *lca = arg; 2655 spa_t *spa = lca->spa; 2656 bplist_t new_frees; 2657 dsl_dataset_t *ds = spa->spa_to_condense.ds; 2658 2659 /* Have we been cancelled? 
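 * (The cancelled flag is set if the livelist being condensed went away,
 * e.g. because its clone was destroyed, while this zthr was running; in
 * that case we only clean up below.)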
*/ 2660 if (spa->spa_to_condense.cancelled) { 2661 zfs_livelist_condense_sync_cancel++; 2662 goto out; 2663 } 2664 2665 dsl_deadlist_entry_t *first = spa->spa_to_condense.first; 2666 dsl_deadlist_entry_t *next = spa->spa_to_condense.next; 2667 dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist; 2668 2669 /* 2670 * It's possible that the livelist was changed while the zthr was 2671 * running. Therefore, we need to check for new blkptrs in the two 2672 * entries being condensed and continue to track them in the livelist. 2673 * Because of the way we handle remapped blkptrs (see dbuf_remap_impl), 2674 * it's possible that the newly added blkptrs are FREEs or ALLOCs so 2675 * we need to sort them into two different bplists. 2676 */ 2677 uint64_t first_obj = first->dle_bpobj.bpo_object; 2678 uint64_t next_obj = next->dle_bpobj.bpo_object; 2679 uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs; 2680 uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs; 2681 2682 bplist_create(&new_frees); 2683 livelist_new_arg_t new_bps = { 2684 .allocs = &lca->to_keep, 2685 .frees = &new_frees, 2686 }; 2687 2688 if (cur_first_size > lca->first_size) { 2689 VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj, 2690 livelist_track_new_cb, &new_bps, lca->first_size)); 2691 } 2692 if (cur_next_size > lca->next_size) { 2693 VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj, 2694 livelist_track_new_cb, &new_bps, lca->next_size)); 2695 } 2696 2697 dsl_deadlist_clear_entry(first, ll, tx); 2698 ASSERT(bpobj_is_empty(&first->dle_bpobj)); 2699 dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx); 2700 2701 bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx); 2702 bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx); 2703 bplist_destroy(&new_frees); 2704 2705 char dsname[ZFS_MAX_DATASET_NAME_LEN]; 2706 dsl_dataset_name(ds, dsname); 2707 zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu " 2708 "(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu " 2709 "(%llu blkptrs)", (u_longlong_t)tx->tx_txg, dsname, 2710 (u_longlong_t)ds->ds_object, (u_longlong_t)first_obj, 2711 (u_longlong_t)cur_first_size, (u_longlong_t)next_obj, 2712 (u_longlong_t)cur_next_size, 2713 (u_longlong_t)first->dle_bpobj.bpo_object, 2714 (u_longlong_t)first->dle_bpobj.bpo_phys->bpo_num_blkptrs); 2715 out: 2716 dmu_buf_rele(ds->ds_dbuf, spa); 2717 spa->spa_to_condense.ds = NULL; 2718 bplist_clear(&lca->to_keep); 2719 bplist_destroy(&lca->to_keep); 2720 kmem_free(lca, sizeof (livelist_condense_arg_t)); 2721 spa->spa_to_condense.syncing = B_FALSE; 2722 } 2723 2724 static void 2725 spa_livelist_condense_cb(void *arg, zthr_t *t) 2726 { 2727 while (zfs_livelist_condense_zthr_pause && 2728 !(zthr_has_waiters(t) || zthr_iscancelled(t))) 2729 delay(1); 2730 2731 spa_t *spa = arg; 2732 dsl_deadlist_entry_t *first = spa->spa_to_condense.first; 2733 dsl_deadlist_entry_t *next = spa->spa_to_condense.next; 2734 uint64_t first_size, next_size; 2735 2736 livelist_condense_arg_t *lca = 2737 kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP); 2738 bplist_create(&lca->to_keep); 2739 2740 /* 2741 * Process the livelists (matching FREEs and ALLOCs) in open context 2742 * so we have minimal work in syncing context to condense. 2743 * 2744 * We save bpobj sizes (first_size and next_size) to use later in 2745 * syncing context to determine if entries were added to these sublists 2746 * while in open context. 
This is possible because the clone is still 2747 * active and open for normal writes and we want to make sure the new, 2748 * unprocessed blockpointers are inserted into the livelist normally. 2749 * 2750 * Note that dsl_process_sub_livelist() both stores the size (number of 2751 * blockpointers) and iterates over them with the bpobj's lock held, so 2752 * the sizes returned to us are consistent with what was actually 2753 * processed. 2754 */ 2755 int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t, 2756 &first_size); 2757 if (err == 0) 2758 err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep, 2759 t, &next_size); 2760 2761 if (err == 0) { 2762 while (zfs_livelist_condense_sync_pause && 2763 !(zthr_has_waiters(t) || zthr_iscancelled(t))) 2764 delay(1); 2765 2766 dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 2767 dmu_tx_mark_netfree(tx); 2768 dmu_tx_hold_space(tx, 1); 2769 err = dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE); 2770 if (err == 0) { 2771 /* 2772 * Prevent the condense zthr from restarting before 2773 * the synctask completes. 2774 */ 2775 spa->spa_to_condense.syncing = B_TRUE; 2776 lca->spa = spa; 2777 lca->first_size = first_size; 2778 lca->next_size = next_size; 2779 dsl_sync_task_nowait(spa_get_dsl(spa), 2780 spa_livelist_condense_sync, lca, tx); 2781 dmu_tx_commit(tx); 2782 return; 2783 } 2784 } 2785 /* 2786 * Condensing cannot continue: either it was externally stopped or 2787 * we were unable to assign to a tx because the pool has run out of 2788 * space. In the second case, we'll just end up trying to condense 2789 * again in a later txg. 2790 */ 2791 ASSERT(err != 0); 2792 bplist_clear(&lca->to_keep); 2793 bplist_destroy(&lca->to_keep); 2794 kmem_free(lca, sizeof (livelist_condense_arg_t)); 2795 dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa); 2796 spa->spa_to_condense.ds = NULL; 2797 if (err == EINTR) 2798 zfs_livelist_condense_zthr_cancel++; 2799 } 2800 2801 /* 2802 * Check that there is something to condense but that a condense is not 2803 * already in progress and that condensing has not been cancelled.
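 * The zthr framework polls this predicate; when it returns B_TRUE the zthr
 * wakes up and runs spa_livelist_condense_cb() above.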
2804 */ 2805 static boolean_t 2806 spa_livelist_condense_cb_check(void *arg, zthr_t *z) 2807 { 2808 (void) z; 2809 spa_t *spa = arg; 2810 if ((spa->spa_to_condense.ds != NULL) && 2811 (spa->spa_to_condense.syncing == B_FALSE) && 2812 (spa->spa_to_condense.cancelled == B_FALSE)) { 2813 return (B_TRUE); 2814 } 2815 return (B_FALSE); 2816 } 2817 2818 static void 2819 spa_start_livelist_condensing_thread(spa_t *spa) 2820 { 2821 spa->spa_to_condense.ds = NULL; 2822 spa->spa_to_condense.first = NULL; 2823 spa->spa_to_condense.next = NULL; 2824 spa->spa_to_condense.syncing = B_FALSE; 2825 spa->spa_to_condense.cancelled = B_FALSE; 2826 2827 ASSERT3P(spa->spa_livelist_condense_zthr, ==, NULL); 2828 spa->spa_livelist_condense_zthr = 2829 zthr_create("z_livelist_condense", 2830 spa_livelist_condense_cb_check, 2831 spa_livelist_condense_cb, spa, minclsyspri); 2832 } 2833 2834 static void 2835 spa_spawn_aux_threads(spa_t *spa) 2836 { 2837 ASSERT(spa_writeable(spa)); 2838 2839 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 2840 2841 spa_start_indirect_condensing_thread(spa); 2842 spa_start_livelist_destroy_thread(spa); 2843 spa_start_livelist_condensing_thread(spa); 2844 2845 ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL); 2846 spa->spa_checkpoint_discard_zthr = 2847 zthr_create("z_checkpoint_discard", 2848 spa_checkpoint_discard_thread_check, 2849 spa_checkpoint_discard_thread, spa, minclsyspri); 2850 } 2851 2852 /* 2853 * Fix up config after a partly-completed split. This is done with the 2854 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 2855 * pool have that entry in their config, but only the splitting one contains 2856 * a list of all the guids of the vdevs that are being split off. 2857 * 2858 * This function determines what to do with that list: either rejoin 2859 * all the disks to the pool, or complete the splitting process. To attempt 2860 * the rejoin, each disk that is offlined is marked online again, and 2861 * we do a reopen() call. If the vdev label for every disk that was 2862 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 2863 * then we call vdev_split() on each disk, and complete the split. 2864 * 2865 * Otherwise we leave the config alone, with all the vdevs in place in 2866 * the original pool. 2867 */ 2868 static void 2869 spa_try_repair(spa_t *spa, nvlist_t *config) 2870 { 2871 uint_t extracted; 2872 uint64_t *glist; 2873 uint_t i, gcount; 2874 nvlist_t *nvl; 2875 vdev_t **vd; 2876 boolean_t attempt_reopen; 2877 2878 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 2879 return; 2880 2881 /* check that the config is complete */ 2882 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 2883 &glist, &gcount) != 0) 2884 return; 2885 2886 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 2887 2888 /* attempt to online all the vdevs & validate */ 2889 attempt_reopen = B_TRUE; 2890 for (i = 0; i < gcount; i++) { 2891 if (glist[i] == 0) /* vdev is hole */ 2892 continue; 2893 2894 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 2895 if (vd[i] == NULL) { 2896 /* 2897 * Don't bother attempting to reopen the disks; 2898 * just do the split. 
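 * (A guid from the split list that is no longer in our vdev tree means a
 * rejoin is not possible, so we fall through to completing the split.)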
2899 */ 2900 attempt_reopen = B_FALSE; 2901 } else { 2902 /* attempt to re-online it */ 2903 vd[i]->vdev_offline = B_FALSE; 2904 } 2905 } 2906 2907 if (attempt_reopen) { 2908 vdev_reopen(spa->spa_root_vdev); 2909 2910 /* check each device to see what state it's in */ 2911 for (extracted = 0, i = 0; i < gcount; i++) { 2912 if (vd[i] != NULL && 2913 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 2914 break; 2915 ++extracted; 2916 } 2917 } 2918 2919 /* 2920 * If every disk has been moved to the new pool, or if we never 2921 * even attempted to look at them, then we split them off for 2922 * good. 2923 */ 2924 if (!attempt_reopen || gcount == extracted) { 2925 for (i = 0; i < gcount; i++) 2926 if (vd[i] != NULL) 2927 vdev_split(vd[i]); 2928 vdev_reopen(spa->spa_root_vdev); 2929 } 2930 2931 kmem_free(vd, gcount * sizeof (vdev_t *)); 2932 } 2933 2934 static int 2935 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) 2936 { 2937 char *ereport = FM_EREPORT_ZFS_POOL; 2938 int error; 2939 2940 spa->spa_load_state = state; 2941 (void) spa_import_progress_set_state(spa_guid(spa), 2942 spa_load_state(spa)); 2943 2944 gethrestime(&spa->spa_loaded_ts); 2945 error = spa_load_impl(spa, type, &ereport); 2946 2947 /* 2948 * Don't count references from objsets that are already closed 2949 * and are making their way through the eviction process. 2950 */ 2951 spa_evicting_os_wait(spa); 2952 spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); 2953 if (error) { 2954 if (error != EEXIST) { 2955 spa->spa_loaded_ts.tv_sec = 0; 2956 spa->spa_loaded_ts.tv_nsec = 0; 2957 } 2958 if (error != EBADF) { 2959 (void) zfs_ereport_post(ereport, spa, 2960 NULL, NULL, NULL, 0); 2961 } 2962 } 2963 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 2964 spa->spa_ena = 0; 2965 2966 (void) spa_import_progress_set_state(spa_guid(spa), 2967 spa_load_state(spa)); 2968 2969 return (error); 2970 } 2971 2972 #ifdef ZFS_DEBUG 2973 /* 2974 * Count the number of per-vdev ZAPs associated with all of the vdevs in the 2975 * vdev tree rooted in the given vd, and ensure that each ZAP is present in the 2976 * spa's per-vdev ZAP list. 2977 */ 2978 static uint64_t 2979 vdev_count_verify_zaps(vdev_t *vd) 2980 { 2981 spa_t *spa = vd->vdev_spa; 2982 uint64_t total = 0; 2983 2984 if (vd->vdev_top_zap != 0) { 2985 total++; 2986 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 2987 spa->spa_all_vdev_zaps, vd->vdev_top_zap)); 2988 } 2989 if (vd->vdev_leaf_zap != 0) { 2990 total++; 2991 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 2992 spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); 2993 } 2994 2995 for (uint64_t i = 0; i < vd->vdev_children; i++) { 2996 total += vdev_count_verify_zaps(vd->vdev_child[i]); 2997 } 2998 2999 return (total); 3000 } 3001 #else 3002 #define vdev_count_verify_zaps(vd) ((void) sizeof (vd), 0) 3003 #endif 3004 3005 /* 3006 * Determine whether the activity check is required. 
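 * B_FALSE is returned (and the check skipped) for zdb-style imports
 * (ZFS_IMPORT_SKIP_MMP), when multihost is not in use, when the earlier
 * tryimport results still match the uberblock found here, when the pool was
 * last imported by this host, or when it was cleanly exported.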
3007 */ 3008 static boolean_t 3009 spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label, 3010 nvlist_t *config) 3011 { 3012 uint64_t state = 0; 3013 uint64_t hostid = 0; 3014 uint64_t tryconfig_txg = 0; 3015 uint64_t tryconfig_timestamp = 0; 3016 uint16_t tryconfig_mmp_seq = 0; 3017 nvlist_t *nvinfo; 3018 3019 if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { 3020 nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); 3021 (void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG, 3022 &tryconfig_txg); 3023 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 3024 &tryconfig_timestamp); 3025 (void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ, 3026 &tryconfig_mmp_seq); 3027 } 3028 3029 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state); 3030 3031 /* 3032 * Disable the MMP activity check - This is used by zdb which 3033 * is intended to be used on potentially active pools. 3034 */ 3035 if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) 3036 return (B_FALSE); 3037 3038 /* 3039 * Skip the activity check when the MMP feature is disabled. 3040 */ 3041 if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0) 3042 return (B_FALSE); 3043 3044 /* 3045 * If the tryconfig_ values are nonzero, they are the results of an 3046 * earlier tryimport. If they all match the uberblock we just found, 3047 * then the pool has not changed and we return false so we do not test 3048 * a second time. 3049 */ 3050 if (tryconfig_txg && tryconfig_txg == ub->ub_txg && 3051 tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp && 3052 tryconfig_mmp_seq && tryconfig_mmp_seq == 3053 (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) 3054 return (B_FALSE); 3055 3056 /* 3057 * Allow the activity check to be skipped when importing the pool 3058 * on the same host which last imported it. Since the hostid from 3059 * configuration may be stale use the one read from the label. 3060 */ 3061 if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID)) 3062 hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID); 3063 3064 if (hostid == spa_get_hostid(spa)) 3065 return (B_FALSE); 3066 3067 /* 3068 * Skip the activity test when the pool was cleanly exported. 3069 */ 3070 if (state != POOL_STATE_ACTIVE) 3071 return (B_FALSE); 3072 3073 return (B_TRUE); 3074 } 3075 3076 /* 3077 * Nanoseconds the activity check must watch for changes on-disk. 3078 */ 3079 static uint64_t 3080 spa_activity_check_duration(spa_t *spa, uberblock_t *ub) 3081 { 3082 uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1); 3083 uint64_t multihost_interval = MSEC2NSEC( 3084 MMP_INTERVAL_OK(zfs_multihost_interval)); 3085 uint64_t import_delay = MAX(NANOSEC, import_intervals * 3086 multihost_interval); 3087 3088 /* 3089 * Local tunables determine a minimum duration except for the case 3090 * where we know when the remote host will suspend the pool if MMP 3091 * writes do not land. 3092 * 3093 * See Big Theory comment at the top of mmp.c for the reasoning behind 3094 * these cases and times. 
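 * For example, with zfs_multihost_interval = 1000 ms and
 * zfs_multihost_import_intervals = 20 (their usual defaults), the baseline
 * import_delay computed above is 20 seconds.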
3095 */ 3096 3097 ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100); 3098 3099 if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && 3100 MMP_FAIL_INT(ub) > 0) { 3101 3102 /* MMP on remote host will suspend pool after failed writes */ 3103 import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) * 3104 MMP_IMPORT_SAFETY_FACTOR / 100; 3105 3106 zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp " 3107 "mmp_fails=%llu ub_mmp mmp_interval=%llu " 3108 "import_intervals=%llu", (u_longlong_t)import_delay, 3109 (u_longlong_t)MMP_FAIL_INT(ub), 3110 (u_longlong_t)MMP_INTERVAL(ub), 3111 (u_longlong_t)import_intervals); 3112 3113 } else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && 3114 MMP_FAIL_INT(ub) == 0) { 3115 3116 /* MMP on remote host will never suspend pool */ 3117 import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) + 3118 ub->ub_mmp_delay) * import_intervals); 3119 3120 zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp " 3121 "mmp_interval=%llu ub_mmp_delay=%llu " 3122 "import_intervals=%llu", (u_longlong_t)import_delay, 3123 (u_longlong_t)MMP_INTERVAL(ub), 3124 (u_longlong_t)ub->ub_mmp_delay, 3125 (u_longlong_t)import_intervals); 3126 3127 } else if (MMP_VALID(ub)) { 3128 /* 3129 * zfs-0.7 compatibility case 3130 */ 3131 3132 import_delay = MAX(import_delay, (multihost_interval + 3133 ub->ub_mmp_delay) * import_intervals); 3134 3135 zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu " 3136 "import_intervals=%llu leaves=%u", 3137 (u_longlong_t)import_delay, 3138 (u_longlong_t)ub->ub_mmp_delay, 3139 (u_longlong_t)import_intervals, 3140 vdev_count_leaves(spa)); 3141 } else { 3142 /* Using local tunings is the only reasonable option */ 3143 zfs_dbgmsg("pool last imported on non-MMP aware " 3144 "host using import_delay=%llu multihost_interval=%llu " 3145 "import_intervals=%llu", (u_longlong_t)import_delay, 3146 (u_longlong_t)multihost_interval, 3147 (u_longlong_t)import_intervals); 3148 } 3149 3150 return (import_delay); 3151 } 3152 3153 /* 3154 * Perform the import activity check. If the user canceled the import or 3155 * we detected activity then fail. 3156 */ 3157 static int 3158 spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) 3159 { 3160 uint64_t txg = ub->ub_txg; 3161 uint64_t timestamp = ub->ub_timestamp; 3162 uint64_t mmp_config = ub->ub_mmp_config; 3163 uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0; 3164 uint64_t import_delay; 3165 hrtime_t import_expire; 3166 nvlist_t *mmp_label = NULL; 3167 vdev_t *rvd = spa->spa_root_vdev; 3168 kcondvar_t cv; 3169 kmutex_t mtx; 3170 int error = 0; 3171 3172 cv_init(&cv, NULL, CV_DEFAULT, NULL); 3173 mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL); 3174 mutex_enter(&mtx); 3175 3176 /* 3177 * If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed 3178 * during the earlier tryimport. If the txg recorded there is 0 then 3179 * the pool is known to be active on another host. 3180 * 3181 * Otherwise, the pool might be in use on another host. Check for 3182 * changes in the uberblocks on disk if necessary. 
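 * The loop below re-reads the best uberblock roughly once per second until
 * import_expire; any change in txg, timestamp or mmp_seq during that window
 * is treated as activity from another host.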
3183 */ 3184 if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { 3185 nvlist_t *nvinfo = fnvlist_lookup_nvlist(config, 3186 ZPOOL_CONFIG_LOAD_INFO); 3187 3188 if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) && 3189 fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) { 3190 vdev_uberblock_load(rvd, ub, &mmp_label); 3191 error = SET_ERROR(EREMOTEIO); 3192 goto out; 3193 } 3194 } 3195 3196 import_delay = spa_activity_check_duration(spa, ub); 3197 3198 /* Add a small random factor in case of simultaneous imports (0-25%) */ 3199 import_delay += import_delay * random_in_range(250) / 1000; 3200 3201 import_expire = gethrtime() + import_delay; 3202 3203 while (gethrtime() < import_expire) { 3204 (void) spa_import_progress_set_mmp_check(spa_guid(spa), 3205 NSEC2SEC(import_expire - gethrtime())); 3206 3207 vdev_uberblock_load(rvd, ub, &mmp_label); 3208 3209 if (txg != ub->ub_txg || timestamp != ub->ub_timestamp || 3210 mmp_seq != (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) { 3211 zfs_dbgmsg("multihost activity detected " 3212 "txg %llu ub_txg %llu " 3213 "timestamp %llu ub_timestamp %llu " 3214 "mmp_config %#llx ub_mmp_config %#llx", 3215 (u_longlong_t)txg, (u_longlong_t)ub->ub_txg, 3216 (u_longlong_t)timestamp, 3217 (u_longlong_t)ub->ub_timestamp, 3218 (u_longlong_t)mmp_config, 3219 (u_longlong_t)ub->ub_mmp_config); 3220 3221 error = SET_ERROR(EREMOTEIO); 3222 break; 3223 } 3224 3225 if (mmp_label) { 3226 nvlist_free(mmp_label); 3227 mmp_label = NULL; 3228 } 3229 3230 error = cv_timedwait_sig(&cv, &mtx, ddi_get_lbolt() + hz); 3231 if (error != -1) { 3232 error = SET_ERROR(EINTR); 3233 break; 3234 } 3235 error = 0; 3236 } 3237 3238 out: 3239 mutex_exit(&mtx); 3240 mutex_destroy(&mtx); 3241 cv_destroy(&cv); 3242 3243 /* 3244 * If the pool is determined to be active store the status in the 3245 * spa->spa_load_info nvlist. If the remote hostname or hostid are 3246 * available from configuration read from disk store them as well. 3247 * This allows 'zpool import' to generate a more useful message. 
3248 * 3249 * ZPOOL_CONFIG_MMP_STATE - observed pool status (mandatory) 3250 * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool 3251 * ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool 3252 */ 3253 if (error == EREMOTEIO) { 3254 char *hostname = "<unknown>"; 3255 uint64_t hostid = 0; 3256 3257 if (mmp_label) { 3258 if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) { 3259 hostname = fnvlist_lookup_string(mmp_label, 3260 ZPOOL_CONFIG_HOSTNAME); 3261 fnvlist_add_string(spa->spa_load_info, 3262 ZPOOL_CONFIG_MMP_HOSTNAME, hostname); 3263 } 3264 3265 if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) { 3266 hostid = fnvlist_lookup_uint64(mmp_label, 3267 ZPOOL_CONFIG_HOSTID); 3268 fnvlist_add_uint64(spa->spa_load_info, 3269 ZPOOL_CONFIG_MMP_HOSTID, hostid); 3270 } 3271 } 3272 3273 fnvlist_add_uint64(spa->spa_load_info, 3274 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE); 3275 fnvlist_add_uint64(spa->spa_load_info, 3276 ZPOOL_CONFIG_MMP_TXG, 0); 3277 3278 error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO); 3279 } 3280 3281 if (mmp_label) 3282 nvlist_free(mmp_label); 3283 3284 return (error); 3285 } 3286 3287 static int 3288 spa_verify_host(spa_t *spa, nvlist_t *mos_config) 3289 { 3290 uint64_t hostid; 3291 char *hostname; 3292 uint64_t myhostid = 0; 3293 3294 if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config, 3295 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 3296 hostname = fnvlist_lookup_string(mos_config, 3297 ZPOOL_CONFIG_HOSTNAME); 3298 3299 myhostid = zone_get_hostid(NULL); 3300 3301 if (hostid != 0 && myhostid != 0 && hostid != myhostid) { 3302 cmn_err(CE_WARN, "pool '%s' could not be " 3303 "loaded as it was last accessed by " 3304 "another system (host: %s hostid: 0x%llx). " 3305 "See: https://openzfs.github.io/openzfs-docs/msg/" 3306 "ZFS-8000-EY", 3307 spa_name(spa), hostname, (u_longlong_t)hostid); 3308 spa_load_failed(spa, "hostid verification failed: pool " 3309 "last accessed by host: %s (hostid: 0x%llx)", 3310 hostname, (u_longlong_t)hostid); 3311 return (SET_ERROR(EBADF)); 3312 } 3313 } 3314 3315 return (0); 3316 } 3317 3318 static int 3319 spa_ld_parse_config(spa_t *spa, spa_import_type_t type) 3320 { 3321 int error = 0; 3322 nvlist_t *nvtree, *nvl, *config = spa->spa_config; 3323 int parse; 3324 vdev_t *rvd; 3325 uint64_t pool_guid; 3326 char *comment; 3327 char *compatibility; 3328 3329 /* 3330 * Versioning wasn't explicitly added to the label until later, so if 3331 * it's not present treat it as the initial version. 3332 */ 3333 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 3334 &spa->spa_ubsync.ub_version) != 0) 3335 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 3336 3337 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 3338 spa_load_failed(spa, "invalid config provided: '%s' missing", 3339 ZPOOL_CONFIG_POOL_GUID); 3340 return (SET_ERROR(EINVAL)); 3341 } 3342 3343 /* 3344 * If we are doing an import, ensure that the pool is not already 3345 * imported by checking if its pool guid already exists in the 3346 * spa namespace. 3347 * 3348 * The only case that we allow an already imported pool to be 3349 * imported again, is when the pool is checkpointed and we want to 3350 * look at its checkpointed state from userland tools like zdb. 
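 * That exception is only honored in userland builds: the non-_KERNEL branch
 * below additionally checks spa_importing_readonly_checkpoint() before
 * failing with EEXIST.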
3351 */ 3352 #ifdef _KERNEL 3353 if ((spa->spa_load_state == SPA_LOAD_IMPORT || 3354 spa->spa_load_state == SPA_LOAD_TRYIMPORT) && 3355 spa_guid_exists(pool_guid, 0)) { 3356 #else 3357 if ((spa->spa_load_state == SPA_LOAD_IMPORT || 3358 spa->spa_load_state == SPA_LOAD_TRYIMPORT) && 3359 spa_guid_exists(pool_guid, 0) && 3360 !spa_importing_readonly_checkpoint(spa)) { 3361 #endif 3362 spa_load_failed(spa, "a pool with guid %llu is already open", 3363 (u_longlong_t)pool_guid); 3364 return (SET_ERROR(EEXIST)); 3365 } 3366 3367 spa->spa_config_guid = pool_guid; 3368 3369 nvlist_free(spa->spa_load_info); 3370 spa->spa_load_info = fnvlist_alloc(); 3371 3372 ASSERT(spa->spa_comment == NULL); 3373 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 3374 spa->spa_comment = spa_strdup(comment); 3375 3376 ASSERT(spa->spa_compatibility == NULL); 3377 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMPATIBILITY, 3378 &compatibility) == 0) 3379 spa->spa_compatibility = spa_strdup(compatibility); 3380 3381 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 3382 &spa->spa_config_txg); 3383 3384 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0) 3385 spa->spa_config_splitting = fnvlist_dup(nvl); 3386 3387 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) { 3388 spa_load_failed(spa, "invalid config provided: '%s' missing", 3389 ZPOOL_CONFIG_VDEV_TREE); 3390 return (SET_ERROR(EINVAL)); 3391 } 3392 3393 /* 3394 * Create "The Godfather" zio to hold all async IOs 3395 */ 3396 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 3397 KM_SLEEP); 3398 for (int i = 0; i < max_ncpus; i++) { 3399 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 3400 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 3401 ZIO_FLAG_GODFATHER); 3402 } 3403 3404 /* 3405 * Parse the configuration into a vdev tree. We explicitly set the 3406 * value that will be returned by spa_version() since parsing the 3407 * configuration requires knowing the version number. 3408 */ 3409 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3410 parse = (type == SPA_IMPORT_EXISTING ? 3411 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 3412 error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse); 3413 spa_config_exit(spa, SCL_ALL, FTAG); 3414 3415 if (error != 0) { 3416 spa_load_failed(spa, "unable to parse config [error=%d]", 3417 error); 3418 return (error); 3419 } 3420 3421 ASSERT(spa->spa_root_vdev == rvd); 3422 ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); 3423 ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); 3424 3425 if (type != SPA_IMPORT_ASSEMBLE) { 3426 ASSERT(spa_guid(spa) == pool_guid); 3427 } 3428 3429 return (0); 3430 } 3431 3432 /* 3433 * Recursively open all vdevs in the vdev tree. This function is called twice: 3434 * first with the untrusted config, then with the trusted config. 3435 */ 3436 static int 3437 spa_ld_open_vdevs(spa_t *spa) 3438 { 3439 int error = 0; 3440 3441 /* 3442 * spa_missing_tvds_allowed defines how many top-level vdevs can be 3443 * missing/unopenable for the root vdev to be still considered openable. 
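 * Which limit applies depends on where the config came from: a trusted
 * (MOS-derived) config uses zfs_max_missing_tvds, configs read from the
 * cachefile or assembled by a device scan use zfs_max_missing_tvds_cachefile
 * and zfs_max_missing_tvds_scan respectively, and any other source allows
 * none.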
3444 */ 3445 if (spa->spa_trust_config) { 3446 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds; 3447 } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) { 3448 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile; 3449 } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) { 3450 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan; 3451 } else { 3452 spa->spa_missing_tvds_allowed = 0; 3453 } 3454 3455 spa->spa_missing_tvds_allowed = 3456 MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed); 3457 3458 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3459 error = vdev_open(spa->spa_root_vdev); 3460 spa_config_exit(spa, SCL_ALL, FTAG); 3461 3462 if (spa->spa_missing_tvds != 0) { 3463 spa_load_note(spa, "vdev tree has %lld missing top-level " 3464 "vdevs.", (u_longlong_t)spa->spa_missing_tvds); 3465 if (spa->spa_trust_config && (spa->spa_mode & SPA_MODE_WRITE)) { 3466 /* 3467 * Although theoretically we could allow users to open 3468 * incomplete pools in RW mode, we'd need to add a lot 3469 * of extra logic (e.g. adjust pool space to account 3470 * for missing vdevs). 3471 * This limitation also prevents users from accidentally 3472 * opening the pool in RW mode during data recovery and 3473 * damaging it further. 3474 */ 3475 spa_load_note(spa, "pools with missing top-level " 3476 "vdevs can only be opened in read-only mode."); 3477 error = SET_ERROR(ENXIO); 3478 } else { 3479 spa_load_note(spa, "current settings allow for maximum " 3480 "%lld missing top-level vdevs at this stage.", 3481 (u_longlong_t)spa->spa_missing_tvds_allowed); 3482 } 3483 } 3484 if (error != 0) { 3485 spa_load_failed(spa, "unable to open vdev tree [error=%d]", 3486 error); 3487 } 3488 if (spa->spa_missing_tvds != 0 || error != 0) 3489 vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2); 3490 3491 return (error); 3492 } 3493 3494 /* 3495 * We need to validate the vdev labels against the configuration that 3496 * we have in hand. This function is called twice: first with an untrusted 3497 * config, then with a trusted config. The validation is more strict when the 3498 * config is trusted. 3499 */ 3500 static int 3501 spa_ld_validate_vdevs(spa_t *spa) 3502 { 3503 int error = 0; 3504 vdev_t *rvd = spa->spa_root_vdev; 3505 3506 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3507 error = vdev_validate(rvd); 3508 spa_config_exit(spa, SCL_ALL, FTAG); 3509 3510 if (error != 0) { 3511 spa_load_failed(spa, "vdev_validate failed [error=%d]", error); 3512 return (error); 3513 } 3514 3515 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 3516 spa_load_failed(spa, "cannot open vdev tree after invalidating " 3517 "some vdevs"); 3518 vdev_dbgmsg_print_tree(rvd, 2); 3519 return (SET_ERROR(ENXIO)); 3520 } 3521 3522 return (0); 3523 } 3524 3525 static void 3526 spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub) 3527 { 3528 spa->spa_state = POOL_STATE_ACTIVE; 3529 spa->spa_ubsync = spa->spa_uberblock; 3530 spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 3531 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 3532 spa->spa_first_txg = spa->spa_last_ubsync_txg ? 
3533 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 3534 spa->spa_claim_max_txg = spa->spa_first_txg; 3535 spa->spa_prev_software_version = ub->ub_software_version; 3536 } 3537 3538 static int 3539 spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) 3540 { 3541 vdev_t *rvd = spa->spa_root_vdev; 3542 nvlist_t *label; 3543 uberblock_t *ub = &spa->spa_uberblock; 3544 boolean_t activity_check = B_FALSE; 3545 3546 /* 3547 * If we are opening the checkpointed state of the pool by 3548 * rewinding to it, at this point we will have written the 3549 * checkpointed uberblock to the vdev labels, so searching 3550 * the labels will find the right uberblock. However, if 3551 * we are opening the checkpointed state read-only, we have 3552 * not modified the labels. Therefore, we must ignore the 3553 * labels and continue using the spa_uberblock that was set 3554 * by spa_ld_checkpoint_rewind. 3555 * 3556 * Note that it would be fine to ignore the labels when 3557 * rewinding (opening writeable) as well. However, if we 3558 * crash just after writing the labels, we will end up 3559 * searching the labels. Doing so in the common case means 3560 * that this code path gets exercised normally, rather than 3561 * just in the edge case. 3562 */ 3563 if (ub->ub_checkpoint_txg != 0 && 3564 spa_importing_readonly_checkpoint(spa)) { 3565 spa_ld_select_uberblock_done(spa, ub); 3566 return (0); 3567 } 3568 3569 /* 3570 * Find the best uberblock. 3571 */ 3572 vdev_uberblock_load(rvd, ub, &label); 3573 3574 /* 3575 * If we weren't able to find a single valid uberblock, return failure. 3576 */ 3577 if (ub->ub_txg == 0) { 3578 nvlist_free(label); 3579 spa_load_failed(spa, "no valid uberblock found"); 3580 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 3581 } 3582 3583 if (spa->spa_load_max_txg != UINT64_MAX) { 3584 (void) spa_import_progress_set_max_txg(spa_guid(spa), 3585 (u_longlong_t)spa->spa_load_max_txg); 3586 } 3587 spa_load_note(spa, "using uberblock with txg=%llu", 3588 (u_longlong_t)ub->ub_txg); 3589 3590 3591 /* 3592 * For pools which have the multihost property on determine if the 3593 * pool is truly inactive and can be safely imported. Prevent 3594 * hosts which don't have a hostid set from importing the pool. 3595 */ 3596 activity_check = spa_activity_check_required(spa, ub, label, 3597 spa->spa_config); 3598 if (activity_check) { 3599 if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay && 3600 spa_get_hostid(spa) == 0) { 3601 nvlist_free(label); 3602 fnvlist_add_uint64(spa->spa_load_info, 3603 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); 3604 return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); 3605 } 3606 3607 int error = spa_activity_check(spa, ub, spa->spa_config); 3608 if (error) { 3609 nvlist_free(label); 3610 return (error); 3611 } 3612 3613 fnvlist_add_uint64(spa->spa_load_info, 3614 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE); 3615 fnvlist_add_uint64(spa->spa_load_info, 3616 ZPOOL_CONFIG_MMP_TXG, ub->ub_txg); 3617 fnvlist_add_uint16(spa->spa_load_info, 3618 ZPOOL_CONFIG_MMP_SEQ, 3619 (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)); 3620 } 3621 3622 /* 3623 * If the pool has an unsupported version we can't open it. 
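 * The failure is reported as VDEV_AUX_VERSION_NEWER so it can be
 * distinguished from ordinary corruption.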
3624 */ 3625 if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 3626 nvlist_free(label); 3627 spa_load_failed(spa, "version %llu is not supported", 3628 (u_longlong_t)ub->ub_version); 3629 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 3630 } 3631 3632 if (ub->ub_version >= SPA_VERSION_FEATURES) { 3633 nvlist_t *features; 3634 3635 /* 3636 * If we weren't able to find what's necessary for reading the 3637 * MOS in the label, return failure. 3638 */ 3639 if (label == NULL) { 3640 spa_load_failed(spa, "label config unavailable"); 3641 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 3642 ENXIO)); 3643 } 3644 3645 if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ, 3646 &features) != 0) { 3647 nvlist_free(label); 3648 spa_load_failed(spa, "invalid label: '%s' missing", 3649 ZPOOL_CONFIG_FEATURES_FOR_READ); 3650 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 3651 ENXIO)); 3652 } 3653 3654 /* 3655 * Update our in-core representation with the definitive values 3656 * from the label. 3657 */ 3658 nvlist_free(spa->spa_label_features); 3659 spa->spa_label_features = fnvlist_dup(features); 3660 } 3661 3662 nvlist_free(label); 3663 3664 /* 3665 * Look through entries in the label nvlist's features_for_read. If 3666 * there is a feature listed there which we don't understand then we 3667 * cannot open a pool. 3668 */ 3669 if (ub->ub_version >= SPA_VERSION_FEATURES) { 3670 nvlist_t *unsup_feat; 3671 3672 unsup_feat = fnvlist_alloc(); 3673 3674 for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, 3675 NULL); nvp != NULL; 3676 nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { 3677 if (!zfeature_is_supported(nvpair_name(nvp))) { 3678 fnvlist_add_string(unsup_feat, 3679 nvpair_name(nvp), ""); 3680 } 3681 } 3682 3683 if (!nvlist_empty(unsup_feat)) { 3684 fnvlist_add_nvlist(spa->spa_load_info, 3685 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 3686 nvlist_free(unsup_feat); 3687 spa_load_failed(spa, "some features are unsupported"); 3688 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 3689 ENOTSUP)); 3690 } 3691 3692 nvlist_free(unsup_feat); 3693 } 3694 3695 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 3696 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3697 spa_try_repair(spa, spa->spa_config); 3698 spa_config_exit(spa, SCL_ALL, FTAG); 3699 nvlist_free(spa->spa_config_splitting); 3700 spa->spa_config_splitting = NULL; 3701 } 3702 3703 /* 3704 * Initialize internal SPA structures. 
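 * spa_ld_select_uberblock_done() copies the chosen uberblock into
 * spa_ubsync, derives spa_first_txg and spa_claim_max_txg from it, and marks
 * the pool ACTIVE.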
3705 */ 3706 spa_ld_select_uberblock_done(spa, ub); 3707 3708 return (0); 3709 } 3710 3711 static int 3712 spa_ld_open_rootbp(spa_t *spa) 3713 { 3714 int error = 0; 3715 vdev_t *rvd = spa->spa_root_vdev; 3716 3717 error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 3718 if (error != 0) { 3719 spa_load_failed(spa, "unable to open rootbp in dsl_pool_init " 3720 "[error=%d]", error); 3721 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3722 } 3723 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 3724 3725 return (0); 3726 } 3727 3728 static int 3729 spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, 3730 boolean_t reloading) 3731 { 3732 vdev_t *mrvd, *rvd = spa->spa_root_vdev; 3733 nvlist_t *nv, *mos_config, *policy; 3734 int error = 0, copy_error; 3735 uint64_t healthy_tvds, healthy_tvds_mos; 3736 uint64_t mos_config_txg; 3737 3738 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE) 3739 != 0) 3740 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3741 3742 /* 3743 * If we're assembling a pool from a split, the config provided is 3744 * already trusted so there is nothing to do. 3745 */ 3746 if (type == SPA_IMPORT_ASSEMBLE) 3747 return (0); 3748 3749 healthy_tvds = spa_healthy_core_tvds(spa); 3750 3751 if (load_nvlist(spa, spa->spa_config_object, &mos_config) 3752 != 0) { 3753 spa_load_failed(spa, "unable to retrieve MOS config"); 3754 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3755 } 3756 3757 /* 3758 * If we are doing an open, pool owner wasn't verified yet, thus do 3759 * the verification here. 3760 */ 3761 if (spa->spa_load_state == SPA_LOAD_OPEN) { 3762 error = spa_verify_host(spa, mos_config); 3763 if (error != 0) { 3764 nvlist_free(mos_config); 3765 return (error); 3766 } 3767 } 3768 3769 nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE); 3770 3771 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3772 3773 /* 3774 * Build a new vdev tree from the trusted config 3775 */ 3776 error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD); 3777 if (error != 0) { 3778 nvlist_free(mos_config); 3779 spa_config_exit(spa, SCL_ALL, FTAG); 3780 spa_load_failed(spa, "spa_config_parse failed [error=%d]", 3781 error); 3782 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 3783 } 3784 3785 /* 3786 * Vdev paths in the MOS may be obsolete. If the untrusted config was 3787 * obtained by scanning /dev/dsk, then it will have the right vdev 3788 * paths. We update the trusted MOS config with this information. 3789 * We first try to copy the paths with vdev_copy_path_strict, which 3790 * succeeds only when both configs have exactly the same vdev tree. 3791 * If that fails, we fall back to a more flexible method that has a 3792 * best effort policy. 3793 */ 3794 copy_error = vdev_copy_path_strict(rvd, mrvd); 3795 if (copy_error != 0 || spa_load_print_vdev_tree) { 3796 spa_load_note(spa, "provided vdev tree:"); 3797 vdev_dbgmsg_print_tree(rvd, 2); 3798 spa_load_note(spa, "MOS vdev tree:"); 3799 vdev_dbgmsg_print_tree(mrvd, 2); 3800 } 3801 if (copy_error != 0) { 3802 spa_load_note(spa, "vdev_copy_path_strict failed, falling " 3803 "back to vdev_copy_path_relaxed"); 3804 vdev_copy_path_relaxed(rvd, mrvd); 3805 } 3806 3807 vdev_close(rvd); 3808 vdev_free(rvd); 3809 spa->spa_root_vdev = mrvd; 3810 rvd = mrvd; 3811 spa_config_exit(spa, SCL_ALL, FTAG); 3812 3813 /* 3814 * We will use spa_config if we decide to reload the spa or if spa_load 3815 * fails and we rewind. 
We must thus regenerate the config using the 3816 * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to 3817 * pass settings on how to load the pool and is not stored in the MOS. 3818 * We copy it over to our new, trusted config. 3819 */ 3820 mos_config_txg = fnvlist_lookup_uint64(mos_config, 3821 ZPOOL_CONFIG_POOL_TXG); 3822 nvlist_free(mos_config); 3823 mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE); 3824 if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY, 3825 &policy) == 0) 3826 fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy); 3827 spa_config_set(spa, mos_config); 3828 spa->spa_config_source = SPA_CONFIG_SRC_MOS; 3829 3830 /* 3831 * Now that we got the config from the MOS, we should be more strict 3832 * in checking blkptrs and can make assumptions about the consistency 3833 * of the vdev tree. spa_trust_config must be set to true before opening 3834 * vdevs in order for them to be writeable. 3835 */ 3836 spa->spa_trust_config = B_TRUE; 3837 3838 /* 3839 * Open and validate the new vdev tree 3840 */ 3841 error = spa_ld_open_vdevs(spa); 3842 if (error != 0) 3843 return (error); 3844 3845 error = spa_ld_validate_vdevs(spa); 3846 if (error != 0) 3847 return (error); 3848 3849 if (copy_error != 0 || spa_load_print_vdev_tree) { 3850 spa_load_note(spa, "final vdev tree:"); 3851 vdev_dbgmsg_print_tree(rvd, 2); 3852 } 3853 3854 if (spa->spa_load_state != SPA_LOAD_TRYIMPORT && 3855 !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) { 3856 /* 3857 * Sanity check to make sure that we are indeed loading the 3858 * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds 3859 * in the config provided and they happened to be the only ones 3860 * to have the latest uberblock, we could involuntarily perform 3861 * an extreme rewind. 3862 */ 3863 healthy_tvds_mos = spa_healthy_core_tvds(spa); 3864 if (healthy_tvds_mos - healthy_tvds >= 3865 SPA_SYNC_MIN_VDEVS) { 3866 spa_load_note(spa, "config provided misses too many " 3867 "top-level vdevs compared to MOS (%lld vs %lld). ", 3868 (u_longlong_t)healthy_tvds, 3869 (u_longlong_t)healthy_tvds_mos); 3870 spa_load_note(spa, "vdev tree:"); 3871 vdev_dbgmsg_print_tree(rvd, 2); 3872 if (reloading) { 3873 spa_load_failed(spa, "config was already " 3874 "provided from MOS. Aborting."); 3875 return (spa_vdev_err(rvd, 3876 VDEV_AUX_CORRUPT_DATA, EIO)); 3877 } 3878 spa_load_note(spa, "spa must be reloaded using MOS " 3879 "config"); 3880 return (SET_ERROR(EAGAIN)); 3881 } 3882 } 3883 3884 error = spa_check_for_missing_logs(spa); 3885 if (error != 0) 3886 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 3887 3888 if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) { 3889 spa_load_failed(spa, "uberblock guid sum doesn't match MOS " 3890 "guid sum (%llu != %llu)", 3891 (u_longlong_t)spa->spa_uberblock.ub_guid_sum, 3892 (u_longlong_t)rvd->vdev_guid_sum); 3893 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 3894 ENXIO)); 3895 } 3896 3897 return (0); 3898 } 3899 3900 static int 3901 spa_ld_open_indirect_vdev_metadata(spa_t *spa) 3902 { 3903 int error = 0; 3904 vdev_t *rvd = spa->spa_root_vdev; 3905 3906 /* 3907 * Everything that we read before spa_remove_init() must be stored 3908 * on concreted vdevs. Therefore we do this as early as possible. 
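	 * (Until those mappings are loaded, a block that was relocated by
	 * device removal cannot be remapped from its old, indirect vdev to
	 * its new location, so reads of such blocks would fail.)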
3909 */ 3910 error = spa_remove_init(spa); 3911 if (error != 0) { 3912 spa_load_failed(spa, "spa_remove_init failed [error=%d]", 3913 error); 3914 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3915 } 3916 3917 /* 3918 * Retrieve information needed to condense indirect vdev mappings. 3919 */ 3920 error = spa_condense_init(spa); 3921 if (error != 0) { 3922 spa_load_failed(spa, "spa_condense_init failed [error=%d]", 3923 error); 3924 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 3925 } 3926 3927 return (0); 3928 } 3929 3930 static int 3931 spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep) 3932 { 3933 int error = 0; 3934 vdev_t *rvd = spa->spa_root_vdev; 3935 3936 if (spa_version(spa) >= SPA_VERSION_FEATURES) { 3937 boolean_t missing_feat_read = B_FALSE; 3938 nvlist_t *unsup_feat, *enabled_feat; 3939 3940 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 3941 &spa->spa_feat_for_read_obj, B_TRUE) != 0) { 3942 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3943 } 3944 3945 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 3946 &spa->spa_feat_for_write_obj, B_TRUE) != 0) { 3947 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3948 } 3949 3950 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 3951 &spa->spa_feat_desc_obj, B_TRUE) != 0) { 3952 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3953 } 3954 3955 enabled_feat = fnvlist_alloc(); 3956 unsup_feat = fnvlist_alloc(); 3957 3958 if (!spa_features_check(spa, B_FALSE, 3959 unsup_feat, enabled_feat)) 3960 missing_feat_read = B_TRUE; 3961 3962 if (spa_writeable(spa) || 3963 spa->spa_load_state == SPA_LOAD_TRYIMPORT) { 3964 if (!spa_features_check(spa, B_TRUE, 3965 unsup_feat, enabled_feat)) { 3966 *missing_feat_writep = B_TRUE; 3967 } 3968 } 3969 3970 fnvlist_add_nvlist(spa->spa_load_info, 3971 ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 3972 3973 if (!nvlist_empty(unsup_feat)) { 3974 fnvlist_add_nvlist(spa->spa_load_info, 3975 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 3976 } 3977 3978 fnvlist_free(enabled_feat); 3979 fnvlist_free(unsup_feat); 3980 3981 if (!missing_feat_read) { 3982 fnvlist_add_boolean(spa->spa_load_info, 3983 ZPOOL_CONFIG_CAN_RDONLY); 3984 } 3985 3986 /* 3987 * If the state is SPA_LOAD_TRYIMPORT, our objective is 3988 * twofold: to determine whether the pool is available for 3989 * import in read-write mode and (if it is not) whether the 3990 * pool is available for import in read-only mode. If the pool 3991 * is available for import in read-write mode, it is displayed 3992 * as available in userland; if it is not available for import 3993 * in read-only mode, it is displayed as unavailable in 3994 * userland. If the pool is available for import in read-only 3995 * mode but not read-write mode, it is displayed as unavailable 3996 * in userland with a special note that the pool is actually 3997 * available for open in read-only mode. 3998 * 3999 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 4000 * missing a feature for write, we must first determine whether 4001 * the pool can be opened read-only before returning to 4002 * userland in order to know whether to display the 4003 * abovementioned note. 4004 */ 4005 if (missing_feat_read || (*missing_feat_writep && 4006 spa_writeable(spa))) { 4007 spa_load_failed(spa, "pool uses unsupported features"); 4008 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 4009 ENOTSUP)); 4010 } 4011 4012 /* 4013 * Load refcounts for ZFS features from disk into an in-memory 4014 * cache during SPA initialization. 
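	 * Later callers such as spa_feature_is_active() consult this cache
	 * through feature_get_refcount(), avoiding a ZAP lookup on every
	 * feature check.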
4015 */ 4016 for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { 4017 uint64_t refcount; 4018 4019 error = feature_get_refcount_from_disk(spa, 4020 &spa_feature_table[i], &refcount); 4021 if (error == 0) { 4022 spa->spa_feat_refcount_cache[i] = refcount; 4023 } else if (error == ENOTSUP) { 4024 spa->spa_feat_refcount_cache[i] = 4025 SPA_FEATURE_DISABLED; 4026 } else { 4027 spa_load_failed(spa, "error getting refcount " 4028 "for feature %s [error=%d]", 4029 spa_feature_table[i].fi_guid, error); 4030 return (spa_vdev_err(rvd, 4031 VDEV_AUX_CORRUPT_DATA, EIO)); 4032 } 4033 } 4034 } 4035 4036 if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { 4037 if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, 4038 &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0) 4039 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4040 } 4041 4042 /* 4043 * Encryption was added before bookmark_v2, even though bookmark_v2 4044 * is now a dependency. If this pool has encryption enabled without 4045 * bookmark_v2, trigger an errata message. 4046 */ 4047 if (spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) && 4048 !spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) { 4049 spa->spa_errata = ZPOOL_ERRATA_ZOL_8308_ENCRYPTION; 4050 } 4051 4052 return (0); 4053 } 4054 4055 static int 4056 spa_ld_load_special_directories(spa_t *spa) 4057 { 4058 int error = 0; 4059 vdev_t *rvd = spa->spa_root_vdev; 4060 4061 spa->spa_is_initializing = B_TRUE; 4062 error = dsl_pool_open(spa->spa_dsl_pool); 4063 spa->spa_is_initializing = B_FALSE; 4064 if (error != 0) { 4065 spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error); 4066 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4067 } 4068 4069 return (0); 4070 } 4071 4072 static int 4073 spa_ld_get_props(spa_t *spa) 4074 { 4075 int error = 0; 4076 uint64_t obj; 4077 vdev_t *rvd = spa->spa_root_vdev; 4078 4079 /* Grab the checksum salt from the MOS. */ 4080 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 4081 DMU_POOL_CHECKSUM_SALT, 1, 4082 sizeof (spa->spa_cksum_salt.zcs_bytes), 4083 spa->spa_cksum_salt.zcs_bytes); 4084 if (error == ENOENT) { 4085 /* Generate a new salt for subsequent use */ 4086 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 4087 sizeof (spa->spa_cksum_salt.zcs_bytes)); 4088 } else if (error != 0) { 4089 spa_load_failed(spa, "unable to retrieve checksum salt from " 4090 "MOS [error=%d]", error); 4091 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4092 } 4093 4094 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0) 4095 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4096 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 4097 if (error != 0) { 4098 spa_load_failed(spa, "error opening deferred-frees bpobj " 4099 "[error=%d]", error); 4100 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4101 } 4102 4103 /* 4104 * Load the bit that tells us to use the new accounting function 4105 * (raid-z deflation). If we have an older pool, this will not 4106 * be present. 4107 */ 4108 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE); 4109 if (error != 0 && error != ENOENT) 4110 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4111 4112 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 4113 &spa->spa_creation_version, B_FALSE); 4114 if (error != 0 && error != ENOENT) 4115 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4116 4117 /* 4118 * Load the persistent error log. If we have an older pool, this will 4119 * not be present. 
4120 */ 4121 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last, 4122 B_FALSE); 4123 if (error != 0 && error != ENOENT) 4124 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4125 4126 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 4127 &spa->spa_errlog_scrub, B_FALSE); 4128 if (error != 0 && error != ENOENT) 4129 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4130 4131 /* 4132 * Load the livelist deletion field. If a livelist is queued for 4133 * deletion, indicate that in the spa 4134 */ 4135 error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES, 4136 &spa->spa_livelists_to_delete, B_FALSE); 4137 if (error != 0 && error != ENOENT) 4138 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4139 4140 /* 4141 * Load the history object. If we have an older pool, this 4142 * will not be present. 4143 */ 4144 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE); 4145 if (error != 0 && error != ENOENT) 4146 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4147 4148 /* 4149 * Load the per-vdev ZAP map. If we have an older pool, this will not 4150 * be present; in this case, defer its creation to a later time to 4151 * avoid dirtying the MOS this early / out of sync context. See 4152 * spa_sync_config_object. 4153 */ 4154 4155 /* The sentinel is only available in the MOS config. */ 4156 nvlist_t *mos_config; 4157 if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { 4158 spa_load_failed(spa, "unable to retrieve MOS config"); 4159 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4160 } 4161 4162 error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, 4163 &spa->spa_all_vdev_zaps, B_FALSE); 4164 4165 if (error == ENOENT) { 4166 VERIFY(!nvlist_exists(mos_config, 4167 ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 4168 spa->spa_avz_action = AVZ_ACTION_INITIALIZE; 4169 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 4170 } else if (error != 0) { 4171 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4172 } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { 4173 /* 4174 * An older version of ZFS overwrote the sentinel value, so 4175 * we have orphaned per-vdev ZAPs in the MOS. Defer their 4176 * destruction to later; see spa_sync_config_object. 4177 */ 4178 spa->spa_avz_action = AVZ_ACTION_DESTROY; 4179 /* 4180 * We're assuming that no vdevs have had their ZAPs created 4181 * before this. Better be sure of it. 
4182 */ 4183 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 4184 } 4185 nvlist_free(mos_config); 4186 4187 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 4188 4189 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object, 4190 B_FALSE); 4191 if (error && error != ENOENT) 4192 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4193 4194 if (error == 0) { 4195 uint64_t autoreplace = 0; 4196 4197 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 4198 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 4199 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 4200 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 4201 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 4202 spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost); 4203 spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim); 4204 spa->spa_autoreplace = (autoreplace != 0); 4205 } 4206 4207 /* 4208 * If we are importing a pool with missing top-level vdevs, 4209 * we enforce that the pool doesn't panic or get suspended on 4210 * error since the likelihood of missing data is extremely high. 4211 */ 4212 if (spa->spa_missing_tvds > 0 && 4213 spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE && 4214 spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 4215 spa_load_note(spa, "forcing failmode to 'continue' " 4216 "as some top level vdevs are missing"); 4217 spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE; 4218 } 4219 4220 return (0); 4221 } 4222 4223 static int 4224 spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type) 4225 { 4226 int error = 0; 4227 vdev_t *rvd = spa->spa_root_vdev; 4228 4229 /* 4230 * If we're assembling the pool from the split-off vdevs of 4231 * an existing pool, we don't want to attach the spares & cache 4232 * devices. 4233 */ 4234 4235 /* 4236 * Load any hot spares for this pool. 4237 */ 4238 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object, 4239 B_FALSE); 4240 if (error != 0 && error != ENOENT) 4241 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4242 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 4243 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 4244 if (load_nvlist(spa, spa->spa_spares.sav_object, 4245 &spa->spa_spares.sav_config) != 0) { 4246 spa_load_failed(spa, "error loading spares nvlist"); 4247 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4248 } 4249 4250 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4251 spa_load_spares(spa); 4252 spa_config_exit(spa, SCL_ALL, FTAG); 4253 } else if (error == 0) { 4254 spa->spa_spares.sav_sync = B_TRUE; 4255 } 4256 4257 /* 4258 * Load any level 2 ARC devices for this pool. 
4259 */ 4260 error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 4261 &spa->spa_l2cache.sav_object, B_FALSE); 4262 if (error != 0 && error != ENOENT) 4263 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4264 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 4265 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 4266 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 4267 &spa->spa_l2cache.sav_config) != 0) { 4268 spa_load_failed(spa, "error loading l2cache nvlist"); 4269 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4270 } 4271 4272 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4273 spa_load_l2cache(spa); 4274 spa_config_exit(spa, SCL_ALL, FTAG); 4275 } else if (error == 0) { 4276 spa->spa_l2cache.sav_sync = B_TRUE; 4277 } 4278 4279 return (0); 4280 } 4281 4282 static int 4283 spa_ld_load_vdev_metadata(spa_t *spa) 4284 { 4285 int error = 0; 4286 vdev_t *rvd = spa->spa_root_vdev; 4287 4288 /* 4289 * If the 'multihost' property is set, then never allow a pool to 4290 * be imported when the system hostid is zero. The exception to 4291 * this rule is zdb which is always allowed to access pools. 4292 */ 4293 if (spa_multihost(spa) && spa_get_hostid(spa) == 0 && 4294 (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) { 4295 fnvlist_add_uint64(spa->spa_load_info, 4296 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); 4297 return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); 4298 } 4299 4300 /* 4301 * If the 'autoreplace' property is set, then post a resource notifying 4302 * the ZFS DE that it should not issue any faults for unopenable 4303 * devices. We also iterate over the vdevs, and post a sysevent for any 4304 * unopenable vdevs so that the normal autoreplace handler can take 4305 * over. 4306 */ 4307 if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 4308 spa_check_removed(spa->spa_root_vdev); 4309 /* 4310 * For the import case, this is done in spa_import(), because 4311 * at this point we're using the spare definitions from 4312 * the MOS config, not necessarily from the userland config. 4313 */ 4314 if (spa->spa_load_state != SPA_LOAD_IMPORT) { 4315 spa_aux_check_removed(&spa->spa_spares); 4316 spa_aux_check_removed(&spa->spa_l2cache); 4317 } 4318 } 4319 4320 /* 4321 * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc. 4322 */ 4323 error = vdev_load(rvd); 4324 if (error != 0) { 4325 spa_load_failed(spa, "vdev_load failed [error=%d]", error); 4326 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4327 } 4328 4329 error = spa_ld_log_spacemaps(spa); 4330 if (error != 0) { 4331 spa_load_failed(spa, "spa_ld_log_sm_data failed [error=%d]", 4332 error); 4333 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4334 } 4335 4336 /* 4337 * Propagate the leaf DTLs we just loaded all the way up the vdev tree. 
4338 */ 4339 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4340 vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE); 4341 spa_config_exit(spa, SCL_ALL, FTAG); 4342 4343 return (0); 4344 } 4345 4346 static int 4347 spa_ld_load_dedup_tables(spa_t *spa) 4348 { 4349 int error = 0; 4350 vdev_t *rvd = spa->spa_root_vdev; 4351 4352 error = ddt_load(spa); 4353 if (error != 0) { 4354 spa_load_failed(spa, "ddt_load failed [error=%d]", error); 4355 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4356 } 4357 4358 return (0); 4359 } 4360 4361 static int 4362 spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, char **ereport) 4363 { 4364 vdev_t *rvd = spa->spa_root_vdev; 4365 4366 if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) { 4367 boolean_t missing = spa_check_logs(spa); 4368 if (missing) { 4369 if (spa->spa_missing_tvds != 0) { 4370 spa_load_note(spa, "spa_check_logs failed " 4371 "so dropping the logs"); 4372 } else { 4373 *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 4374 spa_load_failed(spa, "spa_check_logs failed"); 4375 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, 4376 ENXIO)); 4377 } 4378 } 4379 } 4380 4381 return (0); 4382 } 4383 4384 static int 4385 spa_ld_verify_pool_data(spa_t *spa) 4386 { 4387 int error = 0; 4388 vdev_t *rvd = spa->spa_root_vdev; 4389 4390 /* 4391 * We've successfully opened the pool, verify that we're ready 4392 * to start pushing transactions. 4393 */ 4394 if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 4395 error = spa_load_verify(spa); 4396 if (error != 0) { 4397 spa_load_failed(spa, "spa_load_verify failed " 4398 "[error=%d]", error); 4399 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 4400 error)); 4401 } 4402 } 4403 4404 return (0); 4405 } 4406 4407 static void 4408 spa_ld_claim_log_blocks(spa_t *spa) 4409 { 4410 dmu_tx_t *tx; 4411 dsl_pool_t *dp = spa_get_dsl(spa); 4412 4413 /* 4414 * Claim log blocks that haven't been committed yet. 4415 * This must all happen in a single txg. 4416 * Note: spa_claim_max_txg is updated by spa_claim_notify(), 4417 * invoked from zil_claim_log_block()'s i/o done callback. 4418 * Price of rollback is that we abandon the log. 4419 */ 4420 spa->spa_claiming = B_TRUE; 4421 4422 tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); 4423 (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 4424 zil_claim, tx, DS_FIND_CHILDREN); 4425 dmu_tx_commit(tx); 4426 4427 spa->spa_claiming = B_FALSE; 4428 4429 spa_set_log_state(spa, SPA_LOG_GOOD); 4430 } 4431 4432 static void 4433 spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg, 4434 boolean_t update_config_cache) 4435 { 4436 vdev_t *rvd = spa->spa_root_vdev; 4437 int need_update = B_FALSE; 4438 4439 /* 4440 * If the config cache is stale, or we have uninitialized 4441 * metaslabs (see spa_vdev_add()), then update the config. 4442 * 4443 * If this is a verbatim import, trust the current 4444 * in-core spa_config and update the disk labels. 4445 */ 4446 if (update_config_cache || config_cache_txg != spa->spa_config_txg || 4447 spa->spa_load_state == SPA_LOAD_IMPORT || 4448 spa->spa_load_state == SPA_LOAD_RECOVER || 4449 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 4450 need_update = B_TRUE; 4451 4452 for (int c = 0; c < rvd->vdev_children; c++) 4453 if (rvd->vdev_child[c]->vdev_ms_array == 0) 4454 need_update = B_TRUE; 4455 4456 /* 4457 * Update the config cache asynchronously in case we're the 4458 * root pool, in which case the config cache isn't writable yet. 
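	 * The request is picked up later by the spa async thread, which
	 * regenerates the config and rewrites the cachefile once writes are
	 * possible (see spa_async_thread() and spa_config_update()).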
4459 */ 4460 if (need_update) 4461 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 4462 } 4463 4464 static void 4465 spa_ld_prepare_for_reload(spa_t *spa) 4466 { 4467 spa_mode_t mode = spa->spa_mode; 4468 int async_suspended = spa->spa_async_suspended; 4469 4470 spa_unload(spa); 4471 spa_deactivate(spa); 4472 spa_activate(spa, mode); 4473 4474 /* 4475 * We save the value of spa_async_suspended as it gets reset to 0 by 4476 * spa_unload(). We want to restore it back to the original value before 4477 * returning as we might be calling spa_async_resume() later. 4478 */ 4479 spa->spa_async_suspended = async_suspended; 4480 } 4481 4482 static int 4483 spa_ld_read_checkpoint_txg(spa_t *spa) 4484 { 4485 uberblock_t checkpoint; 4486 int error = 0; 4487 4488 ASSERT0(spa->spa_checkpoint_txg); 4489 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4490 4491 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 4492 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), 4493 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); 4494 4495 if (error == ENOENT) 4496 return (0); 4497 4498 if (error != 0) 4499 return (error); 4500 4501 ASSERT3U(checkpoint.ub_txg, !=, 0); 4502 ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0); 4503 ASSERT3U(checkpoint.ub_timestamp, !=, 0); 4504 spa->spa_checkpoint_txg = checkpoint.ub_txg; 4505 spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; 4506 4507 return (0); 4508 } 4509 4510 static int 4511 spa_ld_mos_init(spa_t *spa, spa_import_type_t type) 4512 { 4513 int error = 0; 4514 4515 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4516 ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); 4517 4518 /* 4519 * Never trust the config that is provided unless we are assembling 4520 * a pool following a split. 4521 * This means don't trust blkptrs and the vdev tree in general. This 4522 * also effectively puts the spa in read-only mode since 4523 * spa_writeable() checks for spa_trust_config to be true. 4524 * We will later load a trusted config from the MOS. 4525 */ 4526 if (type != SPA_IMPORT_ASSEMBLE) 4527 spa->spa_trust_config = B_FALSE; 4528 4529 /* 4530 * Parse the config provided to create a vdev tree. 4531 */ 4532 error = spa_ld_parse_config(spa, type); 4533 if (error != 0) 4534 return (error); 4535 4536 spa_import_progress_add(spa); 4537 4538 /* 4539 * Now that we have the vdev tree, try to open each vdev. This involves 4540 * opening the underlying physical device, retrieving its geometry and 4541 * probing the vdev with a dummy I/O. The state of each vdev will be set 4542 * based on the success of those operations. After this we'll be ready 4543 * to read from the vdevs. 4544 */ 4545 error = spa_ld_open_vdevs(spa); 4546 if (error != 0) 4547 return (error); 4548 4549 /* 4550 * Read the label of each vdev and make sure that the GUIDs stored 4551 * there match the GUIDs in the config provided. 4552 * If we're assembling a new pool that's been split off from an 4553 * existing pool, the labels haven't yet been updated so we skip 4554 * validation for now. 4555 */ 4556 if (type != SPA_IMPORT_ASSEMBLE) { 4557 error = spa_ld_validate_vdevs(spa); 4558 if (error != 0) 4559 return (error); 4560 } 4561 4562 /* 4563 * Read all vdev labels to find the best uberblock (i.e. latest, 4564 * unless spa_load_max_txg is set) and store it in spa_uberblock. We 4565 * get the list of features required to read blkptrs in the MOS from 4566 * the vdev label with the best uberblock and verify that our version 4567 * of zfs supports them all. 
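	 * "Best" is decided by vdev_uberblock_compare(): the highest txg
	 * wins, with ties broken by timestamp (and, when valid, the MMP
	 * sequence number).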
4568 */ 4569 error = spa_ld_select_uberblock(spa, type); 4570 if (error != 0) 4571 return (error); 4572 4573 /* 4574 * Pass that uberblock to the dsl_pool layer which will open the root 4575 * blkptr. This blkptr points to the latest version of the MOS and will 4576 * allow us to read its contents. 4577 */ 4578 error = spa_ld_open_rootbp(spa); 4579 if (error != 0) 4580 return (error); 4581 4582 return (0); 4583 } 4584 4585 static int 4586 spa_ld_checkpoint_rewind(spa_t *spa) 4587 { 4588 uberblock_t checkpoint; 4589 int error = 0; 4590 4591 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4592 ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 4593 4594 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 4595 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), 4596 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); 4597 4598 if (error != 0) { 4599 spa_load_failed(spa, "unable to retrieve checkpointed " 4600 "uberblock from the MOS config [error=%d]", error); 4601 4602 if (error == ENOENT) 4603 error = ZFS_ERR_NO_CHECKPOINT; 4604 4605 return (error); 4606 } 4607 4608 ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg); 4609 ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg); 4610 4611 /* 4612 * We need to update the txg and timestamp of the checkpointed 4613 * uberblock to be higher than the latest one. This ensures that 4614 * the checkpointed uberblock is selected if we were to close and 4615 * reopen the pool right after we've written it in the vdev labels. 4616 * (also see block comment in vdev_uberblock_compare) 4617 */ 4618 checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1; 4619 checkpoint.ub_timestamp = gethrestime_sec(); 4620 4621 /* 4622 * Set current uberblock to be the checkpointed uberblock. 4623 */ 4624 spa->spa_uberblock = checkpoint; 4625 4626 /* 4627 * If we are doing a normal rewind, then the pool is open for 4628 * writing and we sync the "updated" checkpointed uberblock to 4629 * disk. Once this is done, we've basically rewound the whole 4630 * pool and there is no way back. 4631 * 4632 * There are cases when we don't want to attempt and sync the 4633 * checkpointed uberblock to disk because we are opening a 4634 * pool as read-only. Specifically, verifying the checkpointed 4635 * state with zdb, and importing the checkpointed state to get 4636 * a "preview" of its content. 
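	 * For example, "zdb -k <pool>" and a read-only
	 * "zpool import --rewind-to-checkpoint" both take this path and
	 * leave the on-disk labels untouched.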
4637 */ 4638 if (spa_writeable(spa)) { 4639 vdev_t *rvd = spa->spa_root_vdev; 4640 4641 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4642 vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; 4643 int svdcount = 0; 4644 int children = rvd->vdev_children; 4645 int c0 = random_in_range(children); 4646 4647 for (int c = 0; c < children; c++) { 4648 vdev_t *vd = rvd->vdev_child[(c0 + c) % children]; 4649 4650 /* Stop when revisiting the first vdev */ 4651 if (c > 0 && svd[0] == vd) 4652 break; 4653 4654 if (vd->vdev_ms_array == 0 || vd->vdev_islog || 4655 !vdev_is_concrete(vd)) 4656 continue; 4657 4658 svd[svdcount++] = vd; 4659 if (svdcount == SPA_SYNC_MIN_VDEVS) 4660 break; 4661 } 4662 error = vdev_config_sync(svd, svdcount, spa->spa_first_txg); 4663 if (error == 0) 4664 spa->spa_last_synced_guid = rvd->vdev_guid; 4665 spa_config_exit(spa, SCL_ALL, FTAG); 4666 4667 if (error != 0) { 4668 spa_load_failed(spa, "failed to write checkpointed " 4669 "uberblock to the vdev labels [error=%d]", error); 4670 return (error); 4671 } 4672 } 4673 4674 return (0); 4675 } 4676 4677 static int 4678 spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type, 4679 boolean_t *update_config_cache) 4680 { 4681 int error; 4682 4683 /* 4684 * Parse the config for pool, open and validate vdevs, 4685 * select an uberblock, and use that uberblock to open 4686 * the MOS. 4687 */ 4688 error = spa_ld_mos_init(spa, type); 4689 if (error != 0) 4690 return (error); 4691 4692 /* 4693 * Retrieve the trusted config stored in the MOS and use it to create 4694 * a new, exact version of the vdev tree, then reopen all vdevs. 4695 */ 4696 error = spa_ld_trusted_config(spa, type, B_FALSE); 4697 if (error == EAGAIN) { 4698 if (update_config_cache != NULL) 4699 *update_config_cache = B_TRUE; 4700 4701 /* 4702 * Redo the loading process with the trusted config if it is 4703 * too different from the untrusted config. 4704 */ 4705 spa_ld_prepare_for_reload(spa); 4706 spa_load_note(spa, "RELOADING"); 4707 error = spa_ld_mos_init(spa, type); 4708 if (error != 0) 4709 return (error); 4710 4711 error = spa_ld_trusted_config(spa, type, B_TRUE); 4712 if (error != 0) 4713 return (error); 4714 4715 } else if (error != 0) { 4716 return (error); 4717 } 4718 4719 return (0); 4720 } 4721 4722 /* 4723 * Load an existing storage pool, using the config provided. This config 4724 * describes which vdevs are part of the pool and is later validated against 4725 * partial configs present in each vdev's label and an entire copy of the 4726 * config stored in the MOS. 4727 */ 4728 static int 4729 spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) 4730 { 4731 int error = 0; 4732 boolean_t missing_feat_write = B_FALSE; 4733 boolean_t checkpoint_rewind = 4734 (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 4735 boolean_t update_config_cache = B_FALSE; 4736 4737 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4738 ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); 4739 4740 spa_load_note(spa, "LOADING"); 4741 4742 error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache); 4743 if (error != 0) 4744 return (error); 4745 4746 /* 4747 * If we are rewinding to the checkpoint then we need to repeat 4748 * everything we've done so far in this function but this time 4749 * selecting the checkpointed uberblock and using that to open 4750 * the MOS. 4751 */ 4752 if (checkpoint_rewind) { 4753 /* 4754 * If we are rewinding to the checkpoint update config cache 4755 * anyway. 
4756 */ 4757 update_config_cache = B_TRUE; 4758 4759 /* 4760 * Extract the checkpointed uberblock from the current MOS 4761 * and use this as the pool's uberblock from now on. If the 4762 * pool is imported as writeable we also write the checkpoint 4763 * uberblock to the labels, making the rewind permanent. 4764 */ 4765 error = spa_ld_checkpoint_rewind(spa); 4766 if (error != 0) 4767 return (error); 4768 4769 /* 4770 * Redo the loading process again with the 4771 * checkpointed uberblock. 4772 */ 4773 spa_ld_prepare_for_reload(spa); 4774 spa_load_note(spa, "LOADING checkpointed uberblock"); 4775 error = spa_ld_mos_with_trusted_config(spa, type, NULL); 4776 if (error != 0) 4777 return (error); 4778 } 4779 4780 /* 4781 * Retrieve the checkpoint txg if the pool has a checkpoint. 4782 */ 4783 error = spa_ld_read_checkpoint_txg(spa); 4784 if (error != 0) 4785 return (error); 4786 4787 /* 4788 * Retrieve the mapping of indirect vdevs. Those vdevs were removed 4789 * from the pool and their contents were re-mapped to other vdevs. Note 4790 * that everything that we read before this step must have been 4791 * rewritten on concrete vdevs after the last device removal was 4792 * initiated. Otherwise we could be reading from indirect vdevs before 4793 * we have loaded their mappings. 4794 */ 4795 error = spa_ld_open_indirect_vdev_metadata(spa); 4796 if (error != 0) 4797 return (error); 4798 4799 /* 4800 * Retrieve the full list of active features from the MOS and check if 4801 * they are all supported. 4802 */ 4803 error = spa_ld_check_features(spa, &missing_feat_write); 4804 if (error != 0) 4805 return (error); 4806 4807 /* 4808 * Load several special directories from the MOS needed by the dsl_pool 4809 * layer. 4810 */ 4811 error = spa_ld_load_special_directories(spa); 4812 if (error != 0) 4813 return (error); 4814 4815 /* 4816 * Retrieve pool properties from the MOS. 4817 */ 4818 error = spa_ld_get_props(spa); 4819 if (error != 0) 4820 return (error); 4821 4822 /* 4823 * Retrieve the list of auxiliary devices - cache devices and spares - 4824 * and open them. 4825 */ 4826 error = spa_ld_open_aux_vdevs(spa, type); 4827 if (error != 0) 4828 return (error); 4829 4830 /* 4831 * Load the metadata for all vdevs. Also check if unopenable devices 4832 * should be autoreplaced. 4833 */ 4834 error = spa_ld_load_vdev_metadata(spa); 4835 if (error != 0) 4836 return (error); 4837 4838 error = spa_ld_load_dedup_tables(spa); 4839 if (error != 0) 4840 return (error); 4841 4842 /* 4843 * Verify the logs now to make sure we don't have any unexpected errors 4844 * when we claim log blocks later. 4845 */ 4846 error = spa_ld_verify_logs(spa, type, ereport); 4847 if (error != 0) 4848 return (error); 4849 4850 if (missing_feat_write) { 4851 ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT); 4852 4853 /* 4854 * At this point, we know that we can open the pool in 4855 * read-only mode but not read-write mode. We now have enough 4856 * information and can return to userland. 4857 */ 4858 return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, 4859 ENOTSUP)); 4860 } 4861 4862 /* 4863 * Traverse the last txgs to make sure the pool was left off in a safe 4864 * state. When performing an extreme rewind, we verify the whole pool, 4865 * which can take a very long time. 4866 */ 4867 error = spa_ld_verify_pool_data(spa); 4868 if (error != 0) 4869 return (error); 4870 4871 /* 4872 * Calculate the deflated space for the pool. 
This must be done before 4873 * we write anything to the pool because we'd need to update the space 4874 * accounting using the deflated sizes. 4875 */ 4876 spa_update_dspace(spa); 4877 4878 /* 4879 * We have now retrieved all the information we needed to open the 4880 * pool. If we are importing the pool in read-write mode, a few 4881 * additional steps must be performed to finish the import. 4882 */ 4883 if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER || 4884 spa->spa_load_max_txg == UINT64_MAX)) { 4885 uint64_t config_cache_txg = spa->spa_config_txg; 4886 4887 ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); 4888 4889 /* 4890 * In case of a checkpoint rewind, log the original txg 4891 * of the checkpointed uberblock. 4892 */ 4893 if (checkpoint_rewind) { 4894 spa_history_log_internal(spa, "checkpoint rewind", 4895 NULL, "rewound state to txg=%llu", 4896 (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg); 4897 } 4898 4899 /* 4900 * Traverse the ZIL and claim all blocks. 4901 */ 4902 spa_ld_claim_log_blocks(spa); 4903 4904 /* 4905 * Kick-off the syncing thread. 4906 */ 4907 spa->spa_sync_on = B_TRUE; 4908 txg_sync_start(spa->spa_dsl_pool); 4909 mmp_thread_start(spa); 4910 4911 /* 4912 * Wait for all claims to sync. We sync up to the highest 4913 * claimed log block birth time so that claimed log blocks 4914 * don't appear to be from the future. spa_claim_max_txg 4915 * will have been set for us by ZIL traversal operations 4916 * performed above. 4917 */ 4918 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 4919 4920 /* 4921 * Check if we need to request an update of the config. On the 4922 * next sync, we would update the config stored in vdev labels 4923 * and the cachefile (by default /etc/zfs/zpool.cache). 4924 */ 4925 spa_ld_check_for_config_update(spa, config_cache_txg, 4926 update_config_cache); 4927 4928 /* 4929 * Check if a rebuild was in progress and if so resume it. 4930 * Then check all DTLs to see if anything needs resilvering. 4931 * The resilver will be deferred if a rebuild was started. 4932 */ 4933 if (vdev_rebuild_active(spa->spa_root_vdev)) { 4934 vdev_rebuild_restart(spa); 4935 } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 4936 vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 4937 spa_async_request(spa, SPA_ASYNC_RESILVER); 4938 } 4939 4940 /* 4941 * Log the fact that we booted up (so that we can detect if 4942 * we rebooted in the middle of an operation). 4943 */ 4944 spa_history_log_version(spa, "open", NULL); 4945 4946 spa_restart_removal(spa); 4947 spa_spawn_aux_threads(spa); 4948 4949 /* 4950 * Delete any inconsistent datasets. 4951 * 4952 * Note: 4953 * Since we may be issuing deletes for clones here, 4954 * we make sure to do so after we've spawned all the 4955 * auxiliary threads above (from which the livelist 4956 * deletion zthr is part of). 4957 */ 4958 (void) dmu_objset_find(spa_name(spa), 4959 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 4960 4961 /* 4962 * Clean up any stale temporary dataset userrefs. 
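	 * (These are holds that were registered for automatic cleanup,
	 * e.g. by an in-progress send/receive, and were left behind by a
	 * crash; they are not meant to survive a pool reload.)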
4963 */ 4964 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 4965 4966 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4967 vdev_initialize_restart(spa->spa_root_vdev); 4968 vdev_trim_restart(spa->spa_root_vdev); 4969 vdev_autotrim_restart(spa); 4970 spa_config_exit(spa, SCL_CONFIG, FTAG); 4971 } 4972 4973 spa_import_progress_remove(spa_guid(spa)); 4974 spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); 4975 4976 spa_load_note(spa, "LOADED"); 4977 4978 return (0); 4979 } 4980 4981 static int 4982 spa_load_retry(spa_t *spa, spa_load_state_t state) 4983 { 4984 spa_mode_t mode = spa->spa_mode; 4985 4986 spa_unload(spa); 4987 spa_deactivate(spa); 4988 4989 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; 4990 4991 spa_activate(spa, mode); 4992 spa_async_suspend(spa); 4993 4994 spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu", 4995 (u_longlong_t)spa->spa_load_max_txg); 4996 4997 return (spa_load(spa, state, SPA_IMPORT_EXISTING)); 4998 } 4999 5000 /* 5001 * If spa_load() fails this function will try loading prior txg's. If 5002 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool 5003 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this 5004 * function will not rewind the pool and will return the same error as 5005 * spa_load(). 5006 */ 5007 static int 5008 spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request, 5009 int rewind_flags) 5010 { 5011 nvlist_t *loadinfo = NULL; 5012 nvlist_t *config = NULL; 5013 int load_error, rewind_error; 5014 uint64_t safe_rewind_txg; 5015 uint64_t min_txg; 5016 5017 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 5018 spa->spa_load_max_txg = spa->spa_load_txg; 5019 spa_set_log_state(spa, SPA_LOG_CLEAR); 5020 } else { 5021 spa->spa_load_max_txg = max_request; 5022 if (max_request != UINT64_MAX) 5023 spa->spa_extreme_rewind = B_TRUE; 5024 } 5025 5026 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING); 5027 if (load_error == 0) 5028 return (0); 5029 if (load_error == ZFS_ERR_NO_CHECKPOINT) { 5030 /* 5031 * When attempting checkpoint-rewind on a pool with no 5032 * checkpoint, we should not attempt to load uberblocks 5033 * from previous txgs when spa_load fails. 5034 */ 5035 ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 5036 spa_import_progress_remove(spa_guid(spa)); 5037 return (load_error); 5038 } 5039 5040 if (spa->spa_root_vdev != NULL) 5041 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 5042 5043 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 5044 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 5045 5046 if (rewind_flags & ZPOOL_NEVER_REWIND) { 5047 nvlist_free(config); 5048 spa_import_progress_remove(spa_guid(spa)); 5049 return (load_error); 5050 } 5051 5052 if (state == SPA_LOAD_RECOVER) { 5053 /* Price of rolling back is discarding txgs, including log */ 5054 spa_set_log_state(spa, SPA_LOG_CLEAR); 5055 } else { 5056 /* 5057 * If we aren't rolling back save the load info from our first 5058 * import attempt so that we can restore it after attempting 5059 * to rewind. 5060 */ 5061 loadinfo = spa->spa_load_info; 5062 spa->spa_load_info = fnvlist_alloc(); 5063 } 5064 5065 spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 5066 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 5067 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 
5068 TXG_INITIAL : safe_rewind_txg; 5069 5070 /* 5071 * Continue as long as we're finding errors, we're still within 5072 * the acceptable rewind range, and we're still finding uberblocks 5073 */ 5074 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 5075 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 5076 if (spa->spa_load_max_txg < safe_rewind_txg) 5077 spa->spa_extreme_rewind = B_TRUE; 5078 rewind_error = spa_load_retry(spa, state); 5079 } 5080 5081 spa->spa_extreme_rewind = B_FALSE; 5082 spa->spa_load_max_txg = UINT64_MAX; 5083 5084 if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 5085 spa_config_set(spa, config); 5086 else 5087 nvlist_free(config); 5088 5089 if (state == SPA_LOAD_RECOVER) { 5090 ASSERT3P(loadinfo, ==, NULL); 5091 spa_import_progress_remove(spa_guid(spa)); 5092 return (rewind_error); 5093 } else { 5094 /* Store the rewind info as part of the initial load info */ 5095 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, 5096 spa->spa_load_info); 5097 5098 /* Restore the initial load info */ 5099 fnvlist_free(spa->spa_load_info); 5100 spa->spa_load_info = loadinfo; 5101 5102 spa_import_progress_remove(spa_guid(spa)); 5103 return (load_error); 5104 } 5105 } 5106 5107 /* 5108 * Pool Open/Import 5109 * 5110 * The import case is identical to an open except that the configuration is sent 5111 * down from userland, instead of grabbed from the configuration cache. For the 5112 * case of an open, the pool configuration will exist in the 5113 * POOL_STATE_UNINITIALIZED state. 5114 * 5115 * The stats information (gen/count/ustats) is used to gather vdev statistics at 5116 * the same time open the pool, without having to keep around the spa_t in some 5117 * ambiguous state. 5118 */ 5119 static int 5120 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, 5121 nvlist_t **config) 5122 { 5123 spa_t *spa; 5124 spa_load_state_t state = SPA_LOAD_OPEN; 5125 int error; 5126 int locked = B_FALSE; 5127 int firstopen = B_FALSE; 5128 5129 *spapp = NULL; 5130 5131 /* 5132 * As disgusting as this is, we need to support recursive calls to this 5133 * function because dsl_dir_open() is called during spa_load(), and ends 5134 * up calling spa_open() again. The real fix is to figure out how to 5135 * avoid dsl_dir_open() calling this in the first place. 5136 */ 5137 if (MUTEX_NOT_HELD(&spa_namespace_lock)) { 5138 mutex_enter(&spa_namespace_lock); 5139 locked = B_TRUE; 5140 } 5141 5142 if ((spa = spa_lookup(pool)) == NULL) { 5143 if (locked) 5144 mutex_exit(&spa_namespace_lock); 5145 return (SET_ERROR(ENOENT)); 5146 } 5147 5148 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 5149 zpool_load_policy_t policy; 5150 5151 firstopen = B_TRUE; 5152 5153 zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config, 5154 &policy); 5155 if (policy.zlp_rewind & ZPOOL_DO_REWIND) 5156 state = SPA_LOAD_RECOVER; 5157 5158 spa_activate(spa, spa_mode_global); 5159 5160 if (state != SPA_LOAD_RECOVER) 5161 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 5162 spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; 5163 5164 zfs_dbgmsg("spa_open_common: opening %s", pool); 5165 error = spa_load_best(spa, state, policy.zlp_txg, 5166 policy.zlp_rewind); 5167 5168 if (error == EBADF) { 5169 /* 5170 * If vdev_validate() returns failure (indicated by 5171 * EBADF), it indicates that one of the vdevs indicates 5172 * that the pool has been exported or destroyed. 
If 5173 * this is the case, the config cache is out of sync and 5174 * we should remove the pool from the namespace. 5175 */ 5176 spa_unload(spa); 5177 spa_deactivate(spa); 5178 spa_write_cachefile(spa, B_TRUE, B_TRUE); 5179 spa_remove(spa); 5180 if (locked) 5181 mutex_exit(&spa_namespace_lock); 5182 return (SET_ERROR(ENOENT)); 5183 } 5184 5185 if (error) { 5186 /* 5187 * We can't open the pool, but we still have useful 5188 * information: the state of each vdev after the 5189 * attempted vdev_open(). Return this to the user. 5190 */ 5191 if (config != NULL && spa->spa_config) { 5192 *config = fnvlist_dup(spa->spa_config); 5193 fnvlist_add_nvlist(*config, 5194 ZPOOL_CONFIG_LOAD_INFO, 5195 spa->spa_load_info); 5196 } 5197 spa_unload(spa); 5198 spa_deactivate(spa); 5199 spa->spa_last_open_failed = error; 5200 if (locked) 5201 mutex_exit(&spa_namespace_lock); 5202 *spapp = NULL; 5203 return (error); 5204 } 5205 } 5206 5207 spa_open_ref(spa, tag); 5208 5209 if (config != NULL) 5210 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 5211 5212 /* 5213 * If we've recovered the pool, pass back any information we 5214 * gathered while doing the load. 5215 */ 5216 if (state == SPA_LOAD_RECOVER) { 5217 fnvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 5218 spa->spa_load_info); 5219 } 5220 5221 if (locked) { 5222 spa->spa_last_open_failed = 0; 5223 spa->spa_last_ubsync_txg = 0; 5224 spa->spa_load_txg = 0; 5225 mutex_exit(&spa_namespace_lock); 5226 } 5227 5228 if (firstopen) 5229 zvol_create_minors_recursive(spa_name(spa)); 5230 5231 *spapp = spa; 5232 5233 return (0); 5234 } 5235 5236 int 5237 spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 5238 nvlist_t **config) 5239 { 5240 return (spa_open_common(name, spapp, tag, policy, config)); 5241 } 5242 5243 int 5244 spa_open(const char *name, spa_t **spapp, void *tag) 5245 { 5246 return (spa_open_common(name, spapp, tag, NULL, NULL)); 5247 } 5248 5249 /* 5250 * Lookup the given spa_t, incrementing the inject count in the process, 5251 * preventing it from being exported or destroyed. 5252 */ 5253 spa_t * 5254 spa_inject_addref(char *name) 5255 { 5256 spa_t *spa; 5257 5258 mutex_enter(&spa_namespace_lock); 5259 if ((spa = spa_lookup(name)) == NULL) { 5260 mutex_exit(&spa_namespace_lock); 5261 return (NULL); 5262 } 5263 spa->spa_inject_ref++; 5264 mutex_exit(&spa_namespace_lock); 5265 5266 return (spa); 5267 } 5268 5269 void 5270 spa_inject_delref(spa_t *spa) 5271 { 5272 mutex_enter(&spa_namespace_lock); 5273 spa->spa_inject_ref--; 5274 mutex_exit(&spa_namespace_lock); 5275 } 5276 5277 /* 5278 * Add spares device information to the nvlist. 5279 */ 5280 static void 5281 spa_add_spares(spa_t *spa, nvlist_t *config) 5282 { 5283 nvlist_t **spares; 5284 uint_t i, nspares; 5285 nvlist_t *nvroot; 5286 uint64_t guid; 5287 vdev_stat_t *vs; 5288 uint_t vsc; 5289 uint64_t pool; 5290 5291 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 5292 5293 if (spa->spa_spares.sav_count == 0) 5294 return; 5295 5296 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 5297 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 5298 ZPOOL_CONFIG_SPARES, &spares, &nspares)); 5299 if (nspares != 0) { 5300 fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 5301 (const nvlist_t * const *)spares, nspares); 5302 VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 5303 &spares, &nspares)); 5304 5305 /* 5306 * Go through and find any spares which have since been 5307 * repurposed as an active spare. 
If this is the case, update 5308 * their status appropriately. 5309 */ 5310 for (i = 0; i < nspares; i++) { 5311 guid = fnvlist_lookup_uint64(spares[i], 5312 ZPOOL_CONFIG_GUID); 5313 if (spa_spare_exists(guid, &pool, NULL) && 5314 pool != 0ULL) { 5315 VERIFY0(nvlist_lookup_uint64_array(spares[i], 5316 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, 5317 &vsc)); 5318 vs->vs_state = VDEV_STATE_CANT_OPEN; 5319 vs->vs_aux = VDEV_AUX_SPARED; 5320 } 5321 } 5322 } 5323 } 5324 5325 /* 5326 * Add l2cache device information to the nvlist, including vdev stats. 5327 */ 5328 static void 5329 spa_add_l2cache(spa_t *spa, nvlist_t *config) 5330 { 5331 nvlist_t **l2cache; 5332 uint_t i, j, nl2cache; 5333 nvlist_t *nvroot; 5334 uint64_t guid; 5335 vdev_t *vd; 5336 vdev_stat_t *vs; 5337 uint_t vsc; 5338 5339 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 5340 5341 if (spa->spa_l2cache.sav_count == 0) 5342 return; 5343 5344 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 5345 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 5346 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); 5347 if (nl2cache != 0) { 5348 fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 5349 (const nvlist_t * const *)l2cache, nl2cache); 5350 VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 5351 &l2cache, &nl2cache)); 5352 5353 /* 5354 * Update level 2 cache device stats. 5355 */ 5356 5357 for (i = 0; i < nl2cache; i++) { 5358 guid = fnvlist_lookup_uint64(l2cache[i], 5359 ZPOOL_CONFIG_GUID); 5360 5361 vd = NULL; 5362 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 5363 if (guid == 5364 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 5365 vd = spa->spa_l2cache.sav_vdevs[j]; 5366 break; 5367 } 5368 } 5369 ASSERT(vd != NULL); 5370 5371 VERIFY0(nvlist_lookup_uint64_array(l2cache[i], 5372 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)); 5373 vdev_get_stats(vd, vs); 5374 vdev_config_generate_stats(vd, l2cache[i]); 5375 5376 } 5377 } 5378 } 5379 5380 static void 5381 spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features) 5382 { 5383 zap_cursor_t zc; 5384 zap_attribute_t za; 5385 5386 if (spa->spa_feat_for_read_obj != 0) { 5387 for (zap_cursor_init(&zc, spa->spa_meta_objset, 5388 spa->spa_feat_for_read_obj); 5389 zap_cursor_retrieve(&zc, &za) == 0; 5390 zap_cursor_advance(&zc)) { 5391 ASSERT(za.za_integer_length == sizeof (uint64_t) && 5392 za.za_num_integers == 1); 5393 VERIFY0(nvlist_add_uint64(features, za.za_name, 5394 za.za_first_integer)); 5395 } 5396 zap_cursor_fini(&zc); 5397 } 5398 5399 if (spa->spa_feat_for_write_obj != 0) { 5400 for (zap_cursor_init(&zc, spa->spa_meta_objset, 5401 spa->spa_feat_for_write_obj); 5402 zap_cursor_retrieve(&zc, &za) == 0; 5403 zap_cursor_advance(&zc)) { 5404 ASSERT(za.za_integer_length == sizeof (uint64_t) && 5405 za.za_num_integers == 1); 5406 VERIFY0(nvlist_add_uint64(features, za.za_name, 5407 za.za_first_integer)); 5408 } 5409 zap_cursor_fini(&zc); 5410 } 5411 } 5412 5413 static void 5414 spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features) 5415 { 5416 int i; 5417 5418 for (i = 0; i < SPA_FEATURES; i++) { 5419 zfeature_info_t feature = spa_feature_table[i]; 5420 uint64_t refcount; 5421 5422 if (feature_get_refcount(spa, &feature, &refcount) != 0) 5423 continue; 5424 5425 VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount)); 5426 } 5427 } 5428 5429 /* 5430 * Store a list of pool features and their reference counts in the 5431 * config. 
5432 * 5433 * The first time this is called on a spa, allocate a new nvlist, fetch 5434 * the pool features and reference counts from disk, then save the list 5435 * in the spa. In subsequent calls on the same spa use the saved nvlist 5436 * and refresh its values from the cached reference counts. This 5437 * ensures we don't block here on I/O on a suspended pool so 'zpool 5438 * clear' can resume the pool. 5439 */ 5440 static void 5441 spa_add_feature_stats(spa_t *spa, nvlist_t *config) 5442 { 5443 nvlist_t *features; 5444 5445 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 5446 5447 mutex_enter(&spa->spa_feat_stats_lock); 5448 features = spa->spa_feat_stats; 5449 5450 if (features != NULL) { 5451 spa_feature_stats_from_cache(spa, features); 5452 } else { 5453 VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP)); 5454 spa->spa_feat_stats = features; 5455 spa_feature_stats_from_disk(spa, features); 5456 } 5457 5458 VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 5459 features)); 5460 5461 mutex_exit(&spa->spa_feat_stats_lock); 5462 } 5463 5464 int 5465 spa_get_stats(const char *name, nvlist_t **config, 5466 char *altroot, size_t buflen) 5467 { 5468 int error; 5469 spa_t *spa; 5470 5471 *config = NULL; 5472 error = spa_open_common(name, &spa, FTAG, NULL, config); 5473 5474 if (spa != NULL) { 5475 /* 5476 * This still leaves a window of inconsistency where the spares 5477 * or l2cache devices could change and the config would be 5478 * self-inconsistent. 5479 */ 5480 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5481 5482 if (*config != NULL) { 5483 uint64_t loadtimes[2]; 5484 5485 loadtimes[0] = spa->spa_loaded_ts.tv_sec; 5486 loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 5487 fnvlist_add_uint64_array(*config, 5488 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2); 5489 5490 fnvlist_add_uint64(*config, 5491 ZPOOL_CONFIG_ERRCOUNT, 5492 spa_get_errlog_size(spa)); 5493 5494 if (spa_suspended(spa)) { 5495 fnvlist_add_uint64(*config, 5496 ZPOOL_CONFIG_SUSPENDED, 5497 spa->spa_failmode); 5498 fnvlist_add_uint64(*config, 5499 ZPOOL_CONFIG_SUSPENDED_REASON, 5500 spa->spa_suspended); 5501 } 5502 5503 spa_add_spares(spa, *config); 5504 spa_add_l2cache(spa, *config); 5505 spa_add_feature_stats(spa, *config); 5506 } 5507 } 5508 5509 /* 5510 * We want to get the alternate root even for faulted pools, so we cheat 5511 * and call spa_lookup() directly. 5512 */ 5513 if (altroot) { 5514 if (spa == NULL) { 5515 mutex_enter(&spa_namespace_lock); 5516 spa = spa_lookup(name); 5517 if (spa) 5518 spa_altroot(spa, altroot, buflen); 5519 else 5520 altroot[0] = '\0'; 5521 spa = NULL; 5522 mutex_exit(&spa_namespace_lock); 5523 } else { 5524 spa_altroot(spa, altroot, buflen); 5525 } 5526 } 5527 5528 if (spa != NULL) { 5529 spa_config_exit(spa, SCL_CONFIG, FTAG); 5530 spa_close(spa, FTAG); 5531 } 5532 5533 return (error); 5534 } 5535 5536 /* 5537 * Validate that the auxiliary device array is well formed. We must have an 5538 * array of nvlists, each which describes a valid leaf vdev. If this is an 5539 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 5540 * specified, as long as they are well-formed. 
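 * The expected shape of the incoming nvlist is, roughly:
 *
 *	nvroot
 *	  "spares" (or "l2cache"): [
 *	    { "type"="disk", "path"="/dev/..." },
 *	    ...
 *	  ]
 *
 * where each array element must parse into a single leaf vdev.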
5541 */ 5542 static int 5543 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 5544 spa_aux_vdev_t *sav, const char *config, uint64_t version, 5545 vdev_labeltype_t label) 5546 { 5547 nvlist_t **dev; 5548 uint_t i, ndev; 5549 vdev_t *vd; 5550 int error; 5551 5552 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 5553 5554 /* 5555 * It's acceptable to have no devs specified. 5556 */ 5557 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 5558 return (0); 5559 5560 if (ndev == 0) 5561 return (SET_ERROR(EINVAL)); 5562 5563 /* 5564 * Make sure the pool is formatted with a version that supports this 5565 * device type. 5566 */ 5567 if (spa_version(spa) < version) 5568 return (SET_ERROR(ENOTSUP)); 5569 5570 /* 5571 * Set the pending device list so we correctly handle device in-use 5572 * checking. 5573 */ 5574 sav->sav_pending = dev; 5575 sav->sav_npending = ndev; 5576 5577 for (i = 0; i < ndev; i++) { 5578 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 5579 mode)) != 0) 5580 goto out; 5581 5582 if (!vd->vdev_ops->vdev_op_leaf) { 5583 vdev_free(vd); 5584 error = SET_ERROR(EINVAL); 5585 goto out; 5586 } 5587 5588 vd->vdev_top = vd; 5589 5590 if ((error = vdev_open(vd)) == 0 && 5591 (error = vdev_label_init(vd, crtxg, label)) == 0) { 5592 fnvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 5593 vd->vdev_guid); 5594 } 5595 5596 vdev_free(vd); 5597 5598 if (error && 5599 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 5600 goto out; 5601 else 5602 error = 0; 5603 } 5604 5605 out: 5606 sav->sav_pending = NULL; 5607 sav->sav_npending = 0; 5608 return (error); 5609 } 5610 5611 static int 5612 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 5613 { 5614 int error; 5615 5616 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 5617 5618 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 5619 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 5620 VDEV_LABEL_SPARE)) != 0) { 5621 return (error); 5622 } 5623 5624 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 5625 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 5626 VDEV_LABEL_L2CACHE)); 5627 } 5628 5629 static void 5630 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 5631 const char *config) 5632 { 5633 int i; 5634 5635 if (sav->sav_config != NULL) { 5636 nvlist_t **olddevs; 5637 uint_t oldndevs; 5638 nvlist_t **newdevs; 5639 5640 /* 5641 * Generate new dev list by concatenating with the 5642 * current dev list. 5643 */ 5644 VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, config, 5645 &olddevs, &oldndevs)); 5646 5647 newdevs = kmem_alloc(sizeof (void *) * 5648 (ndevs + oldndevs), KM_SLEEP); 5649 for (i = 0; i < oldndevs; i++) 5650 newdevs[i] = fnvlist_dup(olddevs[i]); 5651 for (i = 0; i < ndevs; i++) 5652 newdevs[i + oldndevs] = fnvlist_dup(devs[i]); 5653 5654 fnvlist_remove(sav->sav_config, config); 5655 5656 fnvlist_add_nvlist_array(sav->sav_config, config, 5657 (const nvlist_t * const *)newdevs, ndevs + oldndevs); 5658 for (i = 0; i < oldndevs + ndevs; i++) 5659 nvlist_free(newdevs[i]); 5660 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 5661 } else { 5662 /* 5663 * Generate a new dev list. 
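/*
 * Illustrative sketch: the list-concatenation step of spa_set_aux_vdevs()
 * above, modeled on plain string arrays.  concat_devs() is hypothetical;
 * the real code duplicates nvlists with fnvlist_dup(), replaces the array
 * under its config key, and then frees the temporary copies.
 */
#include <stdlib.h>
#include <string.h>

static char **
concat_devs(char *const *olddevs, size_t oldndevs,
    char *const *newdevs, size_t ndevs)
{
	char **merged = malloc((oldndevs + ndevs) * sizeof (char *));

	if (merged == NULL)
		return (NULL);
	for (size_t i = 0; i < oldndevs; i++)
		merged[i] = strdup(olddevs[i]);		/* fnvlist_dup() */
	for (size_t i = 0; i < ndevs; i++)
		merged[oldndevs + i] = strdup(newdevs[i]);
	return (merged);	/* caller installs this, then frees the copies */
}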
5664 */ 5665 sav->sav_config = fnvlist_alloc(); 5666 fnvlist_add_nvlist_array(sav->sav_config, config, 5667 (const nvlist_t * const *)devs, ndevs); 5668 } 5669 } 5670 5671 /* 5672 * Stop and drop level 2 ARC devices 5673 */ 5674 void 5675 spa_l2cache_drop(spa_t *spa) 5676 { 5677 vdev_t *vd; 5678 int i; 5679 spa_aux_vdev_t *sav = &spa->spa_l2cache; 5680 5681 for (i = 0; i < sav->sav_count; i++) { 5682 uint64_t pool; 5683 5684 vd = sav->sav_vdevs[i]; 5685 ASSERT(vd != NULL); 5686 5687 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 5688 pool != 0ULL && l2arc_vdev_present(vd)) 5689 l2arc_remove_vdev(vd); 5690 } 5691 } 5692 5693 /* 5694 * Verify encryption parameters for spa creation. If we are encrypting, we must 5695 * have the encryption feature flag enabled. 5696 */ 5697 static int 5698 spa_create_check_encryption_params(dsl_crypto_params_t *dcp, 5699 boolean_t has_encryption) 5700 { 5701 if (dcp->cp_crypt != ZIO_CRYPT_OFF && 5702 dcp->cp_crypt != ZIO_CRYPT_INHERIT && 5703 !has_encryption) 5704 return (SET_ERROR(ENOTSUP)); 5705 5706 return (dmu_objset_create_crypt_check(NULL, dcp, NULL)); 5707 } 5708 5709 /* 5710 * Pool Creation 5711 */ 5712 int 5713 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 5714 nvlist_t *zplprops, dsl_crypto_params_t *dcp) 5715 { 5716 spa_t *spa; 5717 char *altroot = NULL; 5718 vdev_t *rvd; 5719 dsl_pool_t *dp; 5720 dmu_tx_t *tx; 5721 int error = 0; 5722 uint64_t txg = TXG_INITIAL; 5723 nvlist_t **spares, **l2cache; 5724 uint_t nspares, nl2cache; 5725 uint64_t version, obj, ndraid = 0; 5726 boolean_t has_features; 5727 boolean_t has_encryption; 5728 boolean_t has_allocclass; 5729 spa_feature_t feat; 5730 char *feat_name; 5731 char *poolname; 5732 nvlist_t *nvl; 5733 5734 if (props == NULL || 5735 nvlist_lookup_string(props, "tname", &poolname) != 0) 5736 poolname = (char *)pool; 5737 5738 /* 5739 * If this pool already exists, return failure. 5740 */ 5741 mutex_enter(&spa_namespace_lock); 5742 if (spa_lookup(poolname) != NULL) { 5743 mutex_exit(&spa_namespace_lock); 5744 return (SET_ERROR(EEXIST)); 5745 } 5746 5747 /* 5748 * Allocate a new spa_t structure. 5749 */ 5750 nvl = fnvlist_alloc(); 5751 fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool); 5752 (void) nvlist_lookup_string(props, 5753 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 5754 spa = spa_add(poolname, nvl, altroot); 5755 fnvlist_free(nvl); 5756 spa_activate(spa, spa_mode_global); 5757 5758 if (props && (error = spa_prop_validate(spa, props))) { 5759 spa_deactivate(spa); 5760 spa_remove(spa); 5761 mutex_exit(&spa_namespace_lock); 5762 return (error); 5763 } 5764 5765 /* 5766 * Temporary pool names should never be written to disk. 
5767 */ 5768 if (poolname != pool) 5769 spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME; 5770 5771 has_features = B_FALSE; 5772 has_encryption = B_FALSE; 5773 has_allocclass = B_FALSE; 5774 for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 5775 elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 5776 if (zpool_prop_feature(nvpair_name(elem))) { 5777 has_features = B_TRUE; 5778 5779 feat_name = strchr(nvpair_name(elem), '@') + 1; 5780 VERIFY0(zfeature_lookup_name(feat_name, &feat)); 5781 if (feat == SPA_FEATURE_ENCRYPTION) 5782 has_encryption = B_TRUE; 5783 if (feat == SPA_FEATURE_ALLOCATION_CLASSES) 5784 has_allocclass = B_TRUE; 5785 } 5786 } 5787 5788 /* verify encryption params, if they were provided */ 5789 if (dcp != NULL) { 5790 error = spa_create_check_encryption_params(dcp, has_encryption); 5791 if (error != 0) { 5792 spa_deactivate(spa); 5793 spa_remove(spa); 5794 mutex_exit(&spa_namespace_lock); 5795 return (error); 5796 } 5797 } 5798 if (!has_allocclass && zfs_special_devs(nvroot, NULL)) { 5799 spa_deactivate(spa); 5800 spa_remove(spa); 5801 mutex_exit(&spa_namespace_lock); 5802 return (ENOTSUP); 5803 } 5804 5805 if (has_features || nvlist_lookup_uint64(props, 5806 zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 5807 version = SPA_VERSION; 5808 } 5809 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 5810 5811 spa->spa_first_txg = txg; 5812 spa->spa_uberblock.ub_txg = txg - 1; 5813 spa->spa_uberblock.ub_version = version; 5814 spa->spa_ubsync = spa->spa_uberblock; 5815 spa->spa_load_state = SPA_LOAD_CREATE; 5816 spa->spa_removing_phys.sr_state = DSS_NONE; 5817 spa->spa_removing_phys.sr_removing_vdev = -1; 5818 spa->spa_removing_phys.sr_prev_indirect_vdev = -1; 5819 spa->spa_indirect_vdevs_loaded = B_TRUE; 5820 5821 /* 5822 * Create "The Godfather" zio to hold all async IOs 5823 */ 5824 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 5825 KM_SLEEP); 5826 for (int i = 0; i < max_ncpus; i++) { 5827 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 5828 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 5829 ZIO_FLAG_GODFATHER); 5830 } 5831 5832 /* 5833 * Create the root vdev. 5834 */ 5835 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5836 5837 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 5838 5839 ASSERT(error != 0 || rvd != NULL); 5840 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 5841 5842 if (error == 0 && !zfs_allocatable_devs(nvroot)) 5843 error = SET_ERROR(EINVAL); 5844 5845 if (error == 0 && 5846 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 5847 (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 && 5848 (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { 5849 /* 5850 * instantiate the metaslab groups (this will dirty the vdevs) 5851 * we can no longer error exit past this point 5852 */ 5853 for (int c = 0; error == 0 && c < rvd->vdev_children; c++) { 5854 vdev_t *vd = rvd->vdev_child[c]; 5855 5856 vdev_metaslab_set_size(vd); 5857 vdev_expand(vd, txg); 5858 } 5859 } 5860 5861 spa_config_exit(spa, SCL_ALL, FTAG); 5862 5863 if (error != 0) { 5864 spa_unload(spa); 5865 spa_deactivate(spa); 5866 spa_remove(spa); 5867 mutex_exit(&spa_namespace_lock); 5868 return (error); 5869 } 5870 5871 /* 5872 * Get the list of spares, if specified. 
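/*
 * Illustrative sketch: how the "feature@..." property names scanned above
 * map to a bare feature name.  The prefix check is an assumption for this
 * demo; the real code relies on zpool_prop_feature() to classify the
 * property before taking the text after '@' and looking it up with
 * zfeature_lookup_name().
 */
#include <string.h>
#include <stddef.h>

static const char *
feature_name_from_prop(const char *propname)
{
	const char *at;

	if (strncmp(propname, "feature@", 8) != 0)
		return (NULL);
	at = strchr(propname, '@');
	return (at + 1);	/* "feature@encryption" -> "encryption" */
}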
5873 */ 5874 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 5875 &spares, &nspares) == 0) { 5876 spa->spa_spares.sav_config = fnvlist_alloc(); 5877 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 5878 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, 5879 nspares); 5880 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5881 spa_load_spares(spa); 5882 spa_config_exit(spa, SCL_ALL, FTAG); 5883 spa->spa_spares.sav_sync = B_TRUE; 5884 } 5885 5886 /* 5887 * Get the list of level 2 cache devices, if specified. 5888 */ 5889 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 5890 &l2cache, &nl2cache) == 0) { 5891 VERIFY0(nvlist_alloc(&spa->spa_l2cache.sav_config, 5892 NV_UNIQUE_NAME, KM_SLEEP)); 5893 fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 5894 ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, 5895 nl2cache); 5896 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5897 spa_load_l2cache(spa); 5898 spa_config_exit(spa, SCL_ALL, FTAG); 5899 spa->spa_l2cache.sav_sync = B_TRUE; 5900 } 5901 5902 spa->spa_is_initializing = B_TRUE; 5903 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, dcp, txg); 5904 spa->spa_is_initializing = B_FALSE; 5905 5906 /* 5907 * Create DDTs (dedup tables). 5908 */ 5909 ddt_create(spa); 5910 5911 spa_update_dspace(spa); 5912 5913 tx = dmu_tx_create_assigned(dp, txg); 5914 5915 /* 5916 * Create the pool's history object. 5917 */ 5918 if (version >= SPA_VERSION_ZPOOL_HISTORY && !spa->spa_history) 5919 spa_history_create_obj(spa, tx); 5920 5921 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE); 5922 spa_history_log_version(spa, "create", tx); 5923 5924 /* 5925 * Create the pool config object. 5926 */ 5927 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 5928 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 5929 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 5930 5931 if (zap_add(spa->spa_meta_objset, 5932 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 5933 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 5934 cmn_err(CE_PANIC, "failed to add pool config"); 5935 } 5936 5937 if (zap_add(spa->spa_meta_objset, 5938 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 5939 sizeof (uint64_t), 1, &version, tx) != 0) { 5940 cmn_err(CE_PANIC, "failed to add pool version"); 5941 } 5942 5943 /* Newly created pools with the right version are always deflated. */ 5944 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 5945 spa->spa_deflate = TRUE; 5946 if (zap_add(spa->spa_meta_objset, 5947 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 5948 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 5949 cmn_err(CE_PANIC, "failed to add deflate"); 5950 } 5951 } 5952 5953 /* 5954 * Create the deferred-free bpobj. Turn off compression 5955 * because sync-to-convergence takes longer if the blocksize 5956 * keeps changing. 5957 */ 5958 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 5959 dmu_object_set_compress(spa->spa_meta_objset, obj, 5960 ZIO_COMPRESS_OFF, tx); 5961 if (zap_add(spa->spa_meta_objset, 5962 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 5963 sizeof (uint64_t), 1, &obj, tx) != 0) { 5964 cmn_err(CE_PANIC, "failed to add bpobj"); 5965 } 5966 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 5967 spa->spa_meta_objset, obj)); 5968 5969 /* 5970 * Generate some random noise for salted checksums to operate on. 5971 */ 5972 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 5973 sizeof (spa->spa_cksum_salt.zcs_bytes)); 5974 5975 /* 5976 * Set pool properties. 
5977 */ 5978 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 5979 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 5980 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 5981 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 5982 spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST); 5983 spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM); 5984 5985 if (props != NULL) { 5986 spa_configfile_set(spa, props, B_FALSE); 5987 spa_sync_props(props, tx); 5988 } 5989 5990 for (int i = 0; i < ndraid; i++) 5991 spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); 5992 5993 dmu_tx_commit(tx); 5994 5995 spa->spa_sync_on = B_TRUE; 5996 txg_sync_start(dp); 5997 mmp_thread_start(spa); 5998 txg_wait_synced(dp, txg); 5999 6000 spa_spawn_aux_threads(spa); 6001 6002 spa_write_cachefile(spa, B_FALSE, B_TRUE); 6003 6004 /* 6005 * Don't count references from objsets that are already closed 6006 * and are making their way through the eviction process. 6007 */ 6008 spa_evicting_os_wait(spa); 6009 spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); 6010 spa->spa_load_state = SPA_LOAD_NONE; 6011 6012 mutex_exit(&spa_namespace_lock); 6013 6014 return (0); 6015 } 6016 6017 /* 6018 * Import a non-root pool into the system. 6019 */ 6020 int 6021 spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 6022 { 6023 spa_t *spa; 6024 char *altroot = NULL; 6025 spa_load_state_t state = SPA_LOAD_IMPORT; 6026 zpool_load_policy_t policy; 6027 spa_mode_t mode = spa_mode_global; 6028 uint64_t readonly = B_FALSE; 6029 int error; 6030 nvlist_t *nvroot; 6031 nvlist_t **spares, **l2cache; 6032 uint_t nspares, nl2cache; 6033 6034 /* 6035 * If a pool with this name exists, return failure. 6036 */ 6037 mutex_enter(&spa_namespace_lock); 6038 if (spa_lookup(pool) != NULL) { 6039 mutex_exit(&spa_namespace_lock); 6040 return (SET_ERROR(EEXIST)); 6041 } 6042 6043 /* 6044 * Create and initialize the spa structure. 6045 */ 6046 (void) nvlist_lookup_string(props, 6047 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 6048 (void) nvlist_lookup_uint64(props, 6049 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 6050 if (readonly) 6051 mode = SPA_MODE_READ; 6052 spa = spa_add(pool, config, altroot); 6053 spa->spa_import_flags = flags; 6054 6055 /* 6056 * Verbatim import - Take a pool and insert it into the namespace 6057 * as if it had been loaded at boot. 6058 */ 6059 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 6060 if (props != NULL) 6061 spa_configfile_set(spa, props, B_FALSE); 6062 6063 spa_write_cachefile(spa, B_FALSE, B_TRUE); 6064 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); 6065 zfs_dbgmsg("spa_import: verbatim import of %s", pool); 6066 mutex_exit(&spa_namespace_lock); 6067 return (0); 6068 } 6069 6070 spa_activate(spa, mode); 6071 6072 /* 6073 * Don't start async tasks until we know everything is healthy. 
6074 */ 6075 spa_async_suspend(spa); 6076 6077 zpool_get_load_policy(config, &policy); 6078 if (policy.zlp_rewind & ZPOOL_DO_REWIND) 6079 state = SPA_LOAD_RECOVER; 6080 6081 spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT; 6082 6083 if (state != SPA_LOAD_RECOVER) { 6084 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 6085 zfs_dbgmsg("spa_import: importing %s", pool); 6086 } else { 6087 zfs_dbgmsg("spa_import: importing %s, max_txg=%lld " 6088 "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg); 6089 } 6090 error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind); 6091 6092 /* 6093 * Propagate anything learned while loading the pool and pass it 6094 * back to caller (i.e. rewind info, missing devices, etc). 6095 */ 6096 fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info); 6097 6098 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6099 /* 6100 * Toss any existing sparelist, as it doesn't have any validity 6101 * anymore, and conflicts with spa_has_spare(). 6102 */ 6103 if (spa->spa_spares.sav_config) { 6104 nvlist_free(spa->spa_spares.sav_config); 6105 spa->spa_spares.sav_config = NULL; 6106 spa_load_spares(spa); 6107 } 6108 if (spa->spa_l2cache.sav_config) { 6109 nvlist_free(spa->spa_l2cache.sav_config); 6110 spa->spa_l2cache.sav_config = NULL; 6111 spa_load_l2cache(spa); 6112 } 6113 6114 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 6115 spa_config_exit(spa, SCL_ALL, FTAG); 6116 6117 if (props != NULL) 6118 spa_configfile_set(spa, props, B_FALSE); 6119 6120 if (error != 0 || (props && spa_writeable(spa) && 6121 (error = spa_prop_set(spa, props)))) { 6122 spa_unload(spa); 6123 spa_deactivate(spa); 6124 spa_remove(spa); 6125 mutex_exit(&spa_namespace_lock); 6126 return (error); 6127 } 6128 6129 spa_async_resume(spa); 6130 6131 /* 6132 * Override any spares and level 2 cache devices as specified by 6133 * the user, as these may have correct device names/devids, etc. 6134 */ 6135 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 6136 &spares, &nspares) == 0) { 6137 if (spa->spa_spares.sav_config) 6138 fnvlist_remove(spa->spa_spares.sav_config, 6139 ZPOOL_CONFIG_SPARES); 6140 else 6141 spa->spa_spares.sav_config = fnvlist_alloc(); 6142 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 6143 ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares, 6144 nspares); 6145 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6146 spa_load_spares(spa); 6147 spa_config_exit(spa, SCL_ALL, FTAG); 6148 spa->spa_spares.sav_sync = B_TRUE; 6149 } 6150 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 6151 &l2cache, &nl2cache) == 0) { 6152 if (spa->spa_l2cache.sav_config) 6153 fnvlist_remove(spa->spa_l2cache.sav_config, 6154 ZPOOL_CONFIG_L2CACHE); 6155 else 6156 spa->spa_l2cache.sav_config = fnvlist_alloc(); 6157 fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 6158 ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache, 6159 nl2cache); 6160 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6161 spa_load_l2cache(spa); 6162 spa_config_exit(spa, SCL_ALL, FTAG); 6163 spa->spa_l2cache.sav_sync = B_TRUE; 6164 } 6165 6166 /* 6167 * Check for any removed devices. 6168 */ 6169 if (spa->spa_autoreplace) { 6170 spa_aux_check_removed(&spa->spa_spares); 6171 spa_aux_check_removed(&spa->spa_l2cache); 6172 } 6173 6174 if (spa_writeable(spa)) { 6175 /* 6176 * Update the config cache to include the newly-imported pool. 
6177 */ 6178 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 6179 } 6180 6181 /* 6182 * It's possible that the pool was expanded while it was exported. 6183 * We kick off an async task to handle this for us. 6184 */ 6185 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 6186 6187 spa_history_log_version(spa, "import", NULL); 6188 6189 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); 6190 6191 mutex_exit(&spa_namespace_lock); 6192 6193 zvol_create_minors_recursive(pool); 6194 6195 return (0); 6196 } 6197 6198 nvlist_t * 6199 spa_tryimport(nvlist_t *tryconfig) 6200 { 6201 nvlist_t *config = NULL; 6202 char *poolname, *cachefile; 6203 spa_t *spa; 6204 uint64_t state; 6205 int error; 6206 zpool_load_policy_t policy; 6207 6208 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 6209 return (NULL); 6210 6211 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 6212 return (NULL); 6213 6214 /* 6215 * Create and initialize the spa structure. 6216 */ 6217 mutex_enter(&spa_namespace_lock); 6218 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 6219 spa_activate(spa, SPA_MODE_READ); 6220 6221 /* 6222 * Rewind pool if a max txg was provided. 6223 */ 6224 zpool_get_load_policy(spa->spa_config, &policy); 6225 if (policy.zlp_txg != UINT64_MAX) { 6226 spa->spa_load_max_txg = policy.zlp_txg; 6227 spa->spa_extreme_rewind = B_TRUE; 6228 zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld", 6229 poolname, (longlong_t)policy.zlp_txg); 6230 } else { 6231 zfs_dbgmsg("spa_tryimport: importing %s", poolname); 6232 } 6233 6234 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile) 6235 == 0) { 6236 zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile); 6237 spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; 6238 } else { 6239 spa->spa_config_source = SPA_CONFIG_SRC_SCAN; 6240 } 6241 6242 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING); 6243 6244 /* 6245 * If 'tryconfig' was at least parsable, return the current config. 6246 */ 6247 if (spa->spa_root_vdev != NULL) { 6248 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 6249 fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, poolname); 6250 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state); 6251 fnvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 6252 spa->spa_uberblock.ub_timestamp); 6253 fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 6254 spa->spa_load_info); 6255 fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, 6256 spa->spa_errata); 6257 6258 /* 6259 * If the bootfs property exists on this pool then we 6260 * copy it out so that external consumers can tell which 6261 * pools are bootable. 6262 */ 6263 if ((!error || error == EEXIST) && spa->spa_bootfs) { 6264 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 6265 6266 /* 6267 * We have to play games with the name since the 6268 * pool was opened as TRYIMPORT_NAME. 6269 */ 6270 if (dsl_dsobj_to_dsname(spa_name(spa), 6271 spa->spa_bootfs, tmpname) == 0) { 6272 char *cp; 6273 char *dsname; 6274 6275 dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 6276 6277 cp = strchr(tmpname, '/'); 6278 if (cp == NULL) { 6279 (void) strlcpy(dsname, tmpname, 6280 MAXPATHLEN); 6281 } else { 6282 (void) snprintf(dsname, MAXPATHLEN, 6283 "%s/%s", poolname, ++cp); 6284 } 6285 fnvlist_add_string(config, ZPOOL_CONFIG_BOOTFS, 6286 dsname); 6287 kmem_free(dsname, MAXPATHLEN); 6288 } 6289 kmem_free(tmpname, MAXPATHLEN); 6290 } 6291 6292 /* 6293 * Add the list of hot spares and level 2 cache devices. 
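/*
 * Illustrative sketch: rewriting the pool component of the bootfs dataset
 * name, as done above because the pool was opened under TRYIMPORT_NAME.
 * rewrite_pool_component() is hypothetical; the real code uses MAXPATHLEN
 * buffers and the same strchr()/snprintf() logic.
 */
#include <stdio.h>
#include <string.h>

/* e.g. rewrite_pool_component(buf, sizeof (buf), "$import/ROOT/a", "tank") */
static void
rewrite_pool_component(char *out, size_t outlen,
    const char *dsname, const char *poolname)
{
	const char *slash = strchr(dsname, '/');

	if (slash == NULL)
		(void) snprintf(out, outlen, "%s", dsname);
	else
		(void) snprintf(out, outlen, "%s/%s", poolname, slash + 1);
}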
6294 */ 6295 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6296 spa_add_spares(spa, config); 6297 spa_add_l2cache(spa, config); 6298 spa_config_exit(spa, SCL_CONFIG, FTAG); 6299 } 6300 6301 spa_unload(spa); 6302 spa_deactivate(spa); 6303 spa_remove(spa); 6304 mutex_exit(&spa_namespace_lock); 6305 6306 return (config); 6307 } 6308 6309 /* 6310 * Pool export/destroy 6311 * 6312 * The act of destroying or exporting a pool is very simple. We make sure there 6313 * is no more pending I/O and any references to the pool are gone. Then, we 6314 * update the pool state and sync all the labels to disk, removing the 6315 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 6316 * we don't sync the labels or remove the configuration cache. 6317 */ 6318 static int 6319 spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, 6320 boolean_t force, boolean_t hardforce) 6321 { 6322 int error; 6323 spa_t *spa; 6324 6325 if (oldconfig) 6326 *oldconfig = NULL; 6327 6328 if (!(spa_mode_global & SPA_MODE_WRITE)) 6329 return (SET_ERROR(EROFS)); 6330 6331 mutex_enter(&spa_namespace_lock); 6332 if ((spa = spa_lookup(pool)) == NULL) { 6333 mutex_exit(&spa_namespace_lock); 6334 return (SET_ERROR(ENOENT)); 6335 } 6336 6337 if (spa->spa_is_exporting) { 6338 /* the pool is being exported by another thread */ 6339 mutex_exit(&spa_namespace_lock); 6340 return (SET_ERROR(ZFS_ERR_EXPORT_IN_PROGRESS)); 6341 } 6342 spa->spa_is_exporting = B_TRUE; 6343 6344 /* 6345 * Put a hold on the pool, drop the namespace lock, stop async tasks, 6346 * reacquire the namespace lock, and see if we can export. 6347 */ 6348 spa_open_ref(spa, FTAG); 6349 mutex_exit(&spa_namespace_lock); 6350 spa_async_suspend(spa); 6351 if (spa->spa_zvol_taskq) { 6352 zvol_remove_minors(spa, spa_name(spa), B_TRUE); 6353 taskq_wait(spa->spa_zvol_taskq); 6354 } 6355 mutex_enter(&spa_namespace_lock); 6356 spa_close(spa, FTAG); 6357 6358 if (spa->spa_state == POOL_STATE_UNINITIALIZED) 6359 goto export_spa; 6360 /* 6361 * The pool will be in core if it's openable, in which case we can 6362 * modify its state. Objsets may be open only because they're dirty, 6363 * so we have to force it to sync before checking spa_refcnt. 6364 */ 6365 if (spa->spa_sync_on) { 6366 txg_wait_synced(spa->spa_dsl_pool, 0); 6367 spa_evicting_os_wait(spa); 6368 } 6369 6370 /* 6371 * A pool cannot be exported or destroyed if there are active 6372 * references. If we are resetting a pool, allow references by 6373 * fault injection handlers. 6374 */ 6375 if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0)) { 6376 error = SET_ERROR(EBUSY); 6377 goto fail; 6378 } 6379 6380 if (spa->spa_sync_on) { 6381 /* 6382 * A pool cannot be exported if it has an active shared spare. 6383 * This is to prevent other pools stealing the active spare 6384 * from an exported pool. At user's own will, such pool can 6385 * be forcedly exported. 6386 */ 6387 if (!force && new_state == POOL_STATE_EXPORTED && 6388 spa_has_active_shared_spare(spa)) { 6389 error = SET_ERROR(EXDEV); 6390 goto fail; 6391 } 6392 6393 /* 6394 * We're about to export or destroy this pool. Make sure 6395 * we stop all initialization and trim activity here before 6396 * we set the spa_final_txg. This will ensure that all 6397 * dirty data resulting from the initialization is 6398 * committed to disk before we unload the pool. 
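/*
 * Illustrative sketch: the single-exporter guard used above, modeled with
 * pthreads.  pool_guard_t and DEMO_EXPORT_IN_PROGRESS are hypothetical
 * stand-ins for the spa_t flag and ZFS_ERR_EXPORT_IN_PROGRESS; the flag is
 * only tested and set while the namespace lock is held, and is cleared
 * again on the failure path.
 */
#include <pthread.h>
#include <stdbool.h>

#define	DEMO_EXPORT_IN_PROGRESS	16	/* arbitrary stand-in value */

typedef struct pool_guard {
	pthread_mutex_t	pg_namespace_lock;
	bool		pg_is_exporting;
} pool_guard_t;

static int
pool_begin_export(pool_guard_t *pg)
{
	pthread_mutex_lock(&pg->pg_namespace_lock);
	if (pg->pg_is_exporting) {
		pthread_mutex_unlock(&pg->pg_namespace_lock);
		return (DEMO_EXPORT_IN_PROGRESS);
	}
	pg->pg_is_exporting = true;
	pthread_mutex_unlock(&pg->pg_namespace_lock);
	return (0);
}

static void
pool_end_export(pool_guard_t *pg)
{
	pthread_mutex_lock(&pg->pg_namespace_lock);
	pg->pg_is_exporting = false;
	pthread_mutex_unlock(&pg->pg_namespace_lock);
}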
6399 */ 6400 if (spa->spa_root_vdev != NULL) { 6401 vdev_t *rvd = spa->spa_root_vdev; 6402 vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE); 6403 vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE); 6404 vdev_autotrim_stop_all(spa); 6405 vdev_rebuild_stop_all(spa); 6406 } 6407 6408 /* 6409 * We want this to be reflected on every label, 6410 * so mark them all dirty. spa_unload() will do the 6411 * final sync that pushes these changes out. 6412 */ 6413 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 6414 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6415 spa->spa_state = new_state; 6416 spa->spa_final_txg = spa_last_synced_txg(spa) + 6417 TXG_DEFER_SIZE + 1; 6418 vdev_config_dirty(spa->spa_root_vdev); 6419 spa_config_exit(spa, SCL_ALL, FTAG); 6420 } 6421 } 6422 6423 export_spa: 6424 if (new_state == POOL_STATE_DESTROYED) 6425 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY); 6426 else if (new_state == POOL_STATE_EXPORTED) 6427 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_EXPORT); 6428 6429 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 6430 spa_unload(spa); 6431 spa_deactivate(spa); 6432 } 6433 6434 if (oldconfig && spa->spa_config) 6435 *oldconfig = fnvlist_dup(spa->spa_config); 6436 6437 if (new_state != POOL_STATE_UNINITIALIZED) { 6438 if (!hardforce) 6439 spa_write_cachefile(spa, B_TRUE, B_TRUE); 6440 spa_remove(spa); 6441 } else { 6442 /* 6443 * If spa_remove() is not called for this spa_t and 6444 * there is any possibility that it can be reused, 6445 * we make sure to reset the exporting flag. 6446 */ 6447 spa->spa_is_exporting = B_FALSE; 6448 } 6449 6450 mutex_exit(&spa_namespace_lock); 6451 return (0); 6452 6453 fail: 6454 spa->spa_is_exporting = B_FALSE; 6455 spa_async_resume(spa); 6456 mutex_exit(&spa_namespace_lock); 6457 return (error); 6458 } 6459 6460 /* 6461 * Destroy a storage pool. 6462 */ 6463 int 6464 spa_destroy(const char *pool) 6465 { 6466 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 6467 B_FALSE, B_FALSE)); 6468 } 6469 6470 /* 6471 * Export a storage pool. 6472 */ 6473 int 6474 spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force, 6475 boolean_t hardforce) 6476 { 6477 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 6478 force, hardforce)); 6479 } 6480 6481 /* 6482 * Similar to spa_export(), this unloads the spa_t without actually removing it 6483 * from the namespace in any way. 6484 */ 6485 int 6486 spa_reset(const char *pool) 6487 { 6488 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 6489 B_FALSE, B_FALSE)); 6490 } 6491 6492 /* 6493 * ========================================================================== 6494 * Device manipulation 6495 * ========================================================================== 6496 */ 6497 6498 /* 6499 * This is called as a synctask to increment the draid feature flag 6500 */ 6501 static void 6502 spa_draid_feature_incr(void *arg, dmu_tx_t *tx) 6503 { 6504 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6505 int draid = (int)(uintptr_t)arg; 6506 6507 for (int c = 0; c < draid; c++) 6508 spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); 6509 } 6510 6511 /* 6512 * Add a device to a storage pool. 
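/*
 * Illustrative sketch: passing a small counter through a void * callback
 * argument, as spa_draid_feature_incr() above receives the dRAID count from
 * dsl_sync_task_nowait().  run_task() is hypothetical; the round trip
 * through uintptr_t is the portable way to carry an integer in a
 * pointer-sized argument.
 */
#include <stdint.h>
#include <stdio.h>

static void
incr_feature_cb(void *arg)
{
	int count = (int)(uintptr_t)arg;

	for (int c = 0; c < count; c++)
		(void) printf("feature refcount +1 (%d of %d)\n", c + 1, count);
}

static void
run_task(void (*func)(void *), void *arg)
{
	func(arg);	/* stands in for the synctask machinery */
}

/* Usage: run_task(incr_feature_cb, (void *)(uintptr_t)ndraid); */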
6513 */ 6514 int 6515 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 6516 { 6517 uint64_t txg, ndraid = 0; 6518 int error; 6519 vdev_t *rvd = spa->spa_root_vdev; 6520 vdev_t *vd, *tvd; 6521 nvlist_t **spares, **l2cache; 6522 uint_t nspares, nl2cache; 6523 6524 ASSERT(spa_writeable(spa)); 6525 6526 txg = spa_vdev_enter(spa); 6527 6528 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 6529 VDEV_ALLOC_ADD)) != 0) 6530 return (spa_vdev_exit(spa, NULL, txg, error)); 6531 6532 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 6533 6534 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 6535 &nspares) != 0) 6536 nspares = 0; 6537 6538 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 6539 &nl2cache) != 0) 6540 nl2cache = 0; 6541 6542 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 6543 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 6544 6545 if (vd->vdev_children != 0 && 6546 (error = vdev_create(vd, txg, B_FALSE)) != 0) { 6547 return (spa_vdev_exit(spa, vd, txg, error)); 6548 } 6549 6550 /* 6551 * The virtual dRAID spares must be added after vdev tree is created 6552 * and the vdev guids are generated. The guid of their associated 6553 * dRAID is stored in the config and used when opening the spare. 6554 */ 6555 if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid, 6556 rvd->vdev_children)) == 0) { 6557 if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot, 6558 ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) 6559 nspares = 0; 6560 } else { 6561 return (spa_vdev_exit(spa, vd, txg, error)); 6562 } 6563 6564 /* 6565 * We must validate the spares and l2cache devices after checking the 6566 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 6567 */ 6568 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 6569 return (spa_vdev_exit(spa, vd, txg, error)); 6570 6571 /* 6572 * If we are in the middle of a device removal, we can only add 6573 * devices which match the existing devices in the pool. 6574 * If we are in the middle of a removal, or have some indirect 6575 * vdevs, we can not add raidz or dRAID top levels. 
6576 */ 6577 if (spa->spa_vdev_removal != NULL || 6578 spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { 6579 for (int c = 0; c < vd->vdev_children; c++) { 6580 tvd = vd->vdev_child[c]; 6581 if (spa->spa_vdev_removal != NULL && 6582 tvd->vdev_ashift != spa->spa_max_ashift) { 6583 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 6584 } 6585 /* Fail if top level vdev is raidz or a dRAID */ 6586 if (vdev_get_nparity(tvd) != 0) 6587 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 6588 6589 /* 6590 * Need the top level mirror to be 6591 * a mirror of leaf vdevs only 6592 */ 6593 if (tvd->vdev_ops == &vdev_mirror_ops) { 6594 for (uint64_t cid = 0; 6595 cid < tvd->vdev_children; cid++) { 6596 vdev_t *cvd = tvd->vdev_child[cid]; 6597 if (!cvd->vdev_ops->vdev_op_leaf) { 6598 return (spa_vdev_exit(spa, vd, 6599 txg, EINVAL)); 6600 } 6601 } 6602 } 6603 } 6604 } 6605 6606 for (int c = 0; c < vd->vdev_children; c++) { 6607 tvd = vd->vdev_child[c]; 6608 vdev_remove_child(vd, tvd); 6609 tvd->vdev_id = rvd->vdev_children; 6610 vdev_add_child(rvd, tvd); 6611 vdev_config_dirty(tvd); 6612 } 6613 6614 if (nspares != 0) { 6615 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 6616 ZPOOL_CONFIG_SPARES); 6617 spa_load_spares(spa); 6618 spa->spa_spares.sav_sync = B_TRUE; 6619 } 6620 6621 if (nl2cache != 0) { 6622 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 6623 ZPOOL_CONFIG_L2CACHE); 6624 spa_load_l2cache(spa); 6625 spa->spa_l2cache.sav_sync = B_TRUE; 6626 } 6627 6628 /* 6629 * We can't increment a feature while holding spa_vdev so we 6630 * have to do it in a synctask. 6631 */ 6632 if (ndraid != 0) { 6633 dmu_tx_t *tx; 6634 6635 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 6636 dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr, 6637 (void *)(uintptr_t)ndraid, tx); 6638 dmu_tx_commit(tx); 6639 } 6640 6641 /* 6642 * We have to be careful when adding new vdevs to an existing pool. 6643 * If other threads start allocating from these vdevs before we 6644 * sync the config cache, and we lose power, then upon reboot we may 6645 * fail to open the pool because there are DVAs that the config cache 6646 * can't translate. Therefore, we first add the vdevs without 6647 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 6648 * and then let spa_config_update() initialize the new metaslabs. 6649 * 6650 * spa_load() checks for added-but-not-initialized vdevs, so that 6651 * if we lose power at any point in this sequence, the remaining 6652 * steps will be completed the next time we load the pool. 6653 */ 6654 (void) spa_vdev_exit(spa, vd, txg, 0); 6655 6656 mutex_enter(&spa_namespace_lock); 6657 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 6658 spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD); 6659 mutex_exit(&spa_namespace_lock); 6660 6661 return (0); 6662 } 6663 6664 /* 6665 * Attach a device to a mirror. The arguments are the path to any device 6666 * in the mirror, and the nvroot for the new device. If the path specifies 6667 * a device that is not mirrored, we automatically insert the mirror vdev. 
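/*
 * Illustrative sketch: the "mirror of leaf vdevs only" rule enforced above,
 * modeled on a tiny tree type.  demo_vdev_t is hypothetical; in the real
 * code the leaf test is cvd->vdev_ops->vdev_op_leaf on each child of the
 * top-level mirror.
 */
#include <stdbool.h>
#include <stddef.h>

typedef struct demo_vdev {
	bool			dv_is_leaf;
	bool			dv_is_mirror;
	struct demo_vdev	**dv_children;
	size_t			dv_nchildren;
} demo_vdev_t;

static bool
mirror_has_only_leaf_children(const demo_vdev_t *tvd)
{
	if (!tvd->dv_is_mirror)
		return (true);		/* rule only applies to mirrors */
	for (size_t i = 0; i < tvd->dv_nchildren; i++) {
		if (!tvd->dv_children[i]->dv_is_leaf)
			return (false);	/* e.g. a mirror of mirrors: reject */
	}
	return (true);
}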
6668 * 6669 * If 'replacing' is specified, the new device is intended to replace the 6670 * existing device; in this case the two devices are made into their own 6671 * mirror using the 'replacing' vdev, which is functionally identical to 6672 * the mirror vdev (it actually reuses all the same ops) but has a few 6673 * extra rules: you can't attach to it after it's been created, and upon 6674 * completion of resilvering, the first disk (the one being replaced) 6675 * is automatically detached. 6676 * 6677 * If 'rebuild' is specified, then sequential reconstruction (a.ka. rebuild) 6678 * should be performed instead of traditional healing reconstruction. From 6679 * an administrators perspective these are both resilver operations. 6680 */ 6681 int 6682 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, 6683 int rebuild) 6684 { 6685 uint64_t txg, dtl_max_txg; 6686 vdev_t *rvd = spa->spa_root_vdev; 6687 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 6688 vdev_ops_t *pvops; 6689 char *oldvdpath, *newvdpath; 6690 int newvd_isspare; 6691 int error; 6692 6693 ASSERT(spa_writeable(spa)); 6694 6695 txg = spa_vdev_enter(spa); 6696 6697 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 6698 6699 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 6700 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 6701 error = (spa_has_checkpoint(spa)) ? 6702 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 6703 return (spa_vdev_exit(spa, NULL, txg, error)); 6704 } 6705 6706 if (rebuild) { 6707 if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) 6708 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 6709 6710 if (dsl_scan_resilvering(spa_get_dsl(spa))) 6711 return (spa_vdev_exit(spa, NULL, txg, 6712 ZFS_ERR_RESILVER_IN_PROGRESS)); 6713 } else { 6714 if (vdev_rebuild_active(rvd)) 6715 return (spa_vdev_exit(spa, NULL, txg, 6716 ZFS_ERR_REBUILD_IN_PROGRESS)); 6717 } 6718 6719 if (spa->spa_vdev_removal != NULL) 6720 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 6721 6722 if (oldvd == NULL) 6723 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 6724 6725 if (!oldvd->vdev_ops->vdev_op_leaf) 6726 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 6727 6728 pvd = oldvd->vdev_parent; 6729 6730 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 6731 VDEV_ALLOC_ATTACH)) != 0) 6732 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 6733 6734 if (newrootvd->vdev_children != 1) 6735 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 6736 6737 newvd = newrootvd->vdev_child[0]; 6738 6739 if (!newvd->vdev_ops->vdev_op_leaf) 6740 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 6741 6742 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 6743 return (spa_vdev_exit(spa, newrootvd, txg, error)); 6744 6745 /* 6746 * Spares can't replace logs 6747 */ 6748 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 6749 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6750 6751 /* 6752 * A dRAID spare can only replace a child of its parent dRAID vdev. 6753 */ 6754 if (newvd->vdev_ops == &vdev_draid_spare_ops && 6755 oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) { 6756 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6757 } 6758 6759 if (rebuild) { 6760 /* 6761 * For rebuilds, the top vdev must support reconstruction 6762 * using only space maps. This means the only allowable 6763 * vdevs types are the root vdev, a mirror, or dRAID. 
6764 */ 6765 tvd = pvd; 6766 if (pvd->vdev_top != NULL) 6767 tvd = pvd->vdev_top; 6768 6769 if (tvd->vdev_ops != &vdev_mirror_ops && 6770 tvd->vdev_ops != &vdev_root_ops && 6771 tvd->vdev_ops != &vdev_draid_ops) { 6772 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6773 } 6774 } 6775 6776 if (!replacing) { 6777 /* 6778 * For attach, the only allowable parent is a mirror or the root 6779 * vdev. 6780 */ 6781 if (pvd->vdev_ops != &vdev_mirror_ops && 6782 pvd->vdev_ops != &vdev_root_ops) 6783 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6784 6785 pvops = &vdev_mirror_ops; 6786 } else { 6787 /* 6788 * Active hot spares can only be replaced by inactive hot 6789 * spares. 6790 */ 6791 if (pvd->vdev_ops == &vdev_spare_ops && 6792 oldvd->vdev_isspare && 6793 !spa_has_spare(spa, newvd->vdev_guid)) 6794 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6795 6796 /* 6797 * If the source is a hot spare, and the parent isn't already a 6798 * spare, then we want to create a new hot spare. Otherwise, we 6799 * want to create a replacing vdev. The user is not allowed to 6800 * attach to a spared vdev child unless the 'isspare' state is 6801 * the same (spare replaces spare, non-spare replaces 6802 * non-spare). 6803 */ 6804 if (pvd->vdev_ops == &vdev_replacing_ops && 6805 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 6806 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6807 } else if (pvd->vdev_ops == &vdev_spare_ops && 6808 newvd->vdev_isspare != oldvd->vdev_isspare) { 6809 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6810 } 6811 6812 if (newvd->vdev_isspare) 6813 pvops = &vdev_spare_ops; 6814 else 6815 pvops = &vdev_replacing_ops; 6816 } 6817 6818 /* 6819 * Make sure the new device is big enough. 6820 */ 6821 if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 6822 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 6823 6824 /* 6825 * The new device cannot have a higher alignment requirement 6826 * than the top-level vdev. 6827 */ 6828 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 6829 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6830 6831 /* 6832 * If this is an in-place replacement, update oldvd's path and devid 6833 * to make it distinguishable from newvd, and unopenable from now on. 6834 */ 6835 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 6836 spa_strfree(oldvd->vdev_path); 6837 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 6838 KM_SLEEP); 6839 (void) snprintf(oldvd->vdev_path, strlen(newvd->vdev_path) + 5, 6840 "%s/%s", newvd->vdev_path, "old"); 6841 if (oldvd->vdev_devid != NULL) { 6842 spa_strfree(oldvd->vdev_devid); 6843 oldvd->vdev_devid = NULL; 6844 } 6845 } 6846 6847 /* 6848 * If the parent is not a mirror, or if we're replacing, insert the new 6849 * mirror/replacing/spare vdev above oldvd. 6850 */ 6851 if (pvd->vdev_ops != pvops) 6852 pvd = vdev_add_parent(oldvd, pvops); 6853 6854 ASSERT(pvd->vdev_top->vdev_parent == rvd); 6855 ASSERT(pvd->vdev_ops == pvops); 6856 ASSERT(oldvd->vdev_parent == pvd); 6857 6858 /* 6859 * Extract the new device from its root and add it to pvd. 6860 */ 6861 vdev_remove_child(newrootvd, newvd); 6862 newvd->vdev_id = pvd->vdev_children; 6863 newvd->vdev_crtxg = oldvd->vdev_crtxg; 6864 vdev_add_child(pvd, newvd); 6865 6866 /* 6867 * Reevaluate the parent vdev state. 
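/*
 * Illustrative sketch: building the "<path>/old" name used above to keep an
 * in-place replacement's old device distinguishable and unopenable.
 * make_old_path() is hypothetical; the "+ 5" in the original covers the
 * four characters of "/old" plus the terminating NUL.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *
make_old_path(const char *newpath)
{
	size_t len = strlen(newpath) + 5;	/* "/old" + NUL */
	char *oldpath = malloc(len);

	if (oldpath != NULL)
		(void) snprintf(oldpath, len, "%s/%s", newpath, "old");
	return (oldpath);	/* "/dev/sda1" -> "/dev/sda1/old" */
}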
6868 */ 6869 vdev_propagate_state(pvd); 6870 6871 tvd = newvd->vdev_top; 6872 ASSERT(pvd->vdev_top == tvd); 6873 ASSERT(tvd->vdev_parent == rvd); 6874 6875 vdev_config_dirty(tvd); 6876 6877 /* 6878 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 6879 * for any dmu_sync-ed blocks. It will propagate upward when 6880 * spa_vdev_exit() calls vdev_dtl_reassess(). 6881 */ 6882 dtl_max_txg = txg + TXG_CONCURRENT_STATES; 6883 6884 vdev_dtl_dirty(newvd, DTL_MISSING, 6885 TXG_INITIAL, dtl_max_txg - TXG_INITIAL); 6886 6887 if (newvd->vdev_isspare) { 6888 spa_spare_activate(newvd); 6889 spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); 6890 } 6891 6892 oldvdpath = spa_strdup(oldvd->vdev_path); 6893 newvdpath = spa_strdup(newvd->vdev_path); 6894 newvd_isspare = newvd->vdev_isspare; 6895 6896 /* 6897 * Mark newvd's DTL dirty in this txg. 6898 */ 6899 vdev_dirty(tvd, VDD_DTL, newvd, txg); 6900 6901 /* 6902 * Schedule the resilver or rebuild to restart in the future. We do 6903 * this to ensure that dmu_sync-ed blocks have been stitched into the 6904 * respective datasets. 6905 */ 6906 if (rebuild) { 6907 newvd->vdev_rebuild_txg = txg; 6908 6909 vdev_rebuild(tvd); 6910 } else { 6911 newvd->vdev_resilver_txg = txg; 6912 6913 if (dsl_scan_resilvering(spa_get_dsl(spa)) && 6914 spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) { 6915 vdev_defer_resilver(newvd); 6916 } else { 6917 dsl_scan_restart_resilver(spa->spa_dsl_pool, 6918 dtl_max_txg); 6919 } 6920 } 6921 6922 if (spa->spa_bootfs) 6923 spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); 6924 6925 spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH); 6926 6927 /* 6928 * Commit the config 6929 */ 6930 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 6931 6932 spa_history_log_internal(spa, "vdev attach", NULL, 6933 "%s vdev=%s %s vdev=%s", 6934 replacing && newvd_isspare ? "spare in" : 6935 replacing ? "replace" : "attach", newvdpath, 6936 replacing ? "for" : "to", oldvdpath); 6937 6938 spa_strfree(oldvdpath); 6939 spa_strfree(newvdpath); 6940 6941 return (0); 6942 } 6943 6944 /* 6945 * Detach a device from a mirror or replacing vdev. 6946 * 6947 * If 'replace_done' is specified, only detach if the parent 6948 * is a replacing vdev. 6949 */ 6950 int 6951 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 6952 { 6953 uint64_t txg; 6954 int error; 6955 vdev_t *rvd __maybe_unused = spa->spa_root_vdev; 6956 vdev_t *vd, *pvd, *cvd, *tvd; 6957 boolean_t unspare = B_FALSE; 6958 uint64_t unspare_guid = 0; 6959 char *vdpath; 6960 6961 ASSERT(spa_writeable(spa)); 6962 6963 txg = spa_vdev_detach_enter(spa, guid); 6964 6965 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 6966 6967 /* 6968 * Besides being called directly from the userland through the 6969 * ioctl interface, spa_vdev_detach() can be potentially called 6970 * at the end of spa_vdev_resilver_done(). 6971 * 6972 * In the regular case, when we have a checkpoint this shouldn't 6973 * happen as we never empty the DTLs of a vdev during the scrub 6974 * [see comment in dsl_scan_done()]. Thus spa_vdev_resilvering_done() 6975 * should never get here when we have a checkpoint. 6976 * 6977 * That said, even in a case when we checkpoint the pool exactly 6978 * as spa_vdev_resilver_done() calls this function everything 6979 * should be fine as the resilver will return right away. 6980 */ 6981 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 6982 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 6983 error = (spa_has_checkpoint(spa)) ? 
6984 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 6985 return (spa_vdev_exit(spa, NULL, txg, error)); 6986 } 6987 6988 if (vd == NULL) 6989 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 6990 6991 if (!vd->vdev_ops->vdev_op_leaf) 6992 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 6993 6994 pvd = vd->vdev_parent; 6995 6996 /* 6997 * If the parent/child relationship is not as expected, don't do it. 6998 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 6999 * vdev that's replacing B with C. The user's intent in replacing 7000 * is to go from M(A,B) to M(A,C). If the user decides to cancel 7001 * the replace by detaching C, the expected behavior is to end up 7002 * M(A,B). But suppose that right after deciding to detach C, 7003 * the replacement of B completes. We would have M(A,C), and then 7004 * ask to detach C, which would leave us with just A -- not what 7005 * the user wanted. To prevent this, we make sure that the 7006 * parent/child relationship hasn't changed -- in this example, 7007 * that C's parent is still the replacing vdev R. 7008 */ 7009 if (pvd->vdev_guid != pguid && pguid != 0) 7010 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 7011 7012 /* 7013 * Only 'replacing' or 'spare' vdevs can be replaced. 7014 */ 7015 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 7016 pvd->vdev_ops != &vdev_spare_ops) 7017 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7018 7019 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 7020 spa_version(spa) >= SPA_VERSION_SPARES); 7021 7022 /* 7023 * Only mirror, replacing, and spare vdevs support detach. 7024 */ 7025 if (pvd->vdev_ops != &vdev_replacing_ops && 7026 pvd->vdev_ops != &vdev_mirror_ops && 7027 pvd->vdev_ops != &vdev_spare_ops) 7028 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7029 7030 /* 7031 * If this device has the only valid copy of some data, 7032 * we cannot safely detach it. 7033 */ 7034 if (vdev_dtl_required(vd)) 7035 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 7036 7037 ASSERT(pvd->vdev_children >= 2); 7038 7039 /* 7040 * If we are detaching the second disk from a replacing vdev, then 7041 * check to see if we changed the original vdev's path to have "/old" 7042 * at the end in spa_vdev_attach(). If so, undo that change now. 7043 */ 7044 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 7045 vd->vdev_path != NULL) { 7046 size_t len = strlen(vd->vdev_path); 7047 7048 for (int c = 0; c < pvd->vdev_children; c++) { 7049 cvd = pvd->vdev_child[c]; 7050 7051 if (cvd == vd || cvd->vdev_path == NULL) 7052 continue; 7053 7054 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 7055 strcmp(cvd->vdev_path + len, "/old") == 0) { 7056 spa_strfree(cvd->vdev_path); 7057 cvd->vdev_path = spa_strdup(vd->vdev_path); 7058 break; 7059 } 7060 } 7061 } 7062 7063 /* 7064 * If we are detaching the original disk from a normal spare, then it 7065 * implies that the spare should become a real disk, and be removed 7066 * from the active spare list for the pool. dRAID spares on the 7067 * other hand are coupled to the pool and thus should never be removed 7068 * from the spares list. 7069 */ 7070 if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) { 7071 vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1]; 7072 7073 if (last_cvd->vdev_isspare && 7074 last_cvd->vdev_ops != &vdev_draid_spare_ops) { 7075 unspare = B_TRUE; 7076 } 7077 } 7078 7079 /* 7080 * Erase the disk labels so the disk can be used for other things. 
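/*
 * Illustrative sketch: the parent-GUID check described in the M(A,R(B,C))
 * example above, modeled on a minimal struct.  The caller that recorded the
 * expected parent GUID earlier passes it back in; 0 means "don't care",
 * anything else must still match or the detach is refused (EBUSY in the
 * real code).
 */
#include <stdint.h>
#include <stdbool.h>

typedef struct demo_leaf {
	uint64_t	dl_guid;
	uint64_t	dl_parent_guid;
} demo_leaf_t;

static bool
detach_parent_still_matches(const demo_leaf_t *leaf, uint64_t expected_pguid)
{
	if (expected_pguid == 0)
		return (true);		/* caller did not pin the parent */
	return (leaf->dl_parent_guid == expected_pguid);
}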
7081 * This must be done after all other error cases are handled, 7082 * but before we disembowel vd (so we can still do I/O to it). 7083 * But if we can't do it, don't treat the error as fatal -- 7084 * it may be that the unwritability of the disk is the reason 7085 * it's being detached! 7086 */ 7087 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 7088 7089 /* 7090 * Remove vd from its parent and compact the parent's children. 7091 */ 7092 vdev_remove_child(pvd, vd); 7093 vdev_compact_children(pvd); 7094 7095 /* 7096 * Remember one of the remaining children so we can get tvd below. 7097 */ 7098 cvd = pvd->vdev_child[pvd->vdev_children - 1]; 7099 7100 /* 7101 * If we need to remove the remaining child from the list of hot spares, 7102 * do it now, marking the vdev as no longer a spare in the process. 7103 * We must do this before vdev_remove_parent(), because that can 7104 * change the GUID if it creates a new toplevel GUID. For a similar 7105 * reason, we must remove the spare now, in the same txg as the detach; 7106 * otherwise someone could attach a new sibling, change the GUID, and 7107 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 7108 */ 7109 if (unspare) { 7110 ASSERT(cvd->vdev_isspare); 7111 spa_spare_remove(cvd); 7112 unspare_guid = cvd->vdev_guid; 7113 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 7114 cvd->vdev_unspare = B_TRUE; 7115 } 7116 7117 /* 7118 * If the parent mirror/replacing vdev only has one child, 7119 * the parent is no longer needed. Remove it from the tree. 7120 */ 7121 if (pvd->vdev_children == 1) { 7122 if (pvd->vdev_ops == &vdev_spare_ops) 7123 cvd->vdev_unspare = B_FALSE; 7124 vdev_remove_parent(cvd); 7125 } 7126 7127 /* 7128 * We don't set tvd until now because the parent we just removed 7129 * may have been the previous top-level vdev. 7130 */ 7131 tvd = cvd->vdev_top; 7132 ASSERT(tvd->vdev_parent == rvd); 7133 7134 /* 7135 * Reevaluate the parent vdev state. 7136 */ 7137 vdev_propagate_state(cvd); 7138 7139 /* 7140 * If the 'autoexpand' property is set on the pool then automatically 7141 * try to expand the size of the pool. For example if the device we 7142 * just detached was smaller than the others, it may be possible to 7143 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 7144 * first so that we can obtain the updated sizes of the leaf vdevs. 7145 */ 7146 if (spa->spa_autoexpand) { 7147 vdev_reopen(tvd); 7148 vdev_expand(tvd, txg); 7149 } 7150 7151 vdev_config_dirty(tvd); 7152 7153 /* 7154 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 7155 * vd->vdev_detached is set and free vd's DTL object in syncing context. 7156 * But first make sure we're not on any *other* txg's DTL list, to 7157 * prevent vd from being accessed after it's freed. 7158 */ 7159 vdpath = spa_strdup(vd->vdev_path ? 
vd->vdev_path : "none"); 7160 for (int t = 0; t < TXG_SIZE; t++) 7161 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 7162 vd->vdev_detached = B_TRUE; 7163 vdev_dirty(tvd, VDD_DTL, vd, txg); 7164 7165 spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE); 7166 spa_notify_waiters(spa); 7167 7168 /* hang on to the spa before we release the lock */ 7169 spa_open_ref(spa, FTAG); 7170 7171 error = spa_vdev_exit(spa, vd, txg, 0); 7172 7173 spa_history_log_internal(spa, "detach", NULL, 7174 "vdev=%s", vdpath); 7175 spa_strfree(vdpath); 7176 7177 /* 7178 * If this was the removal of the original device in a hot spare vdev, 7179 * then we want to go through and remove the device from the hot spare 7180 * list of every other pool. 7181 */ 7182 if (unspare) { 7183 spa_t *altspa = NULL; 7184 7185 mutex_enter(&spa_namespace_lock); 7186 while ((altspa = spa_next(altspa)) != NULL) { 7187 if (altspa->spa_state != POOL_STATE_ACTIVE || 7188 altspa == spa) 7189 continue; 7190 7191 spa_open_ref(altspa, FTAG); 7192 mutex_exit(&spa_namespace_lock); 7193 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 7194 mutex_enter(&spa_namespace_lock); 7195 spa_close(altspa, FTAG); 7196 } 7197 mutex_exit(&spa_namespace_lock); 7198 7199 /* search the rest of the vdevs for spares to remove */ 7200 spa_vdev_resilver_done(spa); 7201 } 7202 7203 /* all done with the spa; OK to release */ 7204 mutex_enter(&spa_namespace_lock); 7205 spa_close(spa, FTAG); 7206 mutex_exit(&spa_namespace_lock); 7207 7208 return (error); 7209 } 7210 7211 static int 7212 spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, 7213 list_t *vd_list) 7214 { 7215 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 7216 7217 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 7218 7219 /* Look up vdev and ensure it's a leaf. */ 7220 vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); 7221 if (vd == NULL || vd->vdev_detached) { 7222 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7223 return (SET_ERROR(ENODEV)); 7224 } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { 7225 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7226 return (SET_ERROR(EINVAL)); 7227 } else if (!vdev_writeable(vd)) { 7228 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7229 return (SET_ERROR(EROFS)); 7230 } 7231 mutex_enter(&vd->vdev_initialize_lock); 7232 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7233 7234 /* 7235 * When we activate an initialize action we check to see 7236 * if the vdev_initialize_thread is NULL. We do this instead 7237 * of using the vdev_initialize_state since there might be 7238 * a previous initialization process which has completed but 7239 * the thread is not exited. 
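/*
 * Illustrative sketch: the walk over every other pool performed above when
 * releasing a shared hot spare.  demo_pool_t and its list are hypothetical;
 * the point is the pattern of taking a reference on each element so the
 * global lock can be dropped while the per-element work (spa_vdev_remove()
 * in the real code) runs, then reacquiring the lock before moving on.
 */
#include <pthread.h>
#include <stddef.h>

typedef struct demo_pool {
	struct demo_pool	*dp_next;
	int			dp_refcount;
	int			dp_active;
} demo_pool_t;

static pthread_mutex_t demo_namespace_lock = PTHREAD_MUTEX_INITIALIZER;
static demo_pool_t *demo_pools;

static void
for_each_other_pool(demo_pool_t *self, void (*work)(demo_pool_t *))
{
	pthread_mutex_lock(&demo_namespace_lock);
	for (demo_pool_t *p = demo_pools; p != NULL; p = p->dp_next) {
		if (p == self || !p->dp_active)
			continue;
		p->dp_refcount++;		/* spa_open_ref() */
		pthread_mutex_unlock(&demo_namespace_lock);
		work(p);			/* lock not held here */
		pthread_mutex_lock(&demo_namespace_lock);
		p->dp_refcount--;		/* spa_close() */
	}
	pthread_mutex_unlock(&demo_namespace_lock);
}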
7240 */ 7241 if (cmd_type == POOL_INITIALIZE_START && 7242 (vd->vdev_initialize_thread != NULL || 7243 vd->vdev_top->vdev_removing)) { 7244 mutex_exit(&vd->vdev_initialize_lock); 7245 return (SET_ERROR(EBUSY)); 7246 } else if (cmd_type == POOL_INITIALIZE_CANCEL && 7247 (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE && 7248 vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) { 7249 mutex_exit(&vd->vdev_initialize_lock); 7250 return (SET_ERROR(ESRCH)); 7251 } else if (cmd_type == POOL_INITIALIZE_SUSPEND && 7252 vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) { 7253 mutex_exit(&vd->vdev_initialize_lock); 7254 return (SET_ERROR(ESRCH)); 7255 } 7256 7257 switch (cmd_type) { 7258 case POOL_INITIALIZE_START: 7259 vdev_initialize(vd); 7260 break; 7261 case POOL_INITIALIZE_CANCEL: 7262 vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, vd_list); 7263 break; 7264 case POOL_INITIALIZE_SUSPEND: 7265 vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list); 7266 break; 7267 default: 7268 panic("invalid cmd_type %llu", (unsigned long long)cmd_type); 7269 } 7270 mutex_exit(&vd->vdev_initialize_lock); 7271 7272 return (0); 7273 } 7274 7275 int 7276 spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, 7277 nvlist_t *vdev_errlist) 7278 { 7279 int total_errors = 0; 7280 list_t vd_list; 7281 7282 list_create(&vd_list, sizeof (vdev_t), 7283 offsetof(vdev_t, vdev_initialize_node)); 7284 7285 /* 7286 * We hold the namespace lock through the whole function 7287 * to prevent any changes to the pool while we're starting or 7288 * stopping initialization. The config and state locks are held so that 7289 * we can properly assess the vdev state before we commit to 7290 * the initializing operation. 7291 */ 7292 mutex_enter(&spa_namespace_lock); 7293 7294 for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); 7295 pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { 7296 uint64_t vdev_guid = fnvpair_value_uint64(pair); 7297 7298 int error = spa_vdev_initialize_impl(spa, vdev_guid, cmd_type, 7299 &vd_list); 7300 if (error != 0) { 7301 char guid_as_str[MAXNAMELEN]; 7302 7303 (void) snprintf(guid_as_str, sizeof (guid_as_str), 7304 "%llu", (unsigned long long)vdev_guid); 7305 fnvlist_add_int64(vdev_errlist, guid_as_str, error); 7306 total_errors++; 7307 } 7308 } 7309 7310 /* Wait for all initialize threads to stop. */ 7311 vdev_initialize_stop_wait(spa, &vd_list); 7312 7313 /* Sync out the initializing state */ 7314 txg_wait_synced(spa->spa_dsl_pool, 0); 7315 mutex_exit(&spa_namespace_lock); 7316 7317 list_destroy(&vd_list); 7318 7319 return (total_errors); 7320 } 7321 7322 static int 7323 spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, 7324 uint64_t rate, boolean_t partial, boolean_t secure, list_t *vd_list) 7325 { 7326 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 7327 7328 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 7329 7330 /* Look up vdev and ensure it's a leaf. 
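/*
 * Illustrative sketch: the request-vs-state checks made above before
 * starting, cancelling, or suspending an initialize; the TRIM path below
 * applies the same rules.  The enums are hypothetical simplifications of
 * the POOL_INITIALIZE_* commands and VDEV_INITIALIZE_* states; the errno
 * values mirror the ones returned above.
 */
#include <errno.h>
#include <stdbool.h>

typedef enum { CMD_START, CMD_CANCEL, CMD_SUSPEND } demo_cmd_t;
typedef enum { ST_NONE, ST_ACTIVE, ST_SUSPENDED, ST_COMPLETE } demo_state_t;

static int
check_cmd_against_state(demo_cmd_t cmd, demo_state_t state, bool thread_busy)
{
	switch (cmd) {
	case CMD_START:
		/* refuse to start while a previous thread still exists */
		return (thread_busy ? EBUSY : 0);
	case CMD_CANCEL:
		return ((state == ST_ACTIVE || state == ST_SUSPENDED) ?
		    0 : ESRCH);
	case CMD_SUSPEND:
		return (state == ST_ACTIVE ? 0 : ESRCH);
	default:
		return (EINVAL);
	}
}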
*/ 7331 vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); 7332 if (vd == NULL || vd->vdev_detached) { 7333 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7334 return (SET_ERROR(ENODEV)); 7335 } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { 7336 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7337 return (SET_ERROR(EINVAL)); 7338 } else if (!vdev_writeable(vd)) { 7339 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7340 return (SET_ERROR(EROFS)); 7341 } else if (!vd->vdev_has_trim) { 7342 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7343 return (SET_ERROR(EOPNOTSUPP)); 7344 } else if (secure && !vd->vdev_has_securetrim) { 7345 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7346 return (SET_ERROR(EOPNOTSUPP)); 7347 } 7348 mutex_enter(&vd->vdev_trim_lock); 7349 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7350 7351 /* 7352 * When we activate a TRIM action we check to see if the 7353 * vdev_trim_thread is NULL. We do this instead of using the 7354 * vdev_trim_state since there might be a previous TRIM process 7355 * which has completed but the thread is not exited. 7356 */ 7357 if (cmd_type == POOL_TRIM_START && 7358 (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing)) { 7359 mutex_exit(&vd->vdev_trim_lock); 7360 return (SET_ERROR(EBUSY)); 7361 } else if (cmd_type == POOL_TRIM_CANCEL && 7362 (vd->vdev_trim_state != VDEV_TRIM_ACTIVE && 7363 vd->vdev_trim_state != VDEV_TRIM_SUSPENDED)) { 7364 mutex_exit(&vd->vdev_trim_lock); 7365 return (SET_ERROR(ESRCH)); 7366 } else if (cmd_type == POOL_TRIM_SUSPEND && 7367 vd->vdev_trim_state != VDEV_TRIM_ACTIVE) { 7368 mutex_exit(&vd->vdev_trim_lock); 7369 return (SET_ERROR(ESRCH)); 7370 } 7371 7372 switch (cmd_type) { 7373 case POOL_TRIM_START: 7374 vdev_trim(vd, rate, partial, secure); 7375 break; 7376 case POOL_TRIM_CANCEL: 7377 vdev_trim_stop(vd, VDEV_TRIM_CANCELED, vd_list); 7378 break; 7379 case POOL_TRIM_SUSPEND: 7380 vdev_trim_stop(vd, VDEV_TRIM_SUSPENDED, vd_list); 7381 break; 7382 default: 7383 panic("invalid cmd_type %llu", (unsigned long long)cmd_type); 7384 } 7385 mutex_exit(&vd->vdev_trim_lock); 7386 7387 return (0); 7388 } 7389 7390 /* 7391 * Initiates a manual TRIM for the requested vdevs. This kicks off individual 7392 * TRIM threads for each child vdev. These threads pass over all of the free 7393 * space in the vdev's metaslabs and issues TRIM commands for that space. 7394 */ 7395 int 7396 spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate, 7397 boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist) 7398 { 7399 int total_errors = 0; 7400 list_t vd_list; 7401 7402 list_create(&vd_list, sizeof (vdev_t), 7403 offsetof(vdev_t, vdev_trim_node)); 7404 7405 /* 7406 * We hold the namespace lock through the whole function 7407 * to prevent any changes to the pool while we're starting or 7408 * stopping TRIM. The config and state locks are held so that 7409 * we can properly assess the vdev state before we commit to 7410 * the TRIM operation. 
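/*
 * Illustrative sketch: recording a per-vdev error under the vdev's GUID
 * rendered as a decimal string, as the initialize loop above does and the
 * TRIM loop just below repeats with fnvlist_add_int64().
 * record_vdev_error() and the printf sink are hypothetical stand-ins for
 * the vdev_errlist nvlist; the buffer size follows the MAXNAMELEN
 * convention of the surrounding code.
 */
#include <stdio.h>
#include <stdint.h>

#define	DEMO_MAXNAMELEN	256

static int
record_vdev_error(uint64_t vdev_guid, int error)
{
	char guid_as_str[DEMO_MAXNAMELEN];

	(void) snprintf(guid_as_str, sizeof (guid_as_str), "%llu",
	    (unsigned long long)vdev_guid);
	/* stands in for fnvlist_add_int64(vdev_errlist, guid_as_str, error) */
	(void) printf("%s: error %d\n", guid_as_str, error);
	return (error != 0);	/* contributes to total_errors */
}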
7411 */ 7412 mutex_enter(&spa_namespace_lock); 7413 7414 for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); 7415 pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { 7416 uint64_t vdev_guid = fnvpair_value_uint64(pair); 7417 7418 int error = spa_vdev_trim_impl(spa, vdev_guid, cmd_type, 7419 rate, partial, secure, &vd_list); 7420 if (error != 0) { 7421 char guid_as_str[MAXNAMELEN]; 7422 7423 (void) snprintf(guid_as_str, sizeof (guid_as_str), 7424 "%llu", (unsigned long long)vdev_guid); 7425 fnvlist_add_int64(vdev_errlist, guid_as_str, error); 7426 total_errors++; 7427 } 7428 } 7429 7430 /* Wait for all TRIM threads to stop. */ 7431 vdev_trim_stop_wait(spa, &vd_list); 7432 7433 /* Sync out the TRIM state */ 7434 txg_wait_synced(spa->spa_dsl_pool, 0); 7435 mutex_exit(&spa_namespace_lock); 7436 7437 list_destroy(&vd_list); 7438 7439 return (total_errors); 7440 } 7441 7442 /* 7443 * Split a set of devices from their mirrors, and create a new pool from them. 7444 */ 7445 int 7446 spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, 7447 nvlist_t *props, boolean_t exp) 7448 { 7449 int error = 0; 7450 uint64_t txg, *glist; 7451 spa_t *newspa; 7452 uint_t c, children, lastlog; 7453 nvlist_t **child, *nvl, *tmp; 7454 dmu_tx_t *tx; 7455 char *altroot = NULL; 7456 vdev_t *rvd, **vml = NULL; /* vdev modify list */ 7457 boolean_t activate_slog; 7458 7459 ASSERT(spa_writeable(spa)); 7460 7461 txg = spa_vdev_enter(spa); 7462 7463 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 7464 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 7465 error = (spa_has_checkpoint(spa)) ? 7466 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 7467 return (spa_vdev_exit(spa, NULL, txg, error)); 7468 } 7469 7470 /* clear the log and flush everything up to now */ 7471 activate_slog = spa_passivate_log(spa); 7472 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 7473 error = spa_reset_logs(spa); 7474 txg = spa_vdev_config_enter(spa); 7475 7476 if (activate_slog) 7477 spa_activate_log(spa); 7478 7479 if (error != 0) 7480 return (spa_vdev_exit(spa, NULL, txg, error)); 7481 7482 /* check new spa name before going any further */ 7483 if (spa_lookup(newname) != NULL) 7484 return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 7485 7486 /* 7487 * scan through all the children to ensure they're all mirrors 7488 */ 7489 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 7490 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 7491 &children) != 0) 7492 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 7493 7494 /* first, check to ensure we've got the right child count */ 7495 rvd = spa->spa_root_vdev; 7496 lastlog = 0; 7497 for (c = 0; c < rvd->vdev_children; c++) { 7498 vdev_t *vd = rvd->vdev_child[c]; 7499 7500 /* don't count the holes & logs as children */ 7501 if (vd->vdev_islog || (vd->vdev_ops != &vdev_indirect_ops && 7502 !vdev_is_concrete(vd))) { 7503 if (lastlog == 0) 7504 lastlog = c; 7505 continue; 7506 } 7507 7508 lastlog = 0; 7509 } 7510 if (children != (lastlog != 0 ? 
lastlog : rvd->vdev_children)) 7511 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 7512 7513 /* next, ensure no spare or cache devices are part of the split */ 7514 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 7515 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 7516 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 7517 7518 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 7519 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 7520 7521 /* then, loop over each vdev and validate it */ 7522 for (c = 0; c < children; c++) { 7523 uint64_t is_hole = 0; 7524 7525 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 7526 &is_hole); 7527 7528 if (is_hole != 0) { 7529 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 7530 spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 7531 continue; 7532 } else { 7533 error = SET_ERROR(EINVAL); 7534 break; 7535 } 7536 } 7537 7538 /* deal with indirect vdevs */ 7539 if (spa->spa_root_vdev->vdev_child[c]->vdev_ops == 7540 &vdev_indirect_ops) 7541 continue; 7542 7543 /* which disk is going to be split? */ 7544 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 7545 &glist[c]) != 0) { 7546 error = SET_ERROR(EINVAL); 7547 break; 7548 } 7549 7550 /* look it up in the spa */ 7551 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 7552 if (vml[c] == NULL) { 7553 error = SET_ERROR(ENODEV); 7554 break; 7555 } 7556 7557 /* make sure there's nothing stopping the split */ 7558 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 7559 vml[c]->vdev_islog || 7560 !vdev_is_concrete(vml[c]) || 7561 vml[c]->vdev_isspare || 7562 vml[c]->vdev_isl2cache || 7563 !vdev_writeable(vml[c]) || 7564 vml[c]->vdev_children != 0 || 7565 vml[c]->vdev_state != VDEV_STATE_HEALTHY || 7566 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 7567 error = SET_ERROR(EINVAL); 7568 break; 7569 } 7570 7571 if (vdev_dtl_required(vml[c]) || 7572 vdev_resilver_needed(vml[c], NULL, NULL)) { 7573 error = SET_ERROR(EBUSY); 7574 break; 7575 } 7576 7577 /* we need certain info from the top level */ 7578 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 7579 vml[c]->vdev_top->vdev_ms_array); 7580 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 7581 vml[c]->vdev_top->vdev_ms_shift); 7582 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 7583 vml[c]->vdev_top->vdev_asize); 7584 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 7585 vml[c]->vdev_top->vdev_ashift); 7586 7587 /* transfer per-vdev ZAPs */ 7588 ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); 7589 VERIFY0(nvlist_add_uint64(child[c], 7590 ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); 7591 7592 ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); 7593 VERIFY0(nvlist_add_uint64(child[c], 7594 ZPOOL_CONFIG_VDEV_TOP_ZAP, 7595 vml[c]->vdev_parent->vdev_top_zap)); 7596 } 7597 7598 if (error != 0) { 7599 kmem_free(vml, children * sizeof (vdev_t *)); 7600 kmem_free(glist, children * sizeof (uint64_t)); 7601 return (spa_vdev_exit(spa, NULL, txg, error)); 7602 } 7603 7604 /* stop writers from using the disks */ 7605 for (c = 0; c < children; c++) { 7606 if (vml[c] != NULL) 7607 vml[c]->vdev_offline = B_TRUE; 7608 } 7609 vdev_reopen(spa->spa_root_vdev); 7610 7611 /* 7612 * Temporarily record the splitting vdevs in the spa config. This 7613 * will disappear once the config is regenerated. 
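 * The guid list is stored under ZPOOL_CONFIG_SPLIT_LIST and also hung off
 * spa_config_splitting; both references are released below once the split
 * either completes or is unwound on error.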
7614 */ 7615 nvl = fnvlist_alloc(); 7616 fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, glist, children); 7617 kmem_free(glist, children * sizeof (uint64_t)); 7618 7619 mutex_enter(&spa->spa_props_lock); 7620 fnvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, nvl); 7621 mutex_exit(&spa->spa_props_lock); 7622 spa->spa_config_splitting = nvl; 7623 vdev_config_dirty(spa->spa_root_vdev); 7624 7625 /* configure and create the new pool */ 7626 fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname); 7627 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 7628 exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE); 7629 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)); 7630 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg); 7631 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 7632 spa_generate_guid(NULL)); 7633 VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 7634 (void) nvlist_lookup_string(props, 7635 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 7636 7637 /* add the new pool to the namespace */ 7638 newspa = spa_add(newname, config, altroot); 7639 newspa->spa_avz_action = AVZ_ACTION_REBUILD; 7640 newspa->spa_config_txg = spa->spa_config_txg; 7641 spa_set_log_state(newspa, SPA_LOG_CLEAR); 7642 7643 /* release the spa config lock, retaining the namespace lock */ 7644 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 7645 7646 if (zio_injection_enabled) 7647 zio_handle_panic_injection(spa, FTAG, 1); 7648 7649 spa_activate(newspa, spa_mode_global); 7650 spa_async_suspend(newspa); 7651 7652 /* 7653 * Temporarily stop the initializing and TRIM activity. We set the 7654 * state to ACTIVE so that we know to resume initializing or TRIM 7655 * once the split has completed. 7656 */ 7657 list_t vd_initialize_list; 7658 list_create(&vd_initialize_list, sizeof (vdev_t), 7659 offsetof(vdev_t, vdev_initialize_node)); 7660 7661 list_t vd_trim_list; 7662 list_create(&vd_trim_list, sizeof (vdev_t), 7663 offsetof(vdev_t, vdev_trim_node)); 7664 7665 for (c = 0; c < children; c++) { 7666 if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { 7667 mutex_enter(&vml[c]->vdev_initialize_lock); 7668 vdev_initialize_stop(vml[c], 7669 VDEV_INITIALIZE_ACTIVE, &vd_initialize_list); 7670 mutex_exit(&vml[c]->vdev_initialize_lock); 7671 7672 mutex_enter(&vml[c]->vdev_trim_lock); 7673 vdev_trim_stop(vml[c], VDEV_TRIM_ACTIVE, &vd_trim_list); 7674 mutex_exit(&vml[c]->vdev_trim_lock); 7675 } 7676 } 7677 7678 vdev_initialize_stop_wait(spa, &vd_initialize_list); 7679 vdev_trim_stop_wait(spa, &vd_trim_list); 7680 7681 list_destroy(&vd_initialize_list); 7682 list_destroy(&vd_trim_list); 7683 7684 newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT; 7685 newspa->spa_is_splitting = B_TRUE; 7686 7687 /* create the new pool from the disks of the original pool */ 7688 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE); 7689 if (error) 7690 goto out; 7691 7692 /* if that worked, generate a real config for the new pool */ 7693 if (newspa->spa_root_vdev != NULL) { 7694 newspa->spa_config_splitting = fnvlist_alloc(); 7695 fnvlist_add_uint64(newspa->spa_config_splitting, 7696 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)); 7697 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 7698 B_TRUE)); 7699 } 7700 7701 /* set the props */ 7702 if (props != NULL) { 7703 spa_configfile_set(newspa, props, B_FALSE); 7704 error = spa_prop_set(newspa, props); 7705 if (error) 7706 goto out; 7707 } 7708 7709 /* flush everything */ 7710 txg = 
spa_vdev_config_enter(newspa); 7711 vdev_config_dirty(newspa->spa_root_vdev); 7712 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 7713 7714 if (zio_injection_enabled) 7715 zio_handle_panic_injection(spa, FTAG, 2); 7716 7717 spa_async_resume(newspa); 7718 7719 /* finally, update the original pool's config */ 7720 txg = spa_vdev_config_enter(spa); 7721 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 7722 error = dmu_tx_assign(tx, TXG_WAIT); 7723 if (error != 0) 7724 dmu_tx_abort(tx); 7725 for (c = 0; c < children; c++) { 7726 if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { 7727 vdev_t *tvd = vml[c]->vdev_top; 7728 7729 /* 7730 * Need to be sure the detachable VDEV is not 7731 * on any *other* txg's DTL list to prevent it 7732 * from being accessed after it's freed. 7733 */ 7734 for (int t = 0; t < TXG_SIZE; t++) { 7735 (void) txg_list_remove_this( 7736 &tvd->vdev_dtl_list, vml[c], t); 7737 } 7738 7739 vdev_split(vml[c]); 7740 if (error == 0) 7741 spa_history_log_internal(spa, "detach", tx, 7742 "vdev=%s", vml[c]->vdev_path); 7743 7744 vdev_free(vml[c]); 7745 } 7746 } 7747 spa->spa_avz_action = AVZ_ACTION_REBUILD; 7748 vdev_config_dirty(spa->spa_root_vdev); 7749 spa->spa_config_splitting = NULL; 7750 nvlist_free(nvl); 7751 if (error == 0) 7752 dmu_tx_commit(tx); 7753 (void) spa_vdev_exit(spa, NULL, txg, 0); 7754 7755 if (zio_injection_enabled) 7756 zio_handle_panic_injection(spa, FTAG, 3); 7757 7758 /* split is complete; log a history record */ 7759 spa_history_log_internal(newspa, "split", NULL, 7760 "from pool %s", spa_name(spa)); 7761 7762 newspa->spa_is_splitting = B_FALSE; 7763 kmem_free(vml, children * sizeof (vdev_t *)); 7764 7765 /* if we're not going to mount the filesystems in userland, export */ 7766 if (exp) 7767 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 7768 B_FALSE, B_FALSE); 7769 7770 return (error); 7771 7772 out: 7773 spa_unload(newspa); 7774 spa_deactivate(newspa); 7775 spa_remove(newspa); 7776 7777 txg = spa_vdev_config_enter(spa); 7778 7779 /* re-online all offlined disks */ 7780 for (c = 0; c < children; c++) { 7781 if (vml[c] != NULL) 7782 vml[c]->vdev_offline = B_FALSE; 7783 } 7784 7785 /* restart initializing or trimming disks as necessary */ 7786 spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); 7787 spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); 7788 spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); 7789 7790 vdev_reopen(spa->spa_root_vdev); 7791 7792 nvlist_free(spa->spa_config_splitting); 7793 spa->spa_config_splitting = NULL; 7794 (void) spa_vdev_exit(spa, NULL, txg, error); 7795 7796 kmem_free(vml, children * sizeof (vdev_t *)); 7797 return (error); 7798 } 7799 7800 /* 7801 * Find any device that's done replacing, or a vdev marked 'unspare' that's 7802 * currently spared, so we can detach it. 7803 */ 7804 static vdev_t * 7805 spa_vdev_resilver_done_hunt(vdev_t *vd) 7806 { 7807 vdev_t *newvd, *oldvd; 7808 7809 for (int c = 0; c < vd->vdev_children; c++) { 7810 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 7811 if (oldvd != NULL) 7812 return (oldvd); 7813 } 7814 7815 /* 7816 * Check for a completed replacement. We always consider the first 7817 * vdev in the list to be the oldest vdev, and the last one to be 7818 * the newest (see spa_vdev_attach() for how that works). In 7819 * the case where the newest vdev is faulted, we will not automatically 7820 * remove it after a resilver completes. This is OK as it will require 7821 * user intervention to determine which disk the admin wishes to keep. 
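 * Concretely, a replacement counts as complete only when the newest child
 * has empty DTL_MISSING and DTL_OUTAGE trees and the oldest child is no
 * longer required to satisfy the DTL.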
7822 */ 7823 if (vd->vdev_ops == &vdev_replacing_ops) { 7824 ASSERT(vd->vdev_children > 1); 7825 7826 newvd = vd->vdev_child[vd->vdev_children - 1]; 7827 oldvd = vd->vdev_child[0]; 7828 7829 if (vdev_dtl_empty(newvd, DTL_MISSING) && 7830 vdev_dtl_empty(newvd, DTL_OUTAGE) && 7831 !vdev_dtl_required(oldvd)) 7832 return (oldvd); 7833 } 7834 7835 /* 7836 * Check for a completed resilver with the 'unspare' flag set. 7837 * Also potentially update faulted state. 7838 */ 7839 if (vd->vdev_ops == &vdev_spare_ops) { 7840 vdev_t *first = vd->vdev_child[0]; 7841 vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 7842 7843 if (last->vdev_unspare) { 7844 oldvd = first; 7845 newvd = last; 7846 } else if (first->vdev_unspare) { 7847 oldvd = last; 7848 newvd = first; 7849 } else { 7850 oldvd = NULL; 7851 } 7852 7853 if (oldvd != NULL && 7854 vdev_dtl_empty(newvd, DTL_MISSING) && 7855 vdev_dtl_empty(newvd, DTL_OUTAGE) && 7856 !vdev_dtl_required(oldvd)) 7857 return (oldvd); 7858 7859 vdev_propagate_state(vd); 7860 7861 /* 7862 * If there are more than two spares attached to a disk, 7863 * and those spares are not required, then we want to 7864 * attempt to free them up now so that they can be used 7865 * by other pools. Once we're back down to a single 7866 * disk+spare, we stop removing them. 7867 */ 7868 if (vd->vdev_children > 2) { 7869 newvd = vd->vdev_child[1]; 7870 7871 if (newvd->vdev_isspare && last->vdev_isspare && 7872 vdev_dtl_empty(last, DTL_MISSING) && 7873 vdev_dtl_empty(last, DTL_OUTAGE) && 7874 !vdev_dtl_required(newvd)) 7875 return (newvd); 7876 } 7877 } 7878 7879 return (NULL); 7880 } 7881 7882 static void 7883 spa_vdev_resilver_done(spa_t *spa) 7884 { 7885 vdev_t *vd, *pvd, *ppvd; 7886 uint64_t guid, sguid, pguid, ppguid; 7887 7888 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 7889 7890 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 7891 pvd = vd->vdev_parent; 7892 ppvd = pvd->vdev_parent; 7893 guid = vd->vdev_guid; 7894 pguid = pvd->vdev_guid; 7895 ppguid = ppvd->vdev_guid; 7896 sguid = 0; 7897 /* 7898 * If we have just finished replacing a hot spared device, then 7899 * we need to detach the parent's first child (the original hot 7900 * spare) as well. 7901 */ 7902 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 7903 ppvd->vdev_children == 2) { 7904 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 7905 sguid = ppvd->vdev_child[1]->vdev_guid; 7906 } 7907 ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); 7908 7909 spa_config_exit(spa, SCL_ALL, FTAG); 7910 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 7911 return; 7912 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 7913 return; 7914 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 7915 } 7916 7917 spa_config_exit(spa, SCL_ALL, FTAG); 7918 7919 /* 7920 * If a detach was not performed above replace waiters will not have 7921 * been notified. In which case we must do so now. 7922 */ 7923 spa_notify_waiters(spa); 7924 } 7925 7926 /* 7927 * Update the stored path or FRU for this vdev. 
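 * Only leaf vdevs may be updated, and the vdev state is marked for sync
 * only when the value actually changes, so re-setting an identical path
 * or FRU is a no-op.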
7928 */ 7929 static int 7930 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 7931 boolean_t ispath) 7932 { 7933 vdev_t *vd; 7934 boolean_t sync = B_FALSE; 7935 7936 ASSERT(spa_writeable(spa)); 7937 7938 spa_vdev_state_enter(spa, SCL_ALL); 7939 7940 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 7941 return (spa_vdev_state_exit(spa, NULL, ENOENT)); 7942 7943 if (!vd->vdev_ops->vdev_op_leaf) 7944 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 7945 7946 if (ispath) { 7947 if (strcmp(value, vd->vdev_path) != 0) { 7948 spa_strfree(vd->vdev_path); 7949 vd->vdev_path = spa_strdup(value); 7950 sync = B_TRUE; 7951 } 7952 } else { 7953 if (vd->vdev_fru == NULL) { 7954 vd->vdev_fru = spa_strdup(value); 7955 sync = B_TRUE; 7956 } else if (strcmp(value, vd->vdev_fru) != 0) { 7957 spa_strfree(vd->vdev_fru); 7958 vd->vdev_fru = spa_strdup(value); 7959 sync = B_TRUE; 7960 } 7961 } 7962 7963 return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); 7964 } 7965 7966 int 7967 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 7968 { 7969 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 7970 } 7971 7972 int 7973 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 7974 { 7975 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 7976 } 7977 7978 /* 7979 * ========================================================================== 7980 * SPA Scanning 7981 * ========================================================================== 7982 */ 7983 int 7984 spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd) 7985 { 7986 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 7987 7988 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 7989 return (SET_ERROR(EBUSY)); 7990 7991 return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd)); 7992 } 7993 7994 int 7995 spa_scan_stop(spa_t *spa) 7996 { 7997 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 7998 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 7999 return (SET_ERROR(EBUSY)); 8000 return (dsl_scan_cancel(spa->spa_dsl_pool)); 8001 } 8002 8003 int 8004 spa_scan(spa_t *spa, pool_scan_func_t func) 8005 { 8006 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 8007 8008 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 8009 return (SET_ERROR(ENOTSUP)); 8010 8011 if (func == POOL_SCAN_RESILVER && 8012 !spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) 8013 return (SET_ERROR(ENOTSUP)); 8014 8015 /* 8016 * If a resilver was requested, but there is no DTL on a 8017 * writeable leaf device, we have nothing to do. 8018 */ 8019 if (func == POOL_SCAN_RESILVER && 8020 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 8021 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 8022 return (0); 8023 } 8024 8025 return (dsl_scan(spa->spa_dsl_pool, func)); 8026 } 8027 8028 /* 8029 * ========================================================================== 8030 * SPA async task processing 8031 * ========================================================================== 8032 */ 8033 8034 static void 8035 spa_async_remove(spa_t *spa, vdev_t *vd) 8036 { 8037 if (vd->vdev_remove_wanted) { 8038 vd->vdev_remove_wanted = B_FALSE; 8039 vd->vdev_delayed_close = B_FALSE; 8040 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 8041 8042 /* 8043 * We want to clear the stats, but we don't want to do a full 8044 * vdev_clear() as that will cause us to throw away 8045 * degraded/faulted state as well as attempt to reopen the 8046 * device, all of which is a waste. 
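 * Instead, just zero the read/write/checksum error counters and dirty the
 * top-level vdev state.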
8047 */ 8048 vd->vdev_stat.vs_read_errors = 0; 8049 vd->vdev_stat.vs_write_errors = 0; 8050 vd->vdev_stat.vs_checksum_errors = 0; 8051 8052 vdev_state_dirty(vd->vdev_top); 8053 8054 /* Tell userspace that the vdev is gone. */ 8055 zfs_post_remove(spa, vd); 8056 } 8057 8058 for (int c = 0; c < vd->vdev_children; c++) 8059 spa_async_remove(spa, vd->vdev_child[c]); 8060 } 8061 8062 static void 8063 spa_async_probe(spa_t *spa, vdev_t *vd) 8064 { 8065 if (vd->vdev_probe_wanted) { 8066 vd->vdev_probe_wanted = B_FALSE; 8067 vdev_reopen(vd); /* vdev_open() does the actual probe */ 8068 } 8069 8070 for (int c = 0; c < vd->vdev_children; c++) 8071 spa_async_probe(spa, vd->vdev_child[c]); 8072 } 8073 8074 static void 8075 spa_async_autoexpand(spa_t *spa, vdev_t *vd) 8076 { 8077 if (!spa->spa_autoexpand) 8078 return; 8079 8080 for (int c = 0; c < vd->vdev_children; c++) { 8081 vdev_t *cvd = vd->vdev_child[c]; 8082 spa_async_autoexpand(spa, cvd); 8083 } 8084 8085 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 8086 return; 8087 8088 spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_AUTOEXPAND); 8089 } 8090 8091 static void 8092 spa_async_thread(void *arg) 8093 { 8094 spa_t *spa = (spa_t *)arg; 8095 dsl_pool_t *dp = spa->spa_dsl_pool; 8096 int tasks; 8097 8098 ASSERT(spa->spa_sync_on); 8099 8100 mutex_enter(&spa->spa_async_lock); 8101 tasks = spa->spa_async_tasks; 8102 spa->spa_async_tasks = 0; 8103 mutex_exit(&spa->spa_async_lock); 8104 8105 /* 8106 * See if the config needs to be updated. 8107 */ 8108 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 8109 uint64_t old_space, new_space; 8110 8111 mutex_enter(&spa_namespace_lock); 8112 old_space = metaslab_class_get_space(spa_normal_class(spa)); 8113 old_space += metaslab_class_get_space(spa_special_class(spa)); 8114 old_space += metaslab_class_get_space(spa_dedup_class(spa)); 8115 old_space += metaslab_class_get_space( 8116 spa_embedded_log_class(spa)); 8117 8118 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 8119 8120 new_space = metaslab_class_get_space(spa_normal_class(spa)); 8121 new_space += metaslab_class_get_space(spa_special_class(spa)); 8122 new_space += metaslab_class_get_space(spa_dedup_class(spa)); 8123 new_space += metaslab_class_get_space( 8124 spa_embedded_log_class(spa)); 8125 mutex_exit(&spa_namespace_lock); 8126 8127 /* 8128 * If the pool grew as a result of the config update, 8129 * then log an internal history event. 8130 */ 8131 if (new_space != old_space) { 8132 spa_history_log_internal(spa, "vdev online", NULL, 8133 "pool '%s' size: %llu(+%llu)", 8134 spa_name(spa), (u_longlong_t)new_space, 8135 (u_longlong_t)(new_space - old_space)); 8136 } 8137 } 8138 8139 /* 8140 * See if any devices need to be marked REMOVED. 8141 */ 8142 if (tasks & SPA_ASYNC_REMOVE) { 8143 spa_vdev_state_enter(spa, SCL_NONE); 8144 spa_async_remove(spa, spa->spa_root_vdev); 8145 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 8146 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 8147 for (int i = 0; i < spa->spa_spares.sav_count; i++) 8148 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 8149 (void) spa_vdev_state_exit(spa, NULL, 0); 8150 } 8151 8152 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 8153 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8154 spa_async_autoexpand(spa, spa->spa_root_vdev); 8155 spa_config_exit(spa, SCL_CONFIG, FTAG); 8156 } 8157 8158 /* 8159 * See if any devices need to be probed. 
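 * The probe itself happens as a side effect of reopening each vdev that
 * has vdev_probe_wanted set.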
8160 */ 8161 if (tasks & SPA_ASYNC_PROBE) { 8162 spa_vdev_state_enter(spa, SCL_NONE); 8163 spa_async_probe(spa, spa->spa_root_vdev); 8164 (void) spa_vdev_state_exit(spa, NULL, 0); 8165 } 8166 8167 /* 8168 * If any devices are done replacing, detach them. 8169 */ 8170 if (tasks & SPA_ASYNC_RESILVER_DONE || 8171 tasks & SPA_ASYNC_REBUILD_DONE) { 8172 spa_vdev_resilver_done(spa); 8173 } 8174 8175 /* 8176 * Kick off a resilver. 8177 */ 8178 if (tasks & SPA_ASYNC_RESILVER && 8179 !vdev_rebuild_active(spa->spa_root_vdev) && 8180 (!dsl_scan_resilvering(dp) || 8181 !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))) 8182 dsl_scan_restart_resilver(dp, 0); 8183 8184 if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { 8185 mutex_enter(&spa_namespace_lock); 8186 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8187 vdev_initialize_restart(spa->spa_root_vdev); 8188 spa_config_exit(spa, SCL_CONFIG, FTAG); 8189 mutex_exit(&spa_namespace_lock); 8190 } 8191 8192 if (tasks & SPA_ASYNC_TRIM_RESTART) { 8193 mutex_enter(&spa_namespace_lock); 8194 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8195 vdev_trim_restart(spa->spa_root_vdev); 8196 spa_config_exit(spa, SCL_CONFIG, FTAG); 8197 mutex_exit(&spa_namespace_lock); 8198 } 8199 8200 if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) { 8201 mutex_enter(&spa_namespace_lock); 8202 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8203 vdev_autotrim_restart(spa); 8204 spa_config_exit(spa, SCL_CONFIG, FTAG); 8205 mutex_exit(&spa_namespace_lock); 8206 } 8207 8208 /* 8209 * Kick off L2 cache whole device TRIM. 8210 */ 8211 if (tasks & SPA_ASYNC_L2CACHE_TRIM) { 8212 mutex_enter(&spa_namespace_lock); 8213 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8214 vdev_trim_l2arc(spa); 8215 spa_config_exit(spa, SCL_CONFIG, FTAG); 8216 mutex_exit(&spa_namespace_lock); 8217 } 8218 8219 /* 8220 * Kick off L2 cache rebuilding. 8221 */ 8222 if (tasks & SPA_ASYNC_L2CACHE_REBUILD) { 8223 mutex_enter(&spa_namespace_lock); 8224 spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER); 8225 l2arc_spa_rebuild_start(spa); 8226 spa_config_exit(spa, SCL_L2ARC, FTAG); 8227 mutex_exit(&spa_namespace_lock); 8228 } 8229 8230 /* 8231 * Let the world know that we're done. 
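 * Clearing spa_async_thread and broadcasting spa_async_cv lets any
 * spa_async_suspend() callers waiting on this thread proceed.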
8232 */ 8233 mutex_enter(&spa->spa_async_lock); 8234 spa->spa_async_thread = NULL; 8235 cv_broadcast(&spa->spa_async_cv); 8236 mutex_exit(&spa->spa_async_lock); 8237 thread_exit(); 8238 } 8239 8240 void 8241 spa_async_suspend(spa_t *spa) 8242 { 8243 mutex_enter(&spa->spa_async_lock); 8244 spa->spa_async_suspended++; 8245 while (spa->spa_async_thread != NULL) 8246 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 8247 mutex_exit(&spa->spa_async_lock); 8248 8249 spa_vdev_remove_suspend(spa); 8250 8251 zthr_t *condense_thread = spa->spa_condense_zthr; 8252 if (condense_thread != NULL) 8253 zthr_cancel(condense_thread); 8254 8255 zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; 8256 if (discard_thread != NULL) 8257 zthr_cancel(discard_thread); 8258 8259 zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; 8260 if (ll_delete_thread != NULL) 8261 zthr_cancel(ll_delete_thread); 8262 8263 zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; 8264 if (ll_condense_thread != NULL) 8265 zthr_cancel(ll_condense_thread); 8266 } 8267 8268 void 8269 spa_async_resume(spa_t *spa) 8270 { 8271 mutex_enter(&spa->spa_async_lock); 8272 ASSERT(spa->spa_async_suspended != 0); 8273 spa->spa_async_suspended--; 8274 mutex_exit(&spa->spa_async_lock); 8275 spa_restart_removal(spa); 8276 8277 zthr_t *condense_thread = spa->spa_condense_zthr; 8278 if (condense_thread != NULL) 8279 zthr_resume(condense_thread); 8280 8281 zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; 8282 if (discard_thread != NULL) 8283 zthr_resume(discard_thread); 8284 8285 zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; 8286 if (ll_delete_thread != NULL) 8287 zthr_resume(ll_delete_thread); 8288 8289 zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; 8290 if (ll_condense_thread != NULL) 8291 zthr_resume(ll_condense_thread); 8292 } 8293 8294 static boolean_t 8295 spa_async_tasks_pending(spa_t *spa) 8296 { 8297 uint_t non_config_tasks; 8298 uint_t config_task; 8299 boolean_t config_task_suspended; 8300 8301 non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE; 8302 config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; 8303 if (spa->spa_ccw_fail_time == 0) { 8304 config_task_suspended = B_FALSE; 8305 } else { 8306 config_task_suspended = 8307 (gethrtime() - spa->spa_ccw_fail_time) < 8308 ((hrtime_t)zfs_ccw_retry_interval * NANOSEC); 8309 } 8310 8311 return (non_config_tasks || (config_task && !config_task_suspended)); 8312 } 8313 8314 static void 8315 spa_async_dispatch(spa_t *spa) 8316 { 8317 mutex_enter(&spa->spa_async_lock); 8318 if (spa_async_tasks_pending(spa) && 8319 !spa->spa_async_suspended && 8320 spa->spa_async_thread == NULL) 8321 spa->spa_async_thread = thread_create(NULL, 0, 8322 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 8323 mutex_exit(&spa->spa_async_lock); 8324 } 8325 8326 void 8327 spa_async_request(spa_t *spa, int task) 8328 { 8329 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 8330 mutex_enter(&spa->spa_async_lock); 8331 spa->spa_async_tasks |= task; 8332 mutex_exit(&spa->spa_async_lock); 8333 } 8334 8335 int 8336 spa_async_tasks(spa_t *spa) 8337 { 8338 return (spa->spa_async_tasks); 8339 } 8340 8341 /* 8342 * ========================================================================== 8343 * SPA syncing routines 8344 * ========================================================================== 8345 */ 8346 8347 8348 static int 8349 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 8350 dmu_tx_t *tx) 8351 { 8352 
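	/* arg is the bpobj onto which the block pointer is queued. */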
bpobj_t *bpo = arg; 8353 bpobj_enqueue(bpo, bp, bp_freed, tx); 8354 return (0); 8355 } 8356 8357 int 8358 bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 8359 { 8360 return (bpobj_enqueue_cb(arg, bp, B_FALSE, tx)); 8361 } 8362 8363 int 8364 bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 8365 { 8366 return (bpobj_enqueue_cb(arg, bp, B_TRUE, tx)); 8367 } 8368 8369 static int 8370 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 8371 { 8372 zio_t *pio = arg; 8373 8374 zio_nowait(zio_free_sync(pio, pio->io_spa, dmu_tx_get_txg(tx), bp, 8375 pio->io_flags)); 8376 return (0); 8377 } 8378 8379 static int 8380 bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 8381 dmu_tx_t *tx) 8382 { 8383 ASSERT(!bp_freed); 8384 return (spa_free_sync_cb(arg, bp, tx)); 8385 } 8386 8387 /* 8388 * Note: this simple function is not inlined to make it easier to dtrace the 8389 * amount of time spent syncing frees. 8390 */ 8391 static void 8392 spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) 8393 { 8394 zio_t *zio = zio_root(spa, NULL, NULL, 0); 8395 bplist_iterate(bpl, spa_free_sync_cb, zio, tx); 8396 VERIFY(zio_wait(zio) == 0); 8397 } 8398 8399 /* 8400 * Note: this simple function is not inlined to make it easier to dtrace the 8401 * amount of time spent syncing deferred frees. 8402 */ 8403 static void 8404 spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) 8405 { 8406 if (spa_sync_pass(spa) != 1) 8407 return; 8408 8409 /* 8410 * Note: 8411 * If the log space map feature is active, we stop deferring 8412 * frees to the next TXG and therefore running this function 8413 * would be considered a no-op as spa_deferred_bpobj should 8414 * not have any entries. 8415 * 8416 * That said we run this function anyway (instead of returning 8417 * immediately) for the edge-case scenario where we just 8418 * activated the log space map feature in this TXG but we have 8419 * deferred frees from the previous TXG. 8420 */ 8421 zio_t *zio = zio_root(spa, NULL, NULL, 0); 8422 VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, 8423 bpobj_spa_free_sync_cb, zio, tx), ==, 0); 8424 VERIFY0(zio_wait(zio)); 8425 } 8426 8427 static void 8428 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 8429 { 8430 char *packed = NULL; 8431 size_t bufsize; 8432 size_t nvsize = 0; 8433 dmu_buf_t *db; 8434 8435 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 8436 8437 /* 8438 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 8439 * information. This avoids the dmu_buf_will_dirty() path and 8440 * saves us a pre-read to get data we don't actually care about. 8441 */ 8442 bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); 8443 packed = vmem_alloc(bufsize, KM_SLEEP); 8444 8445 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 8446 KM_SLEEP) == 0); 8447 bzero(packed + nvsize, bufsize - nvsize); 8448 8449 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 8450 8451 vmem_free(packed, bufsize); 8452 8453 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 8454 dmu_buf_will_dirty(db, tx); 8455 *(uint64_t *)db->db_data = nvsize; 8456 dmu_buf_rele(db, FTAG); 8457 } 8458 8459 static void 8460 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 8461 const char *config, const char *entry) 8462 { 8463 nvlist_t *nvroot; 8464 nvlist_t **list; 8465 int i; 8466 8467 if (!sav->sav_sync) 8468 return; 8469 8470 /* 8471 * Update the MOS nvlist describing the list of available devices. 
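 * (that is, the pool's hot spares or L2ARC cache devices).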
8472 * spa_validate_aux() will have already made sure this nvlist is 8473 * valid and the vdevs are labeled appropriately. 8474 */ 8475 if (sav->sav_object == 0) { 8476 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 8477 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 8478 sizeof (uint64_t), tx); 8479 VERIFY(zap_update(spa->spa_meta_objset, 8480 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 8481 &sav->sav_object, tx) == 0); 8482 } 8483 8484 nvroot = fnvlist_alloc(); 8485 if (sav->sav_count == 0) { 8486 fnvlist_add_nvlist_array(nvroot, config, 8487 (const nvlist_t * const *)NULL, 0); 8488 } else { 8489 list = kmem_alloc(sav->sav_count*sizeof (void *), KM_SLEEP); 8490 for (i = 0; i < sav->sav_count; i++) 8491 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 8492 B_FALSE, VDEV_CONFIG_L2CACHE); 8493 fnvlist_add_nvlist_array(nvroot, config, 8494 (const nvlist_t * const *)list, sav->sav_count); 8495 for (i = 0; i < sav->sav_count; i++) 8496 nvlist_free(list[i]); 8497 kmem_free(list, sav->sav_count * sizeof (void *)); 8498 } 8499 8500 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 8501 nvlist_free(nvroot); 8502 8503 sav->sav_sync = B_FALSE; 8504 } 8505 8506 /* 8507 * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t. 8508 * The all-vdev ZAP must be empty. 8509 */ 8510 static void 8511 spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) 8512 { 8513 spa_t *spa = vd->vdev_spa; 8514 8515 if (vd->vdev_top_zap != 0) { 8516 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 8517 vd->vdev_top_zap, tx)); 8518 } 8519 if (vd->vdev_leaf_zap != 0) { 8520 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 8521 vd->vdev_leaf_zap, tx)); 8522 } 8523 for (uint64_t i = 0; i < vd->vdev_children; i++) { 8524 spa_avz_build(vd->vdev_child[i], avz, tx); 8525 } 8526 } 8527 8528 static void 8529 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 8530 { 8531 nvlist_t *config; 8532 8533 /* 8534 * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS, 8535 * its config may not be dirty but we still need to build per-vdev ZAPs. 8536 * Similarly, if the pool is being assembled (e.g. after a split), we 8537 * need to rebuild the AVZ although the config may not be dirty. 
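 * For AVZ_ACTION_REBUILD, a fresh all-vdev ZAP is built from the current
 * vdev tree, diffed against the old AVZ so that stale per-vdev ZAPs can be
 * destroyed, and then swapped into the pool directory object in its place.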
8538 */ 8539 if (list_is_empty(&spa->spa_config_dirty_list) && 8540 spa->spa_avz_action == AVZ_ACTION_NONE) 8541 return; 8542 8543 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 8544 8545 ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE || 8546 spa->spa_avz_action == AVZ_ACTION_INITIALIZE || 8547 spa->spa_all_vdev_zaps != 0); 8548 8549 if (spa->spa_avz_action == AVZ_ACTION_REBUILD) { 8550 /* Make and build the new AVZ */ 8551 uint64_t new_avz = zap_create(spa->spa_meta_objset, 8552 DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); 8553 spa_avz_build(spa->spa_root_vdev, new_avz, tx); 8554 8555 /* Diff old AVZ with new one */ 8556 zap_cursor_t zc; 8557 zap_attribute_t za; 8558 8559 for (zap_cursor_init(&zc, spa->spa_meta_objset, 8560 spa->spa_all_vdev_zaps); 8561 zap_cursor_retrieve(&zc, &za) == 0; 8562 zap_cursor_advance(&zc)) { 8563 uint64_t vdzap = za.za_first_integer; 8564 if (zap_lookup_int(spa->spa_meta_objset, new_avz, 8565 vdzap) == ENOENT) { 8566 /* 8567 * ZAP is listed in old AVZ but not in new one; 8568 * destroy it 8569 */ 8570 VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap, 8571 tx)); 8572 } 8573 } 8574 8575 zap_cursor_fini(&zc); 8576 8577 /* Destroy the old AVZ */ 8578 VERIFY0(zap_destroy(spa->spa_meta_objset, 8579 spa->spa_all_vdev_zaps, tx)); 8580 8581 /* Replace the old AVZ in the dir obj with the new one */ 8582 VERIFY0(zap_update(spa->spa_meta_objset, 8583 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, 8584 sizeof (new_avz), 1, &new_avz, tx)); 8585 8586 spa->spa_all_vdev_zaps = new_avz; 8587 } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) { 8588 zap_cursor_t zc; 8589 zap_attribute_t za; 8590 8591 /* Walk through the AVZ and destroy all listed ZAPs */ 8592 for (zap_cursor_init(&zc, spa->spa_meta_objset, 8593 spa->spa_all_vdev_zaps); 8594 zap_cursor_retrieve(&zc, &za) == 0; 8595 zap_cursor_advance(&zc)) { 8596 uint64_t zap = za.za_first_integer; 8597 VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx)); 8598 } 8599 8600 zap_cursor_fini(&zc); 8601 8602 /* Destroy and unlink the AVZ itself */ 8603 VERIFY0(zap_destroy(spa->spa_meta_objset, 8604 spa->spa_all_vdev_zaps, tx)); 8605 VERIFY0(zap_remove(spa->spa_meta_objset, 8606 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx)); 8607 spa->spa_all_vdev_zaps = 0; 8608 } 8609 8610 if (spa->spa_all_vdev_zaps == 0) { 8611 spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset, 8612 DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, 8613 DMU_POOL_VDEV_ZAP_MAP, tx); 8614 } 8615 spa->spa_avz_action = AVZ_ACTION_NONE; 8616 8617 /* Create ZAPs for vdevs that don't have them. */ 8618 vdev_construct_zaps(spa->spa_root_vdev, tx); 8619 8620 config = spa_config_generate(spa, spa->spa_root_vdev, 8621 dmu_tx_get_txg(tx), B_FALSE); 8622 8623 /* 8624 * If we're upgrading the spa version then make sure that 8625 * the config object gets updated with the correct version. 8626 */ 8627 if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) 8628 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 8629 spa->spa_uberblock.ub_version); 8630 8631 spa_config_exit(spa, SCL_STATE, FTAG); 8632 8633 nvlist_free(spa->spa_config_syncing); 8634 spa->spa_config_syncing = config; 8635 8636 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 8637 } 8638 8639 static void 8640 spa_sync_version(void *arg, dmu_tx_t *tx) 8641 { 8642 uint64_t *versionp = arg; 8643 uint64_t version = *versionp; 8644 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 8645 8646 /* 8647 * Setting the version is special cased when first creating the pool. 
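 * Outside of creation the version only ever moves forward, and only to a
 * version this code supports; both conditions are asserted below.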
8648 */ 8649 ASSERT(tx->tx_txg != TXG_INITIAL); 8650 8651 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 8652 ASSERT(version >= spa_version(spa)); 8653 8654 spa->spa_uberblock.ub_version = version; 8655 vdev_config_dirty(spa->spa_root_vdev); 8656 spa_history_log_internal(spa, "set", tx, "version=%lld", 8657 (longlong_t)version); 8658 } 8659 8660 /* 8661 * Set zpool properties. 8662 */ 8663 static void 8664 spa_sync_props(void *arg, dmu_tx_t *tx) 8665 { 8666 nvlist_t *nvp = arg; 8667 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 8668 objset_t *mos = spa->spa_meta_objset; 8669 nvpair_t *elem = NULL; 8670 8671 mutex_enter(&spa->spa_props_lock); 8672 8673 while ((elem = nvlist_next_nvpair(nvp, elem))) { 8674 uint64_t intval; 8675 char *strval, *fname; 8676 zpool_prop_t prop; 8677 const char *propname; 8678 zprop_type_t proptype; 8679 spa_feature_t fid; 8680 8681 switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 8682 case ZPOOL_PROP_INVAL: 8683 /* 8684 * We checked this earlier in spa_prop_validate(). 8685 */ 8686 ASSERT(zpool_prop_feature(nvpair_name(elem))); 8687 8688 fname = strchr(nvpair_name(elem), '@') + 1; 8689 VERIFY0(zfeature_lookup_name(fname, &fid)); 8690 8691 spa_feature_enable(spa, fid, tx); 8692 spa_history_log_internal(spa, "set", tx, 8693 "%s=enabled", nvpair_name(elem)); 8694 break; 8695 8696 case ZPOOL_PROP_VERSION: 8697 intval = fnvpair_value_uint64(elem); 8698 /* 8699 * The version is synced separately before other 8700 * properties and should be correct by now. 8701 */ 8702 ASSERT3U(spa_version(spa), >=, intval); 8703 break; 8704 8705 case ZPOOL_PROP_ALTROOT: 8706 /* 8707 * 'altroot' is a non-persistent property. It should 8708 * have been set temporarily at creation or import time. 8709 */ 8710 ASSERT(spa->spa_root != NULL); 8711 break; 8712 8713 case ZPOOL_PROP_READONLY: 8714 case ZPOOL_PROP_CACHEFILE: 8715 /* 8716 * 'readonly' and 'cachefile' are also non-persistent 8717 * properties. 8718 */ 8719 break; 8720 case ZPOOL_PROP_COMMENT: 8721 strval = fnvpair_value_string(elem); 8722 if (spa->spa_comment != NULL) 8723 spa_strfree(spa->spa_comment); 8724 spa->spa_comment = spa_strdup(strval); 8725 /* 8726 * We need to dirty the configuration on all the vdevs 8727 * so that their labels get updated. We also need to 8728 * update the cache file to keep it in sync with the 8729 * MOS version. It's unnecessary to do this for pool 8730 * creation since the vdev's configuration has already 8731 * been dirtied. 8732 */ 8733 if (tx->tx_txg != TXG_INITIAL) { 8734 vdev_config_dirty(spa->spa_root_vdev); 8735 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 8736 } 8737 spa_history_log_internal(spa, "set", tx, 8738 "%s=%s", nvpair_name(elem), strval); 8739 break; 8740 case ZPOOL_PROP_COMPATIBILITY: 8741 strval = fnvpair_value_string(elem); 8742 if (spa->spa_compatibility != NULL) 8743 spa_strfree(spa->spa_compatibility); 8744 spa->spa_compatibility = spa_strdup(strval); 8745 /* 8746 * Dirty the configuration on vdevs as above. 8747 */ 8748 if (tx->tx_txg != TXG_INITIAL) { 8749 vdev_config_dirty(spa->spa_root_vdev); 8750 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 8751 } 8752 8753 spa_history_log_internal(spa, "set", tx, 8754 "%s=%s", nvpair_name(elem), strval); 8755 break; 8756 8757 default: 8758 /* 8759 * Set pool property values in the poolprops mos object. 
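 * String properties are stored as strings, index and number properties as
 * a single uint64, and a handful of them are additionally mirrored into
 * in-core spa_t fields below.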
8760 */ 8761 if (spa->spa_pool_props_object == 0) { 8762 spa->spa_pool_props_object = 8763 zap_create_link(mos, DMU_OT_POOL_PROPS, 8764 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 8765 tx); 8766 } 8767 8768 /* normalize the property name */ 8769 propname = zpool_prop_to_name(prop); 8770 proptype = zpool_prop_get_type(prop); 8771 8772 if (nvpair_type(elem) == DATA_TYPE_STRING) { 8773 ASSERT(proptype == PROP_TYPE_STRING); 8774 strval = fnvpair_value_string(elem); 8775 VERIFY0(zap_update(mos, 8776 spa->spa_pool_props_object, propname, 8777 1, strlen(strval) + 1, strval, tx)); 8778 spa_history_log_internal(spa, "set", tx, 8779 "%s=%s", nvpair_name(elem), strval); 8780 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 8781 intval = fnvpair_value_uint64(elem); 8782 8783 if (proptype == PROP_TYPE_INDEX) { 8784 const char *unused; 8785 VERIFY0(zpool_prop_index_to_string( 8786 prop, intval, &unused)); 8787 } 8788 VERIFY0(zap_update(mos, 8789 spa->spa_pool_props_object, propname, 8790 8, 1, &intval, tx)); 8791 spa_history_log_internal(spa, "set", tx, 8792 "%s=%lld", nvpair_name(elem), 8793 (longlong_t)intval); 8794 } else { 8795 ASSERT(0); /* not allowed */ 8796 } 8797 8798 switch (prop) { 8799 case ZPOOL_PROP_DELEGATION: 8800 spa->spa_delegation = intval; 8801 break; 8802 case ZPOOL_PROP_BOOTFS: 8803 spa->spa_bootfs = intval; 8804 break; 8805 case ZPOOL_PROP_FAILUREMODE: 8806 spa->spa_failmode = intval; 8807 break; 8808 case ZPOOL_PROP_AUTOTRIM: 8809 spa->spa_autotrim = intval; 8810 spa_async_request(spa, 8811 SPA_ASYNC_AUTOTRIM_RESTART); 8812 break; 8813 case ZPOOL_PROP_AUTOEXPAND: 8814 spa->spa_autoexpand = intval; 8815 if (tx->tx_txg != TXG_INITIAL) 8816 spa_async_request(spa, 8817 SPA_ASYNC_AUTOEXPAND); 8818 break; 8819 case ZPOOL_PROP_MULTIHOST: 8820 spa->spa_multihost = intval; 8821 break; 8822 default: 8823 break; 8824 } 8825 } 8826 8827 } 8828 8829 mutex_exit(&spa->spa_props_lock); 8830 } 8831 8832 /* 8833 * Perform one-time upgrade on-disk changes. spa_version() does not 8834 * reflect the new version this txg, so there must be no changes this 8835 * txg to anything that the upgrade code depends on after it executes. 8836 * Therefore this must be called after dsl_pool_sync() does the sync 8837 * tasks. 
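 * The upgrades handled here include creating the origin dataset, upgrading
 * clone and clone-directory bookkeeping, creating the feature ZAP objects,
 * activating LZ4_COMPRESS where it is enabled but not yet active, and
 * writing out the checksum salt.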
8838 */ 8839 static void 8840 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) 8841 { 8842 if (spa_sync_pass(spa) != 1) 8843 return; 8844 8845 dsl_pool_t *dp = spa->spa_dsl_pool; 8846 rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); 8847 8848 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 8849 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 8850 dsl_pool_create_origin(dp, tx); 8851 8852 /* Keeping the origin open increases spa_minref */ 8853 spa->spa_minref += 3; 8854 } 8855 8856 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 8857 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 8858 dsl_pool_upgrade_clones(dp, tx); 8859 } 8860 8861 if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && 8862 spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { 8863 dsl_pool_upgrade_dir_clones(dp, tx); 8864 8865 /* Keeping the freedir open increases spa_minref */ 8866 spa->spa_minref += 3; 8867 } 8868 8869 if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && 8870 spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 8871 spa_feature_create_zap_objects(spa, tx); 8872 } 8873 8874 /* 8875 * LZ4_COMPRESS feature's behaviour was changed to activate_on_enable 8876 * when possibility to use lz4 compression for metadata was added 8877 * Old pools that have this feature enabled must be upgraded to have 8878 * this feature active 8879 */ 8880 if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 8881 boolean_t lz4_en = spa_feature_is_enabled(spa, 8882 SPA_FEATURE_LZ4_COMPRESS); 8883 boolean_t lz4_ac = spa_feature_is_active(spa, 8884 SPA_FEATURE_LZ4_COMPRESS); 8885 8886 if (lz4_en && !lz4_ac) 8887 spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); 8888 } 8889 8890 /* 8891 * If we haven't written the salt, do so now. Note that the 8892 * feature may not be activated yet, but that's fine since 8893 * the presence of this ZAP entry is backwards compatible. 8894 */ 8895 if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 8896 DMU_POOL_CHECKSUM_SALT) == ENOENT) { 8897 VERIFY0(zap_add(spa->spa_meta_objset, 8898 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, 8899 sizeof (spa->spa_cksum_salt.zcs_bytes), 8900 spa->spa_cksum_salt.zcs_bytes, tx)); 8901 } 8902 8903 rrw_exit(&dp->dp_config_rwlock, FTAG); 8904 } 8905 8906 static void 8907 vdev_indirect_state_sync_verify(vdev_t *vd) 8908 { 8909 vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping; 8910 vdev_indirect_births_t *vib __maybe_unused = vd->vdev_indirect_births; 8911 8912 if (vd->vdev_ops == &vdev_indirect_ops) { 8913 ASSERT(vim != NULL); 8914 ASSERT(vib != NULL); 8915 } 8916 8917 uint64_t obsolete_sm_object = 0; 8918 ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); 8919 if (obsolete_sm_object != 0) { 8920 ASSERT(vd->vdev_obsolete_sm != NULL); 8921 ASSERT(vd->vdev_removing || 8922 vd->vdev_ops == &vdev_indirect_ops); 8923 ASSERT(vdev_indirect_mapping_num_entries(vim) > 0); 8924 ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0); 8925 ASSERT3U(obsolete_sm_object, ==, 8926 space_map_object(vd->vdev_obsolete_sm)); 8927 ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=, 8928 space_map_allocated(vd->vdev_obsolete_sm)); 8929 } 8930 ASSERT(vd->vdev_obsolete_segments != NULL); 8931 8932 /* 8933 * Since frees / remaps to an indirect vdev can only 8934 * happen in syncing context, the obsolete segments 8935 * tree must be empty when we start syncing. 
8936 */ 8937 ASSERT0(range_tree_space(vd->vdev_obsolete_segments)); 8938 } 8939 8940 /* 8941 * Set the top-level vdev's max queue depth. Evaluate each top-level's 8942 * async write queue depth in case it changed. The max queue depth will 8943 * not change in the middle of syncing out this txg. 8944 */ 8945 static void 8946 spa_sync_adjust_vdev_max_queue_depth(spa_t *spa) 8947 { 8948 ASSERT(spa_writeable(spa)); 8949 8950 vdev_t *rvd = spa->spa_root_vdev; 8951 uint32_t max_queue_depth = zfs_vdev_async_write_max_active * 8952 zfs_vdev_queue_depth_pct / 100; 8953 metaslab_class_t *normal = spa_normal_class(spa); 8954 metaslab_class_t *special = spa_special_class(spa); 8955 metaslab_class_t *dedup = spa_dedup_class(spa); 8956 8957 uint64_t slots_per_allocator = 0; 8958 for (int c = 0; c < rvd->vdev_children; c++) { 8959 vdev_t *tvd = rvd->vdev_child[c]; 8960 8961 metaslab_group_t *mg = tvd->vdev_mg; 8962 if (mg == NULL || !metaslab_group_initialized(mg)) 8963 continue; 8964 8965 metaslab_class_t *mc = mg->mg_class; 8966 if (mc != normal && mc != special && mc != dedup) 8967 continue; 8968 8969 /* 8970 * It is safe to do a lock-free check here because only async 8971 * allocations look at mg_max_alloc_queue_depth, and async 8972 * allocations all happen from spa_sync(). 8973 */ 8974 for (int i = 0; i < mg->mg_allocators; i++) { 8975 ASSERT0(zfs_refcount_count( 8976 &(mg->mg_allocator[i].mga_alloc_queue_depth))); 8977 } 8978 mg->mg_max_alloc_queue_depth = max_queue_depth; 8979 8980 for (int i = 0; i < mg->mg_allocators; i++) { 8981 mg->mg_allocator[i].mga_cur_max_alloc_queue_depth = 8982 zfs_vdev_def_queue_depth; 8983 } 8984 slots_per_allocator += zfs_vdev_def_queue_depth; 8985 } 8986 8987 for (int i = 0; i < spa->spa_alloc_count; i++) { 8988 ASSERT0(zfs_refcount_count(&normal->mc_allocator[i]. 8989 mca_alloc_slots)); 8990 ASSERT0(zfs_refcount_count(&special->mc_allocator[i]. 8991 mca_alloc_slots)); 8992 ASSERT0(zfs_refcount_count(&dedup->mc_allocator[i]. 
8993 mca_alloc_slots)); 8994 normal->mc_allocator[i].mca_alloc_max_slots = 8995 slots_per_allocator; 8996 special->mc_allocator[i].mca_alloc_max_slots = 8997 slots_per_allocator; 8998 dedup->mc_allocator[i].mca_alloc_max_slots = 8999 slots_per_allocator; 9000 } 9001 normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; 9002 special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; 9003 dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; 9004 } 9005 9006 static void 9007 spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx) 9008 { 9009 ASSERT(spa_writeable(spa)); 9010 9011 vdev_t *rvd = spa->spa_root_vdev; 9012 for (int c = 0; c < rvd->vdev_children; c++) { 9013 vdev_t *vd = rvd->vdev_child[c]; 9014 vdev_indirect_state_sync_verify(vd); 9015 9016 if (vdev_indirect_should_condense(vd)) { 9017 spa_condense_indirect_start_sync(vd, tx); 9018 break; 9019 } 9020 } 9021 } 9022 9023 static void 9024 spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) 9025 { 9026 objset_t *mos = spa->spa_meta_objset; 9027 dsl_pool_t *dp = spa->spa_dsl_pool; 9028 uint64_t txg = tx->tx_txg; 9029 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; 9030 9031 do { 9032 int pass = ++spa->spa_sync_pass; 9033 9034 spa_sync_config_object(spa, tx); 9035 spa_sync_aux_dev(spa, &spa->spa_spares, tx, 9036 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 9037 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 9038 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 9039 spa_errlog_sync(spa, txg); 9040 dsl_pool_sync(dp, txg); 9041 9042 if (pass < zfs_sync_pass_deferred_free || 9043 spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { 9044 /* 9045 * If the log space map feature is active we don't 9046 * care about deferred frees and the deferred bpobj 9047 * as the log space map should effectively have the 9048 * same results (i.e. appending only to one object). 9049 */ 9050 spa_sync_frees(spa, free_bpl, tx); 9051 } else { 9052 /* 9053 * We can not defer frees in pass 1, because 9054 * we sync the deferred frees later in pass 1. 9055 */ 9056 ASSERT3U(pass, >, 1); 9057 bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb, 9058 &spa->spa_deferred_bpobj, tx); 9059 } 9060 9061 ddt_sync(spa, txg); 9062 dsl_scan_sync(dp, tx); 9063 svr_sync(spa, tx); 9064 spa_sync_upgrades(spa, tx); 9065 9066 spa_flush_metaslabs(spa, tx); 9067 9068 vdev_t *vd = NULL; 9069 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 9070 != NULL) 9071 vdev_sync(vd, txg); 9072 9073 /* 9074 * Note: We need to check if the MOS is dirty because we could 9075 * have marked the MOS dirty without updating the uberblock 9076 * (e.g. if we have sync tasks but no dirty user data). We need 9077 * to check the uberblock's rootbp because it is updated if we 9078 * have synced out dirty data (though in this case the MOS will 9079 * most likely also be dirty due to second order effects, we 9080 * don't want to rely on that here). 9081 */ 9082 if (pass == 1 && 9083 spa->spa_uberblock.ub_rootbp.blk_birth < txg && 9084 !dmu_objset_is_dirty(mos, txg)) { 9085 /* 9086 * Nothing changed on the first pass, therefore this 9087 * TXG is a no-op. Avoid syncing deferred frees, so 9088 * that we can keep this TXG as a no-op. 
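 * The assertions below back this up: no dirty datasets, dirty dirs, sync
 * tasks or early sync tasks may be queued for this txg.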
9089 */ 9090 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 9091 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 9092 ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); 9093 ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg)); 9094 break; 9095 } 9096 9097 spa_sync_deferred_frees(spa, tx); 9098 } while (dmu_objset_is_dirty(mos, txg)); 9099 } 9100 9101 /* 9102 * Rewrite the vdev configuration (which includes the uberblock) to 9103 * commit the transaction group. 9104 * 9105 * If there are no dirty vdevs, we sync the uberblock to a few random 9106 * top-level vdevs that are known to be visible in the config cache 9107 * (see spa_vdev_add() for a complete description). If there *are* dirty 9108 * vdevs, sync the uberblock to all vdevs. 9109 */ 9110 static void 9111 spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx) 9112 { 9113 vdev_t *rvd = spa->spa_root_vdev; 9114 uint64_t txg = tx->tx_txg; 9115 9116 for (;;) { 9117 int error = 0; 9118 9119 /* 9120 * We hold SCL_STATE to prevent vdev open/close/etc. 9121 * while we're attempting to write the vdev labels. 9122 */ 9123 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 9124 9125 if (list_is_empty(&spa->spa_config_dirty_list)) { 9126 vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; 9127 int svdcount = 0; 9128 int children = rvd->vdev_children; 9129 int c0 = random_in_range(children); 9130 9131 for (int c = 0; c < children; c++) { 9132 vdev_t *vd = 9133 rvd->vdev_child[(c0 + c) % children]; 9134 9135 /* Stop when revisiting the first vdev */ 9136 if (c > 0 && svd[0] == vd) 9137 break; 9138 9139 if (vd->vdev_ms_array == 0 || 9140 vd->vdev_islog || 9141 !vdev_is_concrete(vd)) 9142 continue; 9143 9144 svd[svdcount++] = vd; 9145 if (svdcount == SPA_SYNC_MIN_VDEVS) 9146 break; 9147 } 9148 error = vdev_config_sync(svd, svdcount, txg); 9149 } else { 9150 error = vdev_config_sync(rvd->vdev_child, 9151 rvd->vdev_children, txg); 9152 } 9153 9154 if (error == 0) 9155 spa->spa_last_synced_guid = rvd->vdev_guid; 9156 9157 spa_config_exit(spa, SCL_STATE, FTAG); 9158 9159 if (error == 0) 9160 break; 9161 zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR); 9162 zio_resume_wait(spa); 9163 } 9164 } 9165 9166 /* 9167 * Sync the specified transaction group. New blocks may be dirtied as 9168 * part of the process, so we iterate until it converges. 9169 */ 9170 void 9171 spa_sync(spa_t *spa, uint64_t txg) 9172 { 9173 vdev_t *vd = NULL; 9174 9175 VERIFY(spa_writeable(spa)); 9176 9177 /* 9178 * Wait for i/os issued in open context that need to complete 9179 * before this txg syncs. 9180 */ 9181 (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]); 9182 spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, 9183 ZIO_FLAG_CANFAIL); 9184 9185 /* 9186 * Lock out configuration changes. 9187 */ 9188 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 9189 9190 spa->spa_syncing_txg = txg; 9191 spa->spa_sync_pass = 0; 9192 9193 for (int i = 0; i < spa->spa_alloc_count; i++) { 9194 mutex_enter(&spa->spa_allocs[i].spaa_lock); 9195 VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree)); 9196 mutex_exit(&spa->spa_allocs[i].spaa_lock); 9197 } 9198 9199 /* 9200 * If there are any pending vdev state changes, convert them 9201 * into config changes that go out with this transaction group. 9202 */ 9203 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 9204 while (list_head(&spa->spa_state_dirty_list) != NULL) { 9205 /* 9206 * We need the write lock here because, for aux vdevs, 9207 * calling vdev_config_dirty() modifies sav_config. 
9208 * This is ugly and will become unnecessary when we 9209 * eliminate the aux vdev wart by integrating all vdevs 9210 * into the root vdev tree. 9211 */ 9212 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 9213 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 9214 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 9215 vdev_state_clean(vd); 9216 vdev_config_dirty(vd); 9217 } 9218 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 9219 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 9220 } 9221 spa_config_exit(spa, SCL_STATE, FTAG); 9222 9223 dsl_pool_t *dp = spa->spa_dsl_pool; 9224 dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); 9225 9226 spa->spa_sync_starttime = gethrtime(); 9227 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); 9228 spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, 9229 spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + 9230 NSEC_TO_TICK(spa->spa_deadman_synctime)); 9231 9232 /* 9233 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 9234 * set spa_deflate if we have no raid-z vdevs. 9235 */ 9236 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 9237 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 9238 vdev_t *rvd = spa->spa_root_vdev; 9239 9240 int i; 9241 for (i = 0; i < rvd->vdev_children; i++) { 9242 vd = rvd->vdev_child[i]; 9243 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 9244 break; 9245 } 9246 if (i == rvd->vdev_children) { 9247 spa->spa_deflate = TRUE; 9248 VERIFY0(zap_add(spa->spa_meta_objset, 9249 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 9250 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 9251 } 9252 } 9253 9254 spa_sync_adjust_vdev_max_queue_depth(spa); 9255 9256 spa_sync_condense_indirect(spa, tx); 9257 9258 spa_sync_iterate_to_convergence(spa, tx); 9259 9260 #ifdef ZFS_DEBUG 9261 if (!list_is_empty(&spa->spa_config_dirty_list)) { 9262 /* 9263 * Make sure that the number of ZAPs for all the vdevs matches 9264 * the number of ZAPs in the per-vdev ZAP list. This only gets 9265 * called if the config is dirty; otherwise there may be 9266 * outstanding AVZ operations that weren't completed in 9267 * spa_sync_config_object. 9268 */ 9269 uint64_t all_vdev_zap_entry_count; 9270 ASSERT0(zap_count(spa->spa_meta_objset, 9271 spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); 9272 ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, 9273 all_vdev_zap_entry_count); 9274 } 9275 #endif 9276 9277 if (spa->spa_vdev_removal != NULL) { 9278 ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]); 9279 } 9280 9281 spa_sync_rewrite_vdev_config(spa, tx); 9282 dmu_tx_commit(tx); 9283 9284 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); 9285 spa->spa_deadman_tqid = 0; 9286 9287 /* 9288 * Clear the dirty config list. 9289 */ 9290 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 9291 vdev_config_clean(vd); 9292 9293 /* 9294 * Now that the new config has synced transactionally, 9295 * let it become visible to the config cache. 9296 */ 9297 if (spa->spa_config_syncing != NULL) { 9298 spa_config_set(spa, spa->spa_config_syncing); 9299 spa->spa_config_txg = txg; 9300 spa->spa_config_syncing = NULL; 9301 } 9302 9303 dsl_pool_sync_done(dp, txg); 9304 9305 for (int i = 0; i < spa->spa_alloc_count; i++) { 9306 mutex_enter(&spa->spa_allocs[i].spaa_lock); 9307 VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree)); 9308 mutex_exit(&spa->spa_allocs[i].spaa_lock); 9309 } 9310 9311 /* 9312 * Update usable space statistics. 
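 * vdev_sync_done() runs for each vdev on this txg's clean list, old
 * metaslabs are evicted, and spa_update_dspace() refreshes the pool-wide
 * space accounting.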
9313 */ 9314 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 9315 != NULL) 9316 vdev_sync_done(vd, txg); 9317 9318 metaslab_class_evict_old(spa->spa_normal_class, txg); 9319 metaslab_class_evict_old(spa->spa_log_class, txg); 9320 9321 spa_sync_close_syncing_log_sm(spa); 9322 9323 spa_update_dspace(spa); 9324 9325 /* 9326 * It had better be the case that we didn't dirty anything 9327 * since vdev_config_sync(). 9328 */ 9329 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 9330 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 9331 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 9332 9333 while (zfs_pause_spa_sync) 9334 delay(1); 9335 9336 spa->spa_sync_pass = 0; 9337 9338 /* 9339 * Update the last synced uberblock here. We want to do this at 9340 * the end of spa_sync() so that consumers of spa_last_synced_txg() 9341 * will be guaranteed that all the processing associated with 9342 * that txg has been completed. 9343 */ 9344 spa->spa_ubsync = spa->spa_uberblock; 9345 spa_config_exit(spa, SCL_CONFIG, FTAG); 9346 9347 spa_handle_ignored_writes(spa); 9348 9349 /* 9350 * If any async tasks have been requested, kick them off. 9351 */ 9352 spa_async_dispatch(spa); 9353 } 9354 9355 /* 9356 * Sync all pools. We don't want to hold the namespace lock across these 9357 * operations, so we take a reference on the spa_t and drop the lock during the 9358 * sync. 9359 */ 9360 void 9361 spa_sync_allpools(void) 9362 { 9363 spa_t *spa = NULL; 9364 mutex_enter(&spa_namespace_lock); 9365 while ((spa = spa_next(spa)) != NULL) { 9366 if (spa_state(spa) != POOL_STATE_ACTIVE || 9367 !spa_writeable(spa) || spa_suspended(spa)) 9368 continue; 9369 spa_open_ref(spa, FTAG); 9370 mutex_exit(&spa_namespace_lock); 9371 txg_wait_synced(spa_get_dsl(spa), 0); 9372 mutex_enter(&spa_namespace_lock); 9373 spa_close(spa, FTAG); 9374 } 9375 mutex_exit(&spa_namespace_lock); 9376 } 9377 9378 /* 9379 * ========================================================================== 9380 * Miscellaneous routines 9381 * ========================================================================== 9382 */ 9383 9384 /* 9385 * Remove all pools in the system. 9386 */ 9387 void 9388 spa_evict_all(void) 9389 { 9390 spa_t *spa; 9391 9392 /* 9393 * Remove all cached state. All pools should be closed now, 9394 * so every spa in the AVL tree should be unreferenced. 9395 */ 9396 mutex_enter(&spa_namespace_lock); 9397 while ((spa = spa_next(NULL)) != NULL) { 9398 /* 9399 * Stop async tasks. The async thread may need to detach 9400 * a device that's been replaced, which requires grabbing 9401 * spa_namespace_lock, so we must drop it here. 
9402 */ 9403 spa_open_ref(spa, FTAG); 9404 mutex_exit(&spa_namespace_lock); 9405 spa_async_suspend(spa); 9406 mutex_enter(&spa_namespace_lock); 9407 spa_close(spa, FTAG); 9408 9409 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 9410 spa_unload(spa); 9411 spa_deactivate(spa); 9412 } 9413 spa_remove(spa); 9414 } 9415 mutex_exit(&spa_namespace_lock); 9416 } 9417 9418 vdev_t * 9419 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 9420 { 9421 vdev_t *vd; 9422 int i; 9423 9424 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 9425 return (vd); 9426 9427 if (aux) { 9428 for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 9429 vd = spa->spa_l2cache.sav_vdevs[i]; 9430 if (vd->vdev_guid == guid) 9431 return (vd); 9432 } 9433 9434 for (i = 0; i < spa->spa_spares.sav_count; i++) { 9435 vd = spa->spa_spares.sav_vdevs[i]; 9436 if (vd->vdev_guid == guid) 9437 return (vd); 9438 } 9439 } 9440 9441 return (NULL); 9442 } 9443 9444 void 9445 spa_upgrade(spa_t *spa, uint64_t version) 9446 { 9447 ASSERT(spa_writeable(spa)); 9448 9449 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 9450 9451 /* 9452 * This should only be called for a non-faulted pool, and since a 9453 * future version would result in an unopenable pool, this shouldn't be 9454 * possible. 9455 */ 9456 ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); 9457 ASSERT3U(version, >=, spa->spa_uberblock.ub_version); 9458 9459 spa->spa_uberblock.ub_version = version; 9460 vdev_config_dirty(spa->spa_root_vdev); 9461 9462 spa_config_exit(spa, SCL_ALL, FTAG); 9463 9464 txg_wait_synced(spa_get_dsl(spa), 0); 9465 } 9466 9467 static boolean_t 9468 spa_has_aux_vdev(spa_t *spa, uint64_t guid, spa_aux_vdev_t *sav) 9469 { 9470 (void) spa; 9471 int i; 9472 uint64_t vdev_guid; 9473 9474 for (i = 0; i < sav->sav_count; i++) 9475 if (sav->sav_vdevs[i]->vdev_guid == guid) 9476 return (B_TRUE); 9477 9478 for (i = 0; i < sav->sav_npending; i++) { 9479 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 9480 &vdev_guid) == 0 && vdev_guid == guid) 9481 return (B_TRUE); 9482 } 9483 9484 return (B_FALSE); 9485 } 9486 9487 boolean_t 9488 spa_has_l2cache(spa_t *spa, uint64_t guid) 9489 { 9490 return (spa_has_aux_vdev(spa, guid, &spa->spa_l2cache)); 9491 } 9492 9493 boolean_t 9494 spa_has_spare(spa_t *spa, uint64_t guid) 9495 { 9496 return (spa_has_aux_vdev(spa, guid, &spa->spa_spares)); 9497 } 9498 9499 /* 9500 * Check if a pool has an active shared spare device. 9501 * Note: reference count of an active spare is 2, as a spare and as a replace 9502 */ 9503 static boolean_t 9504 spa_has_active_shared_spare(spa_t *spa) 9505 { 9506 int i, refcnt; 9507 uint64_t pool; 9508 spa_aux_vdev_t *sav = &spa->spa_spares; 9509 9510 for (i = 0; i < sav->sav_count; i++) { 9511 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 9512 &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 9513 refcnt > 2) 9514 return (B_TRUE); 9515 } 9516 9517 return (B_FALSE); 9518 } 9519 9520 uint64_t 9521 spa_total_metaslabs(spa_t *spa) 9522 { 9523 vdev_t *rvd = spa->spa_root_vdev; 9524 9525 uint64_t m = 0; 9526 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 9527 vdev_t *vd = rvd->vdev_child[c]; 9528 if (!vdev_is_concrete(vd)) 9529 continue; 9530 m += vd->vdev_ms_count; 9531 } 9532 return (m); 9533 } 9534 9535 /* 9536 * Notify any waiting threads that some activity has switched from being in- 9537 * progress to not-in-progress so that the thread can wake up and determine 9538 * whether it is finished waiting. 
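 *
 * Schematically, a completing thread updates its activity state under the
 * activity-specific lock and only then notifies; an illustrative sketch
 * (not a verbatim copy of any one caller) for the initialize activity:
 *
 *	mutex_enter(&vd->vdev_initialize_lock);
 *	vd->vdev_initialize_state = VDEV_INITIALIZE_COMPLETE;
 *	mutex_exit(&vd->vdev_initialize_lock);
 *	spa_notify_waiters(spa);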
9539 */ 9540 void 9541 spa_notify_waiters(spa_t *spa) 9542 { 9543 /* 9544 * Acquiring spa_activities_lock here prevents the cv_broadcast from 9545 * happening between the waiting thread's check and cv_wait. 9546 */ 9547 mutex_enter(&spa->spa_activities_lock); 9548 cv_broadcast(&spa->spa_activities_cv); 9549 mutex_exit(&spa->spa_activities_lock); 9550 } 9551 9552 /* 9553 * Notify any waiting threads that the pool is exporting, and then block until 9554 * they are finished using the spa_t. 9555 */ 9556 void 9557 spa_wake_waiters(spa_t *spa) 9558 { 9559 mutex_enter(&spa->spa_activities_lock); 9560 spa->spa_waiters_cancel = B_TRUE; 9561 cv_broadcast(&spa->spa_activities_cv); 9562 while (spa->spa_waiters != 0) 9563 cv_wait(&spa->spa_waiters_cv, &spa->spa_activities_lock); 9564 spa->spa_waiters_cancel = B_FALSE; 9565 mutex_exit(&spa->spa_activities_lock); 9566 } 9567 9568 /* Whether the vdev or any of its descendants are being initialized/trimmed. */ 9569 static boolean_t 9570 spa_vdev_activity_in_progress_impl(vdev_t *vd, zpool_wait_activity_t activity) 9571 { 9572 spa_t *spa = vd->vdev_spa; 9573 9574 ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER)); 9575 ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); 9576 ASSERT(activity == ZPOOL_WAIT_INITIALIZE || 9577 activity == ZPOOL_WAIT_TRIM); 9578 9579 kmutex_t *lock = activity == ZPOOL_WAIT_INITIALIZE ? 9580 &vd->vdev_initialize_lock : &vd->vdev_trim_lock; 9581 9582 mutex_exit(&spa->spa_activities_lock); 9583 mutex_enter(lock); 9584 mutex_enter(&spa->spa_activities_lock); 9585 9586 boolean_t in_progress = (activity == ZPOOL_WAIT_INITIALIZE) ? 9587 (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) : 9588 (vd->vdev_trim_state == VDEV_TRIM_ACTIVE); 9589 mutex_exit(lock); 9590 9591 if (in_progress) 9592 return (B_TRUE); 9593 9594 for (int i = 0; i < vd->vdev_children; i++) { 9595 if (spa_vdev_activity_in_progress_impl(vd->vdev_child[i], 9596 activity)) 9597 return (B_TRUE); 9598 } 9599 9600 return (B_FALSE); 9601 } 9602 9603 /* 9604 * If use_guid is true, this checks whether the vdev specified by guid is 9605 * being initialized/trimmed. Otherwise, it checks whether any vdev in the pool 9606 * is being initialized/trimmed. The caller must hold the config lock and 9607 * spa_activities_lock. 9608 */ 9609 static int 9610 spa_vdev_activity_in_progress(spa_t *spa, boolean_t use_guid, uint64_t guid, 9611 zpool_wait_activity_t activity, boolean_t *in_progress) 9612 { 9613 mutex_exit(&spa->spa_activities_lock); 9614 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 9615 mutex_enter(&spa->spa_activities_lock); 9616 9617 vdev_t *vd; 9618 if (use_guid) { 9619 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 9620 if (vd == NULL || !vd->vdev_ops->vdev_op_leaf) { 9621 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 9622 return (EINVAL); 9623 } 9624 } else { 9625 vd = spa->spa_root_vdev; 9626 } 9627 9628 *in_progress = spa_vdev_activity_in_progress_impl(vd, activity); 9629 9630 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 9631 return (0); 9632 } 9633 9634 /* 9635 * Locking for waiting threads 9636 * --------------------------- 9637 * 9638 * Waiting threads need a way to check whether a given activity is in progress, 9639 * and then, if it is, wait for it to complete. Each activity will have some 9640 * in-memory representation of the relevant on-disk state which can be used to 9641 * determine whether or not the activity is in progress. 
The in-memory state and 9642 * the locking used to protect it will be different for each activity, and may 9643 * not be suitable for use with a cvar (e.g., some state is protected by the 9644 * config lock). To allow waiting threads to wait without any races, another 9645 * lock, spa_activities_lock, is used. 9646 * 9647 * When the state is checked, both the activity-specific lock (if there is one) 9648 * and spa_activities_lock are held. In some cases, the activity-specific lock 9649 * is acquired explicitly (e.g. the config lock). In others, the locking is 9650 * internal to some check (e.g. bpobj_is_empty). After checking, the waiting 9651 * thread releases the activity-specific lock and, if the activity is in 9652 * progress, then cv_waits using spa_activities_lock. 9653 * 9654 * The waiting thread is woken when another thread, one completing some 9655 * activity, updates the state of the activity and then calls 9656 * spa_notify_waiters, which will cv_broadcast. This 'completing' thread only 9657 * needs to hold its activity-specific lock when updating the state, and this 9658 * lock can (but doesn't have to) be dropped before calling spa_notify_waiters. 9659 * 9660 * Because spa_notify_waiters acquires spa_activities_lock before broadcasting, 9661 * and because it is held when the waiting thread checks the state of the 9662 * activity, it can never be the case that the completing thread both updates 9663 * the activity state and cv_broadcasts in between the waiting thread's check 9664 * and cv_wait. Thus, a waiting thread can never miss a wakeup. 9665 * 9666 * In order to prevent deadlock, when the waiting thread does its check, in some 9667 * cases it will temporarily drop spa_activities_lock in order to acquire the 9668 * activity-specific lock. The order in which spa_activities_lock and the 9669 * activity specific lock are acquired in the waiting thread is determined by 9670 * the order in which they are acquired in the completing thread; if the 9671 * completing thread calls spa_notify_waiters with the activity-specific lock 9672 * held, then the waiting thread must also acquire the activity-specific lock 9673 * first. 
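 *
 * Putting the two halves together, the waiting side is the classic predicate
 * loop, sketched below for illustration only; spa_wait_common() is the real
 * implementation, which also handles errors, signals, and waiter accounting,
 * and check_activity() here merely stands in for spa_activity_in_progress():
 *
 *	mutex_enter(&spa->spa_activities_lock);
 *	while (check_activity(spa))
 *		cv_wait(&spa->spa_activities_cv, &spa->spa_activities_lock);
 *	mutex_exit(&spa->spa_activities_lock);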
9674 */ 9675 9676 static int 9677 spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, 9678 boolean_t use_tag, uint64_t tag, boolean_t *in_progress) 9679 { 9680 int error = 0; 9681 9682 ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); 9683 9684 switch (activity) { 9685 case ZPOOL_WAIT_CKPT_DISCARD: 9686 *in_progress = 9687 (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT) && 9688 zap_contains(spa_meta_objset(spa), 9689 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT) == 9690 ENOENT); 9691 break; 9692 case ZPOOL_WAIT_FREE: 9693 *in_progress = ((spa_version(spa) >= SPA_VERSION_DEADLISTS && 9694 !bpobj_is_empty(&spa->spa_dsl_pool->dp_free_bpobj)) || 9695 spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY) || 9696 spa_livelist_delete_check(spa)); 9697 break; 9698 case ZPOOL_WAIT_INITIALIZE: 9699 case ZPOOL_WAIT_TRIM: 9700 error = spa_vdev_activity_in_progress(spa, use_tag, tag, 9701 activity, in_progress); 9702 break; 9703 case ZPOOL_WAIT_REPLACE: 9704 mutex_exit(&spa->spa_activities_lock); 9705 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 9706 mutex_enter(&spa->spa_activities_lock); 9707 9708 *in_progress = vdev_replace_in_progress(spa->spa_root_vdev); 9709 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 9710 break; 9711 case ZPOOL_WAIT_REMOVE: 9712 *in_progress = (spa->spa_removing_phys.sr_state == 9713 DSS_SCANNING); 9714 break; 9715 case ZPOOL_WAIT_RESILVER: 9716 if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev))) 9717 break; 9718 fallthrough; 9719 case ZPOOL_WAIT_SCRUB: 9720 { 9721 boolean_t scanning, paused, is_scrub; 9722 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; 9723 9724 is_scrub = (scn->scn_phys.scn_func == POOL_SCAN_SCRUB); 9725 scanning = (scn->scn_phys.scn_state == DSS_SCANNING); 9726 paused = dsl_scan_is_paused_scrub(scn); 9727 *in_progress = (scanning && !paused && 9728 is_scrub == (activity == ZPOOL_WAIT_SCRUB)); 9729 break; 9730 } 9731 default: 9732 panic("unrecognized value for activity %d", activity); 9733 } 9734 9735 return (error); 9736 } 9737 9738 static int 9739 spa_wait_common(const char *pool, zpool_wait_activity_t activity, 9740 boolean_t use_tag, uint64_t tag, boolean_t *waited) 9741 { 9742 /* 9743 * The tag is used to distinguish between instances of an activity. 9744 * 'initialize' and 'trim' are the only activities that we use this for. 9745 * The other activities can only have a single instance in progress in a 9746 * pool at one time, making the tag unnecessary. 9747 * 9748 * There can be multiple devices being replaced at once, but since they 9749 * all finish once resilvering finishes, we don't bother keeping track 9750 * of them individually, we just wait for them all to finish. 9751 */ 9752 if (use_tag && activity != ZPOOL_WAIT_INITIALIZE && 9753 activity != ZPOOL_WAIT_TRIM) 9754 return (EINVAL); 9755 9756 if (activity < 0 || activity >= ZPOOL_WAIT_NUM_ACTIVITIES) 9757 return (EINVAL); 9758 9759 spa_t *spa; 9760 int error = spa_open(pool, &spa, FTAG); 9761 if (error != 0) 9762 return (error); 9763 9764 /* 9765 * Increment the spa's waiter count so that we can call spa_close and 9766 * still ensure that the spa_t doesn't get freed before this thread is 9767 * finished with it when the pool is exported. We want to call spa_close 9768 * before we start waiting because otherwise the additional ref would 9769 * prevent the pool from being exported or destroyed throughout the 9770 * potentially long wait. 
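 *
 * The export path calls spa_wake_waiters(), which sets spa_waiters_cancel
 * and then blocks on spa_waiters_cv until spa_waiters drops back to zero,
 * so the spa_t remains valid for as long as we are counted here.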
9771  */
9772 	mutex_enter(&spa->spa_activities_lock);
9773 	spa->spa_waiters++;
9774 	spa_close(spa, FTAG);
9775 
9776 	*waited = B_FALSE;
9777 	for (;;) {
9778 		boolean_t in_progress;
9779 		error = spa_activity_in_progress(spa, activity, use_tag, tag,
9780 		    &in_progress);
9781 
9782 		if (error || !in_progress || spa->spa_waiters_cancel)
9783 			break;
9784 
9785 		*waited = B_TRUE;
9786 
9787 		if (cv_wait_sig(&spa->spa_activities_cv,
9788 		    &spa->spa_activities_lock) == 0) {
9789 			error = EINTR;
9790 			break;
9791 		}
9792 	}
9793 
9794 	spa->spa_waiters--;
9795 	cv_signal(&spa->spa_waiters_cv);
9796 	mutex_exit(&spa->spa_activities_lock);
9797 
9798 	return (error);
9799 }
9800 
9801 /*
9802  * Wait for a particular instance of the specified activity to complete, where
9803  * the instance is identified by 'tag'.
9804  */
9805 int
9806 spa_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag,
9807     boolean_t *waited)
9808 {
9809 	return (spa_wait_common(pool, activity, B_TRUE, tag, waited));
9810 }
9811 
9812 /*
9813  * Wait for all instances of the specified activity to complete.
9814  */
9815 int
9816 spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited)
9817 {
9818 
9819 	return (spa_wait_common(pool, activity, B_FALSE, 0, waited));
9820 }
9821 
9822 sysevent_t *
9823 spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
9824 {
9825 	sysevent_t *ev = NULL;
9826 #ifdef _KERNEL
9827 	nvlist_t *resource;
9828 
9829 	resource = zfs_event_create(spa, vd, FM_SYSEVENT_CLASS, name, hist_nvl);
9830 	if (resource) {
9831 		ev = kmem_alloc(sizeof (sysevent_t), KM_SLEEP);
9832 		ev->resource = resource;
9833 	}
9834 #else
9835 	(void) spa, (void) vd, (void) hist_nvl, (void) name;
9836 #endif
9837 	return (ev);
9838 }
9839 
9840 void
9841 spa_event_post(sysevent_t *ev)
9842 {
9843 #ifdef _KERNEL
9844 	if (ev) {
9845 		zfs_zevent_post(ev->resource, NULL, zfs_zevent_post_cb);
9846 		kmem_free(ev, sizeof (*ev));
9847 	}
9848 #else
9849 	(void) ev;
9850 #endif
9851 }
9852 
9853 /*
9854  * Post a zevent corresponding to the given sysevent. The 'name' must be one
9855  * of the event definitions in sys/sysevent/eventdefs.h. The payload will be
9856  * filled in from the spa and (optionally) the vdev. This doesn't do anything
9857  * in the userland libzpool, as we don't want consumers to misinterpret ztest
9858  * or zdb as real changes.
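 *
 * Typical usage is a single call with one of those event class constants,
 * e.g. (illustrative only):
 *
 *	spa_event_notify(spa, NULL, NULL, ESC_ZFS_CONFIG_SYNC);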
9859  */
9860 void
9861 spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
9862 {
9863 	spa_event_post(spa_event_create(spa, vd, hist_nvl, name));
9864 }
9865 
9866 /* state manipulation functions */
9867 EXPORT_SYMBOL(spa_open);
9868 EXPORT_SYMBOL(spa_open_rewind);
9869 EXPORT_SYMBOL(spa_get_stats);
9870 EXPORT_SYMBOL(spa_create);
9871 EXPORT_SYMBOL(spa_import);
9872 EXPORT_SYMBOL(spa_tryimport);
9873 EXPORT_SYMBOL(spa_destroy);
9874 EXPORT_SYMBOL(spa_export);
9875 EXPORT_SYMBOL(spa_reset);
9876 EXPORT_SYMBOL(spa_async_request);
9877 EXPORT_SYMBOL(spa_async_suspend);
9878 EXPORT_SYMBOL(spa_async_resume);
9879 EXPORT_SYMBOL(spa_inject_addref);
9880 EXPORT_SYMBOL(spa_inject_delref);
9881 EXPORT_SYMBOL(spa_scan_stat_init);
9882 EXPORT_SYMBOL(spa_scan_get_stats);
9883 
9884 /* device manipulation */
9885 EXPORT_SYMBOL(spa_vdev_add);
9886 EXPORT_SYMBOL(spa_vdev_attach);
9887 EXPORT_SYMBOL(spa_vdev_detach);
9888 EXPORT_SYMBOL(spa_vdev_setpath);
9889 EXPORT_SYMBOL(spa_vdev_setfru);
9890 EXPORT_SYMBOL(spa_vdev_split_mirror);
9891 
9892 /* spare state (which is global across all pools) */
9893 EXPORT_SYMBOL(spa_spare_add);
9894 EXPORT_SYMBOL(spa_spare_remove);
9895 EXPORT_SYMBOL(spa_spare_exists);
9896 EXPORT_SYMBOL(spa_spare_activate);
9897 
9898 /* L2ARC state (which is global across all pools) */
9899 EXPORT_SYMBOL(spa_l2cache_add);
9900 EXPORT_SYMBOL(spa_l2cache_remove);
9901 EXPORT_SYMBOL(spa_l2cache_exists);
9902 EXPORT_SYMBOL(spa_l2cache_activate);
9903 EXPORT_SYMBOL(spa_l2cache_drop);
9904 
9905 /* scanning */
9906 EXPORT_SYMBOL(spa_scan);
9907 EXPORT_SYMBOL(spa_scan_stop);
9908 
9909 /* spa syncing */
9910 EXPORT_SYMBOL(spa_sync); /* only for DMU use */
9911 EXPORT_SYMBOL(spa_sync_allpools);
9912 
9913 /* properties */
9914 EXPORT_SYMBOL(spa_prop_set);
9915 EXPORT_SYMBOL(spa_prop_get);
9916 EXPORT_SYMBOL(spa_prop_clear_bootfs);
9917 
9918 /* asynchronous event notification */
9919 EXPORT_SYMBOL(spa_event_notify);
9920 
9921 /* BEGIN CSTYLED */
9922 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, INT, ZMOD_RW,
9923 	"log2 fraction of arc that can be used by inflight I/Os when "
9924 	"verifying pool during import");
9925 
9926 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_metadata, INT, ZMOD_RW,
9927 	"Set to traverse metadata on pool import");
9928 
9929 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW,
9930 	"Set to traverse data on pool import");
9931 
9932 ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW,
9933 	"Print vdev tree to zfs_dbgmsg during pool import");
9934 
9935 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD,
9936 	"Percentage of CPUs to run an IO worker thread");
9937 
9938 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RD,
9939 	"Number of threads per IO worker taskqueue");
9940 
9941 ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, ULONG, ZMOD_RW,
9942 	"Allow importing pool with up to this number of missing top-level "
9943 	"vdevs (in read-only mode)");
9944 
9945 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT, ZMOD_RW,
9946 	"Set the livelist condense zthr to pause");
9947 
9948 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT, ZMOD_RW,
9949 	"Set the livelist condense synctask to pause");
9950 
9951 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_cancel, INT, ZMOD_RW,
9952 	"Whether livelist condensing was canceled in the synctask");
9953 
9954 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel, INT, ZMOD_RW,
9955 
"Whether livelist condensing was canceled in the zthr function"); 9956 9957 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, ZMOD_RW, 9958 "Whether extra ALLOC blkptrs were added to a livelist entry while it " 9959 "was being condensed"); 9960 /* END CSTYLED */ 9961