1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2011, 2020 by Delphix. All rights reserved. 25 * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved. 26 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 27 * Copyright 2013 Saso Kiselkov. All rights reserved. 28 * Copyright (c) 2014 Integros [integros.com] 29 * Copyright 2016 Toomas Soome <tsoome@me.com> 30 * Copyright (c) 2016 Actifio, Inc. All rights reserved. 31 * Copyright 2018 Joyent, Inc. 32 * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. 33 * Copyright 2017 Joyent, Inc. 34 * Copyright (c) 2017, Intel Corporation. 35 * Copyright (c) 2021, Colm Buckley <colm@tuatha.org> 36 */ 37 38 /* 39 * SPA: Storage Pool Allocator 40 * 41 * This file contains all the routines used when modifying on-disk SPA state. 42 * This includes opening, importing, destroying, exporting a pool, and syncing a 43 * pool. 44 */ 45 46 #include <sys/zfs_context.h> 47 #include <sys/fm/fs/zfs.h> 48 #include <sys/spa_impl.h> 49 #include <sys/zio.h> 50 #include <sys/zio_checksum.h> 51 #include <sys/dmu.h> 52 #include <sys/dmu_tx.h> 53 #include <sys/zap.h> 54 #include <sys/zil.h> 55 #include <sys/ddt.h> 56 #include <sys/vdev_impl.h> 57 #include <sys/vdev_removal.h> 58 #include <sys/vdev_indirect_mapping.h> 59 #include <sys/vdev_indirect_births.h> 60 #include <sys/vdev_initialize.h> 61 #include <sys/vdev_rebuild.h> 62 #include <sys/vdev_trim.h> 63 #include <sys/vdev_disk.h> 64 #include <sys/vdev_draid.h> 65 #include <sys/metaslab.h> 66 #include <sys/metaslab_impl.h> 67 #include <sys/mmp.h> 68 #include <sys/uberblock_impl.h> 69 #include <sys/txg.h> 70 #include <sys/avl.h> 71 #include <sys/bpobj.h> 72 #include <sys/dmu_traverse.h> 73 #include <sys/dmu_objset.h> 74 #include <sys/unique.h> 75 #include <sys/dsl_pool.h> 76 #include <sys/dsl_dataset.h> 77 #include <sys/dsl_dir.h> 78 #include <sys/dsl_prop.h> 79 #include <sys/dsl_synctask.h> 80 #include <sys/fs/zfs.h> 81 #include <sys/arc.h> 82 #include <sys/callb.h> 83 #include <sys/systeminfo.h> 84 #include <sys/spa_boot.h> 85 #include <sys/zfs_ioctl.h> 86 #include <sys/dsl_scan.h> 87 #include <sys/zfeature.h> 88 #include <sys/dsl_destroy.h> 89 #include <sys/zvol.h> 90 91 #ifdef _KERNEL 92 #include <sys/fm/protocol.h> 93 #include <sys/fm/util.h> 94 #include <sys/callb.h> 95 #include <sys/zone.h> 96 #include <sys/vmsystm.h> 97 #endif /* _KERNEL */ 98 99 #include "zfs_prop.h" 100 #include "zfs_comutil.h" 101 102 /* 103 * The interval, in seconds, at which failed configuration cache file writes 104 * should be retried. 
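 *
 * A minimal sketch of how this tunable gates retries (the field and the
 * exact check shown here are an approximation of the async-task logic
 * elsewhere in this file, for illustration only):
 *
 *	if (spa->spa_ccw_fail_time != 0 &&
 *	    gethrtime() - spa->spa_ccw_fail_time <
 *	    (hrtime_t)zfs_ccw_retry_interval * NANOSEC)
 *		return;		/* too soon since the last failed write */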
105 */ 106 int zfs_ccw_retry_interval = 300; 107 108 typedef enum zti_modes { 109 ZTI_MODE_FIXED, /* value is # of threads (min 1) */ 110 ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */ 111 ZTI_MODE_SCALE, /* Taskqs scale with CPUs. */ 112 ZTI_MODE_NULL, /* don't create a taskq */ 113 ZTI_NMODES 114 } zti_modes_t; 115 116 #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } 117 #define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 } 118 #define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 } 119 #define ZTI_SCALE { ZTI_MODE_SCALE, 0, 1 } 120 #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } 121 122 #define ZTI_N(n) ZTI_P(n, 1) 123 #define ZTI_ONE ZTI_N(1) 124 125 typedef struct zio_taskq_info { 126 zti_modes_t zti_mode; 127 uint_t zti_value; 128 uint_t zti_count; 129 } zio_taskq_info_t; 130 131 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { 132 "iss", "iss_h", "int", "int_h" 133 }; 134 135 /* 136 * This table defines the taskq settings for each ZFS I/O type. When 137 * initializing a pool, we use this table to create an appropriately sized 138 * taskq. Some operations are low volume and therefore have a small, static 139 * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE 140 * macros. Other operations process a large amount of data; the ZTI_BATCH 141 * macro causes us to create a taskq oriented for throughput. Some operations 142 * are so high frequency and short-lived that the taskq itself can become a 143 * point of lock contention. The ZTI_P(#, #) macro indicates that we need an 144 * additional degree of parallelism specified by the number of threads per- 145 * taskq and the number of taskqs; when dispatching an event in this case, the 146 * particular taskq is chosen at random. ZTI_SCALE is similar to ZTI_BATCH, 147 * but with number of taskqs also scaling with number of CPUs. 148 * 149 * The different taskq priorities are to handle the different contexts (issue 150 * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that 151 * need to be handled with minimum delay. 152 */ 153 const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { 154 /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ 155 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ 156 { ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */ 157 { ZTI_BATCH, ZTI_N(5), ZTI_SCALE, ZTI_N(5) }, /* WRITE */ 158 { ZTI_SCALE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ 159 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ 160 { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ 161 { ZTI_N(4), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* TRIM */ 162 }; 163 164 static void spa_sync_version(void *arg, dmu_tx_t *tx); 165 static void spa_sync_props(void *arg, dmu_tx_t *tx); 166 static boolean_t spa_has_active_shared_spare(spa_t *spa); 167 static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport); 168 static void spa_vdev_resilver_done(spa_t *spa); 169 170 uint_t zio_taskq_batch_pct = 80; /* 1 thread per cpu in pset */ 171 uint_t zio_taskq_batch_tpq; /* threads per taskq */ 172 boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ 173 uint_t zio_taskq_basedc = 80; /* base duty cycle */ 174 175 boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ 176 177 /* 178 * Report any spa_load_verify errors found, but do not fail spa_load. 179 * This is used by zdb to analyze non-idle pools. 180 */ 181 boolean_t spa_load_verify_dryrun = B_FALSE; 182 183 /* 184 * Allow read spacemaps in case of readonly import (spa_mode == SPA_MODE_READ). 
185 * This is used by zdb for spacemaps verification. 186 */ 187 boolean_t spa_mode_readable_spacemaps = B_FALSE; 188 189 /* 190 * This (illegal) pool name is used when temporarily importing a spa_t in order 191 * to get the vdev stats associated with the imported devices. 192 */ 193 #define TRYIMPORT_NAME "$import" 194 195 /* 196 * For debugging purposes: print out vdev tree during pool import. 197 */ 198 int spa_load_print_vdev_tree = B_FALSE; 199 200 /* 201 * A non-zero value for zfs_max_missing_tvds means that we allow importing 202 * pools with missing top-level vdevs. This is strictly intended for advanced 203 * pool recovery cases since missing data is almost inevitable. Pools with 204 * missing devices can only be imported read-only for safety reasons, and their 205 * fail-mode will be automatically set to "continue". 206 * 207 * With 1 missing vdev we should be able to import the pool and mount all 208 * datasets. User data that was not modified after the missing device has been 209 * added should be recoverable. This means that snapshots created prior to the 210 * addition of that device should be completely intact. 211 * 212 * With 2 missing vdevs, some datasets may fail to mount since there are 213 * dataset statistics that are stored as regular metadata. Some data might be 214 * recoverable if those vdevs were added recently. 215 * 216 * With 3 or more missing vdevs, the pool is severely damaged and MOS entries 217 * may be missing entirely. Chances of data recovery are very low. Note that 218 * there are also risks of performing an inadvertent rewind as we might be 219 * missing all the vdevs with the latest uberblocks. 220 */ 221 unsigned long zfs_max_missing_tvds = 0; 222 223 /* 224 * The parameters below are similar to zfs_max_missing_tvds but are only 225 * intended for a preliminary open of the pool with an untrusted config which 226 * might be incomplete or out-dated. 227 * 228 * We are more tolerant for pools opened from a cachefile since we could have 229 * an out-dated cachefile where a device removal was not registered. 230 * We could have set the limit arbitrarily high but in the case where devices 231 * are really missing we would want to return the proper error codes; we chose 232 * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available 233 * and we get a chance to retrieve the trusted config. 234 */ 235 uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1; 236 237 /* 238 * In the case where config was assembled by scanning device paths (/dev/dsks 239 * by default) we are less tolerant since all the existing devices should have 240 * been detected and we want spa_load to return the right error codes. 241 */ 242 uint64_t zfs_max_missing_tvds_scan = 0; 243 244 /* 245 * Debugging aid that pauses spa_sync() towards the end. 246 */ 247 boolean_t zfs_pause_spa_sync = B_FALSE; 248 249 /* 250 * Variables to indicate the livelist condense zthr func should wait at certain 251 * points for the livelist to be removed - used to test condense/destroy races 252 */ 253 int zfs_livelist_condense_zthr_pause = 0; 254 int zfs_livelist_condense_sync_pause = 0; 255 256 /* 257 * Variables to track whether or not condense cancellation has been 258 * triggered in testing. 
259 */ 260 int zfs_livelist_condense_sync_cancel = 0; 261 int zfs_livelist_condense_zthr_cancel = 0; 262 263 /* 264 * Variable to track whether or not extra ALLOC blkptrs were added to a 265 * livelist entry while it was being condensed (caused by the way we track 266 * remapped blkptrs in dbuf_remap_impl) 267 */ 268 int zfs_livelist_condense_new_alloc = 0; 269 270 /* 271 * ========================================================================== 272 * SPA properties routines 273 * ========================================================================== 274 */ 275 276 /* 277 * Add a (source=src, propname=propval) list to an nvlist. 278 */ 279 static void 280 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, 281 uint64_t intval, zprop_source_t src) 282 { 283 const char *propname = zpool_prop_to_name(prop); 284 nvlist_t *propval; 285 286 propval = fnvlist_alloc(); 287 fnvlist_add_uint64(propval, ZPROP_SOURCE, src); 288 289 if (strval != NULL) 290 fnvlist_add_string(propval, ZPROP_VALUE, strval); 291 else 292 fnvlist_add_uint64(propval, ZPROP_VALUE, intval); 293 294 fnvlist_add_nvlist(nvl, propname, propval); 295 nvlist_free(propval); 296 } 297 298 /* 299 * Get property values from the spa configuration. 300 */ 301 static void 302 spa_prop_get_config(spa_t *spa, nvlist_t **nvp) 303 { 304 vdev_t *rvd = spa->spa_root_vdev; 305 dsl_pool_t *pool = spa->spa_dsl_pool; 306 uint64_t size, alloc, cap, version; 307 const zprop_source_t src = ZPROP_SRC_NONE; 308 spa_config_dirent_t *dp; 309 metaslab_class_t *mc = spa_normal_class(spa); 310 311 ASSERT(MUTEX_HELD(&spa->spa_props_lock)); 312 313 if (rvd != NULL) { 314 alloc = metaslab_class_get_alloc(mc); 315 alloc += metaslab_class_get_alloc(spa_special_class(spa)); 316 alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); 317 alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa)); 318 319 size = metaslab_class_get_space(mc); 320 size += metaslab_class_get_space(spa_special_class(spa)); 321 size += metaslab_class_get_space(spa_dedup_class(spa)); 322 size += metaslab_class_get_space(spa_embedded_log_class(spa)); 323 324 spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); 325 spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); 326 spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); 327 spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, 328 size - alloc, src); 329 spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL, 330 spa->spa_checkpoint_info.sci_dspace, src); 331 332 spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL, 333 metaslab_class_fragmentation(mc), src); 334 spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, 335 metaslab_class_expandable_space(mc), src); 336 spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, 337 (spa_mode(spa) == SPA_MODE_READ), src); 338 339 cap = (size == 0) ? 
0 : (alloc * 100 / size); 340 spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); 341 342 spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, 343 ddt_get_pool_dedup_ratio(spa), src); 344 345 spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, 346 rvd->vdev_state, src); 347 348 version = spa_version(spa); 349 if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) { 350 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, 351 version, ZPROP_SRC_DEFAULT); 352 } else { 353 spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, 354 version, ZPROP_SRC_LOCAL); 355 } 356 spa_prop_add_list(*nvp, ZPOOL_PROP_LOAD_GUID, 357 NULL, spa_load_guid(spa), src); 358 } 359 360 if (pool != NULL) { 361 /* 362 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, 363 * when opening pools before this version freedir will be NULL. 364 */ 365 if (pool->dp_free_dir != NULL) { 366 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, 367 dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes, 368 src); 369 } else { 370 spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, 371 NULL, 0, src); 372 } 373 374 if (pool->dp_leak_dir != NULL) { 375 spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, 376 dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes, 377 src); 378 } else { 379 spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, 380 NULL, 0, src); 381 } 382 } 383 384 spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); 385 386 if (spa->spa_comment != NULL) { 387 spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, 388 0, ZPROP_SRC_LOCAL); 389 } 390 391 if (spa->spa_compatibility != NULL) { 392 spa_prop_add_list(*nvp, ZPOOL_PROP_COMPATIBILITY, 393 spa->spa_compatibility, 0, ZPROP_SRC_LOCAL); 394 } 395 396 if (spa->spa_root != NULL) 397 spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 398 0, ZPROP_SRC_LOCAL); 399 400 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { 401 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, 402 MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE); 403 } else { 404 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, 405 SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); 406 } 407 408 if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { 409 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, 410 DNODE_MAX_SIZE, ZPROP_SRC_NONE); 411 } else { 412 spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, 413 DNODE_MIN_SIZE, ZPROP_SRC_NONE); 414 } 415 416 if ((dp = list_head(&spa->spa_config_list)) != NULL) { 417 if (dp->scd_path == NULL) { 418 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 419 "none", 0, ZPROP_SRC_LOCAL); 420 } else if (strcmp(dp->scd_path, spa_config_path) != 0) { 421 spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, 422 dp->scd_path, 0, ZPROP_SRC_LOCAL); 423 } 424 } 425 } 426 427 /* 428 * Get zpool property values. 429 */ 430 int 431 spa_prop_get(spa_t *spa, nvlist_t **nvp) 432 { 433 objset_t *mos = spa->spa_meta_objset; 434 zap_cursor_t zc; 435 zap_attribute_t za; 436 dsl_pool_t *dp; 437 int err; 438 439 err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP); 440 if (err) 441 return (err); 442 443 dp = spa_get_dsl(spa); 444 dsl_pool_config_enter(dp, FTAG); 445 mutex_enter(&spa->spa_props_lock); 446 447 /* 448 * Get properties from the spa config. 449 */ 450 spa_prop_get_config(spa, nvp); 451 452 /* If no pool property object, no more prop to get. */ 453 if (mos == NULL || spa->spa_pool_props_object == 0) 454 goto out; 455 456 /* 457 * Get properties from the MOS pool property object. 
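 *
 * Each attribute in this ZAP is either an 8-byte integer (value in
 * za_first_integer) or a string stored as an array of 1-byte integers;
 * the switch below handles both layouts.  The bootfs value is special:
 * the stored object number is translated back into a dataset name
 * before it is added to the nvlist.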
458 */ 459 for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); 460 (err = zap_cursor_retrieve(&zc, &za)) == 0; 461 zap_cursor_advance(&zc)) { 462 uint64_t intval = 0; 463 char *strval = NULL; 464 zprop_source_t src = ZPROP_SRC_DEFAULT; 465 zpool_prop_t prop; 466 467 if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL) 468 continue; 469 470 switch (za.za_integer_length) { 471 case 8: 472 /* integer property */ 473 if (za.za_first_integer != 474 zpool_prop_default_numeric(prop)) 475 src = ZPROP_SRC_LOCAL; 476 477 if (prop == ZPOOL_PROP_BOOTFS) { 478 dsl_dataset_t *ds = NULL; 479 480 err = dsl_dataset_hold_obj(dp, 481 za.za_first_integer, FTAG, &ds); 482 if (err != 0) 483 break; 484 485 strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, 486 KM_SLEEP); 487 dsl_dataset_name(ds, strval); 488 dsl_dataset_rele(ds, FTAG); 489 } else { 490 strval = NULL; 491 intval = za.za_first_integer; 492 } 493 494 spa_prop_add_list(*nvp, prop, strval, intval, src); 495 496 if (strval != NULL) 497 kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN); 498 499 break; 500 501 case 1: 502 /* string property */ 503 strval = kmem_alloc(za.za_num_integers, KM_SLEEP); 504 err = zap_lookup(mos, spa->spa_pool_props_object, 505 za.za_name, 1, za.za_num_integers, strval); 506 if (err) { 507 kmem_free(strval, za.za_num_integers); 508 break; 509 } 510 spa_prop_add_list(*nvp, prop, strval, 0, src); 511 kmem_free(strval, za.za_num_integers); 512 break; 513 514 default: 515 break; 516 } 517 } 518 zap_cursor_fini(&zc); 519 out: 520 mutex_exit(&spa->spa_props_lock); 521 dsl_pool_config_exit(dp, FTAG); 522 if (err && err != ENOENT) { 523 nvlist_free(*nvp); 524 *nvp = NULL; 525 return (err); 526 } 527 528 return (0); 529 } 530 531 /* 532 * Validate the given pool properties nvlist and modify the list 533 * for the property values to be set. 534 */ 535 static int 536 spa_prop_validate(spa_t *spa, nvlist_t *props) 537 { 538 nvpair_t *elem; 539 int error = 0, reset_bootfs = 0; 540 uint64_t objnum = 0; 541 boolean_t has_feature = B_FALSE; 542 543 elem = NULL; 544 while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { 545 uint64_t intval; 546 char *strval, *slash, *check, *fname; 547 const char *propname = nvpair_name(elem); 548 zpool_prop_t prop = zpool_name_to_prop(propname); 549 550 switch (prop) { 551 case ZPOOL_PROP_INVAL: 552 if (!zpool_prop_feature(propname)) { 553 error = SET_ERROR(EINVAL); 554 break; 555 } 556 557 /* 558 * Sanitize the input. 
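 *
 * For reference, an enable request for a feature arrives as a uint64
 * nvpair whose name embeds the feature, roughly (sketch):
 *
 *	name  = "feature@async_destroy"
 *	value = 0		/* only "enabled" may be requested */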
559 */ 560 if (nvpair_type(elem) != DATA_TYPE_UINT64) { 561 error = SET_ERROR(EINVAL); 562 break; 563 } 564 565 if (nvpair_value_uint64(elem, &intval) != 0) { 566 error = SET_ERROR(EINVAL); 567 break; 568 } 569 570 if (intval != 0) { 571 error = SET_ERROR(EINVAL); 572 break; 573 } 574 575 fname = strchr(propname, '@') + 1; 576 if (zfeature_lookup_name(fname, NULL) != 0) { 577 error = SET_ERROR(EINVAL); 578 break; 579 } 580 581 has_feature = B_TRUE; 582 break; 583 584 case ZPOOL_PROP_VERSION: 585 error = nvpair_value_uint64(elem, &intval); 586 if (!error && 587 (intval < spa_version(spa) || 588 intval > SPA_VERSION_BEFORE_FEATURES || 589 has_feature)) 590 error = SET_ERROR(EINVAL); 591 break; 592 593 case ZPOOL_PROP_DELEGATION: 594 case ZPOOL_PROP_AUTOREPLACE: 595 case ZPOOL_PROP_LISTSNAPS: 596 case ZPOOL_PROP_AUTOEXPAND: 597 case ZPOOL_PROP_AUTOTRIM: 598 error = nvpair_value_uint64(elem, &intval); 599 if (!error && intval > 1) 600 error = SET_ERROR(EINVAL); 601 break; 602 603 case ZPOOL_PROP_MULTIHOST: 604 error = nvpair_value_uint64(elem, &intval); 605 if (!error && intval > 1) 606 error = SET_ERROR(EINVAL); 607 608 if (!error) { 609 uint32_t hostid = zone_get_hostid(NULL); 610 if (hostid) 611 spa->spa_hostid = hostid; 612 else 613 error = SET_ERROR(ENOTSUP); 614 } 615 616 break; 617 618 case ZPOOL_PROP_BOOTFS: 619 /* 620 * If the pool version is less than SPA_VERSION_BOOTFS, 621 * or the pool is still being created (version == 0), 622 * the bootfs property cannot be set. 623 */ 624 if (spa_version(spa) < SPA_VERSION_BOOTFS) { 625 error = SET_ERROR(ENOTSUP); 626 break; 627 } 628 629 /* 630 * Make sure the vdev config is bootable 631 */ 632 if (!vdev_is_bootable(spa->spa_root_vdev)) { 633 error = SET_ERROR(ENOTSUP); 634 break; 635 } 636 637 reset_bootfs = 1; 638 639 error = nvpair_value_string(elem, &strval); 640 641 if (!error) { 642 objset_t *os; 643 644 if (strval == NULL || strval[0] == '\0') { 645 objnum = zpool_prop_default_numeric( 646 ZPOOL_PROP_BOOTFS); 647 break; 648 } 649 650 error = dmu_objset_hold(strval, FTAG, &os); 651 if (error != 0) 652 break; 653 654 /* Must be ZPL. */ 655 if (dmu_objset_type(os) != DMU_OST_ZFS) { 656 error = SET_ERROR(ENOTSUP); 657 } else { 658 objnum = dmu_objset_id(os); 659 } 660 dmu_objset_rele(os, FTAG); 661 } 662 break; 663 664 case ZPOOL_PROP_FAILUREMODE: 665 error = nvpair_value_uint64(elem, &intval); 666 if (!error && intval > ZIO_FAILURE_MODE_PANIC) 667 error = SET_ERROR(EINVAL); 668 669 /* 670 * This is a special case which only occurs when 671 * the pool has completely failed. This allows 672 * the user to change the in-core failmode property 673 * without syncing it out to disk (I/Os might 674 * currently be blocked). We do this by returning 675 * EIO to the caller (spa_prop_set) to trick it 676 * into thinking we encountered a property validation 677 * error. 
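 * The net effect is that the new failmode takes effect in core right
 * away, while spa_prop_set() bails out before starting a sync task
 * that would block on the suspended pool.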
678 */ 679 if (!error && spa_suspended(spa)) { 680 spa->spa_failmode = intval; 681 error = SET_ERROR(EIO); 682 } 683 break; 684 685 case ZPOOL_PROP_CACHEFILE: 686 if ((error = nvpair_value_string(elem, &strval)) != 0) 687 break; 688 689 if (strval[0] == '\0') 690 break; 691 692 if (strcmp(strval, "none") == 0) 693 break; 694 695 if (strval[0] != '/') { 696 error = SET_ERROR(EINVAL); 697 break; 698 } 699 700 slash = strrchr(strval, '/'); 701 ASSERT(slash != NULL); 702 703 if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || 704 strcmp(slash, "/..") == 0) 705 error = SET_ERROR(EINVAL); 706 break; 707 708 case ZPOOL_PROP_COMMENT: 709 if ((error = nvpair_value_string(elem, &strval)) != 0) 710 break; 711 for (check = strval; *check != '\0'; check++) { 712 if (!isprint(*check)) { 713 error = SET_ERROR(EINVAL); 714 break; 715 } 716 } 717 if (strlen(strval) > ZPROP_MAX_COMMENT) 718 error = SET_ERROR(E2BIG); 719 break; 720 721 default: 722 break; 723 } 724 725 if (error) 726 break; 727 } 728 729 (void) nvlist_remove_all(props, 730 zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO)); 731 732 if (!error && reset_bootfs) { 733 error = nvlist_remove(props, 734 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); 735 736 if (!error) { 737 error = nvlist_add_uint64(props, 738 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); 739 } 740 } 741 742 return (error); 743 } 744 745 void 746 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) 747 { 748 char *cachefile; 749 spa_config_dirent_t *dp; 750 751 if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), 752 &cachefile) != 0) 753 return; 754 755 dp = kmem_alloc(sizeof (spa_config_dirent_t), 756 KM_SLEEP); 757 758 if (cachefile[0] == '\0') 759 dp->scd_path = spa_strdup(spa_config_path); 760 else if (strcmp(cachefile, "none") == 0) 761 dp->scd_path = NULL; 762 else 763 dp->scd_path = spa_strdup(cachefile); 764 765 list_insert_head(&spa->spa_config_list, dp); 766 if (need_sync) 767 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 768 } 769 770 int 771 spa_prop_set(spa_t *spa, nvlist_t *nvp) 772 { 773 int error; 774 nvpair_t *elem = NULL; 775 boolean_t need_sync = B_FALSE; 776 777 if ((error = spa_prop_validate(spa, nvp)) != 0) 778 return (error); 779 780 while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { 781 zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); 782 783 if (prop == ZPOOL_PROP_CACHEFILE || 784 prop == ZPOOL_PROP_ALTROOT || 785 prop == ZPOOL_PROP_READONLY) 786 continue; 787 788 if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) { 789 uint64_t ver; 790 791 if (prop == ZPOOL_PROP_VERSION) { 792 VERIFY(nvpair_value_uint64(elem, &ver) == 0); 793 } else { 794 ASSERT(zpool_prop_feature(nvpair_name(elem))); 795 ver = SPA_VERSION_FEATURES; 796 need_sync = B_TRUE; 797 } 798 799 /* Save time if the version is already set. */ 800 if (ver == spa_version(spa)) 801 continue; 802 803 /* 804 * In addition to the pool directory object, we might 805 * create the pool properties object, the features for 806 * read object, the features for write object, or the 807 * feature descriptions object. 
808 */ 809 error = dsl_sync_task(spa->spa_name, NULL, 810 spa_sync_version, &ver, 811 6, ZFS_SPACE_CHECK_RESERVED); 812 if (error) 813 return (error); 814 continue; 815 } 816 817 need_sync = B_TRUE; 818 break; 819 } 820 821 if (need_sync) { 822 return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, 823 nvp, 6, ZFS_SPACE_CHECK_RESERVED)); 824 } 825 826 return (0); 827 } 828 829 /* 830 * If the bootfs property value is dsobj, clear it. 831 */ 832 void 833 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) 834 { 835 if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { 836 VERIFY(zap_remove(spa->spa_meta_objset, 837 spa->spa_pool_props_object, 838 zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); 839 spa->spa_bootfs = 0; 840 } 841 } 842 843 /*ARGSUSED*/ 844 static int 845 spa_change_guid_check(void *arg, dmu_tx_t *tx) 846 { 847 uint64_t *newguid __maybe_unused = arg; 848 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 849 vdev_t *rvd = spa->spa_root_vdev; 850 uint64_t vdev_state; 851 852 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 853 int error = (spa_has_checkpoint(spa)) ? 854 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 855 return (SET_ERROR(error)); 856 } 857 858 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 859 vdev_state = rvd->vdev_state; 860 spa_config_exit(spa, SCL_STATE, FTAG); 861 862 if (vdev_state != VDEV_STATE_HEALTHY) 863 return (SET_ERROR(ENXIO)); 864 865 ASSERT3U(spa_guid(spa), !=, *newguid); 866 867 return (0); 868 } 869 870 static void 871 spa_change_guid_sync(void *arg, dmu_tx_t *tx) 872 { 873 uint64_t *newguid = arg; 874 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 875 uint64_t oldguid; 876 vdev_t *rvd = spa->spa_root_vdev; 877 878 oldguid = spa_guid(spa); 879 880 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 881 rvd->vdev_guid = *newguid; 882 rvd->vdev_guid_sum += (*newguid - oldguid); 883 vdev_config_dirty(rvd); 884 spa_config_exit(spa, SCL_STATE, FTAG); 885 886 spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", 887 (u_longlong_t)oldguid, (u_longlong_t)*newguid); 888 } 889 890 /* 891 * Change the GUID for the pool. This is done so that we can later 892 * re-import a pool built from a clone of our own vdevs. We will modify 893 * the root vdev's guid, our own pool guid, and then mark all of our 894 * vdevs dirty. Note that we must make sure that all our vdevs are 895 * online when we do this, or else any vdevs that weren't present 896 * would be orphaned from our pool. We are also going to issue a 897 * sysevent to update any watchers. 
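 *
 * A caller-side sketch (this is the path behind "zpool reguid"; error
 * handling elided and the pool-name variable is illustrative):
 *
 *	spa_t *spa;
 *	if (spa_open(poolname, &spa, FTAG) == 0) {
 *		(void) spa_change_guid(spa);
 *		spa_close(spa, FTAG);
 *	}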
898 */ 899 int 900 spa_change_guid(spa_t *spa) 901 { 902 int error; 903 uint64_t guid; 904 905 mutex_enter(&spa->spa_vdev_top_lock); 906 mutex_enter(&spa_namespace_lock); 907 guid = spa_generate_guid(NULL); 908 909 error = dsl_sync_task(spa->spa_name, spa_change_guid_check, 910 spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); 911 912 if (error == 0) { 913 spa_write_cachefile(spa, B_FALSE, B_TRUE); 914 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); 915 } 916 917 mutex_exit(&spa_namespace_lock); 918 mutex_exit(&spa->spa_vdev_top_lock); 919 920 return (error); 921 } 922 923 /* 924 * ========================================================================== 925 * SPA state manipulation (open/create/destroy/import/export) 926 * ========================================================================== 927 */ 928 929 static int 930 spa_error_entry_compare(const void *a, const void *b) 931 { 932 const spa_error_entry_t *sa = (const spa_error_entry_t *)a; 933 const spa_error_entry_t *sb = (const spa_error_entry_t *)b; 934 int ret; 935 936 ret = memcmp(&sa->se_bookmark, &sb->se_bookmark, 937 sizeof (zbookmark_phys_t)); 938 939 return (TREE_ISIGN(ret)); 940 } 941 942 /* 943 * Utility function which retrieves copies of the current logs and 944 * re-initializes them in the process. 945 */ 946 void 947 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) 948 { 949 ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); 950 951 bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); 952 bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); 953 954 avl_create(&spa->spa_errlist_scrub, 955 spa_error_entry_compare, sizeof (spa_error_entry_t), 956 offsetof(spa_error_entry_t, se_avl)); 957 avl_create(&spa->spa_errlist_last, 958 spa_error_entry_compare, sizeof (spa_error_entry_t), 959 offsetof(spa_error_entry_t, se_avl)); 960 } 961 962 static void 963 spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) 964 { 965 const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; 966 enum zti_modes mode = ztip->zti_mode; 967 uint_t value = ztip->zti_value; 968 uint_t count = ztip->zti_count; 969 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 970 uint_t cpus, flags = TASKQ_DYNAMIC; 971 boolean_t batch = B_FALSE; 972 973 switch (mode) { 974 case ZTI_MODE_FIXED: 975 ASSERT3U(value, >, 0); 976 break; 977 978 case ZTI_MODE_BATCH: 979 batch = B_TRUE; 980 flags |= TASKQ_THREADS_CPU_PCT; 981 value = MIN(zio_taskq_batch_pct, 100); 982 break; 983 984 case ZTI_MODE_SCALE: 985 flags |= TASKQ_THREADS_CPU_PCT; 986 /* 987 * We want more taskqs to reduce lock contention, but we want 988 * less for better request ordering and CPU utilization. 989 */ 990 cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100); 991 if (zio_taskq_batch_tpq > 0) { 992 count = MAX(1, (cpus + zio_taskq_batch_tpq / 2) / 993 zio_taskq_batch_tpq); 994 } else { 995 /* 996 * Prefer 6 threads per taskq, but no more taskqs 997 * than threads in them on large systems. For 80%: 998 * 999 * taskq taskq total 1000 * cpus taskqs percent threads threads 1001 * ------- ------- ------- ------- ------- 1002 * 1 1 80% 1 1 1003 * 2 1 80% 1 1 1004 * 4 1 80% 3 3 1005 * 8 2 40% 3 6 1006 * 16 3 27% 4 12 1007 * 32 5 16% 5 25 1008 * 64 7 11% 7 49 1009 * 128 10 8% 10 100 1010 * 256 14 6% 15 210 1011 */ 1012 count = 1 + cpus / 6; 1013 while (count * count > cpus) 1014 count--; 1015 } 1016 /* Limit each taskq within 100% to not trigger assertion. 
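 *
 * Worked example for the default 80% on a 32-CPU system, matching the
 * table above: cpus = 32 * 80 / 100 = 25, count = 1 + 25 / 6 = 5 (the
 * loop leaves it alone since 5 * 5 is not greater than 25), the MAX()
 * clamp below keeps count at 5, and value = (80 + 2) / 5 = 16, i.e.
 * five taskqs at 16% of CPUs, about 5 threads each, ~25 in total.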
*/ 1017 count = MAX(count, (zio_taskq_batch_pct + 99) / 100); 1018 value = (zio_taskq_batch_pct + count / 2) / count; 1019 break; 1020 1021 case ZTI_MODE_NULL: 1022 tqs->stqs_count = 0; 1023 tqs->stqs_taskq = NULL; 1024 return; 1025 1026 default: 1027 panic("unrecognized mode for %s_%s taskq (%u:%u) in " 1028 "spa_activate()", 1029 zio_type_name[t], zio_taskq_types[q], mode, value); 1030 break; 1031 } 1032 1033 ASSERT3U(count, >, 0); 1034 tqs->stqs_count = count; 1035 tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); 1036 1037 for (uint_t i = 0; i < count; i++) { 1038 taskq_t *tq; 1039 char name[32]; 1040 1041 if (count > 1) 1042 (void) snprintf(name, sizeof (name), "%s_%s_%u", 1043 zio_type_name[t], zio_taskq_types[q], i); 1044 else 1045 (void) snprintf(name, sizeof (name), "%s_%s", 1046 zio_type_name[t], zio_taskq_types[q]); 1047 1048 if (zio_taskq_sysdc && spa->spa_proc != &p0) { 1049 if (batch) 1050 flags |= TASKQ_DC_BATCH; 1051 1052 tq = taskq_create_sysdc(name, value, 50, INT_MAX, 1053 spa->spa_proc, zio_taskq_basedc, flags); 1054 } else { 1055 pri_t pri = maxclsyspri; 1056 /* 1057 * The write issue taskq can be extremely CPU 1058 * intensive. Run it at slightly less important 1059 * priority than the other taskqs. 1060 * 1061 * Under Linux and FreeBSD this means incrementing 1062 * the priority value as opposed to platforms like 1063 * illumos where it should be decremented. 1064 * 1065 * On FreeBSD, if priorities divided by four (RQ_PPQ) 1066 * are equal then a difference between them is 1067 * insignificant. 1068 */ 1069 if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) { 1070 #if defined(__linux__) 1071 pri++; 1072 #elif defined(__FreeBSD__) 1073 pri += 4; 1074 #else 1075 #error "unknown OS" 1076 #endif 1077 } 1078 tq = taskq_create_proc(name, value, pri, 50, 1079 INT_MAX, spa->spa_proc, flags); 1080 } 1081 1082 tqs->stqs_taskq[i] = tq; 1083 } 1084 } 1085 1086 static void 1087 spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) 1088 { 1089 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1090 1091 if (tqs->stqs_taskq == NULL) { 1092 ASSERT3U(tqs->stqs_count, ==, 0); 1093 return; 1094 } 1095 1096 for (uint_t i = 0; i < tqs->stqs_count; i++) { 1097 ASSERT3P(tqs->stqs_taskq[i], !=, NULL); 1098 taskq_destroy(tqs->stqs_taskq[i]); 1099 } 1100 1101 kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *)); 1102 tqs->stqs_taskq = NULL; 1103 } 1104 1105 /* 1106 * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. 1107 * Note that a type may have multiple discrete taskqs to avoid lock contention 1108 * on the taskq itself. In that case we choose which taskq at random by using 1109 * the low bits of gethrtime(). 1110 */ 1111 void 1112 spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, 1113 task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent) 1114 { 1115 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1116 taskq_t *tq; 1117 1118 ASSERT3P(tqs->stqs_taskq, !=, NULL); 1119 ASSERT3U(tqs->stqs_count, !=, 0); 1120 1121 if (tqs->stqs_count == 1) { 1122 tq = tqs->stqs_taskq[0]; 1123 } else { 1124 tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; 1125 } 1126 1127 taskq_dispatch_ent(tq, func, arg, flags, ent); 1128 } 1129 1130 /* 1131 * Same as spa_taskq_dispatch_ent() but block on the task until completion. 
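 *
 * Illustrative call, with a hypothetical callback and argument (TQ_SLEEP
 * lets the dispatch itself sleep for resources):
 *
 *	spa_taskq_dispatch_sync(spa, ZIO_TYPE_NULL, ZIO_TASKQ_ISSUE,
 *	    my_func, my_arg, TQ_SLEEP);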
1132 */ 1133 void 1134 spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q, 1135 task_func_t *func, void *arg, uint_t flags) 1136 { 1137 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; 1138 taskq_t *tq; 1139 taskqid_t id; 1140 1141 ASSERT3P(tqs->stqs_taskq, !=, NULL); 1142 ASSERT3U(tqs->stqs_count, !=, 0); 1143 1144 if (tqs->stqs_count == 1) { 1145 tq = tqs->stqs_taskq[0]; 1146 } else { 1147 tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; 1148 } 1149 1150 id = taskq_dispatch(tq, func, arg, flags); 1151 if (id) 1152 taskq_wait_id(tq, id); 1153 } 1154 1155 static void 1156 spa_create_zio_taskqs(spa_t *spa) 1157 { 1158 for (int t = 0; t < ZIO_TYPES; t++) { 1159 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 1160 spa_taskqs_init(spa, t, q); 1161 } 1162 } 1163 } 1164 1165 /* 1166 * Disabled until spa_thread() can be adapted for Linux. 1167 */ 1168 #undef HAVE_SPA_THREAD 1169 1170 #if defined(_KERNEL) && defined(HAVE_SPA_THREAD) 1171 static void 1172 spa_thread(void *arg) 1173 { 1174 psetid_t zio_taskq_psrset_bind = PS_NONE; 1175 callb_cpr_t cprinfo; 1176 1177 spa_t *spa = arg; 1178 user_t *pu = PTOU(curproc); 1179 1180 CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, 1181 spa->spa_name); 1182 1183 ASSERT(curproc != &p0); 1184 (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), 1185 "zpool-%s", spa->spa_name); 1186 (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); 1187 1188 /* bind this thread to the requested psrset */ 1189 if (zio_taskq_psrset_bind != PS_NONE) { 1190 pool_lock(); 1191 mutex_enter(&cpu_lock); 1192 mutex_enter(&pidlock); 1193 mutex_enter(&curproc->p_lock); 1194 1195 if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, 1196 0, NULL, NULL) == 0) { 1197 curthread->t_bind_pset = zio_taskq_psrset_bind; 1198 } else { 1199 cmn_err(CE_WARN, 1200 "Couldn't bind process for zfs pool \"%s\" to " 1201 "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); 1202 } 1203 1204 mutex_exit(&curproc->p_lock); 1205 mutex_exit(&pidlock); 1206 mutex_exit(&cpu_lock); 1207 pool_unlock(); 1208 } 1209 1210 if (zio_taskq_sysdc) { 1211 sysdc_thread_enter(curthread, 100, 0); 1212 } 1213 1214 spa->spa_proc = curproc; 1215 spa->spa_did = curthread->t_did; 1216 1217 spa_create_zio_taskqs(spa); 1218 1219 mutex_enter(&spa->spa_proc_lock); 1220 ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); 1221 1222 spa->spa_proc_state = SPA_PROC_ACTIVE; 1223 cv_broadcast(&spa->spa_proc_cv); 1224 1225 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1226 while (spa->spa_proc_state == SPA_PROC_ACTIVE) 1227 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1228 CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); 1229 1230 ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); 1231 spa->spa_proc_state = SPA_PROC_GONE; 1232 spa->spa_proc = &p0; 1233 cv_broadcast(&spa->spa_proc_cv); 1234 CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ 1235 1236 mutex_enter(&curproc->p_lock); 1237 lwp_exit(); 1238 } 1239 #endif 1240 1241 /* 1242 * Activate an uninitialized pool. 
1243 */ 1244 static void 1245 spa_activate(spa_t *spa, spa_mode_t mode) 1246 { 1247 ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); 1248 1249 spa->spa_state = POOL_STATE_ACTIVE; 1250 spa->spa_mode = mode; 1251 spa->spa_read_spacemaps = spa_mode_readable_spacemaps; 1252 1253 spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); 1254 spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); 1255 spa->spa_embedded_log_class = 1256 metaslab_class_create(spa, zfs_metaslab_ops); 1257 spa->spa_special_class = metaslab_class_create(spa, zfs_metaslab_ops); 1258 spa->spa_dedup_class = metaslab_class_create(spa, zfs_metaslab_ops); 1259 1260 /* Try to create a covering process */ 1261 mutex_enter(&spa->spa_proc_lock); 1262 ASSERT(spa->spa_proc_state == SPA_PROC_NONE); 1263 ASSERT(spa->spa_proc == &p0); 1264 spa->spa_did = 0; 1265 1266 #ifdef HAVE_SPA_THREAD 1267 /* Only create a process if we're going to be around a while. */ 1268 if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { 1269 if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, 1270 NULL, 0) == 0) { 1271 spa->spa_proc_state = SPA_PROC_CREATED; 1272 while (spa->spa_proc_state == SPA_PROC_CREATED) { 1273 cv_wait(&spa->spa_proc_cv, 1274 &spa->spa_proc_lock); 1275 } 1276 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1277 ASSERT(spa->spa_proc != &p0); 1278 ASSERT(spa->spa_did != 0); 1279 } else { 1280 #ifdef _KERNEL 1281 cmn_err(CE_WARN, 1282 "Couldn't create process for zfs pool \"%s\"\n", 1283 spa->spa_name); 1284 #endif 1285 } 1286 } 1287 #endif /* HAVE_SPA_THREAD */ 1288 mutex_exit(&spa->spa_proc_lock); 1289 1290 /* If we didn't create a process, we need to create our taskqs. */ 1291 if (spa->spa_proc == &p0) { 1292 spa_create_zio_taskqs(spa); 1293 } 1294 1295 for (size_t i = 0; i < TXG_SIZE; i++) { 1296 spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, 1297 ZIO_FLAG_CANFAIL); 1298 } 1299 1300 list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), 1301 offsetof(vdev_t, vdev_config_dirty_node)); 1302 list_create(&spa->spa_evicting_os_list, sizeof (objset_t), 1303 offsetof(objset_t, os_evicting_node)); 1304 list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), 1305 offsetof(vdev_t, vdev_state_dirty_node)); 1306 1307 txg_list_create(&spa->spa_vdev_txg_list, spa, 1308 offsetof(struct vdev, vdev_txg_node)); 1309 1310 avl_create(&spa->spa_errlist_scrub, 1311 spa_error_entry_compare, sizeof (spa_error_entry_t), 1312 offsetof(spa_error_entry_t, se_avl)); 1313 avl_create(&spa->spa_errlist_last, 1314 spa_error_entry_compare, sizeof (spa_error_entry_t), 1315 offsetof(spa_error_entry_t, se_avl)); 1316 1317 spa_keystore_init(&spa->spa_keystore); 1318 1319 /* 1320 * This taskq is used to perform zvol-minor-related tasks 1321 * asynchronously. This has several advantages, including easy 1322 * resolution of various deadlocks. 1323 * 1324 * The taskq must be single threaded to ensure tasks are always 1325 * processed in the order in which they were dispatched. 1326 * 1327 * A taskq per pool allows one to keep the pools independent. 1328 * This way if one pool is suspended, it will not impact another. 1329 * 1330 * The preferred location to dispatch a zvol minor task is a sync 1331 * task. In this context, there is easy access to the spa_t and minimal 1332 * error handling is required because the sync task must succeed. 
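 *
 * Dispatch from such a sync task is then simply (sketch; the callback
 * and argument names are hypothetical):
 *
 *	(void) taskq_dispatch(spa->spa_zvol_taskq,
 *	    zvol_minor_task, arg, TQ_SLEEP);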
1333 */ 1334 spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri, 1335 1, INT_MAX, 0); 1336 1337 /* 1338 * Taskq dedicated to prefetcher threads: this is used to prevent the 1339 * pool traverse code from monopolizing the global (and limited) 1340 * system_taskq by inappropriately scheduling long running tasks on it. 1341 */ 1342 spa->spa_prefetch_taskq = taskq_create("z_prefetch", 100, 1343 defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); 1344 1345 /* 1346 * The taskq to upgrade datasets in this pool. Currently used by 1347 * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA. 1348 */ 1349 spa->spa_upgrade_taskq = taskq_create("z_upgrade", 100, 1350 defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT); 1351 } 1352 1353 /* 1354 * Opposite of spa_activate(). 1355 */ 1356 static void 1357 spa_deactivate(spa_t *spa) 1358 { 1359 ASSERT(spa->spa_sync_on == B_FALSE); 1360 ASSERT(spa->spa_dsl_pool == NULL); 1361 ASSERT(spa->spa_root_vdev == NULL); 1362 ASSERT(spa->spa_async_zio_root == NULL); 1363 ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); 1364 1365 spa_evicting_os_wait(spa); 1366 1367 if (spa->spa_zvol_taskq) { 1368 taskq_destroy(spa->spa_zvol_taskq); 1369 spa->spa_zvol_taskq = NULL; 1370 } 1371 1372 if (spa->spa_prefetch_taskq) { 1373 taskq_destroy(spa->spa_prefetch_taskq); 1374 spa->spa_prefetch_taskq = NULL; 1375 } 1376 1377 if (spa->spa_upgrade_taskq) { 1378 taskq_destroy(spa->spa_upgrade_taskq); 1379 spa->spa_upgrade_taskq = NULL; 1380 } 1381 1382 txg_list_destroy(&spa->spa_vdev_txg_list); 1383 1384 list_destroy(&spa->spa_config_dirty_list); 1385 list_destroy(&spa->spa_evicting_os_list); 1386 list_destroy(&spa->spa_state_dirty_list); 1387 1388 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); 1389 1390 for (int t = 0; t < ZIO_TYPES; t++) { 1391 for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { 1392 spa_taskqs_fini(spa, t, q); 1393 } 1394 } 1395 1396 for (size_t i = 0; i < TXG_SIZE; i++) { 1397 ASSERT3P(spa->spa_txg_zio[i], !=, NULL); 1398 VERIFY0(zio_wait(spa->spa_txg_zio[i])); 1399 spa->spa_txg_zio[i] = NULL; 1400 } 1401 1402 metaslab_class_destroy(spa->spa_normal_class); 1403 spa->spa_normal_class = NULL; 1404 1405 metaslab_class_destroy(spa->spa_log_class); 1406 spa->spa_log_class = NULL; 1407 1408 metaslab_class_destroy(spa->spa_embedded_log_class); 1409 spa->spa_embedded_log_class = NULL; 1410 1411 metaslab_class_destroy(spa->spa_special_class); 1412 spa->spa_special_class = NULL; 1413 1414 metaslab_class_destroy(spa->spa_dedup_class); 1415 spa->spa_dedup_class = NULL; 1416 1417 /* 1418 * If this was part of an import or the open otherwise failed, we may 1419 * still have errors left in the queues. Empty them just in case. 
1420 */ 1421 spa_errlog_drain(spa); 1422 avl_destroy(&spa->spa_errlist_scrub); 1423 avl_destroy(&spa->spa_errlist_last); 1424 1425 spa_keystore_fini(&spa->spa_keystore); 1426 1427 spa->spa_state = POOL_STATE_UNINITIALIZED; 1428 1429 mutex_enter(&spa->spa_proc_lock); 1430 if (spa->spa_proc_state != SPA_PROC_NONE) { 1431 ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); 1432 spa->spa_proc_state = SPA_PROC_DEACTIVATE; 1433 cv_broadcast(&spa->spa_proc_cv); 1434 while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { 1435 ASSERT(spa->spa_proc != &p0); 1436 cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); 1437 } 1438 ASSERT(spa->spa_proc_state == SPA_PROC_GONE); 1439 spa->spa_proc_state = SPA_PROC_NONE; 1440 } 1441 ASSERT(spa->spa_proc == &p0); 1442 mutex_exit(&spa->spa_proc_lock); 1443 1444 /* 1445 * We want to make sure spa_thread() has actually exited the ZFS 1446 * module, so that the module can't be unloaded out from underneath 1447 * it. 1448 */ 1449 if (spa->spa_did != 0) { 1450 thread_join(spa->spa_did); 1451 spa->spa_did = 0; 1452 } 1453 } 1454 1455 /* 1456 * Verify a pool configuration, and construct the vdev tree appropriately. This 1457 * will create all the necessary vdevs in the appropriate layout, with each vdev 1458 * in the CLOSED state. This will prep the pool before open/creation/import. 1459 * All vdev validation is done by the vdev_alloc() routine. 1460 */ 1461 int 1462 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, 1463 uint_t id, int atype) 1464 { 1465 nvlist_t **child; 1466 uint_t children; 1467 int error; 1468 1469 if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) 1470 return (error); 1471 1472 if ((*vdp)->vdev_ops->vdev_op_leaf) 1473 return (0); 1474 1475 error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, 1476 &child, &children); 1477 1478 if (error == ENOENT) 1479 return (0); 1480 1481 if (error) { 1482 vdev_free(*vdp); 1483 *vdp = NULL; 1484 return (SET_ERROR(EINVAL)); 1485 } 1486 1487 for (int c = 0; c < children; c++) { 1488 vdev_t *vd; 1489 if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, 1490 atype)) != 0) { 1491 vdev_free(*vdp); 1492 *vdp = NULL; 1493 return (error); 1494 } 1495 } 1496 1497 ASSERT(*vdp != NULL); 1498 1499 return (0); 1500 } 1501 1502 static boolean_t 1503 spa_should_flush_logs_on_unload(spa_t *spa) 1504 { 1505 if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) 1506 return (B_FALSE); 1507 1508 if (!spa_writeable(spa)) 1509 return (B_FALSE); 1510 1511 if (!spa->spa_sync_on) 1512 return (B_FALSE); 1513 1514 if (spa_state(spa) != POOL_STATE_EXPORTED) 1515 return (B_FALSE); 1516 1517 if (zfs_keep_log_spacemaps_at_export) 1518 return (B_FALSE); 1519 1520 return (B_TRUE); 1521 } 1522 1523 /* 1524 * Opens a transaction that will set the flag that will instruct 1525 * spa_sync to attempt to flush all the metaslabs for that txg. 
1526 */ 1527 static void 1528 spa_unload_log_sm_flush_all(spa_t *spa) 1529 { 1530 dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 1531 VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); 1532 1533 ASSERT3U(spa->spa_log_flushall_txg, ==, 0); 1534 spa->spa_log_flushall_txg = dmu_tx_get_txg(tx); 1535 1536 dmu_tx_commit(tx); 1537 txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg); 1538 } 1539 1540 static void 1541 spa_unload_log_sm_metadata(spa_t *spa) 1542 { 1543 void *cookie = NULL; 1544 spa_log_sm_t *sls; 1545 while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg, 1546 &cookie)) != NULL) { 1547 VERIFY0(sls->sls_mscount); 1548 kmem_free(sls, sizeof (spa_log_sm_t)); 1549 } 1550 1551 for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); 1552 e != NULL; e = list_head(&spa->spa_log_summary)) { 1553 VERIFY0(e->lse_mscount); 1554 list_remove(&spa->spa_log_summary, e); 1555 kmem_free(e, sizeof (log_summary_entry_t)); 1556 } 1557 1558 spa->spa_unflushed_stats.sus_nblocks = 0; 1559 spa->spa_unflushed_stats.sus_memused = 0; 1560 spa->spa_unflushed_stats.sus_blocklimit = 0; 1561 } 1562 1563 static void 1564 spa_destroy_aux_threads(spa_t *spa) 1565 { 1566 if (spa->spa_condense_zthr != NULL) { 1567 zthr_destroy(spa->spa_condense_zthr); 1568 spa->spa_condense_zthr = NULL; 1569 } 1570 if (spa->spa_checkpoint_discard_zthr != NULL) { 1571 zthr_destroy(spa->spa_checkpoint_discard_zthr); 1572 spa->spa_checkpoint_discard_zthr = NULL; 1573 } 1574 if (spa->spa_livelist_delete_zthr != NULL) { 1575 zthr_destroy(spa->spa_livelist_delete_zthr); 1576 spa->spa_livelist_delete_zthr = NULL; 1577 } 1578 if (spa->spa_livelist_condense_zthr != NULL) { 1579 zthr_destroy(spa->spa_livelist_condense_zthr); 1580 spa->spa_livelist_condense_zthr = NULL; 1581 } 1582 } 1583 1584 /* 1585 * Opposite of spa_load(). 1586 */ 1587 static void 1588 spa_unload(spa_t *spa) 1589 { 1590 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1591 ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED); 1592 1593 spa_import_progress_remove(spa_guid(spa)); 1594 spa_load_note(spa, "UNLOADING"); 1595 1596 spa_wake_waiters(spa); 1597 1598 /* 1599 * If the log space map feature is enabled and the pool is getting 1600 * exported (but not destroyed), we want to spend some time flushing 1601 * as many metaslabs as we can in an attempt to destroy log space 1602 * maps and save import time. 1603 */ 1604 if (spa_should_flush_logs_on_unload(spa)) 1605 spa_unload_log_sm_flush_all(spa); 1606 1607 /* 1608 * Stop async tasks. 1609 */ 1610 spa_async_suspend(spa); 1611 1612 if (spa->spa_root_vdev) { 1613 vdev_t *root_vdev = spa->spa_root_vdev; 1614 vdev_initialize_stop_all(root_vdev, VDEV_INITIALIZE_ACTIVE); 1615 vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE); 1616 vdev_autotrim_stop_all(spa); 1617 vdev_rebuild_stop_all(spa); 1618 } 1619 1620 /* 1621 * Stop syncing. 1622 */ 1623 if (spa->spa_sync_on) { 1624 txg_sync_stop(spa->spa_dsl_pool); 1625 spa->spa_sync_on = B_FALSE; 1626 } 1627 1628 /* 1629 * This ensures that there is no async metaslab prefetching 1630 * while we attempt to unload the spa. 1631 */ 1632 if (spa->spa_root_vdev != NULL) { 1633 for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { 1634 vdev_t *vc = spa->spa_root_vdev->vdev_child[c]; 1635 if (vc->vdev_mg != NULL) 1636 taskq_wait(vc->vdev_mg->mg_taskq); 1637 } 1638 } 1639 1640 if (spa->spa_mmp.mmp_thread) 1641 mmp_thread_stop(spa); 1642 1643 /* 1644 * Wait for any outstanding async I/O to complete. 
1645 */ 1646 if (spa->spa_async_zio_root != NULL) { 1647 for (int i = 0; i < max_ncpus; i++) 1648 (void) zio_wait(spa->spa_async_zio_root[i]); 1649 kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *)); 1650 spa->spa_async_zio_root = NULL; 1651 } 1652 1653 if (spa->spa_vdev_removal != NULL) { 1654 spa_vdev_removal_destroy(spa->spa_vdev_removal); 1655 spa->spa_vdev_removal = NULL; 1656 } 1657 1658 spa_destroy_aux_threads(spa); 1659 1660 spa_condense_fini(spa); 1661 1662 bpobj_close(&spa->spa_deferred_bpobj); 1663 1664 spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); 1665 1666 /* 1667 * Close all vdevs. 1668 */ 1669 if (spa->spa_root_vdev) 1670 vdev_free(spa->spa_root_vdev); 1671 ASSERT(spa->spa_root_vdev == NULL); 1672 1673 /* 1674 * Close the dsl pool. 1675 */ 1676 if (spa->spa_dsl_pool) { 1677 dsl_pool_close(spa->spa_dsl_pool); 1678 spa->spa_dsl_pool = NULL; 1679 spa->spa_meta_objset = NULL; 1680 } 1681 1682 ddt_unload(spa); 1683 spa_unload_log_sm_metadata(spa); 1684 1685 /* 1686 * Drop and purge level 2 cache 1687 */ 1688 spa_l2cache_drop(spa); 1689 1690 for (int i = 0; i < spa->spa_spares.sav_count; i++) 1691 vdev_free(spa->spa_spares.sav_vdevs[i]); 1692 if (spa->spa_spares.sav_vdevs) { 1693 kmem_free(spa->spa_spares.sav_vdevs, 1694 spa->spa_spares.sav_count * sizeof (void *)); 1695 spa->spa_spares.sav_vdevs = NULL; 1696 } 1697 if (spa->spa_spares.sav_config) { 1698 nvlist_free(spa->spa_spares.sav_config); 1699 spa->spa_spares.sav_config = NULL; 1700 } 1701 spa->spa_spares.sav_count = 0; 1702 1703 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { 1704 vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); 1705 vdev_free(spa->spa_l2cache.sav_vdevs[i]); 1706 } 1707 if (spa->spa_l2cache.sav_vdevs) { 1708 kmem_free(spa->spa_l2cache.sav_vdevs, 1709 spa->spa_l2cache.sav_count * sizeof (void *)); 1710 spa->spa_l2cache.sav_vdevs = NULL; 1711 } 1712 if (spa->spa_l2cache.sav_config) { 1713 nvlist_free(spa->spa_l2cache.sav_config); 1714 spa->spa_l2cache.sav_config = NULL; 1715 } 1716 spa->spa_l2cache.sav_count = 0; 1717 1718 spa->spa_async_suspended = 0; 1719 1720 spa->spa_indirect_vdevs_loaded = B_FALSE; 1721 1722 if (spa->spa_comment != NULL) { 1723 spa_strfree(spa->spa_comment); 1724 spa->spa_comment = NULL; 1725 } 1726 if (spa->spa_compatibility != NULL) { 1727 spa_strfree(spa->spa_compatibility); 1728 spa->spa_compatibility = NULL; 1729 } 1730 1731 spa_config_exit(spa, SCL_ALL, spa); 1732 } 1733 1734 /* 1735 * Load (or re-load) the current list of vdevs describing the active spares for 1736 * this pool. When this is called, we have some form of basic information in 1737 * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and 1738 * then re-generate a more complete list including status information. 1739 */ 1740 void 1741 spa_load_spares(spa_t *spa) 1742 { 1743 nvlist_t **spares; 1744 uint_t nspares; 1745 int i; 1746 vdev_t *vd, *tvd; 1747 1748 #ifndef _KERNEL 1749 /* 1750 * zdb opens both the current state of the pool and the 1751 * checkpointed state (if present), with a different spa_t. 1752 * 1753 * As spare vdevs are shared among open pools, we skip loading 1754 * them when we load the checkpointed state of the pool. 1755 */ 1756 if (!spa_writeable(spa)) 1757 return; 1758 #endif 1759 1760 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1761 1762 /* 1763 * First, close and free any existing spare vdevs. 
1764 */ 1765 for (i = 0; i < spa->spa_spares.sav_count; i++) { 1766 vd = spa->spa_spares.sav_vdevs[i]; 1767 1768 /* Undo the call to spa_activate() below */ 1769 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1770 B_FALSE)) != NULL && tvd->vdev_isspare) 1771 spa_spare_remove(tvd); 1772 vdev_close(vd); 1773 vdev_free(vd); 1774 } 1775 1776 if (spa->spa_spares.sav_vdevs) 1777 kmem_free(spa->spa_spares.sav_vdevs, 1778 spa->spa_spares.sav_count * sizeof (void *)); 1779 1780 if (spa->spa_spares.sav_config == NULL) 1781 nspares = 0; 1782 else 1783 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 1784 ZPOOL_CONFIG_SPARES, &spares, &nspares)); 1785 1786 spa->spa_spares.sav_count = (int)nspares; 1787 spa->spa_spares.sav_vdevs = NULL; 1788 1789 if (nspares == 0) 1790 return; 1791 1792 /* 1793 * Construct the array of vdevs, opening them to get status in the 1794 * process. For each spare, there is potentially two different vdev_t 1795 * structures associated with it: one in the list of spares (used only 1796 * for basic validation purposes) and one in the active vdev 1797 * configuration (if it's spared in). During this phase we open and 1798 * validate each vdev on the spare list. If the vdev also exists in the 1799 * active configuration, then we also mark this vdev as an active spare. 1800 */ 1801 spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *), 1802 KM_SLEEP); 1803 for (i = 0; i < spa->spa_spares.sav_count; i++) { 1804 VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, 1805 VDEV_ALLOC_SPARE) == 0); 1806 ASSERT(vd != NULL); 1807 1808 spa->spa_spares.sav_vdevs[i] = vd; 1809 1810 if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, 1811 B_FALSE)) != NULL) { 1812 if (!tvd->vdev_isspare) 1813 spa_spare_add(tvd); 1814 1815 /* 1816 * We only mark the spare active if we were successfully 1817 * able to load the vdev. Otherwise, importing a pool 1818 * with a bad active spare would result in strange 1819 * behavior, because multiple pool would think the spare 1820 * is actively in use. 1821 * 1822 * There is a vulnerability here to an equally bizarre 1823 * circumstance, where a dead active spare is later 1824 * brought back to life (onlined or otherwise). Given 1825 * the rarity of this scenario, and the extra complexity 1826 * it adds, we ignore the possibility. 1827 */ 1828 if (!vdev_is_dead(tvd)) 1829 spa_spare_activate(tvd); 1830 } 1831 1832 vd->vdev_top = vd; 1833 vd->vdev_aux = &spa->spa_spares; 1834 1835 if (vdev_open(vd) != 0) 1836 continue; 1837 1838 if (vdev_validate_aux(vd) == 0) 1839 spa_spare_add(vd); 1840 } 1841 1842 /* 1843 * Recompute the stashed list of spares, with status information 1844 * this time. 1845 */ 1846 fnvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES); 1847 1848 spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), 1849 KM_SLEEP); 1850 for (i = 0; i < spa->spa_spares.sav_count; i++) 1851 spares[i] = vdev_config_generate(spa, 1852 spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); 1853 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 1854 ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count); 1855 for (i = 0; i < spa->spa_spares.sav_count; i++) 1856 nvlist_free(spares[i]); 1857 kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); 1858 } 1859 1860 /* 1861 * Load (or re-load) the current list of vdevs describing the active l2cache for 1862 * this pool. When this is called, we have some form of basic information in 1863 * 'spa_l2cache.sav_config'. 
We parse this into vdevs, try to open them, and 1864 * then re-generate a more complete list including status information. 1865 * Devices which are already active have their details maintained, and are 1866 * not re-opened. 1867 */ 1868 void 1869 spa_load_l2cache(spa_t *spa) 1870 { 1871 nvlist_t **l2cache = NULL; 1872 uint_t nl2cache; 1873 int i, j, oldnvdevs; 1874 uint64_t guid; 1875 vdev_t *vd, **oldvdevs, **newvdevs; 1876 spa_aux_vdev_t *sav = &spa->spa_l2cache; 1877 1878 #ifndef _KERNEL 1879 /* 1880 * zdb opens both the current state of the pool and the 1881 * checkpointed state (if present), with a different spa_t. 1882 * 1883 * As L2 caches are part of the ARC which is shared among open 1884 * pools, we skip loading them when we load the checkpointed 1885 * state of the pool. 1886 */ 1887 if (!spa_writeable(spa)) 1888 return; 1889 #endif 1890 1891 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 1892 1893 oldvdevs = sav->sav_vdevs; 1894 oldnvdevs = sav->sav_count; 1895 sav->sav_vdevs = NULL; 1896 sav->sav_count = 0; 1897 1898 if (sav->sav_config == NULL) { 1899 nl2cache = 0; 1900 newvdevs = NULL; 1901 goto out; 1902 } 1903 1904 VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, 1905 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); 1906 newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); 1907 1908 /* 1909 * Process new nvlist of vdevs. 1910 */ 1911 for (i = 0; i < nl2cache; i++) { 1912 guid = fnvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID); 1913 1914 newvdevs[i] = NULL; 1915 for (j = 0; j < oldnvdevs; j++) { 1916 vd = oldvdevs[j]; 1917 if (vd != NULL && guid == vd->vdev_guid) { 1918 /* 1919 * Retain previous vdev for add/remove ops. 1920 */ 1921 newvdevs[i] = vd; 1922 oldvdevs[j] = NULL; 1923 break; 1924 } 1925 } 1926 1927 if (newvdevs[i] == NULL) { 1928 /* 1929 * Create new vdev 1930 */ 1931 VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, 1932 VDEV_ALLOC_L2CACHE) == 0); 1933 ASSERT(vd != NULL); 1934 newvdevs[i] = vd; 1935 1936 /* 1937 * Commit this vdev as an l2cache device, 1938 * even if it fails to open. 1939 */ 1940 spa_l2cache_add(vd); 1941 1942 vd->vdev_top = vd; 1943 vd->vdev_aux = sav; 1944 1945 spa_l2cache_activate(vd); 1946 1947 if (vdev_open(vd) != 0) 1948 continue; 1949 1950 (void) vdev_validate_aux(vd); 1951 1952 if (!vdev_is_dead(vd)) 1953 l2arc_add_vdev(spa, vd); 1954 1955 /* 1956 * Upon cache device addition to a pool or pool 1957 * creation with a cache device or if the header 1958 * of the device is invalid we issue an async 1959 * TRIM command for the whole device which will 1960 * execute if l2arc_trim_ahead > 0. 1961 */ 1962 spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); 1963 } 1964 } 1965 1966 sav->sav_vdevs = newvdevs; 1967 sav->sav_count = (int)nl2cache; 1968 1969 /* 1970 * Recompute the stashed list of l2cache devices, with status 1971 * information this time. 
1972 */ 1973 fnvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE); 1974 1975 if (sav->sav_count > 0) 1976 l2cache = kmem_alloc(sav->sav_count * sizeof (void *), 1977 KM_SLEEP); 1978 for (i = 0; i < sav->sav_count; i++) 1979 l2cache[i] = vdev_config_generate(spa, 1980 sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); 1981 fnvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, 1982 sav->sav_count); 1983 1984 out: 1985 /* 1986 * Purge vdevs that were dropped 1987 */ 1988 for (i = 0; i < oldnvdevs; i++) { 1989 uint64_t pool; 1990 1991 vd = oldvdevs[i]; 1992 if (vd != NULL) { 1993 ASSERT(vd->vdev_isl2cache); 1994 1995 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 1996 pool != 0ULL && l2arc_vdev_present(vd)) 1997 l2arc_remove_vdev(vd); 1998 vdev_clear_stats(vd); 1999 vdev_free(vd); 2000 } 2001 } 2002 2003 if (oldvdevs) 2004 kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); 2005 2006 for (i = 0; i < sav->sav_count; i++) 2007 nvlist_free(l2cache[i]); 2008 if (sav->sav_count) 2009 kmem_free(l2cache, sav->sav_count * sizeof (void *)); 2010 } 2011 2012 static int 2013 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) 2014 { 2015 dmu_buf_t *db; 2016 char *packed = NULL; 2017 size_t nvsize = 0; 2018 int error; 2019 *value = NULL; 2020 2021 error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db); 2022 if (error) 2023 return (error); 2024 2025 nvsize = *(uint64_t *)db->db_data; 2026 dmu_buf_rele(db, FTAG); 2027 2028 packed = vmem_alloc(nvsize, KM_SLEEP); 2029 error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, 2030 DMU_READ_PREFETCH); 2031 if (error == 0) 2032 error = nvlist_unpack(packed, nvsize, value, 0); 2033 vmem_free(packed, nvsize); 2034 2035 return (error); 2036 } 2037 2038 /* 2039 * Concrete top-level vdevs that are not missing and are not logs. At every 2040 * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds. 2041 */ 2042 static uint64_t 2043 spa_healthy_core_tvds(spa_t *spa) 2044 { 2045 vdev_t *rvd = spa->spa_root_vdev; 2046 uint64_t tvds = 0; 2047 2048 for (uint64_t i = 0; i < rvd->vdev_children; i++) { 2049 vdev_t *vd = rvd->vdev_child[i]; 2050 if (vd->vdev_islog) 2051 continue; 2052 if (vdev_is_concrete(vd) && !vdev_is_dead(vd)) 2053 tvds++; 2054 } 2055 2056 return (tvds); 2057 } 2058 2059 /* 2060 * Checks to see if the given vdev could not be opened, in which case we post a 2061 * sysevent to notify the autoreplace code that the device has been removed. 2062 */ 2063 static void 2064 spa_check_removed(vdev_t *vd) 2065 { 2066 for (uint64_t c = 0; c < vd->vdev_children; c++) 2067 spa_check_removed(vd->vdev_child[c]); 2068 2069 if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && 2070 vdev_is_concrete(vd)) { 2071 zfs_post_autoreplace(vd->vdev_spa, vd); 2072 spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK); 2073 } 2074 } 2075 2076 static int 2077 spa_check_for_missing_logs(spa_t *spa) 2078 { 2079 vdev_t *rvd = spa->spa_root_vdev; 2080 2081 /* 2082 * If we're doing a normal import, then build up any additional 2083 * diagnostic information about missing log devices. 2084 * We'll pass this up to the user for further processing. 
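 * Roughly, the nvlist attached to spa_load_info below looks like
 * ZPOOL_CONFIG_MISSING_DEVICES -> { ZPOOL_CONFIG_CHILDREN -> [ one vdev
 * config per unopenable log device ] }.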
2085 */ 2086 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { 2087 nvlist_t **child, *nv; 2088 uint64_t idx = 0; 2089 2090 child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t *), 2091 KM_SLEEP); 2092 nv = fnvlist_alloc(); 2093 2094 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 2095 vdev_t *tvd = rvd->vdev_child[c]; 2096 2097 /* 2098 * We consider a device as missing only if it failed 2099 * to open (i.e. offline or faulted is not considered 2100 * as missing). 2101 */ 2102 if (tvd->vdev_islog && 2103 tvd->vdev_state == VDEV_STATE_CANT_OPEN) { 2104 child[idx++] = vdev_config_generate(spa, tvd, 2105 B_FALSE, VDEV_CONFIG_MISSING); 2106 } 2107 } 2108 2109 if (idx > 0) { 2110 fnvlist_add_nvlist_array(nv, 2111 ZPOOL_CONFIG_CHILDREN, child, idx); 2112 fnvlist_add_nvlist(spa->spa_load_info, 2113 ZPOOL_CONFIG_MISSING_DEVICES, nv); 2114 2115 for (uint64_t i = 0; i < idx; i++) 2116 nvlist_free(child[i]); 2117 } 2118 nvlist_free(nv); 2119 kmem_free(child, rvd->vdev_children * sizeof (char **)); 2120 2121 if (idx > 0) { 2122 spa_load_failed(spa, "some log devices are missing"); 2123 vdev_dbgmsg_print_tree(rvd, 2); 2124 return (SET_ERROR(ENXIO)); 2125 } 2126 } else { 2127 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 2128 vdev_t *tvd = rvd->vdev_child[c]; 2129 2130 if (tvd->vdev_islog && 2131 tvd->vdev_state == VDEV_STATE_CANT_OPEN) { 2132 spa_set_log_state(spa, SPA_LOG_CLEAR); 2133 spa_load_note(spa, "some log devices are " 2134 "missing, ZIL is dropped."); 2135 vdev_dbgmsg_print_tree(rvd, 2); 2136 break; 2137 } 2138 } 2139 } 2140 2141 return (0); 2142 } 2143 2144 /* 2145 * Check for missing log devices 2146 */ 2147 static boolean_t 2148 spa_check_logs(spa_t *spa) 2149 { 2150 boolean_t rv = B_FALSE; 2151 dsl_pool_t *dp = spa_get_dsl(spa); 2152 2153 switch (spa->spa_log_state) { 2154 default: 2155 break; 2156 case SPA_LOG_MISSING: 2157 /* need to recheck in case slog has been restored */ 2158 case SPA_LOG_UNKNOWN: 2159 rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 2160 zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0); 2161 if (rv) 2162 spa_set_log_state(spa, SPA_LOG_MISSING); 2163 break; 2164 } 2165 return (rv); 2166 } 2167 2168 /* 2169 * Passivate any log vdevs (note, does not apply to embedded log metaslabs). 2170 */ 2171 static boolean_t 2172 spa_passivate_log(spa_t *spa) 2173 { 2174 vdev_t *rvd = spa->spa_root_vdev; 2175 boolean_t slog_found = B_FALSE; 2176 2177 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 2178 2179 for (int c = 0; c < rvd->vdev_children; c++) { 2180 vdev_t *tvd = rvd->vdev_child[c]; 2181 2182 if (tvd->vdev_islog) { 2183 ASSERT3P(tvd->vdev_log_mg, ==, NULL); 2184 metaslab_group_passivate(tvd->vdev_mg); 2185 slog_found = B_TRUE; 2186 } 2187 } 2188 2189 return (slog_found); 2190 } 2191 2192 /* 2193 * Activate any log vdevs (note, does not apply to embedded log metaslabs). 
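 * (This undoes spa_passivate_log() above by re-activating the metaslab
 * group of each dedicated log top-level vdev.)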
2194 */ 2195 static void 2196 spa_activate_log(spa_t *spa) 2197 { 2198 vdev_t *rvd = spa->spa_root_vdev; 2199 2200 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 2201 2202 for (int c = 0; c < rvd->vdev_children; c++) { 2203 vdev_t *tvd = rvd->vdev_child[c]; 2204 2205 if (tvd->vdev_islog) { 2206 ASSERT3P(tvd->vdev_log_mg, ==, NULL); 2207 metaslab_group_activate(tvd->vdev_mg); 2208 } 2209 } 2210 } 2211 2212 int 2213 spa_reset_logs(spa_t *spa) 2214 { 2215 int error; 2216 2217 error = dmu_objset_find(spa_name(spa), zil_reset, 2218 NULL, DS_FIND_CHILDREN); 2219 if (error == 0) { 2220 /* 2221 * We successfully offlined the log device, sync out the 2222 * current txg so that the "stubby" block can be removed 2223 * by zil_sync(). 2224 */ 2225 txg_wait_synced(spa->spa_dsl_pool, 0); 2226 } 2227 return (error); 2228 } 2229 2230 static void 2231 spa_aux_check_removed(spa_aux_vdev_t *sav) 2232 { 2233 for (int i = 0; i < sav->sav_count; i++) 2234 spa_check_removed(sav->sav_vdevs[i]); 2235 } 2236 2237 void 2238 spa_claim_notify(zio_t *zio) 2239 { 2240 spa_t *spa = zio->io_spa; 2241 2242 if (zio->io_error) 2243 return; 2244 2245 mutex_enter(&spa->spa_props_lock); /* any mutex will do */ 2246 if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) 2247 spa->spa_claim_max_txg = zio->io_bp->blk_birth; 2248 mutex_exit(&spa->spa_props_lock); 2249 } 2250 2251 typedef struct spa_load_error { 2252 uint64_t sle_meta_count; 2253 uint64_t sle_data_count; 2254 } spa_load_error_t; 2255 2256 static void 2257 spa_load_verify_done(zio_t *zio) 2258 { 2259 blkptr_t *bp = zio->io_bp; 2260 spa_load_error_t *sle = zio->io_private; 2261 dmu_object_type_t type = BP_GET_TYPE(bp); 2262 int error = zio->io_error; 2263 spa_t *spa = zio->io_spa; 2264 2265 abd_free(zio->io_abd); 2266 if (error) { 2267 if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && 2268 type != DMU_OT_INTENT_LOG) 2269 atomic_inc_64(&sle->sle_meta_count); 2270 else 2271 atomic_inc_64(&sle->sle_data_count); 2272 } 2273 2274 mutex_enter(&spa->spa_scrub_lock); 2275 spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp); 2276 cv_broadcast(&spa->spa_scrub_io_cv); 2277 mutex_exit(&spa->spa_scrub_lock); 2278 } 2279 2280 /* 2281 * Maximum number of inflight bytes is the log2 fraction of the arc size. 2282 * By default, we set it to 1/16th of the arc. 2283 */ 2284 int spa_load_verify_shift = 4; 2285 int spa_load_verify_metadata = B_TRUE; 2286 int spa_load_verify_data = B_TRUE; 2287 2288 /*ARGSUSED*/ 2289 static int 2290 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 2291 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) 2292 { 2293 if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || 2294 BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp)) 2295 return (0); 2296 /* 2297 * Note: normally this routine will not be called if 2298 * spa_load_verify_metadata is not set. However, it may be useful 2299 * to manually set the flag after the traversal has begun. 
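 * The reads issued below are throttled so that at most
 * arc_target_bytes() >> spa_load_verify_shift bytes are in flight at any
 * one time (1/16th of the ARC target with the default shift of 4).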
2300 */ 2301 if (!spa_load_verify_metadata) 2302 return (0); 2303 if (!BP_IS_METADATA(bp) && !spa_load_verify_data) 2304 return (0); 2305 2306 uint64_t maxinflight_bytes = 2307 arc_target_bytes() >> spa_load_verify_shift; 2308 zio_t *rio = arg; 2309 size_t size = BP_GET_PSIZE(bp); 2310 2311 mutex_enter(&spa->spa_scrub_lock); 2312 while (spa->spa_load_verify_bytes >= maxinflight_bytes) 2313 cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); 2314 spa->spa_load_verify_bytes += size; 2315 mutex_exit(&spa->spa_scrub_lock); 2316 2317 zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size, 2318 spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, 2319 ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | 2320 ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); 2321 return (0); 2322 } 2323 2324 /* ARGSUSED */ 2325 static int 2326 verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) 2327 { 2328 if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN) 2329 return (SET_ERROR(ENAMETOOLONG)); 2330 2331 return (0); 2332 } 2333 2334 static int 2335 spa_load_verify(spa_t *spa) 2336 { 2337 zio_t *rio; 2338 spa_load_error_t sle = { 0 }; 2339 zpool_load_policy_t policy; 2340 boolean_t verify_ok = B_FALSE; 2341 int error = 0; 2342 2343 zpool_get_load_policy(spa->spa_config, &policy); 2344 2345 if (policy.zlp_rewind & ZPOOL_NEVER_REWIND) 2346 return (0); 2347 2348 dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); 2349 error = dmu_objset_find_dp(spa->spa_dsl_pool, 2350 spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL, 2351 DS_FIND_CHILDREN); 2352 dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); 2353 if (error != 0) 2354 return (error); 2355 2356 rio = zio_root(spa, NULL, &sle, 2357 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 2358 2359 if (spa_load_verify_metadata) { 2360 if (spa->spa_extreme_rewind) { 2361 spa_load_note(spa, "performing a complete scan of the " 2362 "pool since extreme rewind is on. 
This may take " 2363 "a very long time.\n (spa_load_verify_data=%u, " 2364 "spa_load_verify_metadata=%u)", 2365 spa_load_verify_data, spa_load_verify_metadata); 2366 } 2367 2368 error = traverse_pool(spa, spa->spa_verify_min_txg, 2369 TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | 2370 TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio); 2371 } 2372 2373 (void) zio_wait(rio); 2374 ASSERT0(spa->spa_load_verify_bytes); 2375 2376 spa->spa_load_meta_errors = sle.sle_meta_count; 2377 spa->spa_load_data_errors = sle.sle_data_count; 2378 2379 if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) { 2380 spa_load_note(spa, "spa_load_verify found %llu metadata errors " 2381 "and %llu data errors", (u_longlong_t)sle.sle_meta_count, 2382 (u_longlong_t)sle.sle_data_count); 2383 } 2384 2385 if (spa_load_verify_dryrun || 2386 (!error && sle.sle_meta_count <= policy.zlp_maxmeta && 2387 sle.sle_data_count <= policy.zlp_maxdata)) { 2388 int64_t loss = 0; 2389 2390 verify_ok = B_TRUE; 2391 spa->spa_load_txg = spa->spa_uberblock.ub_txg; 2392 spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; 2393 2394 loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; 2395 fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_TIME, 2396 spa->spa_load_txg_ts); 2397 fnvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME, 2398 loss); 2399 fnvlist_add_uint64(spa->spa_load_info, 2400 ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count); 2401 } else { 2402 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; 2403 } 2404 2405 if (spa_load_verify_dryrun) 2406 return (0); 2407 2408 if (error) { 2409 if (error != ENXIO && error != EIO) 2410 error = SET_ERROR(EIO); 2411 return (error); 2412 } 2413 2414 return (verify_ok ? 0 : EIO); 2415 } 2416 2417 /* 2418 * Find a value in the pool props object. 2419 */ 2420 static void 2421 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 2422 { 2423 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 2424 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 2425 } 2426 2427 /* 2428 * Find a value in the pool directory object. 
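 * These are single uint64_t entries in DMU_POOL_DIRECTORY_OBJECT; a
 * missing entry (ENOENT) is only logged as a load failure when the
 * caller requests it via log_enoent.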
2429 */ 2430 static int 2431 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent) 2432 { 2433 int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 2434 name, sizeof (uint64_t), 1, val); 2435 2436 if (error != 0 && (error != ENOENT || log_enoent)) { 2437 spa_load_failed(spa, "couldn't get '%s' value in MOS directory " 2438 "[error=%d]", name, error); 2439 } 2440 2441 return (error); 2442 } 2443 2444 static int 2445 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 2446 { 2447 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 2448 return (SET_ERROR(err)); 2449 } 2450 2451 boolean_t 2452 spa_livelist_delete_check(spa_t *spa) 2453 { 2454 return (spa->spa_livelists_to_delete != 0); 2455 } 2456 2457 /* ARGSUSED */ 2458 static boolean_t 2459 spa_livelist_delete_cb_check(void *arg, zthr_t *z) 2460 { 2461 spa_t *spa = arg; 2462 return (spa_livelist_delete_check(spa)); 2463 } 2464 2465 static int 2466 delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 2467 { 2468 spa_t *spa = arg; 2469 zio_free(spa, tx->tx_txg, bp); 2470 dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, 2471 -bp_get_dsize_sync(spa, bp), 2472 -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); 2473 return (0); 2474 } 2475 2476 static int 2477 dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp) 2478 { 2479 int err; 2480 zap_cursor_t zc; 2481 zap_attribute_t za; 2482 zap_cursor_init(&zc, os, zap_obj); 2483 err = zap_cursor_retrieve(&zc, &za); 2484 zap_cursor_fini(&zc); 2485 if (err == 0) 2486 *llp = za.za_first_integer; 2487 return (err); 2488 } 2489 2490 /* 2491 * Components of livelist deletion that must be performed in syncing 2492 * context: freeing block pointers and updating the pool-wide data 2493 * structures to indicate how much work is left to do 2494 */ 2495 typedef struct sublist_delete_arg { 2496 spa_t *spa; 2497 dsl_deadlist_t *ll; 2498 uint64_t key; 2499 bplist_t *to_free; 2500 } sublist_delete_arg_t; 2501 2502 static void 2503 sublist_delete_sync(void *arg, dmu_tx_t *tx) 2504 { 2505 sublist_delete_arg_t *sda = arg; 2506 spa_t *spa = sda->spa; 2507 dsl_deadlist_t *ll = sda->ll; 2508 uint64_t key = sda->key; 2509 bplist_t *to_free = sda->to_free; 2510 2511 bplist_iterate(to_free, delete_blkptr_cb, spa, tx); 2512 dsl_deadlist_remove_entry(ll, key, tx); 2513 } 2514 2515 typedef struct livelist_delete_arg { 2516 spa_t *spa; 2517 uint64_t ll_obj; 2518 uint64_t zap_obj; 2519 } livelist_delete_arg_t; 2520 2521 static void 2522 livelist_delete_sync(void *arg, dmu_tx_t *tx) 2523 { 2524 livelist_delete_arg_t *lda = arg; 2525 spa_t *spa = lda->spa; 2526 uint64_t ll_obj = lda->ll_obj; 2527 uint64_t zap_obj = lda->zap_obj; 2528 objset_t *mos = spa->spa_meta_objset; 2529 uint64_t count; 2530 2531 /* free the livelist and decrement the feature count */ 2532 VERIFY0(zap_remove_int(mos, zap_obj, ll_obj, tx)); 2533 dsl_deadlist_free(mos, ll_obj, tx); 2534 spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx); 2535 VERIFY0(zap_count(mos, zap_obj, &count)); 2536 if (count == 0) { 2537 /* no more livelists to delete */ 2538 VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, 2539 DMU_POOL_DELETED_CLONES, tx)); 2540 VERIFY0(zap_destroy(mos, zap_obj, tx)); 2541 spa->spa_livelists_to_delete = 0; 2542 spa_notify_waiters(spa); 2543 } 2544 } 2545 2546 /* 2547 * Load in the value for the livelist to be removed and open it. Then, 2548 * load its first sublist and determine which block pointers should actually 2549 * be freed. 
Then, call a synctask which performs the actual frees and updates 2550 * the pool-wide livelist data. 2551 */ 2552 /* ARGSUSED */ 2553 static void 2554 spa_livelist_delete_cb(void *arg, zthr_t *z) 2555 { 2556 spa_t *spa = arg; 2557 uint64_t ll_obj = 0, count; 2558 objset_t *mos = spa->spa_meta_objset; 2559 uint64_t zap_obj = spa->spa_livelists_to_delete; 2560 /* 2561 * Determine the next livelist to delete. This function should only 2562 * be called if there is at least one deleted clone. 2563 */ 2564 VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj)); 2565 VERIFY0(zap_count(mos, ll_obj, &count)); 2566 if (count > 0) { 2567 dsl_deadlist_t *ll; 2568 dsl_deadlist_entry_t *dle; 2569 bplist_t to_free; 2570 ll = kmem_zalloc(sizeof (dsl_deadlist_t), KM_SLEEP); 2571 dsl_deadlist_open(ll, mos, ll_obj); 2572 dle = dsl_deadlist_first(ll); 2573 ASSERT3P(dle, !=, NULL); 2574 bplist_create(&to_free); 2575 int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free, 2576 z, NULL); 2577 if (err == 0) { 2578 sublist_delete_arg_t sync_arg = { 2579 .spa = spa, 2580 .ll = ll, 2581 .key = dle->dle_mintxg, 2582 .to_free = &to_free 2583 }; 2584 zfs_dbgmsg("deleting sublist (id %llu) from" 2585 " livelist %llu, %lld remaining", 2586 (u_longlong_t)dle->dle_bpobj.bpo_object, 2587 (u_longlong_t)ll_obj, (longlong_t)count - 1); 2588 VERIFY0(dsl_sync_task(spa_name(spa), NULL, 2589 sublist_delete_sync, &sync_arg, 0, 2590 ZFS_SPACE_CHECK_DESTROY)); 2591 } else { 2592 VERIFY3U(err, ==, EINTR); 2593 } 2594 bplist_clear(&to_free); 2595 bplist_destroy(&to_free); 2596 dsl_deadlist_close(ll); 2597 kmem_free(ll, sizeof (dsl_deadlist_t)); 2598 } else { 2599 livelist_delete_arg_t sync_arg = { 2600 .spa = spa, 2601 .ll_obj = ll_obj, 2602 .zap_obj = zap_obj 2603 }; 2604 zfs_dbgmsg("deletion of livelist %llu completed", 2605 (u_longlong_t)ll_obj); 2606 VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync, 2607 &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY)); 2608 } 2609 } 2610 2611 static void 2612 spa_start_livelist_destroy_thread(spa_t *spa) 2613 { 2614 ASSERT3P(spa->spa_livelist_delete_zthr, ==, NULL); 2615 spa->spa_livelist_delete_zthr = 2616 zthr_create("z_livelist_destroy", 2617 spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa, 2618 minclsyspri); 2619 } 2620 2621 typedef struct livelist_new_arg { 2622 bplist_t *allocs; 2623 bplist_t *frees; 2624 } livelist_new_arg_t; 2625 2626 static int 2627 livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 2628 dmu_tx_t *tx) 2629 { 2630 ASSERT(tx == NULL); 2631 livelist_new_arg_t *lna = arg; 2632 if (bp_freed) { 2633 bplist_append(lna->frees, bp); 2634 } else { 2635 bplist_append(lna->allocs, bp); 2636 zfs_livelist_condense_new_alloc++; 2637 } 2638 return (0); 2639 } 2640 2641 typedef struct livelist_condense_arg { 2642 spa_t *spa; 2643 bplist_t to_keep; 2644 uint64_t first_size; 2645 uint64_t next_size; 2646 } livelist_condense_arg_t; 2647 2648 static void 2649 spa_livelist_condense_sync(void *arg, dmu_tx_t *tx) 2650 { 2651 livelist_condense_arg_t *lca = arg; 2652 spa_t *spa = lca->spa; 2653 bplist_t new_frees; 2654 dsl_dataset_t *ds = spa->spa_to_condense.ds; 2655 2656 /* Have we been cancelled? 
*/ 2657 if (spa->spa_to_condense.cancelled) { 2658 zfs_livelist_condense_sync_cancel++; 2659 goto out; 2660 } 2661 2662 dsl_deadlist_entry_t *first = spa->spa_to_condense.first; 2663 dsl_deadlist_entry_t *next = spa->spa_to_condense.next; 2664 dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist; 2665 2666 /* 2667 * It's possible that the livelist was changed while the zthr was 2668 * running. Therefore, we need to check for new blkptrs in the two 2669 * entries being condensed and continue to track them in the livelist. 2670 * Because of the way we handle remapped blkptrs (see dbuf_remap_impl), 2671 * it's possible that the newly added blkptrs are FREEs or ALLOCs so 2672 * we need to sort them into two different bplists. 2673 */ 2674 uint64_t first_obj = first->dle_bpobj.bpo_object; 2675 uint64_t next_obj = next->dle_bpobj.bpo_object; 2676 uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs; 2677 uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs; 2678 2679 bplist_create(&new_frees); 2680 livelist_new_arg_t new_bps = { 2681 .allocs = &lca->to_keep, 2682 .frees = &new_frees, 2683 }; 2684 2685 if (cur_first_size > lca->first_size) { 2686 VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj, 2687 livelist_track_new_cb, &new_bps, lca->first_size)); 2688 } 2689 if (cur_next_size > lca->next_size) { 2690 VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj, 2691 livelist_track_new_cb, &new_bps, lca->next_size)); 2692 } 2693 2694 dsl_deadlist_clear_entry(first, ll, tx); 2695 ASSERT(bpobj_is_empty(&first->dle_bpobj)); 2696 dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx); 2697 2698 bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx); 2699 bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx); 2700 bplist_destroy(&new_frees); 2701 2702 char dsname[ZFS_MAX_DATASET_NAME_LEN]; 2703 dsl_dataset_name(ds, dsname); 2704 zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu " 2705 "(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu " 2706 "(%llu blkptrs)", (u_longlong_t)tx->tx_txg, dsname, 2707 (u_longlong_t)ds->ds_object, (u_longlong_t)first_obj, 2708 (u_longlong_t)cur_first_size, (u_longlong_t)next_obj, 2709 (u_longlong_t)cur_next_size, 2710 (u_longlong_t)first->dle_bpobj.bpo_object, 2711 (u_longlong_t)first->dle_bpobj.bpo_phys->bpo_num_blkptrs); 2712 out: 2713 dmu_buf_rele(ds->ds_dbuf, spa); 2714 spa->spa_to_condense.ds = NULL; 2715 bplist_clear(&lca->to_keep); 2716 bplist_destroy(&lca->to_keep); 2717 kmem_free(lca, sizeof (livelist_condense_arg_t)); 2718 spa->spa_to_condense.syncing = B_FALSE; 2719 } 2720 2721 static void 2722 spa_livelist_condense_cb(void *arg, zthr_t *t) 2723 { 2724 while (zfs_livelist_condense_zthr_pause && 2725 !(zthr_has_waiters(t) || zthr_iscancelled(t))) 2726 delay(1); 2727 2728 spa_t *spa = arg; 2729 dsl_deadlist_entry_t *first = spa->spa_to_condense.first; 2730 dsl_deadlist_entry_t *next = spa->spa_to_condense.next; 2731 uint64_t first_size, next_size; 2732 2733 livelist_condense_arg_t *lca = 2734 kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP); 2735 bplist_create(&lca->to_keep); 2736 2737 /* 2738 * Process the livelists (matching FREEs and ALLOCs) in open context 2739 * so we have minimal work in syncing context to condense. 2740 * 2741 * We save bpobj sizes (first_size and next_size) to use later in 2742 * syncing context to determine if entries were added to these sublists 2743 * while in open context. 
This is possible because the clone is still 2744 * active and open for normal writes and we want to make sure the new, 2745 * unprocessed blockpointers are inserted into the livelist normally. 2746 * 2747 * Note that dsl_process_sub_livelist() both stores the size number of 2748 * blockpointers and iterates over them while the bpobj's lock is held, so 2749 * the sizes returned to us are consistent with what was actually 2750 * processed. 2751 */ 2752 int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t, 2753 &first_size); 2754 if (err == 0) 2755 err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep, 2756 t, &next_size); 2757 2758 if (err == 0) { 2759 while (zfs_livelist_condense_sync_pause && 2760 !(zthr_has_waiters(t) || zthr_iscancelled(t))) 2761 delay(1); 2762 2763 dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 2764 dmu_tx_mark_netfree(tx); 2765 dmu_tx_hold_space(tx, 1); 2766 err = dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE); 2767 if (err == 0) { 2768 /* 2769 * Prevent the condense zthr restarting before 2770 * the synctask completes. 2771 */ 2772 spa->spa_to_condense.syncing = B_TRUE; 2773 lca->spa = spa; 2774 lca->first_size = first_size; 2775 lca->next_size = next_size; 2776 dsl_sync_task_nowait(spa_get_dsl(spa), 2777 spa_livelist_condense_sync, lca, tx); 2778 dmu_tx_commit(tx); 2779 return; 2780 } 2781 } 2782 /* 2783 * Condensing can not continue: either it was externally stopped or 2784 * we were unable to assign to a tx because the pool has run out of 2785 * space. In the second case, we'll just end up trying to condense 2786 * again in a later txg. 2787 */ 2788 ASSERT(err != 0); 2789 bplist_clear(&lca->to_keep); 2790 bplist_destroy(&lca->to_keep); 2791 kmem_free(lca, sizeof (livelist_condense_arg_t)); 2792 dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa); 2793 spa->spa_to_condense.ds = NULL; 2794 if (err == EINTR) 2795 zfs_livelist_condense_zthr_cancel++; 2796 } 2797 2798 /* ARGSUSED */ 2799 /* 2800 * Check that there is something to condense but that a condense is not 2801 * already in progress and that condensing has not been cancelled.
2802 */ 2803 static boolean_t 2804 spa_livelist_condense_cb_check(void *arg, zthr_t *z) 2805 { 2806 spa_t *spa = arg; 2807 if ((spa->spa_to_condense.ds != NULL) && 2808 (spa->spa_to_condense.syncing == B_FALSE) && 2809 (spa->spa_to_condense.cancelled == B_FALSE)) { 2810 return (B_TRUE); 2811 } 2812 return (B_FALSE); 2813 } 2814 2815 static void 2816 spa_start_livelist_condensing_thread(spa_t *spa) 2817 { 2818 spa->spa_to_condense.ds = NULL; 2819 spa->spa_to_condense.first = NULL; 2820 spa->spa_to_condense.next = NULL; 2821 spa->spa_to_condense.syncing = B_FALSE; 2822 spa->spa_to_condense.cancelled = B_FALSE; 2823 2824 ASSERT3P(spa->spa_livelist_condense_zthr, ==, NULL); 2825 spa->spa_livelist_condense_zthr = 2826 zthr_create("z_livelist_condense", 2827 spa_livelist_condense_cb_check, 2828 spa_livelist_condense_cb, spa, minclsyspri); 2829 } 2830 2831 static void 2832 spa_spawn_aux_threads(spa_t *spa) 2833 { 2834 ASSERT(spa_writeable(spa)); 2835 2836 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 2837 2838 spa_start_indirect_condensing_thread(spa); 2839 spa_start_livelist_destroy_thread(spa); 2840 spa_start_livelist_condensing_thread(spa); 2841 2842 ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL); 2843 spa->spa_checkpoint_discard_zthr = 2844 zthr_create("z_checkpoint_discard", 2845 spa_checkpoint_discard_thread_check, 2846 spa_checkpoint_discard_thread, spa, minclsyspri); 2847 } 2848 2849 /* 2850 * Fix up config after a partly-completed split. This is done with the 2851 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 2852 * pool have that entry in their config, but only the splitting one contains 2853 * a list of all the guids of the vdevs that are being split off. 2854 * 2855 * This function determines what to do with that list: either rejoin 2856 * all the disks to the pool, or complete the splitting process. To attempt 2857 * the rejoin, each disk that is offlined is marked online again, and 2858 * we do a reopen() call. If the vdev label for every disk that was 2859 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 2860 * then we call vdev_split() on each disk, and complete the split. 2861 * 2862 * Otherwise we leave the config alone, with all the vdevs in place in 2863 * the original pool. 2864 */ 2865 static void 2866 spa_try_repair(spa_t *spa, nvlist_t *config) 2867 { 2868 uint_t extracted; 2869 uint64_t *glist; 2870 uint_t i, gcount; 2871 nvlist_t *nvl; 2872 vdev_t **vd; 2873 boolean_t attempt_reopen; 2874 2875 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 2876 return; 2877 2878 /* check that the config is complete */ 2879 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 2880 &glist, &gcount) != 0) 2881 return; 2882 2883 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 2884 2885 /* attempt to online all the vdevs & validate */ 2886 attempt_reopen = B_TRUE; 2887 for (i = 0; i < gcount; i++) { 2888 if (glist[i] == 0) /* vdev is hole */ 2889 continue; 2890 2891 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 2892 if (vd[i] == NULL) { 2893 /* 2894 * Don't bother attempting to reopen the disks; 2895 * just do the split. 
2896 */ 2897 attempt_reopen = B_FALSE; 2898 } else { 2899 /* attempt to re-online it */ 2900 vd[i]->vdev_offline = B_FALSE; 2901 } 2902 } 2903 2904 if (attempt_reopen) { 2905 vdev_reopen(spa->spa_root_vdev); 2906 2907 /* check each device to see what state it's in */ 2908 for (extracted = 0, i = 0; i < gcount; i++) { 2909 if (vd[i] != NULL && 2910 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 2911 break; 2912 ++extracted; 2913 } 2914 } 2915 2916 /* 2917 * If every disk has been moved to the new pool, or if we never 2918 * even attempted to look at them, then we split them off for 2919 * good. 2920 */ 2921 if (!attempt_reopen || gcount == extracted) { 2922 for (i = 0; i < gcount; i++) 2923 if (vd[i] != NULL) 2924 vdev_split(vd[i]); 2925 vdev_reopen(spa->spa_root_vdev); 2926 } 2927 2928 kmem_free(vd, gcount * sizeof (vdev_t *)); 2929 } 2930 2931 static int 2932 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) 2933 { 2934 char *ereport = FM_EREPORT_ZFS_POOL; 2935 int error; 2936 2937 spa->spa_load_state = state; 2938 (void) spa_import_progress_set_state(spa_guid(spa), 2939 spa_load_state(spa)); 2940 2941 gethrestime(&spa->spa_loaded_ts); 2942 error = spa_load_impl(spa, type, &ereport); 2943 2944 /* 2945 * Don't count references from objsets that are already closed 2946 * and are making their way through the eviction process. 2947 */ 2948 spa_evicting_os_wait(spa); 2949 spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); 2950 if (error) { 2951 if (error != EEXIST) { 2952 spa->spa_loaded_ts.tv_sec = 0; 2953 spa->spa_loaded_ts.tv_nsec = 0; 2954 } 2955 if (error != EBADF) { 2956 (void) zfs_ereport_post(ereport, spa, 2957 NULL, NULL, NULL, 0); 2958 } 2959 } 2960 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 2961 spa->spa_ena = 0; 2962 2963 (void) spa_import_progress_set_state(spa_guid(spa), 2964 spa_load_state(spa)); 2965 2966 return (error); 2967 } 2968 2969 #ifdef ZFS_DEBUG 2970 /* 2971 * Count the number of per-vdev ZAPs associated with all of the vdevs in the 2972 * vdev tree rooted in the given vd, and ensure that each ZAP is present in the 2973 * spa's per-vdev ZAP list. 2974 */ 2975 static uint64_t 2976 vdev_count_verify_zaps(vdev_t *vd) 2977 { 2978 spa_t *spa = vd->vdev_spa; 2979 uint64_t total = 0; 2980 2981 if (vd->vdev_top_zap != 0) { 2982 total++; 2983 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 2984 spa->spa_all_vdev_zaps, vd->vdev_top_zap)); 2985 } 2986 if (vd->vdev_leaf_zap != 0) { 2987 total++; 2988 ASSERT0(zap_lookup_int(spa->spa_meta_objset, 2989 spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); 2990 } 2991 2992 for (uint64_t i = 0; i < vd->vdev_children; i++) { 2993 total += vdev_count_verify_zaps(vd->vdev_child[i]); 2994 } 2995 2996 return (total); 2997 } 2998 #endif 2999 3000 /* 3001 * Determine whether the activity check is required. 
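 * In short, the check is skipped when the caller sets ZFS_IMPORT_SKIP_MMP
 * (as zdb does), when MMP is not in use, when the uberblock matches the
 * one recorded by an earlier tryimport, when the label's hostid matches
 * this host, or when the pool was cleanly exported.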
3002 */ 3003 static boolean_t 3004 spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label, 3005 nvlist_t *config) 3006 { 3007 uint64_t state = 0; 3008 uint64_t hostid = 0; 3009 uint64_t tryconfig_txg = 0; 3010 uint64_t tryconfig_timestamp = 0; 3011 uint16_t tryconfig_mmp_seq = 0; 3012 nvlist_t *nvinfo; 3013 3014 if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { 3015 nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); 3016 (void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG, 3017 &tryconfig_txg); 3018 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 3019 &tryconfig_timestamp); 3020 (void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ, 3021 &tryconfig_mmp_seq); 3022 } 3023 3024 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state); 3025 3026 /* 3027 * Disable the MMP activity check - This is used by zdb which 3028 * is intended to be used on potentially active pools. 3029 */ 3030 if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) 3031 return (B_FALSE); 3032 3033 /* 3034 * Skip the activity check when the MMP feature is disabled. 3035 */ 3036 if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0) 3037 return (B_FALSE); 3038 3039 /* 3040 * If the tryconfig_ values are nonzero, they are the results of an 3041 * earlier tryimport. If they all match the uberblock we just found, 3042 * then the pool has not changed and we return false so we do not test 3043 * a second time. 3044 */ 3045 if (tryconfig_txg && tryconfig_txg == ub->ub_txg && 3046 tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp && 3047 tryconfig_mmp_seq && tryconfig_mmp_seq == 3048 (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) 3049 return (B_FALSE); 3050 3051 /* 3052 * Allow the activity check to be skipped when importing the pool 3053 * on the same host which last imported it. Since the hostid from 3054 * configuration may be stale use the one read from the label. 3055 */ 3056 if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID)) 3057 hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID); 3058 3059 if (hostid == spa_get_hostid(spa)) 3060 return (B_FALSE); 3061 3062 /* 3063 * Skip the activity test when the pool was cleanly exported. 3064 */ 3065 if (state != POOL_STATE_ACTIVE) 3066 return (B_FALSE); 3067 3068 return (B_TRUE); 3069 } 3070 3071 /* 3072 * Nanoseconds the activity check must watch for changes on-disk. 3073 */ 3074 static uint64_t 3075 spa_activity_check_duration(spa_t *spa, uberblock_t *ub) 3076 { 3077 uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1); 3078 uint64_t multihost_interval = MSEC2NSEC( 3079 MMP_INTERVAL_OK(zfs_multihost_interval)); 3080 uint64_t import_delay = MAX(NANOSEC, import_intervals * 3081 multihost_interval); 3082 3083 /* 3084 * Local tunables determine a minimum duration except for the case 3085 * where we know when the remote host will suspend the pool if MMP 3086 * writes do not land. 3087 * 3088 * See Big Theory comment at the top of mmp.c for the reasoning behind 3089 * these cases and times. 
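 * For example, with a 1 second multihost interval and 20 import
 * intervals, the baseline computed above is MAX(1s, 20 * 1s) = 20s;
 * when the remote host advertises MMP_FAIL_INT > 0, the first branch
 * below instead uses MMP_FAIL_INT * MMP_INTERVAL scaled by
 * MMP_IMPORT_SAFETY_FACTOR.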
3090 */ 3091 3092 ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100); 3093 3094 if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && 3095 MMP_FAIL_INT(ub) > 0) { 3096 3097 /* MMP on remote host will suspend pool after failed writes */ 3098 import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) * 3099 MMP_IMPORT_SAFETY_FACTOR / 100; 3100 3101 zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp " 3102 "mmp_fails=%llu ub_mmp mmp_interval=%llu " 3103 "import_intervals=%llu", (u_longlong_t)import_delay, 3104 (u_longlong_t)MMP_FAIL_INT(ub), 3105 (u_longlong_t)MMP_INTERVAL(ub), 3106 (u_longlong_t)import_intervals); 3107 3108 } else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && 3109 MMP_FAIL_INT(ub) == 0) { 3110 3111 /* MMP on remote host will never suspend pool */ 3112 import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) + 3113 ub->ub_mmp_delay) * import_intervals); 3114 3115 zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp " 3116 "mmp_interval=%llu ub_mmp_delay=%llu " 3117 "import_intervals=%llu", (u_longlong_t)import_delay, 3118 (u_longlong_t)MMP_INTERVAL(ub), 3119 (u_longlong_t)ub->ub_mmp_delay, 3120 (u_longlong_t)import_intervals); 3121 3122 } else if (MMP_VALID(ub)) { 3123 /* 3124 * zfs-0.7 compatibility case 3125 */ 3126 3127 import_delay = MAX(import_delay, (multihost_interval + 3128 ub->ub_mmp_delay) * import_intervals); 3129 3130 zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu " 3131 "import_intervals=%llu leaves=%u", 3132 (u_longlong_t)import_delay, 3133 (u_longlong_t)ub->ub_mmp_delay, 3134 (u_longlong_t)import_intervals, 3135 vdev_count_leaves(spa)); 3136 } else { 3137 /* Using local tunings is the only reasonable option */ 3138 zfs_dbgmsg("pool last imported on non-MMP aware " 3139 "host using import_delay=%llu multihost_interval=%llu " 3140 "import_intervals=%llu", (u_longlong_t)import_delay, 3141 (u_longlong_t)multihost_interval, 3142 (u_longlong_t)import_intervals); 3143 } 3144 3145 return (import_delay); 3146 } 3147 3148 /* 3149 * Perform the import activity check. If the user canceled the import or 3150 * we detected activity then fail. 3151 */ 3152 static int 3153 spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) 3154 { 3155 uint64_t txg = ub->ub_txg; 3156 uint64_t timestamp = ub->ub_timestamp; 3157 uint64_t mmp_config = ub->ub_mmp_config; 3158 uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0; 3159 uint64_t import_delay; 3160 hrtime_t import_expire; 3161 nvlist_t *mmp_label = NULL; 3162 vdev_t *rvd = spa->spa_root_vdev; 3163 kcondvar_t cv; 3164 kmutex_t mtx; 3165 int error = 0; 3166 3167 cv_init(&cv, NULL, CV_DEFAULT, NULL); 3168 mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL); 3169 mutex_enter(&mtx); 3170 3171 /* 3172 * If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed 3173 * during the earlier tryimport. If the txg recorded there is 0 then 3174 * the pool is known to be active on another host. 3175 * 3176 * Otherwise, the pool might be in use on another host. Check for 3177 * changes in the uberblocks on disk if necessary. 
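 * The loop below re-reads the best uberblock roughly once per second
 * until import_expire, and fails with EREMOTEIO if the txg, timestamp,
 * or MMP sequence number changes underneath us.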
3178 */ 3179 if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { 3180 nvlist_t *nvinfo = fnvlist_lookup_nvlist(config, 3181 ZPOOL_CONFIG_LOAD_INFO); 3182 3183 if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) && 3184 fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) { 3185 vdev_uberblock_load(rvd, ub, &mmp_label); 3186 error = SET_ERROR(EREMOTEIO); 3187 goto out; 3188 } 3189 } 3190 3191 import_delay = spa_activity_check_duration(spa, ub); 3192 3193 /* Add a small random factor in case of simultaneous imports (0-25%) */ 3194 import_delay += import_delay * random_in_range(250) / 1000; 3195 3196 import_expire = gethrtime() + import_delay; 3197 3198 while (gethrtime() < import_expire) { 3199 (void) spa_import_progress_set_mmp_check(spa_guid(spa), 3200 NSEC2SEC(import_expire - gethrtime())); 3201 3202 vdev_uberblock_load(rvd, ub, &mmp_label); 3203 3204 if (txg != ub->ub_txg || timestamp != ub->ub_timestamp || 3205 mmp_seq != (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) { 3206 zfs_dbgmsg("multihost activity detected " 3207 "txg %llu ub_txg %llu " 3208 "timestamp %llu ub_timestamp %llu " 3209 "mmp_config %#llx ub_mmp_config %#llx", 3210 (u_longlong_t)txg, (u_longlong_t)ub->ub_txg, 3211 (u_longlong_t)timestamp, 3212 (u_longlong_t)ub->ub_timestamp, 3213 (u_longlong_t)mmp_config, 3214 (u_longlong_t)ub->ub_mmp_config); 3215 3216 error = SET_ERROR(EREMOTEIO); 3217 break; 3218 } 3219 3220 if (mmp_label) { 3221 nvlist_free(mmp_label); 3222 mmp_label = NULL; 3223 } 3224 3225 error = cv_timedwait_sig(&cv, &mtx, ddi_get_lbolt() + hz); 3226 if (error != -1) { 3227 error = SET_ERROR(EINTR); 3228 break; 3229 } 3230 error = 0; 3231 } 3232 3233 out: 3234 mutex_exit(&mtx); 3235 mutex_destroy(&mtx); 3236 cv_destroy(&cv); 3237 3238 /* 3239 * If the pool is determined to be active store the status in the 3240 * spa->spa_load_info nvlist. If the remote hostname or hostid are 3241 * available from configuration read from disk store them as well. 3242 * This allows 'zpool import' to generate a more useful message. 
3243 * 3244 * ZPOOL_CONFIG_MMP_STATE - observed pool status (mandatory) 3245 * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool 3246 * ZPOOL_CONFIG_MMP_HOSTID - hostid from the active pool 3247 */ 3248 if (error == EREMOTEIO) { 3249 char *hostname = "<unknown>"; 3250 uint64_t hostid = 0; 3251 3252 if (mmp_label) { 3253 if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) { 3254 hostname = fnvlist_lookup_string(mmp_label, 3255 ZPOOL_CONFIG_HOSTNAME); 3256 fnvlist_add_string(spa->spa_load_info, 3257 ZPOOL_CONFIG_MMP_HOSTNAME, hostname); 3258 } 3259 3260 if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) { 3261 hostid = fnvlist_lookup_uint64(mmp_label, 3262 ZPOOL_CONFIG_HOSTID); 3263 fnvlist_add_uint64(spa->spa_load_info, 3264 ZPOOL_CONFIG_MMP_HOSTID, hostid); 3265 } 3266 } 3267 3268 fnvlist_add_uint64(spa->spa_load_info, 3269 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE); 3270 fnvlist_add_uint64(spa->spa_load_info, 3271 ZPOOL_CONFIG_MMP_TXG, 0); 3272 3273 error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO); 3274 } 3275 3276 if (mmp_label) 3277 nvlist_free(mmp_label); 3278 3279 return (error); 3280 } 3281 3282 static int 3283 spa_verify_host(spa_t *spa, nvlist_t *mos_config) 3284 { 3285 uint64_t hostid; 3286 char *hostname; 3287 uint64_t myhostid = 0; 3288 3289 if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config, 3290 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 3291 hostname = fnvlist_lookup_string(mos_config, 3292 ZPOOL_CONFIG_HOSTNAME); 3293 3294 myhostid = zone_get_hostid(NULL); 3295 3296 if (hostid != 0 && myhostid != 0 && hostid != myhostid) { 3297 cmn_err(CE_WARN, "pool '%s' could not be " 3298 "loaded as it was last accessed by " 3299 "another system (host: %s hostid: 0x%llx). " 3300 "See: https://openzfs.github.io/openzfs-docs/msg/" 3301 "ZFS-8000-EY", 3302 spa_name(spa), hostname, (u_longlong_t)hostid); 3303 spa_load_failed(spa, "hostid verification failed: pool " 3304 "last accessed by host: %s (hostid: 0x%llx)", 3305 hostname, (u_longlong_t)hostid); 3306 return (SET_ERROR(EBADF)); 3307 } 3308 } 3309 3310 return (0); 3311 } 3312 3313 static int 3314 spa_ld_parse_config(spa_t *spa, spa_import_type_t type) 3315 { 3316 int error = 0; 3317 nvlist_t *nvtree, *nvl, *config = spa->spa_config; 3318 int parse; 3319 vdev_t *rvd; 3320 uint64_t pool_guid; 3321 char *comment; 3322 char *compatibility; 3323 3324 /* 3325 * Versioning wasn't explicitly added to the label until later, so if 3326 * it's not present treat it as the initial version. 3327 */ 3328 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 3329 &spa->spa_ubsync.ub_version) != 0) 3330 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 3331 3332 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { 3333 spa_load_failed(spa, "invalid config provided: '%s' missing", 3334 ZPOOL_CONFIG_POOL_GUID); 3335 return (SET_ERROR(EINVAL)); 3336 } 3337 3338 /* 3339 * If we are doing an import, ensure that the pool is not already 3340 * imported by checking if its pool guid already exists in the 3341 * spa namespace. 3342 * 3343 * The only case that we allow an already imported pool to be 3344 * imported again, is when the pool is checkpointed and we want to 3345 * look at its checkpointed state from userland tools like zdb. 
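 * (That exception only matters in userland, which is why the
 * spa_importing_readonly_checkpoint() test below is compiled out of
 * kernel builds.)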
3346 */ 3347 #ifdef _KERNEL 3348 if ((spa->spa_load_state == SPA_LOAD_IMPORT || 3349 spa->spa_load_state == SPA_LOAD_TRYIMPORT) && 3350 spa_guid_exists(pool_guid, 0)) { 3351 #else 3352 if ((spa->spa_load_state == SPA_LOAD_IMPORT || 3353 spa->spa_load_state == SPA_LOAD_TRYIMPORT) && 3354 spa_guid_exists(pool_guid, 0) && 3355 !spa_importing_readonly_checkpoint(spa)) { 3356 #endif 3357 spa_load_failed(spa, "a pool with guid %llu is already open", 3358 (u_longlong_t)pool_guid); 3359 return (SET_ERROR(EEXIST)); 3360 } 3361 3362 spa->spa_config_guid = pool_guid; 3363 3364 nvlist_free(spa->spa_load_info); 3365 spa->spa_load_info = fnvlist_alloc(); 3366 3367 ASSERT(spa->spa_comment == NULL); 3368 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 3369 spa->spa_comment = spa_strdup(comment); 3370 3371 ASSERT(spa->spa_compatibility == NULL); 3372 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMPATIBILITY, 3373 &compatibility) == 0) 3374 spa->spa_compatibility = spa_strdup(compatibility); 3375 3376 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 3377 &spa->spa_config_txg); 3378 3379 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0) 3380 spa->spa_config_splitting = fnvlist_dup(nvl); 3381 3382 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) { 3383 spa_load_failed(spa, "invalid config provided: '%s' missing", 3384 ZPOOL_CONFIG_VDEV_TREE); 3385 return (SET_ERROR(EINVAL)); 3386 } 3387 3388 /* 3389 * Create "The Godfather" zio to hold all async IOs 3390 */ 3391 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 3392 KM_SLEEP); 3393 for (int i = 0; i < max_ncpus; i++) { 3394 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 3395 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 3396 ZIO_FLAG_GODFATHER); 3397 } 3398 3399 /* 3400 * Parse the configuration into a vdev tree. We explicitly set the 3401 * value that will be returned by spa_version() since parsing the 3402 * configuration requires knowing the version number. 3403 */ 3404 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3405 parse = (type == SPA_IMPORT_EXISTING ? 3406 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 3407 error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse); 3408 spa_config_exit(spa, SCL_ALL, FTAG); 3409 3410 if (error != 0) { 3411 spa_load_failed(spa, "unable to parse config [error=%d]", 3412 error); 3413 return (error); 3414 } 3415 3416 ASSERT(spa->spa_root_vdev == rvd); 3417 ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); 3418 ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); 3419 3420 if (type != SPA_IMPORT_ASSEMBLE) { 3421 ASSERT(spa_guid(spa) == pool_guid); 3422 } 3423 3424 return (0); 3425 } 3426 3427 /* 3428 * Recursively open all vdevs in the vdev tree. This function is called twice: 3429 * first with the untrusted config, then with the trusted config. 3430 */ 3431 static int 3432 spa_ld_open_vdevs(spa_t *spa) 3433 { 3434 int error = 0; 3435 3436 /* 3437 * spa_missing_tvds_allowed defines how many top-level vdevs can be 3438 * missing/unopenable for the root vdev to be still considered openable. 
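 * The limit below comes from zfs_max_missing_tvds for a trusted config,
 * from zfs_max_missing_tvds_cachefile or zfs_max_missing_tvds_scan for
 * untrusted cachefile/scan configs (zero otherwise), and is then raised
 * to at least zfs_max_missing_tvds.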
3439 */ 3440 if (spa->spa_trust_config) { 3441 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds; 3442 } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) { 3443 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile; 3444 } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) { 3445 spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan; 3446 } else { 3447 spa->spa_missing_tvds_allowed = 0; 3448 } 3449 3450 spa->spa_missing_tvds_allowed = 3451 MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed); 3452 3453 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3454 error = vdev_open(spa->spa_root_vdev); 3455 spa_config_exit(spa, SCL_ALL, FTAG); 3456 3457 if (spa->spa_missing_tvds != 0) { 3458 spa_load_note(spa, "vdev tree has %lld missing top-level " 3459 "vdevs.", (u_longlong_t)spa->spa_missing_tvds); 3460 if (spa->spa_trust_config && (spa->spa_mode & SPA_MODE_WRITE)) { 3461 /* 3462 * Although theoretically we could allow users to open 3463 * incomplete pools in RW mode, we'd need to add a lot 3464 * of extra logic (e.g. adjust pool space to account 3465 * for missing vdevs). 3466 * This limitation also prevents users from accidentally 3467 * opening the pool in RW mode during data recovery and 3468 * damaging it further. 3469 */ 3470 spa_load_note(spa, "pools with missing top-level " 3471 "vdevs can only be opened in read-only mode."); 3472 error = SET_ERROR(ENXIO); 3473 } else { 3474 spa_load_note(spa, "current settings allow for maximum " 3475 "%lld missing top-level vdevs at this stage.", 3476 (u_longlong_t)spa->spa_missing_tvds_allowed); 3477 } 3478 } 3479 if (error != 0) { 3480 spa_load_failed(spa, "unable to open vdev tree [error=%d]", 3481 error); 3482 } 3483 if (spa->spa_missing_tvds != 0 || error != 0) 3484 vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2); 3485 3486 return (error); 3487 } 3488 3489 /* 3490 * We need to validate the vdev labels against the configuration that 3491 * we have in hand. This function is called twice: first with an untrusted 3492 * config, then with a trusted config. The validation is more strict when the 3493 * config is trusted. 3494 */ 3495 static int 3496 spa_ld_validate_vdevs(spa_t *spa) 3497 { 3498 int error = 0; 3499 vdev_t *rvd = spa->spa_root_vdev; 3500 3501 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3502 error = vdev_validate(rvd); 3503 spa_config_exit(spa, SCL_ALL, FTAG); 3504 3505 if (error != 0) { 3506 spa_load_failed(spa, "vdev_validate failed [error=%d]", error); 3507 return (error); 3508 } 3509 3510 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { 3511 spa_load_failed(spa, "cannot open vdev tree after invalidating " 3512 "some vdevs"); 3513 vdev_dbgmsg_print_tree(rvd, 2); 3514 return (SET_ERROR(ENXIO)); 3515 } 3516 3517 return (0); 3518 } 3519 3520 static void 3521 spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub) 3522 { 3523 spa->spa_state = POOL_STATE_ACTIVE; 3524 spa->spa_ubsync = spa->spa_uberblock; 3525 spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 3526 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 3527 spa->spa_first_txg = spa->spa_last_ubsync_txg ? 
3528 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 3529 spa->spa_claim_max_txg = spa->spa_first_txg; 3530 spa->spa_prev_software_version = ub->ub_software_version; 3531 } 3532 3533 static int 3534 spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) 3535 { 3536 vdev_t *rvd = spa->spa_root_vdev; 3537 nvlist_t *label; 3538 uberblock_t *ub = &spa->spa_uberblock; 3539 boolean_t activity_check = B_FALSE; 3540 3541 /* 3542 * If we are opening the checkpointed state of the pool by 3543 * rewinding to it, at this point we will have written the 3544 * checkpointed uberblock to the vdev labels, so searching 3545 * the labels will find the right uberblock. However, if 3546 * we are opening the checkpointed state read-only, we have 3547 * not modified the labels. Therefore, we must ignore the 3548 * labels and continue using the spa_uberblock that was set 3549 * by spa_ld_checkpoint_rewind. 3550 * 3551 * Note that it would be fine to ignore the labels when 3552 * rewinding (opening writeable) as well. However, if we 3553 * crash just after writing the labels, we will end up 3554 * searching the labels. Doing so in the common case means 3555 * that this code path gets exercised normally, rather than 3556 * just in the edge case. 3557 */ 3558 if (ub->ub_checkpoint_txg != 0 && 3559 spa_importing_readonly_checkpoint(spa)) { 3560 spa_ld_select_uberblock_done(spa, ub); 3561 return (0); 3562 } 3563 3564 /* 3565 * Find the best uberblock. 3566 */ 3567 vdev_uberblock_load(rvd, ub, &label); 3568 3569 /* 3570 * If we weren't able to find a single valid uberblock, return failure. 3571 */ 3572 if (ub->ub_txg == 0) { 3573 nvlist_free(label); 3574 spa_load_failed(spa, "no valid uberblock found"); 3575 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 3576 } 3577 3578 if (spa->spa_load_max_txg != UINT64_MAX) { 3579 (void) spa_import_progress_set_max_txg(spa_guid(spa), 3580 (u_longlong_t)spa->spa_load_max_txg); 3581 } 3582 spa_load_note(spa, "using uberblock with txg=%llu", 3583 (u_longlong_t)ub->ub_txg); 3584 3585 3586 /* 3587 * For pools which have the multihost property on determine if the 3588 * pool is truly inactive and can be safely imported. Prevent 3589 * hosts which don't have a hostid set from importing the pool. 3590 */ 3591 activity_check = spa_activity_check_required(spa, ub, label, 3592 spa->spa_config); 3593 if (activity_check) { 3594 if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay && 3595 spa_get_hostid(spa) == 0) { 3596 nvlist_free(label); 3597 fnvlist_add_uint64(spa->spa_load_info, 3598 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); 3599 return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); 3600 } 3601 3602 int error = spa_activity_check(spa, ub, spa->spa_config); 3603 if (error) { 3604 nvlist_free(label); 3605 return (error); 3606 } 3607 3608 fnvlist_add_uint64(spa->spa_load_info, 3609 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE); 3610 fnvlist_add_uint64(spa->spa_load_info, 3611 ZPOOL_CONFIG_MMP_TXG, ub->ub_txg); 3612 fnvlist_add_uint16(spa->spa_load_info, 3613 ZPOOL_CONFIG_MMP_SEQ, 3614 (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)); 3615 } 3616 3617 /* 3618 * If the pool has an unsupported version we can't open it. 
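 * (For feature-flag pools, the per-feature checks against
 * ZPOOL_CONFIG_FEATURES_FOR_READ further below give the finer-grained
 * answer.)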
3619 */ 3620 if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { 3621 nvlist_free(label); 3622 spa_load_failed(spa, "version %llu is not supported", 3623 (u_longlong_t)ub->ub_version); 3624 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 3625 } 3626 3627 if (ub->ub_version >= SPA_VERSION_FEATURES) { 3628 nvlist_t *features; 3629 3630 /* 3631 * If we weren't able to find what's necessary for reading the 3632 * MOS in the label, return failure. 3633 */ 3634 if (label == NULL) { 3635 spa_load_failed(spa, "label config unavailable"); 3636 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 3637 ENXIO)); 3638 } 3639 3640 if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ, 3641 &features) != 0) { 3642 nvlist_free(label); 3643 spa_load_failed(spa, "invalid label: '%s' missing", 3644 ZPOOL_CONFIG_FEATURES_FOR_READ); 3645 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 3646 ENXIO)); 3647 } 3648 3649 /* 3650 * Update our in-core representation with the definitive values 3651 * from the label. 3652 */ 3653 nvlist_free(spa->spa_label_features); 3654 spa->spa_label_features = fnvlist_dup(features); 3655 } 3656 3657 nvlist_free(label); 3658 3659 /* 3660 * Look through entries in the label nvlist's features_for_read. If 3661 * there is a feature listed there which we don't understand then we 3662 * cannot open a pool. 3663 */ 3664 if (ub->ub_version >= SPA_VERSION_FEATURES) { 3665 nvlist_t *unsup_feat; 3666 3667 unsup_feat = fnvlist_alloc(); 3668 3669 for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, 3670 NULL); nvp != NULL; 3671 nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { 3672 if (!zfeature_is_supported(nvpair_name(nvp))) { 3673 fnvlist_add_string(unsup_feat, 3674 nvpair_name(nvp), ""); 3675 } 3676 } 3677 3678 if (!nvlist_empty(unsup_feat)) { 3679 fnvlist_add_nvlist(spa->spa_load_info, 3680 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 3681 nvlist_free(unsup_feat); 3682 spa_load_failed(spa, "some features are unsupported"); 3683 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 3684 ENOTSUP)); 3685 } 3686 3687 nvlist_free(unsup_feat); 3688 } 3689 3690 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 3691 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3692 spa_try_repair(spa, spa->spa_config); 3693 spa_config_exit(spa, SCL_ALL, FTAG); 3694 nvlist_free(spa->spa_config_splitting); 3695 spa->spa_config_splitting = NULL; 3696 } 3697 3698 /* 3699 * Initialize internal SPA structures. 
3700 */ 3701 spa_ld_select_uberblock_done(spa, ub); 3702 3703 return (0); 3704 } 3705 3706 static int 3707 spa_ld_open_rootbp(spa_t *spa) 3708 { 3709 int error = 0; 3710 vdev_t *rvd = spa->spa_root_vdev; 3711 3712 error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 3713 if (error != 0) { 3714 spa_load_failed(spa, "unable to open rootbp in dsl_pool_init " 3715 "[error=%d]", error); 3716 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3717 } 3718 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 3719 3720 return (0); 3721 } 3722 3723 static int 3724 spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, 3725 boolean_t reloading) 3726 { 3727 vdev_t *mrvd, *rvd = spa->spa_root_vdev; 3728 nvlist_t *nv, *mos_config, *policy; 3729 int error = 0, copy_error; 3730 uint64_t healthy_tvds, healthy_tvds_mos; 3731 uint64_t mos_config_txg; 3732 3733 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE) 3734 != 0) 3735 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3736 3737 /* 3738 * If we're assembling a pool from a split, the config provided is 3739 * already trusted so there is nothing to do. 3740 */ 3741 if (type == SPA_IMPORT_ASSEMBLE) 3742 return (0); 3743 3744 healthy_tvds = spa_healthy_core_tvds(spa); 3745 3746 if (load_nvlist(spa, spa->spa_config_object, &mos_config) 3747 != 0) { 3748 spa_load_failed(spa, "unable to retrieve MOS config"); 3749 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3750 } 3751 3752 /* 3753 * If we are doing an open, pool owner wasn't verified yet, thus do 3754 * the verification here. 3755 */ 3756 if (spa->spa_load_state == SPA_LOAD_OPEN) { 3757 error = spa_verify_host(spa, mos_config); 3758 if (error != 0) { 3759 nvlist_free(mos_config); 3760 return (error); 3761 } 3762 } 3763 3764 nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE); 3765 3766 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3767 3768 /* 3769 * Build a new vdev tree from the trusted config 3770 */ 3771 error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD); 3772 if (error != 0) { 3773 nvlist_free(mos_config); 3774 spa_config_exit(spa, SCL_ALL, FTAG); 3775 spa_load_failed(spa, "spa_config_parse failed [error=%d]", 3776 error); 3777 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 3778 } 3779 3780 /* 3781 * Vdev paths in the MOS may be obsolete. If the untrusted config was 3782 * obtained by scanning /dev/dsk, then it will have the right vdev 3783 * paths. We update the trusted MOS config with this information. 3784 * We first try to copy the paths with vdev_copy_path_strict, which 3785 * succeeds only when both configs have exactly the same vdev tree. 3786 * If that fails, we fall back to a more flexible method that has a 3787 * best effort policy. 3788 */ 3789 copy_error = vdev_copy_path_strict(rvd, mrvd); 3790 if (copy_error != 0 || spa_load_print_vdev_tree) { 3791 spa_load_note(spa, "provided vdev tree:"); 3792 vdev_dbgmsg_print_tree(rvd, 2); 3793 spa_load_note(spa, "MOS vdev tree:"); 3794 vdev_dbgmsg_print_tree(mrvd, 2); 3795 } 3796 if (copy_error != 0) { 3797 spa_load_note(spa, "vdev_copy_path_strict failed, falling " 3798 "back to vdev_copy_path_relaxed"); 3799 vdev_copy_path_relaxed(rvd, mrvd); 3800 } 3801 3802 vdev_close(rvd); 3803 vdev_free(rvd); 3804 spa->spa_root_vdev = mrvd; 3805 rvd = mrvd; 3806 spa_config_exit(spa, SCL_ALL, FTAG); 3807 3808 /* 3809 * We will use spa_config if we decide to reload the spa or if spa_load 3810 * fails and we rewind. 
We must thus regenerate the config using the 3811 * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to 3812 * pass settings on how to load the pool and is not stored in the MOS. 3813 * We copy it over to our new, trusted config. 3814 */ 3815 mos_config_txg = fnvlist_lookup_uint64(mos_config, 3816 ZPOOL_CONFIG_POOL_TXG); 3817 nvlist_free(mos_config); 3818 mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE); 3819 if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY, 3820 &policy) == 0) 3821 fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy); 3822 spa_config_set(spa, mos_config); 3823 spa->spa_config_source = SPA_CONFIG_SRC_MOS; 3824 3825 /* 3826 * Now that we got the config from the MOS, we should be more strict 3827 * in checking blkptrs and can make assumptions about the consistency 3828 * of the vdev tree. spa_trust_config must be set to true before opening 3829 * vdevs in order for them to be writeable. 3830 */ 3831 spa->spa_trust_config = B_TRUE; 3832 3833 /* 3834 * Open and validate the new vdev tree 3835 */ 3836 error = spa_ld_open_vdevs(spa); 3837 if (error != 0) 3838 return (error); 3839 3840 error = spa_ld_validate_vdevs(spa); 3841 if (error != 0) 3842 return (error); 3843 3844 if (copy_error != 0 || spa_load_print_vdev_tree) { 3845 spa_load_note(spa, "final vdev tree:"); 3846 vdev_dbgmsg_print_tree(rvd, 2); 3847 } 3848 3849 if (spa->spa_load_state != SPA_LOAD_TRYIMPORT && 3850 !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) { 3851 /* 3852 * Sanity check to make sure that we are indeed loading the 3853 * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds 3854 * in the config provided and they happened to be the only ones 3855 * to have the latest uberblock, we could involuntarily perform 3856 * an extreme rewind. 3857 */ 3858 healthy_tvds_mos = spa_healthy_core_tvds(spa); 3859 if (healthy_tvds_mos - healthy_tvds >= 3860 SPA_SYNC_MIN_VDEVS) { 3861 spa_load_note(spa, "config provided misses too many " 3862 "top-level vdevs compared to MOS (%lld vs %lld). ", 3863 (u_longlong_t)healthy_tvds, 3864 (u_longlong_t)healthy_tvds_mos); 3865 spa_load_note(spa, "vdev tree:"); 3866 vdev_dbgmsg_print_tree(rvd, 2); 3867 if (reloading) { 3868 spa_load_failed(spa, "config was already " 3869 "provided from MOS. Aborting."); 3870 return (spa_vdev_err(rvd, 3871 VDEV_AUX_CORRUPT_DATA, EIO)); 3872 } 3873 spa_load_note(spa, "spa must be reloaded using MOS " 3874 "config"); 3875 return (SET_ERROR(EAGAIN)); 3876 } 3877 } 3878 3879 error = spa_check_for_missing_logs(spa); 3880 if (error != 0) 3881 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 3882 3883 if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) { 3884 spa_load_failed(spa, "uberblock guid sum doesn't match MOS " 3885 "guid sum (%llu != %llu)", 3886 (u_longlong_t)spa->spa_uberblock.ub_guid_sum, 3887 (u_longlong_t)rvd->vdev_guid_sum); 3888 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 3889 ENXIO)); 3890 } 3891 3892 return (0); 3893 } 3894 3895 static int 3896 spa_ld_open_indirect_vdev_metadata(spa_t *spa) 3897 { 3898 int error = 0; 3899 vdev_t *rvd = spa->spa_root_vdev; 3900 3901 /* 3902 * Everything that we read before spa_remove_init() must be stored 3903 * on concreted vdevs. Therefore we do this as early as possible. 
3904 */ 3905 error = spa_remove_init(spa); 3906 if (error != 0) { 3907 spa_load_failed(spa, "spa_remove_init failed [error=%d]", 3908 error); 3909 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3910 } 3911 3912 /* 3913 * Retrieve information needed to condense indirect vdev mappings. 3914 */ 3915 error = spa_condense_init(spa); 3916 if (error != 0) { 3917 spa_load_failed(spa, "spa_condense_init failed [error=%d]", 3918 error); 3919 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 3920 } 3921 3922 return (0); 3923 } 3924 3925 static int 3926 spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep) 3927 { 3928 int error = 0; 3929 vdev_t *rvd = spa->spa_root_vdev; 3930 3931 if (spa_version(spa) >= SPA_VERSION_FEATURES) { 3932 boolean_t missing_feat_read = B_FALSE; 3933 nvlist_t *unsup_feat, *enabled_feat; 3934 3935 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, 3936 &spa->spa_feat_for_read_obj, B_TRUE) != 0) { 3937 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3938 } 3939 3940 if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, 3941 &spa->spa_feat_for_write_obj, B_TRUE) != 0) { 3942 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3943 } 3944 3945 if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, 3946 &spa->spa_feat_desc_obj, B_TRUE) != 0) { 3947 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 3948 } 3949 3950 enabled_feat = fnvlist_alloc(); 3951 unsup_feat = fnvlist_alloc(); 3952 3953 if (!spa_features_check(spa, B_FALSE, 3954 unsup_feat, enabled_feat)) 3955 missing_feat_read = B_TRUE; 3956 3957 if (spa_writeable(spa) || 3958 spa->spa_load_state == SPA_LOAD_TRYIMPORT) { 3959 if (!spa_features_check(spa, B_TRUE, 3960 unsup_feat, enabled_feat)) { 3961 *missing_feat_writep = B_TRUE; 3962 } 3963 } 3964 3965 fnvlist_add_nvlist(spa->spa_load_info, 3966 ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); 3967 3968 if (!nvlist_empty(unsup_feat)) { 3969 fnvlist_add_nvlist(spa->spa_load_info, 3970 ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); 3971 } 3972 3973 fnvlist_free(enabled_feat); 3974 fnvlist_free(unsup_feat); 3975 3976 if (!missing_feat_read) { 3977 fnvlist_add_boolean(spa->spa_load_info, 3978 ZPOOL_CONFIG_CAN_RDONLY); 3979 } 3980 3981 /* 3982 * If the state is SPA_LOAD_TRYIMPORT, our objective is 3983 * twofold: to determine whether the pool is available for 3984 * import in read-write mode and (if it is not) whether the 3985 * pool is available for import in read-only mode. If the pool 3986 * is available for import in read-write mode, it is displayed 3987 * as available in userland; if it is not available for import 3988 * in read-only mode, it is displayed as unavailable in 3989 * userland. If the pool is available for import in read-only 3990 * mode but not read-write mode, it is displayed as unavailable 3991 * in userland with a special note that the pool is actually 3992 * available for open in read-only mode. 3993 * 3994 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are 3995 * missing a feature for write, we must first determine whether 3996 * the pool can be opened read-only before returning to 3997 * userland in order to know whether to display the 3998 * abovementioned note. 3999 */ 4000 if (missing_feat_read || (*missing_feat_writep && 4001 spa_writeable(spa))) { 4002 spa_load_failed(spa, "pool uses unsupported features"); 4003 return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, 4004 ENOTSUP)); 4005 } 4006 4007 /* 4008 * Load refcounts for ZFS features from disk into an in-memory 4009 * cache during SPA initialization. 
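 *
 * Illustrative sketch only (hypothetical helper, not part of this
 * file): once the cache is populated below, a caller could test a
 * feature's refcount without another MOS lookup, e.g.
 *
 *	static boolean_t
 *	spa_feature_cached_in_use(spa_t *spa, spa_feature_t f)
 *	{
 *		uint64_t rc = spa->spa_feat_refcount_cache[f];
 *		return (rc != SPA_FEATURE_DISABLED && rc > 0);
 *	}
 *
 * In practice callers go through the feature_get_refcount*()
 * interfaces instead (see spa_feature_stats_from_cache() further down
 * in this file).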
4010 */ 4011 for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { 4012 uint64_t refcount; 4013 4014 error = feature_get_refcount_from_disk(spa, 4015 &spa_feature_table[i], &refcount); 4016 if (error == 0) { 4017 spa->spa_feat_refcount_cache[i] = refcount; 4018 } else if (error == ENOTSUP) { 4019 spa->spa_feat_refcount_cache[i] = 4020 SPA_FEATURE_DISABLED; 4021 } else { 4022 spa_load_failed(spa, "error getting refcount " 4023 "for feature %s [error=%d]", 4024 spa_feature_table[i].fi_guid, error); 4025 return (spa_vdev_err(rvd, 4026 VDEV_AUX_CORRUPT_DATA, EIO)); 4027 } 4028 } 4029 } 4030 4031 if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { 4032 if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, 4033 &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0) 4034 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4035 } 4036 4037 /* 4038 * Encryption was added before bookmark_v2, even though bookmark_v2 4039 * is now a dependency. If this pool has encryption enabled without 4040 * bookmark_v2, trigger an errata message. 4041 */ 4042 if (spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) && 4043 !spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) { 4044 spa->spa_errata = ZPOOL_ERRATA_ZOL_8308_ENCRYPTION; 4045 } 4046 4047 return (0); 4048 } 4049 4050 static int 4051 spa_ld_load_special_directories(spa_t *spa) 4052 { 4053 int error = 0; 4054 vdev_t *rvd = spa->spa_root_vdev; 4055 4056 spa->spa_is_initializing = B_TRUE; 4057 error = dsl_pool_open(spa->spa_dsl_pool); 4058 spa->spa_is_initializing = B_FALSE; 4059 if (error != 0) { 4060 spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error); 4061 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4062 } 4063 4064 return (0); 4065 } 4066 4067 static int 4068 spa_ld_get_props(spa_t *spa) 4069 { 4070 int error = 0; 4071 uint64_t obj; 4072 vdev_t *rvd = spa->spa_root_vdev; 4073 4074 /* Grab the checksum salt from the MOS. */ 4075 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 4076 DMU_POOL_CHECKSUM_SALT, 1, 4077 sizeof (spa->spa_cksum_salt.zcs_bytes), 4078 spa->spa_cksum_salt.zcs_bytes); 4079 if (error == ENOENT) { 4080 /* Generate a new salt for subsequent use */ 4081 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 4082 sizeof (spa->spa_cksum_salt.zcs_bytes)); 4083 } else if (error != 0) { 4084 spa_load_failed(spa, "unable to retrieve checksum salt from " 4085 "MOS [error=%d]", error); 4086 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4087 } 4088 4089 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0) 4090 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4091 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 4092 if (error != 0) { 4093 spa_load_failed(spa, "error opening deferred-frees bpobj " 4094 "[error=%d]", error); 4095 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4096 } 4097 4098 /* 4099 * Load the bit that tells us to use the new accounting function 4100 * (raid-z deflation). If we have an older pool, this will not 4101 * be present. 4102 */ 4103 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE); 4104 if (error != 0 && error != ENOENT) 4105 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4106 4107 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 4108 &spa->spa_creation_version, B_FALSE); 4109 if (error != 0 && error != ENOENT) 4110 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4111 4112 /* 4113 * Load the persistent error log. If we have an older pool, this will 4114 * not be present. 
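 *
 * The lookups below follow the usual pattern for optional MOS
 * entries: ENOENT simply means an older pool and is not treated as an
 * error. As an illustration of where this data surfaces (code quoted
 * from spa_get_stats() later in this file), the persistent error
 * count ends up in the config handed back to userland:
 *
 *	fnvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
 *	    spa_get_errlog_size(spa));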
4115 */ 4116 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last, 4117 B_FALSE); 4118 if (error != 0 && error != ENOENT) 4119 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4120 4121 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 4122 &spa->spa_errlog_scrub, B_FALSE); 4123 if (error != 0 && error != ENOENT) 4124 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4125 4126 /* 4127 * Load the livelist deletion field. If a livelist is queued for 4128 * deletion, indicate that in the spa 4129 */ 4130 error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES, 4131 &spa->spa_livelists_to_delete, B_FALSE); 4132 if (error != 0 && error != ENOENT) 4133 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4134 4135 /* 4136 * Load the history object. If we have an older pool, this 4137 * will not be present. 4138 */ 4139 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE); 4140 if (error != 0 && error != ENOENT) 4141 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4142 4143 /* 4144 * Load the per-vdev ZAP map. If we have an older pool, this will not 4145 * be present; in this case, defer its creation to a later time to 4146 * avoid dirtying the MOS this early / out of sync context. See 4147 * spa_sync_config_object. 4148 */ 4149 4150 /* The sentinel is only available in the MOS config. */ 4151 nvlist_t *mos_config; 4152 if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { 4153 spa_load_failed(spa, "unable to retrieve MOS config"); 4154 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4155 } 4156 4157 error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, 4158 &spa->spa_all_vdev_zaps, B_FALSE); 4159 4160 if (error == ENOENT) { 4161 VERIFY(!nvlist_exists(mos_config, 4162 ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 4163 spa->spa_avz_action = AVZ_ACTION_INITIALIZE; 4164 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 4165 } else if (error != 0) { 4166 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4167 } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { 4168 /* 4169 * An older version of ZFS overwrote the sentinel value, so 4170 * we have orphaned per-vdev ZAPs in the MOS. Defer their 4171 * destruction to later; see spa_sync_config_object. 4172 */ 4173 spa->spa_avz_action = AVZ_ACTION_DESTROY; 4174 /* 4175 * We're assuming that no vdevs have had their ZAPs created 4176 * before this. Better be sure of it. 
4177 */ 4178 ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); 4179 } 4180 nvlist_free(mos_config); 4181 4182 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 4183 4184 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object, 4185 B_FALSE); 4186 if (error && error != ENOENT) 4187 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4188 4189 if (error == 0) { 4190 uint64_t autoreplace = 0; 4191 4192 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 4193 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 4194 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 4195 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 4196 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 4197 spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost); 4198 spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim); 4199 spa->spa_autoreplace = (autoreplace != 0); 4200 } 4201 4202 /* 4203 * If we are importing a pool with missing top-level vdevs, 4204 * we enforce that the pool doesn't panic or get suspended on 4205 * error since the likelihood of missing data is extremely high. 4206 */ 4207 if (spa->spa_missing_tvds > 0 && 4208 spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE && 4209 spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 4210 spa_load_note(spa, "forcing failmode to 'continue' " 4211 "as some top level vdevs are missing"); 4212 spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE; 4213 } 4214 4215 return (0); 4216 } 4217 4218 static int 4219 spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type) 4220 { 4221 int error = 0; 4222 vdev_t *rvd = spa->spa_root_vdev; 4223 4224 /* 4225 * If we're assembling the pool from the split-off vdevs of 4226 * an existing pool, we don't want to attach the spares & cache 4227 * devices. 4228 */ 4229 4230 /* 4231 * Load any hot spares for this pool. 4232 */ 4233 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object, 4234 B_FALSE); 4235 if (error != 0 && error != ENOENT) 4236 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4237 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 4238 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 4239 if (load_nvlist(spa, spa->spa_spares.sav_object, 4240 &spa->spa_spares.sav_config) != 0) { 4241 spa_load_failed(spa, "error loading spares nvlist"); 4242 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4243 } 4244 4245 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4246 spa_load_spares(spa); 4247 spa_config_exit(spa, SCL_ALL, FTAG); 4248 } else if (error == 0) { 4249 spa->spa_spares.sav_sync = B_TRUE; 4250 } 4251 4252 /* 4253 * Load any level 2 ARC devices for this pool. 
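 *
 * This mirrors the hot spare load above. The nvlist kept in
 * sav_config is what later feeds the config returned to userland;
 * for illustration, spa_add_l2cache() further down consumes it as:
 *
 *	VERIFY0(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
 *	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache));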
4254 */ 4255 error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 4256 &spa->spa_l2cache.sav_object, B_FALSE); 4257 if (error != 0 && error != ENOENT) 4258 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4259 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 4260 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 4261 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 4262 &spa->spa_l2cache.sav_config) != 0) { 4263 spa_load_failed(spa, "error loading l2cache nvlist"); 4264 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4265 } 4266 4267 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4268 spa_load_l2cache(spa); 4269 spa_config_exit(spa, SCL_ALL, FTAG); 4270 } else if (error == 0) { 4271 spa->spa_l2cache.sav_sync = B_TRUE; 4272 } 4273 4274 return (0); 4275 } 4276 4277 static int 4278 spa_ld_load_vdev_metadata(spa_t *spa) 4279 { 4280 int error = 0; 4281 vdev_t *rvd = spa->spa_root_vdev; 4282 4283 /* 4284 * If the 'multihost' property is set, then never allow a pool to 4285 * be imported when the system hostid is zero. The exception to 4286 * this rule is zdb which is always allowed to access pools. 4287 */ 4288 if (spa_multihost(spa) && spa_get_hostid(spa) == 0 && 4289 (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) { 4290 fnvlist_add_uint64(spa->spa_load_info, 4291 ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); 4292 return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); 4293 } 4294 4295 /* 4296 * If the 'autoreplace' property is set, then post a resource notifying 4297 * the ZFS DE that it should not issue any faults for unopenable 4298 * devices. We also iterate over the vdevs, and post a sysevent for any 4299 * unopenable vdevs so that the normal autoreplace handler can take 4300 * over. 4301 */ 4302 if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 4303 spa_check_removed(spa->spa_root_vdev); 4304 /* 4305 * For the import case, this is done in spa_import(), because 4306 * at this point we're using the spare definitions from 4307 * the MOS config, not necessarily from the userland config. 4308 */ 4309 if (spa->spa_load_state != SPA_LOAD_IMPORT) { 4310 spa_aux_check_removed(&spa->spa_spares); 4311 spa_aux_check_removed(&spa->spa_l2cache); 4312 } 4313 } 4314 4315 /* 4316 * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc. 4317 */ 4318 error = vdev_load(rvd); 4319 if (error != 0) { 4320 spa_load_failed(spa, "vdev_load failed [error=%d]", error); 4321 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4322 } 4323 4324 error = spa_ld_log_spacemaps(spa); 4325 if (error != 0) { 4326 spa_load_failed(spa, "spa_ld_log_sm_data failed [error=%d]", 4327 error); 4328 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); 4329 } 4330 4331 /* 4332 * Propagate the leaf DTLs we just loaded all the way up the vdev tree. 
4333 */ 4334 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4335 vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE); 4336 spa_config_exit(spa, SCL_ALL, FTAG); 4337 4338 return (0); 4339 } 4340 4341 static int 4342 spa_ld_load_dedup_tables(spa_t *spa) 4343 { 4344 int error = 0; 4345 vdev_t *rvd = spa->spa_root_vdev; 4346 4347 error = ddt_load(spa); 4348 if (error != 0) { 4349 spa_load_failed(spa, "ddt_load failed [error=%d]", error); 4350 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 4351 } 4352 4353 return (0); 4354 } 4355 4356 static int 4357 spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, char **ereport) 4358 { 4359 vdev_t *rvd = spa->spa_root_vdev; 4360 4361 if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) { 4362 boolean_t missing = spa_check_logs(spa); 4363 if (missing) { 4364 if (spa->spa_missing_tvds != 0) { 4365 spa_load_note(spa, "spa_check_logs failed " 4366 "so dropping the logs"); 4367 } else { 4368 *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 4369 spa_load_failed(spa, "spa_check_logs failed"); 4370 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, 4371 ENXIO)); 4372 } 4373 } 4374 } 4375 4376 return (0); 4377 } 4378 4379 static int 4380 spa_ld_verify_pool_data(spa_t *spa) 4381 { 4382 int error = 0; 4383 vdev_t *rvd = spa->spa_root_vdev; 4384 4385 /* 4386 * We've successfully opened the pool, verify that we're ready 4387 * to start pushing transactions. 4388 */ 4389 if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) { 4390 error = spa_load_verify(spa); 4391 if (error != 0) { 4392 spa_load_failed(spa, "spa_load_verify failed " 4393 "[error=%d]", error); 4394 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 4395 error)); 4396 } 4397 } 4398 4399 return (0); 4400 } 4401 4402 static void 4403 spa_ld_claim_log_blocks(spa_t *spa) 4404 { 4405 dmu_tx_t *tx; 4406 dsl_pool_t *dp = spa_get_dsl(spa); 4407 4408 /* 4409 * Claim log blocks that haven't been committed yet. 4410 * This must all happen in a single txg. 4411 * Note: spa_claim_max_txg is updated by spa_claim_notify(), 4412 * invoked from zil_claim_log_block()'s i/o done callback. 4413 * Price of rollback is that we abandon the log. 4414 */ 4415 spa->spa_claiming = B_TRUE; 4416 4417 tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); 4418 (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, 4419 zil_claim, tx, DS_FIND_CHILDREN); 4420 dmu_tx_commit(tx); 4421 4422 spa->spa_claiming = B_FALSE; 4423 4424 spa_set_log_state(spa, SPA_LOG_GOOD); 4425 } 4426 4427 static void 4428 spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg, 4429 boolean_t update_config_cache) 4430 { 4431 vdev_t *rvd = spa->spa_root_vdev; 4432 int need_update = B_FALSE; 4433 4434 /* 4435 * If the config cache is stale, or we have uninitialized 4436 * metaslabs (see spa_vdev_add()), then update the config. 4437 * 4438 * If this is a verbatim import, trust the current 4439 * in-core spa_config and update the disk labels. 4440 */ 4441 if (update_config_cache || config_cache_txg != spa->spa_config_txg || 4442 spa->spa_load_state == SPA_LOAD_IMPORT || 4443 spa->spa_load_state == SPA_LOAD_RECOVER || 4444 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 4445 need_update = B_TRUE; 4446 4447 for (int c = 0; c < rvd->vdev_children; c++) 4448 if (rvd->vdev_child[c]->vdev_ms_array == 0) 4449 need_update = B_TRUE; 4450 4451 /* 4452 * Update the config cache asynchronously in case we're the 4453 * root pool, in which case the config cache isn't writable yet. 
4454 */ 4455 if (need_update) 4456 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 4457 } 4458 4459 static void 4460 spa_ld_prepare_for_reload(spa_t *spa) 4461 { 4462 spa_mode_t mode = spa->spa_mode; 4463 int async_suspended = spa->spa_async_suspended; 4464 4465 spa_unload(spa); 4466 spa_deactivate(spa); 4467 spa_activate(spa, mode); 4468 4469 /* 4470 * We save the value of spa_async_suspended as it gets reset to 0 by 4471 * spa_unload(). We want to restore it back to the original value before 4472 * returning as we might be calling spa_async_resume() later. 4473 */ 4474 spa->spa_async_suspended = async_suspended; 4475 } 4476 4477 static int 4478 spa_ld_read_checkpoint_txg(spa_t *spa) 4479 { 4480 uberblock_t checkpoint; 4481 int error = 0; 4482 4483 ASSERT0(spa->spa_checkpoint_txg); 4484 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4485 4486 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 4487 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), 4488 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); 4489 4490 if (error == ENOENT) 4491 return (0); 4492 4493 if (error != 0) 4494 return (error); 4495 4496 ASSERT3U(checkpoint.ub_txg, !=, 0); 4497 ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0); 4498 ASSERT3U(checkpoint.ub_timestamp, !=, 0); 4499 spa->spa_checkpoint_txg = checkpoint.ub_txg; 4500 spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; 4501 4502 return (0); 4503 } 4504 4505 static int 4506 spa_ld_mos_init(spa_t *spa, spa_import_type_t type) 4507 { 4508 int error = 0; 4509 4510 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4511 ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); 4512 4513 /* 4514 * Never trust the config that is provided unless we are assembling 4515 * a pool following a split. 4516 * This means don't trust blkptrs and the vdev tree in general. This 4517 * also effectively puts the spa in read-only mode since 4518 * spa_writeable() checks for spa_trust_config to be true. 4519 * We will later load a trusted config from the MOS. 4520 */ 4521 if (type != SPA_IMPORT_ASSEMBLE) 4522 spa->spa_trust_config = B_FALSE; 4523 4524 /* 4525 * Parse the config provided to create a vdev tree. 4526 */ 4527 error = spa_ld_parse_config(spa, type); 4528 if (error != 0) 4529 return (error); 4530 4531 spa_import_progress_add(spa); 4532 4533 /* 4534 * Now that we have the vdev tree, try to open each vdev. This involves 4535 * opening the underlying physical device, retrieving its geometry and 4536 * probing the vdev with a dummy I/O. The state of each vdev will be set 4537 * based on the success of those operations. After this we'll be ready 4538 * to read from the vdevs. 4539 */ 4540 error = spa_ld_open_vdevs(spa); 4541 if (error != 0) 4542 return (error); 4543 4544 /* 4545 * Read the label of each vdev and make sure that the GUIDs stored 4546 * there match the GUIDs in the config provided. 4547 * If we're assembling a new pool that's been split off from an 4548 * existing pool, the labels haven't yet been updated so we skip 4549 * validation for now. 4550 */ 4551 if (type != SPA_IMPORT_ASSEMBLE) { 4552 error = spa_ld_validate_vdevs(spa); 4553 if (error != 0) 4554 return (error); 4555 } 4556 4557 /* 4558 * Read all vdev labels to find the best uberblock (i.e. latest, 4559 * unless spa_load_max_txg is set) and store it in spa_uberblock. We 4560 * get the list of features required to read blkptrs in the MOS from 4561 * the vdev label with the best uberblock and verify that our version 4562 * of zfs supports them all. 
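 *
 * Rough conceptual sketch only (the authoritative ordering is the
 * block comment above vdev_uberblock_compare()):
 *
 *	best = NULL;
 *	for each uberblock ub found in a label:
 *		if (ub is past the requested spa_load_max_txg)
 *			skip ub;
 *		else if (best == NULL ||
 *		    vdev_uberblock_compare(ub, best) > 0)
 *			best = ub;
 *
 * Higher txg wins; roughly speaking, timestamps (and MMP information,
 * when enabled) break ties.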
4563 */ 4564 error = spa_ld_select_uberblock(spa, type); 4565 if (error != 0) 4566 return (error); 4567 4568 /* 4569 * Pass that uberblock to the dsl_pool layer which will open the root 4570 * blkptr. This blkptr points to the latest version of the MOS and will 4571 * allow us to read its contents. 4572 */ 4573 error = spa_ld_open_rootbp(spa); 4574 if (error != 0) 4575 return (error); 4576 4577 return (0); 4578 } 4579 4580 static int 4581 spa_ld_checkpoint_rewind(spa_t *spa) 4582 { 4583 uberblock_t checkpoint; 4584 int error = 0; 4585 4586 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4587 ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 4588 4589 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 4590 DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), 4591 sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); 4592 4593 if (error != 0) { 4594 spa_load_failed(spa, "unable to retrieve checkpointed " 4595 "uberblock from the MOS config [error=%d]", error); 4596 4597 if (error == ENOENT) 4598 error = ZFS_ERR_NO_CHECKPOINT; 4599 4600 return (error); 4601 } 4602 4603 ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg); 4604 ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg); 4605 4606 /* 4607 * We need to update the txg and timestamp of the checkpointed 4608 * uberblock to be higher than the latest one. This ensures that 4609 * the checkpointed uberblock is selected if we were to close and 4610 * reopen the pool right after we've written it in the vdev labels. 4611 * (also see block comment in vdev_uberblock_compare) 4612 */ 4613 checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1; 4614 checkpoint.ub_timestamp = gethrestime_sec(); 4615 4616 /* 4617 * Set current uberblock to be the checkpointed uberblock. 4618 */ 4619 spa->spa_uberblock = checkpoint; 4620 4621 /* 4622 * If we are doing a normal rewind, then the pool is open for 4623 * writing and we sync the "updated" checkpointed uberblock to 4624 * disk. Once this is done, we've basically rewound the whole 4625 * pool and there is no way back. 4626 * 4627 * There are cases when we don't want to attempt to sync the 4628 * checkpointed uberblock to disk because we are opening a 4629 * pool as read-only. Specifically, when verifying the checkpointed 4630 * state with zdb, or when importing the checkpointed state to get 4631 * a "preview" of its content.
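 *
 * For example (illustrative commands, assuming a pool named "tank"):
 *
 *	zdb -k tank                                # examine checkpoint
 *	zpool import --rewind-to-checkpoint \
 *	    -o readonly=on tank                    # read-only "preview"
 *
 * Neither case should rewrite the labels, which is why the sync below
 * only happens when spa_writeable() is true.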
4632 */ 4633 if (spa_writeable(spa)) { 4634 vdev_t *rvd = spa->spa_root_vdev; 4635 4636 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4637 vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; 4638 int svdcount = 0; 4639 int children = rvd->vdev_children; 4640 int c0 = random_in_range(children); 4641 4642 for (int c = 0; c < children; c++) { 4643 vdev_t *vd = rvd->vdev_child[(c0 + c) % children]; 4644 4645 /* Stop when revisiting the first vdev */ 4646 if (c > 0 && svd[0] == vd) 4647 break; 4648 4649 if (vd->vdev_ms_array == 0 || vd->vdev_islog || 4650 !vdev_is_concrete(vd)) 4651 continue; 4652 4653 svd[svdcount++] = vd; 4654 if (svdcount == SPA_SYNC_MIN_VDEVS) 4655 break; 4656 } 4657 error = vdev_config_sync(svd, svdcount, spa->spa_first_txg); 4658 if (error == 0) 4659 spa->spa_last_synced_guid = rvd->vdev_guid; 4660 spa_config_exit(spa, SCL_ALL, FTAG); 4661 4662 if (error != 0) { 4663 spa_load_failed(spa, "failed to write checkpointed " 4664 "uberblock to the vdev labels [error=%d]", error); 4665 return (error); 4666 } 4667 } 4668 4669 return (0); 4670 } 4671 4672 static int 4673 spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type, 4674 boolean_t *update_config_cache) 4675 { 4676 int error; 4677 4678 /* 4679 * Parse the config for pool, open and validate vdevs, 4680 * select an uberblock, and use that uberblock to open 4681 * the MOS. 4682 */ 4683 error = spa_ld_mos_init(spa, type); 4684 if (error != 0) 4685 return (error); 4686 4687 /* 4688 * Retrieve the trusted config stored in the MOS and use it to create 4689 * a new, exact version of the vdev tree, then reopen all vdevs. 4690 */ 4691 error = spa_ld_trusted_config(spa, type, B_FALSE); 4692 if (error == EAGAIN) { 4693 if (update_config_cache != NULL) 4694 *update_config_cache = B_TRUE; 4695 4696 /* 4697 * Redo the loading process with the trusted config if it is 4698 * too different from the untrusted config. 4699 */ 4700 spa_ld_prepare_for_reload(spa); 4701 spa_load_note(spa, "RELOADING"); 4702 error = spa_ld_mos_init(spa, type); 4703 if (error != 0) 4704 return (error); 4705 4706 error = spa_ld_trusted_config(spa, type, B_TRUE); 4707 if (error != 0) 4708 return (error); 4709 4710 } else if (error != 0) { 4711 return (error); 4712 } 4713 4714 return (0); 4715 } 4716 4717 /* 4718 * Load an existing storage pool, using the config provided. This config 4719 * describes which vdevs are part of the pool and is later validated against 4720 * partial configs present in each vdev's label and an entire copy of the 4721 * config stored in the MOS. 4722 */ 4723 static int 4724 spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) 4725 { 4726 int error = 0; 4727 boolean_t missing_feat_write = B_FALSE; 4728 boolean_t checkpoint_rewind = 4729 (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 4730 boolean_t update_config_cache = B_FALSE; 4731 4732 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4733 ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); 4734 4735 spa_load_note(spa, "LOADING"); 4736 4737 error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache); 4738 if (error != 0) 4739 return (error); 4740 4741 /* 4742 * If we are rewinding to the checkpoint then we need to repeat 4743 * everything we've done so far in this function but this time 4744 * selecting the checkpointed uberblock and using that to open 4745 * the MOS. 4746 */ 4747 if (checkpoint_rewind) { 4748 /* 4749 * If we are rewinding to the checkpoint update config cache 4750 * anyway. 
4751 */ 4752 update_config_cache = B_TRUE; 4753 4754 /* 4755 * Extract the checkpointed uberblock from the current MOS 4756 * and use this as the pool's uberblock from now on. If the 4757 * pool is imported as writeable we also write the checkpoint 4758 * uberblock to the labels, making the rewind permanent. 4759 */ 4760 error = spa_ld_checkpoint_rewind(spa); 4761 if (error != 0) 4762 return (error); 4763 4764 /* 4765 * Redo the loading process again with the 4766 * checkpointed uberblock. 4767 */ 4768 spa_ld_prepare_for_reload(spa); 4769 spa_load_note(spa, "LOADING checkpointed uberblock"); 4770 error = spa_ld_mos_with_trusted_config(spa, type, NULL); 4771 if (error != 0) 4772 return (error); 4773 } 4774 4775 /* 4776 * Retrieve the checkpoint txg if the pool has a checkpoint. 4777 */ 4778 error = spa_ld_read_checkpoint_txg(spa); 4779 if (error != 0) 4780 return (error); 4781 4782 /* 4783 * Retrieve the mapping of indirect vdevs. Those vdevs were removed 4784 * from the pool and their contents were re-mapped to other vdevs. Note 4785 * that everything that we read before this step must have been 4786 * rewritten on concrete vdevs after the last device removal was 4787 * initiated. Otherwise we could be reading from indirect vdevs before 4788 * we have loaded their mappings. 4789 */ 4790 error = spa_ld_open_indirect_vdev_metadata(spa); 4791 if (error != 0) 4792 return (error); 4793 4794 /* 4795 * Retrieve the full list of active features from the MOS and check if 4796 * they are all supported. 4797 */ 4798 error = spa_ld_check_features(spa, &missing_feat_write); 4799 if (error != 0) 4800 return (error); 4801 4802 /* 4803 * Load several special directories from the MOS needed by the dsl_pool 4804 * layer. 4805 */ 4806 error = spa_ld_load_special_directories(spa); 4807 if (error != 0) 4808 return (error); 4809 4810 /* 4811 * Retrieve pool properties from the MOS. 4812 */ 4813 error = spa_ld_get_props(spa); 4814 if (error != 0) 4815 return (error); 4816 4817 /* 4818 * Retrieve the list of auxiliary devices - cache devices and spares - 4819 * and open them. 4820 */ 4821 error = spa_ld_open_aux_vdevs(spa, type); 4822 if (error != 0) 4823 return (error); 4824 4825 /* 4826 * Load the metadata for all vdevs. Also check if unopenable devices 4827 * should be autoreplaced. 4828 */ 4829 error = spa_ld_load_vdev_metadata(spa); 4830 if (error != 0) 4831 return (error); 4832 4833 error = spa_ld_load_dedup_tables(spa); 4834 if (error != 0) 4835 return (error); 4836 4837 /* 4838 * Verify the logs now to make sure we don't have any unexpected errors 4839 * when we claim log blocks later. 4840 */ 4841 error = spa_ld_verify_logs(spa, type, ereport); 4842 if (error != 0) 4843 return (error); 4844 4845 if (missing_feat_write) { 4846 ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT); 4847 4848 /* 4849 * At this point, we know that we can open the pool in 4850 * read-only mode but not read-write mode. We now have enough 4851 * information and can return to userland. 4852 */ 4853 return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, 4854 ENOTSUP)); 4855 } 4856 4857 /* 4858 * Traverse the last txgs to make sure the pool was left off in a safe 4859 * state. When performing an extreme rewind, we verify the whole pool, 4860 * which can take a very long time. 4861 */ 4862 error = spa_ld_verify_pool_data(spa); 4863 if (error != 0) 4864 return (error); 4865 4866 /* 4867 * Calculate the deflated space for the pool. 
This must be done before 4868 * we write anything to the pool because we'd need to update the space 4869 * accounting using the deflated sizes. 4870 */ 4871 spa_update_dspace(spa); 4872 4873 /* 4874 * We have now retrieved all the information we needed to open the 4875 * pool. If we are importing the pool in read-write mode, a few 4876 * additional steps must be performed to finish the import. 4877 */ 4878 if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER || 4879 spa->spa_load_max_txg == UINT64_MAX)) { 4880 uint64_t config_cache_txg = spa->spa_config_txg; 4881 4882 ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); 4883 4884 /* 4885 * In case of a checkpoint rewind, log the original txg 4886 * of the checkpointed uberblock. 4887 */ 4888 if (checkpoint_rewind) { 4889 spa_history_log_internal(spa, "checkpoint rewind", 4890 NULL, "rewound state to txg=%llu", 4891 (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg); 4892 } 4893 4894 /* 4895 * Traverse the ZIL and claim all blocks. 4896 */ 4897 spa_ld_claim_log_blocks(spa); 4898 4899 /* 4900 * Kick-off the syncing thread. 4901 */ 4902 spa->spa_sync_on = B_TRUE; 4903 txg_sync_start(spa->spa_dsl_pool); 4904 mmp_thread_start(spa); 4905 4906 /* 4907 * Wait for all claims to sync. We sync up to the highest 4908 * claimed log block birth time so that claimed log blocks 4909 * don't appear to be from the future. spa_claim_max_txg 4910 * will have been set for us by ZIL traversal operations 4911 * performed above. 4912 */ 4913 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 4914 4915 /* 4916 * Check if we need to request an update of the config. On the 4917 * next sync, we would update the config stored in vdev labels 4918 * and the cachefile (by default /etc/zfs/zpool.cache). 4919 */ 4920 spa_ld_check_for_config_update(spa, config_cache_txg, 4921 update_config_cache); 4922 4923 /* 4924 * Check if a rebuild was in progress and if so resume it. 4925 * Then check all DTLs to see if anything needs resilvering. 4926 * The resilver will be deferred if a rebuild was started. 4927 */ 4928 if (vdev_rebuild_active(spa->spa_root_vdev)) { 4929 vdev_rebuild_restart(spa); 4930 } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 4931 vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 4932 spa_async_request(spa, SPA_ASYNC_RESILVER); 4933 } 4934 4935 /* 4936 * Log the fact that we booted up (so that we can detect if 4937 * we rebooted in the middle of an operation). 4938 */ 4939 spa_history_log_version(spa, "open", NULL); 4940 4941 spa_restart_removal(spa); 4942 spa_spawn_aux_threads(spa); 4943 4944 /* 4945 * Delete any inconsistent datasets. 4946 * 4947 * Note: 4948 * Since we may be issuing deletes for clones here, 4949 * we make sure to do so after we've spawned all the 4950 * auxiliary threads above (from which the livelist 4951 * deletion zthr is part of). 4952 */ 4953 (void) dmu_objset_find(spa_name(spa), 4954 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 4955 4956 /* 4957 * Clean up any stale temporary dataset userrefs. 
4958 */ 4959 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 4960 4961 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 4962 vdev_initialize_restart(spa->spa_root_vdev); 4963 vdev_trim_restart(spa->spa_root_vdev); 4964 vdev_autotrim_restart(spa); 4965 spa_config_exit(spa, SCL_CONFIG, FTAG); 4966 } 4967 4968 spa_import_progress_remove(spa_guid(spa)); 4969 spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); 4970 4971 spa_load_note(spa, "LOADED"); 4972 4973 return (0); 4974 } 4975 4976 static int 4977 spa_load_retry(spa_t *spa, spa_load_state_t state) 4978 { 4979 spa_mode_t mode = spa->spa_mode; 4980 4981 spa_unload(spa); 4982 spa_deactivate(spa); 4983 4984 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; 4985 4986 spa_activate(spa, mode); 4987 spa_async_suspend(spa); 4988 4989 spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu", 4990 (u_longlong_t)spa->spa_load_max_txg); 4991 4992 return (spa_load(spa, state, SPA_IMPORT_EXISTING)); 4993 } 4994 4995 /* 4996 * If spa_load() fails this function will try loading prior txg's. If 4997 * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool 4998 * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this 4999 * function will not rewind the pool and will return the same error as 5000 * spa_load(). 5001 */ 5002 static int 5003 spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request, 5004 int rewind_flags) 5005 { 5006 nvlist_t *loadinfo = NULL; 5007 nvlist_t *config = NULL; 5008 int load_error, rewind_error; 5009 uint64_t safe_rewind_txg; 5010 uint64_t min_txg; 5011 5012 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 5013 spa->spa_load_max_txg = spa->spa_load_txg; 5014 spa_set_log_state(spa, SPA_LOG_CLEAR); 5015 } else { 5016 spa->spa_load_max_txg = max_request; 5017 if (max_request != UINT64_MAX) 5018 spa->spa_extreme_rewind = B_TRUE; 5019 } 5020 5021 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING); 5022 if (load_error == 0) 5023 return (0); 5024 if (load_error == ZFS_ERR_NO_CHECKPOINT) { 5025 /* 5026 * When attempting checkpoint-rewind on a pool with no 5027 * checkpoint, we should not attempt to load uberblocks 5028 * from previous txgs when spa_load fails. 5029 */ 5030 ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); 5031 spa_import_progress_remove(spa_guid(spa)); 5032 return (load_error); 5033 } 5034 5035 if (spa->spa_root_vdev != NULL) 5036 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 5037 5038 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 5039 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 5040 5041 if (rewind_flags & ZPOOL_NEVER_REWIND) { 5042 nvlist_free(config); 5043 spa_import_progress_remove(spa_guid(spa)); 5044 return (load_error); 5045 } 5046 5047 if (state == SPA_LOAD_RECOVER) { 5048 /* Price of rolling back is discarding txgs, including log */ 5049 spa_set_log_state(spa, SPA_LOG_CLEAR); 5050 } else { 5051 /* 5052 * If we aren't rolling back save the load info from our first 5053 * import attempt so that we can restore it after attempting 5054 * to rewind. 5055 */ 5056 loadinfo = spa->spa_load_info; 5057 spa->spa_load_info = fnvlist_alloc(); 5058 } 5059 5060 spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 5061 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 5062 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 
5063 TXG_INITIAL : safe_rewind_txg; 5064 5065 /* 5066 * Continue as long as we're finding errors, we're still within 5067 * the acceptable rewind range, and we're still finding uberblocks. 5068 */ 5069 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 5070 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 5071 if (spa->spa_load_max_txg < safe_rewind_txg) 5072 spa->spa_extreme_rewind = B_TRUE; 5073 rewind_error = spa_load_retry(spa, state); 5074 } 5075 5076 spa->spa_extreme_rewind = B_FALSE; 5077 spa->spa_load_max_txg = UINT64_MAX; 5078 5079 if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 5080 spa_config_set(spa, config); 5081 else 5082 nvlist_free(config); 5083 5084 if (state == SPA_LOAD_RECOVER) { 5085 ASSERT3P(loadinfo, ==, NULL); 5086 spa_import_progress_remove(spa_guid(spa)); 5087 return (rewind_error); 5088 } else { 5089 /* Store the rewind info as part of the initial load info */ 5090 fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, 5091 spa->spa_load_info); 5092 5093 /* Restore the initial load info */ 5094 fnvlist_free(spa->spa_load_info); 5095 spa->spa_load_info = loadinfo; 5096 5097 spa_import_progress_remove(spa_guid(spa)); 5098 return (load_error); 5099 } 5100 } 5101 5102 /* 5103 * Pool Open/Import 5104 * 5105 * The import case is identical to an open except that the configuration is sent 5106 * down from userland, instead of being grabbed from the configuration cache. For the 5107 * case of an open, the pool configuration will exist in the 5108 * POOL_STATE_UNINITIALIZED state. 5109 * 5110 * The stats information (gen/count/ustats) is used to gather vdev statistics at 5111 * the same time we open the pool, without having to keep around the spa_t in some 5112 * ambiguous state. 5113 */ 5114 static int 5115 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, 5116 nvlist_t **config) 5117 { 5118 spa_t *spa; 5119 spa_load_state_t state = SPA_LOAD_OPEN; 5120 int error; 5121 int locked = B_FALSE; 5122 int firstopen = B_FALSE; 5123 5124 *spapp = NULL; 5125 5126 /* 5127 * As disgusting as this is, we need to support recursive calls to this 5128 * function because dsl_dir_open() is called during spa_load(), and ends 5129 * up calling spa_open() again. The real fix is to figure out how to 5130 * avoid dsl_dir_open() calling this in the first place. 5131 */ 5132 if (MUTEX_NOT_HELD(&spa_namespace_lock)) { 5133 mutex_enter(&spa_namespace_lock); 5134 locked = B_TRUE; 5135 } 5136 5137 if ((spa = spa_lookup(pool)) == NULL) { 5138 if (locked) 5139 mutex_exit(&spa_namespace_lock); 5140 return (SET_ERROR(ENOENT)); 5141 } 5142 5143 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 5144 zpool_load_policy_t policy; 5145 5146 firstopen = B_TRUE; 5147 5148 zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config, 5149 &policy); 5150 if (policy.zlp_rewind & ZPOOL_DO_REWIND) 5151 state = SPA_LOAD_RECOVER; 5152 5153 spa_activate(spa, spa_mode_global); 5154 5155 if (state != SPA_LOAD_RECOVER) 5156 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 5157 spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; 5158 5159 zfs_dbgmsg("spa_open_common: opening %s", pool); 5160 error = spa_load_best(spa, state, policy.zlp_txg, 5161 policy.zlp_rewind); 5162 5163 if (error == EBADF) { 5164 /* 5165 * If vdev_validate() returns failure (indicated by 5166 * EBADF), it means that one of the vdevs indicates 5167 * that the pool has been exported or destroyed.
If 5168 * this is the case, the config cache is out of sync and 5169 * we should remove the pool from the namespace. 5170 */ 5171 spa_unload(spa); 5172 spa_deactivate(spa); 5173 spa_write_cachefile(spa, B_TRUE, B_TRUE); 5174 spa_remove(spa); 5175 if (locked) 5176 mutex_exit(&spa_namespace_lock); 5177 return (SET_ERROR(ENOENT)); 5178 } 5179 5180 if (error) { 5181 /* 5182 * We can't open the pool, but we still have useful 5183 * information: the state of each vdev after the 5184 * attempted vdev_open(). Return this to the user. 5185 */ 5186 if (config != NULL && spa->spa_config) { 5187 *config = fnvlist_dup(spa->spa_config); 5188 fnvlist_add_nvlist(*config, 5189 ZPOOL_CONFIG_LOAD_INFO, 5190 spa->spa_load_info); 5191 } 5192 spa_unload(spa); 5193 spa_deactivate(spa); 5194 spa->spa_last_open_failed = error; 5195 if (locked) 5196 mutex_exit(&spa_namespace_lock); 5197 *spapp = NULL; 5198 return (error); 5199 } 5200 } 5201 5202 spa_open_ref(spa, tag); 5203 5204 if (config != NULL) 5205 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 5206 5207 /* 5208 * If we've recovered the pool, pass back any information we 5209 * gathered while doing the load. 5210 */ 5211 if (state == SPA_LOAD_RECOVER) { 5212 fnvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 5213 spa->spa_load_info); 5214 } 5215 5216 if (locked) { 5217 spa->spa_last_open_failed = 0; 5218 spa->spa_last_ubsync_txg = 0; 5219 spa->spa_load_txg = 0; 5220 mutex_exit(&spa_namespace_lock); 5221 } 5222 5223 if (firstopen) 5224 zvol_create_minors_recursive(spa_name(spa)); 5225 5226 *spapp = spa; 5227 5228 return (0); 5229 } 5230 5231 int 5232 spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 5233 nvlist_t **config) 5234 { 5235 return (spa_open_common(name, spapp, tag, policy, config)); 5236 } 5237 5238 int 5239 spa_open(const char *name, spa_t **spapp, void *tag) 5240 { 5241 return (spa_open_common(name, spapp, tag, NULL, NULL)); 5242 } 5243 5244 /* 5245 * Lookup the given spa_t, incrementing the inject count in the process, 5246 * preventing it from being exported or destroyed. 5247 */ 5248 spa_t * 5249 spa_inject_addref(char *name) 5250 { 5251 spa_t *spa; 5252 5253 mutex_enter(&spa_namespace_lock); 5254 if ((spa = spa_lookup(name)) == NULL) { 5255 mutex_exit(&spa_namespace_lock); 5256 return (NULL); 5257 } 5258 spa->spa_inject_ref++; 5259 mutex_exit(&spa_namespace_lock); 5260 5261 return (spa); 5262 } 5263 5264 void 5265 spa_inject_delref(spa_t *spa) 5266 { 5267 mutex_enter(&spa_namespace_lock); 5268 spa->spa_inject_ref--; 5269 mutex_exit(&spa_namespace_lock); 5270 } 5271 5272 /* 5273 * Add spares device information to the nvlist. 5274 */ 5275 static void 5276 spa_add_spares(spa_t *spa, nvlist_t *config) 5277 { 5278 nvlist_t **spares; 5279 uint_t i, nspares; 5280 nvlist_t *nvroot; 5281 uint64_t guid; 5282 vdev_stat_t *vs; 5283 uint_t vsc; 5284 uint64_t pool; 5285 5286 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 5287 5288 if (spa->spa_spares.sav_count == 0) 5289 return; 5290 5291 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 5292 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 5293 ZPOOL_CONFIG_SPARES, &spares, &nspares)); 5294 if (nspares != 0) { 5295 fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, spares, 5296 nspares); 5297 VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 5298 &spares, &nspares)); 5299 5300 /* 5301 * Go through and find any spares which have since been 5302 * repurposed as an active spare. 
If this is the case, update 5303 * their status appropriately. 5304 */ 5305 for (i = 0; i < nspares; i++) { 5306 guid = fnvlist_lookup_uint64(spares[i], 5307 ZPOOL_CONFIG_GUID); 5308 if (spa_spare_exists(guid, &pool, NULL) && 5309 pool != 0ULL) { 5310 VERIFY0(nvlist_lookup_uint64_array(spares[i], 5311 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, 5312 &vsc)); 5313 vs->vs_state = VDEV_STATE_CANT_OPEN; 5314 vs->vs_aux = VDEV_AUX_SPARED; 5315 } 5316 } 5317 } 5318 } 5319 5320 /* 5321 * Add l2cache device information to the nvlist, including vdev stats. 5322 */ 5323 static void 5324 spa_add_l2cache(spa_t *spa, nvlist_t *config) 5325 { 5326 nvlist_t **l2cache; 5327 uint_t i, j, nl2cache; 5328 nvlist_t *nvroot; 5329 uint64_t guid; 5330 vdev_t *vd; 5331 vdev_stat_t *vs; 5332 uint_t vsc; 5333 5334 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 5335 5336 if (spa->spa_l2cache.sav_count == 0) 5337 return; 5338 5339 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 5340 VERIFY0(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 5341 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache)); 5342 if (nl2cache != 0) { 5343 fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, l2cache, 5344 nl2cache); 5345 VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 5346 &l2cache, &nl2cache)); 5347 5348 /* 5349 * Update level 2 cache device stats. 5350 */ 5351 5352 for (i = 0; i < nl2cache; i++) { 5353 guid = fnvlist_lookup_uint64(l2cache[i], 5354 ZPOOL_CONFIG_GUID); 5355 5356 vd = NULL; 5357 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 5358 if (guid == 5359 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 5360 vd = spa->spa_l2cache.sav_vdevs[j]; 5361 break; 5362 } 5363 } 5364 ASSERT(vd != NULL); 5365 5366 VERIFY0(nvlist_lookup_uint64_array(l2cache[i], 5367 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)); 5368 vdev_get_stats(vd, vs); 5369 vdev_config_generate_stats(vd, l2cache[i]); 5370 5371 } 5372 } 5373 } 5374 5375 static void 5376 spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features) 5377 { 5378 zap_cursor_t zc; 5379 zap_attribute_t za; 5380 5381 if (spa->spa_feat_for_read_obj != 0) { 5382 for (zap_cursor_init(&zc, spa->spa_meta_objset, 5383 spa->spa_feat_for_read_obj); 5384 zap_cursor_retrieve(&zc, &za) == 0; 5385 zap_cursor_advance(&zc)) { 5386 ASSERT(za.za_integer_length == sizeof (uint64_t) && 5387 za.za_num_integers == 1); 5388 VERIFY0(nvlist_add_uint64(features, za.za_name, 5389 za.za_first_integer)); 5390 } 5391 zap_cursor_fini(&zc); 5392 } 5393 5394 if (spa->spa_feat_for_write_obj != 0) { 5395 for (zap_cursor_init(&zc, spa->spa_meta_objset, 5396 spa->spa_feat_for_write_obj); 5397 zap_cursor_retrieve(&zc, &za) == 0; 5398 zap_cursor_advance(&zc)) { 5399 ASSERT(za.za_integer_length == sizeof (uint64_t) && 5400 za.za_num_integers == 1); 5401 VERIFY0(nvlist_add_uint64(features, za.za_name, 5402 za.za_first_integer)); 5403 } 5404 zap_cursor_fini(&zc); 5405 } 5406 } 5407 5408 static void 5409 spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features) 5410 { 5411 int i; 5412 5413 for (i = 0; i < SPA_FEATURES; i++) { 5414 zfeature_info_t feature = spa_feature_table[i]; 5415 uint64_t refcount; 5416 5417 if (feature_get_refcount(spa, &feature, &refcount) != 0) 5418 continue; 5419 5420 VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount)); 5421 } 5422 } 5423 5424 /* 5425 * Store a list of pool features and their reference counts in the 5426 * config. 
5427 * 5428 * The first time this is called on a spa, allocate a new nvlist, fetch 5429 * the pool features and reference counts from disk, then save the list 5430 * in the spa. In subsequent calls on the same spa use the saved nvlist 5431 * and refresh its values from the cached reference counts. This 5432 * ensures we don't block here on I/O on a suspended pool so 'zpool 5433 * clear' can resume the pool. 5434 */ 5435 static void 5436 spa_add_feature_stats(spa_t *spa, nvlist_t *config) 5437 { 5438 nvlist_t *features; 5439 5440 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 5441 5442 mutex_enter(&spa->spa_feat_stats_lock); 5443 features = spa->spa_feat_stats; 5444 5445 if (features != NULL) { 5446 spa_feature_stats_from_cache(spa, features); 5447 } else { 5448 VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP)); 5449 spa->spa_feat_stats = features; 5450 spa_feature_stats_from_disk(spa, features); 5451 } 5452 5453 VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, 5454 features)); 5455 5456 mutex_exit(&spa->spa_feat_stats_lock); 5457 } 5458 5459 int 5460 spa_get_stats(const char *name, nvlist_t **config, 5461 char *altroot, size_t buflen) 5462 { 5463 int error; 5464 spa_t *spa; 5465 5466 *config = NULL; 5467 error = spa_open_common(name, &spa, FTAG, NULL, config); 5468 5469 if (spa != NULL) { 5470 /* 5471 * This still leaves a window of inconsistency where the spares 5472 * or l2cache devices could change and the config would be 5473 * self-inconsistent. 5474 */ 5475 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5476 5477 if (*config != NULL) { 5478 uint64_t loadtimes[2]; 5479 5480 loadtimes[0] = spa->spa_loaded_ts.tv_sec; 5481 loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 5482 fnvlist_add_uint64_array(*config, 5483 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2); 5484 5485 fnvlist_add_uint64(*config, 5486 ZPOOL_CONFIG_ERRCOUNT, 5487 spa_get_errlog_size(spa)); 5488 5489 if (spa_suspended(spa)) { 5490 fnvlist_add_uint64(*config, 5491 ZPOOL_CONFIG_SUSPENDED, 5492 spa->spa_failmode); 5493 fnvlist_add_uint64(*config, 5494 ZPOOL_CONFIG_SUSPENDED_REASON, 5495 spa->spa_suspended); 5496 } 5497 5498 spa_add_spares(spa, *config); 5499 spa_add_l2cache(spa, *config); 5500 spa_add_feature_stats(spa, *config); 5501 } 5502 } 5503 5504 /* 5505 * We want to get the alternate root even for faulted pools, so we cheat 5506 * and call spa_lookup() directly. 5507 */ 5508 if (altroot) { 5509 if (spa == NULL) { 5510 mutex_enter(&spa_namespace_lock); 5511 spa = spa_lookup(name); 5512 if (spa) 5513 spa_altroot(spa, altroot, buflen); 5514 else 5515 altroot[0] = '\0'; 5516 spa = NULL; 5517 mutex_exit(&spa_namespace_lock); 5518 } else { 5519 spa_altroot(spa, altroot, buflen); 5520 } 5521 } 5522 5523 if (spa != NULL) { 5524 spa_config_exit(spa, SCL_CONFIG, FTAG); 5525 spa_close(spa, FTAG); 5526 } 5527 5528 return (error); 5529 } 5530 5531 /* 5532 * Validate that the auxiliary device array is well formed. We must have an 5533 * array of nvlists, each which describes a valid leaf vdev. If this is an 5534 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 5535 * specified, as long as they are well-formed. 
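 *
 * For illustration, a minimal well-formed entry (the device path is
 * just a placeholder) would be built roughly like:
 *
 *	nvlist_t *nv = fnvlist_alloc();
 *	fnvlist_add_string(nv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK);
 *	fnvlist_add_string(nv, ZPOOL_CONFIG_PATH, "/dev/...");
 *
 * Such entries arrive as the ZPOOL_CONFIG_SPARES or
 * ZPOOL_CONFIG_L2CACHE array under the supplied nvroot.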
5536 */ 5537 static int 5538 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 5539 spa_aux_vdev_t *sav, const char *config, uint64_t version, 5540 vdev_labeltype_t label) 5541 { 5542 nvlist_t **dev; 5543 uint_t i, ndev; 5544 vdev_t *vd; 5545 int error; 5546 5547 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 5548 5549 /* 5550 * It's acceptable to have no devs specified. 5551 */ 5552 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 5553 return (0); 5554 5555 if (ndev == 0) 5556 return (SET_ERROR(EINVAL)); 5557 5558 /* 5559 * Make sure the pool is formatted with a version that supports this 5560 * device type. 5561 */ 5562 if (spa_version(spa) < version) 5563 return (SET_ERROR(ENOTSUP)); 5564 5565 /* 5566 * Set the pending device list so we correctly handle device in-use 5567 * checking. 5568 */ 5569 sav->sav_pending = dev; 5570 sav->sav_npending = ndev; 5571 5572 for (i = 0; i < ndev; i++) { 5573 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 5574 mode)) != 0) 5575 goto out; 5576 5577 if (!vd->vdev_ops->vdev_op_leaf) { 5578 vdev_free(vd); 5579 error = SET_ERROR(EINVAL); 5580 goto out; 5581 } 5582 5583 vd->vdev_top = vd; 5584 5585 if ((error = vdev_open(vd)) == 0 && 5586 (error = vdev_label_init(vd, crtxg, label)) == 0) { 5587 fnvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 5588 vd->vdev_guid); 5589 } 5590 5591 vdev_free(vd); 5592 5593 if (error && 5594 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 5595 goto out; 5596 else 5597 error = 0; 5598 } 5599 5600 out: 5601 sav->sav_pending = NULL; 5602 sav->sav_npending = 0; 5603 return (error); 5604 } 5605 5606 static int 5607 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 5608 { 5609 int error; 5610 5611 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 5612 5613 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 5614 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 5615 VDEV_LABEL_SPARE)) != 0) { 5616 return (error); 5617 } 5618 5619 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 5620 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 5621 VDEV_LABEL_L2CACHE)); 5622 } 5623 5624 static void 5625 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 5626 const char *config) 5627 { 5628 int i; 5629 5630 if (sav->sav_config != NULL) { 5631 nvlist_t **olddevs; 5632 uint_t oldndevs; 5633 nvlist_t **newdevs; 5634 5635 /* 5636 * Generate new dev list by concatenating with the 5637 * current dev list. 5638 */ 5639 VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, config, 5640 &olddevs, &oldndevs)); 5641 5642 newdevs = kmem_alloc(sizeof (void *) * 5643 (ndevs + oldndevs), KM_SLEEP); 5644 for (i = 0; i < oldndevs; i++) 5645 newdevs[i] = fnvlist_dup(olddevs[i]); 5646 for (i = 0; i < ndevs; i++) 5647 newdevs[i + oldndevs] = fnvlist_dup(devs[i]); 5648 5649 fnvlist_remove(sav->sav_config, config); 5650 5651 fnvlist_add_nvlist_array(sav->sav_config, config, newdevs, 5652 ndevs + oldndevs); 5653 for (i = 0; i < oldndevs + ndevs; i++) 5654 nvlist_free(newdevs[i]); 5655 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 5656 } else { 5657 /* 5658 * Generate a new dev list. 
5659 */ 5660 sav->sav_config = fnvlist_alloc(); 5661 fnvlist_add_nvlist_array(sav->sav_config, config, devs, ndevs); 5662 } 5663 } 5664 5665 /* 5666 * Stop and drop level 2 ARC devices 5667 */ 5668 void 5669 spa_l2cache_drop(spa_t *spa) 5670 { 5671 vdev_t *vd; 5672 int i; 5673 spa_aux_vdev_t *sav = &spa->spa_l2cache; 5674 5675 for (i = 0; i < sav->sav_count; i++) { 5676 uint64_t pool; 5677 5678 vd = sav->sav_vdevs[i]; 5679 ASSERT(vd != NULL); 5680 5681 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 5682 pool != 0ULL && l2arc_vdev_present(vd)) 5683 l2arc_remove_vdev(vd); 5684 } 5685 } 5686 5687 /* 5688 * Verify encryption parameters for spa creation. If we are encrypting, we must 5689 * have the encryption feature flag enabled. 5690 */ 5691 static int 5692 spa_create_check_encryption_params(dsl_crypto_params_t *dcp, 5693 boolean_t has_encryption) 5694 { 5695 if (dcp->cp_crypt != ZIO_CRYPT_OFF && 5696 dcp->cp_crypt != ZIO_CRYPT_INHERIT && 5697 !has_encryption) 5698 return (SET_ERROR(ENOTSUP)); 5699 5700 return (dmu_objset_create_crypt_check(NULL, dcp, NULL)); 5701 } 5702 5703 /* 5704 * Pool Creation 5705 */ 5706 int 5707 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 5708 nvlist_t *zplprops, dsl_crypto_params_t *dcp) 5709 { 5710 spa_t *spa; 5711 char *altroot = NULL; 5712 vdev_t *rvd; 5713 dsl_pool_t *dp; 5714 dmu_tx_t *tx; 5715 int error = 0; 5716 uint64_t txg = TXG_INITIAL; 5717 nvlist_t **spares, **l2cache; 5718 uint_t nspares, nl2cache; 5719 uint64_t version, obj, ndraid = 0; 5720 boolean_t has_features; 5721 boolean_t has_encryption; 5722 boolean_t has_allocclass; 5723 spa_feature_t feat; 5724 char *feat_name; 5725 char *poolname; 5726 nvlist_t *nvl; 5727 5728 if (props == NULL || 5729 nvlist_lookup_string(props, "tname", &poolname) != 0) 5730 poolname = (char *)pool; 5731 5732 /* 5733 * If this pool already exists, return failure. 5734 */ 5735 mutex_enter(&spa_namespace_lock); 5736 if (spa_lookup(poolname) != NULL) { 5737 mutex_exit(&spa_namespace_lock); 5738 return (SET_ERROR(EEXIST)); 5739 } 5740 5741 /* 5742 * Allocate a new spa_t structure. 5743 */ 5744 nvl = fnvlist_alloc(); 5745 fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool); 5746 (void) nvlist_lookup_string(props, 5747 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 5748 spa = spa_add(poolname, nvl, altroot); 5749 fnvlist_free(nvl); 5750 spa_activate(spa, spa_mode_global); 5751 5752 if (props && (error = spa_prop_validate(spa, props))) { 5753 spa_deactivate(spa); 5754 spa_remove(spa); 5755 mutex_exit(&spa_namespace_lock); 5756 return (error); 5757 } 5758 5759 /* 5760 * Temporary pool names should never be written to disk. 
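* (The temporary name, if any, came in via the "tname" property looked up
* above; ZFS_IMPORT_TEMP_NAME is set below to note that the in-core name
* differs from the name stored in the config.)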
5761 */ 5762 if (poolname != pool) 5763 spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME; 5764 5765 has_features = B_FALSE; 5766 has_encryption = B_FALSE; 5767 has_allocclass = B_FALSE; 5768 for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); 5769 elem != NULL; elem = nvlist_next_nvpair(props, elem)) { 5770 if (zpool_prop_feature(nvpair_name(elem))) { 5771 has_features = B_TRUE; 5772 5773 feat_name = strchr(nvpair_name(elem), '@') + 1; 5774 VERIFY0(zfeature_lookup_name(feat_name, &feat)); 5775 if (feat == SPA_FEATURE_ENCRYPTION) 5776 has_encryption = B_TRUE; 5777 if (feat == SPA_FEATURE_ALLOCATION_CLASSES) 5778 has_allocclass = B_TRUE; 5779 } 5780 } 5781 5782 /* verify encryption params, if they were provided */ 5783 if (dcp != NULL) { 5784 error = spa_create_check_encryption_params(dcp, has_encryption); 5785 if (error != 0) { 5786 spa_deactivate(spa); 5787 spa_remove(spa); 5788 mutex_exit(&spa_namespace_lock); 5789 return (error); 5790 } 5791 } 5792 if (!has_allocclass && zfs_special_devs(nvroot, NULL)) { 5793 spa_deactivate(spa); 5794 spa_remove(spa); 5795 mutex_exit(&spa_namespace_lock); 5796 return (ENOTSUP); 5797 } 5798 5799 if (has_features || nvlist_lookup_uint64(props, 5800 zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { 5801 version = SPA_VERSION; 5802 } 5803 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 5804 5805 spa->spa_first_txg = txg; 5806 spa->spa_uberblock.ub_txg = txg - 1; 5807 spa->spa_uberblock.ub_version = version; 5808 spa->spa_ubsync = spa->spa_uberblock; 5809 spa->spa_load_state = SPA_LOAD_CREATE; 5810 spa->spa_removing_phys.sr_state = DSS_NONE; 5811 spa->spa_removing_phys.sr_removing_vdev = -1; 5812 spa->spa_removing_phys.sr_prev_indirect_vdev = -1; 5813 spa->spa_indirect_vdevs_loaded = B_TRUE; 5814 5815 /* 5816 * Create "The Godfather" zio to hold all async IOs 5817 */ 5818 spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), 5819 KM_SLEEP); 5820 for (int i = 0; i < max_ncpus; i++) { 5821 spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, 5822 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | 5823 ZIO_FLAG_GODFATHER); 5824 } 5825 5826 /* 5827 * Create the root vdev. 5828 */ 5829 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5830 5831 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 5832 5833 ASSERT(error != 0 || rvd != NULL); 5834 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 5835 5836 if (error == 0 && !zfs_allocatable_devs(nvroot)) 5837 error = SET_ERROR(EINVAL); 5838 5839 if (error == 0 && 5840 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 5841 (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 && 5842 (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { 5843 /* 5844 * instantiate the metaslab groups (this will dirty the vdevs) 5845 * we can no longer error exit past this point 5846 */ 5847 for (int c = 0; error == 0 && c < rvd->vdev_children; c++) { 5848 vdev_t *vd = rvd->vdev_child[c]; 5849 5850 vdev_metaslab_set_size(vd); 5851 vdev_expand(vd, txg); 5852 } 5853 } 5854 5855 spa_config_exit(spa, SCL_ALL, FTAG); 5856 5857 if (error != 0) { 5858 spa_unload(spa); 5859 spa_deactivate(spa); 5860 spa_remove(spa); 5861 mutex_exit(&spa_namespace_lock); 5862 return (error); 5863 } 5864 5865 /* 5866 * Get the list of spares, if specified. 
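* (The same pattern is repeated for the level 2 cache devices just below;
* sav_sync is set so the new list is written out on the next sync.)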
5867 */ 5868 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 5869 &spares, &nspares) == 0) { 5870 spa->spa_spares.sav_config = fnvlist_alloc(); 5871 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 5872 ZPOOL_CONFIG_SPARES, spares, nspares); 5873 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5874 spa_load_spares(spa); 5875 spa_config_exit(spa, SCL_ALL, FTAG); 5876 spa->spa_spares.sav_sync = B_TRUE; 5877 } 5878 5879 /* 5880 * Get the list of level 2 cache devices, if specified. 5881 */ 5882 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 5883 &l2cache, &nl2cache) == 0) { 5884 spa->spa_l2cache.sav_config = fnvlist_alloc(); 5885 fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 5886 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache); 5887 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5888 spa_load_l2cache(spa); 5889 spa_config_exit(spa, SCL_ALL, FTAG); 5890 spa->spa_l2cache.sav_sync = B_TRUE; 5891 } 5892 5893 spa->spa_is_initializing = B_TRUE; 5894 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, dcp, txg); 5895 spa->spa_is_initializing = B_FALSE; 5896 5897 /* 5898 * Create DDTs (dedup tables). 5899 */ 5900 ddt_create(spa); 5901 5902 spa_update_dspace(spa); 5903 5904 tx = dmu_tx_create_assigned(dp, txg); 5905 5906 /* 5907 * Create the pool's history object. 5908 */ 5909 if (version >= SPA_VERSION_ZPOOL_HISTORY && !spa->spa_history) 5910 spa_history_create_obj(spa, tx); 5911 5912 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE); 5913 spa_history_log_version(spa, "create", tx); 5914 5915 /* 5916 * Create the pool config object. 5917 */ 5918 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 5919 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 5920 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 5921 5922 if (zap_add(spa->spa_meta_objset, 5923 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 5924 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 5925 cmn_err(CE_PANIC, "failed to add pool config"); 5926 } 5927 5928 if (zap_add(spa->spa_meta_objset, 5929 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 5930 sizeof (uint64_t), 1, &version, tx) != 0) { 5931 cmn_err(CE_PANIC, "failed to add pool version"); 5932 } 5933 5934 /* Newly created pools with the right version are always deflated. */ 5935 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 5936 spa->spa_deflate = TRUE; 5937 if (zap_add(spa->spa_meta_objset, 5938 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 5939 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 5940 cmn_err(CE_PANIC, "failed to add deflate"); 5941 } 5942 } 5943 5944 /* 5945 * Create the deferred-free bpobj. Turn off compression 5946 * because sync-to-convergence takes longer if the blocksize 5947 * keeps changing. 5948 */ 5949 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 5950 dmu_object_set_compress(spa->spa_meta_objset, obj, 5951 ZIO_COMPRESS_OFF, tx); 5952 if (zap_add(spa->spa_meta_objset, 5953 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 5954 sizeof (uint64_t), 1, &obj, tx) != 0) { 5955 cmn_err(CE_PANIC, "failed to add bpobj"); 5956 } 5957 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 5958 spa->spa_meta_objset, obj)); 5959 5960 /* 5961 * Generate some random noise for salted checksums to operate on. 5962 */ 5963 (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, 5964 sizeof (spa->spa_cksum_salt.zcs_bytes)); 5965 5966 /* 5967 * Set pool properties. 
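* Start from the zpool property defaults; any properties supplied by the
* caller are applied just below via spa_configfile_set() and
* spa_sync_props().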
5968 */ 5969 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 5970 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 5971 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 5972 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 5973 spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST); 5974 spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM); 5975 5976 if (props != NULL) { 5977 spa_configfile_set(spa, props, B_FALSE); 5978 spa_sync_props(props, tx); 5979 } 5980 5981 for (int i = 0; i < ndraid; i++) 5982 spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); 5983 5984 dmu_tx_commit(tx); 5985 5986 spa->spa_sync_on = B_TRUE; 5987 txg_sync_start(dp); 5988 mmp_thread_start(spa); 5989 txg_wait_synced(dp, txg); 5990 5991 spa_spawn_aux_threads(spa); 5992 5993 spa_write_cachefile(spa, B_FALSE, B_TRUE); 5994 5995 /* 5996 * Don't count references from objsets that are already closed 5997 * and are making their way through the eviction process. 5998 */ 5999 spa_evicting_os_wait(spa); 6000 spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); 6001 spa->spa_load_state = SPA_LOAD_NONE; 6002 6003 mutex_exit(&spa_namespace_lock); 6004 6005 return (0); 6006 } 6007 6008 /* 6009 * Import a non-root pool into the system. 6010 */ 6011 int 6012 spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 6013 { 6014 spa_t *spa; 6015 char *altroot = NULL; 6016 spa_load_state_t state = SPA_LOAD_IMPORT; 6017 zpool_load_policy_t policy; 6018 spa_mode_t mode = spa_mode_global; 6019 uint64_t readonly = B_FALSE; 6020 int error; 6021 nvlist_t *nvroot; 6022 nvlist_t **spares, **l2cache; 6023 uint_t nspares, nl2cache; 6024 6025 /* 6026 * If a pool with this name exists, return failure. 6027 */ 6028 mutex_enter(&spa_namespace_lock); 6029 if (spa_lookup(pool) != NULL) { 6030 mutex_exit(&spa_namespace_lock); 6031 return (SET_ERROR(EEXIST)); 6032 } 6033 6034 /* 6035 * Create and initialize the spa structure. 6036 */ 6037 (void) nvlist_lookup_string(props, 6038 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 6039 (void) nvlist_lookup_uint64(props, 6040 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 6041 if (readonly) 6042 mode = SPA_MODE_READ; 6043 spa = spa_add(pool, config, altroot); 6044 spa->spa_import_flags = flags; 6045 6046 /* 6047 * Verbatim import - Take a pool and insert it into the namespace 6048 * as if it had been loaded at boot. 6049 */ 6050 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 6051 if (props != NULL) 6052 spa_configfile_set(spa, props, B_FALSE); 6053 6054 spa_write_cachefile(spa, B_FALSE, B_TRUE); 6055 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); 6056 zfs_dbgmsg("spa_import: verbatim import of %s", pool); 6057 mutex_exit(&spa_namespace_lock); 6058 return (0); 6059 } 6060 6061 spa_activate(spa, mode); 6062 6063 /* 6064 * Don't start async tasks until we know everything is healthy. 
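* (They are resumed further below via spa_async_resume() once the pool has
* loaded and any properties have been applied.)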
6065 */ 6066 spa_async_suspend(spa); 6067 6068 zpool_get_load_policy(config, &policy); 6069 if (policy.zlp_rewind & ZPOOL_DO_REWIND) 6070 state = SPA_LOAD_RECOVER; 6071 6072 spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT; 6073 6074 if (state != SPA_LOAD_RECOVER) { 6075 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 6076 zfs_dbgmsg("spa_import: importing %s", pool); 6077 } else { 6078 zfs_dbgmsg("spa_import: importing %s, max_txg=%lld " 6079 "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg); 6080 } 6081 error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind); 6082 6083 /* 6084 * Propagate anything learned while loading the pool and pass it 6085 * back to caller (i.e. rewind info, missing devices, etc). 6086 */ 6087 fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info); 6088 6089 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6090 /* 6091 * Toss any existing sparelist, as it doesn't have any validity 6092 * anymore, and conflicts with spa_has_spare(). 6093 */ 6094 if (spa->spa_spares.sav_config) { 6095 nvlist_free(spa->spa_spares.sav_config); 6096 spa->spa_spares.sav_config = NULL; 6097 spa_load_spares(spa); 6098 } 6099 if (spa->spa_l2cache.sav_config) { 6100 nvlist_free(spa->spa_l2cache.sav_config); 6101 spa->spa_l2cache.sav_config = NULL; 6102 spa_load_l2cache(spa); 6103 } 6104 6105 nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); 6106 spa_config_exit(spa, SCL_ALL, FTAG); 6107 6108 if (props != NULL) 6109 spa_configfile_set(spa, props, B_FALSE); 6110 6111 if (error != 0 || (props && spa_writeable(spa) && 6112 (error = spa_prop_set(spa, props)))) { 6113 spa_unload(spa); 6114 spa_deactivate(spa); 6115 spa_remove(spa); 6116 mutex_exit(&spa_namespace_lock); 6117 return (error); 6118 } 6119 6120 spa_async_resume(spa); 6121 6122 /* 6123 * Override any spares and level 2 cache devices as specified by 6124 * the user, as these may have correct device names/devids, etc. 6125 */ 6126 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 6127 &spares, &nspares) == 0) { 6128 if (spa->spa_spares.sav_config) 6129 fnvlist_remove(spa->spa_spares.sav_config, 6130 ZPOOL_CONFIG_SPARES); 6131 else 6132 spa->spa_spares.sav_config = fnvlist_alloc(); 6133 fnvlist_add_nvlist_array(spa->spa_spares.sav_config, 6134 ZPOOL_CONFIG_SPARES, spares, nspares); 6135 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6136 spa_load_spares(spa); 6137 spa_config_exit(spa, SCL_ALL, FTAG); 6138 spa->spa_spares.sav_sync = B_TRUE; 6139 } 6140 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 6141 &l2cache, &nl2cache) == 0) { 6142 if (spa->spa_l2cache.sav_config) 6143 fnvlist_remove(spa->spa_l2cache.sav_config, 6144 ZPOOL_CONFIG_L2CACHE); 6145 else 6146 spa->spa_l2cache.sav_config = fnvlist_alloc(); 6147 fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 6148 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache); 6149 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6150 spa_load_l2cache(spa); 6151 spa_config_exit(spa, SCL_ALL, FTAG); 6152 spa->spa_l2cache.sav_sync = B_TRUE; 6153 } 6154 6155 /* 6156 * Check for any removed devices. 6157 */ 6158 if (spa->spa_autoreplace) { 6159 spa_aux_check_removed(&spa->spa_spares); 6160 spa_aux_check_removed(&spa->spa_l2cache); 6161 } 6162 6163 if (spa_writeable(spa)) { 6164 /* 6165 * Update the config cache to include the newly-imported pool. 6166 */ 6167 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 6168 } 6169 6170 /* 6171 * It's possible that the pool was expanded while it was exported. 
6172 * We kick off an async task to handle this for us. 6173 */ 6174 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 6175 6176 spa_history_log_version(spa, "import", NULL); 6177 6178 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); 6179 6180 mutex_exit(&spa_namespace_lock); 6181 6182 zvol_create_minors_recursive(pool); 6183 6184 return (0); 6185 } 6186 6187 nvlist_t * 6188 spa_tryimport(nvlist_t *tryconfig) 6189 { 6190 nvlist_t *config = NULL; 6191 char *poolname, *cachefile; 6192 spa_t *spa; 6193 uint64_t state; 6194 int error; 6195 zpool_load_policy_t policy; 6196 6197 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 6198 return (NULL); 6199 6200 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 6201 return (NULL); 6202 6203 /* 6204 * Create and initialize the spa structure. 6205 */ 6206 mutex_enter(&spa_namespace_lock); 6207 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 6208 spa_activate(spa, SPA_MODE_READ); 6209 6210 /* 6211 * Rewind pool if a max txg was provided. 6212 */ 6213 zpool_get_load_policy(spa->spa_config, &policy); 6214 if (policy.zlp_txg != UINT64_MAX) { 6215 spa->spa_load_max_txg = policy.zlp_txg; 6216 spa->spa_extreme_rewind = B_TRUE; 6217 zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld", 6218 poolname, (longlong_t)policy.zlp_txg); 6219 } else { 6220 zfs_dbgmsg("spa_tryimport: importing %s", poolname); 6221 } 6222 6223 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile) 6224 == 0) { 6225 zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile); 6226 spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; 6227 } else { 6228 spa->spa_config_source = SPA_CONFIG_SRC_SCAN; 6229 } 6230 6231 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING); 6232 6233 /* 6234 * If 'tryconfig' was at least parsable, return the current config. 6235 */ 6236 if (spa->spa_root_vdev != NULL) { 6237 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 6238 fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, poolname); 6239 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state); 6240 fnvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 6241 spa->spa_uberblock.ub_timestamp); 6242 fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 6243 spa->spa_load_info); 6244 fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, 6245 spa->spa_errata); 6246 6247 /* 6248 * If the bootfs property exists on this pool then we 6249 * copy it out so that external consumers can tell which 6250 * pools are bootable. 6251 */ 6252 if ((!error || error == EEXIST) && spa->spa_bootfs) { 6253 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 6254 6255 /* 6256 * We have to play games with the name since the 6257 * pool was opened as TRYIMPORT_NAME. 6258 */ 6259 if (dsl_dsobj_to_dsname(spa_name(spa), 6260 spa->spa_bootfs, tmpname) == 0) { 6261 char *cp; 6262 char *dsname; 6263 6264 dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 6265 6266 cp = strchr(tmpname, '/'); 6267 if (cp == NULL) { 6268 (void) strlcpy(dsname, tmpname, 6269 MAXPATHLEN); 6270 } else { 6271 (void) snprintf(dsname, MAXPATHLEN, 6272 "%s/%s", poolname, ++cp); 6273 } 6274 fnvlist_add_string(config, ZPOOL_CONFIG_BOOTFS, 6275 dsname); 6276 kmem_free(dsname, MAXPATHLEN); 6277 } 6278 kmem_free(tmpname, MAXPATHLEN); 6279 } 6280 6281 /* 6282 * Add the list of hot spares and level 2 cache devices. 
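* They are gathered while holding SCL_CONFIG as reader, mirroring what
* spa_get_stats() does above.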
6283 */ 6284 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 6285 spa_add_spares(spa, config); 6286 spa_add_l2cache(spa, config); 6287 spa_config_exit(spa, SCL_CONFIG, FTAG); 6288 } 6289 6290 spa_unload(spa); 6291 spa_deactivate(spa); 6292 spa_remove(spa); 6293 mutex_exit(&spa_namespace_lock); 6294 6295 return (config); 6296 } 6297 6298 /* 6299 * Pool export/destroy 6300 * 6301 * The act of destroying or exporting a pool is very simple. We make sure there 6302 * is no more pending I/O and any references to the pool are gone. Then, we 6303 * update the pool state and sync all the labels to disk, removing the 6304 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 6305 * we don't sync the labels or remove the configuration cache. 6306 */ 6307 static int 6308 spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig, 6309 boolean_t force, boolean_t hardforce) 6310 { 6311 int error; 6312 spa_t *spa; 6313 6314 if (oldconfig) 6315 *oldconfig = NULL; 6316 6317 if (!(spa_mode_global & SPA_MODE_WRITE)) 6318 return (SET_ERROR(EROFS)); 6319 6320 mutex_enter(&spa_namespace_lock); 6321 if ((spa = spa_lookup(pool)) == NULL) { 6322 mutex_exit(&spa_namespace_lock); 6323 return (SET_ERROR(ENOENT)); 6324 } 6325 6326 if (spa->spa_is_exporting) { 6327 /* the pool is being exported by another thread */ 6328 mutex_exit(&spa_namespace_lock); 6329 return (SET_ERROR(ZFS_ERR_EXPORT_IN_PROGRESS)); 6330 } 6331 spa->spa_is_exporting = B_TRUE; 6332 6333 /* 6334 * Put a hold on the pool, drop the namespace lock, stop async tasks, 6335 * reacquire the namespace lock, and see if we can export. 6336 */ 6337 spa_open_ref(spa, FTAG); 6338 mutex_exit(&spa_namespace_lock); 6339 spa_async_suspend(spa); 6340 if (spa->spa_zvol_taskq) { 6341 zvol_remove_minors(spa, spa_name(spa), B_TRUE); 6342 taskq_wait(spa->spa_zvol_taskq); 6343 } 6344 mutex_enter(&spa_namespace_lock); 6345 spa_close(spa, FTAG); 6346 6347 if (spa->spa_state == POOL_STATE_UNINITIALIZED) 6348 goto export_spa; 6349 /* 6350 * The pool will be in core if it's openable, in which case we can 6351 * modify its state. Objsets may be open only because they're dirty, 6352 * so we have to force it to sync before checking spa_refcnt. 6353 */ 6354 if (spa->spa_sync_on) { 6355 txg_wait_synced(spa->spa_dsl_pool, 0); 6356 spa_evicting_os_wait(spa); 6357 } 6358 6359 /* 6360 * A pool cannot be exported or destroyed if there are active 6361 * references. If we are resetting a pool, allow references by 6362 * fault injection handlers. 6363 */ 6364 if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0)) { 6365 error = SET_ERROR(EBUSY); 6366 goto fail; 6367 } 6368 6369 if (spa->spa_sync_on) { 6370 /* 6371 * A pool cannot be exported if it has an active shared spare. 6372 * This is to prevent other pools stealing the active spare 6373 * from an exported pool. At user's own will, such pool can 6374 * be forcedly exported. 6375 */ 6376 if (!force && new_state == POOL_STATE_EXPORTED && 6377 spa_has_active_shared_spare(spa)) { 6378 error = SET_ERROR(EXDEV); 6379 goto fail; 6380 } 6381 6382 /* 6383 * We're about to export or destroy this pool. Make sure 6384 * we stop all initialization and trim activity here before 6385 * we set the spa_final_txg. This will ensure that all 6386 * dirty data resulting from the initialization is 6387 * committed to disk before we unload the pool. 
6388 */ 6389 if (spa->spa_root_vdev != NULL) { 6390 vdev_t *rvd = spa->spa_root_vdev; 6391 vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE); 6392 vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE); 6393 vdev_autotrim_stop_all(spa); 6394 vdev_rebuild_stop_all(spa); 6395 } 6396 6397 /* 6398 * We want this to be reflected on every label, 6399 * so mark them all dirty. spa_unload() will do the 6400 * final sync that pushes these changes out. 6401 */ 6402 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 6403 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 6404 spa->spa_state = new_state; 6405 spa->spa_final_txg = spa_last_synced_txg(spa) + 6406 TXG_DEFER_SIZE + 1; 6407 vdev_config_dirty(spa->spa_root_vdev); 6408 spa_config_exit(spa, SCL_ALL, FTAG); 6409 } 6410 } 6411 6412 export_spa: 6413 if (new_state == POOL_STATE_DESTROYED) 6414 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY); 6415 else if (new_state == POOL_STATE_EXPORTED) 6416 spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_EXPORT); 6417 6418 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 6419 spa_unload(spa); 6420 spa_deactivate(spa); 6421 } 6422 6423 if (oldconfig && spa->spa_config) 6424 *oldconfig = fnvlist_dup(spa->spa_config); 6425 6426 if (new_state != POOL_STATE_UNINITIALIZED) { 6427 if (!hardforce) 6428 spa_write_cachefile(spa, B_TRUE, B_TRUE); 6429 spa_remove(spa); 6430 } else { 6431 /* 6432 * If spa_remove() is not called for this spa_t and 6433 * there is any possibility that it can be reused, 6434 * we make sure to reset the exporting flag. 6435 */ 6436 spa->spa_is_exporting = B_FALSE; 6437 } 6438 6439 mutex_exit(&spa_namespace_lock); 6440 return (0); 6441 6442 fail: 6443 spa->spa_is_exporting = B_FALSE; 6444 spa_async_resume(spa); 6445 mutex_exit(&spa_namespace_lock); 6446 return (error); 6447 } 6448 6449 /* 6450 * Destroy a storage pool. 6451 */ 6452 int 6453 spa_destroy(const char *pool) 6454 { 6455 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 6456 B_FALSE, B_FALSE)); 6457 } 6458 6459 /* 6460 * Export a storage pool. 6461 */ 6462 int 6463 spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force, 6464 boolean_t hardforce) 6465 { 6466 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 6467 force, hardforce)); 6468 } 6469 6470 /* 6471 * Similar to spa_export(), this unloads the spa_t without actually removing it 6472 * from the namespace in any way. 6473 */ 6474 int 6475 spa_reset(const char *pool) 6476 { 6477 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 6478 B_FALSE, B_FALSE)); 6479 } 6480 6481 /* 6482 * ========================================================================== 6483 * Device manipulation 6484 * ========================================================================== 6485 */ 6486 6487 /* 6488 * This is called as a synctask to increment the draid feature flag 6489 */ 6490 static void 6491 spa_draid_feature_incr(void *arg, dmu_tx_t *tx) 6492 { 6493 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 6494 int draid = (int)(uintptr_t)arg; 6495 6496 for (int c = 0; c < draid; c++) 6497 spa_feature_incr(spa, SPA_FEATURE_DRAID, tx); 6498 } 6499 6500 /* 6501 * Add a device to a storage pool. 
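* In practice this is invoked on behalf of 'zpool add'; nvroot describes the
* new top-level vdevs, spares and/or cache devices to graft onto the
* existing tree.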
6502 */ 6503 int 6504 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 6505 { 6506 uint64_t txg, ndraid = 0; 6507 int error; 6508 vdev_t *rvd = spa->spa_root_vdev; 6509 vdev_t *vd, *tvd; 6510 nvlist_t **spares, **l2cache; 6511 uint_t nspares, nl2cache; 6512 6513 ASSERT(spa_writeable(spa)); 6514 6515 txg = spa_vdev_enter(spa); 6516 6517 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 6518 VDEV_ALLOC_ADD)) != 0) 6519 return (spa_vdev_exit(spa, NULL, txg, error)); 6520 6521 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 6522 6523 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 6524 &nspares) != 0) 6525 nspares = 0; 6526 6527 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 6528 &nl2cache) != 0) 6529 nl2cache = 0; 6530 6531 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 6532 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 6533 6534 if (vd->vdev_children != 0 && 6535 (error = vdev_create(vd, txg, B_FALSE)) != 0) { 6536 return (spa_vdev_exit(spa, vd, txg, error)); 6537 } 6538 6539 /* 6540 * The virtual dRAID spares must be added after vdev tree is created 6541 * and the vdev guids are generated. The guid of their associated 6542 * dRAID is stored in the config and used when opening the spare. 6543 */ 6544 if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid, 6545 rvd->vdev_children)) == 0) { 6546 if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot, 6547 ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) 6548 nspares = 0; 6549 } else { 6550 return (spa_vdev_exit(spa, vd, txg, error)); 6551 } 6552 6553 /* 6554 * We must validate the spares and l2cache devices after checking the 6555 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 6556 */ 6557 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 6558 return (spa_vdev_exit(spa, vd, txg, error)); 6559 6560 /* 6561 * If we are in the middle of a device removal, we can only add 6562 * devices which match the existing devices in the pool. 6563 * If we are in the middle of a removal, or have some indirect 6564 * vdevs, we can not add raidz or dRAID top levels. 
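* (Likely because an eventual device evacuation must remap the removed
* vdev's blocks onto the remaining top-level vdevs, which is not supported
* for raidz or dRAID.)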
6565 */ 6566 if (spa->spa_vdev_removal != NULL || 6567 spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { 6568 for (int c = 0; c < vd->vdev_children; c++) { 6569 tvd = vd->vdev_child[c]; 6570 if (spa->spa_vdev_removal != NULL && 6571 tvd->vdev_ashift != spa->spa_max_ashift) { 6572 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 6573 } 6574 /* Fail if top level vdev is raidz or a dRAID */ 6575 if (vdev_get_nparity(tvd) != 0) 6576 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 6577 6578 /* 6579 * Need the top level mirror to be 6580 * a mirror of leaf vdevs only 6581 */ 6582 if (tvd->vdev_ops == &vdev_mirror_ops) { 6583 for (uint64_t cid = 0; 6584 cid < tvd->vdev_children; cid++) { 6585 vdev_t *cvd = tvd->vdev_child[cid]; 6586 if (!cvd->vdev_ops->vdev_op_leaf) { 6587 return (spa_vdev_exit(spa, vd, 6588 txg, EINVAL)); 6589 } 6590 } 6591 } 6592 } 6593 } 6594 6595 for (int c = 0; c < vd->vdev_children; c++) { 6596 tvd = vd->vdev_child[c]; 6597 vdev_remove_child(vd, tvd); 6598 tvd->vdev_id = rvd->vdev_children; 6599 vdev_add_child(rvd, tvd); 6600 vdev_config_dirty(tvd); 6601 } 6602 6603 if (nspares != 0) { 6604 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 6605 ZPOOL_CONFIG_SPARES); 6606 spa_load_spares(spa); 6607 spa->spa_spares.sav_sync = B_TRUE; 6608 } 6609 6610 if (nl2cache != 0) { 6611 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 6612 ZPOOL_CONFIG_L2CACHE); 6613 spa_load_l2cache(spa); 6614 spa->spa_l2cache.sav_sync = B_TRUE; 6615 } 6616 6617 /* 6618 * We can't increment a feature while holding spa_vdev so we 6619 * have to do it in a synctask. 6620 */ 6621 if (ndraid != 0) { 6622 dmu_tx_t *tx; 6623 6624 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 6625 dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr, 6626 (void *)(uintptr_t)ndraid, tx); 6627 dmu_tx_commit(tx); 6628 } 6629 6630 /* 6631 * We have to be careful when adding new vdevs to an existing pool. 6632 * If other threads start allocating from these vdevs before we 6633 * sync the config cache, and we lose power, then upon reboot we may 6634 * fail to open the pool because there are DVAs that the config cache 6635 * can't translate. Therefore, we first add the vdevs without 6636 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 6637 * and then let spa_config_update() initialize the new metaslabs. 6638 * 6639 * spa_load() checks for added-but-not-initialized vdevs, so that 6640 * if we lose power at any point in this sequence, the remaining 6641 * steps will be completed the next time we load the pool. 6642 */ 6643 (void) spa_vdev_exit(spa, vd, txg, 0); 6644 6645 mutex_enter(&spa_namespace_lock); 6646 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 6647 spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD); 6648 mutex_exit(&spa_namespace_lock); 6649 6650 return (0); 6651 } 6652 6653 /* 6654 * Attach a device to a mirror. The arguments are the path to any device 6655 * in the mirror, and the nvroot for the new device. If the path specifies 6656 * a device that is not mirrored, we automatically insert the mirror vdev. 
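* In practice both 'zpool attach' and 'zpool replace' funnel into this
* function, differing only in the 'replacing' flag.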
6657 * 6658 * If 'replacing' is specified, the new device is intended to replace the 6659 * existing device; in this case the two devices are made into their own 6660 * mirror using the 'replacing' vdev, which is functionally identical to 6661 * the mirror vdev (it actually reuses all the same ops) but has a few 6662 * extra rules: you can't attach to it after it's been created, and upon 6663 * completion of resilvering, the first disk (the one being replaced) 6664 * is automatically detached. 6665 * 6666 * If 'rebuild' is specified, then sequential reconstruction (a.ka. rebuild) 6667 * should be performed instead of traditional healing reconstruction. From 6668 * an administrators perspective these are both resilver operations. 6669 */ 6670 int 6671 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, 6672 int rebuild) 6673 { 6674 uint64_t txg, dtl_max_txg; 6675 vdev_t *rvd = spa->spa_root_vdev; 6676 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 6677 vdev_ops_t *pvops; 6678 char *oldvdpath, *newvdpath; 6679 int newvd_isspare; 6680 int error; 6681 6682 ASSERT(spa_writeable(spa)); 6683 6684 txg = spa_vdev_enter(spa); 6685 6686 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 6687 6688 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 6689 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 6690 error = (spa_has_checkpoint(spa)) ? 6691 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 6692 return (spa_vdev_exit(spa, NULL, txg, error)); 6693 } 6694 6695 if (rebuild) { 6696 if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) 6697 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 6698 6699 if (dsl_scan_resilvering(spa_get_dsl(spa))) 6700 return (spa_vdev_exit(spa, NULL, txg, 6701 ZFS_ERR_RESILVER_IN_PROGRESS)); 6702 } else { 6703 if (vdev_rebuild_active(rvd)) 6704 return (spa_vdev_exit(spa, NULL, txg, 6705 ZFS_ERR_REBUILD_IN_PROGRESS)); 6706 } 6707 6708 if (spa->spa_vdev_removal != NULL) 6709 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 6710 6711 if (oldvd == NULL) 6712 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 6713 6714 if (!oldvd->vdev_ops->vdev_op_leaf) 6715 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 6716 6717 pvd = oldvd->vdev_parent; 6718 6719 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 6720 VDEV_ALLOC_ATTACH)) != 0) 6721 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 6722 6723 if (newrootvd->vdev_children != 1) 6724 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 6725 6726 newvd = newrootvd->vdev_child[0]; 6727 6728 if (!newvd->vdev_ops->vdev_op_leaf) 6729 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 6730 6731 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 6732 return (spa_vdev_exit(spa, newrootvd, txg, error)); 6733 6734 /* 6735 * Spares can't replace logs 6736 */ 6737 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 6738 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6739 6740 /* 6741 * A dRAID spare can only replace a child of its parent dRAID vdev. 6742 */ 6743 if (newvd->vdev_ops == &vdev_draid_spare_ops && 6744 oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) { 6745 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6746 } 6747 6748 if (rebuild) { 6749 /* 6750 * For rebuilds, the top vdev must support reconstruction 6751 * using only space maps. This means the only allowable 6752 * vdevs types are the root vdev, a mirror, or dRAID. 
6753 */ 6754 tvd = pvd; 6755 if (pvd->vdev_top != NULL) 6756 tvd = pvd->vdev_top; 6757 6758 if (tvd->vdev_ops != &vdev_mirror_ops && 6759 tvd->vdev_ops != &vdev_root_ops && 6760 tvd->vdev_ops != &vdev_draid_ops) { 6761 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6762 } 6763 } 6764 6765 if (!replacing) { 6766 /* 6767 * For attach, the only allowable parent is a mirror or the root 6768 * vdev. 6769 */ 6770 if (pvd->vdev_ops != &vdev_mirror_ops && 6771 pvd->vdev_ops != &vdev_root_ops) 6772 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6773 6774 pvops = &vdev_mirror_ops; 6775 } else { 6776 /* 6777 * Active hot spares can only be replaced by inactive hot 6778 * spares. 6779 */ 6780 if (pvd->vdev_ops == &vdev_spare_ops && 6781 oldvd->vdev_isspare && 6782 !spa_has_spare(spa, newvd->vdev_guid)) 6783 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6784 6785 /* 6786 * If the source is a hot spare, and the parent isn't already a 6787 * spare, then we want to create a new hot spare. Otherwise, we 6788 * want to create a replacing vdev. The user is not allowed to 6789 * attach to a spared vdev child unless the 'isspare' state is 6790 * the same (spare replaces spare, non-spare replaces 6791 * non-spare). 6792 */ 6793 if (pvd->vdev_ops == &vdev_replacing_ops && 6794 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 6795 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6796 } else if (pvd->vdev_ops == &vdev_spare_ops && 6797 newvd->vdev_isspare != oldvd->vdev_isspare) { 6798 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6799 } 6800 6801 if (newvd->vdev_isspare) 6802 pvops = &vdev_spare_ops; 6803 else 6804 pvops = &vdev_replacing_ops; 6805 } 6806 6807 /* 6808 * Make sure the new device is big enough. 6809 */ 6810 if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 6811 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 6812 6813 /* 6814 * The new device cannot have a higher alignment requirement 6815 * than the top-level vdev. 6816 */ 6817 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 6818 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 6819 6820 /* 6821 * If this is an in-place replacement, update oldvd's path and devid 6822 * to make it distinguishable from newvd, and unopenable from now on. 6823 */ 6824 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 6825 spa_strfree(oldvd->vdev_path); 6826 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 6827 KM_SLEEP); 6828 (void) snprintf(oldvd->vdev_path, strlen(newvd->vdev_path) + 5, 6829 "%s/%s", newvd->vdev_path, "old"); 6830 if (oldvd->vdev_devid != NULL) { 6831 spa_strfree(oldvd->vdev_devid); 6832 oldvd->vdev_devid = NULL; 6833 } 6834 } 6835 6836 /* 6837 * If the parent is not a mirror, or if we're replacing, insert the new 6838 * mirror/replacing/spare vdev above oldvd. 6839 */ 6840 if (pvd->vdev_ops != pvops) 6841 pvd = vdev_add_parent(oldvd, pvops); 6842 6843 ASSERT(pvd->vdev_top->vdev_parent == rvd); 6844 ASSERT(pvd->vdev_ops == pvops); 6845 ASSERT(oldvd->vdev_parent == pvd); 6846 6847 /* 6848 * Extract the new device from its root and add it to pvd. 6849 */ 6850 vdev_remove_child(newrootvd, newvd); 6851 newvd->vdev_id = pvd->vdev_children; 6852 newvd->vdev_crtxg = oldvd->vdev_crtxg; 6853 vdev_add_child(pvd, newvd); 6854 6855 /* 6856 * Reevaluate the parent vdev state. 
6857 */ 6858 vdev_propagate_state(pvd); 6859 6860 tvd = newvd->vdev_top; 6861 ASSERT(pvd->vdev_top == tvd); 6862 ASSERT(tvd->vdev_parent == rvd); 6863 6864 vdev_config_dirty(tvd); 6865 6866 /* 6867 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 6868 * for any dmu_sync-ed blocks. It will propagate upward when 6869 * spa_vdev_exit() calls vdev_dtl_reassess(). 6870 */ 6871 dtl_max_txg = txg + TXG_CONCURRENT_STATES; 6872 6873 vdev_dtl_dirty(newvd, DTL_MISSING, 6874 TXG_INITIAL, dtl_max_txg - TXG_INITIAL); 6875 6876 if (newvd->vdev_isspare) { 6877 spa_spare_activate(newvd); 6878 spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); 6879 } 6880 6881 oldvdpath = spa_strdup(oldvd->vdev_path); 6882 newvdpath = spa_strdup(newvd->vdev_path); 6883 newvd_isspare = newvd->vdev_isspare; 6884 6885 /* 6886 * Mark newvd's DTL dirty in this txg. 6887 */ 6888 vdev_dirty(tvd, VDD_DTL, newvd, txg); 6889 6890 /* 6891 * Schedule the resilver or rebuild to restart in the future. We do 6892 * this to ensure that dmu_sync-ed blocks have been stitched into the 6893 * respective datasets. 6894 */ 6895 if (rebuild) { 6896 newvd->vdev_rebuild_txg = txg; 6897 6898 vdev_rebuild(tvd); 6899 } else { 6900 newvd->vdev_resilver_txg = txg; 6901 6902 if (dsl_scan_resilvering(spa_get_dsl(spa)) && 6903 spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) { 6904 vdev_defer_resilver(newvd); 6905 } else { 6906 dsl_scan_restart_resilver(spa->spa_dsl_pool, 6907 dtl_max_txg); 6908 } 6909 } 6910 6911 if (spa->spa_bootfs) 6912 spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH); 6913 6914 spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH); 6915 6916 /* 6917 * Commit the config 6918 */ 6919 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 6920 6921 spa_history_log_internal(spa, "vdev attach", NULL, 6922 "%s vdev=%s %s vdev=%s", 6923 replacing && newvd_isspare ? "spare in" : 6924 replacing ? "replace" : "attach", newvdpath, 6925 replacing ? "for" : "to", oldvdpath); 6926 6927 spa_strfree(oldvdpath); 6928 spa_strfree(newvdpath); 6929 6930 return (0); 6931 } 6932 6933 /* 6934 * Detach a device from a mirror or replacing vdev. 6935 * 6936 * If 'replace_done' is specified, only detach if the parent 6937 * is a replacing vdev. 6938 */ 6939 int 6940 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 6941 { 6942 uint64_t txg; 6943 int error; 6944 vdev_t *rvd __maybe_unused = spa->spa_root_vdev; 6945 vdev_t *vd, *pvd, *cvd, *tvd; 6946 boolean_t unspare = B_FALSE; 6947 uint64_t unspare_guid = 0; 6948 char *vdpath; 6949 6950 ASSERT(spa_writeable(spa)); 6951 6952 txg = spa_vdev_detach_enter(spa, guid); 6953 6954 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 6955 6956 /* 6957 * Besides being called directly from the userland through the 6958 * ioctl interface, spa_vdev_detach() can be potentially called 6959 * at the end of spa_vdev_resilver_done(). 6960 * 6961 * In the regular case, when we have a checkpoint this shouldn't 6962 * happen as we never empty the DTLs of a vdev during the scrub 6963 * [see comment in dsl_scan_done()]. Thus spa_vdev_resilvering_done() 6964 * should never get here when we have a checkpoint. 6965 * 6966 * That said, even in a case when we checkpoint the pool exactly 6967 * as spa_vdev_resilver_done() calls this function everything 6968 * should be fine as the resilver will return right away. 6969 */ 6970 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 6971 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 6972 error = (spa_has_checkpoint(spa)) ? 
6973 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 6974 return (spa_vdev_exit(spa, NULL, txg, error)); 6975 } 6976 6977 if (vd == NULL) 6978 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 6979 6980 if (!vd->vdev_ops->vdev_op_leaf) 6981 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 6982 6983 pvd = vd->vdev_parent; 6984 6985 /* 6986 * If the parent/child relationship is not as expected, don't do it. 6987 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 6988 * vdev that's replacing B with C. The user's intent in replacing 6989 * is to go from M(A,B) to M(A,C). If the user decides to cancel 6990 * the replace by detaching C, the expected behavior is to end up 6991 * M(A,B). But suppose that right after deciding to detach C, 6992 * the replacement of B completes. We would have M(A,C), and then 6993 * ask to detach C, which would leave us with just A -- not what 6994 * the user wanted. To prevent this, we make sure that the 6995 * parent/child relationship hasn't changed -- in this example, 6996 * that C's parent is still the replacing vdev R. 6997 */ 6998 if (pvd->vdev_guid != pguid && pguid != 0) 6999 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 7000 7001 /* 7002 * Only 'replacing' or 'spare' vdevs can be replaced. 7003 */ 7004 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 7005 pvd->vdev_ops != &vdev_spare_ops) 7006 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7007 7008 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 7009 spa_version(spa) >= SPA_VERSION_SPARES); 7010 7011 /* 7012 * Only mirror, replacing, and spare vdevs support detach. 7013 */ 7014 if (pvd->vdev_ops != &vdev_replacing_ops && 7015 pvd->vdev_ops != &vdev_mirror_ops && 7016 pvd->vdev_ops != &vdev_spare_ops) 7017 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 7018 7019 /* 7020 * If this device has the only valid copy of some data, 7021 * we cannot safely detach it. 7022 */ 7023 if (vdev_dtl_required(vd)) 7024 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 7025 7026 ASSERT(pvd->vdev_children >= 2); 7027 7028 /* 7029 * If we are detaching the second disk from a replacing vdev, then 7030 * check to see if we changed the original vdev's path to have "/old" 7031 * at the end in spa_vdev_attach(). If so, undo that change now. 7032 */ 7033 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 7034 vd->vdev_path != NULL) { 7035 size_t len = strlen(vd->vdev_path); 7036 7037 for (int c = 0; c < pvd->vdev_children; c++) { 7038 cvd = pvd->vdev_child[c]; 7039 7040 if (cvd == vd || cvd->vdev_path == NULL) 7041 continue; 7042 7043 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 7044 strcmp(cvd->vdev_path + len, "/old") == 0) { 7045 spa_strfree(cvd->vdev_path); 7046 cvd->vdev_path = spa_strdup(vd->vdev_path); 7047 break; 7048 } 7049 } 7050 } 7051 7052 /* 7053 * If we are detaching the original disk from a normal spare, then it 7054 * implies that the spare should become a real disk, and be removed 7055 * from the active spare list for the pool. dRAID spares on the 7056 * other hand are coupled to the pool and thus should never be removed 7057 * from the spares list. 7058 */ 7059 if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) { 7060 vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1]; 7061 7062 if (last_cvd->vdev_isspare && 7063 last_cvd->vdev_ops != &vdev_draid_spare_ops) { 7064 unspare = B_TRUE; 7065 } 7066 } 7067 7068 /* 7069 * Erase the disk labels so the disk can be used for other things. 
7070 * This must be done after all other error cases are handled, 7071 * but before we disembowel vd (so we can still do I/O to it). 7072 * But if we can't do it, don't treat the error as fatal -- 7073 * it may be that the unwritability of the disk is the reason 7074 * it's being detached! 7075 */ 7076 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 7077 7078 /* 7079 * Remove vd from its parent and compact the parent's children. 7080 */ 7081 vdev_remove_child(pvd, vd); 7082 vdev_compact_children(pvd); 7083 7084 /* 7085 * Remember one of the remaining children so we can get tvd below. 7086 */ 7087 cvd = pvd->vdev_child[pvd->vdev_children - 1]; 7088 7089 /* 7090 * If we need to remove the remaining child from the list of hot spares, 7091 * do it now, marking the vdev as no longer a spare in the process. 7092 * We must do this before vdev_remove_parent(), because that can 7093 * change the GUID if it creates a new toplevel GUID. For a similar 7094 * reason, we must remove the spare now, in the same txg as the detach; 7095 * otherwise someone could attach a new sibling, change the GUID, and 7096 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 7097 */ 7098 if (unspare) { 7099 ASSERT(cvd->vdev_isspare); 7100 spa_spare_remove(cvd); 7101 unspare_guid = cvd->vdev_guid; 7102 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 7103 cvd->vdev_unspare = B_TRUE; 7104 } 7105 7106 /* 7107 * If the parent mirror/replacing vdev only has one child, 7108 * the parent is no longer needed. Remove it from the tree. 7109 */ 7110 if (pvd->vdev_children == 1) { 7111 if (pvd->vdev_ops == &vdev_spare_ops) 7112 cvd->vdev_unspare = B_FALSE; 7113 vdev_remove_parent(cvd); 7114 } 7115 7116 /* 7117 * We don't set tvd until now because the parent we just removed 7118 * may have been the previous top-level vdev. 7119 */ 7120 tvd = cvd->vdev_top; 7121 ASSERT(tvd->vdev_parent == rvd); 7122 7123 /* 7124 * Reevaluate the parent vdev state. 7125 */ 7126 vdev_propagate_state(cvd); 7127 7128 /* 7129 * If the 'autoexpand' property is set on the pool then automatically 7130 * try to expand the size of the pool. For example if the device we 7131 * just detached was smaller than the others, it may be possible to 7132 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 7133 * first so that we can obtain the updated sizes of the leaf vdevs. 7134 */ 7135 if (spa->spa_autoexpand) { 7136 vdev_reopen(tvd); 7137 vdev_expand(tvd, txg); 7138 } 7139 7140 vdev_config_dirty(tvd); 7141 7142 /* 7143 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 7144 * vd->vdev_detached is set and free vd's DTL object in syncing context. 7145 * But first make sure we're not on any *other* txg's DTL list, to 7146 * prevent vd from being accessed after it's freed. 7147 */ 7148 vdpath = spa_strdup(vd->vdev_path ? 
vd->vdev_path : "none"); 7149 for (int t = 0; t < TXG_SIZE; t++) 7150 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 7151 vd->vdev_detached = B_TRUE; 7152 vdev_dirty(tvd, VDD_DTL, vd, txg); 7153 7154 spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE); 7155 spa_notify_waiters(spa); 7156 7157 /* hang on to the spa before we release the lock */ 7158 spa_open_ref(spa, FTAG); 7159 7160 error = spa_vdev_exit(spa, vd, txg, 0); 7161 7162 spa_history_log_internal(spa, "detach", NULL, 7163 "vdev=%s", vdpath); 7164 spa_strfree(vdpath); 7165 7166 /* 7167 * If this was the removal of the original device in a hot spare vdev, 7168 * then we want to go through and remove the device from the hot spare 7169 * list of every other pool. 7170 */ 7171 if (unspare) { 7172 spa_t *altspa = NULL; 7173 7174 mutex_enter(&spa_namespace_lock); 7175 while ((altspa = spa_next(altspa)) != NULL) { 7176 if (altspa->spa_state != POOL_STATE_ACTIVE || 7177 altspa == spa) 7178 continue; 7179 7180 spa_open_ref(altspa, FTAG); 7181 mutex_exit(&spa_namespace_lock); 7182 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 7183 mutex_enter(&spa_namespace_lock); 7184 spa_close(altspa, FTAG); 7185 } 7186 mutex_exit(&spa_namespace_lock); 7187 7188 /* search the rest of the vdevs for spares to remove */ 7189 spa_vdev_resilver_done(spa); 7190 } 7191 7192 /* all done with the spa; OK to release */ 7193 mutex_enter(&spa_namespace_lock); 7194 spa_close(spa, FTAG); 7195 mutex_exit(&spa_namespace_lock); 7196 7197 return (error); 7198 } 7199 7200 static int 7201 spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, 7202 list_t *vd_list) 7203 { 7204 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 7205 7206 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 7207 7208 /* Look up vdev and ensure it's a leaf. */ 7209 vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); 7210 if (vd == NULL || vd->vdev_detached) { 7211 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7212 return (SET_ERROR(ENODEV)); 7213 } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { 7214 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7215 return (SET_ERROR(EINVAL)); 7216 } else if (!vdev_writeable(vd)) { 7217 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7218 return (SET_ERROR(EROFS)); 7219 } 7220 mutex_enter(&vd->vdev_initialize_lock); 7221 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7222 7223 /* 7224 * When we activate an initialize action we check to see 7225 * if the vdev_initialize_thread is NULL. We do this instead 7226 * of using the vdev_initialize_state since there might be 7227 * a previous initialization process which has completed but 7228 * the thread is not exited. 
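* (The TRIM code below applies the same check to vdev_trim_thread.)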
7229 */ 7230 if (cmd_type == POOL_INITIALIZE_START && 7231 (vd->vdev_initialize_thread != NULL || 7232 vd->vdev_top->vdev_removing)) { 7233 mutex_exit(&vd->vdev_initialize_lock); 7234 return (SET_ERROR(EBUSY)); 7235 } else if (cmd_type == POOL_INITIALIZE_CANCEL && 7236 (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE && 7237 vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) { 7238 mutex_exit(&vd->vdev_initialize_lock); 7239 return (SET_ERROR(ESRCH)); 7240 } else if (cmd_type == POOL_INITIALIZE_SUSPEND && 7241 vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) { 7242 mutex_exit(&vd->vdev_initialize_lock); 7243 return (SET_ERROR(ESRCH)); 7244 } 7245 7246 switch (cmd_type) { 7247 case POOL_INITIALIZE_START: 7248 vdev_initialize(vd); 7249 break; 7250 case POOL_INITIALIZE_CANCEL: 7251 vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, vd_list); 7252 break; 7253 case POOL_INITIALIZE_SUSPEND: 7254 vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list); 7255 break; 7256 default: 7257 panic("invalid cmd_type %llu", (unsigned long long)cmd_type); 7258 } 7259 mutex_exit(&vd->vdev_initialize_lock); 7260 7261 return (0); 7262 } 7263 7264 int 7265 spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, 7266 nvlist_t *vdev_errlist) 7267 { 7268 int total_errors = 0; 7269 list_t vd_list; 7270 7271 list_create(&vd_list, sizeof (vdev_t), 7272 offsetof(vdev_t, vdev_initialize_node)); 7273 7274 /* 7275 * We hold the namespace lock through the whole function 7276 * to prevent any changes to the pool while we're starting or 7277 * stopping initialization. The config and state locks are held so that 7278 * we can properly assess the vdev state before we commit to 7279 * the initializing operation. 7280 */ 7281 mutex_enter(&spa_namespace_lock); 7282 7283 for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); 7284 pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { 7285 uint64_t vdev_guid = fnvpair_value_uint64(pair); 7286 7287 int error = spa_vdev_initialize_impl(spa, vdev_guid, cmd_type, 7288 &vd_list); 7289 if (error != 0) { 7290 char guid_as_str[MAXNAMELEN]; 7291 7292 (void) snprintf(guid_as_str, sizeof (guid_as_str), 7293 "%llu", (unsigned long long)vdev_guid); 7294 fnvlist_add_int64(vdev_errlist, guid_as_str, error); 7295 total_errors++; 7296 } 7297 } 7298 7299 /* Wait for all initialize threads to stop. */ 7300 vdev_initialize_stop_wait(spa, &vd_list); 7301 7302 /* Sync out the initializing state */ 7303 txg_wait_synced(spa->spa_dsl_pool, 0); 7304 mutex_exit(&spa_namespace_lock); 7305 7306 list_destroy(&vd_list); 7307 7308 return (total_errors); 7309 } 7310 7311 static int 7312 spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, 7313 uint64_t rate, boolean_t partial, boolean_t secure, list_t *vd_list) 7314 { 7315 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 7316 7317 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 7318 7319 /* Look up vdev and ensure it's a leaf. 
*/ 7320 vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); 7321 if (vd == NULL || vd->vdev_detached) { 7322 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7323 return (SET_ERROR(ENODEV)); 7324 } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) { 7325 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7326 return (SET_ERROR(EINVAL)); 7327 } else if (!vdev_writeable(vd)) { 7328 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7329 return (SET_ERROR(EROFS)); 7330 } else if (!vd->vdev_has_trim) { 7331 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7332 return (SET_ERROR(EOPNOTSUPP)); 7333 } else if (secure && !vd->vdev_has_securetrim) { 7334 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7335 return (SET_ERROR(EOPNOTSUPP)); 7336 } 7337 mutex_enter(&vd->vdev_trim_lock); 7338 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 7339 7340 /* 7341 * When we activate a TRIM action we check to see if the 7342 * vdev_trim_thread is NULL. We do this instead of using the 7343 * vdev_trim_state since there might be a previous TRIM process 7344 * which has completed but the thread is not exited. 7345 */ 7346 if (cmd_type == POOL_TRIM_START && 7347 (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing)) { 7348 mutex_exit(&vd->vdev_trim_lock); 7349 return (SET_ERROR(EBUSY)); 7350 } else if (cmd_type == POOL_TRIM_CANCEL && 7351 (vd->vdev_trim_state != VDEV_TRIM_ACTIVE && 7352 vd->vdev_trim_state != VDEV_TRIM_SUSPENDED)) { 7353 mutex_exit(&vd->vdev_trim_lock); 7354 return (SET_ERROR(ESRCH)); 7355 } else if (cmd_type == POOL_TRIM_SUSPEND && 7356 vd->vdev_trim_state != VDEV_TRIM_ACTIVE) { 7357 mutex_exit(&vd->vdev_trim_lock); 7358 return (SET_ERROR(ESRCH)); 7359 } 7360 7361 switch (cmd_type) { 7362 case POOL_TRIM_START: 7363 vdev_trim(vd, rate, partial, secure); 7364 break; 7365 case POOL_TRIM_CANCEL: 7366 vdev_trim_stop(vd, VDEV_TRIM_CANCELED, vd_list); 7367 break; 7368 case POOL_TRIM_SUSPEND: 7369 vdev_trim_stop(vd, VDEV_TRIM_SUSPENDED, vd_list); 7370 break; 7371 default: 7372 panic("invalid cmd_type %llu", (unsigned long long)cmd_type); 7373 } 7374 mutex_exit(&vd->vdev_trim_lock); 7375 7376 return (0); 7377 } 7378 7379 /* 7380 * Initiates a manual TRIM for the requested vdevs. This kicks off individual 7381 * TRIM threads for each child vdev. These threads pass over all of the free 7382 * space in the vdev's metaslabs and issues TRIM commands for that space. 7383 */ 7384 int 7385 spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate, 7386 boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist) 7387 { 7388 int total_errors = 0; 7389 list_t vd_list; 7390 7391 list_create(&vd_list, sizeof (vdev_t), 7392 offsetof(vdev_t, vdev_trim_node)); 7393 7394 /* 7395 * We hold the namespace lock through the whole function 7396 * to prevent any changes to the pool while we're starting or 7397 * stopping TRIM. The config and state locks are held so that 7398 * we can properly assess the vdev state before we commit to 7399 * the TRIM operation. 
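* Per-vdev failures are recorded in vdev_errlist rather than aborting the
* whole request; the return value is the number of vdevs that failed.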
7400 */ 7401 mutex_enter(&spa_namespace_lock); 7402 7403 for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); 7404 pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { 7405 uint64_t vdev_guid = fnvpair_value_uint64(pair); 7406 7407 int error = spa_vdev_trim_impl(spa, vdev_guid, cmd_type, 7408 rate, partial, secure, &vd_list); 7409 if (error != 0) { 7410 char guid_as_str[MAXNAMELEN]; 7411 7412 (void) snprintf(guid_as_str, sizeof (guid_as_str), 7413 "%llu", (unsigned long long)vdev_guid); 7414 fnvlist_add_int64(vdev_errlist, guid_as_str, error); 7415 total_errors++; 7416 } 7417 } 7418 7419 /* Wait for all TRIM threads to stop. */ 7420 vdev_trim_stop_wait(spa, &vd_list); 7421 7422 /* Sync out the TRIM state */ 7423 txg_wait_synced(spa->spa_dsl_pool, 0); 7424 mutex_exit(&spa_namespace_lock); 7425 7426 list_destroy(&vd_list); 7427 7428 return (total_errors); 7429 } 7430 7431 /* 7432 * Split a set of devices from their mirrors, and create a new pool from them. 7433 */ 7434 int 7435 spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, 7436 nvlist_t *props, boolean_t exp) 7437 { 7438 int error = 0; 7439 uint64_t txg, *glist; 7440 spa_t *newspa; 7441 uint_t c, children, lastlog; 7442 nvlist_t **child, *nvl, *tmp; 7443 dmu_tx_t *tx; 7444 char *altroot = NULL; 7445 vdev_t *rvd, **vml = NULL; /* vdev modify list */ 7446 boolean_t activate_slog; 7447 7448 ASSERT(spa_writeable(spa)); 7449 7450 txg = spa_vdev_enter(spa); 7451 7452 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 7453 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { 7454 error = (spa_has_checkpoint(spa)) ? 7455 ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; 7456 return (spa_vdev_exit(spa, NULL, txg, error)); 7457 } 7458 7459 /* clear the log and flush everything up to now */ 7460 activate_slog = spa_passivate_log(spa); 7461 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 7462 error = spa_reset_logs(spa); 7463 txg = spa_vdev_config_enter(spa); 7464 7465 if (activate_slog) 7466 spa_activate_log(spa); 7467 7468 if (error != 0) 7469 return (spa_vdev_exit(spa, NULL, txg, error)); 7470 7471 /* check new spa name before going any further */ 7472 if (spa_lookup(newname) != NULL) 7473 return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 7474 7475 /* 7476 * scan through all the children to ensure they're all mirrors 7477 */ 7478 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 7479 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 7480 &children) != 0) 7481 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 7482 7483 /* first, check to ensure we've got the right child count */ 7484 rvd = spa->spa_root_vdev; 7485 lastlog = 0; 7486 for (c = 0; c < rvd->vdev_children; c++) { 7487 vdev_t *vd = rvd->vdev_child[c]; 7488 7489 /* don't count the holes & logs as children */ 7490 if (vd->vdev_islog || (vd->vdev_ops != &vdev_indirect_ops && 7491 !vdev_is_concrete(vd))) { 7492 if (lastlog == 0) 7493 lastlog = c; 7494 continue; 7495 } 7496 7497 lastlog = 0; 7498 } 7499 if (children != (lastlog != 0 ? 
lastlog : rvd->vdev_children)) 7500 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 7501 7502 /* next, ensure no spare or cache devices are part of the split */ 7503 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 7504 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 7505 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 7506 7507 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 7508 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 7509 7510 /* then, loop over each vdev and validate it */ 7511 for (c = 0; c < children; c++) { 7512 uint64_t is_hole = 0; 7513 7514 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 7515 &is_hole); 7516 7517 if (is_hole != 0) { 7518 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 7519 spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 7520 continue; 7521 } else { 7522 error = SET_ERROR(EINVAL); 7523 break; 7524 } 7525 } 7526 7527 /* deal with indirect vdevs */ 7528 if (spa->spa_root_vdev->vdev_child[c]->vdev_ops == 7529 &vdev_indirect_ops) 7530 continue; 7531 7532 /* which disk is going to be split? */ 7533 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 7534 &glist[c]) != 0) { 7535 error = SET_ERROR(EINVAL); 7536 break; 7537 } 7538 7539 /* look it up in the spa */ 7540 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 7541 if (vml[c] == NULL) { 7542 error = SET_ERROR(ENODEV); 7543 break; 7544 } 7545 7546 /* make sure there's nothing stopping the split */ 7547 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 7548 vml[c]->vdev_islog || 7549 !vdev_is_concrete(vml[c]) || 7550 vml[c]->vdev_isspare || 7551 vml[c]->vdev_isl2cache || 7552 !vdev_writeable(vml[c]) || 7553 vml[c]->vdev_children != 0 || 7554 vml[c]->vdev_state != VDEV_STATE_HEALTHY || 7555 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 7556 error = SET_ERROR(EINVAL); 7557 break; 7558 } 7559 7560 if (vdev_dtl_required(vml[c]) || 7561 vdev_resilver_needed(vml[c], NULL, NULL)) { 7562 error = SET_ERROR(EBUSY); 7563 break; 7564 } 7565 7566 /* we need certain info from the top level */ 7567 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 7568 vml[c]->vdev_top->vdev_ms_array); 7569 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 7570 vml[c]->vdev_top->vdev_ms_shift); 7571 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 7572 vml[c]->vdev_top->vdev_asize); 7573 fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 7574 vml[c]->vdev_top->vdev_ashift); 7575 7576 /* transfer per-vdev ZAPs */ 7577 ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); 7578 VERIFY0(nvlist_add_uint64(child[c], 7579 ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); 7580 7581 ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); 7582 VERIFY0(nvlist_add_uint64(child[c], 7583 ZPOOL_CONFIG_VDEV_TOP_ZAP, 7584 vml[c]->vdev_parent->vdev_top_zap)); 7585 } 7586 7587 if (error != 0) { 7588 kmem_free(vml, children * sizeof (vdev_t *)); 7589 kmem_free(glist, children * sizeof (uint64_t)); 7590 return (spa_vdev_exit(spa, NULL, txg, error)); 7591 } 7592 7593 /* stop writers from using the disks */ 7594 for (c = 0; c < children; c++) { 7595 if (vml[c] != NULL) 7596 vml[c]->vdev_offline = B_TRUE; 7597 } 7598 vdev_reopen(spa->spa_root_vdev); 7599 7600 /* 7601 * Temporarily record the splitting vdevs in the spa config. This 7602 * will disappear once the config is regenerated. 
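 *
 * As a sketch of the shape being built just below (not a literal
 * nvlist dump), the original pool's config temporarily carries:
 *
 *      ZPOOL_CONFIG_SPLIT:
 *          ZPOOL_CONFIG_SPLIT_LIST: [ glist[0], glist[1], ... ]
 *
 * i.e. the leaf GUIDs gathered into glist[] above, while the new pool
 * later records ZPOOL_CONFIG_SPLIT_GUID = spa_guid(spa), tying it back
 * to the pool it was split from.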
7603 */ 7604 nvl = fnvlist_alloc(); 7605 fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, glist, children); 7606 kmem_free(glist, children * sizeof (uint64_t)); 7607 7608 mutex_enter(&spa->spa_props_lock); 7609 fnvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, nvl); 7610 mutex_exit(&spa->spa_props_lock); 7611 spa->spa_config_splitting = nvl; 7612 vdev_config_dirty(spa->spa_root_vdev); 7613 7614 /* configure and create the new pool */ 7615 fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname); 7616 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 7617 exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE); 7618 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)); 7619 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg); 7620 fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 7621 spa_generate_guid(NULL)); 7622 VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); 7623 (void) nvlist_lookup_string(props, 7624 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 7625 7626 /* add the new pool to the namespace */ 7627 newspa = spa_add(newname, config, altroot); 7628 newspa->spa_avz_action = AVZ_ACTION_REBUILD; 7629 newspa->spa_config_txg = spa->spa_config_txg; 7630 spa_set_log_state(newspa, SPA_LOG_CLEAR); 7631 7632 /* release the spa config lock, retaining the namespace lock */ 7633 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 7634 7635 if (zio_injection_enabled) 7636 zio_handle_panic_injection(spa, FTAG, 1); 7637 7638 spa_activate(newspa, spa_mode_global); 7639 spa_async_suspend(newspa); 7640 7641 /* 7642 * Temporarily stop the initializing and TRIM activity. We set the 7643 * state to ACTIVE so that we know to resume initializing or TRIM 7644 * once the split has completed. 7645 */ 7646 list_t vd_initialize_list; 7647 list_create(&vd_initialize_list, sizeof (vdev_t), 7648 offsetof(vdev_t, vdev_initialize_node)); 7649 7650 list_t vd_trim_list; 7651 list_create(&vd_trim_list, sizeof (vdev_t), 7652 offsetof(vdev_t, vdev_trim_node)); 7653 7654 for (c = 0; c < children; c++) { 7655 if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { 7656 mutex_enter(&vml[c]->vdev_initialize_lock); 7657 vdev_initialize_stop(vml[c], 7658 VDEV_INITIALIZE_ACTIVE, &vd_initialize_list); 7659 mutex_exit(&vml[c]->vdev_initialize_lock); 7660 7661 mutex_enter(&vml[c]->vdev_trim_lock); 7662 vdev_trim_stop(vml[c], VDEV_TRIM_ACTIVE, &vd_trim_list); 7663 mutex_exit(&vml[c]->vdev_trim_lock); 7664 } 7665 } 7666 7667 vdev_initialize_stop_wait(spa, &vd_initialize_list); 7668 vdev_trim_stop_wait(spa, &vd_trim_list); 7669 7670 list_destroy(&vd_initialize_list); 7671 list_destroy(&vd_trim_list); 7672 7673 newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT; 7674 newspa->spa_is_splitting = B_TRUE; 7675 7676 /* create the new pool from the disks of the original pool */ 7677 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE); 7678 if (error) 7679 goto out; 7680 7681 /* if that worked, generate a real config for the new pool */ 7682 if (newspa->spa_root_vdev != NULL) { 7683 newspa->spa_config_splitting = fnvlist_alloc(); 7684 fnvlist_add_uint64(newspa->spa_config_splitting, 7685 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)); 7686 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 7687 B_TRUE)); 7688 } 7689 7690 /* set the props */ 7691 if (props != NULL) { 7692 spa_configfile_set(newspa, props, B_FALSE); 7693 error = spa_prop_set(newspa, props); 7694 if (error) 7695 goto out; 7696 } 7697 7698 /* flush everything */ 7699 txg = 
spa_vdev_config_enter(newspa); 7700 vdev_config_dirty(newspa->spa_root_vdev); 7701 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 7702 7703 if (zio_injection_enabled) 7704 zio_handle_panic_injection(spa, FTAG, 2); 7705 7706 spa_async_resume(newspa); 7707 7708 /* finally, update the original pool's config */ 7709 txg = spa_vdev_config_enter(spa); 7710 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 7711 error = dmu_tx_assign(tx, TXG_WAIT); 7712 if (error != 0) 7713 dmu_tx_abort(tx); 7714 for (c = 0; c < children; c++) { 7715 if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { 7716 vdev_t *tvd = vml[c]->vdev_top; 7717 7718 /* 7719 * Need to be sure the detachable VDEV is not 7720 * on any *other* txg's DTL list to prevent it 7721 * from being accessed after it's freed. 7722 */ 7723 for (int t = 0; t < TXG_SIZE; t++) { 7724 (void) txg_list_remove_this( 7725 &tvd->vdev_dtl_list, vml[c], t); 7726 } 7727 7728 vdev_split(vml[c]); 7729 if (error == 0) 7730 spa_history_log_internal(spa, "detach", tx, 7731 "vdev=%s", vml[c]->vdev_path); 7732 7733 vdev_free(vml[c]); 7734 } 7735 } 7736 spa->spa_avz_action = AVZ_ACTION_REBUILD; 7737 vdev_config_dirty(spa->spa_root_vdev); 7738 spa->spa_config_splitting = NULL; 7739 nvlist_free(nvl); 7740 if (error == 0) 7741 dmu_tx_commit(tx); 7742 (void) spa_vdev_exit(spa, NULL, txg, 0); 7743 7744 if (zio_injection_enabled) 7745 zio_handle_panic_injection(spa, FTAG, 3); 7746 7747 /* split is complete; log a history record */ 7748 spa_history_log_internal(newspa, "split", NULL, 7749 "from pool %s", spa_name(spa)); 7750 7751 newspa->spa_is_splitting = B_FALSE; 7752 kmem_free(vml, children * sizeof (vdev_t *)); 7753 7754 /* if we're not going to mount the filesystems in userland, export */ 7755 if (exp) 7756 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 7757 B_FALSE, B_FALSE); 7758 7759 return (error); 7760 7761 out: 7762 spa_unload(newspa); 7763 spa_deactivate(newspa); 7764 spa_remove(newspa); 7765 7766 txg = spa_vdev_config_enter(spa); 7767 7768 /* re-online all offlined disks */ 7769 for (c = 0; c < children; c++) { 7770 if (vml[c] != NULL) 7771 vml[c]->vdev_offline = B_FALSE; 7772 } 7773 7774 /* restart initializing or trimming disks as necessary */ 7775 spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); 7776 spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); 7777 spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); 7778 7779 vdev_reopen(spa->spa_root_vdev); 7780 7781 nvlist_free(spa->spa_config_splitting); 7782 spa->spa_config_splitting = NULL; 7783 (void) spa_vdev_exit(spa, NULL, txg, error); 7784 7785 kmem_free(vml, children * sizeof (vdev_t *)); 7786 return (error); 7787 } 7788 7789 /* 7790 * Find any device that's done replacing, or a vdev marked 'unspare' that's 7791 * currently spared, so we can detach it. 7792 */ 7793 static vdev_t * 7794 spa_vdev_resilver_done_hunt(vdev_t *vd) 7795 { 7796 vdev_t *newvd, *oldvd; 7797 7798 for (int c = 0; c < vd->vdev_children; c++) { 7799 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); 7800 if (oldvd != NULL) 7801 return (oldvd); 7802 } 7803 7804 /* 7805 * Check for a completed replacement. We always consider the first 7806 * vdev in the list to be the oldest vdev, and the last one to be 7807 * the newest (see spa_vdev_attach() for how that works). In 7808 * the case where the newest vdev is faulted, we will not automatically 7809 * remove it after a resilver completes. This is OK as it will require 7810 * user intervention to determine which disk the admin wishes to keep. 
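 *
 * For reference, a mid-replacement subtree has roughly this shape
 * (a sketch, not tool output):
 *
 *      mirror-0
 *          replacing-0
 *              old-disk        child[0], the vdev being replaced
 *              new-disk        child[children - 1], the replacement
 *
 * Once new-disk's DTL_MISSING and DTL_OUTAGE trees are empty and
 * old-disk is no longer DTL-required, old-disk is returned so that
 * spa_vdev_resilver_done() can detach it.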
7811 */ 7812 if (vd->vdev_ops == &vdev_replacing_ops) { 7813 ASSERT(vd->vdev_children > 1); 7814 7815 newvd = vd->vdev_child[vd->vdev_children - 1]; 7816 oldvd = vd->vdev_child[0]; 7817 7818 if (vdev_dtl_empty(newvd, DTL_MISSING) && 7819 vdev_dtl_empty(newvd, DTL_OUTAGE) && 7820 !vdev_dtl_required(oldvd)) 7821 return (oldvd); 7822 } 7823 7824 /* 7825 * Check for a completed resilver with the 'unspare' flag set. 7826 * Also potentially update faulted state. 7827 */ 7828 if (vd->vdev_ops == &vdev_spare_ops) { 7829 vdev_t *first = vd->vdev_child[0]; 7830 vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; 7831 7832 if (last->vdev_unspare) { 7833 oldvd = first; 7834 newvd = last; 7835 } else if (first->vdev_unspare) { 7836 oldvd = last; 7837 newvd = first; 7838 } else { 7839 oldvd = NULL; 7840 } 7841 7842 if (oldvd != NULL && 7843 vdev_dtl_empty(newvd, DTL_MISSING) && 7844 vdev_dtl_empty(newvd, DTL_OUTAGE) && 7845 !vdev_dtl_required(oldvd)) 7846 return (oldvd); 7847 7848 vdev_propagate_state(vd); 7849 7850 /* 7851 * If there are more than two spares attached to a disk, 7852 * and those spares are not required, then we want to 7853 * attempt to free them up now so that they can be used 7854 * by other pools. Once we're back down to a single 7855 * disk+spare, we stop removing them. 7856 */ 7857 if (vd->vdev_children > 2) { 7858 newvd = vd->vdev_child[1]; 7859 7860 if (newvd->vdev_isspare && last->vdev_isspare && 7861 vdev_dtl_empty(last, DTL_MISSING) && 7862 vdev_dtl_empty(last, DTL_OUTAGE) && 7863 !vdev_dtl_required(newvd)) 7864 return (newvd); 7865 } 7866 } 7867 7868 return (NULL); 7869 } 7870 7871 static void 7872 spa_vdev_resilver_done(spa_t *spa) 7873 { 7874 vdev_t *vd, *pvd, *ppvd; 7875 uint64_t guid, sguid, pguid, ppguid; 7876 7877 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 7878 7879 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 7880 pvd = vd->vdev_parent; 7881 ppvd = pvd->vdev_parent; 7882 guid = vd->vdev_guid; 7883 pguid = pvd->vdev_guid; 7884 ppguid = ppvd->vdev_guid; 7885 sguid = 0; 7886 /* 7887 * If we have just finished replacing a hot spared device, then 7888 * we need to detach the parent's first child (the original hot 7889 * spare) as well. 7890 */ 7891 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 7892 ppvd->vdev_children == 2) { 7893 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 7894 sguid = ppvd->vdev_child[1]->vdev_guid; 7895 } 7896 ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); 7897 7898 spa_config_exit(spa, SCL_ALL, FTAG); 7899 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 7900 return; 7901 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 7902 return; 7903 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 7904 } 7905 7906 spa_config_exit(spa, SCL_ALL, FTAG); 7907 7908 /* 7909 * If a detach was not performed above replace waiters will not have 7910 * been notified. In which case we must do so now. 7911 */ 7912 spa_notify_waiters(spa); 7913 } 7914 7915 /* 7916 * Update the stored path or FRU for this vdev. 
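 *
 * A minimal usage sketch via the two wrappers defined below ("spa",
 * "guid" and the strings are placeholders):
 *
 *      int err = spa_vdev_setpath(spa, guid, "/dev/disk/by-id/newname");
 *      if (err == 0)
 *              err = spa_vdev_setfru(spa, guid, "Slot 4");
 *
 * A nonzero return is an errno such as ENOENT (no vdev with that guid)
 * or ENOTSUP (the guid does not name a leaf vdev).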
7917 */ 7918 static int 7919 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 7920 boolean_t ispath) 7921 { 7922 vdev_t *vd; 7923 boolean_t sync = B_FALSE; 7924 7925 ASSERT(spa_writeable(spa)); 7926 7927 spa_vdev_state_enter(spa, SCL_ALL); 7928 7929 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 7930 return (spa_vdev_state_exit(spa, NULL, ENOENT)); 7931 7932 if (!vd->vdev_ops->vdev_op_leaf) 7933 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 7934 7935 if (ispath) { 7936 if (strcmp(value, vd->vdev_path) != 0) { 7937 spa_strfree(vd->vdev_path); 7938 vd->vdev_path = spa_strdup(value); 7939 sync = B_TRUE; 7940 } 7941 } else { 7942 if (vd->vdev_fru == NULL) { 7943 vd->vdev_fru = spa_strdup(value); 7944 sync = B_TRUE; 7945 } else if (strcmp(value, vd->vdev_fru) != 0) { 7946 spa_strfree(vd->vdev_fru); 7947 vd->vdev_fru = spa_strdup(value); 7948 sync = B_TRUE; 7949 } 7950 } 7951 7952 return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); 7953 } 7954 7955 int 7956 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 7957 { 7958 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 7959 } 7960 7961 int 7962 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 7963 { 7964 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 7965 } 7966 7967 /* 7968 * ========================================================================== 7969 * SPA Scanning 7970 * ========================================================================== 7971 */ 7972 int 7973 spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd) 7974 { 7975 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 7976 7977 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 7978 return (SET_ERROR(EBUSY)); 7979 7980 return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd)); 7981 } 7982 7983 int 7984 spa_scan_stop(spa_t *spa) 7985 { 7986 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 7987 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 7988 return (SET_ERROR(EBUSY)); 7989 return (dsl_scan_cancel(spa->spa_dsl_pool)); 7990 } 7991 7992 int 7993 spa_scan(spa_t *spa, pool_scan_func_t func) 7994 { 7995 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 7996 7997 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 7998 return (SET_ERROR(ENOTSUP)); 7999 8000 if (func == POOL_SCAN_RESILVER && 8001 !spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) 8002 return (SET_ERROR(ENOTSUP)); 8003 8004 /* 8005 * If a resilver was requested, but there is no DTL on a 8006 * writeable leaf device, we have nothing to do. 8007 */ 8008 if (func == POOL_SCAN_RESILVER && 8009 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 8010 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 8011 return (0); 8012 } 8013 8014 return (dsl_scan(spa->spa_dsl_pool, func)); 8015 } 8016 8017 /* 8018 * ========================================================================== 8019 * SPA async task processing 8020 * ========================================================================== 8021 */ 8022 8023 static void 8024 spa_async_remove(spa_t *spa, vdev_t *vd) 8025 { 8026 if (vd->vdev_remove_wanted) { 8027 vd->vdev_remove_wanted = B_FALSE; 8028 vd->vdev_delayed_close = B_FALSE; 8029 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 8030 8031 /* 8032 * We want to clear the stats, but we don't want to do a full 8033 * vdev_clear() as that will cause us to throw away 8034 * degraded/faulted state as well as attempt to reopen the 8035 * device, all of which is a waste. 
8036 */ 8037 vd->vdev_stat.vs_read_errors = 0; 8038 vd->vdev_stat.vs_write_errors = 0; 8039 vd->vdev_stat.vs_checksum_errors = 0; 8040 8041 vdev_state_dirty(vd->vdev_top); 8042 8043 /* Tell userspace that the vdev is gone. */ 8044 zfs_post_remove(spa, vd); 8045 } 8046 8047 for (int c = 0; c < vd->vdev_children; c++) 8048 spa_async_remove(spa, vd->vdev_child[c]); 8049 } 8050 8051 static void 8052 spa_async_probe(spa_t *spa, vdev_t *vd) 8053 { 8054 if (vd->vdev_probe_wanted) { 8055 vd->vdev_probe_wanted = B_FALSE; 8056 vdev_reopen(vd); /* vdev_open() does the actual probe */ 8057 } 8058 8059 for (int c = 0; c < vd->vdev_children; c++) 8060 spa_async_probe(spa, vd->vdev_child[c]); 8061 } 8062 8063 static void 8064 spa_async_autoexpand(spa_t *spa, vdev_t *vd) 8065 { 8066 if (!spa->spa_autoexpand) 8067 return; 8068 8069 for (int c = 0; c < vd->vdev_children; c++) { 8070 vdev_t *cvd = vd->vdev_child[c]; 8071 spa_async_autoexpand(spa, cvd); 8072 } 8073 8074 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 8075 return; 8076 8077 spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_AUTOEXPAND); 8078 } 8079 8080 static void 8081 spa_async_thread(void *arg) 8082 { 8083 spa_t *spa = (spa_t *)arg; 8084 dsl_pool_t *dp = spa->spa_dsl_pool; 8085 int tasks; 8086 8087 ASSERT(spa->spa_sync_on); 8088 8089 mutex_enter(&spa->spa_async_lock); 8090 tasks = spa->spa_async_tasks; 8091 spa->spa_async_tasks = 0; 8092 mutex_exit(&spa->spa_async_lock); 8093 8094 /* 8095 * See if the config needs to be updated. 8096 */ 8097 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 8098 uint64_t old_space, new_space; 8099 8100 mutex_enter(&spa_namespace_lock); 8101 old_space = metaslab_class_get_space(spa_normal_class(spa)); 8102 old_space += metaslab_class_get_space(spa_special_class(spa)); 8103 old_space += metaslab_class_get_space(spa_dedup_class(spa)); 8104 old_space += metaslab_class_get_space( 8105 spa_embedded_log_class(spa)); 8106 8107 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 8108 8109 new_space = metaslab_class_get_space(spa_normal_class(spa)); 8110 new_space += metaslab_class_get_space(spa_special_class(spa)); 8111 new_space += metaslab_class_get_space(spa_dedup_class(spa)); 8112 new_space += metaslab_class_get_space( 8113 spa_embedded_log_class(spa)); 8114 mutex_exit(&spa_namespace_lock); 8115 8116 /* 8117 * If the pool grew as a result of the config update, 8118 * then log an internal history event. 8119 */ 8120 if (new_space != old_space) { 8121 spa_history_log_internal(spa, "vdev online", NULL, 8122 "pool '%s' size: %llu(+%llu)", 8123 spa_name(spa), (u_longlong_t)new_space, 8124 (u_longlong_t)(new_space - old_space)); 8125 } 8126 } 8127 8128 /* 8129 * See if any devices need to be marked REMOVED. 8130 */ 8131 if (tasks & SPA_ASYNC_REMOVE) { 8132 spa_vdev_state_enter(spa, SCL_NONE); 8133 spa_async_remove(spa, spa->spa_root_vdev); 8134 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 8135 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 8136 for (int i = 0; i < spa->spa_spares.sav_count; i++) 8137 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 8138 (void) spa_vdev_state_exit(spa, NULL, 0); 8139 } 8140 8141 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 8142 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8143 spa_async_autoexpand(spa, spa->spa_root_vdev); 8144 spa_config_exit(spa, SCL_CONFIG, FTAG); 8145 } 8146 8147 /* 8148 * See if any devices need to be probed. 
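 *
 * Probe requests are raised elsewhere (e.g. from the I/O pipeline when
 * a device starts misbehaving); the requesting side is roughly of the
 * form below, a sketch rather than a quote of the actual caller:
 *
 *      vd->vdev_probe_wanted = B_TRUE;
 *      spa_async_request(vd->vdev_spa, SPA_ASYNC_PROBE);
 *
 * spa_async_probe() then clears the flag and reopens each flagged
 * vdev, and vdev_open() performs the actual probe.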
8149 */ 8150 if (tasks & SPA_ASYNC_PROBE) { 8151 spa_vdev_state_enter(spa, SCL_NONE); 8152 spa_async_probe(spa, spa->spa_root_vdev); 8153 (void) spa_vdev_state_exit(spa, NULL, 0); 8154 } 8155 8156 /* 8157 * If any devices are done replacing, detach them. 8158 */ 8159 if (tasks & SPA_ASYNC_RESILVER_DONE || 8160 tasks & SPA_ASYNC_REBUILD_DONE) { 8161 spa_vdev_resilver_done(spa); 8162 } 8163 8164 /* 8165 * Kick off a resilver. 8166 */ 8167 if (tasks & SPA_ASYNC_RESILVER && 8168 !vdev_rebuild_active(spa->spa_root_vdev) && 8169 (!dsl_scan_resilvering(dp) || 8170 !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))) 8171 dsl_scan_restart_resilver(dp, 0); 8172 8173 if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { 8174 mutex_enter(&spa_namespace_lock); 8175 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8176 vdev_initialize_restart(spa->spa_root_vdev); 8177 spa_config_exit(spa, SCL_CONFIG, FTAG); 8178 mutex_exit(&spa_namespace_lock); 8179 } 8180 8181 if (tasks & SPA_ASYNC_TRIM_RESTART) { 8182 mutex_enter(&spa_namespace_lock); 8183 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8184 vdev_trim_restart(spa->spa_root_vdev); 8185 spa_config_exit(spa, SCL_CONFIG, FTAG); 8186 mutex_exit(&spa_namespace_lock); 8187 } 8188 8189 if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) { 8190 mutex_enter(&spa_namespace_lock); 8191 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8192 vdev_autotrim_restart(spa); 8193 spa_config_exit(spa, SCL_CONFIG, FTAG); 8194 mutex_exit(&spa_namespace_lock); 8195 } 8196 8197 /* 8198 * Kick off L2 cache whole device TRIM. 8199 */ 8200 if (tasks & SPA_ASYNC_L2CACHE_TRIM) { 8201 mutex_enter(&spa_namespace_lock); 8202 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 8203 vdev_trim_l2arc(spa); 8204 spa_config_exit(spa, SCL_CONFIG, FTAG); 8205 mutex_exit(&spa_namespace_lock); 8206 } 8207 8208 /* 8209 * Kick off L2 cache rebuilding. 8210 */ 8211 if (tasks & SPA_ASYNC_L2CACHE_REBUILD) { 8212 mutex_enter(&spa_namespace_lock); 8213 spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER); 8214 l2arc_spa_rebuild_start(spa); 8215 spa_config_exit(spa, SCL_L2ARC, FTAG); 8216 mutex_exit(&spa_namespace_lock); 8217 } 8218 8219 /* 8220 * Let the world know that we're done. 
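 *
 * The broadcast below pairs with the wait loop in spa_async_suspend():
 *
 *      mutex_enter(&spa->spa_async_lock);
 *      while (spa->spa_async_thread != NULL)
 *              cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
 *      mutex_exit(&spa->spa_async_lock);
 *
 * which is why spa_async_thread must be cleared while spa_async_lock
 * is held before broadcasting.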
8221 */ 8222 mutex_enter(&spa->spa_async_lock); 8223 spa->spa_async_thread = NULL; 8224 cv_broadcast(&spa->spa_async_cv); 8225 mutex_exit(&spa->spa_async_lock); 8226 thread_exit(); 8227 } 8228 8229 void 8230 spa_async_suspend(spa_t *spa) 8231 { 8232 mutex_enter(&spa->spa_async_lock); 8233 spa->spa_async_suspended++; 8234 while (spa->spa_async_thread != NULL) 8235 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 8236 mutex_exit(&spa->spa_async_lock); 8237 8238 spa_vdev_remove_suspend(spa); 8239 8240 zthr_t *condense_thread = spa->spa_condense_zthr; 8241 if (condense_thread != NULL) 8242 zthr_cancel(condense_thread); 8243 8244 zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; 8245 if (discard_thread != NULL) 8246 zthr_cancel(discard_thread); 8247 8248 zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; 8249 if (ll_delete_thread != NULL) 8250 zthr_cancel(ll_delete_thread); 8251 8252 zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; 8253 if (ll_condense_thread != NULL) 8254 zthr_cancel(ll_condense_thread); 8255 } 8256 8257 void 8258 spa_async_resume(spa_t *spa) 8259 { 8260 mutex_enter(&spa->spa_async_lock); 8261 ASSERT(spa->spa_async_suspended != 0); 8262 spa->spa_async_suspended--; 8263 mutex_exit(&spa->spa_async_lock); 8264 spa_restart_removal(spa); 8265 8266 zthr_t *condense_thread = spa->spa_condense_zthr; 8267 if (condense_thread != NULL) 8268 zthr_resume(condense_thread); 8269 8270 zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; 8271 if (discard_thread != NULL) 8272 zthr_resume(discard_thread); 8273 8274 zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; 8275 if (ll_delete_thread != NULL) 8276 zthr_resume(ll_delete_thread); 8277 8278 zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; 8279 if (ll_condense_thread != NULL) 8280 zthr_resume(ll_condense_thread); 8281 } 8282 8283 static boolean_t 8284 spa_async_tasks_pending(spa_t *spa) 8285 { 8286 uint_t non_config_tasks; 8287 uint_t config_task; 8288 boolean_t config_task_suspended; 8289 8290 non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE; 8291 config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; 8292 if (spa->spa_ccw_fail_time == 0) { 8293 config_task_suspended = B_FALSE; 8294 } else { 8295 config_task_suspended = 8296 (gethrtime() - spa->spa_ccw_fail_time) < 8297 ((hrtime_t)zfs_ccw_retry_interval * NANOSEC); 8298 } 8299 8300 return (non_config_tasks || (config_task && !config_task_suspended)); 8301 } 8302 8303 static void 8304 spa_async_dispatch(spa_t *spa) 8305 { 8306 mutex_enter(&spa->spa_async_lock); 8307 if (spa_async_tasks_pending(spa) && 8308 !spa->spa_async_suspended && 8309 spa->spa_async_thread == NULL) 8310 spa->spa_async_thread = thread_create(NULL, 0, 8311 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 8312 mutex_exit(&spa->spa_async_lock); 8313 } 8314 8315 void 8316 spa_async_request(spa_t *spa, int task) 8317 { 8318 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 8319 mutex_enter(&spa->spa_async_lock); 8320 spa->spa_async_tasks |= task; 8321 mutex_exit(&spa->spa_async_lock); 8322 } 8323 8324 int 8325 spa_async_tasks(spa_t *spa) 8326 { 8327 return (spa->spa_async_tasks); 8328 } 8329 8330 /* 8331 * ========================================================================== 8332 * SPA syncing routines 8333 * ========================================================================== 8334 */ 8335 8336 8337 static int 8338 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 8339 dmu_tx_t *tx) 8340 { 8341 
bpobj_t *bpo = arg; 8342 bpobj_enqueue(bpo, bp, bp_freed, tx); 8343 return (0); 8344 } 8345 8346 int 8347 bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 8348 { 8349 return (bpobj_enqueue_cb(arg, bp, B_FALSE, tx)); 8350 } 8351 8352 int 8353 bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 8354 { 8355 return (bpobj_enqueue_cb(arg, bp, B_TRUE, tx)); 8356 } 8357 8358 static int 8359 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 8360 { 8361 zio_t *pio = arg; 8362 8363 zio_nowait(zio_free_sync(pio, pio->io_spa, dmu_tx_get_txg(tx), bp, 8364 pio->io_flags)); 8365 return (0); 8366 } 8367 8368 static int 8369 bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, 8370 dmu_tx_t *tx) 8371 { 8372 ASSERT(!bp_freed); 8373 return (spa_free_sync_cb(arg, bp, tx)); 8374 } 8375 8376 /* 8377 * Note: this simple function is not inlined to make it easier to dtrace the 8378 * amount of time spent syncing frees. 8379 */ 8380 static void 8381 spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) 8382 { 8383 zio_t *zio = zio_root(spa, NULL, NULL, 0); 8384 bplist_iterate(bpl, spa_free_sync_cb, zio, tx); 8385 VERIFY(zio_wait(zio) == 0); 8386 } 8387 8388 /* 8389 * Note: this simple function is not inlined to make it easier to dtrace the 8390 * amount of time spent syncing deferred frees. 8391 */ 8392 static void 8393 spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) 8394 { 8395 if (spa_sync_pass(spa) != 1) 8396 return; 8397 8398 /* 8399 * Note: 8400 * If the log space map feature is active, we stop deferring 8401 * frees to the next TXG and therefore running this function 8402 * would be considered a no-op as spa_deferred_bpobj should 8403 * not have any entries. 8404 * 8405 * That said we run this function anyway (instead of returning 8406 * immediately) for the edge-case scenario where we just 8407 * activated the log space map feature in this TXG but we have 8408 * deferred frees from the previous TXG. 8409 */ 8410 zio_t *zio = zio_root(spa, NULL, NULL, 0); 8411 VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, 8412 bpobj_spa_free_sync_cb, zio, tx), ==, 0); 8413 VERIFY0(zio_wait(zio)); 8414 } 8415 8416 static void 8417 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 8418 { 8419 char *packed = NULL; 8420 size_t bufsize; 8421 size_t nvsize = 0; 8422 dmu_buf_t *db; 8423 8424 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 8425 8426 /* 8427 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 8428 * information. This avoids the dmu_buf_will_dirty() path and 8429 * saves us a pre-read to get data we don't actually care about. 8430 */ 8431 bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); 8432 packed = vmem_alloc(bufsize, KM_SLEEP); 8433 8434 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 8435 KM_SLEEP) == 0); 8436 bzero(packed + nvsize, bufsize - nvsize); 8437 8438 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 8439 8440 vmem_free(packed, bufsize); 8441 8442 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 8443 dmu_buf_will_dirty(db, tx); 8444 *(uint64_t *)db->db_data = nvsize; 8445 dmu_buf_rele(db, FTAG); 8446 } 8447 8448 static void 8449 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 8450 const char *config, const char *entry) 8451 { 8452 nvlist_t *nvroot; 8453 nvlist_t **list; 8454 int i; 8455 8456 if (!sav->sav_sync) 8457 return; 8458 8459 /* 8460 * Update the MOS nvlist describing the list of available devices. 
8461 * spa_validate_aux() will have already made sure this nvlist is 8462 * valid and the vdevs are labeled appropriately. 8463 */ 8464 if (sav->sav_object == 0) { 8465 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 8466 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 8467 sizeof (uint64_t), tx); 8468 VERIFY(zap_update(spa->spa_meta_objset, 8469 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 8470 &sav->sav_object, tx) == 0); 8471 } 8472 8473 nvroot = fnvlist_alloc(); 8474 if (sav->sav_count == 0) { 8475 fnvlist_add_nvlist_array(nvroot, config, NULL, 0); 8476 } else { 8477 list = kmem_alloc(sav->sav_count*sizeof (void *), KM_SLEEP); 8478 for (i = 0; i < sav->sav_count; i++) 8479 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 8480 B_FALSE, VDEV_CONFIG_L2CACHE); 8481 fnvlist_add_nvlist_array(nvroot, config, list, sav->sav_count); 8482 for (i = 0; i < sav->sav_count; i++) 8483 nvlist_free(list[i]); 8484 kmem_free(list, sav->sav_count * sizeof (void *)); 8485 } 8486 8487 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 8488 nvlist_free(nvroot); 8489 8490 sav->sav_sync = B_FALSE; 8491 } 8492 8493 /* 8494 * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t. 8495 * The all-vdev ZAP must be empty. 8496 */ 8497 static void 8498 spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) 8499 { 8500 spa_t *spa = vd->vdev_spa; 8501 8502 if (vd->vdev_top_zap != 0) { 8503 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 8504 vd->vdev_top_zap, tx)); 8505 } 8506 if (vd->vdev_leaf_zap != 0) { 8507 VERIFY0(zap_add_int(spa->spa_meta_objset, avz, 8508 vd->vdev_leaf_zap, tx)); 8509 } 8510 for (uint64_t i = 0; i < vd->vdev_children; i++) { 8511 spa_avz_build(vd->vdev_child[i], avz, tx); 8512 } 8513 } 8514 8515 static void 8516 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 8517 { 8518 nvlist_t *config; 8519 8520 /* 8521 * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS, 8522 * its config may not be dirty but we still need to build per-vdev ZAPs. 8523 * Similarly, if the pool is being assembled (e.g. after a split), we 8524 * need to rebuild the AVZ although the config may not be dirty. 
8525 */ 8526 if (list_is_empty(&spa->spa_config_dirty_list) && 8527 spa->spa_avz_action == AVZ_ACTION_NONE) 8528 return; 8529 8530 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 8531 8532 ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE || 8533 spa->spa_avz_action == AVZ_ACTION_INITIALIZE || 8534 spa->spa_all_vdev_zaps != 0); 8535 8536 if (spa->spa_avz_action == AVZ_ACTION_REBUILD) { 8537 /* Make and build the new AVZ */ 8538 uint64_t new_avz = zap_create(spa->spa_meta_objset, 8539 DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); 8540 spa_avz_build(spa->spa_root_vdev, new_avz, tx); 8541 8542 /* Diff old AVZ with new one */ 8543 zap_cursor_t zc; 8544 zap_attribute_t za; 8545 8546 for (zap_cursor_init(&zc, spa->spa_meta_objset, 8547 spa->spa_all_vdev_zaps); 8548 zap_cursor_retrieve(&zc, &za) == 0; 8549 zap_cursor_advance(&zc)) { 8550 uint64_t vdzap = za.za_first_integer; 8551 if (zap_lookup_int(spa->spa_meta_objset, new_avz, 8552 vdzap) == ENOENT) { 8553 /* 8554 * ZAP is listed in old AVZ but not in new one; 8555 * destroy it 8556 */ 8557 VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap, 8558 tx)); 8559 } 8560 } 8561 8562 zap_cursor_fini(&zc); 8563 8564 /* Destroy the old AVZ */ 8565 VERIFY0(zap_destroy(spa->spa_meta_objset, 8566 spa->spa_all_vdev_zaps, tx)); 8567 8568 /* Replace the old AVZ in the dir obj with the new one */ 8569 VERIFY0(zap_update(spa->spa_meta_objset, 8570 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, 8571 sizeof (new_avz), 1, &new_avz, tx)); 8572 8573 spa->spa_all_vdev_zaps = new_avz; 8574 } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) { 8575 zap_cursor_t zc; 8576 zap_attribute_t za; 8577 8578 /* Walk through the AVZ and destroy all listed ZAPs */ 8579 for (zap_cursor_init(&zc, spa->spa_meta_objset, 8580 spa->spa_all_vdev_zaps); 8581 zap_cursor_retrieve(&zc, &za) == 0; 8582 zap_cursor_advance(&zc)) { 8583 uint64_t zap = za.za_first_integer; 8584 VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx)); 8585 } 8586 8587 zap_cursor_fini(&zc); 8588 8589 /* Destroy and unlink the AVZ itself */ 8590 VERIFY0(zap_destroy(spa->spa_meta_objset, 8591 spa->spa_all_vdev_zaps, tx)); 8592 VERIFY0(zap_remove(spa->spa_meta_objset, 8593 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx)); 8594 spa->spa_all_vdev_zaps = 0; 8595 } 8596 8597 if (spa->spa_all_vdev_zaps == 0) { 8598 spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset, 8599 DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, 8600 DMU_POOL_VDEV_ZAP_MAP, tx); 8601 } 8602 spa->spa_avz_action = AVZ_ACTION_NONE; 8603 8604 /* Create ZAPs for vdevs that don't have them. */ 8605 vdev_construct_zaps(spa->spa_root_vdev, tx); 8606 8607 config = spa_config_generate(spa, spa->spa_root_vdev, 8608 dmu_tx_get_txg(tx), B_FALSE); 8609 8610 /* 8611 * If we're upgrading the spa version then make sure that 8612 * the config object gets updated with the correct version. 8613 */ 8614 if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) 8615 fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 8616 spa->spa_uberblock.ub_version); 8617 8618 spa_config_exit(spa, SCL_STATE, FTAG); 8619 8620 nvlist_free(spa->spa_config_syncing); 8621 spa->spa_config_syncing = config; 8622 8623 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 8624 } 8625 8626 static void 8627 spa_sync_version(void *arg, dmu_tx_t *tx) 8628 { 8629 uint64_t *versionp = arg; 8630 uint64_t version = *versionp; 8631 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 8632 8633 /* 8634 * Setting the version is special cased when first creating the pool. 
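 *
 * Outside of pool creation this runs as a DSL sync task; the
 * registering side is roughly of the form below (a sketch only, with
 * illustrative rather than authoritative argument values):
 *
 *      uint64_t ver = SPA_VERSION;
 *      int error = dsl_sync_task(spa_name(spa), NULL, spa_sync_version,
 *          &ver, 6, ZFS_SPACE_CHECK_RESERVED);
 *
 * so the requested version has already been validated by the caller
 * before we assert on it here.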
8635 */ 8636 ASSERT(tx->tx_txg != TXG_INITIAL); 8637 8638 ASSERT(SPA_VERSION_IS_SUPPORTED(version)); 8639 ASSERT(version >= spa_version(spa)); 8640 8641 spa->spa_uberblock.ub_version = version; 8642 vdev_config_dirty(spa->spa_root_vdev); 8643 spa_history_log_internal(spa, "set", tx, "version=%lld", 8644 (longlong_t)version); 8645 } 8646 8647 /* 8648 * Set zpool properties. 8649 */ 8650 static void 8651 spa_sync_props(void *arg, dmu_tx_t *tx) 8652 { 8653 nvlist_t *nvp = arg; 8654 spa_t *spa = dmu_tx_pool(tx)->dp_spa; 8655 objset_t *mos = spa->spa_meta_objset; 8656 nvpair_t *elem = NULL; 8657 8658 mutex_enter(&spa->spa_props_lock); 8659 8660 while ((elem = nvlist_next_nvpair(nvp, elem))) { 8661 uint64_t intval; 8662 char *strval, *fname; 8663 zpool_prop_t prop; 8664 const char *propname; 8665 zprop_type_t proptype; 8666 spa_feature_t fid; 8667 8668 switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 8669 case ZPOOL_PROP_INVAL: 8670 /* 8671 * We checked this earlier in spa_prop_validate(). 8672 */ 8673 ASSERT(zpool_prop_feature(nvpair_name(elem))); 8674 8675 fname = strchr(nvpair_name(elem), '@') + 1; 8676 VERIFY0(zfeature_lookup_name(fname, &fid)); 8677 8678 spa_feature_enable(spa, fid, tx); 8679 spa_history_log_internal(spa, "set", tx, 8680 "%s=enabled", nvpair_name(elem)); 8681 break; 8682 8683 case ZPOOL_PROP_VERSION: 8684 intval = fnvpair_value_uint64(elem); 8685 /* 8686 * The version is synced separately before other 8687 * properties and should be correct by now. 8688 */ 8689 ASSERT3U(spa_version(spa), >=, intval); 8690 break; 8691 8692 case ZPOOL_PROP_ALTROOT: 8693 /* 8694 * 'altroot' is a non-persistent property. It should 8695 * have been set temporarily at creation or import time. 8696 */ 8697 ASSERT(spa->spa_root != NULL); 8698 break; 8699 8700 case ZPOOL_PROP_READONLY: 8701 case ZPOOL_PROP_CACHEFILE: 8702 /* 8703 * 'readonly' and 'cachefile' are also non-persistent 8704 * properties. 8705 */ 8706 break; 8707 case ZPOOL_PROP_COMMENT: 8708 strval = fnvpair_value_string(elem); 8709 if (spa->spa_comment != NULL) 8710 spa_strfree(spa->spa_comment); 8711 spa->spa_comment = spa_strdup(strval); 8712 /* 8713 * We need to dirty the configuration on all the vdevs 8714 * so that their labels get updated. We also need to 8715 * update the cache file to keep it in sync with the 8716 * MOS version. It's unnecessary to do this for pool 8717 * creation since the vdev's configuration has already 8718 * been dirtied. 8719 */ 8720 if (tx->tx_txg != TXG_INITIAL) { 8721 vdev_config_dirty(spa->spa_root_vdev); 8722 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 8723 } 8724 spa_history_log_internal(spa, "set", tx, 8725 "%s=%s", nvpair_name(elem), strval); 8726 break; 8727 case ZPOOL_PROP_COMPATIBILITY: 8728 strval = fnvpair_value_string(elem); 8729 if (spa->spa_compatibility != NULL) 8730 spa_strfree(spa->spa_compatibility); 8731 spa->spa_compatibility = spa_strdup(strval); 8732 /* 8733 * Dirty the configuration on vdevs as above. 8734 */ 8735 if (tx->tx_txg != TXG_INITIAL) { 8736 vdev_config_dirty(spa->spa_root_vdev); 8737 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 8738 } 8739 8740 spa_history_log_internal(spa, "set", tx, 8741 "%s=%s", nvpair_name(elem), strval); 8742 break; 8743 8744 default: 8745 /* 8746 * Set pool property values in the poolprops mos object. 
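 *
 * For a numeric property the zap_update() below stores one uint64
 * under the property's name, so the read side is a plain ZAP lookup,
 * roughly (a sketch; error handling omitted):
 *
 *      uint64_t val;
 *      int error = zap_lookup(mos, spa->spa_pool_props_object,
 *          zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), 8, 1, &val);
 *
 * (spa_prop_get() is the usual consumer and walks the whole object
 * with a ZAP cursor instead of looking up one name at a time.)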
8747 */ 8748 if (spa->spa_pool_props_object == 0) { 8749 spa->spa_pool_props_object = 8750 zap_create_link(mos, DMU_OT_POOL_PROPS, 8751 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, 8752 tx); 8753 } 8754 8755 /* normalize the property name */ 8756 propname = zpool_prop_to_name(prop); 8757 proptype = zpool_prop_get_type(prop); 8758 8759 if (nvpair_type(elem) == DATA_TYPE_STRING) { 8760 ASSERT(proptype == PROP_TYPE_STRING); 8761 strval = fnvpair_value_string(elem); 8762 VERIFY0(zap_update(mos, 8763 spa->spa_pool_props_object, propname, 8764 1, strlen(strval) + 1, strval, tx)); 8765 spa_history_log_internal(spa, "set", tx, 8766 "%s=%s", nvpair_name(elem), strval); 8767 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { 8768 intval = fnvpair_value_uint64(elem); 8769 8770 if (proptype == PROP_TYPE_INDEX) { 8771 const char *unused; 8772 VERIFY0(zpool_prop_index_to_string( 8773 prop, intval, &unused)); 8774 } 8775 VERIFY0(zap_update(mos, 8776 spa->spa_pool_props_object, propname, 8777 8, 1, &intval, tx)); 8778 spa_history_log_internal(spa, "set", tx, 8779 "%s=%lld", nvpair_name(elem), 8780 (longlong_t)intval); 8781 } else { 8782 ASSERT(0); /* not allowed */ 8783 } 8784 8785 switch (prop) { 8786 case ZPOOL_PROP_DELEGATION: 8787 spa->spa_delegation = intval; 8788 break; 8789 case ZPOOL_PROP_BOOTFS: 8790 spa->spa_bootfs = intval; 8791 break; 8792 case ZPOOL_PROP_FAILUREMODE: 8793 spa->spa_failmode = intval; 8794 break; 8795 case ZPOOL_PROP_AUTOTRIM: 8796 spa->spa_autotrim = intval; 8797 spa_async_request(spa, 8798 SPA_ASYNC_AUTOTRIM_RESTART); 8799 break; 8800 case ZPOOL_PROP_AUTOEXPAND: 8801 spa->spa_autoexpand = intval; 8802 if (tx->tx_txg != TXG_INITIAL) 8803 spa_async_request(spa, 8804 SPA_ASYNC_AUTOEXPAND); 8805 break; 8806 case ZPOOL_PROP_MULTIHOST: 8807 spa->spa_multihost = intval; 8808 break; 8809 default: 8810 break; 8811 } 8812 } 8813 8814 } 8815 8816 mutex_exit(&spa->spa_props_lock); 8817 } 8818 8819 /* 8820 * Perform one-time upgrade on-disk changes. spa_version() does not 8821 * reflect the new version this txg, so there must be no changes this 8822 * txg to anything that the upgrade code depends on after it executes. 8823 * Therefore this must be called after dsl_pool_sync() does the sync 8824 * tasks. 
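 *
 * Each one-time change below is gated on the pool version crossing a
 * threshold in this txg, i.e. a test of the general shape
 * (SPA_VERSION_FOO and do_upgrade() are placeholders):
 *
 *      if (spa->spa_ubsync.ub_version < SPA_VERSION_FOO &&
 *          spa->spa_uberblock.ub_version >= SPA_VERSION_FOO)
 *              do_upgrade(dp, tx);
 *
 * The last-synced version is still the old one while the in-core
 * uberblock already carries the new one, so each upgrade runs exactly
 * once.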
8825 */ 8826 static void 8827 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) 8828 { 8829 if (spa_sync_pass(spa) != 1) 8830 return; 8831 8832 dsl_pool_t *dp = spa->spa_dsl_pool; 8833 rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); 8834 8835 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 8836 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 8837 dsl_pool_create_origin(dp, tx); 8838 8839 /* Keeping the origin open increases spa_minref */ 8840 spa->spa_minref += 3; 8841 } 8842 8843 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 8844 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 8845 dsl_pool_upgrade_clones(dp, tx); 8846 } 8847 8848 if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && 8849 spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { 8850 dsl_pool_upgrade_dir_clones(dp, tx); 8851 8852 /* Keeping the freedir open increases spa_minref */ 8853 spa->spa_minref += 3; 8854 } 8855 8856 if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && 8857 spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 8858 spa_feature_create_zap_objects(spa, tx); 8859 } 8860 8861 /* 8862 * LZ4_COMPRESS feature's behaviour was changed to activate_on_enable 8863 * when possibility to use lz4 compression for metadata was added 8864 * Old pools that have this feature enabled must be upgraded to have 8865 * this feature active 8866 */ 8867 if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { 8868 boolean_t lz4_en = spa_feature_is_enabled(spa, 8869 SPA_FEATURE_LZ4_COMPRESS); 8870 boolean_t lz4_ac = spa_feature_is_active(spa, 8871 SPA_FEATURE_LZ4_COMPRESS); 8872 8873 if (lz4_en && !lz4_ac) 8874 spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); 8875 } 8876 8877 /* 8878 * If we haven't written the salt, do so now. Note that the 8879 * feature may not be activated yet, but that's fine since 8880 * the presence of this ZAP entry is backwards compatible. 8881 */ 8882 if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 8883 DMU_POOL_CHECKSUM_SALT) == ENOENT) { 8884 VERIFY0(zap_add(spa->spa_meta_objset, 8885 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1, 8886 sizeof (spa->spa_cksum_salt.zcs_bytes), 8887 spa->spa_cksum_salt.zcs_bytes, tx)); 8888 } 8889 8890 rrw_exit(&dp->dp_config_rwlock, FTAG); 8891 } 8892 8893 static void 8894 vdev_indirect_state_sync_verify(vdev_t *vd) 8895 { 8896 vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping; 8897 vdev_indirect_births_t *vib __maybe_unused = vd->vdev_indirect_births; 8898 8899 if (vd->vdev_ops == &vdev_indirect_ops) { 8900 ASSERT(vim != NULL); 8901 ASSERT(vib != NULL); 8902 } 8903 8904 uint64_t obsolete_sm_object = 0; 8905 ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); 8906 if (obsolete_sm_object != 0) { 8907 ASSERT(vd->vdev_obsolete_sm != NULL); 8908 ASSERT(vd->vdev_removing || 8909 vd->vdev_ops == &vdev_indirect_ops); 8910 ASSERT(vdev_indirect_mapping_num_entries(vim) > 0); 8911 ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0); 8912 ASSERT3U(obsolete_sm_object, ==, 8913 space_map_object(vd->vdev_obsolete_sm)); 8914 ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=, 8915 space_map_allocated(vd->vdev_obsolete_sm)); 8916 } 8917 ASSERT(vd->vdev_obsolete_segments != NULL); 8918 8919 /* 8920 * Since frees / remaps to an indirect vdev can only 8921 * happen in syncing context, the obsolete segments 8922 * tree must be empty when we start syncing. 
8923 */ 8924 ASSERT0(range_tree_space(vd->vdev_obsolete_segments)); 8925 } 8926 8927 /* 8928 * Set the top-level vdev's max queue depth. Evaluate each top-level's 8929 * async write queue depth in case it changed. The max queue depth will 8930 * not change in the middle of syncing out this txg. 8931 */ 8932 static void 8933 spa_sync_adjust_vdev_max_queue_depth(spa_t *spa) 8934 { 8935 ASSERT(spa_writeable(spa)); 8936 8937 vdev_t *rvd = spa->spa_root_vdev; 8938 uint32_t max_queue_depth = zfs_vdev_async_write_max_active * 8939 zfs_vdev_queue_depth_pct / 100; 8940 metaslab_class_t *normal = spa_normal_class(spa); 8941 metaslab_class_t *special = spa_special_class(spa); 8942 metaslab_class_t *dedup = spa_dedup_class(spa); 8943 8944 uint64_t slots_per_allocator = 0; 8945 for (int c = 0; c < rvd->vdev_children; c++) { 8946 vdev_t *tvd = rvd->vdev_child[c]; 8947 8948 metaslab_group_t *mg = tvd->vdev_mg; 8949 if (mg == NULL || !metaslab_group_initialized(mg)) 8950 continue; 8951 8952 metaslab_class_t *mc = mg->mg_class; 8953 if (mc != normal && mc != special && mc != dedup) 8954 continue; 8955 8956 /* 8957 * It is safe to do a lock-free check here because only async 8958 * allocations look at mg_max_alloc_queue_depth, and async 8959 * allocations all happen from spa_sync(). 8960 */ 8961 for (int i = 0; i < mg->mg_allocators; i++) { 8962 ASSERT0(zfs_refcount_count( 8963 &(mg->mg_allocator[i].mga_alloc_queue_depth))); 8964 } 8965 mg->mg_max_alloc_queue_depth = max_queue_depth; 8966 8967 for (int i = 0; i < mg->mg_allocators; i++) { 8968 mg->mg_allocator[i].mga_cur_max_alloc_queue_depth = 8969 zfs_vdev_def_queue_depth; 8970 } 8971 slots_per_allocator += zfs_vdev_def_queue_depth; 8972 } 8973 8974 for (int i = 0; i < spa->spa_alloc_count; i++) { 8975 ASSERT0(zfs_refcount_count(&normal->mc_allocator[i]. 8976 mca_alloc_slots)); 8977 ASSERT0(zfs_refcount_count(&special->mc_allocator[i]. 8978 mca_alloc_slots)); 8979 ASSERT0(zfs_refcount_count(&dedup->mc_allocator[i]. 
8980 mca_alloc_slots)); 8981 normal->mc_allocator[i].mca_alloc_max_slots = 8982 slots_per_allocator; 8983 special->mc_allocator[i].mca_alloc_max_slots = 8984 slots_per_allocator; 8985 dedup->mc_allocator[i].mca_alloc_max_slots = 8986 slots_per_allocator; 8987 } 8988 normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; 8989 special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; 8990 dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; 8991 } 8992 8993 static void 8994 spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx) 8995 { 8996 ASSERT(spa_writeable(spa)); 8997 8998 vdev_t *rvd = spa->spa_root_vdev; 8999 for (int c = 0; c < rvd->vdev_children; c++) { 9000 vdev_t *vd = rvd->vdev_child[c]; 9001 vdev_indirect_state_sync_verify(vd); 9002 9003 if (vdev_indirect_should_condense(vd)) { 9004 spa_condense_indirect_start_sync(vd, tx); 9005 break; 9006 } 9007 } 9008 } 9009 9010 static void 9011 spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) 9012 { 9013 objset_t *mos = spa->spa_meta_objset; 9014 dsl_pool_t *dp = spa->spa_dsl_pool; 9015 uint64_t txg = tx->tx_txg; 9016 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; 9017 9018 do { 9019 int pass = ++spa->spa_sync_pass; 9020 9021 spa_sync_config_object(spa, tx); 9022 spa_sync_aux_dev(spa, &spa->spa_spares, tx, 9023 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 9024 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 9025 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 9026 spa_errlog_sync(spa, txg); 9027 dsl_pool_sync(dp, txg); 9028 9029 if (pass < zfs_sync_pass_deferred_free || 9030 spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { 9031 /* 9032 * If the log space map feature is active we don't 9033 * care about deferred frees and the deferred bpobj 9034 * as the log space map should effectively have the 9035 * same results (i.e. appending only to one object). 9036 */ 9037 spa_sync_frees(spa, free_bpl, tx); 9038 } else { 9039 /* 9040 * We can not defer frees in pass 1, because 9041 * we sync the deferred frees later in pass 1. 9042 */ 9043 ASSERT3U(pass, >, 1); 9044 bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb, 9045 &spa->spa_deferred_bpobj, tx); 9046 } 9047 9048 ddt_sync(spa, txg); 9049 dsl_scan_sync(dp, tx); 9050 svr_sync(spa, tx); 9051 spa_sync_upgrades(spa, tx); 9052 9053 spa_flush_metaslabs(spa, tx); 9054 9055 vdev_t *vd = NULL; 9056 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 9057 != NULL) 9058 vdev_sync(vd, txg); 9059 9060 /* 9061 * Note: We need to check if the MOS is dirty because we could 9062 * have marked the MOS dirty without updating the uberblock 9063 * (e.g. if we have sync tasks but no dirty user data). We need 9064 * to check the uberblock's rootbp because it is updated if we 9065 * have synced out dirty data (though in this case the MOS will 9066 * most likely also be dirty due to second order effects, we 9067 * don't want to rely on that here). 9068 */ 9069 if (pass == 1 && 9070 spa->spa_uberblock.ub_rootbp.blk_birth < txg && 9071 !dmu_objset_is_dirty(mos, txg)) { 9072 /* 9073 * Nothing changed on the first pass, therefore this 9074 * TXG is a no-op. Avoid syncing deferred frees, so 9075 * that we can keep this TXG as a no-op. 
9076 */ 9077 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 9078 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 9079 ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); 9080 ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg)); 9081 break; 9082 } 9083 9084 spa_sync_deferred_frees(spa, tx); 9085 } while (dmu_objset_is_dirty(mos, txg)); 9086 } 9087 9088 /* 9089 * Rewrite the vdev configuration (which includes the uberblock) to 9090 * commit the transaction group. 9091 * 9092 * If there are no dirty vdevs, we sync the uberblock to a few random 9093 * top-level vdevs that are known to be visible in the config cache 9094 * (see spa_vdev_add() for a complete description). If there *are* dirty 9095 * vdevs, sync the uberblock to all vdevs. 9096 */ 9097 static void 9098 spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx) 9099 { 9100 vdev_t *rvd = spa->spa_root_vdev; 9101 uint64_t txg = tx->tx_txg; 9102 9103 for (;;) { 9104 int error = 0; 9105 9106 /* 9107 * We hold SCL_STATE to prevent vdev open/close/etc. 9108 * while we're attempting to write the vdev labels. 9109 */ 9110 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 9111 9112 if (list_is_empty(&spa->spa_config_dirty_list)) { 9113 vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; 9114 int svdcount = 0; 9115 int children = rvd->vdev_children; 9116 int c0 = random_in_range(children); 9117 9118 for (int c = 0; c < children; c++) { 9119 vdev_t *vd = 9120 rvd->vdev_child[(c0 + c) % children]; 9121 9122 /* Stop when revisiting the first vdev */ 9123 if (c > 0 && svd[0] == vd) 9124 break; 9125 9126 if (vd->vdev_ms_array == 0 || 9127 vd->vdev_islog || 9128 !vdev_is_concrete(vd)) 9129 continue; 9130 9131 svd[svdcount++] = vd; 9132 if (svdcount == SPA_SYNC_MIN_VDEVS) 9133 break; 9134 } 9135 error = vdev_config_sync(svd, svdcount, txg); 9136 } else { 9137 error = vdev_config_sync(rvd->vdev_child, 9138 rvd->vdev_children, txg); 9139 } 9140 9141 if (error == 0) 9142 spa->spa_last_synced_guid = rvd->vdev_guid; 9143 9144 spa_config_exit(spa, SCL_STATE, FTAG); 9145 9146 if (error == 0) 9147 break; 9148 zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR); 9149 zio_resume_wait(spa); 9150 } 9151 } 9152 9153 /* 9154 * Sync the specified transaction group. New blocks may be dirtied as 9155 * part of the process, so we iterate until it converges. 9156 */ 9157 void 9158 spa_sync(spa_t *spa, uint64_t txg) 9159 { 9160 vdev_t *vd = NULL; 9161 9162 VERIFY(spa_writeable(spa)); 9163 9164 /* 9165 * Wait for i/os issued in open context that need to complete 9166 * before this txg syncs. 9167 */ 9168 (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]); 9169 spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, 9170 ZIO_FLAG_CANFAIL); 9171 9172 /* 9173 * Lock out configuration changes. 9174 */ 9175 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 9176 9177 spa->spa_syncing_txg = txg; 9178 spa->spa_sync_pass = 0; 9179 9180 for (int i = 0; i < spa->spa_alloc_count; i++) { 9181 mutex_enter(&spa->spa_allocs[i].spaa_lock); 9182 VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree)); 9183 mutex_exit(&spa->spa_allocs[i].spaa_lock); 9184 } 9185 9186 /* 9187 * If there are any pending vdev state changes, convert them 9188 * into config changes that go out with this transaction group. 9189 */ 9190 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 9191 while (list_head(&spa->spa_state_dirty_list) != NULL) { 9192 /* 9193 * We need the write lock here because, for aux vdevs, 9194 * calling vdev_config_dirty() modifies sav_config. 
9195 * This is ugly and will become unnecessary when we 9196 * eliminate the aux vdev wart by integrating all vdevs 9197 * into the root vdev tree. 9198 */ 9199 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 9200 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 9201 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 9202 vdev_state_clean(vd); 9203 vdev_config_dirty(vd); 9204 } 9205 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 9206 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 9207 } 9208 spa_config_exit(spa, SCL_STATE, FTAG); 9209 9210 dsl_pool_t *dp = spa->spa_dsl_pool; 9211 dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); 9212 9213 spa->spa_sync_starttime = gethrtime(); 9214 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); 9215 spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, 9216 spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + 9217 NSEC_TO_TICK(spa->spa_deadman_synctime)); 9218 9219 /* 9220 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 9221 * set spa_deflate if we have no raid-z vdevs. 9222 */ 9223 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 9224 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 9225 vdev_t *rvd = spa->spa_root_vdev; 9226 9227 int i; 9228 for (i = 0; i < rvd->vdev_children; i++) { 9229 vd = rvd->vdev_child[i]; 9230 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 9231 break; 9232 } 9233 if (i == rvd->vdev_children) { 9234 spa->spa_deflate = TRUE; 9235 VERIFY0(zap_add(spa->spa_meta_objset, 9236 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 9237 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 9238 } 9239 } 9240 9241 spa_sync_adjust_vdev_max_queue_depth(spa); 9242 9243 spa_sync_condense_indirect(spa, tx); 9244 9245 spa_sync_iterate_to_convergence(spa, tx); 9246 9247 #ifdef ZFS_DEBUG 9248 if (!list_is_empty(&spa->spa_config_dirty_list)) { 9249 /* 9250 * Make sure that the number of ZAPs for all the vdevs matches 9251 * the number of ZAPs in the per-vdev ZAP list. This only gets 9252 * called if the config is dirty; otherwise there may be 9253 * outstanding AVZ operations that weren't completed in 9254 * spa_sync_config_object. 9255 */ 9256 uint64_t all_vdev_zap_entry_count; 9257 ASSERT0(zap_count(spa->spa_meta_objset, 9258 spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); 9259 ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, 9260 all_vdev_zap_entry_count); 9261 } 9262 #endif 9263 9264 if (spa->spa_vdev_removal != NULL) { 9265 ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]); 9266 } 9267 9268 spa_sync_rewrite_vdev_config(spa, tx); 9269 dmu_tx_commit(tx); 9270 9271 taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); 9272 spa->spa_deadman_tqid = 0; 9273 9274 /* 9275 * Clear the dirty config list. 9276 */ 9277 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 9278 vdev_config_clean(vd); 9279 9280 /* 9281 * Now that the new config has synced transactionally, 9282 * let it become visible to the config cache. 9283 */ 9284 if (spa->spa_config_syncing != NULL) { 9285 spa_config_set(spa, spa->spa_config_syncing); 9286 spa->spa_config_txg = txg; 9287 spa->spa_config_syncing = NULL; 9288 } 9289 9290 dsl_pool_sync_done(dp, txg); 9291 9292 for (int i = 0; i < spa->spa_alloc_count; i++) { 9293 mutex_enter(&spa->spa_allocs[i].spaa_lock); 9294 VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree)); 9295 mutex_exit(&spa->spa_allocs[i].spaa_lock); 9296 } 9297 9298 /* 9299 * Update usable space statistics. 
9300 */ 9301 while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 9302 != NULL) 9303 vdev_sync_done(vd, txg); 9304 9305 metaslab_class_evict_old(spa->spa_normal_class, txg); 9306 metaslab_class_evict_old(spa->spa_log_class, txg); 9307 9308 spa_sync_close_syncing_log_sm(spa); 9309 9310 spa_update_dspace(spa); 9311 9312 /* 9313 * It had better be the case that we didn't dirty anything 9314 * since vdev_config_sync(). 9315 */ 9316 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 9317 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 9318 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 9319 9320 while (zfs_pause_spa_sync) 9321 delay(1); 9322 9323 spa->spa_sync_pass = 0; 9324 9325 /* 9326 * Update the last synced uberblock here. We want to do this at 9327 * the end of spa_sync() so that consumers of spa_last_synced_txg() 9328 * will be guaranteed that all the processing associated with 9329 * that txg has been completed. 9330 */ 9331 spa->spa_ubsync = spa->spa_uberblock; 9332 spa_config_exit(spa, SCL_CONFIG, FTAG); 9333 9334 spa_handle_ignored_writes(spa); 9335 9336 /* 9337 * If any async tasks have been requested, kick them off. 9338 */ 9339 spa_async_dispatch(spa); 9340 } 9341 9342 /* 9343 * Sync all pools. We don't want to hold the namespace lock across these 9344 * operations, so we take a reference on the spa_t and drop the lock during the 9345 * sync. 9346 */ 9347 void 9348 spa_sync_allpools(void) 9349 { 9350 spa_t *spa = NULL; 9351 mutex_enter(&spa_namespace_lock); 9352 while ((spa = spa_next(spa)) != NULL) { 9353 if (spa_state(spa) != POOL_STATE_ACTIVE || 9354 !spa_writeable(spa) || spa_suspended(spa)) 9355 continue; 9356 spa_open_ref(spa, FTAG); 9357 mutex_exit(&spa_namespace_lock); 9358 txg_wait_synced(spa_get_dsl(spa), 0); 9359 mutex_enter(&spa_namespace_lock); 9360 spa_close(spa, FTAG); 9361 } 9362 mutex_exit(&spa_namespace_lock); 9363 } 9364 9365 /* 9366 * ========================================================================== 9367 * Miscellaneous routines 9368 * ========================================================================== 9369 */ 9370 9371 /* 9372 * Remove all pools in the system. 9373 */ 9374 void 9375 spa_evict_all(void) 9376 { 9377 spa_t *spa; 9378 9379 /* 9380 * Remove all cached state. All pools should be closed now, 9381 * so every spa in the AVL tree should be unreferenced. 9382 */ 9383 mutex_enter(&spa_namespace_lock); 9384 while ((spa = spa_next(NULL)) != NULL) { 9385 /* 9386 * Stop async tasks. The async thread may need to detach 9387 * a device that's been replaced, which requires grabbing 9388 * spa_namespace_lock, so we must drop it here. 
9389 */ 9390 spa_open_ref(spa, FTAG); 9391 mutex_exit(&spa_namespace_lock); 9392 spa_async_suspend(spa); 9393 mutex_enter(&spa_namespace_lock); 9394 spa_close(spa, FTAG); 9395 9396 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 9397 spa_unload(spa); 9398 spa_deactivate(spa); 9399 } 9400 spa_remove(spa); 9401 } 9402 mutex_exit(&spa_namespace_lock); 9403 } 9404 9405 vdev_t * 9406 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 9407 { 9408 vdev_t *vd; 9409 int i; 9410 9411 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 9412 return (vd); 9413 9414 if (aux) { 9415 for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 9416 vd = spa->spa_l2cache.sav_vdevs[i]; 9417 if (vd->vdev_guid == guid) 9418 return (vd); 9419 } 9420 9421 for (i = 0; i < spa->spa_spares.sav_count; i++) { 9422 vd = spa->spa_spares.sav_vdevs[i]; 9423 if (vd->vdev_guid == guid) 9424 return (vd); 9425 } 9426 } 9427 9428 return (NULL); 9429 } 9430 9431 void 9432 spa_upgrade(spa_t *spa, uint64_t version) 9433 { 9434 ASSERT(spa_writeable(spa)); 9435 9436 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 9437 9438 /* 9439 * This should only be called for a non-faulted pool, and since a 9440 * future version would result in an unopenable pool, this shouldn't be 9441 * possible. 9442 */ 9443 ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); 9444 ASSERT3U(version, >=, spa->spa_uberblock.ub_version); 9445 9446 spa->spa_uberblock.ub_version = version; 9447 vdev_config_dirty(spa->spa_root_vdev); 9448 9449 spa_config_exit(spa, SCL_ALL, FTAG); 9450 9451 txg_wait_synced(spa_get_dsl(spa), 0); 9452 } 9453 9454 boolean_t 9455 spa_has_spare(spa_t *spa, uint64_t guid) 9456 { 9457 int i; 9458 uint64_t spareguid; 9459 spa_aux_vdev_t *sav = &spa->spa_spares; 9460 9461 for (i = 0; i < sav->sav_count; i++) 9462 if (sav->sav_vdevs[i]->vdev_guid == guid) 9463 return (B_TRUE); 9464 9465 for (i = 0; i < sav->sav_npending; i++) { 9466 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 9467 &spareguid) == 0 && spareguid == guid) 9468 return (B_TRUE); 9469 } 9470 9471 return (B_FALSE); 9472 } 9473 9474 /* 9475 * Check if a pool has an active shared spare device. 9476 * Note: reference count of an active spare is 2, as a spare and as a replace 9477 */ 9478 static boolean_t 9479 spa_has_active_shared_spare(spa_t *spa) 9480 { 9481 int i, refcnt; 9482 uint64_t pool; 9483 spa_aux_vdev_t *sav = &spa->spa_spares; 9484 9485 for (i = 0; i < sav->sav_count; i++) { 9486 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 9487 &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 9488 refcnt > 2) 9489 return (B_TRUE); 9490 } 9491 9492 return (B_FALSE); 9493 } 9494 9495 uint64_t 9496 spa_total_metaslabs(spa_t *spa) 9497 { 9498 vdev_t *rvd = spa->spa_root_vdev; 9499 9500 uint64_t m = 0; 9501 for (uint64_t c = 0; c < rvd->vdev_children; c++) { 9502 vdev_t *vd = rvd->vdev_child[c]; 9503 if (!vdev_is_concrete(vd)) 9504 continue; 9505 m += vd->vdev_ms_count; 9506 } 9507 return (m); 9508 } 9509 9510 /* 9511 * Notify any waiting threads that some activity has switched from being in- 9512 * progress to not-in-progress so that the thread can wake up and determine 9513 * whether it is finished waiting. 9514 */ 9515 void 9516 spa_notify_waiters(spa_t *spa) 9517 { 9518 /* 9519 * Acquiring spa_activities_lock here prevents the cv_broadcast from 9520 * happening between the waiting thread's check and cv_wait. 
9521 */ 9522 mutex_enter(&spa->spa_activities_lock); 9523 cv_broadcast(&spa->spa_activities_cv); 9524 mutex_exit(&spa->spa_activities_lock); 9525 } 9526 9527 /* 9528 * Notify any waiting threads that the pool is exporting, and then block until 9529 * they are finished using the spa_t. 9530 */ 9531 void 9532 spa_wake_waiters(spa_t *spa) 9533 { 9534 mutex_enter(&spa->spa_activities_lock); 9535 spa->spa_waiters_cancel = B_TRUE; 9536 cv_broadcast(&spa->spa_activities_cv); 9537 while (spa->spa_waiters != 0) 9538 cv_wait(&spa->spa_waiters_cv, &spa->spa_activities_lock); 9539 spa->spa_waiters_cancel = B_FALSE; 9540 mutex_exit(&spa->spa_activities_lock); 9541 } 9542 9543 /* Whether the vdev or any of its descendants are being initialized/trimmed. */ 9544 static boolean_t 9545 spa_vdev_activity_in_progress_impl(vdev_t *vd, zpool_wait_activity_t activity) 9546 { 9547 spa_t *spa = vd->vdev_spa; 9548 9549 ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER)); 9550 ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); 9551 ASSERT(activity == ZPOOL_WAIT_INITIALIZE || 9552 activity == ZPOOL_WAIT_TRIM); 9553 9554 kmutex_t *lock = activity == ZPOOL_WAIT_INITIALIZE ? 9555 &vd->vdev_initialize_lock : &vd->vdev_trim_lock; 9556 9557 mutex_exit(&spa->spa_activities_lock); 9558 mutex_enter(lock); 9559 mutex_enter(&spa->spa_activities_lock); 9560 9561 boolean_t in_progress = (activity == ZPOOL_WAIT_INITIALIZE) ? 9562 (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) : 9563 (vd->vdev_trim_state == VDEV_TRIM_ACTIVE); 9564 mutex_exit(lock); 9565 9566 if (in_progress) 9567 return (B_TRUE); 9568 9569 for (int i = 0; i < vd->vdev_children; i++) { 9570 if (spa_vdev_activity_in_progress_impl(vd->vdev_child[i], 9571 activity)) 9572 return (B_TRUE); 9573 } 9574 9575 return (B_FALSE); 9576 } 9577 9578 /* 9579 * If use_guid is true, this checks whether the vdev specified by guid is 9580 * being initialized/trimmed. Otherwise, it checks whether any vdev in the pool 9581 * is being initialized/trimmed. The caller must hold the config lock and 9582 * spa_activities_lock. 9583 */ 9584 static int 9585 spa_vdev_activity_in_progress(spa_t *spa, boolean_t use_guid, uint64_t guid, 9586 zpool_wait_activity_t activity, boolean_t *in_progress) 9587 { 9588 mutex_exit(&spa->spa_activities_lock); 9589 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 9590 mutex_enter(&spa->spa_activities_lock); 9591 9592 vdev_t *vd; 9593 if (use_guid) { 9594 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 9595 if (vd == NULL || !vd->vdev_ops->vdev_op_leaf) { 9596 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 9597 return (EINVAL); 9598 } 9599 } else { 9600 vd = spa->spa_root_vdev; 9601 } 9602 9603 *in_progress = spa_vdev_activity_in_progress_impl(vd, activity); 9604 9605 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 9606 return (0); 9607 } 9608 9609 /* 9610 * Locking for waiting threads 9611 * --------------------------- 9612 * 9613 * Waiting threads need a way to check whether a given activity is in progress, 9614 * and then, if it is, wait for it to complete. Each activity will have some 9615 * in-memory representation of the relevant on-disk state which can be used to 9616 * determine whether or not the activity is in progress. The in-memory state and 9617 * the locking used to protect it will be different for each activity, and may 9618 * not be suitable for use with a cvar (e.g., some state is protected by the 9619 * config lock). 
To allow waiting threads to wait without any races, another 9620 * lock, spa_activities_lock, is used. 9621 * 9622 * When the state is checked, both the activity-specific lock (if there is one) 9623 * and spa_activities_lock are held. In some cases, the activity-specific lock 9624 * is acquired explicitly (e.g. the config lock). In others, the locking is 9625 * internal to some check (e.g. bpobj_is_empty). After checking, the waiting 9626 * thread releases the activity-specific lock and, if the activity is in 9627 * progress, then cv_waits using spa_activities_lock. 9628 * 9629 * The waiting thread is woken when another thread, one completing some 9630 * activity, updates the state of the activity and then calls 9631 * spa_notify_waiters, which will cv_broadcast. This 'completing' thread only 9632 * needs to hold its activity-specific lock when updating the state, and this 9633 * lock can (but doesn't have to) be dropped before calling spa_notify_waiters. 9634 * 9635 * Because spa_notify_waiters acquires spa_activities_lock before broadcasting, 9636 * and because it is held when the waiting thread checks the state of the 9637 * activity, it can never be the case that the completing thread both updates 9638 * the activity state and cv_broadcasts in between the waiting thread's check 9639 * and cv_wait. Thus, a waiting thread can never miss a wakeup. 9640 * 9641 * In order to prevent deadlock, when the waiting thread does its check, in some 9642 * cases it will temporarily drop spa_activities_lock in order to acquire the 9643 * activity-specific lock. The order in which spa_activities_lock and the 9644 * activity specific lock are acquired in the waiting thread is determined by 9645 * the order in which they are acquired in the completing thread; if the 9646 * completing thread calls spa_notify_waiters with the activity-specific lock 9647 * held, then the waiting thread must also acquire the activity-specific lock 9648 * first. 
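 *
 * As a minimal illustrative sketch of this pattern (the real code paths are
 * spa_activity_in_progress() below and spa_notify_waiters() above; the names
 * refer to the spa_t members used in this file):
 *
 *	waiting thread:
 *		mutex_enter(&spa->spa_activities_lock);
 *		while (<activity in progress> && !spa->spa_waiters_cancel)
 *			cv_wait(&spa->spa_activities_cv,
 *			    &spa->spa_activities_lock);
 *		mutex_exit(&spa->spa_activities_lock);
 *
 *	completing thread:
 *		<update activity state, holding its activity-specific lock>
 *		spa_notify_waiters(spa);	(broadcasts spa_activities_cv
 *						while holding
 *						spa_activities_lock)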
9649 */ 9650 9651 static int 9652 spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, 9653 boolean_t use_tag, uint64_t tag, boolean_t *in_progress) 9654 { 9655 int error = 0; 9656 9657 ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); 9658 9659 switch (activity) { 9660 case ZPOOL_WAIT_CKPT_DISCARD: 9661 *in_progress = 9662 (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT) && 9663 zap_contains(spa_meta_objset(spa), 9664 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT) == 9665 ENOENT); 9666 break; 9667 case ZPOOL_WAIT_FREE: 9668 *in_progress = ((spa_version(spa) >= SPA_VERSION_DEADLISTS && 9669 !bpobj_is_empty(&spa->spa_dsl_pool->dp_free_bpobj)) || 9670 spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY) || 9671 spa_livelist_delete_check(spa)); 9672 break; 9673 case ZPOOL_WAIT_INITIALIZE: 9674 case ZPOOL_WAIT_TRIM: 9675 error = spa_vdev_activity_in_progress(spa, use_tag, tag, 9676 activity, in_progress); 9677 break; 9678 case ZPOOL_WAIT_REPLACE: 9679 mutex_exit(&spa->spa_activities_lock); 9680 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 9681 mutex_enter(&spa->spa_activities_lock); 9682 9683 *in_progress = vdev_replace_in_progress(spa->spa_root_vdev); 9684 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 9685 break; 9686 case ZPOOL_WAIT_REMOVE: 9687 *in_progress = (spa->spa_removing_phys.sr_state == 9688 DSS_SCANNING); 9689 break; 9690 case ZPOOL_WAIT_RESILVER: 9691 if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev))) 9692 break; 9693 fallthrough; 9694 case ZPOOL_WAIT_SCRUB: 9695 { 9696 boolean_t scanning, paused, is_scrub; 9697 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; 9698 9699 is_scrub = (scn->scn_phys.scn_func == POOL_SCAN_SCRUB); 9700 scanning = (scn->scn_phys.scn_state == DSS_SCANNING); 9701 paused = dsl_scan_is_paused_scrub(scn); 9702 *in_progress = (scanning && !paused && 9703 is_scrub == (activity == ZPOOL_WAIT_SCRUB)); 9704 break; 9705 } 9706 default: 9707 panic("unrecognized value for activity %d", activity); 9708 } 9709 9710 return (error); 9711 } 9712 9713 static int 9714 spa_wait_common(const char *pool, zpool_wait_activity_t activity, 9715 boolean_t use_tag, uint64_t tag, boolean_t *waited) 9716 { 9717 /* 9718 * The tag is used to distinguish between instances of an activity. 9719 * 'initialize' and 'trim' are the only activities that we use this for. 9720 * The other activities can only have a single instance in progress in a 9721 * pool at one time, making the tag unnecessary. 9722 * 9723 * There can be multiple devices being replaced at once, but since they 9724 * all finish once resilvering finishes, we don't bother keeping track 9725 * of them individually, we just wait for them all to finish. 9726 */ 9727 if (use_tag && activity != ZPOOL_WAIT_INITIALIZE && 9728 activity != ZPOOL_WAIT_TRIM) 9729 return (EINVAL); 9730 9731 if (activity < 0 || activity >= ZPOOL_WAIT_NUM_ACTIVITIES) 9732 return (EINVAL); 9733 9734 spa_t *spa; 9735 int error = spa_open(pool, &spa, FTAG); 9736 if (error != 0) 9737 return (error); 9738 9739 /* 9740 * Increment the spa's waiter count so that we can call spa_close and 9741 * still ensure that the spa_t doesn't get freed before this thread is 9742 * finished with it when the pool is exported. We want to call spa_close 9743 * before we start waiting because otherwise the additional ref would 9744 * prevent the pool from being exported or destroyed throughout the 9745 * potentially long wait. 
9746 */ 9747 mutex_enter(&spa->spa_activities_lock); 9748 spa->spa_waiters++; 9749 spa_close(spa, FTAG); 9750 9751 *waited = B_FALSE; 9752 for (;;) { 9753 boolean_t in_progress; 9754 error = spa_activity_in_progress(spa, activity, use_tag, tag, 9755 &in_progress); 9756 9757 if (error || !in_progress || spa->spa_waiters_cancel) 9758 break; 9759 9760 *waited = B_TRUE; 9761 9762 if (cv_wait_sig(&spa->spa_activities_cv, 9763 &spa->spa_activities_lock) == 0) { 9764 error = EINTR; 9765 break; 9766 } 9767 } 9768 9769 spa->spa_waiters--; 9770 cv_signal(&spa->spa_waiters_cv); 9771 mutex_exit(&spa->spa_activities_lock); 9772 9773 return (error); 9774 } 9775 9776 /* 9777 * Wait for a particular instance of the specified activity to complete, where 9778 * the instance is identified by 'tag'. 9779 */ 9780 int 9781 spa_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag, 9782 boolean_t *waited) 9783 { 9784 return (spa_wait_common(pool, activity, B_TRUE, tag, waited)); 9785 } 9786 9787 /* 9788 * Wait for all instances of the specified activity to complete. 9789 */ 9790 int 9791 spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited) 9792 { 9793 9794 return (spa_wait_common(pool, activity, B_FALSE, 0, waited)); 9795 } 9796 9797 sysevent_t * 9798 spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) 9799 { 9800 sysevent_t *ev = NULL; 9801 #ifdef _KERNEL 9802 nvlist_t *resource; 9803 9804 resource = zfs_event_create(spa, vd, FM_SYSEVENT_CLASS, name, hist_nvl); 9805 if (resource) { 9806 ev = kmem_alloc(sizeof (sysevent_t), KM_SLEEP); 9807 ev->resource = resource; 9808 } 9809 #endif 9810 return (ev); 9811 } 9812 9813 void 9814 spa_event_post(sysevent_t *ev) 9815 { 9816 #ifdef _KERNEL 9817 if (ev) { 9818 zfs_zevent_post(ev->resource, NULL, zfs_zevent_post_cb); 9819 kmem_free(ev, sizeof (*ev)); 9820 } 9821 #endif 9822 } 9823 9824 /* 9825 * Post a zevent corresponding to the given sysevent. The 'name' must be one 9826 * of the event definitions in sys/sysevent/eventdefs.h. The payload will be 9827 * filled in from the spa and (optionally) the vdev. This doesn't do anything 9828 * in the userland libzpool, as we don't want consumers to misinterpret ztest 9829 * or zdb as real changes.
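 *
 * Illustrative call sites (the event name constants come from
 * sys/sysevent/eventdefs.h; these two are examples only, not a complete list):
 *
 *	spa_event_notify(spa, NULL, NULL, ESC_ZFS_CONFIG_SYNC);
 *	spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE);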
9830 */ 9831 void 9832 spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) 9833 { 9834 spa_event_post(spa_event_create(spa, vd, hist_nvl, name)); 9835 } 9836 9837 /* state manipulation functions */ 9838 EXPORT_SYMBOL(spa_open); 9839 EXPORT_SYMBOL(spa_open_rewind); 9840 EXPORT_SYMBOL(spa_get_stats); 9841 EXPORT_SYMBOL(spa_create); 9842 EXPORT_SYMBOL(spa_import); 9843 EXPORT_SYMBOL(spa_tryimport); 9844 EXPORT_SYMBOL(spa_destroy); 9845 EXPORT_SYMBOL(spa_export); 9846 EXPORT_SYMBOL(spa_reset); 9847 EXPORT_SYMBOL(spa_async_request); 9848 EXPORT_SYMBOL(spa_async_suspend); 9849 EXPORT_SYMBOL(spa_async_resume); 9850 EXPORT_SYMBOL(spa_inject_addref); 9851 EXPORT_SYMBOL(spa_inject_delref); 9852 EXPORT_SYMBOL(spa_scan_stat_init); 9853 EXPORT_SYMBOL(spa_scan_get_stats); 9854 9855 /* device manipulation */ 9856 EXPORT_SYMBOL(spa_vdev_add); 9857 EXPORT_SYMBOL(spa_vdev_attach); 9858 EXPORT_SYMBOL(spa_vdev_detach); 9859 EXPORT_SYMBOL(spa_vdev_setpath); 9860 EXPORT_SYMBOL(spa_vdev_setfru); 9861 EXPORT_SYMBOL(spa_vdev_split_mirror); 9862 9863 /* spare state (which is global across all pools) */ 9864 EXPORT_SYMBOL(spa_spare_add); 9865 EXPORT_SYMBOL(spa_spare_remove); 9866 EXPORT_SYMBOL(spa_spare_exists); 9867 EXPORT_SYMBOL(spa_spare_activate); 9868 9869 /* L2ARC state (which is global across all pools) */ 9870 EXPORT_SYMBOL(spa_l2cache_add); 9871 EXPORT_SYMBOL(spa_l2cache_remove); 9872 EXPORT_SYMBOL(spa_l2cache_exists); 9873 EXPORT_SYMBOL(spa_l2cache_activate); 9874 EXPORT_SYMBOL(spa_l2cache_drop); 9875 9876 /* scanning */ 9877 EXPORT_SYMBOL(spa_scan); 9878 EXPORT_SYMBOL(spa_scan_stop); 9879 9880 /* spa syncing */ 9881 EXPORT_SYMBOL(spa_sync); /* only for DMU use */ 9882 EXPORT_SYMBOL(spa_sync_allpools); 9883 9884 /* properties */ 9885 EXPORT_SYMBOL(spa_prop_set); 9886 EXPORT_SYMBOL(spa_prop_get); 9887 EXPORT_SYMBOL(spa_prop_clear_bootfs); 9888 9889 /* asynchronous event notification */ 9890 EXPORT_SYMBOL(spa_event_notify); 9891 9892 /* BEGIN CSTYLED */ 9893 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, INT, ZMOD_RW, 9894 "log2 fraction of arc that can be used by inflight I/Os when " 9895 "verifying pool during import"); 9896 9897 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_metadata, INT, ZMOD_RW, 9898 "Set to traverse metadata on pool import"); 9899 9900 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW, 9901 "Set to traverse data on pool import"); 9902 9903 ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW, 9904 "Print vdev tree to zfs_dbgmsg during pool import"); 9905 9906 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD, 9907 "Percentage of CPUs to run an IO worker thread"); 9908 9909 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RD, 9910 "Number of threads per IO worker taskqueue"); 9911 9912 ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, ULONG, ZMOD_RW, 9913 "Allow importing pool with up to this number of missing top-level " 9914 "vdevs (in read-only mode)"); 9915 9916 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT, ZMOD_RW, 9917 "Set the livelist condense zthr to pause"); 9918 9919 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT, ZMOD_RW, 9920 "Set the livelist condense synctask to pause"); 9921 9922 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_cancel, INT, ZMOD_RW, 9923 "Whether livelist condensing was canceled in the synctask"); 9924 9925 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel, INT, ZMOD_RW, 9926
"Whether livelist condensing was canceled in the zthr function"); 9927 9928 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, ZMOD_RW, 9929 "Whether extra ALLOC blkptrs were added to a livelist entry while it " 9930 "was being condensed"); 9931 /* END CSTYLED */ 9932