xref: /freebsd/sys/contrib/openzfs/module/zfs/spa.c (revision b670c9bafc0e31c7609969bf374b2e80bdc00211)
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or https://opensource.org/licenses/CDDL-1.0.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
25  * Copyright (c) 2011, 2024 by Delphix. All rights reserved.
26  * Copyright (c) 2018, Nexenta Systems, Inc.  All rights reserved.
27  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
28  * Copyright 2013 Saso Kiselkov. All rights reserved.
29  * Copyright (c) 2014 Integros [integros.com]
30  * Copyright 2016 Toomas Soome <tsoome@me.com>
31  * Copyright (c) 2016 Actifio, Inc. All rights reserved.
32  * Copyright 2018 Joyent, Inc.
33  * Copyright (c) 2017, 2019, Datto Inc. All rights reserved.
34  * Copyright 2017 Joyent, Inc.
35  * Copyright (c) 2017, Intel Corporation.
36  * Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
37  * Copyright (c) 2023 Hewlett Packard Enterprise Development LP.
38  * Copyright (c) 2023, 2024, Klara Inc.
39  */
40 
41 /*
42  * SPA: Storage Pool Allocator
43  *
44  * This file contains all the routines used when modifying on-disk SPA state.
45  * This includes opening, importing, destroying, exporting a pool, and syncing a
46  * pool.
47  */
48 
49 #include <sys/zfs_context.h>
50 #include <sys/fm/fs/zfs.h>
51 #include <sys/spa_impl.h>
52 #include <sys/zio.h>
53 #include <sys/zio_checksum.h>
54 #include <sys/dmu.h>
55 #include <sys/dmu_tx.h>
56 #include <sys/zap.h>
57 #include <sys/zil.h>
58 #include <sys/brt.h>
59 #include <sys/ddt.h>
60 #include <sys/vdev_impl.h>
61 #include <sys/vdev_removal.h>
62 #include <sys/vdev_indirect_mapping.h>
63 #include <sys/vdev_indirect_births.h>
64 #include <sys/vdev_initialize.h>
65 #include <sys/vdev_rebuild.h>
66 #include <sys/vdev_trim.h>
67 #include <sys/vdev_disk.h>
68 #include <sys/vdev_raidz.h>
69 #include <sys/vdev_draid.h>
70 #include <sys/metaslab.h>
71 #include <sys/metaslab_impl.h>
72 #include <sys/mmp.h>
73 #include <sys/uberblock_impl.h>
74 #include <sys/txg.h>
75 #include <sys/avl.h>
76 #include <sys/bpobj.h>
77 #include <sys/dmu_traverse.h>
78 #include <sys/dmu_objset.h>
79 #include <sys/unique.h>
80 #include <sys/dsl_pool.h>
81 #include <sys/dsl_dataset.h>
82 #include <sys/dsl_dir.h>
83 #include <sys/dsl_prop.h>
84 #include <sys/dsl_synctask.h>
85 #include <sys/fs/zfs.h>
86 #include <sys/arc.h>
87 #include <sys/callb.h>
88 #include <sys/systeminfo.h>
89 #include <sys/zfs_ioctl.h>
90 #include <sys/dsl_scan.h>
91 #include <sys/zfeature.h>
92 #include <sys/dsl_destroy.h>
93 #include <sys/zvol.h>
94 
95 #ifdef	_KERNEL
96 #include <sys/fm/protocol.h>
97 #include <sys/fm/util.h>
98 #include <sys/callb.h>
99 #include <sys/zone.h>
100 #include <sys/vmsystm.h>
101 #endif	/* _KERNEL */
102 
103 #include "zfs_prop.h"
104 #include "zfs_comutil.h"
105 #include <cityhash.h>
106 
107 /*
108  * spa_thread() existed on Illumos as a parent thread for the various worker
109  * threads that actually run the pool, as a way to both reference the entire
110  * pool work as a single object, and to share properties like scheduling
111  * options. It has not yet been adapted to Linux or FreeBSD. This define is
112  * used to mark related parts of the code to make things easier for the reader,
113  * and to compile this code out. It can be removed when someone implements it,
114  * moves it to some Illumos-specific place, or removes it entirely.
115  */
116 #undef HAVE_SPA_THREAD
117 
118 /*
119  * The "System Duty Cycle" scheduling class is an Illumos feature to help
120  * prevent CPU-intensive kernel threads from affecting latency on interactive
121  * threads. It doesn't exist on Linux or FreeBSD, so the supporting code is
122  * gated behind a define. On Illumos SDC depends on spa_thread(), but
123  * spa_thread() also has other uses, so this is a separate define.
124  */
125 #undef HAVE_SYSDC
126 
127 /*
128  * The interval, in seconds, at which failed configuration cache file writes
129  * should be retried.
130  */
131 int zfs_ccw_retry_interval = 300;
132 
133 typedef enum zti_modes {
134 	ZTI_MODE_FIXED,			/* value is # of threads (min 1) */
135 	ZTI_MODE_SCALE,			/* Taskqs scale with CPUs. */
136 	ZTI_MODE_SYNC,			/* sync thread assigned */
137 	ZTI_MODE_NULL,			/* don't create a taskq */
138 	ZTI_NMODES
139 } zti_modes_t;
140 
141 #define	ZTI_P(n, q)	{ ZTI_MODE_FIXED, (n), (q) }
142 #define	ZTI_PCT(n)	{ ZTI_MODE_ONLINE_PERCENT, (n), 1 }
143 #define	ZTI_SCALE	{ ZTI_MODE_SCALE, 0, 1 }
144 #define	ZTI_SYNC	{ ZTI_MODE_SYNC, 0, 1 }
145 #define	ZTI_NULL	{ ZTI_MODE_NULL, 0, 0 }
146 
147 #define	ZTI_N(n)	ZTI_P(n, 1)
148 #define	ZTI_ONE		ZTI_N(1)
149 
150 typedef struct zio_taskq_info {
151 	zti_modes_t zti_mode;
152 	uint_t zti_value;
153 	uint_t zti_count;
154 } zio_taskq_info_t;
155 
156 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
157 	"iss", "iss_h", "int", "int_h"
158 };
159 
160 /*
161  * This table defines the taskq settings for each ZFS I/O type. When
162  * initializing a pool, we use this table to create an appropriately sized
163  * taskq. Some operations are low volume and therefore have a small, static
164  * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
165  * macros. Other operations process a large amount of data; the ZTI_SCALE
166  * macro causes us to create a taskq oriented for throughput. Some operations
167  * are so high frequency and short-lived that the taskq itself can become a
168  * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
169  * additional degree of parallelism specified by the number of threads per-
170  * taskq and the number of taskqs; when dispatching an event in this case, the
171  * particular taskq is chosen at random. ZTI_SCALE uses a number of taskqs
172  * that scales with the number of CPUs.
173  *
174  * The different taskq priorities are to handle the different contexts (issue
175  * and interrupt) and then to reserve threads for high priority I/Os that
176  * need to be handled with minimum delay.  Illumos taskq has an unfair
177  * TQ_FRONT implementation, so separate high-priority threads are used there.
178  */
179 static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
180 	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
181 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* NULL */
182 	{ ZTI_N(8),	ZTI_NULL,	ZTI_SCALE,	ZTI_NULL }, /* READ */
183 #ifdef illumos
184 	{ ZTI_SYNC,	ZTI_N(5),	ZTI_SCALE,	ZTI_N(5) }, /* WRITE */
185 #else
186 	{ ZTI_SYNC,	ZTI_NULL,	ZTI_SCALE,	ZTI_NULL }, /* WRITE */
187 #endif
188 	{ ZTI_SCALE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FREE */
189 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* CLAIM */
190 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FLUSH */
191 	{ ZTI_N(4),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* TRIM */
192 };
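
/*
 * Illustrative sketch (not part of the table above): a hypothetical row
 * giving READ two fixed issue taskqs of eight threads each would be
 * written with the ZTI_P(#, #) macro as
 *
 *	{ ZTI_P(8, 2),	ZTI_NULL,	ZTI_SCALE,	ZTI_NULL },
 *
 * ZTI_P(8, 2) expands to { ZTI_MODE_FIXED, 8, 2 }, i.e. zti_value = 8
 * threads per taskq and zti_count = 2 taskqs, with the taskq for each
 * dispatch chosen at random as described above.
 */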
193 
194 static void spa_sync_version(void *arg, dmu_tx_t *tx);
195 static void spa_sync_props(void *arg, dmu_tx_t *tx);
196 static boolean_t spa_has_active_shared_spare(spa_t *spa);
197 static int spa_load_impl(spa_t *spa, spa_import_type_t type,
198     const char **ereport);
199 static void spa_vdev_resilver_done(spa_t *spa);
200 
201 /*
202  * Percentage of all CPUs that can be used by the metaslab preload taskq.
203  */
204 static uint_t metaslab_preload_pct = 50;
205 
206 static uint_t	zio_taskq_batch_pct = 80;	  /* 1 thread per cpu in pset */
207 static uint_t	zio_taskq_batch_tpq;		  /* threads per taskq */
208 
209 #ifdef HAVE_SYSDC
210 static const boolean_t	zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
211 static const uint_t	zio_taskq_basedc = 80;	  /* base duty cycle */
212 #endif
213 
214 #ifdef HAVE_SPA_THREAD
215 static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */
216 #endif
217 
218 static uint_t	zio_taskq_write_tpq = 16;
219 
220 /*
221  * Report any spa_load_verify errors found, but do not fail spa_load.
222  * This is used by zdb to analyze non-idle pools.
223  */
224 boolean_t	spa_load_verify_dryrun = B_FALSE;
225 
226 /*
227  * Allow reading spacemaps on a readonly import (spa_mode == SPA_MODE_READ).
228  * This is used by zdb for spacemap verification.
229  */
230 boolean_t	spa_mode_readable_spacemaps = B_FALSE;
231 
232 /*
233  * This (illegal) pool name is used when temporarily importing a spa_t in order
234  * to get the vdev stats associated with the imported devices.
235  */
236 #define	TRYIMPORT_NAME	"$import"
237 
238 /*
239  * For debugging purposes: print out vdev tree during pool import.
240  */
241 static int		spa_load_print_vdev_tree = B_FALSE;
242 
243 /*
244  * A non-zero value for zfs_max_missing_tvds means that we allow importing
245  * pools with missing top-level vdevs. This is strictly intended for advanced
246  * pool recovery cases since missing data is almost inevitable. Pools with
247  * missing devices can only be imported read-only for safety reasons, and their
248  * fail-mode will be automatically set to "continue".
249  *
250  * With 1 missing vdev we should be able to import the pool and mount all
251  * datasets. User data that was not modified after the missing device has been
252  * added should be recoverable. This means that snapshots created prior to the
253  * addition of that device should be completely intact.
254  *
255  * With 2 missing vdevs, some datasets may fail to mount since there are
256  * dataset statistics that are stored as regular metadata. Some data might be
257  * recoverable if those vdevs were added recently.
258  *
259  * With 3 or more missing vdevs, the pool is severely damaged and MOS entries
260  * may be missing entirely. Chances of data recovery are very low. Note that
261  * there are also risks of performing an inadvertent rewind as we might be
262  * missing all the vdevs with the latest uberblocks.
263  */
264 uint64_t	zfs_max_missing_tvds = 0;
265 
266 /*
267  * The parameters below are similar to zfs_max_missing_tvds but are only
268  * intended for a preliminary open of the pool with an untrusted config which
269  * might be incomplete or out-dated.
270  *
271  * We are more tolerant for pools opened from a cachefile since we could have
272  * an out-dated cachefile where a device removal was not registered.
273  * We could have set the limit arbitrarily high but in the case where devices
274  * are really missing we would want to return the proper error codes; we chose
275  * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available
276  * and we get a chance to retrieve the trusted config.
277  */
278 uint64_t	zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1;
279 
280 /*
281  * In the case where config was assembled by scanning device paths (/dev/dsks
282  * by default) we are less tolerant since all the existing devices should have
283  * been detected and we want spa_load to return the right error codes.
284  */
285 uint64_t	zfs_max_missing_tvds_scan = 0;
286 
287 /*
288  * Debugging aid that pauses spa_sync() towards the end.
289  */
290 static const boolean_t	zfs_pause_spa_sync = B_FALSE;
291 
292 /*
293  * Variables to indicate the livelist condense zthr func should wait at certain
294  * points for the livelist to be removed - used to test condense/destroy races
295  */
296 static int zfs_livelist_condense_zthr_pause = 0;
297 static int zfs_livelist_condense_sync_pause = 0;
298 
299 /*
300  * Variables to track whether or not condense cancellation has been
301  * triggered in testing.
302  */
303 static int zfs_livelist_condense_sync_cancel = 0;
304 static int zfs_livelist_condense_zthr_cancel = 0;
305 
306 /*
307  * Variable to track whether or not extra ALLOC blkptrs were added to a
308  * livelist entry while it was being condensed (caused by the way we track
309  * remapped blkptrs in dbuf_remap_impl)
310  */
311 static int zfs_livelist_condense_new_alloc = 0;
312 
313 /*
314  * ==========================================================================
315  * SPA properties routines
316  * ==========================================================================
317  */
318 
319 /*
320  * Add a (source=src, propname=propval) list to an nvlist.
321  */
322 static void
323 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, const char *strval,
324     uint64_t intval, zprop_source_t src)
325 {
326 	const char *propname = zpool_prop_to_name(prop);
327 	nvlist_t *propval;
328 
329 	propval = fnvlist_alloc();
330 	fnvlist_add_uint64(propval, ZPROP_SOURCE, src);
331 
332 	if (strval != NULL)
333 		fnvlist_add_string(propval, ZPROP_VALUE, strval);
334 	else
335 		fnvlist_add_uint64(propval, ZPROP_VALUE, intval);
336 
337 	fnvlist_add_nvlist(nvl, propname, propval);
338 	nvlist_free(propval);
339 }
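
/*
 * For example (a sketch of the resulting layout, not additional code):
 * spa_prop_add_list(nv, ZPOOL_PROP_CAPACITY, NULL, 42, ZPROP_SRC_NONE)
 * leaves nv containing
 *
 *	"capacity" -> { "source" = ZPROP_SRC_NONE, "value" = 42 }
 *
 * where "capacity" comes from zpool_prop_to_name() and the inner names
 * are the ZPROP_SOURCE and ZPROP_VALUE keys used above.
 */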
340 
341 static int
342 spa_prop_add(spa_t *spa, const char *propname, nvlist_t *outnvl)
343 {
344 	zpool_prop_t prop = zpool_name_to_prop(propname);
345 	zprop_source_t src = ZPROP_SRC_NONE;
346 	uint64_t intval;
347 	int err;
348 
349 	/*
350 	 * NB: Not all property lookups via this API require the
351 	 * spa props lock, so those that do must grab it explicitly here.
352 	 */
353 	switch (prop) {
354 	case ZPOOL_PROP_DEDUPCACHED:
355 		err = ddt_get_pool_dedup_cached(spa, &intval);
356 		if (err != 0)
357 			return (SET_ERROR(err));
358 		break;
359 	default:
360 		return (SET_ERROR(EINVAL));
361 	}
362 
363 	spa_prop_add_list(outnvl, prop, NULL, intval, src);
364 
365 	return (0);
366 }
367 
368 int
369 spa_prop_get_nvlist(spa_t *spa, char **props, unsigned int n_props,
370     nvlist_t *outnvl)
371 {
372 	int err = 0;
373 
374 	if (props == NULL)
375 		return (0);
376 
377 	for (unsigned int i = 0; i < n_props && err == 0; i++) {
378 		err = spa_prop_add(spa, props[i], outnvl);
379 	}
380 
381 	return (err);
382 }
383 
384 /*
385  * Add a user property (source=src, propname=propval) to an nvlist.
386  */
387 static void
388 spa_prop_add_user(nvlist_t *nvl, const char *propname, char *strval,
389     zprop_source_t src)
390 {
391 	nvlist_t *propval;
392 
393 	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
394 	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
395 	VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
396 	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
397 	nvlist_free(propval);
398 }
399 
400 /*
401  * Get property values from the spa configuration.
402  */
403 static void
404 spa_prop_get_config(spa_t *spa, nvlist_t *nv)
405 {
406 	vdev_t *rvd = spa->spa_root_vdev;
407 	dsl_pool_t *pool = spa->spa_dsl_pool;
408 	uint64_t size, alloc, cap, version;
409 	const zprop_source_t src = ZPROP_SRC_NONE;
410 	spa_config_dirent_t *dp;
411 	metaslab_class_t *mc = spa_normal_class(spa);
412 
413 	ASSERT(MUTEX_HELD(&spa->spa_props_lock));
414 
415 	if (rvd != NULL) {
416 		alloc = metaslab_class_get_alloc(mc);
417 		alloc += metaslab_class_get_alloc(spa_special_class(spa));
418 		alloc += metaslab_class_get_alloc(spa_dedup_class(spa));
419 		alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa));
420 
421 		size = metaslab_class_get_space(mc);
422 		size += metaslab_class_get_space(spa_special_class(spa));
423 		size += metaslab_class_get_space(spa_dedup_class(spa));
424 		size += metaslab_class_get_space(spa_embedded_log_class(spa));
425 
426 		spa_prop_add_list(nv, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
427 		spa_prop_add_list(nv, ZPOOL_PROP_SIZE, NULL, size, src);
428 		spa_prop_add_list(nv, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
429 		spa_prop_add_list(nv, ZPOOL_PROP_FREE, NULL,
430 		    size - alloc, src);
431 		spa_prop_add_list(nv, ZPOOL_PROP_CHECKPOINT, NULL,
432 		    spa->spa_checkpoint_info.sci_dspace, src);
433 
434 		spa_prop_add_list(nv, ZPOOL_PROP_FRAGMENTATION, NULL,
435 		    metaslab_class_fragmentation(mc), src);
436 		spa_prop_add_list(nv, ZPOOL_PROP_EXPANDSZ, NULL,
437 		    metaslab_class_expandable_space(mc), src);
438 		spa_prop_add_list(nv, ZPOOL_PROP_READONLY, NULL,
439 		    (spa_mode(spa) == SPA_MODE_READ), src);
440 
441 		cap = (size == 0) ? 0 : (alloc * 100 / size);
442 		spa_prop_add_list(nv, ZPOOL_PROP_CAPACITY, NULL, cap, src);
443 
444 		spa_prop_add_list(nv, ZPOOL_PROP_DEDUPRATIO, NULL,
445 		    ddt_get_pool_dedup_ratio(spa), src);
446 		spa_prop_add_list(nv, ZPOOL_PROP_BCLONEUSED, NULL,
447 		    brt_get_used(spa), src);
448 		spa_prop_add_list(nv, ZPOOL_PROP_BCLONESAVED, NULL,
449 		    brt_get_saved(spa), src);
450 		spa_prop_add_list(nv, ZPOOL_PROP_BCLONERATIO, NULL,
451 		    brt_get_ratio(spa), src);
452 
453 		spa_prop_add_list(nv, ZPOOL_PROP_DEDUP_TABLE_SIZE, NULL,
454 		    ddt_get_ddt_dsize(spa), src);
455 		spa_prop_add_list(nv, ZPOOL_PROP_HEALTH, NULL,
456 		    rvd->vdev_state, src);
457 		spa_prop_add_list(nv, ZPOOL_PROP_LAST_SCRUBBED_TXG, NULL,
458 		    spa_get_last_scrubbed_txg(spa), src);
459 
460 		version = spa_version(spa);
461 		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) {
462 			spa_prop_add_list(nv, ZPOOL_PROP_VERSION, NULL,
463 			    version, ZPROP_SRC_DEFAULT);
464 		} else {
465 			spa_prop_add_list(nv, ZPOOL_PROP_VERSION, NULL,
466 			    version, ZPROP_SRC_LOCAL);
467 		}
468 		spa_prop_add_list(nv, ZPOOL_PROP_LOAD_GUID,
469 		    NULL, spa_load_guid(spa), src);
470 	}
471 
472 	if (pool != NULL) {
473 		/*
474 		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS;
475 		 * when opening older-version pools, freedir will be NULL.
476 		 */
477 		if (pool->dp_free_dir != NULL) {
478 			spa_prop_add_list(nv, ZPOOL_PROP_FREEING, NULL,
479 			    dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
480 			    src);
481 		} else {
482 			spa_prop_add_list(nv, ZPOOL_PROP_FREEING,
483 			    NULL, 0, src);
484 		}
485 
486 		if (pool->dp_leak_dir != NULL) {
487 			spa_prop_add_list(nv, ZPOOL_PROP_LEAKED, NULL,
488 			    dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
489 			    src);
490 		} else {
491 			spa_prop_add_list(nv, ZPOOL_PROP_LEAKED,
492 			    NULL, 0, src);
493 		}
494 	}
495 
496 	spa_prop_add_list(nv, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
497 
498 	if (spa->spa_comment != NULL) {
499 		spa_prop_add_list(nv, ZPOOL_PROP_COMMENT, spa->spa_comment,
500 		    0, ZPROP_SRC_LOCAL);
501 	}
502 
503 	if (spa->spa_compatibility != NULL) {
504 		spa_prop_add_list(nv, ZPOOL_PROP_COMPATIBILITY,
505 		    spa->spa_compatibility, 0, ZPROP_SRC_LOCAL);
506 	}
507 
508 	if (spa->spa_root != NULL)
509 		spa_prop_add_list(nv, ZPOOL_PROP_ALTROOT, spa->spa_root,
510 		    0, ZPROP_SRC_LOCAL);
511 
512 	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
513 		spa_prop_add_list(nv, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
514 		    MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
515 	} else {
516 		spa_prop_add_list(nv, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
517 		    SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
518 	}
519 
520 	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) {
521 		spa_prop_add_list(nv, ZPOOL_PROP_MAXDNODESIZE, NULL,
522 		    DNODE_MAX_SIZE, ZPROP_SRC_NONE);
523 	} else {
524 		spa_prop_add_list(nv, ZPOOL_PROP_MAXDNODESIZE, NULL,
525 		    DNODE_MIN_SIZE, ZPROP_SRC_NONE);
526 	}
527 
528 	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
529 		if (dp->scd_path == NULL) {
530 			spa_prop_add_list(nv, ZPOOL_PROP_CACHEFILE,
531 			    "none", 0, ZPROP_SRC_LOCAL);
532 		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
533 			spa_prop_add_list(nv, ZPOOL_PROP_CACHEFILE,
534 			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
535 		}
536 	}
537 }
538 
539 /*
540  * Get zpool property values.
541  */
542 int
543 spa_prop_get(spa_t *spa, nvlist_t *nv)
544 {
545 	objset_t *mos = spa->spa_meta_objset;
546 	zap_cursor_t zc;
547 	zap_attribute_t *za;
548 	dsl_pool_t *dp;
549 	int err = 0;
550 
551 	dp = spa_get_dsl(spa);
552 	dsl_pool_config_enter(dp, FTAG);
553 	za = zap_attribute_alloc();
554 	mutex_enter(&spa->spa_props_lock);
555 
556 	/*
557 	 * Get properties from the spa config.
558 	 */
559 	spa_prop_get_config(spa, nv);
560 
561 	/* If no pool property object, no more props to get. */
562 	if (mos == NULL || spa->spa_pool_props_object == 0)
563 		goto out;
564 
565 	/*
566 	 * Get properties from the MOS pool property object.
567 	 */
568 	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
569 	    (err = zap_cursor_retrieve(&zc, za)) == 0;
570 	    zap_cursor_advance(&zc)) {
571 		uint64_t intval = 0;
572 		char *strval = NULL;
573 		zprop_source_t src = ZPROP_SRC_DEFAULT;
574 		zpool_prop_t prop;
575 
576 		if ((prop = zpool_name_to_prop(za->za_name)) ==
577 		    ZPOOL_PROP_INVAL && !zfs_prop_user(za->za_name))
578 			continue;
579 
580 		switch (za->za_integer_length) {
581 		case 8:
582 			/* integer property */
583 			if (za->za_first_integer !=
584 			    zpool_prop_default_numeric(prop))
585 				src = ZPROP_SRC_LOCAL;
586 
587 			if (prop == ZPOOL_PROP_BOOTFS) {
588 				dsl_dataset_t *ds = NULL;
589 
590 				err = dsl_dataset_hold_obj(dp,
591 				    za->za_first_integer, FTAG, &ds);
592 				if (err != 0)
593 					break;
594 
595 				strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN,
596 				    KM_SLEEP);
597 				dsl_dataset_name(ds, strval);
598 				dsl_dataset_rele(ds, FTAG);
599 			} else {
600 				strval = NULL;
601 				intval = za->za_first_integer;
602 			}
603 
604 			spa_prop_add_list(nv, prop, strval, intval, src);
605 
606 			if (strval != NULL)
607 				kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN);
608 
609 			break;
610 
611 		case 1:
612 			/* string property */
613 			strval = kmem_alloc(za->za_num_integers, KM_SLEEP);
614 			err = zap_lookup(mos, spa->spa_pool_props_object,
615 			    za->za_name, 1, za->za_num_integers, strval);
616 			if (err) {
617 				kmem_free(strval, za->za_num_integers);
618 				break;
619 			}
620 			if (prop != ZPOOL_PROP_INVAL) {
621 				spa_prop_add_list(nv, prop, strval, 0, src);
622 			} else {
623 				src = ZPROP_SRC_LOCAL;
624 				spa_prop_add_user(nv, za->za_name, strval,
625 				    src);
626 			}
627 			kmem_free(strval, za->za_num_integers);
628 			break;
629 
630 		default:
631 			break;
632 		}
633 	}
634 	zap_cursor_fini(&zc);
635 out:
636 	mutex_exit(&spa->spa_props_lock);
637 	dsl_pool_config_exit(dp, FTAG);
638 	zap_attribute_free(za);
639 
640 	if (err && err != ENOENT)
641 		return (err);
642 
643 	return (0);
644 }
645 
646 /*
647  * Validate the given pool properties nvlist and modify the list
648  * for the property values to be set.
649  */
650 static int
651 spa_prop_validate(spa_t *spa, nvlist_t *props)
652 {
653 	nvpair_t *elem;
654 	int error = 0, reset_bootfs = 0;
655 	uint64_t objnum = 0;
656 	boolean_t has_feature = B_FALSE;
657 
658 	elem = NULL;
659 	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
660 		uint64_t intval;
661 		const char *strval, *slash, *check, *fname;
662 		const char *propname = nvpair_name(elem);
663 		zpool_prop_t prop = zpool_name_to_prop(propname);
664 
665 		switch (prop) {
666 		case ZPOOL_PROP_INVAL:
667 			/*
668 			 * Sanitize the input.
669 			 */
670 			if (zfs_prop_user(propname)) {
671 				if (strlen(propname) >= ZAP_MAXNAMELEN) {
672 					error = SET_ERROR(ENAMETOOLONG);
673 					break;
674 				}
675 
676 				if (strlen(fnvpair_value_string(elem)) >=
677 				    ZAP_MAXVALUELEN) {
678 					error = SET_ERROR(E2BIG);
679 					break;
680 				}
681 			} else if (zpool_prop_feature(propname)) {
682 				if (nvpair_type(elem) != DATA_TYPE_UINT64) {
683 					error = SET_ERROR(EINVAL);
684 					break;
685 				}
686 
687 				if (nvpair_value_uint64(elem, &intval) != 0) {
688 					error = SET_ERROR(EINVAL);
689 					break;
690 				}
691 
692 				if (intval != 0) {
693 					error = SET_ERROR(EINVAL);
694 					break;
695 				}
696 
697 				fname = strchr(propname, '@') + 1;
698 				if (zfeature_lookup_name(fname, NULL) != 0) {
699 					error = SET_ERROR(EINVAL);
700 					break;
701 				}
702 
703 				has_feature = B_TRUE;
704 			} else {
705 				error = SET_ERROR(EINVAL);
706 				break;
707 			}
708 			break;
709 
710 		case ZPOOL_PROP_VERSION:
711 			error = nvpair_value_uint64(elem, &intval);
712 			if (!error &&
713 			    (intval < spa_version(spa) ||
714 			    intval > SPA_VERSION_BEFORE_FEATURES ||
715 			    has_feature))
716 				error = SET_ERROR(EINVAL);
717 			break;
718 
719 		case ZPOOL_PROP_DEDUP_TABLE_QUOTA:
720 			error = nvpair_value_uint64(elem, &intval);
721 			break;
722 
723 		case ZPOOL_PROP_DELEGATION:
724 		case ZPOOL_PROP_AUTOREPLACE:
725 		case ZPOOL_PROP_LISTSNAPS:
726 		case ZPOOL_PROP_AUTOEXPAND:
727 		case ZPOOL_PROP_AUTOTRIM:
728 			error = nvpair_value_uint64(elem, &intval);
729 			if (!error && intval > 1)
730 				error = SET_ERROR(EINVAL);
731 			break;
732 
733 		case ZPOOL_PROP_MULTIHOST:
734 			error = nvpair_value_uint64(elem, &intval);
735 			if (!error && intval > 1)
736 				error = SET_ERROR(EINVAL);
737 
738 			if (!error) {
739 				uint32_t hostid = zone_get_hostid(NULL);
740 				if (hostid)
741 					spa->spa_hostid = hostid;
742 				else
743 					error = SET_ERROR(ENOTSUP);
744 			}
745 
746 			break;
747 
748 		case ZPOOL_PROP_BOOTFS:
749 			/*
750 			 * If the pool version is less than SPA_VERSION_BOOTFS,
751 			 * or the pool is still being created (version == 0),
752 			 * the bootfs property cannot be set.
753 			 */
754 			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
755 				error = SET_ERROR(ENOTSUP);
756 				break;
757 			}
758 
759 			/*
760 			 * Make sure the vdev config is bootable
761 			 */
762 			if (!vdev_is_bootable(spa->spa_root_vdev)) {
763 				error = SET_ERROR(ENOTSUP);
764 				break;
765 			}
766 
767 			reset_bootfs = 1;
768 
769 			error = nvpair_value_string(elem, &strval);
770 
771 			if (!error) {
772 				objset_t *os;
773 
774 				if (strval == NULL || strval[0] == '\0') {
775 					objnum = zpool_prop_default_numeric(
776 					    ZPOOL_PROP_BOOTFS);
777 					break;
778 				}
779 
780 				error = dmu_objset_hold(strval, FTAG, &os);
781 				if (error != 0)
782 					break;
783 
784 				/* Must be ZPL. */
785 				if (dmu_objset_type(os) != DMU_OST_ZFS) {
786 					error = SET_ERROR(ENOTSUP);
787 				} else {
788 					objnum = dmu_objset_id(os);
789 				}
790 				dmu_objset_rele(os, FTAG);
791 			}
792 			break;
793 
794 		case ZPOOL_PROP_FAILUREMODE:
795 			error = nvpair_value_uint64(elem, &intval);
796 			if (!error && intval > ZIO_FAILURE_MODE_PANIC)
797 				error = SET_ERROR(EINVAL);
798 
799 			/*
800 			 * This is a special case which only occurs when
801 			 * the pool has completely failed. This allows
802 			 * the user to change the in-core failmode property
803 			 * without syncing it out to disk (I/Os might
804 			 * currently be blocked). We do this by returning
805 			 * EIO to the caller (spa_prop_set) to trick it
806 			 * into thinking we encountered a property validation
807 			 * error.
808 			 */
809 			if (!error && spa_suspended(spa)) {
810 				spa->spa_failmode = intval;
811 				error = SET_ERROR(EIO);
812 			}
813 			break;
814 
815 		case ZPOOL_PROP_CACHEFILE:
816 			if ((error = nvpair_value_string(elem, &strval)) != 0)
817 				break;
818 
819 			if (strval[0] == '\0')
820 				break;
821 
822 			if (strcmp(strval, "none") == 0)
823 				break;
824 
825 			if (strval[0] != '/') {
826 				error = SET_ERROR(EINVAL);
827 				break;
828 			}
829 
830 			slash = strrchr(strval, '/');
831 			ASSERT(slash != NULL);
832 
833 			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
834 			    strcmp(slash, "/..") == 0)
835 				error = SET_ERROR(EINVAL);
836 			break;
837 
838 		case ZPOOL_PROP_COMMENT:
839 			if ((error = nvpair_value_string(elem, &strval)) != 0)
840 				break;
841 			for (check = strval; *check != '\0'; check++) {
842 				if (!isprint(*check)) {
843 					error = SET_ERROR(EINVAL);
844 					break;
845 				}
846 			}
847 			if (strlen(strval) > ZPROP_MAX_COMMENT)
848 				error = SET_ERROR(E2BIG);
849 			break;
850 
851 		default:
852 			break;
853 		}
854 
855 		if (error)
856 			break;
857 	}
858 
859 	(void) nvlist_remove_all(props,
860 	    zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO));
861 
862 	if (!error && reset_bootfs) {
863 		error = nvlist_remove(props,
864 		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
865 
866 		if (!error) {
867 			error = nvlist_add_uint64(props,
868 			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
869 		}
870 	}
871 
872 	return (error);
873 }
874 
875 void
876 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
877 {
878 	const char *cachefile;
879 	spa_config_dirent_t *dp;
880 
881 	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
882 	    &cachefile) != 0)
883 		return;
884 
885 	dp = kmem_alloc(sizeof (spa_config_dirent_t),
886 	    KM_SLEEP);
887 
888 	if (cachefile[0] == '\0')
889 		dp->scd_path = spa_strdup(spa_config_path);
890 	else if (strcmp(cachefile, "none") == 0)
891 		dp->scd_path = NULL;
892 	else
893 		dp->scd_path = spa_strdup(cachefile);
894 
895 	list_insert_head(&spa->spa_config_list, dp);
896 	if (need_sync)
897 		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
898 }
899 
900 int
901 spa_prop_set(spa_t *spa, nvlist_t *nvp)
902 {
903 	int error;
904 	nvpair_t *elem = NULL;
905 	boolean_t need_sync = B_FALSE;
906 
907 	if ((error = spa_prop_validate(spa, nvp)) != 0)
908 		return (error);
909 
910 	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
911 		zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));
912 
913 		if (prop == ZPOOL_PROP_CACHEFILE ||
914 		    prop == ZPOOL_PROP_ALTROOT ||
915 		    prop == ZPOOL_PROP_READONLY)
916 			continue;
917 
918 		if (prop == ZPOOL_PROP_INVAL &&
919 		    zfs_prop_user(nvpair_name(elem))) {
920 			need_sync = B_TRUE;
921 			break;
922 		}
923 
924 		if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) {
925 			uint64_t ver = 0;
926 
927 			if (prop == ZPOOL_PROP_VERSION) {
928 				VERIFY(nvpair_value_uint64(elem, &ver) == 0);
929 			} else {
930 				ASSERT(zpool_prop_feature(nvpair_name(elem)));
931 				ver = SPA_VERSION_FEATURES;
932 				need_sync = B_TRUE;
933 			}
934 
935 			/* Save time if the version is already set. */
936 			if (ver == spa_version(spa))
937 				continue;
938 
939 			/*
940 			 * In addition to the pool directory object, we might
941 			 * create the pool properties object, the features for
942 			 * read object, the features for write object, or the
943 			 * feature descriptions object.
944 			 */
945 			error = dsl_sync_task(spa->spa_name, NULL,
946 			    spa_sync_version, &ver,
947 			    6, ZFS_SPACE_CHECK_RESERVED);
948 			if (error)
949 				return (error);
950 			continue;
951 		}
952 
953 		need_sync = B_TRUE;
954 		break;
955 	}
956 
957 	if (need_sync) {
958 		return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
959 		    nvp, 6, ZFS_SPACE_CHECK_RESERVED));
960 	}
961 
962 	return (0);
963 }
964 
965 /*
966  * If the bootfs property value is dsobj, clear it.
967  */
968 void
969 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
970 {
971 	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
972 		VERIFY(zap_remove(spa->spa_meta_objset,
973 		    spa->spa_pool_props_object,
974 		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
975 		spa->spa_bootfs = 0;
976 	}
977 }
978 
979 static int
980 spa_change_guid_check(void *arg, dmu_tx_t *tx)
981 {
982 	uint64_t *newguid __maybe_unused = arg;
983 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
984 	vdev_t *rvd = spa->spa_root_vdev;
985 	uint64_t vdev_state;
986 
987 	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
988 		int error = (spa_has_checkpoint(spa)) ?
989 		    ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
990 		return (SET_ERROR(error));
991 	}
992 
993 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
994 	vdev_state = rvd->vdev_state;
995 	spa_config_exit(spa, SCL_STATE, FTAG);
996 
997 	if (vdev_state != VDEV_STATE_HEALTHY)
998 		return (SET_ERROR(ENXIO));
999 
1000 	ASSERT3U(spa_guid(spa), !=, *newguid);
1001 
1002 	return (0);
1003 }
1004 
1005 static void
1006 spa_change_guid_sync(void *arg, dmu_tx_t *tx)
1007 {
1008 	uint64_t *newguid = arg;
1009 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
1010 	uint64_t oldguid;
1011 	vdev_t *rvd = spa->spa_root_vdev;
1012 
1013 	oldguid = spa_guid(spa);
1014 
1015 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
1016 	rvd->vdev_guid = *newguid;
1017 	rvd->vdev_guid_sum += (*newguid - oldguid);
1018 	vdev_config_dirty(rvd);
1019 	spa_config_exit(spa, SCL_STATE, FTAG);
1020 
1021 	spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
1022 	    (u_longlong_t)oldguid, (u_longlong_t)*newguid);
1023 }
1024 
1025 /*
1026  * Change the GUID for the pool.  This is done so that we can later
1027  * re-import a pool built from a clone of our own vdevs.  We will modify
1028  * the root vdev's guid, our own pool guid, and then mark all of our
1029  * vdevs dirty.  Note that we must make sure that all our vdevs are
1030  * online when we do this, or else any vdevs that weren't present
1031  * would be orphaned from our pool.  We are also going to issue a
1032  * sysevent to update any watchers.
1033  *
1034  * The GUID of the pool will be changed to the value pointed to by guidp.
1035  * The GUID may not be set to the reserved value of 0.
1036  * The new GUID will be generated if guidp is NULL.
1037  */
1038 int
1039 spa_change_guid(spa_t *spa, const uint64_t *guidp)
1040 {
1041 	uint64_t guid;
1042 	int error;
1043 
1044 	mutex_enter(&spa->spa_vdev_top_lock);
1045 	mutex_enter(&spa_namespace_lock);
1046 
1047 	if (guidp != NULL) {
1048 		guid = *guidp;
1049 		if (guid == 0) {
1050 			error = SET_ERROR(EINVAL);
1051 			goto out;
1052 		}
1053 
1054 		if (spa_guid_exists(guid, 0)) {
1055 			error = SET_ERROR(EEXIST);
1056 			goto out;
1057 		}
1058 	} else {
1059 		guid = spa_generate_guid(NULL);
1060 	}
1061 
1062 	error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
1063 	    spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);
1064 
1065 	if (error == 0) {
1066 		/*
1067 		 * Clear the kobj flag from all the vdevs to allow
1068 		 * vdev_cache_process_kobj_evt() to post events to all the
1069 		 * vdevs since GUID is updated.
1070 		 */
1071 		vdev_clear_kobj_evt(spa->spa_root_vdev);
1072 		for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
1073 			vdev_clear_kobj_evt(spa->spa_l2cache.sav_vdevs[i]);
1074 
1075 		spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE);
1076 		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID);
1077 	}
1078 
1079 out:
1080 	mutex_exit(&spa_namespace_lock);
1081 	mutex_exit(&spa->spa_vdev_top_lock);
1082 
1083 	return (error);
1084 }
1085 
1086 /*
1087  * ==========================================================================
1088  * SPA state manipulation (open/create/destroy/import/export)
1089  * ==========================================================================
1090  */
1091 
1092 static int
1093 spa_error_entry_compare(const void *a, const void *b)
1094 {
1095 	const spa_error_entry_t *sa = (const spa_error_entry_t *)a;
1096 	const spa_error_entry_t *sb = (const spa_error_entry_t *)b;
1097 	int ret;
1098 
1099 	ret = memcmp(&sa->se_bookmark, &sb->se_bookmark,
1100 	    sizeof (zbookmark_phys_t));
1101 
1102 	return (TREE_ISIGN(ret));
1103 }
1104 
1105 /*
1106  * Utility function which retrieves copies of the current logs and
1107  * re-initializes them in the process.
1108  */
1109 void
1110 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
1111 {
1112 	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
1113 
1114 	memcpy(last, &spa->spa_errlist_last, sizeof (avl_tree_t));
1115 	memcpy(scrub, &spa->spa_errlist_scrub, sizeof (avl_tree_t));
1116 
1117 	avl_create(&spa->spa_errlist_scrub,
1118 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
1119 	    offsetof(spa_error_entry_t, se_avl));
1120 	avl_create(&spa->spa_errlist_last,
1121 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
1122 	    offsetof(spa_error_entry_t, se_avl));
1123 }
1124 
1125 static void
1126 spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
1127 {
1128 	const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
1129 	enum zti_modes mode = ztip->zti_mode;
1130 	uint_t value = ztip->zti_value;
1131 	uint_t count = ztip->zti_count;
1132 	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
1133 	uint_t cpus, flags = TASKQ_DYNAMIC;
1134 
1135 	switch (mode) {
1136 	case ZTI_MODE_FIXED:
1137 		ASSERT3U(value, >, 0);
1138 		break;
1139 
1140 	case ZTI_MODE_SYNC:
1141 
1142 		/*
1143 		 * Create one wr_iss taskq for every 'zio_taskq_write_tpq' CPUs,
1144 		 * not to exceed the number of spa allocators, and align to it.
1145 		 */
1146 		cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100);
1147 		count = MAX(1, cpus / MAX(1, zio_taskq_write_tpq));
1148 		count = MAX(count, (zio_taskq_batch_pct + 99) / 100);
1149 		count = MIN(count, spa->spa_alloc_count);
1150 		while (spa->spa_alloc_count % count != 0 &&
1151 		    spa->spa_alloc_count < count * 2)
1152 			count--;
1153 
1154 		/*
1155 		 * zio_taskq_batch_pct is unbounded and may exceed 100%, but no
1156 		 * single taskq may have more threads than 100% of online cpus.
1157 		 */
1158 		value = (zio_taskq_batch_pct + count / 2) / count;
1159 		value = MIN(value, 100);
1160 		flags |= TASKQ_THREADS_CPU_PCT;
1161 		break;
1162 
1163 	case ZTI_MODE_SCALE:
1164 		flags |= TASKQ_THREADS_CPU_PCT;
1165 		/*
1166 		 * We want more taskqs to reduce lock contention, but we want
1167 		 * fewer for better request ordering and CPU utilization.
1168 		 */
1169 		cpus = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100);
1170 		if (zio_taskq_batch_tpq > 0) {
1171 			count = MAX(1, (cpus + zio_taskq_batch_tpq / 2) /
1172 			    zio_taskq_batch_tpq);
1173 		} else {
1174 			/*
1175 			 * Prefer 6 threads per taskq, but no more taskqs
1176 			 * than threads in them on large systems. For 80%:
1177 			 *
1178 			 *                 taskq   taskq   total
1179 			 * cpus    taskqs  percent threads threads
1180 			 * ------- ------- ------- ------- -------
1181 			 * 1       1       80%     1       1
1182 			 * 2       1       80%     1       1
1183 			 * 4       1       80%     3       3
1184 			 * 8       2       40%     3       6
1185 			 * 16      3       27%     4       12
1186 			 * 32      5       16%     5       25
1187 			 * 64      7       11%     7       49
1188 			 * 128     10      8%      10      100
1189 			 * 256     14      6%      15      210
1190 			 */
1191 			count = 1 + cpus / 6;
1192 			while (count * count > cpus)
1193 				count--;
1194 		}
1195 		/* Cap each taskq at 100% to avoid triggering an assertion. */
1196 		count = MAX(count, (zio_taskq_batch_pct + 99) / 100);
1197 		value = (zio_taskq_batch_pct + count / 2) / count;
1198 		break;
1199 
1200 	case ZTI_MODE_NULL:
1201 		tqs->stqs_count = 0;
1202 		tqs->stqs_taskq = NULL;
1203 		return;
1204 
1205 	default:
1206 		panic("unrecognized mode for %s_%s taskq (%u:%u) in "
1207 		    "spa_taskqs_init()",
1208 		    zio_type_name[t], zio_taskq_types[q], mode, value);
1209 		break;
1210 	}
1211 
1212 	ASSERT3U(count, >, 0);
1213 	tqs->stqs_count = count;
1214 	tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
1215 
1216 	for (uint_t i = 0; i < count; i++) {
1217 		taskq_t *tq;
1218 		char name[32];
1219 
1220 		if (count > 1)
1221 			(void) snprintf(name, sizeof (name), "%s_%s_%u",
1222 			    zio_type_name[t], zio_taskq_types[q], i);
1223 		else
1224 			(void) snprintf(name, sizeof (name), "%s_%s",
1225 			    zio_type_name[t], zio_taskq_types[q]);
1226 
1227 #ifdef HAVE_SYSDC
1228 		if (zio_taskq_sysdc && spa->spa_proc != &p0) {
1229 			(void) zio_taskq_basedc;
1230 			tq = taskq_create_sysdc(name, value, 50, INT_MAX,
1231 			    spa->spa_proc, zio_taskq_basedc, flags);
1232 		} else {
1233 #endif
1234 			/*
1235 			 * The write issue taskq can be extremely CPU
1236 			 * intensive.  Run it at a slightly lower priority
1237 			 * than the other taskqs.
1238 			 */
1239 			const pri_t pri = (t == ZIO_TYPE_WRITE &&
1240 			    q == ZIO_TASKQ_ISSUE) ?
1241 			    wtqclsyspri : maxclsyspri;
1242 			tq = taskq_create_proc(name, value, pri, 50,
1243 			    INT_MAX, spa->spa_proc, flags);
1244 #ifdef HAVE_SYSDC
1245 		}
1246 #endif
1247 
1248 		tqs->stqs_taskq[i] = tq;
1249 	}
1250 }
1251 
1252 static void
1253 spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
1254 {
1255 	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
1256 
1257 	if (tqs->stqs_taskq == NULL) {
1258 		ASSERT3U(tqs->stqs_count, ==, 0);
1259 		return;
1260 	}
1261 
1262 	for (uint_t i = 0; i < tqs->stqs_count; i++) {
1263 		ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
1264 		taskq_destroy(tqs->stqs_taskq[i]);
1265 	}
1266 
1267 	kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
1268 	tqs->stqs_taskq = NULL;
1269 }
1270 
1271 #ifdef _KERNEL
1272 /*
1273  * The READ and WRITE rows of zio_taskqs are configurable at module load time
1274  * by setting zio_taskq_read or zio_taskq_write.
1275  *
1276  * Example (the defaults for READ and WRITE)
1277  *   zio_taskq_read='fixed,1,8 null scale null'
1278  *   zio_taskq_write='sync null scale null'
1279  *
1280  * Each sets the entire row at a time.
1281  *
1282  * 'fixed' is parameterised: fixed,Q,T where Q is number of taskqs, T is number
1283  * of threads per taskq.
1284  *
1285  * 'null' can only be set on the high-priority queues (queue selection for
1286  * high-priority queues will fall back to the regular queue if the high-pri
1287  * queue is NULL).
1288  */
1289 static const char *const modes[ZTI_NMODES] = {
1290 	"fixed", "scale", "sync", "null"
1291 };
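
/*
 * Illustrative only (an assumed invocation, not taken from this file): on
 * Linux these parameters can be given when the zfs module is loaded, e.g.
 *
 *	modprobe zfs zio_taskq_write='fixed,2,8 null scale null'
 *
 * which would give the WRITE row two issue taskqs of eight threads each,
 * with scaling interrupt taskqs and no high-priority queues.  On FreeBSD
 * the same strings are accepted through the sysctl handlers
 * spa_taskq_read_param()/spa_taskq_write_param() below.
 */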
1292 
1293 /* Parse the incoming config string. Modifies cfg */
1294 static int
1295 spa_taskq_param_set(zio_type_t t, char *cfg)
1296 {
1297 	int err = 0;
1298 
1299 	zio_taskq_info_t row[ZIO_TASKQ_TYPES] = {{0}};
1300 
1301 	char *next = cfg, *tok, *c;
1302 
1303 	/*
1304 	 * Parse out each element from the string and fill `row`. The entire
1305 	 * row has to be set at once, so any errors are flagged by just
1306 	 * breaking out of this loop early.
1307 	 */
1308 	uint_t q;
1309 	for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
1310 		/* `next` is the start of the config */
1311 		if (next == NULL)
1312 			break;
1313 
1314 		/* Eat up leading space */
1315 		while (isspace(*next))
1316 			next++;
1317 		if (*next == '\0')
1318 			break;
1319 
1320 		/* Mode ends at space or end of string */
1321 		tok = next;
1322 		next = strchr(tok, ' ');
1323 		if (next != NULL) *next++ = '\0';
1324 
1325 		/* Parameters start after a comma */
1326 		c = strchr(tok, ',');
1327 		if (c != NULL) *c++ = '\0';
1328 
1329 		/* Match mode string */
1330 		uint_t mode;
1331 		for (mode = 0; mode < ZTI_NMODES; mode++)
1332 			if (strcmp(tok, modes[mode]) == 0)
1333 				break;
1334 		if (mode == ZTI_NMODES)
1335 			break;
1336 
1337 		/* Invalid canary */
1338 		row[q].zti_mode = ZTI_NMODES;
1339 
1340 		/* Per-mode setup */
1341 		switch (mode) {
1342 
1343 		/*
1344 		 * FIXED is parameterised: number of queues, and number of
1345 		 * threads per queue.
1346 		 */
1347 		case ZTI_MODE_FIXED: {
1348 			/* No parameters? */
1349 			if (c == NULL || *c == '\0')
1350 				break;
1351 
1352 			/* Find next parameter */
1353 			tok = c;
1354 			c = strchr(tok, ',');
1355 			if (c == NULL)
1356 				break;
1357 
1358 			/* Take digits and convert */
1359 			unsigned long long nq;
1360 			if (!(isdigit(*tok)))
1361 				break;
1362 			err = ddi_strtoull(tok, &tok, 10, &nq);
1363 			/* Must succeed and also end at the next param sep */
1364 			if (err != 0 || tok != c)
1365 				break;
1366 
1367 			/* Move past the comma */
1368 			tok++;
1369 			/* Need another number */
1370 			if (!(isdigit(*tok)))
1371 				break;
1372 			/* Remember start to make sure we moved */
1373 			c = tok;
1374 
1375 			/* Take digits */
1376 			unsigned long long ntpq;
1377 			err = ddi_strtoull(tok, &tok, 10, &ntpq);
1378 			/* Must succeed, and moved forward */
1379 			if (err != 0 || tok == c || *tok != '\0')
1380 				break;
1381 
1382 			/*
1383 			 * sanity; zero queues/threads make no sense, and
1384 			 * 16K is almost certainly more than anyone will ever
1385 			 * need and avoids silly numbers like UINT32_MAX
1386 			 */
1387 			if (nq == 0 || nq >= 16384 ||
1388 			    ntpq == 0 || ntpq >= 16384)
1389 				break;
1390 
1391 			const zio_taskq_info_t zti = ZTI_P(ntpq, nq);
1392 			row[q] = zti;
1393 			break;
1394 		}
1395 
1396 		case ZTI_MODE_SCALE: {
1397 			const zio_taskq_info_t zti = ZTI_SCALE;
1398 			row[q] = zti;
1399 			break;
1400 		}
1401 
1402 		case ZTI_MODE_SYNC: {
1403 			const zio_taskq_info_t zti = ZTI_SYNC;
1404 			row[q] = zti;
1405 			break;
1406 		}
1407 
1408 		case ZTI_MODE_NULL: {
1409 			/*
1410 			 * Can only null the high-priority queues; the general-
1411 			 * purpose ones have to exist.
1412 			 */
1413 			if (q != ZIO_TASKQ_ISSUE_HIGH &&
1414 			    q != ZIO_TASKQ_INTERRUPT_HIGH)
1415 				break;
1416 
1417 			const zio_taskq_info_t zti = ZTI_NULL;
1418 			row[q] = zti;
1419 			break;
1420 		}
1421 
1422 		default:
1423 			break;
1424 		}
1425 
1426 		/* Ensure we set a mode */
1427 		if (row[q].zti_mode == ZTI_NMODES)
1428 			break;
1429 	}
1430 
1431 	/* Didn't get a full row, fail */
1432 	if (q < ZIO_TASKQ_TYPES)
1433 		return (SET_ERROR(EINVAL));
1434 
1435 	/* Eat trailing space */
1436 	if (next != NULL)
1437 		while (isspace(*next))
1438 			next++;
1439 
1440 	/* If there's anything left over then fail */
1441 	if (next != NULL && *next != '\0')
1442 		return (SET_ERROR(EINVAL));
1443 
1444 	/* Success! Copy it into the real config */
1445 	for (q = 0; q < ZIO_TASKQ_TYPES; q++)
1446 		zio_taskqs[t][q] = row[q];
1447 
1448 	return (0);
1449 }
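
/*
 * Worked example (a sketch mirroring the defaults quoted above): passing
 * "fixed,1,8 null scale null" for ZIO_TYPE_READ fills the row as
 *
 *	row[ZIO_TASKQ_ISSUE]		= ZTI_P(8, 1)	(one taskq, 8 threads)
 *	row[ZIO_TASKQ_ISSUE_HIGH]	= ZTI_NULL
 *	row[ZIO_TASKQ_INTERRUPT]	= ZTI_SCALE
 *	row[ZIO_TASKQ_INTERRUPT_HIGH]	= ZTI_NULL
 *
 * which matches the default READ row of zio_taskqs[] above.
 */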
1450 
1451 static int
1452 spa_taskq_param_get(zio_type_t t, char *buf, boolean_t add_newline)
1453 {
1454 	int pos = 0;
1455 
1456 	/* Build parameter string from live config */
1457 	const char *sep = "";
1458 	for (uint_t q = 0; q < ZIO_TASKQ_TYPES; q++) {
1459 		const zio_taskq_info_t *zti = &zio_taskqs[t][q];
1460 		if (zti->zti_mode == ZTI_MODE_FIXED)
1461 			pos += sprintf(&buf[pos], "%s%s,%u,%u", sep,
1462 			    modes[zti->zti_mode], zti->zti_count,
1463 			    zti->zti_value);
1464 		else
1465 			pos += sprintf(&buf[pos], "%s%s", sep,
1466 			    modes[zti->zti_mode]);
1467 		sep = " ";
1468 	}
1469 
1470 	if (add_newline)
1471 		buf[pos++] = '\n';
1472 	buf[pos] = '\0';
1473 
1474 	return (pos);
1475 }
1476 
1477 #ifdef __linux__
1478 static int
1479 spa_taskq_read_param_set(const char *val, zfs_kernel_param_t *kp)
1480 {
1481 	char *cfg = kmem_strdup(val);
1482 	int err = spa_taskq_param_set(ZIO_TYPE_READ, cfg);
1483 	kmem_free(cfg, strlen(val)+1);
1484 	return (-err);
1485 }
1486 static int
1487 spa_taskq_read_param_get(char *buf, zfs_kernel_param_t *kp)
1488 {
1489 	return (spa_taskq_param_get(ZIO_TYPE_READ, buf, TRUE));
1490 }
1491 
1492 static int
1493 spa_taskq_write_param_set(const char *val, zfs_kernel_param_t *kp)
1494 {
1495 	char *cfg = kmem_strdup(val);
1496 	int err = spa_taskq_param_set(ZIO_TYPE_WRITE, cfg);
1497 	kmem_free(cfg, strlen(val)+1);
1498 	return (-err);
1499 }
1500 static int
1501 spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp)
1502 {
1503 	return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf, TRUE));
1504 }
1505 #else
1506 /*
1507  * On FreeBSD load-time parameters can be set up before malloc() is available,
1508  * so we have to do all the parsing work on the stack.
1509  */
1510 #define	SPA_TASKQ_PARAM_MAX	(128)
1511 
1512 static int
1513 spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS)
1514 {
1515 	char buf[SPA_TASKQ_PARAM_MAX];
1516 	int err;
1517 
1518 	(void) spa_taskq_param_get(ZIO_TYPE_READ, buf, FALSE);
1519 	err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
1520 	if (err || req->newptr == NULL)
1521 		return (err);
1522 	return (spa_taskq_param_set(ZIO_TYPE_READ, buf));
1523 }
1524 
1525 static int
1526 spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS)
1527 {
1528 	char buf[SPA_TASKQ_PARAM_MAX];
1529 	int err;
1530 
1531 	(void) spa_taskq_param_get(ZIO_TYPE_WRITE, buf, FALSE);
1532 	err = sysctl_handle_string(oidp, buf, sizeof (buf), req);
1533 	if (err || req->newptr == NULL)
1534 		return (err);
1535 	return (spa_taskq_param_set(ZIO_TYPE_WRITE, buf));
1536 }
1537 #endif
1538 #endif /* _KERNEL */
1539 
1540 /*
1541  * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
1542  * Note that a type may have multiple discrete taskqs to avoid lock contention
1543  * on the taskq itself.
1544  */
1545 void
1546 spa_taskq_dispatch(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
1547     task_func_t *func, zio_t *zio, boolean_t cutinline)
1548 {
1549 	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
1550 	taskq_t *tq;
1551 
1552 	ASSERT3P(tqs->stqs_taskq, !=, NULL);
1553 	ASSERT3U(tqs->stqs_count, !=, 0);
1554 
1555 	/*
1556 	 * NB: We are assuming that the zio can only be dispatched
1557 	 * to a single taskq at a time.  It would be a grievous error
1558 	 * to dispatch the zio to another taskq at the same time.
1559 	 */
1560 	ASSERT(zio);
1561 	ASSERT(taskq_empty_ent(&zio->io_tqent));
1562 
1563 	if (tqs->stqs_count == 1) {
1564 		tq = tqs->stqs_taskq[0];
1565 	} else if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) &&
1566 	    ZIO_HAS_ALLOCATOR(zio)) {
1567 		tq = tqs->stqs_taskq[zio->io_allocator % tqs->stqs_count];
1568 	} else {
1569 		tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count];
1570 	}
1571 
1572 	taskq_dispatch_ent(tq, func, zio, cutinline ? TQ_FRONT : 0,
1573 	    &zio->io_tqent);
1574 }
1575 
1576 static void
1577 spa_create_zio_taskqs(spa_t *spa)
1578 {
1579 	for (int t = 0; t < ZIO_TYPES; t++) {
1580 		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
1581 			spa_taskqs_init(spa, t, q);
1582 		}
1583 	}
1584 }
1585 
1586 #if defined(_KERNEL) && defined(HAVE_SPA_THREAD)
1587 static void
1588 spa_thread(void *arg)
1589 {
1590 	psetid_t zio_taskq_psrset_bind = PS_NONE;
1591 	callb_cpr_t cprinfo;
1592 
1593 	spa_t *spa = arg;
1594 	user_t *pu = PTOU(curproc);
1595 
1596 	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
1597 	    spa->spa_name);
1598 
1599 	ASSERT(curproc != &p0);
1600 	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
1601 	    "zpool-%s", spa->spa_name);
1602 	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));
1603 
1604 	/* bind this thread to the requested psrset */
1605 	if (zio_taskq_psrset_bind != PS_NONE) {
1606 		pool_lock();
1607 		mutex_enter(&cpu_lock);
1608 		mutex_enter(&pidlock);
1609 		mutex_enter(&curproc->p_lock);
1610 
1611 		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
1612 		    0, NULL, NULL) == 0)  {
1613 			curthread->t_bind_pset = zio_taskq_psrset_bind;
1614 		} else {
1615 			cmn_err(CE_WARN,
1616 			    "Couldn't bind process for zfs pool \"%s\" to "
1617 			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
1618 		}
1619 
1620 		mutex_exit(&curproc->p_lock);
1621 		mutex_exit(&pidlock);
1622 		mutex_exit(&cpu_lock);
1623 		pool_unlock();
1624 	}
1625 
1626 #ifdef HAVE_SYSDC
1627 	if (zio_taskq_sysdc) {
1628 		sysdc_thread_enter(curthread, 100, 0);
1629 	}
1630 #endif
1631 
1632 	spa->spa_proc = curproc;
1633 	spa->spa_did = curthread->t_did;
1634 
1635 	spa_create_zio_taskqs(spa);
1636 
1637 	mutex_enter(&spa->spa_proc_lock);
1638 	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);
1639 
1640 	spa->spa_proc_state = SPA_PROC_ACTIVE;
1641 	cv_broadcast(&spa->spa_proc_cv);
1642 
1643 	CALLB_CPR_SAFE_BEGIN(&cprinfo);
1644 	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
1645 		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
1646 	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);
1647 
1648 	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
1649 	spa->spa_proc_state = SPA_PROC_GONE;
1650 	spa->spa_proc = &p0;
1651 	cv_broadcast(&spa->spa_proc_cv);
1652 	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */
1653 
1654 	mutex_enter(&curproc->p_lock);
1655 	lwp_exit();
1656 }
1657 #endif
1658 
1659 extern metaslab_ops_t *metaslab_allocator(spa_t *spa);
1660 
1661 /*
1662  * Activate an uninitialized pool.
1663  */
1664 static void
1665 spa_activate(spa_t *spa, spa_mode_t mode)
1666 {
1667 	metaslab_ops_t *msp = metaslab_allocator(spa);
1668 	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
1669 
1670 	spa->spa_state = POOL_STATE_ACTIVE;
1671 	spa->spa_final_txg = UINT64_MAX;
1672 	spa->spa_mode = mode;
1673 	spa->spa_read_spacemaps = spa_mode_readable_spacemaps;
1674 
1675 	spa->spa_normal_class = metaslab_class_create(spa, msp, B_FALSE);
1676 	spa->spa_log_class = metaslab_class_create(spa, msp, B_TRUE);
1677 	spa->spa_embedded_log_class = metaslab_class_create(spa, msp, B_TRUE);
1678 	spa->spa_special_class = metaslab_class_create(spa, msp, B_FALSE);
1679 	spa->spa_dedup_class = metaslab_class_create(spa, msp, B_FALSE);
1680 
1681 	/* Try to create a covering process */
1682 	mutex_enter(&spa->spa_proc_lock);
1683 	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
1684 	ASSERT(spa->spa_proc == &p0);
1685 	spa->spa_did = 0;
1686 
1687 #ifdef HAVE_SPA_THREAD
1688 	/* Only create a process if we're going to be around a while. */
1689 	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
1690 		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
1691 		    NULL, 0) == 0) {
1692 			spa->spa_proc_state = SPA_PROC_CREATED;
1693 			while (spa->spa_proc_state == SPA_PROC_CREATED) {
1694 				cv_wait(&spa->spa_proc_cv,
1695 				    &spa->spa_proc_lock);
1696 			}
1697 			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
1698 			ASSERT(spa->spa_proc != &p0);
1699 			ASSERT(spa->spa_did != 0);
1700 		} else {
1701 #ifdef _KERNEL
1702 			cmn_err(CE_WARN,
1703 			    "Couldn't create process for zfs pool \"%s\"\n",
1704 			    spa->spa_name);
1705 #endif
1706 		}
1707 	}
1708 #endif /* HAVE_SPA_THREAD */
1709 	mutex_exit(&spa->spa_proc_lock);
1710 
1711 	/* If we didn't create a process, we need to create our taskqs. */
1712 	if (spa->spa_proc == &p0) {
1713 		spa_create_zio_taskqs(spa);
1714 	}
1715 
1716 	for (size_t i = 0; i < TXG_SIZE; i++) {
1717 		spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL,
1718 		    ZIO_FLAG_CANFAIL);
1719 	}
1720 
1721 	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
1722 	    offsetof(vdev_t, vdev_config_dirty_node));
1723 	list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
1724 	    offsetof(objset_t, os_evicting_node));
1725 	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
1726 	    offsetof(vdev_t, vdev_state_dirty_node));
1727 
1728 	txg_list_create(&spa->spa_vdev_txg_list, spa,
1729 	    offsetof(struct vdev, vdev_txg_node));
1730 
1731 	avl_create(&spa->spa_errlist_scrub,
1732 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
1733 	    offsetof(spa_error_entry_t, se_avl));
1734 	avl_create(&spa->spa_errlist_last,
1735 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
1736 	    offsetof(spa_error_entry_t, se_avl));
1737 	avl_create(&spa->spa_errlist_healed,
1738 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
1739 	    offsetof(spa_error_entry_t, se_avl));
1740 
1741 	spa_activate_os(spa);
1742 
1743 	spa_keystore_init(&spa->spa_keystore);
1744 
1745 	/*
1746 	 * This taskq is used to perform zvol-minor-related tasks
1747 	 * asynchronously. This has several advantages, including easy
1748 	 * resolution of various deadlocks.
1749 	 *
1750 	 * The taskq must be single threaded to ensure tasks are always
1751 	 * processed in the order in which they were dispatched.
1752 	 *
1753 	 * A taskq per pool allows one to keep the pools independent.
1754 	 * This way if one pool is suspended, it will not impact another.
1755 	 *
1756 	 * The preferred location to dispatch a zvol minor task is a sync
1757 	 * task. In this context, there is easy access to the spa_t and minimal
1758 	 * error handling is required because the sync task must succeed.
1759 	 */
1760 	spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri,
1761 	    1, INT_MAX, 0);
1762 
1763 	/*
1764 	 * The taskq to preload metaslabs.
1765 	 */
1766 	spa->spa_metaslab_taskq = taskq_create("z_metaslab",
1767 	    metaslab_preload_pct, maxclsyspri, 1, INT_MAX,
1768 	    TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
1769 
1770 	/*
1771 	 * Taskq dedicated to prefetcher threads: this is used to prevent the
1772 	 * pool traverse code from monopolizing the global (and limited)
1773 	 * system_taskq by inappropriately scheduling long running tasks on it.
1774 	 */
1775 	spa->spa_prefetch_taskq = taskq_create("z_prefetch", 100,
1776 	    defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
1777 
1778 	/*
1779 	 * The taskq to upgrade datasets in this pool. Currently used by
1780 	 * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA.
1781 	 */
1782 	spa->spa_upgrade_taskq = taskq_create("z_upgrade", 100,
1783 	    defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
1784 }
1785 
1786 /*
1787  * Opposite of spa_activate().
1788  */
1789 static void
1790 spa_deactivate(spa_t *spa)
1791 {
1792 	ASSERT(spa->spa_sync_on == B_FALSE);
1793 	ASSERT(spa->spa_dsl_pool == NULL);
1794 	ASSERT(spa->spa_root_vdev == NULL);
1795 	ASSERT(spa->spa_async_zio_root == NULL);
1796 	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
1797 
1798 	spa_evicting_os_wait(spa);
1799 
1800 	if (spa->spa_zvol_taskq) {
1801 		taskq_destroy(spa->spa_zvol_taskq);
1802 		spa->spa_zvol_taskq = NULL;
1803 	}
1804 
1805 	if (spa->spa_metaslab_taskq) {
1806 		taskq_destroy(spa->spa_metaslab_taskq);
1807 		spa->spa_metaslab_taskq = NULL;
1808 	}
1809 
1810 	if (spa->spa_prefetch_taskq) {
1811 		taskq_destroy(spa->spa_prefetch_taskq);
1812 		spa->spa_prefetch_taskq = NULL;
1813 	}
1814 
1815 	if (spa->spa_upgrade_taskq) {
1816 		taskq_destroy(spa->spa_upgrade_taskq);
1817 		spa->spa_upgrade_taskq = NULL;
1818 	}
1819 
1820 	txg_list_destroy(&spa->spa_vdev_txg_list);
1821 
1822 	list_destroy(&spa->spa_config_dirty_list);
1823 	list_destroy(&spa->spa_evicting_os_list);
1824 	list_destroy(&spa->spa_state_dirty_list);
1825 
1826 	taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
1827 
1828 	for (int t = 0; t < ZIO_TYPES; t++) {
1829 		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
1830 			spa_taskqs_fini(spa, t, q);
1831 		}
1832 	}
1833 
1834 	for (size_t i = 0; i < TXG_SIZE; i++) {
1835 		ASSERT3P(spa->spa_txg_zio[i], !=, NULL);
1836 		VERIFY0(zio_wait(spa->spa_txg_zio[i]));
1837 		spa->spa_txg_zio[i] = NULL;
1838 	}
1839 
1840 	metaslab_class_destroy(spa->spa_normal_class);
1841 	spa->spa_normal_class = NULL;
1842 
1843 	metaslab_class_destroy(spa->spa_log_class);
1844 	spa->spa_log_class = NULL;
1845 
1846 	metaslab_class_destroy(spa->spa_embedded_log_class);
1847 	spa->spa_embedded_log_class = NULL;
1848 
1849 	metaslab_class_destroy(spa->spa_special_class);
1850 	spa->spa_special_class = NULL;
1851 
1852 	metaslab_class_destroy(spa->spa_dedup_class);
1853 	spa->spa_dedup_class = NULL;
1854 
1855 	/*
1856 	 * If this was part of an import or the open otherwise failed, we may
1857 	 * still have errors left in the queues.  Empty them just in case.
1858 	 */
1859 	spa_errlog_drain(spa);
1860 	avl_destroy(&spa->spa_errlist_scrub);
1861 	avl_destroy(&spa->spa_errlist_last);
1862 	avl_destroy(&spa->spa_errlist_healed);
1863 
1864 	spa_keystore_fini(&spa->spa_keystore);
1865 
1866 	spa->spa_state = POOL_STATE_UNINITIALIZED;
1867 
1868 	mutex_enter(&spa->spa_proc_lock);
1869 	if (spa->spa_proc_state != SPA_PROC_NONE) {
1870 		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
1871 		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
1872 		cv_broadcast(&spa->spa_proc_cv);
1873 		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
1874 			ASSERT(spa->spa_proc != &p0);
1875 			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
1876 		}
1877 		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
1878 		spa->spa_proc_state = SPA_PROC_NONE;
1879 	}
1880 	ASSERT(spa->spa_proc == &p0);
1881 	mutex_exit(&spa->spa_proc_lock);
1882 
1883 	/*
1884 	 * We want to make sure spa_thread() has actually exited the ZFS
1885 	 * module, so that the module can't be unloaded out from underneath
1886 	 * it.
1887 	 */
1888 	if (spa->spa_did != 0) {
1889 		thread_join(spa->spa_did);
1890 		spa->spa_did = 0;
1891 	}
1892 
1893 	spa_deactivate_os(spa);
1894 
1895 }
1896 
1897 /*
1898  * Verify a pool configuration, and construct the vdev tree appropriately.  This
1899  * will create all the necessary vdevs in the appropriate layout, with each vdev
1900  * in the CLOSED state.  This will prep the pool before open/creation/import.
1901  * All vdev validation is done by the vdev_alloc() routine.
1902  */
1903 int
1904 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
1905     uint_t id, int atype)
1906 {
1907 	nvlist_t **child;
1908 	uint_t children;
1909 	int error;
1910 
1911 	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
1912 		return (error);
1913 
1914 	if ((*vdp)->vdev_ops->vdev_op_leaf)
1915 		return (0);
1916 
1917 	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
1918 	    &child, &children);
1919 
1920 	if (error == ENOENT)
1921 		return (0);
1922 
1923 	if (error) {
1924 		vdev_free(*vdp);
1925 		*vdp = NULL;
1926 		return (SET_ERROR(EINVAL));
1927 	}
1928 
1929 	for (int c = 0; c < children; c++) {
1930 		vdev_t *vd;
1931 		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
1932 		    atype)) != 0) {
1933 			vdev_free(*vdp);
1934 			*vdp = NULL;
1935 			return (error);
1936 		}
1937 	}
1938 
1939 	ASSERT(*vdp != NULL);
1940 
1941 	return (0);
1942 }
1943 
1944 static boolean_t
1945 spa_should_flush_logs_on_unload(spa_t *spa)
1946 {
1947 	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
1948 		return (B_FALSE);
1949 
1950 	if (!spa_writeable(spa))
1951 		return (B_FALSE);
1952 
1953 	if (!spa->spa_sync_on)
1954 		return (B_FALSE);
1955 
1956 	if (spa_state(spa) != POOL_STATE_EXPORTED)
1957 		return (B_FALSE);
1958 
1959 	if (zfs_keep_log_spacemaps_at_export)
1960 		return (B_FALSE);
1961 
1962 	return (B_TRUE);
1963 }
1964 
1965 /*
1966  * Opens a transaction that will set the flag that will instruct
1967  * spa_sync to attempt to flush all the metaslabs for that txg.
1968  */
1969 static void
1970 spa_unload_log_sm_flush_all(spa_t *spa)
1971 {
1972 	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
1973 	VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT | DMU_TX_SUSPEND));
1974 
1975 	ASSERT3U(spa->spa_log_flushall_txg, ==, 0);
1976 	spa->spa_log_flushall_txg = dmu_tx_get_txg(tx);
1977 
1978 	dmu_tx_commit(tx);
1979 	txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg);
1980 }
1981 
1982 static void
1983 spa_unload_log_sm_metadata(spa_t *spa)
1984 {
1985 	void *cookie = NULL;
1986 	spa_log_sm_t *sls;
1987 	log_summary_entry_t *e;
1988 
1989 	while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg,
1990 	    &cookie)) != NULL) {
1991 		VERIFY0(sls->sls_mscount);
1992 		kmem_free(sls, sizeof (spa_log_sm_t));
1993 	}
1994 
1995 	while ((e = list_remove_head(&spa->spa_log_summary)) != NULL) {
1996 		VERIFY0(e->lse_mscount);
1997 		kmem_free(e, sizeof (log_summary_entry_t));
1998 	}
1999 
2000 	spa->spa_unflushed_stats.sus_nblocks = 0;
2001 	spa->spa_unflushed_stats.sus_memused = 0;
2002 	spa->spa_unflushed_stats.sus_blocklimit = 0;
2003 }
2004 
2005 static void
2006 spa_destroy_aux_threads(spa_t *spa)
2007 {
2008 	if (spa->spa_condense_zthr != NULL) {
2009 		zthr_destroy(spa->spa_condense_zthr);
2010 		spa->spa_condense_zthr = NULL;
2011 	}
2012 	if (spa->spa_checkpoint_discard_zthr != NULL) {
2013 		zthr_destroy(spa->spa_checkpoint_discard_zthr);
2014 		spa->spa_checkpoint_discard_zthr = NULL;
2015 	}
2016 	if (spa->spa_livelist_delete_zthr != NULL) {
2017 		zthr_destroy(spa->spa_livelist_delete_zthr);
2018 		spa->spa_livelist_delete_zthr = NULL;
2019 	}
2020 	if (spa->spa_livelist_condense_zthr != NULL) {
2021 		zthr_destroy(spa->spa_livelist_condense_zthr);
2022 		spa->spa_livelist_condense_zthr = NULL;
2023 	}
2024 	if (spa->spa_raidz_expand_zthr != NULL) {
2025 		zthr_destroy(spa->spa_raidz_expand_zthr);
2026 		spa->spa_raidz_expand_zthr = NULL;
2027 	}
2028 }
2029 
2030 /*
2031  * Opposite of spa_load().
2032  */
2033 static void
2034 spa_unload(spa_t *spa)
2035 {
2036 	ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
2037 	    spa->spa_export_thread == curthread);
2038 	ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED);
2039 
2040 	spa_import_progress_remove(spa_guid(spa));
2041 	spa_load_note(spa, "UNLOADING");
2042 
2043 	spa_wake_waiters(spa);
2044 
2045 	/*
2046 	 * If we have set spa_final_txg, we have already performed the tasks
2047 	 * below in spa_export_common(). We should not redo them here, since
2048 	 * we delay the final TXGs beyond the value spa_final_txg is set to.
2049 	 */
2050 	if (spa->spa_final_txg == UINT64_MAX) {
2051 		/*
2052 		 * If the log space map feature is enabled and the pool is
2053 		 * getting exported (but not destroyed), we want to spend some
2054 		 * time flushing as many metaslabs as we can in an attempt to
2055 		 * destroy log space maps and save import time.
2056 		 */
2057 		if (spa_should_flush_logs_on_unload(spa))
2058 			spa_unload_log_sm_flush_all(spa);
2059 
2060 		/*
2061 		 * Stop async tasks.
2062 		 */
2063 		spa_async_suspend(spa);
2064 
2065 		if (spa->spa_root_vdev) {
2066 			vdev_t *root_vdev = spa->spa_root_vdev;
2067 			vdev_initialize_stop_all(root_vdev,
2068 			    VDEV_INITIALIZE_ACTIVE);
2069 			vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE);
2070 			vdev_autotrim_stop_all(spa);
2071 			vdev_rebuild_stop_all(spa);
2072 			l2arc_spa_rebuild_stop(spa);
2073 		}
2074 
2075 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2076 		spa->spa_final_txg = spa_last_synced_txg(spa) +
2077 		    TXG_DEFER_SIZE + 1;
2078 		spa_config_exit(spa, SCL_ALL, FTAG);
2079 	}
2080 
2081 	/*
2082 	 * Stop syncing.
2083 	 */
2084 	if (spa->spa_sync_on) {
2085 		txg_sync_stop(spa->spa_dsl_pool);
2086 		spa->spa_sync_on = B_FALSE;
2087 	}
2088 
2089 	/*
2090 	 * This ensures that there is no async metaslab prefetching
2091 	 * while we attempt to unload the spa.
2092 	 */
2093 	taskq_wait(spa->spa_metaslab_taskq);
2094 
2095 	if (spa->spa_mmp.mmp_thread)
2096 		mmp_thread_stop(spa);
2097 
2098 	/*
2099 	 * Wait for any outstanding async I/O to complete.
2100 	 */
2101 	if (spa->spa_async_zio_root != NULL) {
2102 		for (int i = 0; i < max_ncpus; i++)
2103 			(void) zio_wait(spa->spa_async_zio_root[i]);
2104 		kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
2105 		spa->spa_async_zio_root = NULL;
2106 	}
2107 
2108 	if (spa->spa_vdev_removal != NULL) {
2109 		spa_vdev_removal_destroy(spa->spa_vdev_removal);
2110 		spa->spa_vdev_removal = NULL;
2111 	}
2112 
2113 	spa_destroy_aux_threads(spa);
2114 
2115 	spa_condense_fini(spa);
2116 
2117 	bpobj_close(&spa->spa_deferred_bpobj);
2118 
2119 	spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
2120 
2121 	/*
2122 	 * Close all vdevs.
2123 	 */
2124 	if (spa->spa_root_vdev)
2125 		vdev_free(spa->spa_root_vdev);
2126 	ASSERT(spa->spa_root_vdev == NULL);
2127 
2128 	/*
2129 	 * Close the dsl pool.
2130 	 */
2131 	if (spa->spa_dsl_pool) {
2132 		dsl_pool_close(spa->spa_dsl_pool);
2133 		spa->spa_dsl_pool = NULL;
2134 		spa->spa_meta_objset = NULL;
2135 	}
2136 
2137 	ddt_unload(spa);
2138 	brt_unload(spa);
2139 	spa_unload_log_sm_metadata(spa);
2140 
2141 	/*
2142 	 * Drop and purge level 2 cache
2143 	 */
2144 	spa_l2cache_drop(spa);
2145 
2146 	if (spa->spa_spares.sav_vdevs) {
2147 		for (int i = 0; i < spa->spa_spares.sav_count; i++)
2148 			vdev_free(spa->spa_spares.sav_vdevs[i]);
2149 		kmem_free(spa->spa_spares.sav_vdevs,
2150 		    spa->spa_spares.sav_count * sizeof (void *));
2151 		spa->spa_spares.sav_vdevs = NULL;
2152 	}
2153 	if (spa->spa_spares.sav_config) {
2154 		nvlist_free(spa->spa_spares.sav_config);
2155 		spa->spa_spares.sav_config = NULL;
2156 	}
2157 	spa->spa_spares.sav_count = 0;
2158 
2159 	if (spa->spa_l2cache.sav_vdevs) {
2160 		for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
2161 			vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
2162 			vdev_free(spa->spa_l2cache.sav_vdevs[i]);
2163 		}
2164 		kmem_free(spa->spa_l2cache.sav_vdevs,
2165 		    spa->spa_l2cache.sav_count * sizeof (void *));
2166 		spa->spa_l2cache.sav_vdevs = NULL;
2167 	}
2168 	if (spa->spa_l2cache.sav_config) {
2169 		nvlist_free(spa->spa_l2cache.sav_config);
2170 		spa->spa_l2cache.sav_config = NULL;
2171 	}
2172 	spa->spa_l2cache.sav_count = 0;
2173 
2174 	spa->spa_async_suspended = 0;
2175 
2176 	spa->spa_indirect_vdevs_loaded = B_FALSE;
2177 
2178 	if (spa->spa_comment != NULL) {
2179 		spa_strfree(spa->spa_comment);
2180 		spa->spa_comment = NULL;
2181 	}
2182 	if (spa->spa_compatibility != NULL) {
2183 		spa_strfree(spa->spa_compatibility);
2184 		spa->spa_compatibility = NULL;
2185 	}
2186 
2187 	spa->spa_raidz_expand = NULL;
2188 	spa->spa_checkpoint_txg = 0;
2189 
2190 	spa_config_exit(spa, SCL_ALL, spa);
2191 }
2192 
2193 /*
2194  * Load (or re-load) the current list of vdevs describing the active spares for
2195  * this pool.  When this is called, we have some form of basic information in
2196  * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
2197  * then re-generate a more complete list including status information.
2198  */
2199 void
2200 spa_load_spares(spa_t *spa)
2201 {
2202 	nvlist_t **spares;
2203 	uint_t nspares;
2204 	int i;
2205 	vdev_t *vd, *tvd;
2206 
2207 #ifndef _KERNEL
2208 	/*
2209 	 * zdb opens both the current state of the pool and the
2210 	 * checkpointed state (if present), with a different spa_t.
2211 	 *
2212 	 * As spare vdevs are shared among open pools, we skip loading
2213 	 * them when we load the checkpointed state of the pool.
2214 	 */
2215 	if (!spa_writeable(spa))
2216 		return;
2217 #endif
2218 
2219 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
2220 
2221 	/*
2222 	 * First, close and free any existing spare vdevs.
2223 	 */
2224 	if (spa->spa_spares.sav_vdevs) {
2225 		for (i = 0; i < spa->spa_spares.sav_count; i++) {
2226 			vd = spa->spa_spares.sav_vdevs[i];
2227 
2228 			/* Undo the call to spa_activate() below */
2229 			if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
2230 			    B_FALSE)) != NULL && tvd->vdev_isspare)
2231 				spa_spare_remove(tvd);
2232 			vdev_close(vd);
2233 			vdev_free(vd);
2234 		}
2235 
2236 		kmem_free(spa->spa_spares.sav_vdevs,
2237 		    spa->spa_spares.sav_count * sizeof (void *));
2238 	}
2239 
2240 	if (spa->spa_spares.sav_config == NULL)
2241 		nspares = 0;
2242 	else
2243 		VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
2244 		    ZPOOL_CONFIG_SPARES, &spares, &nspares));
2245 
2246 	spa->spa_spares.sav_count = (int)nspares;
2247 	spa->spa_spares.sav_vdevs = NULL;
2248 
2249 	if (nspares == 0)
2250 		return;
2251 
2252 	/*
2253 	 * Construct the array of vdevs, opening them to get status in the
2254 	 * process.  For each spare, there are potentially two different vdev_t
2255 	 * structures associated with it: one in the list of spares (used only
2256 	 * for basic validation purposes) and one in the active vdev
2257 	 * configuration (if it's spared in).  During this phase we open and
2258 	 * validate each vdev on the spare list.  If the vdev also exists in the
2259 	 * active configuration, then we also mark this vdev as an active spare.
2260 	 */
2261 	spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *),
2262 	    KM_SLEEP);
2263 	for (i = 0; i < spa->spa_spares.sav_count; i++) {
2264 		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
2265 		    VDEV_ALLOC_SPARE) == 0);
2266 		ASSERT(vd != NULL);
2267 
2268 		spa->spa_spares.sav_vdevs[i] = vd;
2269 
2270 		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
2271 		    B_FALSE)) != NULL) {
2272 			if (!tvd->vdev_isspare)
2273 				spa_spare_add(tvd);
2274 
2275 			/*
2276 			 * We only mark the spare active if we were successfully
2277 			 * able to load the vdev.  Otherwise, importing a pool
2278 			 * with a bad active spare would result in strange
2279 	 * behavior, because multiple pools would think the spare
2280 			 * is actively in use.
2281 			 *
2282 			 * There is a vulnerability here to an equally bizarre
2283 			 * circumstance, where a dead active spare is later
2284 			 * brought back to life (onlined or otherwise).  Given
2285 			 * the rarity of this scenario, and the extra complexity
2286 			 * it adds, we ignore the possibility.
2287 			 */
2288 			if (!vdev_is_dead(tvd))
2289 				spa_spare_activate(tvd);
2290 		}
2291 
2292 		vd->vdev_top = vd;
2293 		vd->vdev_aux = &spa->spa_spares;
2294 
2295 		if (vdev_open(vd) != 0)
2296 			continue;
2297 
2298 		if (vdev_validate_aux(vd) == 0)
2299 			spa_spare_add(vd);
2300 	}
2301 
2302 	/*
2303 	 * Recompute the stashed list of spares, with status information
2304 	 * this time.
2305 	 */
2306 	fnvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES);
2307 
2308 	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
2309 	    KM_SLEEP);
2310 	for (i = 0; i < spa->spa_spares.sav_count; i++)
2311 		spares[i] = vdev_config_generate(spa,
2312 		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
2313 	fnvlist_add_nvlist_array(spa->spa_spares.sav_config,
2314 	    ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares,
2315 	    spa->spa_spares.sav_count);
2316 	for (i = 0; i < spa->spa_spares.sav_count; i++)
2317 		nvlist_free(spares[i]);
2318 	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
2319 }
2320 
2321 /*
2322  * Load (or re-load) the current list of vdevs describing the active l2cache for
2323  * this pool.  When this is called, we have some form of basic information in
2324  * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
2325  * then re-generate a more complete list including status information.
2326  * Devices which are already active have their details maintained, and are
2327  * not re-opened.
2328  */
2329 void
2330 spa_load_l2cache(spa_t *spa)
2331 {
2332 	nvlist_t **l2cache = NULL;
2333 	uint_t nl2cache;
2334 	int i, j, oldnvdevs;
2335 	uint64_t guid;
2336 	vdev_t *vd, **oldvdevs, **newvdevs;
2337 	spa_aux_vdev_t *sav = &spa->spa_l2cache;
2338 
2339 #ifndef _KERNEL
2340 	/*
2341 	 * zdb opens both the current state of the pool and the
2342 	 * checkpointed state (if present), with a different spa_t.
2343 	 *
2344 	 * As L2 caches are part of the ARC which is shared among open
2345 	 * pools, we skip loading them when we load the checkpointed
2346 	 * state of the pool.
2347 	 */
2348 	if (!spa_writeable(spa))
2349 		return;
2350 #endif
2351 
2352 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
2353 
2354 	oldvdevs = sav->sav_vdevs;
2355 	oldnvdevs = sav->sav_count;
2356 	sav->sav_vdevs = NULL;
2357 	sav->sav_count = 0;
2358 
2359 	if (sav->sav_config == NULL) {
2360 		nl2cache = 0;
2361 		newvdevs = NULL;
2362 		goto out;
2363 	}
2364 
2365 	VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config,
2366 	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache));
2367 	newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
2368 
2369 	/*
2370 	 * Process new nvlist of vdevs.
2371 	 */
2372 	for (i = 0; i < nl2cache; i++) {
2373 		guid = fnvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID);
2374 
2375 		newvdevs[i] = NULL;
2376 		for (j = 0; j < oldnvdevs; j++) {
2377 			vd = oldvdevs[j];
2378 			if (vd != NULL && guid == vd->vdev_guid) {
2379 				/*
2380 				 * Retain previous vdev for add/remove ops.
2381 				 */
2382 				newvdevs[i] = vd;
2383 				oldvdevs[j] = NULL;
2384 				break;
2385 			}
2386 		}
2387 
2388 		if (newvdevs[i] == NULL) {
2389 			/*
2390 			 * Create new vdev
2391 			 */
2392 			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
2393 			    VDEV_ALLOC_L2CACHE) == 0);
2394 			ASSERT(vd != NULL);
2395 			newvdevs[i] = vd;
2396 
2397 			/*
2398 			 * Commit this vdev as an l2cache device,
2399 			 * even if it fails to open.
2400 			 */
2401 			spa_l2cache_add(vd);
2402 
2403 			vd->vdev_top = vd;
2404 			vd->vdev_aux = sav;
2405 
2406 			spa_l2cache_activate(vd);
2407 
2408 			if (vdev_open(vd) != 0)
2409 				continue;
2410 
2411 			(void) vdev_validate_aux(vd);
2412 
2413 			if (!vdev_is_dead(vd))
2414 				l2arc_add_vdev(spa, vd);
2415 
2416 			/*
2417 			 * Upon cache device addition to a pool or pool
2418 			 * creation with a cache device, or if the header
2419 			 * of the device is invalid, we issue an async
2420 			 * TRIM command for the whole device, which will
2421 			 * execute if l2arc_trim_ahead > 0.
2422 			 */
2423 			spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM);
2424 		}
2425 	}
2426 
2427 	sav->sav_vdevs = newvdevs;
2428 	sav->sav_count = (int)nl2cache;
2429 
2430 	/*
2431 	 * Recompute the stashed list of l2cache devices, with status
2432 	 * information this time.
2433 	 */
2434 	fnvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE);
2435 
2436 	if (sav->sav_count > 0)
2437 		l2cache = kmem_alloc(sav->sav_count * sizeof (void *),
2438 		    KM_SLEEP);
2439 	for (i = 0; i < sav->sav_count; i++)
2440 		l2cache[i] = vdev_config_generate(spa,
2441 		    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
2442 	fnvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
2443 	    (const nvlist_t * const *)l2cache, sav->sav_count);
2444 
2445 out:
2446 	/*
2447 	 * Purge vdevs that were dropped
2448 	 */
2449 	if (oldvdevs) {
2450 		for (i = 0; i < oldnvdevs; i++) {
2451 			uint64_t pool;
2452 
2453 			vd = oldvdevs[i];
2454 			if (vd != NULL) {
2455 				ASSERT(vd->vdev_isl2cache);
2456 
2457 				if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
2458 				    pool != 0ULL && l2arc_vdev_present(vd))
2459 					l2arc_remove_vdev(vd);
2460 				vdev_clear_stats(vd);
2461 				vdev_free(vd);
2462 			}
2463 		}
2464 
2465 		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
2466 	}
2467 
2468 	for (i = 0; i < sav->sav_count; i++)
2469 		nvlist_free(l2cache[i]);
2470 	if (sav->sav_count)
2471 		kmem_free(l2cache, sav->sav_count * sizeof (void *));
2472 }
2473 
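/*
 * Read a packed nvlist out of the MOS: the packed size is stored in the
 * object's bonus buffer and the packed data in the object itself.
 */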
2474 static int
2475 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
2476 {
2477 	dmu_buf_t *db;
2478 	char *packed = NULL;
2479 	size_t nvsize = 0;
2480 	int error;
2481 	*value = NULL;
2482 
2483 	error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db);
2484 	if (error)
2485 		return (error);
2486 
2487 	nvsize = *(uint64_t *)db->db_data;
2488 	dmu_buf_rele(db, FTAG);
2489 
2490 	packed = vmem_alloc(nvsize, KM_SLEEP);
2491 	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
2492 	    DMU_READ_PREFETCH);
2493 	if (error == 0)
2494 		error = nvlist_unpack(packed, nvsize, value, 0);
2495 	vmem_free(packed, nvsize);
2496 
2497 	return (error);
2498 }
2499 
2500 /*
2501  * Concrete top-level vdevs that are not missing and are not logs. At every
2502  * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds.
2503  */
2504 static uint64_t
2505 spa_healthy_core_tvds(spa_t *spa)
2506 {
2507 	vdev_t *rvd = spa->spa_root_vdev;
2508 	uint64_t tvds = 0;
2509 
2510 	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
2511 		vdev_t *vd = rvd->vdev_child[i];
2512 		if (vd->vdev_islog)
2513 			continue;
2514 		if (vdev_is_concrete(vd) && !vdev_is_dead(vd))
2515 			tvds++;
2516 	}
2517 
2518 	return (tvds);
2519 }
2520 
2521 /*
2522  * Checks to see if the given vdev could not be opened, in which case we post a
2523  * sysevent to notify the autoreplace code that the device has been removed.
2524  */
2525 static void
2526 spa_check_removed(vdev_t *vd)
2527 {
2528 	for (uint64_t c = 0; c < vd->vdev_children; c++)
2529 		spa_check_removed(vd->vdev_child[c]);
2530 
2531 	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
2532 	    vdev_is_concrete(vd)) {
2533 		zfs_post_autoreplace(vd->vdev_spa, vd);
2534 		spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK);
2535 	}
2536 }
2537 
2538 static int
2539 spa_check_for_missing_logs(spa_t *spa)
2540 {
2541 	vdev_t *rvd = spa->spa_root_vdev;
2542 
2543 	/*
2544 	 * If we're doing a normal import, then build up any additional
2545 	 * diagnostic information about missing log devices.
2546 	 * We'll pass this up to the user for further processing.
2547 	 */
2548 	if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
2549 		nvlist_t **child, *nv;
2550 		uint64_t idx = 0;
2551 
2552 		child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t *),
2553 		    KM_SLEEP);
2554 		nv = fnvlist_alloc();
2555 
2556 		for (uint64_t c = 0; c < rvd->vdev_children; c++) {
2557 			vdev_t *tvd = rvd->vdev_child[c];
2558 
2559 			/*
2560 			 * We consider a device missing only if it failed to
2561 			 * open (i.e. offline or faulted devices are not
2562 			 * considered missing).
2563 			 */
2564 			if (tvd->vdev_islog &&
2565 			    tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
2566 				child[idx++] = vdev_config_generate(spa, tvd,
2567 				    B_FALSE, VDEV_CONFIG_MISSING);
2568 			}
2569 		}
2570 
2571 		if (idx > 0) {
2572 			fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
2573 			    (const nvlist_t * const *)child, idx);
2574 			fnvlist_add_nvlist(spa->spa_load_info,
2575 			    ZPOOL_CONFIG_MISSING_DEVICES, nv);
2576 
2577 			for (uint64_t i = 0; i < idx; i++)
2578 				nvlist_free(child[i]);
2579 		}
2580 		nvlist_free(nv);
2581 		kmem_free(child, rvd->vdev_children * sizeof (char **));
2582 
2583 		if (idx > 0) {
2584 			spa_load_failed(spa, "some log devices are missing");
2585 			vdev_dbgmsg_print_tree(rvd, 2);
2586 			return (SET_ERROR(ENXIO));
2587 		}
2588 	} else {
2589 		for (uint64_t c = 0; c < rvd->vdev_children; c++) {
2590 			vdev_t *tvd = rvd->vdev_child[c];
2591 
2592 			if (tvd->vdev_islog &&
2593 			    tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
2594 				spa_set_log_state(spa, SPA_LOG_CLEAR);
2595 				spa_load_note(spa, "some log devices are "
2596 				    "missing, ZIL is dropped.");
2597 				vdev_dbgmsg_print_tree(rvd, 2);
2598 				break;
2599 			}
2600 		}
2601 	}
2602 
2603 	return (0);
2604 }
2605 
2606 /*
2607  * Check for missing log devices
2608  */
2609 static boolean_t
2610 spa_check_logs(spa_t *spa)
2611 {
2612 	boolean_t rv = B_FALSE;
2613 	dsl_pool_t *dp = spa_get_dsl(spa);
2614 
2615 	switch (spa->spa_log_state) {
2616 	default:
2617 		break;
2618 	case SPA_LOG_MISSING:
2619 		/* need to recheck in case slog has been restored */
2620 	case SPA_LOG_UNKNOWN:
2621 		rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
2622 		    zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0);
2623 		if (rv)
2624 			spa_set_log_state(spa, SPA_LOG_MISSING);
2625 		break;
2626 	}
2627 	return (rv);
2628 }
2629 
2630 /*
2631  * Passivate any log vdevs (note, does not apply to embedded log metaslabs).
2632  */
2633 static boolean_t
2634 spa_passivate_log(spa_t *spa)
2635 {
2636 	vdev_t *rvd = spa->spa_root_vdev;
2637 	boolean_t slog_found = B_FALSE;
2638 
2639 	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
2640 
2641 	for (int c = 0; c < rvd->vdev_children; c++) {
2642 		vdev_t *tvd = rvd->vdev_child[c];
2643 
2644 		if (tvd->vdev_islog) {
2645 			ASSERT3P(tvd->vdev_log_mg, ==, NULL);
2646 			metaslab_group_passivate(tvd->vdev_mg);
2647 			slog_found = B_TRUE;
2648 		}
2649 	}
2650 
2651 	return (slog_found);
2652 }
2653 
2654 /*
2655  * Activate any log vdevs (note, does not apply to embedded log metaslabs).
2656  */
2657 static void
2658 spa_activate_log(spa_t *spa)
2659 {
2660 	vdev_t *rvd = spa->spa_root_vdev;
2661 
2662 	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
2663 
2664 	for (int c = 0; c < rvd->vdev_children; c++) {
2665 		vdev_t *tvd = rvd->vdev_child[c];
2666 
2667 		if (tvd->vdev_islog) {
2668 			ASSERT3P(tvd->vdev_log_mg, ==, NULL);
2669 			metaslab_group_activate(tvd->vdev_mg);
2670 		}
2671 	}
2672 }
2673 
2674 int
2675 spa_reset_logs(spa_t *spa)
2676 {
2677 	int error;
2678 
2679 	error = dmu_objset_find(spa_name(spa), zil_reset,
2680 	    NULL, DS_FIND_CHILDREN);
2681 	if (error == 0) {
2682 		/*
2683 		 * We successfully offlined the log device, sync out the
2684 		 * current txg so that the "stubby" block can be removed
2685 		 * by zil_sync().
2686 		 */
2687 		txg_wait_synced(spa->spa_dsl_pool, 0);
2688 	}
2689 	return (error);
2690 }
2691 
2692 static void
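/*
 * Apply spa_check_removed() to each auxiliary (spare or l2cache) vdev.
 */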
2693 spa_aux_check_removed(spa_aux_vdev_t *sav)
2694 {
2695 	for (int i = 0; i < sav->sav_count; i++)
2696 		spa_check_removed(sav->sav_vdevs[i]);
2697 }
2698 
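/*
 * Track the highest logical birth txg seen among successfully claimed log
 * blocks in spa_claim_max_txg.
 */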
2699 void
2700 spa_claim_notify(zio_t *zio)
2701 {
2702 	spa_t *spa = zio->io_spa;
2703 
2704 	if (zio->io_error)
2705 		return;
2706 
2707 	mutex_enter(&spa->spa_props_lock);	/* any mutex will do */
2708 	if (spa->spa_claim_max_txg < BP_GET_LOGICAL_BIRTH(zio->io_bp))
2709 		spa->spa_claim_max_txg = BP_GET_LOGICAL_BIRTH(zio->io_bp);
2710 	mutex_exit(&spa->spa_props_lock);
2711 }
2712 
2713 typedef struct spa_load_error {
2714 	boolean_t	sle_verify_data;
2715 	uint64_t	sle_meta_count;
2716 	uint64_t	sle_data_count;
2717 } spa_load_error_t;
2718 
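/*
 * Completion callback for the verification reads issued by
 * spa_load_verify_cb(): classify any error as metadata or data and release
 * the inflight-bytes reservation.
 */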
2719 static void
2720 spa_load_verify_done(zio_t *zio)
2721 {
2722 	blkptr_t *bp = zio->io_bp;
2723 	spa_load_error_t *sle = zio->io_private;
2724 	dmu_object_type_t type = BP_GET_TYPE(bp);
2725 	int error = zio->io_error;
2726 	spa_t *spa = zio->io_spa;
2727 
2728 	abd_free(zio->io_abd);
2729 	if (error) {
2730 		if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
2731 		    type != DMU_OT_INTENT_LOG)
2732 			atomic_inc_64(&sle->sle_meta_count);
2733 		else
2734 			atomic_inc_64(&sle->sle_data_count);
2735 	}
2736 
2737 	mutex_enter(&spa->spa_scrub_lock);
2738 	spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp);
2739 	cv_broadcast(&spa->spa_scrub_io_cv);
2740 	mutex_exit(&spa->spa_scrub_lock);
2741 }
2742 
2743 /*
2744  * Maximum number of inflight bytes is the ARC size shifted right by
2745  * spa_load_verify_shift; with the default shift of 4, 1/16th of the ARC.
2746  */
2747 static uint_t spa_load_verify_shift = 4;
2748 static int spa_load_verify_metadata = B_TRUE;
2749 static int spa_load_verify_data = B_TRUE;
2750 
2751 static int
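/*
 * traverse_pool() callback: sanity check the block pointer and issue a
 * speculative read for it, throttled by the inflight-bytes limit above.
 */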
2752 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
2753     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
2754 {
2755 	zio_t *rio = arg;
2756 	spa_load_error_t *sle = rio->io_private;
2757 
2758 	(void) zilog, (void) dnp;
2759 
2760 	/*
2761 	 * Note: normally this routine will not be called if
2762 	 * spa_load_verify_metadata is not set.  However, it may be useful
2763 	 * to manually set the flag after the traversal has begun.
2764 	 */
2765 	if (!spa_load_verify_metadata)
2766 		return (0);
2767 
2768 	/*
2769 	 * Sanity check the block pointer in order to detect obvious damage
2770 	 * before using the contents in subsequent checks or in zio_read().
2771 	 * When damaged consider it to be a metadata error since we cannot
2772 	 * trust the BP_GET_TYPE and BP_GET_LEVEL values.
2773 	 */
2774 	if (zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
2775 		atomic_inc_64(&sle->sle_meta_count);
2776 		return (0);
2777 	}
2778 
2779 	if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) ||
2780 	    BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp))
2781 		return (0);
2782 
2783 	if (!BP_IS_METADATA(bp) &&
2784 	    (!spa_load_verify_data || !sle->sle_verify_data))
2785 		return (0);
2786 
2787 	uint64_t maxinflight_bytes =
2788 	    arc_target_bytes() >> spa_load_verify_shift;
2789 	size_t size = BP_GET_PSIZE(bp);
2790 
2791 	mutex_enter(&spa->spa_scrub_lock);
2792 	while (spa->spa_load_verify_bytes >= maxinflight_bytes)
2793 		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
2794 	spa->spa_load_verify_bytes += size;
2795 	mutex_exit(&spa->spa_scrub_lock);
2796 
2797 	zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size,
2798 	    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
2799 	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
2800 	    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
2801 	return (0);
2802 }
2803 
2804 static int
2805 verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
2806 {
2807 	(void) dp, (void) arg;
2808 
2809 	if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN)
2810 		return (SET_ERROR(ENAMETOOLONG));
2811 
2812 	return (0);
2813 }
2814 
2815 static int
2816 spa_load_verify(spa_t *spa)
2817 {
2818 	zio_t *rio;
2819 	spa_load_error_t sle = { 0 };
2820 	zpool_load_policy_t policy;
2821 	boolean_t verify_ok = B_FALSE;
2822 	int error = 0;
2823 
2824 	zpool_get_load_policy(spa->spa_config, &policy);
2825 
2826 	if (policy.zlp_rewind & ZPOOL_NEVER_REWIND ||
2827 	    policy.zlp_maxmeta == UINT64_MAX)
2828 		return (0);
2829 
2830 	dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
2831 	error = dmu_objset_find_dp(spa->spa_dsl_pool,
2832 	    spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL,
2833 	    DS_FIND_CHILDREN);
2834 	dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
2835 	if (error != 0)
2836 		return (error);
2837 
2838 	/*
2839 	 * Verify data only if we are rewinding or an error limit was set.
2840 	 * Otherwise nothing but dbgmsg cares about it, so don't waste the time.
2841 	 */
2842 	sle.sle_verify_data = (policy.zlp_rewind & ZPOOL_REWIND_MASK) ||
2843 	    (policy.zlp_maxdata < UINT64_MAX);
2844 
2845 	rio = zio_root(spa, NULL, &sle,
2846 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
2847 
2848 	if (spa_load_verify_metadata) {
2849 		if (spa->spa_extreme_rewind) {
2850 			spa_load_note(spa, "performing a complete scan of the "
2851 			    "pool since extreme rewind is on. This may take "
2852 			    "a very long time.\n  (spa_load_verify_data=%u, "
2853 			    "spa_load_verify_metadata=%u)",
2854 			    spa_load_verify_data, spa_load_verify_metadata);
2855 		}
2856 
2857 		error = traverse_pool(spa, spa->spa_verify_min_txg,
2858 		    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
2859 		    TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio);
2860 	}
2861 
2862 	(void) zio_wait(rio);
2863 	ASSERT0(spa->spa_load_verify_bytes);
2864 
2865 	spa->spa_load_meta_errors = sle.sle_meta_count;
2866 	spa->spa_load_data_errors = sle.sle_data_count;
2867 
2868 	if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) {
2869 		spa_load_note(spa, "spa_load_verify found %llu metadata errors "
2870 		    "and %llu data errors", (u_longlong_t)sle.sle_meta_count,
2871 		    (u_longlong_t)sle.sle_data_count);
2872 	}
2873 
2874 	if (spa_load_verify_dryrun ||
2875 	    (!error && sle.sle_meta_count <= policy.zlp_maxmeta &&
2876 	    sle.sle_data_count <= policy.zlp_maxdata)) {
2877 		int64_t loss = 0;
2878 
2879 		verify_ok = B_TRUE;
2880 		spa->spa_load_txg = spa->spa_uberblock.ub_txg;
2881 		spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
2882 
2883 		loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
2884 		fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_TIME,
2885 		    spa->spa_load_txg_ts);
2886 		fnvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME,
2887 		    loss);
2888 		fnvlist_add_uint64(spa->spa_load_info,
2889 		    ZPOOL_CONFIG_LOAD_META_ERRORS, sle.sle_meta_count);
2890 		fnvlist_add_uint64(spa->spa_load_info,
2891 		    ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count);
2892 	} else {
2893 		spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
2894 	}
2895 
2896 	if (spa_load_verify_dryrun)
2897 		return (0);
2898 
2899 	if (error) {
2900 		if (error != ENXIO && error != EIO)
2901 			error = SET_ERROR(EIO);
2902 		return (error);
2903 	}
2904 
2905 	return (verify_ok ? 0 : EIO);
2906 }
2907 
2908 /*
2909  * Find a value in the pool props object.
2910  */
2911 static void
2912 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
2913 {
2914 	(void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
2915 	    zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
2916 }
2917 
2918 /*
2919  * Find a value in the pool directory object.
2920  */
2921 static int
2922 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent)
2923 {
2924 	int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
2925 	    name, sizeof (uint64_t), 1, val);
2926 
2927 	if (error != 0 && (error != ENOENT || log_enoent)) {
2928 		spa_load_failed(spa, "couldn't get '%s' value in MOS directory "
2929 		    "[error=%d]", name, error);
2930 	}
2931 
2932 	return (error);
2933 }
2934 
2935 static int
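/*
 * Mark the vdev as unable to open with the given aux state and return err.
 */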
2936 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
2937 {
2938 	vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
2939 	return (SET_ERROR(err));
2940 }
2941 
2942 boolean_t
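/*
 * True if there are livelists (deleted clones) still queued for deletion.
 */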
2943 spa_livelist_delete_check(spa_t *spa)
2944 {
2945 	return (spa->spa_livelists_to_delete != 0);
2946 }
2947 
2948 static boolean_t
2949 spa_livelist_delete_cb_check(void *arg, zthr_t *z)
2950 {
2951 	(void) z;
2952 	spa_t *spa = arg;
2953 	return (spa_livelist_delete_check(spa));
2954 }
2955 
2956 static int
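/*
 * bplist_iterate() callback used during livelist deletion: free the block
 * and deduct its size from the free dir's space accounting.
 */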
2957 delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
2958 {
2959 	spa_t *spa = arg;
2960 	zio_free(spa, tx->tx_txg, bp);
2961 	dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
2962 	    -bp_get_dsize_sync(spa, bp),
2963 	    -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx);
2964 	return (0);
2965 }
2966 
2967 static int
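/*
 * Retrieve the object number of the first livelist entry in the given ZAP
 * of deleted clones.
 */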
2968 dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp)
2969 {
2970 	int err;
2971 	zap_cursor_t zc;
2972 	zap_attribute_t *za = zap_attribute_alloc();
2973 	zap_cursor_init(&zc, os, zap_obj);
2974 	err = zap_cursor_retrieve(&zc, za);
2975 	zap_cursor_fini(&zc);
2976 	if (err == 0)
2977 		*llp = za->za_first_integer;
2978 	zap_attribute_free(za);
2979 	return (err);
2980 }
2981 
2982 /*
2983  * Components of livelist deletion that must be performed in syncing
2984  * context: freeing block pointers and updating the pool-wide data
2985  * structures to indicate how much work is left to do
2986  */
2987 typedef struct sublist_delete_arg {
2988 	spa_t *spa;
2989 	dsl_deadlist_t *ll;
2990 	uint64_t key;
2991 	bplist_t *to_free;
2992 } sublist_delete_arg_t;
2993 
2994 static void
2995 sublist_delete_sync(void *arg, dmu_tx_t *tx)
2996 {
2997 	sublist_delete_arg_t *sda = arg;
2998 	spa_t *spa = sda->spa;
2999 	dsl_deadlist_t *ll = sda->ll;
3000 	uint64_t key = sda->key;
3001 	bplist_t *to_free = sda->to_free;
3002 
3003 	bplist_iterate(to_free, delete_blkptr_cb, spa, tx);
3004 	dsl_deadlist_remove_entry(ll, key, tx);
3005 }
3006 
3007 typedef struct livelist_delete_arg {
3008 	spa_t *spa;
3009 	uint64_t ll_obj;
3010 	uint64_t zap_obj;
3011 } livelist_delete_arg_t;
3012 
3013 static void
3014 livelist_delete_sync(void *arg, dmu_tx_t *tx)
3015 {
3016 	livelist_delete_arg_t *lda = arg;
3017 	spa_t *spa = lda->spa;
3018 	uint64_t ll_obj = lda->ll_obj;
3019 	uint64_t zap_obj = lda->zap_obj;
3020 	objset_t *mos = spa->spa_meta_objset;
3021 	uint64_t count;
3022 
3023 	/* free the livelist and decrement the feature count */
3024 	VERIFY0(zap_remove_int(mos, zap_obj, ll_obj, tx));
3025 	dsl_deadlist_free(mos, ll_obj, tx);
3026 	spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx);
3027 	VERIFY0(zap_count(mos, zap_obj, &count));
3028 	if (count == 0) {
3029 		/* no more livelists to delete */
3030 		VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
3031 		    DMU_POOL_DELETED_CLONES, tx));
3032 		VERIFY0(zap_destroy(mos, zap_obj, tx));
3033 		spa->spa_livelists_to_delete = 0;
3034 		spa_notify_waiters(spa);
3035 	}
3036 }
3037 
3038 /*
3039  * Load in the value for the livelist to be removed and open it. Then,
3040  * load its first sublist and determine which block pointers should actually
3041  * be freed. Then, call a synctask which performs the actual frees and updates
3042  * the pool-wide livelist data.
3043  */
3044 static void
3045 spa_livelist_delete_cb(void *arg, zthr_t *z)
3046 {
3047 	spa_t *spa = arg;
3048 	uint64_t ll_obj = 0, count;
3049 	objset_t *mos = spa->spa_meta_objset;
3050 	uint64_t zap_obj = spa->spa_livelists_to_delete;
3051 	/*
3052 	 * Determine the next livelist to delete. This function should only
3053 	 * be called if there is at least one deleted clone.
3054 	 */
3055 	VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj));
3056 	VERIFY0(zap_count(mos, ll_obj, &count));
3057 	if (count > 0) {
3058 		dsl_deadlist_t *ll;
3059 		dsl_deadlist_entry_t *dle;
3060 		bplist_t to_free;
3061 		ll = kmem_zalloc(sizeof (dsl_deadlist_t), KM_SLEEP);
3062 		VERIFY0(dsl_deadlist_open(ll, mos, ll_obj));
3063 		dle = dsl_deadlist_first(ll);
3064 		ASSERT3P(dle, !=, NULL);
3065 		bplist_create(&to_free);
3066 		int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free,
3067 		    z, NULL);
3068 		if (err == 0) {
3069 			sublist_delete_arg_t sync_arg = {
3070 			    .spa = spa,
3071 			    .ll = ll,
3072 			    .key = dle->dle_mintxg,
3073 			    .to_free = &to_free
3074 			};
3075 			zfs_dbgmsg("deleting sublist (id %llu) from"
3076 			    " livelist %llu, %lld remaining",
3077 			    (u_longlong_t)dle->dle_bpobj.bpo_object,
3078 			    (u_longlong_t)ll_obj, (longlong_t)count - 1);
3079 			VERIFY0(dsl_sync_task(spa_name(spa), NULL,
3080 			    sublist_delete_sync, &sync_arg, 0,
3081 			    ZFS_SPACE_CHECK_DESTROY));
3082 		} else {
3083 			VERIFY3U(err, ==, EINTR);
3084 		}
3085 		bplist_clear(&to_free);
3086 		bplist_destroy(&to_free);
3087 		dsl_deadlist_close(ll);
3088 		kmem_free(ll, sizeof (dsl_deadlist_t));
3089 	} else {
3090 		livelist_delete_arg_t sync_arg = {
3091 		    .spa = spa,
3092 		    .ll_obj = ll_obj,
3093 		    .zap_obj = zap_obj
3094 		};
3095 		zfs_dbgmsg("deletion of livelist %llu completed",
3096 		    (u_longlong_t)ll_obj);
3097 		VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync,
3098 		    &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY));
3099 	}
3100 }
3101 
3102 static void
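/*
 * Create the zthr that deletes livelists of destroyed clones in the
 * background.
 */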
3103 spa_start_livelist_destroy_thread(spa_t *spa)
3104 {
3105 	ASSERT3P(spa->spa_livelist_delete_zthr, ==, NULL);
3106 	spa->spa_livelist_delete_zthr =
3107 	    zthr_create("z_livelist_destroy",
3108 	    spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa,
3109 	    minclsyspri);
3110 }
3111 
3112 typedef struct livelist_new_arg {
3113 	bplist_t *allocs;
3114 	bplist_t *frees;
3115 } livelist_new_arg_t;
3116 
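/*
 * Iteration callback: sort blkptrs appended to the livelist while condensing
 * was in progress into separate ALLOC and FREE bplists.
 */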
3117 static int
3118 livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
3119     dmu_tx_t *tx)
3120 {
3121 	ASSERT(tx == NULL);
3122 	livelist_new_arg_t *lna = arg;
3123 	if (bp_freed) {
3124 		bplist_append(lna->frees, bp);
3125 	} else {
3126 		bplist_append(lna->allocs, bp);
3127 		zfs_livelist_condense_new_alloc++;
3128 	}
3129 	return (0);
3130 }
3131 
3132 typedef struct livelist_condense_arg {
3133 	spa_t *spa;
3134 	bplist_t to_keep;
3135 	uint64_t first_size;
3136 	uint64_t next_size;
3137 } livelist_condense_arg_t;
3138 
3139 static void
3140 spa_livelist_condense_sync(void *arg, dmu_tx_t *tx)
3141 {
3142 	livelist_condense_arg_t *lca = arg;
3143 	spa_t *spa = lca->spa;
3144 	bplist_t new_frees;
3145 	dsl_dataset_t *ds = spa->spa_to_condense.ds;
3146 
3147 	/* Have we been cancelled? */
3148 	if (spa->spa_to_condense.cancelled) {
3149 		zfs_livelist_condense_sync_cancel++;
3150 		goto out;
3151 	}
3152 
3153 	dsl_deadlist_entry_t *first = spa->spa_to_condense.first;
3154 	dsl_deadlist_entry_t *next = spa->spa_to_condense.next;
3155 	dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist;
3156 
3157 	/*
3158 	 * It's possible that the livelist was changed while the zthr was
3159 	 * running. Therefore, we need to check for new blkptrs in the two
3160 	 * entries being condensed and continue to track them in the livelist.
3161 	 * Because of the way we handle remapped blkptrs (see dbuf_remap_impl),
3162 	 * it's possible that the newly added blkptrs are FREEs or ALLOCs so
3163 	 * we need to sort them into two different bplists.
3164 	 */
3165 	uint64_t first_obj = first->dle_bpobj.bpo_object;
3166 	uint64_t next_obj = next->dle_bpobj.bpo_object;
3167 	uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs;
3168 	uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs;
3169 
3170 	bplist_create(&new_frees);
3171 	livelist_new_arg_t new_bps = {
3172 	    .allocs = &lca->to_keep,
3173 	    .frees = &new_frees,
3174 	};
3175 
3176 	if (cur_first_size > lca->first_size) {
3177 		VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj,
3178 		    livelist_track_new_cb, &new_bps, lca->first_size));
3179 	}
3180 	if (cur_next_size > lca->next_size) {
3181 		VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj,
3182 		    livelist_track_new_cb, &new_bps, lca->next_size));
3183 	}
3184 
3185 	dsl_deadlist_clear_entry(first, ll, tx);
3186 	ASSERT(bpobj_is_empty(&first->dle_bpobj));
3187 	dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx);
3188 
3189 	bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx);
3190 	bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx);
3191 	bplist_destroy(&new_frees);
3192 
3193 	char dsname[ZFS_MAX_DATASET_NAME_LEN];
3194 	dsl_dataset_name(ds, dsname);
3195 	zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu "
3196 	    "(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu "
3197 	    "(%llu blkptrs)", (u_longlong_t)tx->tx_txg, dsname,
3198 	    (u_longlong_t)ds->ds_object, (u_longlong_t)first_obj,
3199 	    (u_longlong_t)cur_first_size, (u_longlong_t)next_obj,
3200 	    (u_longlong_t)cur_next_size,
3201 	    (u_longlong_t)first->dle_bpobj.bpo_object,
3202 	    (u_longlong_t)first->dle_bpobj.bpo_phys->bpo_num_blkptrs);
3203 out:
3204 	dmu_buf_rele(ds->ds_dbuf, spa);
3205 	spa->spa_to_condense.ds = NULL;
3206 	bplist_clear(&lca->to_keep);
3207 	bplist_destroy(&lca->to_keep);
3208 	kmem_free(lca, sizeof (livelist_condense_arg_t));
3209 	spa->spa_to_condense.syncing = B_FALSE;
3210 }
3211 
3212 static void
3213 spa_livelist_condense_cb(void *arg, zthr_t *t)
3214 {
3215 	while (zfs_livelist_condense_zthr_pause &&
3216 	    !(zthr_has_waiters(t) || zthr_iscancelled(t)))
3217 		delay(1);
3218 
3219 	spa_t *spa = arg;
3220 	dsl_deadlist_entry_t *first = spa->spa_to_condense.first;
3221 	dsl_deadlist_entry_t *next = spa->spa_to_condense.next;
3222 	uint64_t first_size, next_size;
3223 
3224 	livelist_condense_arg_t *lca =
3225 	    kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP);
3226 	bplist_create(&lca->to_keep);
3227 
3228 	/*
3229 	 * Process the livelists (matching FREEs and ALLOCs) in open context
3230 	 * so we have minimal work in syncing context to condense.
3231 	 *
3232 	 * We save bpobj sizes (first_size and next_size) to use later in
3233 	 * syncing context to determine if entries were added to these sublists
3234 	 * while in open context. This is possible because the clone is still
3235 	 * active and open for normal writes and we want to make sure the new,
3236 	 * unprocessed blockpointers are inserted into the livelist normally.
3237 	 *
3238 	 * Note that dsl_process_sub_livelist() both records the number of
3239 	 * blockpointers and iterates over them while holding the bpobj's lock,
3240 	 * so the sizes returned to us are consistent with what was actually
3241 	 * processed.
3242 	 */
3243 	int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t,
3244 	    &first_size);
3245 	if (err == 0)
3246 		err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep,
3247 		    t, &next_size);
3248 
3249 	if (err == 0) {
3250 		while (zfs_livelist_condense_sync_pause &&
3251 		    !(zthr_has_waiters(t) || zthr_iscancelled(t)))
3252 			delay(1);
3253 
3254 		dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
3255 		dmu_tx_mark_netfree(tx);
3256 		dmu_tx_hold_space(tx, 1);
3257 		err = dmu_tx_assign(tx, DMU_TX_NOWAIT | DMU_TX_NOTHROTTLE);
3258 		if (err == 0) {
3259 			/*
3260 			 * Prevent the condense zthr restarting before
3261 			 * the synctask completes.
3262 			 */
3263 			spa->spa_to_condense.syncing = B_TRUE;
3264 			lca->spa = spa;
3265 			lca->first_size = first_size;
3266 			lca->next_size = next_size;
3267 			dsl_sync_task_nowait(spa_get_dsl(spa),
3268 			    spa_livelist_condense_sync, lca, tx);
3269 			dmu_tx_commit(tx);
3270 			return;
3271 		}
3272 	}
3273 	/*
3274 	 * Condensing cannot continue: either it was externally stopped or
3275 	 * we were unable to assign a tx because the pool has run out of
3276 	 * space. In the second case, we'll just end up trying to condense
3277 	 * again in a later txg.
3278 	 */
3279 	ASSERT(err != 0);
3280 	bplist_clear(&lca->to_keep);
3281 	bplist_destroy(&lca->to_keep);
3282 	kmem_free(lca, sizeof (livelist_condense_arg_t));
3283 	dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa);
3284 	spa->spa_to_condense.ds = NULL;
3285 	if (err == EINTR)
3286 		zfs_livelist_condense_zthr_cancel++;
3287 }
3288 
3289 /*
3290  * Check that there is something to condense but that a condense is not
3291  * already in progress and that condensing has not been cancelled.
3292  */
3293 static boolean_t
3294 spa_livelist_condense_cb_check(void *arg, zthr_t *z)
3295 {
3296 	(void) z;
3297 	spa_t *spa = arg;
3298 	if ((spa->spa_to_condense.ds != NULL) &&
3299 	    (spa->spa_to_condense.syncing == B_FALSE) &&
3300 	    (spa->spa_to_condense.cancelled == B_FALSE)) {
3301 		return (B_TRUE);
3302 	}
3303 	return (B_FALSE);
3304 }
3305 
3306 static void
3307 spa_start_livelist_condensing_thread(spa_t *spa)
3308 {
3309 	spa->spa_to_condense.ds = NULL;
3310 	spa->spa_to_condense.first = NULL;
3311 	spa->spa_to_condense.next = NULL;
3312 	spa->spa_to_condense.syncing = B_FALSE;
3313 	spa->spa_to_condense.cancelled = B_FALSE;
3314 
3315 	ASSERT3P(spa->spa_livelist_condense_zthr, ==, NULL);
3316 	spa->spa_livelist_condense_zthr =
3317 	    zthr_create("z_livelist_condense",
3318 	    spa_livelist_condense_cb_check,
3319 	    spa_livelist_condense_cb, spa, minclsyspri);
3320 }
3321 
3322 static void
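/*
 * Start the auxiliary zthrs: raidz expansion, indirect and livelist
 * condensing, livelist destruction, and checkpoint discard.
 */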
3323 spa_spawn_aux_threads(spa_t *spa)
3324 {
3325 	ASSERT(spa_writeable(spa));
3326 
3327 	spa_start_raidz_expansion_thread(spa);
3328 	spa_start_indirect_condensing_thread(spa);
3329 	spa_start_livelist_destroy_thread(spa);
3330 	spa_start_livelist_condensing_thread(spa);
3331 
3332 	ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL);
3333 	spa->spa_checkpoint_discard_zthr =
3334 	    zthr_create("z_checkpoint_discard",
3335 	    spa_checkpoint_discard_thread_check,
3336 	    spa_checkpoint_discard_thread, spa, minclsyspri);
3337 }
3338 
3339 /*
3340  * Fix up config after a partly-completed split.  This is done with the
3341  * ZPOOL_CONFIG_SPLIT nvlist.  Both the splitting pool and the split-off
3342  * pool have that entry in their config, but only the splitting one contains
3343  * a list of all the guids of the vdevs that are being split off.
3344  *
3345  * This function determines what to do with that list: either rejoin
3346  * all the disks to the pool, or complete the splitting process.  To attempt
3347  * the rejoin, each disk that is offlined is marked online again, and
3348  * we do a reopen() call.  If the vdev label for every disk that was
3349  * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
3350  * then we call vdev_split() on each disk, and complete the split.
3351  *
3352  * Otherwise we leave the config alone, with all the vdevs in place in
3353  * the original pool.
3354  */
3355 static void
3356 spa_try_repair(spa_t *spa, nvlist_t *config)
3357 {
3358 	uint_t extracted;
3359 	uint64_t *glist;
3360 	uint_t i, gcount;
3361 	nvlist_t *nvl;
3362 	vdev_t **vd;
3363 	boolean_t attempt_reopen;
3364 
3365 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
3366 		return;
3367 
3368 	/* check that the config is complete */
3369 	if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
3370 	    &glist, &gcount) != 0)
3371 		return;
3372 
3373 	vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);
3374 
3375 	/* attempt to online all the vdevs & validate */
3376 	attempt_reopen = B_TRUE;
3377 	for (i = 0; i < gcount; i++) {
3378 		if (glist[i] == 0)	/* vdev is hole */
3379 			continue;
3380 
3381 		vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
3382 		if (vd[i] == NULL) {
3383 			/*
3384 			 * Don't bother attempting to reopen the disks;
3385 			 * just do the split.
3386 			 */
3387 			attempt_reopen = B_FALSE;
3388 		} else {
3389 			/* attempt to re-online it */
3390 			vd[i]->vdev_offline = B_FALSE;
3391 		}
3392 	}
3393 
3394 	if (attempt_reopen) {
3395 		vdev_reopen(spa->spa_root_vdev);
3396 
3397 		/* check each device to see what state it's in */
3398 		for (extracted = 0, i = 0; i < gcount; i++) {
3399 			if (vd[i] != NULL &&
3400 			    vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
3401 				break;
3402 			++extracted;
3403 		}
3404 	}
3405 
3406 	/*
3407 	 * If every disk has been moved to the new pool, or if we never
3408 	 * even attempted to look at them, then we split them off for
3409 	 * good.
3410 	 */
3411 	if (!attempt_reopen || gcount == extracted) {
3412 		for (i = 0; i < gcount; i++)
3413 			if (vd[i] != NULL)
3414 				vdev_split(vd[i]);
3415 		vdev_reopen(spa->spa_root_vdev);
3416 	}
3417 
3418 	kmem_free(vd, gcount * sizeof (vdev_t *));
3419 }
3420 
3421 static int
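/*
 * Drive a pool load: record the load state and progress, call
 * spa_load_impl(), and post an ereport if the load fails.
 */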
3422 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type)
3423 {
3424 	const char *ereport = FM_EREPORT_ZFS_POOL;
3425 	int error;
3426 
3427 	spa->spa_load_state = state;
3428 	(void) spa_import_progress_set_state(spa_guid(spa),
3429 	    spa_load_state(spa));
3430 	spa_import_progress_set_notes(spa, "spa_load()");
3431 
3432 	gethrestime(&spa->spa_loaded_ts);
3433 	error = spa_load_impl(spa, type, &ereport);
3434 
3435 	/*
3436 	 * Don't count references from objsets that are already closed
3437 	 * and are making their way through the eviction process.
3438 	 */
3439 	spa_evicting_os_wait(spa);
3440 	spa->spa_minref = zfs_refcount_count(&spa->spa_refcount);
3441 	if (error) {
3442 		if (error != EEXIST) {
3443 			spa->spa_loaded_ts.tv_sec = 0;
3444 			spa->spa_loaded_ts.tv_nsec = 0;
3445 		}
3446 		if (error != EBADF) {
3447 			(void) zfs_ereport_post(ereport, spa,
3448 			    NULL, NULL, NULL, 0);
3449 		}
3450 	}
3451 	spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
3452 	spa->spa_ena = 0;
3453 
3454 	(void) spa_import_progress_set_state(spa_guid(spa),
3455 	    spa_load_state(spa));
3456 
3457 	return (error);
3458 }
3459 
3460 #ifdef ZFS_DEBUG
3461 /*
3462  * Count the number of per-vdev ZAPs associated with all of the vdevs in the
3463  * vdev tree rooted in the given vd, and ensure that each ZAP is present in the
3464  * spa's per-vdev ZAP list.
3465  */
3466 static uint64_t
3467 vdev_count_verify_zaps(vdev_t *vd)
3468 {
3469 	spa_t *spa = vd->vdev_spa;
3470 	uint64_t total = 0;
3471 
3472 	if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2) &&
3473 	    vd->vdev_root_zap != 0) {
3474 		total++;
3475 		ASSERT0(zap_lookup_int(spa->spa_meta_objset,
3476 		    spa->spa_all_vdev_zaps, vd->vdev_root_zap));
3477 	}
3478 	if (vd->vdev_top_zap != 0) {
3479 		total++;
3480 		ASSERT0(zap_lookup_int(spa->spa_meta_objset,
3481 		    spa->spa_all_vdev_zaps, vd->vdev_top_zap));
3482 	}
3483 	if (vd->vdev_leaf_zap != 0) {
3484 		total++;
3485 		ASSERT0(zap_lookup_int(spa->spa_meta_objset,
3486 		    spa->spa_all_vdev_zaps, vd->vdev_leaf_zap));
3487 	}
3488 
3489 	for (uint64_t i = 0; i < vd->vdev_children; i++) {
3490 		total += vdev_count_verify_zaps(vd->vdev_child[i]);
3491 	}
3492 
3493 	return (total);
3494 }
3495 #else
3496 #define	vdev_count_verify_zaps(vd) ((void) sizeof (vd), 0)
3497 #endif
3498 
3499 /*
3500  * Determine whether the activity check is required.
3501  */
3502 static boolean_t
3503 spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label,
3504     nvlist_t *config)
3505 {
3506 	uint64_t state = 0;
3507 	uint64_t hostid = 0;
3508 	uint64_t tryconfig_txg = 0;
3509 	uint64_t tryconfig_timestamp = 0;
3510 	uint16_t tryconfig_mmp_seq = 0;
3511 	nvlist_t *nvinfo;
3512 
3513 	if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) {
3514 		nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO);
3515 		(void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG,
3516 		    &tryconfig_txg);
3517 		(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
3518 		    &tryconfig_timestamp);
3519 		(void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ,
3520 		    &tryconfig_mmp_seq);
3521 	}
3522 
3523 	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state);
3524 
3525 	/*
3526 	 * Disable the MMP activity check.  This is used by zdb, which is
3527 	 * intended to be used on potentially active pools.
3528 	 */
3529 	if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP)
3530 		return (B_FALSE);
3531 
3532 	/*
3533 	 * Skip the activity check when the MMP feature is disabled.
3534 	 */
3535 	if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0)
3536 		return (B_FALSE);
3537 
3538 	/*
3539 	 * If the tryconfig_ values are nonzero, they are the results of an
3540 	 * earlier tryimport.  If they all match the uberblock we just found,
3541 	 * then the pool has not changed and we return false so we do not test
3542 	 * a second time.
3543 	 */
3544 	if (tryconfig_txg && tryconfig_txg == ub->ub_txg &&
3545 	    tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp &&
3546 	    tryconfig_mmp_seq && tryconfig_mmp_seq ==
3547 	    (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0))
3548 		return (B_FALSE);
3549 
3550 	/*
3551 	 * Allow the activity check to be skipped when importing the pool
3552 	 * on the same host which last imported it.  Since the hostid in the
3553 	 * configuration may be stale, use the one read from the label.
3554 	 */
3555 	if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID))
3556 		hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID);
3557 
3558 	if (hostid == spa_get_hostid(spa))
3559 		return (B_FALSE);
3560 
3561 	/*
3562 	 * Skip the activity test when the pool was cleanly exported.
3563 	 */
3564 	if (state != POOL_STATE_ACTIVE)
3565 		return (B_FALSE);
3566 
3567 	return (B_TRUE);
3568 }
3569 
3570 /*
3571  * Number of nanoseconds the activity check must watch for on-disk changes.
3572  */
3573 static uint64_t
3574 spa_activity_check_duration(spa_t *spa, uberblock_t *ub)
3575 {
3576 	uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1);
3577 	uint64_t multihost_interval = MSEC2NSEC(
3578 	    MMP_INTERVAL_OK(zfs_multihost_interval));
3579 	uint64_t import_delay = MAX(NANOSEC, import_intervals *
3580 	    multihost_interval);
3581 
3582 	/*
3583 	 * Local tunables determine a minimum duration, except for the case
3584 	 * where we know when the remote host will suspend the pool if MMP
3585 	 * writes do not land.
3586 	 *
3587 	 * See Big Theory comment at the top of mmp.c for the reasoning behind
3588 	 * these cases and times.
3589 	 */
3590 
3591 	ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100);
3592 
3593 	if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) &&
3594 	    MMP_FAIL_INT(ub) > 0) {
3595 
3596 		/* MMP on remote host will suspend pool after failed writes */
3597 		import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) *
3598 		    MMP_IMPORT_SAFETY_FACTOR / 100;
3599 
3600 		zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp "
3601 		    "mmp_fails=%llu ub_mmp mmp_interval=%llu "
3602 		    "import_intervals=%llu", (u_longlong_t)import_delay,
3603 		    (u_longlong_t)MMP_FAIL_INT(ub),
3604 		    (u_longlong_t)MMP_INTERVAL(ub),
3605 		    (u_longlong_t)import_intervals);
3606 
3607 	} else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) &&
3608 	    MMP_FAIL_INT(ub) == 0) {
3609 
3610 		/* MMP on remote host will never suspend pool */
3611 		import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) +
3612 		    ub->ub_mmp_delay) * import_intervals);
3613 
3614 		zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp "
3615 		    "mmp_interval=%llu ub_mmp_delay=%llu "
3616 		    "import_intervals=%llu", (u_longlong_t)import_delay,
3617 		    (u_longlong_t)MMP_INTERVAL(ub),
3618 		    (u_longlong_t)ub->ub_mmp_delay,
3619 		    (u_longlong_t)import_intervals);
3620 
3621 	} else if (MMP_VALID(ub)) {
3622 		/*
3623 		 * zfs-0.7 compatibility case
3624 		 */
3625 
3626 		import_delay = MAX(import_delay, (multihost_interval +
3627 		    ub->ub_mmp_delay) * import_intervals);
3628 
3629 		zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu "
3630 		    "import_intervals=%llu leaves=%u",
3631 		    (u_longlong_t)import_delay,
3632 		    (u_longlong_t)ub->ub_mmp_delay,
3633 		    (u_longlong_t)import_intervals,
3634 		    vdev_count_leaves(spa));
3635 	} else {
3636 		/* Using local tunings is the only reasonable option */
3637 		zfs_dbgmsg("pool last imported on non-MMP aware "
3638 		    "host using import_delay=%llu multihost_interval=%llu "
3639 		    "import_intervals=%llu", (u_longlong_t)import_delay,
3640 		    (u_longlong_t)multihost_interval,
3641 		    (u_longlong_t)import_intervals);
3642 	}
3643 
3644 	return (import_delay);
3645 }
3646 
3647 /*
3648  * Remote host activity check.
3649  *
3650  * error results:
3651  *          0 - no activity detected
3652  *  EREMOTEIO - remote activity detected
3653  *      EINTR - user canceled the operation
3654  */
3655 static int
3656 spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config,
3657     boolean_t importing)
3658 {
3659 	uint64_t txg = ub->ub_txg;
3660 	uint64_t timestamp = ub->ub_timestamp;
3661 	uint64_t mmp_config = ub->ub_mmp_config;
3662 	uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0;
3663 	uint64_t import_delay;
3664 	hrtime_t import_expire, now;
3665 	nvlist_t *mmp_label = NULL;
3666 	vdev_t *rvd = spa->spa_root_vdev;
3667 	kcondvar_t cv;
3668 	kmutex_t mtx;
3669 	int error = 0;
3670 
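	/*
	 * A private cv/mutex pair lets the polling loop below sleep in short
	 * intervals while remaining interruptible by signals.
	 */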
3671 	cv_init(&cv, NULL, CV_DEFAULT, NULL);
3672 	mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL);
3673 	mutex_enter(&mtx);
3674 
3675 	/*
3676 	 * If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed
3677 	 * If ZPOOL_CONFIG_MMP_TXG is present, an activity check was performed
3678 	 * the pool is known to be active on another host.
3679 	 *
3680 	 * Otherwise, the pool might be in use on another host.  Check for
3681 	 * changes in the uberblocks on disk if necessary.
3682 	 */
3683 	if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) {
3684 		nvlist_t *nvinfo = fnvlist_lookup_nvlist(config,
3685 		    ZPOOL_CONFIG_LOAD_INFO);
3686 
3687 		if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) &&
3688 		    fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) {
3689 			vdev_uberblock_load(rvd, ub, &mmp_label);
3690 			error = SET_ERROR(EREMOTEIO);
3691 			goto out;
3692 		}
3693 	}
3694 
3695 	import_delay = spa_activity_check_duration(spa, ub);
3696 
3697 	/* Add a small random factor in case of simultaneous imports (0-25%) */
3698 	import_delay += import_delay * random_in_range(250) / 1000;
3699 
3700 	import_expire = gethrtime() + import_delay;
3701 
3702 	if (importing) {
3703 		spa_import_progress_set_notes(spa, "Checking MMP activity, "
3704 		    "waiting %llu ms", (u_longlong_t)NSEC2MSEC(import_delay));
3705 	}
3706 
3707 	int iterations = 0;
3708 	while ((now = gethrtime()) < import_expire) {
3709 		if (importing && iterations++ % 30 == 0) {
3710 			spa_import_progress_set_notes(spa, "Checking MMP "
3711 			    "activity, %llu ms remaining",
3712 			    (u_longlong_t)NSEC2MSEC(import_expire - now));
3713 		}
3714 
3715 		if (importing) {
3716 			(void) spa_import_progress_set_mmp_check(spa_guid(spa),
3717 			    NSEC2SEC(import_expire - gethrtime()));
3718 		}
3719 
3720 		vdev_uberblock_load(rvd, ub, &mmp_label);
3721 
3722 		if (txg != ub->ub_txg || timestamp != ub->ub_timestamp ||
3723 		    mmp_seq != (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) {
3724 			zfs_dbgmsg("multihost activity detected "
3725 			    "txg %llu ub_txg  %llu "
3726 			    "timestamp %llu ub_timestamp  %llu "
3727 			    "mmp_config %#llx ub_mmp_config %#llx",
3728 			    (u_longlong_t)txg, (u_longlong_t)ub->ub_txg,
3729 			    (u_longlong_t)timestamp,
3730 			    (u_longlong_t)ub->ub_timestamp,
3731 			    (u_longlong_t)mmp_config,
3732 			    (u_longlong_t)ub->ub_mmp_config);
3733 
3734 			error = SET_ERROR(EREMOTEIO);
3735 			break;
3736 		}
3737 
3738 		if (mmp_label) {
3739 			nvlist_free(mmp_label);
3740 			mmp_label = NULL;
3741 		}
3742 
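		/*
		 * Sleep for roughly one second.  cv_timedwait_sig() returns -1
		 * only when the full timeout elapses; any other return value
		 * indicates the wait was interrupted (e.g. by a signal), which
		 * we treat as the user cancelling the activity check.
		 */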
3743 		error = cv_timedwait_sig(&cv, &mtx, ddi_get_lbolt() + hz);
3744 		if (error != -1) {
3745 			error = SET_ERROR(EINTR);
3746 			break;
3747 		}
3748 		error = 0;
3749 	}
3750 
3751 out:
3752 	mutex_exit(&mtx);
3753 	mutex_destroy(&mtx);
3754 	cv_destroy(&cv);
3755 
3756 	/*
3757 	 * If the pool is determined to be active, store the status in the
3758 	 * spa->spa_load_info nvlist.  If the remote hostname or hostid are
3759 	 * available from the configuration read from disk, store them as well.
3760 	 * This allows 'zpool import' to generate a more useful message.
3761 	 *
3762 	 * ZPOOL_CONFIG_MMP_STATE    - observed pool status (mandatory)
3763 	 * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool
3764 	 * ZPOOL_CONFIG_MMP_HOSTID   - hostid from the active pool
3765 	 */
3766 	if (error == EREMOTEIO) {
3767 		const char *hostname = "<unknown>";
3768 		uint64_t hostid = 0;
3769 
3770 		if (mmp_label) {
3771 			if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) {
3772 				hostname = fnvlist_lookup_string(mmp_label,
3773 				    ZPOOL_CONFIG_HOSTNAME);
3774 				fnvlist_add_string(spa->spa_load_info,
3775 				    ZPOOL_CONFIG_MMP_HOSTNAME, hostname);
3776 			}
3777 
3778 			if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) {
3779 				hostid = fnvlist_lookup_uint64(mmp_label,
3780 				    ZPOOL_CONFIG_HOSTID);
3781 				fnvlist_add_uint64(spa->spa_load_info,
3782 				    ZPOOL_CONFIG_MMP_HOSTID, hostid);
3783 			}
3784 		}
3785 
3786 		fnvlist_add_uint64(spa->spa_load_info,
3787 		    ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE);
3788 		fnvlist_add_uint64(spa->spa_load_info,
3789 		    ZPOOL_CONFIG_MMP_TXG, 0);
3790 
3791 		error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO);
3792 	}
3793 
3794 	if (mmp_label)
3795 		nvlist_free(mmp_label);
3796 
3797 	return (error);
3798 }
3799 
3800 /*
3801  * Called from zfs_ioc_clear for a pool that was suspended
3802  * after failing mmp write checks.
3803  */
3804 boolean_t
3805 spa_mmp_remote_host_activity(spa_t *spa)
3806 {
3807 	ASSERT(spa_multihost(spa) && spa_suspended(spa));
3808 
3809 	nvlist_t *best_label;
3810 	uberblock_t best_ub;
3811 
3812 	/*
3813 	 * Locate the best uberblock on disk
3814 	 */
3815 	vdev_uberblock_load(spa->spa_root_vdev, &best_ub, &best_label);
3816 	if (best_label) {
3817 		/*
3818 		 * confirm that the best hostid matches our hostid
3819 		 */
3820 		if (nvlist_exists(best_label, ZPOOL_CONFIG_HOSTID) &&
3821 		    spa_get_hostid(spa) !=
3822 		    fnvlist_lookup_uint64(best_label, ZPOOL_CONFIG_HOSTID)) {
3823 			nvlist_free(best_label);
3824 			return (B_TRUE);
3825 		}
3826 		nvlist_free(best_label);
3827 	} else {
3828 		return (B_TRUE);
3829 	}
3830 
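	/*
	 * If the best uberblock carries no usable MMP fail-interval
	 * information, or the remote host is configured to never suspend on
	 * failed MMP writes, we cannot safely assume the pool is idle, so
	 * conservatively report activity.
	 */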
3831 	if (!MMP_VALID(&best_ub) ||
3832 	    !MMP_FAIL_INT_VALID(&best_ub) ||
3833 	    MMP_FAIL_INT(&best_ub) == 0) {
3834 		return (B_TRUE);
3835 	}
3836 
3837 	if (best_ub.ub_txg != spa->spa_uberblock.ub_txg ||
3838 	    best_ub.ub_timestamp != spa->spa_uberblock.ub_timestamp) {
3839 		zfs_dbgmsg("txg mismatch detected during pool clear "
3840 		    "txg %llu ub_txg %llu timestamp %llu ub_timestamp %llu",
3841 		    (u_longlong_t)spa->spa_uberblock.ub_txg,
3842 		    (u_longlong_t)best_ub.ub_txg,
3843 		    (u_longlong_t)spa->spa_uberblock.ub_timestamp,
3844 		    (u_longlong_t)best_ub.ub_timestamp);
3845 		return (B_TRUE);
3846 	}
3847 
3848 	/*
3849 	 * Perform an activity check looking for any remote writer
3850 	 */
3851 	return (spa_activity_check(spa, &spa->spa_uberblock, spa->spa_config,
3852 	    B_FALSE) != 0);
3853 }
3854 
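/*
 * Verify that this host was the last one to access the pool by comparing the
 * hostid recorded in the MOS config with the local hostid.  A mismatch fails
 * the load with EBADF.
 */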
3855 static int
3856 spa_verify_host(spa_t *spa, nvlist_t *mos_config)
3857 {
3858 	uint64_t hostid;
3859 	const char *hostname;
3860 	uint64_t myhostid = 0;
3861 
3862 	if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config,
3863 	    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
3864 		hostname = fnvlist_lookup_string(mos_config,
3865 		    ZPOOL_CONFIG_HOSTNAME);
3866 
3867 		myhostid = zone_get_hostid(NULL);
3868 
3869 		if (hostid != 0 && myhostid != 0 && hostid != myhostid) {
3870 			cmn_err(CE_WARN, "pool '%s' could not be "
3871 			    "loaded as it was last accessed by "
3872 			    "another system (host: %s hostid: 0x%llx). "
3873 			    "See: https://openzfs.github.io/openzfs-docs/msg/"
3874 			    "ZFS-8000-EY",
3875 			    spa_name(spa), hostname, (u_longlong_t)hostid);
3876 			spa_load_failed(spa, "hostid verification failed: pool "
3877 			    "last accessed by host: %s (hostid: 0x%llx)",
3878 			    hostname, (u_longlong_t)hostid);
3879 			return (SET_ERROR(EBADF));
3880 		}
3881 	}
3882 
3883 	return (0);
3884 }
3885 
3886 static int
3887 spa_ld_parse_config(spa_t *spa, spa_import_type_t type)
3888 {
3889 	int error = 0;
3890 	nvlist_t *nvtree, *nvl, *config = spa->spa_config;
3891 	int parse;
3892 	vdev_t *rvd;
3893 	uint64_t pool_guid;
3894 	const char *comment;
3895 	const char *compatibility;
3896 
3897 	/*
3898 	 * Versioning wasn't explicitly added to the label until later, so if
3899 	 * it's not present, treat it as the initial version.
3900 	 */
3901 	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
3902 	    &spa->spa_ubsync.ub_version) != 0)
3903 		spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
3904 
3905 	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
3906 		spa_load_failed(spa, "invalid config provided: '%s' missing",
3907 		    ZPOOL_CONFIG_POOL_GUID);
3908 		return (SET_ERROR(EINVAL));
3909 	}
3910 
3911 	/*
3912 	 * If we are doing an import, ensure that the pool is not already
3913 	 * imported by checking if its pool guid already exists in the
3914 	 * spa namespace.
3915 	 *
3916 	 * The only case in which we allow an already imported pool to be
3917 	 * imported again is when the pool is checkpointed and we want to
3918 	 * look at its checkpointed state from userland tools like zdb.
3919 	 */
3920 #ifdef _KERNEL
3921 	if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
3922 	    spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
3923 	    spa_guid_exists(pool_guid, 0)) {
3924 #else
3925 	if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
3926 	    spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
3927 	    spa_guid_exists(pool_guid, 0) &&
3928 	    !spa_importing_readonly_checkpoint(spa)) {
3929 #endif
3930 		spa_load_failed(spa, "a pool with guid %llu is already open",
3931 		    (u_longlong_t)pool_guid);
3932 		return (SET_ERROR(EEXIST));
3933 	}
3934 
3935 	spa->spa_config_guid = pool_guid;
3936 
3937 	nvlist_free(spa->spa_load_info);
3938 	spa->spa_load_info = fnvlist_alloc();
3939 
3940 	ASSERT(spa->spa_comment == NULL);
3941 	if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
3942 		spa->spa_comment = spa_strdup(comment);
3943 
3944 	ASSERT(spa->spa_compatibility == NULL);
3945 	if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMPATIBILITY,
3946 	    &compatibility) == 0)
3947 		spa->spa_compatibility = spa_strdup(compatibility);
3948 
3949 	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
3950 	    &spa->spa_config_txg);
3951 
3952 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0)
3953 		spa->spa_config_splitting = fnvlist_dup(nvl);
3954 
3955 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) {
3956 		spa_load_failed(spa, "invalid config provided: '%s' missing",
3957 		    ZPOOL_CONFIG_VDEV_TREE);
3958 		return (SET_ERROR(EINVAL));
3959 	}
3960 
3961 	/*
3962 	 * Create "The Godfather" zio to hold all async IOs
3963 	 */
3964 	spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
3965 	    KM_SLEEP);
3966 	for (int i = 0; i < max_ncpus; i++) {
3967 		spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
3968 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
3969 		    ZIO_FLAG_GODFATHER);
3970 	}
3971 
3972 	/*
3973 	 * Parse the configuration into a vdev tree.  We explicitly set the
3974 	 * value that will be returned by spa_version() since parsing the
3975 	 * configuration requires knowing the version number.
3976 	 */
3977 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3978 	parse = (type == SPA_IMPORT_EXISTING ?
3979 	    VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
3980 	error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse);
3981 	spa_config_exit(spa, SCL_ALL, FTAG);
3982 
3983 	if (error != 0) {
3984 		spa_load_failed(spa, "unable to parse config [error=%d]",
3985 		    error);
3986 		return (error);
3987 	}
3988 
3989 	ASSERT(spa->spa_root_vdev == rvd);
3990 	ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
3991 	ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT);
3992 
3993 	if (type != SPA_IMPORT_ASSEMBLE) {
3994 		ASSERT(spa_guid(spa) == pool_guid);
3995 	}
3996 
3997 	return (0);
3998 }
3999 
4000 /*
4001  * Recursively open all vdevs in the vdev tree. This function is called twice:
4002  * first with the untrusted config, then with the trusted config.
4003  */
4004 static int
4005 spa_ld_open_vdevs(spa_t *spa)
4006 {
4007 	int error = 0;
4008 
4009 	/*
4010 	 * spa_missing_tvds_allowed defines how many top-level vdevs can be
4011 	 * missing/unopenable for the root vdev to still be considered openable.
4012 	 */
4013 	if (spa->spa_trust_config) {
4014 		spa->spa_missing_tvds_allowed = zfs_max_missing_tvds;
4015 	} else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) {
4016 		spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile;
4017 	} else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) {
4018 		spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan;
4019 	} else {
4020 		spa->spa_missing_tvds_allowed = 0;
4021 	}
4022 
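	/*
	 * Regardless of the config source, never allow fewer missing top-level
	 * vdevs than the zfs_max_missing_tvds tunable permits.
	 */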
4023 	spa->spa_missing_tvds_allowed =
4024 	    MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed);
4025 
4026 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4027 	error = vdev_open(spa->spa_root_vdev);
4028 	spa_config_exit(spa, SCL_ALL, FTAG);
4029 
4030 	if (spa->spa_missing_tvds != 0) {
4031 		spa_load_note(spa, "vdev tree has %lld missing top-level "
4032 		    "vdevs.", (u_longlong_t)spa->spa_missing_tvds);
4033 		if (spa->spa_trust_config && (spa->spa_mode & SPA_MODE_WRITE)) {
4034 			/*
4035 			 * Although theoretically we could allow users to open
4036 			 * incomplete pools in RW mode, we'd need to add a lot
4037 			 * of extra logic (e.g. adjust pool space to account
4038 			 * for missing vdevs).
4039 			 * This limitation also prevents users from accidentally
4040 			 * opening the pool in RW mode during data recovery and
4041 			 * damaging it further.
4042 			 */
4043 			spa_load_note(spa, "pools with missing top-level "
4044 			    "vdevs can only be opened in read-only mode.");
4045 			error = SET_ERROR(ENXIO);
4046 		} else {
4047 			spa_load_note(spa, "current settings allow for maximum "
4048 			    "%lld missing top-level vdevs at this stage.",
4049 			    (u_longlong_t)spa->spa_missing_tvds_allowed);
4050 		}
4051 	}
4052 	if (error != 0) {
4053 		spa_load_failed(spa, "unable to open vdev tree [error=%d]",
4054 		    error);
4055 	}
4056 	if (spa->spa_missing_tvds != 0 || error != 0)
4057 		vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2);
4058 
4059 	return (error);
4060 }
4061 
4062 /*
4063  * We need to validate the vdev labels against the configuration that
4064  * we have in hand. This function is called twice: first with an untrusted
4065  * config, then with a trusted config. The validation is more strict when the
4066  * config, then with a trusted config. The validation is stricter when the
4067  */
4068 static int
4069 spa_ld_validate_vdevs(spa_t *spa)
4070 {
4071 	int error = 0;
4072 	vdev_t *rvd = spa->spa_root_vdev;
4073 
4074 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4075 	error = vdev_validate(rvd);
4076 	spa_config_exit(spa, SCL_ALL, FTAG);
4077 
4078 	if (error != 0) {
4079 		spa_load_failed(spa, "vdev_validate failed [error=%d]", error);
4080 		return (error);
4081 	}
4082 
4083 	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
4084 		spa_load_failed(spa, "cannot open vdev tree after invalidating "
4085 		    "some vdevs");
4086 		vdev_dbgmsg_print_tree(rvd, 2);
4087 		return (SET_ERROR(ENXIO));
4088 	}
4089 
4090 	return (0);
4091 }
4092 
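/*
 * Record the selected uberblock in the in-core SPA state and derive the txg
 * bounds (first txg, verify-min txg, claim-max txg) used by the remainder of
 * the load sequence.
 */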
4093 static void
4094 spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub)
4095 {
4096 	spa->spa_state = POOL_STATE_ACTIVE;
4097 	spa->spa_ubsync = spa->spa_uberblock;
4098 	spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
4099 	    TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
4100 	spa->spa_first_txg = spa->spa_last_ubsync_txg ?
4101 	    spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
4102 	spa->spa_claim_max_txg = spa->spa_first_txg;
4103 	spa->spa_prev_software_version = ub->ub_software_version;
4104 }
4105 
4106 static int
4107 spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
4108 {
4109 	vdev_t *rvd = spa->spa_root_vdev;
4110 	nvlist_t *label;
4111 	uberblock_t *ub = &spa->spa_uberblock;
4112 	boolean_t activity_check = B_FALSE;
4113 
4114 	/*
4115 	 * If we are opening the checkpointed state of the pool by
4116 	 * rewinding to it, at this point we will have written the
4117 	 * checkpointed uberblock to the vdev labels, so searching
4118 	 * the labels will find the right uberblock.  However, if
4119 	 * we are opening the checkpointed state read-only, we have
4120 	 * not modified the labels. Therefore, we must ignore the
4121 	 * labels and continue using the spa_uberblock that was set
4122 	 * by spa_ld_checkpoint_rewind.
4123 	 *
4124 	 * Note that it would be fine to ignore the labels when
4125 	 * rewinding (opening writeable) as well. However, if we
4126 	 * crash just after writing the labels, we will end up
4127 	 * searching the labels. Doing so in the common case means
4128 	 * that this code path gets exercised normally, rather than
4129 	 * just in the edge case.
4130 	 */
4131 	if (ub->ub_checkpoint_txg != 0 &&
4132 	    spa_importing_readonly_checkpoint(spa)) {
4133 		spa_ld_select_uberblock_done(spa, ub);
4134 		return (0);
4135 	}
4136 
4137 	/*
4138 	 * Find the best uberblock.
4139 	 */
4140 	vdev_uberblock_load(rvd, ub, &label);
4141 
4142 	/*
4143 	 * If we weren't able to find a single valid uberblock, return failure.
4144 	 */
4145 	if (ub->ub_txg == 0) {
4146 		nvlist_free(label);
4147 		spa_load_failed(spa, "no valid uberblock found");
4148 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
4149 	}
4150 
4151 	if (spa->spa_load_max_txg != UINT64_MAX) {
4152 		(void) spa_import_progress_set_max_txg(spa_guid(spa),
4153 		    (u_longlong_t)spa->spa_load_max_txg);
4154 	}
4155 	spa_load_note(spa, "using uberblock with txg=%llu",
4156 	    (u_longlong_t)ub->ub_txg);
4157 	if (ub->ub_raidz_reflow_info != 0) {
4158 		spa_load_note(spa, "uberblock raidz_reflow_info: "
4159 		    "state=%u offset=%llu",
4160 		    (int)RRSS_GET_STATE(ub),
4161 		    (u_longlong_t)RRSS_GET_OFFSET(ub));
4162 	}
4163 
4165 	/*
4166 	 * For pools which have the multihost property on, determine if the
4167 	 * pool is truly inactive and can be safely imported.  Prevent
4168 	 * hosts which don't have a hostid set from importing the pool.
4169 	 */
4170 	activity_check = spa_activity_check_required(spa, ub, label,
4171 	    spa->spa_config);
4172 	if (activity_check) {
4173 		if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay &&
4174 		    spa_get_hostid(spa) == 0) {
4175 			nvlist_free(label);
4176 			fnvlist_add_uint64(spa->spa_load_info,
4177 			    ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID);
4178 			return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
4179 		}
4180 
4181 		int error =
4182 		    spa_activity_check(spa, ub, spa->spa_config, B_TRUE);
4183 		if (error) {
4184 			nvlist_free(label);
4185 			return (error);
4186 		}
4187 
4188 		fnvlist_add_uint64(spa->spa_load_info,
4189 		    ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE);
4190 		fnvlist_add_uint64(spa->spa_load_info,
4191 		    ZPOOL_CONFIG_MMP_TXG, ub->ub_txg);
4192 		fnvlist_add_uint16(spa->spa_load_info,
4193 		    ZPOOL_CONFIG_MMP_SEQ,
4194 		    (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0));
4195 	}
4196 
4197 	/*
4198 	 * If the pool has an unsupported version we can't open it.
4199 	 */
4200 	if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
4201 		nvlist_free(label);
4202 		spa_load_failed(spa, "version %llu is not supported",
4203 		    (u_longlong_t)ub->ub_version);
4204 		return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
4205 	}
4206 
4207 	if (ub->ub_version >= SPA_VERSION_FEATURES) {
4208 		nvlist_t *features;
4209 
4210 		/*
4211 		 * If we weren't able to find what's necessary for reading the
4212 		 * MOS in the label, return failure.
4213 		 */
4214 		if (label == NULL) {
4215 			spa_load_failed(spa, "label config unavailable");
4216 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
4217 			    ENXIO));
4218 		}
4219 
4220 		if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ,
4221 		    &features) != 0) {
4222 			nvlist_free(label);
4223 			spa_load_failed(spa, "invalid label: '%s' missing",
4224 			    ZPOOL_CONFIG_FEATURES_FOR_READ);
4225 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
4226 			    ENXIO));
4227 		}
4228 
4229 		/*
4230 		 * Update our in-core representation with the definitive values
4231 		 * from the label.
4232 		 */
4233 		nvlist_free(spa->spa_label_features);
4234 		spa->spa_label_features = fnvlist_dup(features);
4235 	}
4236 
4237 	nvlist_free(label);
4238 
4239 	/*
4240 	 * Look through entries in the label nvlist's features_for_read. If
4241 	 * there is a feature listed there which we don't understand, then we
4242 	 * cannot open the pool.
4243 	 */
4244 	if (ub->ub_version >= SPA_VERSION_FEATURES) {
4245 		nvlist_t *unsup_feat;
4246 
4247 		unsup_feat = fnvlist_alloc();
4248 
4249 		for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
4250 		    NULL); nvp != NULL;
4251 		    nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
4252 			if (!zfeature_is_supported(nvpair_name(nvp))) {
4253 				fnvlist_add_string(unsup_feat,
4254 				    nvpair_name(nvp), "");
4255 			}
4256 		}
4257 
4258 		if (!nvlist_empty(unsup_feat)) {
4259 			fnvlist_add_nvlist(spa->spa_load_info,
4260 			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
4261 			nvlist_free(unsup_feat);
4262 			spa_load_failed(spa, "some features are unsupported");
4263 			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
4264 			    ENOTSUP));
4265 		}
4266 
4267 		nvlist_free(unsup_feat);
4268 	}
4269 
4270 	if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
4271 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4272 		spa_try_repair(spa, spa->spa_config);
4273 		spa_config_exit(spa, SCL_ALL, FTAG);
4274 		nvlist_free(spa->spa_config_splitting);
4275 		spa->spa_config_splitting = NULL;
4276 	}
4277 
4278 	/*
4279 	 * Initialize internal SPA structures.
4280 	 */
4281 	spa_ld_select_uberblock_done(spa, ub);
4282 
4283 	return (0);
4284 }
4285 
4286 static int
4287 spa_ld_open_rootbp(spa_t *spa)
4288 {
4289 	int error = 0;
4290 	vdev_t *rvd = spa->spa_root_vdev;
4291 
4292 	error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
4293 	if (error != 0) {
4294 		spa_load_failed(spa, "unable to open rootbp in dsl_pool_init "
4295 		    "[error=%d]", error);
4296 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4297 	}
4298 	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
4299 
4300 	return (0);
4301 }
4302 
4303 static int
4304 spa_ld_trusted_config(spa_t *spa, spa_import_type_t type,
4305     boolean_t reloading)
4306 {
4307 	vdev_t *mrvd, *rvd = spa->spa_root_vdev;
4308 	nvlist_t *nv, *mos_config, *policy;
4309 	int error = 0, copy_error;
4310 	uint64_t healthy_tvds, healthy_tvds_mos;
4311 	uint64_t mos_config_txg;
4312 
4313 	if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE)
4314 	    != 0)
4315 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4316 
4317 	/*
4318 	 * If we're assembling a pool from a split, the config provided is
4319 	 * already trusted so there is nothing to do.
4320 	 */
4321 	if (type == SPA_IMPORT_ASSEMBLE)
4322 		return (0);
4323 
4324 	healthy_tvds = spa_healthy_core_tvds(spa);
4325 
4326 	if (load_nvlist(spa, spa->spa_config_object, &mos_config)
4327 	    != 0) {
4328 		spa_load_failed(spa, "unable to retrieve MOS config");
4329 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4330 	}
4331 
4332 	/*
4333 	 * If we are doing an open, the pool owner hasn't been verified yet,
4334 	 * so do the verification here.
4335 	 */
4336 	if (spa->spa_load_state == SPA_LOAD_OPEN) {
4337 		error = spa_verify_host(spa, mos_config);
4338 		if (error != 0) {
4339 			nvlist_free(mos_config);
4340 			return (error);
4341 		}
4342 	}
4343 
4344 	nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE);
4345 
4346 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4347 
4348 	/*
4349 	 * Build a new vdev tree from the trusted config
4350 	 */
4351 	error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD);
4352 	if (error != 0) {
4353 		nvlist_free(mos_config);
4354 		spa_config_exit(spa, SCL_ALL, FTAG);
4355 		spa_load_failed(spa, "spa_config_parse failed [error=%d]",
4356 		    error);
4357 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
4358 	}
4359 
4360 	/*
4361 	 * Vdev paths in the MOS may be obsolete. If the untrusted config was
4362 	 * obtained by scanning /dev/dsk, then it will have the right vdev
4363 	 * paths. We update the trusted MOS config with this information.
4364 	 * We first try to copy the paths with vdev_copy_path_strict, which
4365 	 * succeeds only when both configs have exactly the same vdev tree.
4366 	 * If that fails, we fall back to a more flexible method that uses a
4367 	 * best-effort policy.
4368 	 */
4369 	copy_error = vdev_copy_path_strict(rvd, mrvd);
4370 	if (copy_error != 0 || spa_load_print_vdev_tree) {
4371 		spa_load_note(spa, "provided vdev tree:");
4372 		vdev_dbgmsg_print_tree(rvd, 2);
4373 		spa_load_note(spa, "MOS vdev tree:");
4374 		vdev_dbgmsg_print_tree(mrvd, 2);
4375 	}
4376 	if (copy_error != 0) {
4377 		spa_load_note(spa, "vdev_copy_path_strict failed, falling "
4378 		    "back to vdev_copy_path_relaxed");
4379 		vdev_copy_path_relaxed(rvd, mrvd);
4380 	}
4381 
4382 	vdev_close(rvd);
4383 	vdev_free(rvd);
4384 	spa->spa_root_vdev = mrvd;
4385 	rvd = mrvd;
4386 	spa_config_exit(spa, SCL_ALL, FTAG);
4387 
4388 	/*
4389 	 * If 'zpool import' used a cached config, then the on-disk hostid and
4390 	 * hostname may differ from the cached config in ways that should
4391 	 * prevent import.  Userspace can't discover this without a scan, but
4392 	 * we know, so we add these values to LOAD_INFO so the caller can know
4393 	 * the difference.
4394 	 *
4395 	 * Note that we have to do this before the config is regenerated,
4396 	 * because the new config will have the hostid and hostname for this
4397 	 * host, in readiness for import.
4398 	 */
4399 	if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTID))
4400 		fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_HOSTID,
4401 		    fnvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID));
4402 	if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTNAME))
4403 		fnvlist_add_string(spa->spa_load_info, ZPOOL_CONFIG_HOSTNAME,
4404 		    fnvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME));
4405 
4406 	/*
4407 	 * We will use spa_config if we decide to reload the spa or if spa_load
4408 	 * fails and we rewind. We must thus regenerate the config using the
4409 	 * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to
4410 	 * pass settings on how to load the pool and is not stored in the MOS.
4411 	 * We copy it over to our new, trusted config.
4412 	 */
4413 	mos_config_txg = fnvlist_lookup_uint64(mos_config,
4414 	    ZPOOL_CONFIG_POOL_TXG);
4415 	nvlist_free(mos_config);
4416 	mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE);
4417 	if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY,
4418 	    &policy) == 0)
4419 		fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy);
4420 	spa_config_set(spa, mos_config);
4421 	spa->spa_config_source = SPA_CONFIG_SRC_MOS;
4422 
4423 	/*
4424 	 * Now that we have the config from the MOS, we can be stricter when
4425 	 * checking blkptrs and can make assumptions about the consistency
4426 	 * of the vdev tree. spa_trust_config must be set to true before opening
4427 	 * vdevs in order for them to be writeable.
4428 	 */
4429 	spa->spa_trust_config = B_TRUE;
4430 
4431 	/*
4432 	 * Open and validate the new vdev tree
4433 	 */
4434 	error = spa_ld_open_vdevs(spa);
4435 	if (error != 0)
4436 		return (error);
4437 
4438 	error = spa_ld_validate_vdevs(spa);
4439 	if (error != 0)
4440 		return (error);
4441 
4442 	if (copy_error != 0 || spa_load_print_vdev_tree) {
4443 		spa_load_note(spa, "final vdev tree:");
4444 		vdev_dbgmsg_print_tree(rvd, 2);
4445 	}
4446 
4447 	if (spa->spa_load_state != SPA_LOAD_TRYIMPORT &&
4448 	    !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) {
4449 		/*
4450 		 * Sanity check to make sure that we are indeed loading the
4451 		 * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds
4452 		 * in the config provided and they happened to be the only ones
4453 		 * to have the latest uberblock, we could involuntarily perform
4454 		 * an extreme rewind.
4455 		 */
4456 		healthy_tvds_mos = spa_healthy_core_tvds(spa);
4457 		if (healthy_tvds_mos - healthy_tvds >=
4458 		    SPA_SYNC_MIN_VDEVS) {
4459 			spa_load_note(spa, "config provided misses too many "
4460 			    "top-level vdevs compared to MOS (%lld vs %lld). ",
4461 			    (u_longlong_t)healthy_tvds,
4462 			    (u_longlong_t)healthy_tvds_mos);
4463 			spa_load_note(spa, "vdev tree:");
4464 			vdev_dbgmsg_print_tree(rvd, 2);
4465 			if (reloading) {
4466 				spa_load_failed(spa, "config was already "
4467 				    "provided from MOS. Aborting.");
4468 				return (spa_vdev_err(rvd,
4469 				    VDEV_AUX_CORRUPT_DATA, EIO));
4470 			}
4471 			spa_load_note(spa, "spa must be reloaded using MOS "
4472 			    "config");
4473 			return (SET_ERROR(EAGAIN));
4474 		}
4475 	}
4476 
4477 	error = spa_check_for_missing_logs(spa);
4478 	if (error != 0)
4479 		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
4480 
4481 	if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) {
4482 		spa_load_failed(spa, "uberblock guid sum doesn't match MOS "
4483 		    "guid sum (%llu != %llu)",
4484 		    (u_longlong_t)spa->spa_uberblock.ub_guid_sum,
4485 		    (u_longlong_t)rvd->vdev_guid_sum);
4486 		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
4487 		    ENXIO));
4488 	}
4489 
4490 	return (0);
4491 }
4492 
4493 static int
4494 spa_ld_open_indirect_vdev_metadata(spa_t *spa)
4495 {
4496 	int error = 0;
4497 	vdev_t *rvd = spa->spa_root_vdev;
4498 
4499 	/*
4500 	 * Everything that we read before spa_remove_init() must be stored
4501 	 * on concrete vdevs.  Therefore we do this as early as possible.
4502 	 */
4503 	error = spa_remove_init(spa);
4504 	if (error != 0) {
4505 		spa_load_failed(spa, "spa_remove_init failed [error=%d]",
4506 		    error);
4507 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4508 	}
4509 
4510 	/*
4511 	 * Retrieve information needed to condense indirect vdev mappings.
4512 	 */
4513 	error = spa_condense_init(spa);
4514 	if (error != 0) {
4515 		spa_load_failed(spa, "spa_condense_init failed [error=%d]",
4516 		    error);
4517 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
4518 	}
4519 
4520 	return (0);
4521 }
4522 
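/*
 * Verify that every feature required to read (and, for writable imports,
 * write) the pool is supported by this build, and prime the in-core feature
 * refcount cache.
 */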
4523 static int
4524 spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep)
4525 {
4526 	int error = 0;
4527 	vdev_t *rvd = spa->spa_root_vdev;
4528 
4529 	if (spa_version(spa) >= SPA_VERSION_FEATURES) {
4530 		boolean_t missing_feat_read = B_FALSE;
4531 		nvlist_t *unsup_feat, *enabled_feat;
4532 
4533 		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
4534 		    &spa->spa_feat_for_read_obj, B_TRUE) != 0) {
4535 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4536 		}
4537 
4538 		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
4539 		    &spa->spa_feat_for_write_obj, B_TRUE) != 0) {
4540 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4541 		}
4542 
4543 		if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
4544 		    &spa->spa_feat_desc_obj, B_TRUE) != 0) {
4545 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4546 		}
4547 
4548 		enabled_feat = fnvlist_alloc();
4549 		unsup_feat = fnvlist_alloc();
4550 
4551 		if (!spa_features_check(spa, B_FALSE,
4552 		    unsup_feat, enabled_feat))
4553 			missing_feat_read = B_TRUE;
4554 
4555 		if (spa_writeable(spa) ||
4556 		    spa->spa_load_state == SPA_LOAD_TRYIMPORT) {
4557 			if (!spa_features_check(spa, B_TRUE,
4558 			    unsup_feat, enabled_feat)) {
4559 				*missing_feat_writep = B_TRUE;
4560 			}
4561 		}
4562 
4563 		fnvlist_add_nvlist(spa->spa_load_info,
4564 		    ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);
4565 
4566 		if (!nvlist_empty(unsup_feat)) {
4567 			fnvlist_add_nvlist(spa->spa_load_info,
4568 			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
4569 		}
4570 
4571 		fnvlist_free(enabled_feat);
4572 		fnvlist_free(unsup_feat);
4573 
4574 		if (!missing_feat_read) {
4575 			fnvlist_add_boolean(spa->spa_load_info,
4576 			    ZPOOL_CONFIG_CAN_RDONLY);
4577 		}
4578 
4579 		/*
4580 		 * If the state is SPA_LOAD_TRYIMPORT, our objective is
4581 		 * twofold: to determine whether the pool is available for
4582 		 * import in read-write mode and (if it is not) whether the
4583 		 * pool is available for import in read-only mode. If the pool
4584 		 * is available for import in read-write mode, it is displayed
4585 		 * as available in userland; if it is not available for import
4586 		 * in read-only mode, it is displayed as unavailable in
4587 		 * userland. If the pool is available for import in read-only
4588 		 * mode but not read-write mode, it is displayed as unavailable
4589 		 * in userland with a special note that the pool is actually
4590 		 * available for open in read-only mode.
4591 		 *
4592 		 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
4593 		 * missing a feature for write, we must first determine whether
4594 		 * the pool can be opened read-only before returning to
4595 		 * userland in order to know whether to display the
4596 		 * abovementioned note.
4597 		 */
4598 		if (missing_feat_read || (*missing_feat_writep &&
4599 		    spa_writeable(spa))) {
4600 			spa_load_failed(spa, "pool uses unsupported features");
4601 			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
4602 			    ENOTSUP));
4603 		}
4604 
4605 		/*
4606 		 * Load refcounts for ZFS features from disk into an in-memory
4607 		 * cache during SPA initialization.
4608 		 */
4609 		for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
4610 			uint64_t refcount;
4611 
4612 			error = feature_get_refcount_from_disk(spa,
4613 			    &spa_feature_table[i], &refcount);
4614 			if (error == 0) {
4615 				spa->spa_feat_refcount_cache[i] = refcount;
4616 			} else if (error == ENOTSUP) {
4617 				spa->spa_feat_refcount_cache[i] =
4618 				    SPA_FEATURE_DISABLED;
4619 			} else {
4620 				spa_load_failed(spa, "error getting refcount "
4621 				    "for feature %s [error=%d]",
4622 				    spa_feature_table[i].fi_guid, error);
4623 				return (spa_vdev_err(rvd,
4624 				    VDEV_AUX_CORRUPT_DATA, EIO));
4625 			}
4626 		}
4627 	}
4628 
4629 	if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
4630 		if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG,
4631 		    &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0)
4632 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4633 	}
4634 
4635 	/*
4636 	 * Encryption was added before bookmark_v2, even though bookmark_v2
4637 	 * is now a dependency. If this pool has encryption enabled without
4638 	 * bookmark_v2, trigger an errata message.
4639 	 */
4640 	if (spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) &&
4641 	    !spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) {
4642 		spa->spa_errata = ZPOOL_ERRATA_ZOL_8308_ENCRYPTION;
4643 	}
4644 
4645 	return (0);
4646 }
4647 
4648 static int
4649 spa_ld_load_special_directories(spa_t *spa)
4650 {
4651 	int error = 0;
4652 	vdev_t *rvd = spa->spa_root_vdev;
4653 
4654 	spa->spa_is_initializing = B_TRUE;
4655 	error = dsl_pool_open(spa->spa_dsl_pool);
4656 	spa->spa_is_initializing = B_FALSE;
4657 	if (error != 0) {
4658 		spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error);
4659 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4660 	}
4661 
4662 	return (0);
4663 }
4664 
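/*
 * Pull the pool-wide properties and bookkeeping objects (checksum salt,
 * deferred-frees bpobj, error logs, history, per-vdev ZAP map, etc.) out of
 * the MOS directory.
 */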
4665 static int
4666 spa_ld_get_props(spa_t *spa)
4667 {
4668 	int error = 0;
4669 	uint64_t obj;
4670 	vdev_t *rvd = spa->spa_root_vdev;
4671 
4672 	/* Grab the checksum salt from the MOS. */
4673 	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
4674 	    DMU_POOL_CHECKSUM_SALT, 1,
4675 	    sizeof (spa->spa_cksum_salt.zcs_bytes),
4676 	    spa->spa_cksum_salt.zcs_bytes);
4677 	if (error == ENOENT) {
4678 		/* Generate a new salt for subsequent use */
4679 		(void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
4680 		    sizeof (spa->spa_cksum_salt.zcs_bytes));
4681 	} else if (error != 0) {
4682 		spa_load_failed(spa, "unable to retrieve checksum salt from "
4683 		    "MOS [error=%d]", error);
4684 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4685 	}
4686 
4687 	if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0)
4688 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4689 	error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
4690 	if (error != 0) {
4691 		spa_load_failed(spa, "error opening deferred-frees bpobj "
4692 		    "[error=%d]", error);
4693 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4694 	}
4695 
4696 	/*
4697 	 * Load the bit that tells us to use the new accounting function
4698 	 * (raid-z deflation).  If we have an older pool, this will not
4699 	 * be present.
4700 	 */
4701 	error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE);
4702 	if (error != 0 && error != ENOENT)
4703 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4704 
4705 	error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
4706 	    &spa->spa_creation_version, B_FALSE);
4707 	if (error != 0 && error != ENOENT)
4708 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4709 
4710 	/*
4711 	 * Load the persistent error log.  If we have an older pool, this will
4712 	 * not be present.
4713 	 */
4714 	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last,
4715 	    B_FALSE);
4716 	if (error != 0 && error != ENOENT)
4717 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4718 
4719 	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
4720 	    &spa->spa_errlog_scrub, B_FALSE);
4721 	if (error != 0 && error != ENOENT)
4722 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4723 
4724 	/* Load the last scrubbed txg. */
4725 	error = spa_dir_prop(spa, DMU_POOL_LAST_SCRUBBED_TXG,
4726 	    &spa->spa_scrubbed_last_txg, B_FALSE);
4727 	if (error != 0 && error != ENOENT)
4728 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4729 
4730 	/*
4731 	 * Load the livelist deletion field. If a livelist is queued for
4732 	 * deletion, indicate that in the spa.
4733 	 */
4734 	error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES,
4735 	    &spa->spa_livelists_to_delete, B_FALSE);
4736 	if (error != 0 && error != ENOENT)
4737 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4738 
4739 	/*
4740 	 * Load the history object.  If we have an older pool, this
4741 	 * will not be present.
4742 	 */
4743 	error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE);
4744 	if (error != 0 && error != ENOENT)
4745 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4746 
4747 	/*
4748 	 * Load the per-vdev ZAP map. If we have an older pool, this will not
4749 	 * be present; in this case, defer its creation to a later time to
4750 	 * avoid dirtying the MOS this early (i.e., outside of sync context). See
4751 	 * spa_sync_config_object.
4752 	 */
4753 
4754 	/* The sentinel is only available in the MOS config. */
4755 	nvlist_t *mos_config;
4756 	if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) {
4757 		spa_load_failed(spa, "unable to retrieve MOS config");
4758 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4759 	}
4760 
4761 	error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP,
4762 	    &spa->spa_all_vdev_zaps, B_FALSE);
4763 
4764 	if (error == ENOENT) {
4765 		VERIFY(!nvlist_exists(mos_config,
4766 		    ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
4767 		spa->spa_avz_action = AVZ_ACTION_INITIALIZE;
4768 		ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
4769 	} else if (error != 0) {
4770 		nvlist_free(mos_config);
4771 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4772 	} else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) {
4773 		/*
4774 		 * An older version of ZFS overwrote the sentinel value, so
4775 		 * we have orphaned per-vdev ZAPs in the MOS. Defer their
4776 		 * destruction to later; see spa_sync_config_object.
4777 		 */
4778 		spa->spa_avz_action = AVZ_ACTION_DESTROY;
4779 		/*
4780 		 * We're assuming that no vdevs have had their ZAPs created
4781 		 * before this. Better be sure of it.
4782 		 */
4783 		ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
4784 	}
4785 	nvlist_free(mos_config);
4786 
4787 	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
4788 
4789 	error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object,
4790 	    B_FALSE);
4791 	if (error && error != ENOENT)
4792 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4793 
4794 	if (error == 0) {
4795 		uint64_t autoreplace = 0;
4796 
4797 		spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
4798 		spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
4799 		spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
4800 		spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
4801 		spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
4802 		spa_prop_find(spa, ZPOOL_PROP_DEDUP_TABLE_QUOTA,
4803 		    &spa->spa_dedup_table_quota);
4804 		spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost);
4805 		spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim);
4806 		spa->spa_autoreplace = (autoreplace != 0);
4807 	}
4808 
4809 	/*
4810 	 * If we are importing a pool with missing top-level vdevs,
4811 	 * we enforce that the pool doesn't panic or get suspended on
4812 	 * error since the likelihood of missing data is extremely high.
4813 	 */
4814 	if (spa->spa_missing_tvds > 0 &&
4815 	    spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE &&
4816 	    spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
4817 		spa_load_note(spa, "forcing failmode to 'continue' "
4818 		    "as some top level vdevs are missing");
4819 		spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE;
4820 	}
4821 
4822 	return (0);
4823 }
4824 
4825 static int
4826 spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type)
4827 {
4828 	int error = 0;
4829 	vdev_t *rvd = spa->spa_root_vdev;
4830 
4831 	/*
4832 	 * If we're assembling the pool from the split-off vdevs of
4833 	 * an existing pool, we don't want to attach the spares & cache
4834 	 * devices.
4835 	 */
4836 
4837 	/*
4838 	 * Load any hot spares for this pool.
4839 	 */
4840 	error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object,
4841 	    B_FALSE);
4842 	if (error != 0 && error != ENOENT)
4843 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4844 	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
4845 		ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
4846 		if (load_nvlist(spa, spa->spa_spares.sav_object,
4847 		    &spa->spa_spares.sav_config) != 0) {
4848 			spa_load_failed(spa, "error loading spares nvlist");
4849 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4850 		}
4851 
4852 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4853 		spa_load_spares(spa);
4854 		spa_config_exit(spa, SCL_ALL, FTAG);
4855 	} else if (error == 0) {
4856 		spa->spa_spares.sav_sync = B_TRUE;
4857 	}
4858 
4859 	/*
4860 	 * Load any level 2 ARC devices for this pool.
4861 	 */
4862 	error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
4863 	    &spa->spa_l2cache.sav_object, B_FALSE);
4864 	if (error != 0 && error != ENOENT)
4865 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4866 	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
4867 		ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
4868 		if (load_nvlist(spa, spa->spa_l2cache.sav_object,
4869 		    &spa->spa_l2cache.sav_config) != 0) {
4870 			spa_load_failed(spa, "error loading l2cache nvlist");
4871 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4872 		}
4873 
4874 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4875 		spa_load_l2cache(spa);
4876 		spa_config_exit(spa, SCL_ALL, FTAG);
4877 	} else if (error == 0) {
4878 		spa->spa_l2cache.sav_sync = B_TRUE;
4879 	}
4880 
4881 	return (0);
4882 }
4883 
4884 static int
4885 spa_ld_load_vdev_metadata(spa_t *spa)
4886 {
4887 	int error = 0;
4888 	vdev_t *rvd = spa->spa_root_vdev;
4889 
4890 	/*
4891 	 * If the 'multihost' property is set, then never allow a pool to
4892 	 * be imported when the system hostid is zero.  The exception to
4893 	 * this rule is zdb, which is always allowed to access pools.
4894 	 */
4895 	if (spa_multihost(spa) && spa_get_hostid(spa) == 0 &&
4896 	    (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) {
4897 		fnvlist_add_uint64(spa->spa_load_info,
4898 		    ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID);
4899 		return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
4900 	}
4901 
4902 	/*
4903 	 * If the 'autoreplace' property is set, then post a resource notifying
4904 	 * the ZFS DE that it should not issue any faults for unopenable
4905 	 * devices.  We also iterate over the vdevs, and post a sysevent for any
4906 	 * unopenable vdevs so that the normal autoreplace handler can take
4907 	 * over.
4908 	 */
4909 	if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
4910 		spa_check_removed(spa->spa_root_vdev);
4911 		/*
4912 		 * For the import case, this is done in spa_import(), because
4913 		 * at this point we're using the spare definitions from
4914 		 * the MOS config, not necessarily from the userland config.
4915 		 */
4916 		if (spa->spa_load_state != SPA_LOAD_IMPORT) {
4917 			spa_aux_check_removed(&spa->spa_spares);
4918 			spa_aux_check_removed(&spa->spa_l2cache);
4919 		}
4920 	}
4921 
4922 	/*
4923 	 * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc.
4924 	 */
4925 	error = vdev_load(rvd);
4926 	if (error != 0) {
4927 		spa_load_failed(spa, "vdev_load failed [error=%d]", error);
4928 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
4929 	}
4930 
4931 	error = spa_ld_log_spacemaps(spa);
4932 	if (error != 0) {
4933 		spa_load_failed(spa, "spa_ld_log_spacemaps failed [error=%d]",
4934 		    error);
4935 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
4936 	}
4937 
4938 	/*
4939 	 * Propagate the leaf DTLs we just loaded all the way up the vdev tree.
4940 	 */
4941 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4942 	vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE);
4943 	spa_config_exit(spa, SCL_ALL, FTAG);
4944 
4945 	return (0);
4946 }
4947 
4948 static int
4949 spa_ld_load_dedup_tables(spa_t *spa)
4950 {
4951 	int error = 0;
4952 	vdev_t *rvd = spa->spa_root_vdev;
4953 
4954 	error = ddt_load(spa);
4955 	if (error != 0) {
4956 		spa_load_failed(spa, "ddt_load failed [error=%d]", error);
4957 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4958 	}
4959 
4960 	return (0);
4961 }
4962 
4963 static int
4964 spa_ld_load_brt(spa_t *spa)
4965 {
4966 	int error = 0;
4967 	vdev_t *rvd = spa->spa_root_vdev;
4968 
4969 	error = brt_load(spa);
4970 	if (error != 0) {
4971 		spa_load_failed(spa, "brt_load failed [error=%d]", error);
4972 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
4973 	}
4974 
4975 	return (0);
4976 }
4977 
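/*
 * Check for intent log devices that are missing or damaged.  When top-level
 * vdevs are already known to be missing, the logs are simply dropped;
 * otherwise the failure is reported via an ereport and the load aborts.
 */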
4978 static int
4979 spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, const char **ereport)
4980 {
4981 	vdev_t *rvd = spa->spa_root_vdev;
4982 
4983 	if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) {
4984 		boolean_t missing = spa_check_logs(spa);
4985 		if (missing) {
4986 			if (spa->spa_missing_tvds != 0) {
4987 				spa_load_note(spa, "spa_check_logs failed "
4988 				    "so dropping the logs");
4989 			} else {
4990 				*ereport = FM_EREPORT_ZFS_LOG_REPLAY;
4991 				spa_load_failed(spa, "spa_check_logs failed");
4992 				return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG,
4993 				    ENXIO));
4994 			}
4995 		}
4996 	}
4997 
4998 	return (0);
4999 }
5000 
5001 static int
5002 spa_ld_verify_pool_data(spa_t *spa)
5003 {
5004 	int error = 0;
5005 	vdev_t *rvd = spa->spa_root_vdev;
5006 
5007 	/*
5008 	 * We've successfully opened the pool; verify that we're ready
5009 	 * to start pushing transactions.
5010 	 */
5011 	if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
5012 		error = spa_load_verify(spa);
5013 		if (error != 0) {
5014 			spa_load_failed(spa, "spa_load_verify failed "
5015 			    "[error=%d]", error);
5016 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
5017 			    error));
5018 		}
5019 	}
5020 
5021 	return (0);
5022 }
5023 
5024 static void
5025 spa_ld_claim_log_blocks(spa_t *spa)
5026 {
5027 	dmu_tx_t *tx;
5028 	dsl_pool_t *dp = spa_get_dsl(spa);
5029 
5030 	/*
5031 	 * Claim log blocks that haven't been committed yet.
5032 	 * This must all happen in a single txg.
5033 	 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
5034 	 * invoked from zil_claim_log_block()'s i/o done callback.
5035 	 * Price of rollback is that we abandon the log.
5036 	 */
5037 	spa->spa_claiming = B_TRUE;
5038 
5039 	tx = dmu_tx_create_assigned(dp, spa_first_txg(spa));
5040 	(void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
5041 	    zil_claim, tx, DS_FIND_CHILDREN);
5042 	dmu_tx_commit(tx);
5043 
5044 	spa->spa_claiming = B_FALSE;
5045 
5046 	spa_set_log_state(spa, SPA_LOG_GOOD);
5047 }
5048 
5049 static void
5050 spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg,
5051     boolean_t update_config_cache)
5052 {
5053 	vdev_t *rvd = spa->spa_root_vdev;
5054 	int need_update = B_FALSE;
5055 
5056 	/*
5057 	 * If the config cache is stale, or we have uninitialized
5058 	 * metaslabs (see spa_vdev_add()), then update the config.
5059 	 *
5060 	 * If this is a verbatim import, trust the current
5061 	 * in-core spa_config and update the disk labels.
5062 	 */
5063 	if (update_config_cache || config_cache_txg != spa->spa_config_txg ||
5064 	    spa->spa_load_state == SPA_LOAD_IMPORT ||
5065 	    spa->spa_load_state == SPA_LOAD_RECOVER ||
5066 	    (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
5067 		need_update = B_TRUE;
5068 
5069 	for (int c = 0; c < rvd->vdev_children; c++)
5070 		if (rvd->vdev_child[c]->vdev_ms_array == 0)
5071 			need_update = B_TRUE;
5072 
5073 	/*
5074 	 * Update the config cache asynchronously in case this is the
5075 	 * root pool, for which the config cache isn't writable yet.
5076 	 */
5077 	if (need_update)
5078 		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
5079 }
5080 
5081 static void
5082 spa_ld_prepare_for_reload(spa_t *spa)
5083 {
5084 	spa_mode_t mode = spa->spa_mode;
5085 	int async_suspended = spa->spa_async_suspended;
5086 
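	/*
	 * Tear the pool down completely and reactivate it in the same mode so
	 * the next load attempt starts from a clean in-core state.
	 */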
5087 	spa_unload(spa);
5088 	spa_deactivate(spa);
5089 	spa_activate(spa, mode);
5090 
5091 	/*
5092 	 * We save the value of spa_async_suspended as it gets reset to 0 by
5093 	 * spa_unload(). We want to restore it to its original value before
5094 	 * returning, as we might be calling spa_async_resume() later.
5095 	 */
5096 	spa->spa_async_suspended = async_suspended;
5097 }
5098 
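/*
 * Read the checkpointed uberblock, if any, from the MOS and cache its txg and
 * timestamp in the spa.  A missing entry simply means the pool has no
 * checkpoint.
 */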
5099 static int
5100 spa_ld_read_checkpoint_txg(spa_t *spa)
5101 {
5102 	uberblock_t checkpoint;
5103 	int error = 0;
5104 
5105 	ASSERT0(spa->spa_checkpoint_txg);
5106 	ASSERT(MUTEX_HELD(&spa_namespace_lock) ||
5107 	    spa->spa_load_thread == curthread);
5108 
5109 	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
5110 	    DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
5111 	    sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
5112 
5113 	if (error == ENOENT)
5114 		return (0);
5115 
5116 	if (error != 0)
5117 		return (error);
5118 
5119 	ASSERT3U(checkpoint.ub_txg, !=, 0);
5120 	ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0);
5121 	ASSERT3U(checkpoint.ub_timestamp, !=, 0);
5122 	spa->spa_checkpoint_txg = checkpoint.ub_txg;
5123 	spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp;
5124 
5125 	return (0);
5126 }
5127 
5128 static int
5129 spa_ld_mos_init(spa_t *spa, spa_import_type_t type)
5130 {
5131 	int error = 0;
5132 
5133 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
5134 	ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
5135 
5136 	/*
5137 	 * Never trust the config that is provided unless we are assembling
5138 	 * a pool following a split.
5139 	 * This means don't trust blkptrs and the vdev tree in general. This
5140 	 * also effectively puts the spa in read-only mode since
5141 	 * spa_writeable() checks for spa_trust_config to be true.
5142 	 * We will later load a trusted config from the MOS.
5143 	 */
5144 	if (type != SPA_IMPORT_ASSEMBLE)
5145 		spa->spa_trust_config = B_FALSE;
5146 
5147 	/*
5148 	 * Parse the config provided to create a vdev tree.
5149 	 */
5150 	error = spa_ld_parse_config(spa, type);
5151 	if (error != 0)
5152 		return (error);
5153 
5154 	spa_import_progress_add(spa);
5155 
5156 	/*
5157 	 * Now that we have the vdev tree, try to open each vdev. This involves
5158 	 * opening the underlying physical device, retrieving its geometry and
5159 	 * probing the vdev with a dummy I/O. The state of each vdev will be set
5160 	 * based on the success of those operations. After this we'll be ready
5161 	 * to read from the vdevs.
5162 	 */
5163 	error = spa_ld_open_vdevs(spa);
5164 	if (error != 0)
5165 		return (error);
5166 
5167 	/*
5168 	 * Read the label of each vdev and make sure that the GUIDs stored
5169 	 * there match the GUIDs in the config provided.
5170 	 * If we're assembling a new pool that's been split off from an
5171 	 * existing pool, the labels haven't yet been updated so we skip
5172 	 * validation for now.
5173 	 */
5174 	if (type != SPA_IMPORT_ASSEMBLE) {
5175 		error = spa_ld_validate_vdevs(spa);
5176 		if (error != 0)
5177 			return (error);
5178 	}
5179 
5180 	/*
5181 	 * Read all vdev labels to find the best uberblock (i.e. latest,
5182 	 * unless spa_load_max_txg is set) and store it in spa_uberblock. We
5183 	 * get the list of features required to read blkptrs in the MOS from
5184 	 * the vdev label with the best uberblock and verify that our version
5185 	 * of zfs supports them all.
5186 	 */
5187 	error = spa_ld_select_uberblock(spa, type);
5188 	if (error != 0)
5189 		return (error);
5190 
5191 	/*
5192 	 * Pass that uberblock to the dsl_pool layer which will open the root
5193 	 * blkptr. This blkptr points to the latest version of the MOS and will
5194 	 * allow us to read its contents.
5195 	 */
5196 	error = spa_ld_open_rootbp(spa);
5197 	if (error != 0)
5198 		return (error);
5199 
5200 	return (0);
5201 }
5202 
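/*
 * Replace the pool's current uberblock with the checkpointed one and,
 * if the pool is writeable, sync it to the vdev labels, making the
 * rewind permanent.
 */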
5203 static int
5204 spa_ld_checkpoint_rewind(spa_t *spa)
5205 {
5206 	uberblock_t checkpoint;
5207 	int error = 0;
5208 
5209 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
5210 	ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
5211 
5212 	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
5213 	    DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
5214 	    sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
5215 
5216 	if (error != 0) {
5217 		spa_load_failed(spa, "unable to retrieve checkpointed "
5218 		    "uberblock from the MOS config [error=%d]", error);
5219 
5220 		if (error == ENOENT)
5221 			error = ZFS_ERR_NO_CHECKPOINT;
5222 
5223 		return (error);
5224 	}
5225 
5226 	ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg);
5227 	ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg);
5228 
5229 	/*
5230 	 * We need to update the txg and timestamp of the checkpointed
5231 	 * uberblock to be higher than the latest one. This ensures that
5232 	 * the checkpointed uberblock is selected if we were to close and
5233 	 * reopen the pool right after we've written it in the vdev labels.
5234 	 * (also see block comment in vdev_uberblock_compare)
5235 	 */
5236 	checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1;
5237 	checkpoint.ub_timestamp = gethrestime_sec();
5238 
5239 	/*
5240 	 * Set current uberblock to be the checkpointed uberblock.
5241 	 */
5242 	spa->spa_uberblock = checkpoint;
5243 
5244 	/*
5245 	 * If we are doing a normal rewind, then the pool is open for
5246 	 * writing and we sync the "updated" checkpointed uberblock to
5247 	 * disk. Once this is done, we've basically rewound the whole
5248 	 * pool and there is no way back.
5249 	 *
5250 	 * There are cases when we don't want to attempt to sync the
5251 	 * checkpointed uberblock to disk because we are opening a
5252 	 * pool as read-only. Specifically, verifying the checkpointed
5253 	 * state with zdb, and importing the checkpointed state to get
5254 	 * a "preview" of its content.
5255 	 */
5256 	if (spa_writeable(spa)) {
5257 		vdev_t *rvd = spa->spa_root_vdev;
5258 
5259 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5260 		vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
5261 		int svdcount = 0;
5262 		int children = rvd->vdev_children;
5263 		int c0 = random_in_range(children);
5264 
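		/*
		 * Starting from a random child, pick up to
		 * SPA_SYNC_MIN_VDEVS concrete, non-log top-level vdevs
		 * (those with an allocated metaslab array) to receive the
		 * updated uberblock.
		 */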
5265 		for (int c = 0; c < children; c++) {
5266 			vdev_t *vd = rvd->vdev_child[(c0 + c) % children];
5267 
5268 			/* Stop when revisiting the first vdev */
5269 			if (c > 0 && svd[0] == vd)
5270 				break;
5271 
5272 			if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
5273 			    !vdev_is_concrete(vd))
5274 				continue;
5275 
5276 			svd[svdcount++] = vd;
5277 			if (svdcount == SPA_SYNC_MIN_VDEVS)
5278 				break;
5279 		}
5280 		error = vdev_config_sync(svd, svdcount, spa->spa_first_txg);
5281 		if (error == 0)
5282 			spa->spa_last_synced_guid = rvd->vdev_guid;
5283 		spa_config_exit(spa, SCL_ALL, FTAG);
5284 
5285 		if (error != 0) {
5286 			spa_load_failed(spa, "failed to write checkpointed "
5287 			    "uberblock to the vdev labels [error=%d]", error);
5288 			return (error);
5289 		}
5290 	}
5291 
5292 	return (0);
5293 }
5294 
5295 static int
5296 spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type,
5297     boolean_t *update_config_cache)
5298 {
5299 	int error;
5300 
5301 	/*
5302 	 * Parse the config for the pool, open and validate the vdevs,
5303 	 * select an uberblock, and use that uberblock to open
5304 	 * the MOS.
5305 	 */
5306 	error = spa_ld_mos_init(spa, type);
5307 	if (error != 0)
5308 		return (error);
5309 
5310 	/*
5311 	 * Retrieve the trusted config stored in the MOS and use it to create
5312 	 * a new, exact version of the vdev tree, then reopen all vdevs.
5313 	 */
5314 	error = spa_ld_trusted_config(spa, type, B_FALSE);
5315 	if (error == EAGAIN) {
5316 		if (update_config_cache != NULL)
5317 			*update_config_cache = B_TRUE;
5318 
5319 		/*
5320 		 * Redo the loading process with the trusted config if it is
5321 		 * too different from the untrusted config.
5322 		 */
5323 		spa_ld_prepare_for_reload(spa);
5324 		spa_load_note(spa, "RELOADING");
5325 		error = spa_ld_mos_init(spa, type);
5326 		if (error != 0)
5327 			return (error);
5328 
5329 		error = spa_ld_trusted_config(spa, type, B_TRUE);
5330 		if (error != 0)
5331 			return (error);
5332 
5333 	} else if (error != 0) {
5334 		return (error);
5335 	}
5336 
5337 	return (0);
5338 }
5339 
5340 /*
5341  * Load an existing storage pool, using the config provided. This config
5342  * describes which vdevs are part of the pool and is later validated against
5343  * partial configs present in each vdev's label and an entire copy of the
5344  * config stored in the MOS.
5345  */
5346 static int
5347 spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
5348 {
5349 	int error = 0;
5350 	boolean_t missing_feat_write = B_FALSE;
5351 	boolean_t checkpoint_rewind =
5352 	    (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
5353 	boolean_t update_config_cache = B_FALSE;
5354 	hrtime_t load_start = gethrtime();
5355 
5356 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
5357 	ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
5358 
5359 	spa_load_note(spa, "LOADING");
5360 
5361 	error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache);
5362 	if (error != 0)
5363 		return (error);
5364 
5365 	/*
5366 	 * If we are rewinding to the checkpoint then we need to repeat
5367 	 * everything we've done so far in this function but this time
5368 	 * selecting the checkpointed uberblock and using that to open
5369 	 * the MOS.
5370 	 */
5371 	if (checkpoint_rewind) {
5372 		/*
5373 		 * If we are rewinding to the checkpoint, update the config
5374 		 * cache anyway.
5375 		 */
5376 		update_config_cache = B_TRUE;
5377 
5378 		/*
5379 		 * Extract the checkpointed uberblock from the current MOS
5380 		 * and use this as the pool's uberblock from now on. If the
5381 		 * pool is imported as writeable we also write the checkpoint
5382 		 * uberblock to the labels, making the rewind permanent.
5383 		 */
5384 		error = spa_ld_checkpoint_rewind(spa);
5385 		if (error != 0)
5386 			return (error);
5387 
5388 		/*
5389 		 * Redo the loading process again with the
5390 		 * checkpointed uberblock.
5391 		 */
5392 		spa_ld_prepare_for_reload(spa);
5393 		spa_load_note(spa, "LOADING checkpointed uberblock");
5394 		error = spa_ld_mos_with_trusted_config(spa, type, NULL);
5395 		if (error != 0)
5396 			return (error);
5397 	}
5398 
5399 	/*
5400 	 * Drop the namespace lock for the rest of the function.
5401 	 */
5402 	spa->spa_load_thread = curthread;
5403 	mutex_exit(&spa_namespace_lock);
5404 
5405 	/*
5406 	 * Retrieve the checkpoint txg if the pool has a checkpoint.
5407 	 */
5408 	spa_import_progress_set_notes(spa, "Loading checkpoint txg");
5409 	error = spa_ld_read_checkpoint_txg(spa);
5410 	if (error != 0)
5411 		goto fail;
5412 
5413 	/*
5414 	 * Retrieve the mapping of indirect vdevs. Those vdevs were removed
5415 	 * from the pool and their contents were re-mapped to other vdevs. Note
5416 	 * that everything that we read before this step must have been
5417 	 * rewritten on concrete vdevs after the last device removal was
5418 	 * initiated. Otherwise we could be reading from indirect vdevs before
5419 	 * we have loaded their mappings.
5420 	 */
5421 	spa_import_progress_set_notes(spa, "Loading indirect vdev metadata");
5422 	error = spa_ld_open_indirect_vdev_metadata(spa);
5423 	if (error != 0)
5424 		goto fail;
5425 
5426 	/*
5427 	 * Retrieve the full list of active features from the MOS and check if
5428 	 * they are all supported.
5429 	 */
5430 	spa_import_progress_set_notes(spa, "Checking feature flags");
5431 	error = spa_ld_check_features(spa, &missing_feat_write);
5432 	if (error != 0)
5433 		goto fail;
5434 
5435 	/*
5436 	 * Load several special directories from the MOS needed by the dsl_pool
5437 	 * layer.
5438 	 */
5439 	spa_import_progress_set_notes(spa, "Loading special MOS directories");
5440 	error = spa_ld_load_special_directories(spa);
5441 	if (error != 0)
5442 		goto fail;
5443 
5444 	/*
5445 	 * Retrieve pool properties from the MOS.
5446 	 */
5447 	spa_import_progress_set_notes(spa, "Loading properties");
5448 	error = spa_ld_get_props(spa);
5449 	if (error != 0)
5450 		goto fail;
5451 
5452 	/*
5453 	 * Retrieve the list of auxiliary devices - cache devices and spares -
5454 	 * and open them.
5455 	 */
5456 	spa_import_progress_set_notes(spa, "Loading AUX vdevs");
5457 	error = spa_ld_open_aux_vdevs(spa, type);
5458 	if (error != 0)
5459 		goto fail;
5460 
5461 	/*
5462 	 * Load the metadata for all vdevs. Also check if unopenable devices
5463 	 * should be autoreplaced.
5464 	 */
5465 	spa_import_progress_set_notes(spa, "Loading vdev metadata");
5466 	error = spa_ld_load_vdev_metadata(spa);
5467 	if (error != 0)
5468 		goto fail;
5469 
5470 	spa_import_progress_set_notes(spa, "Loading dedup tables");
5471 	error = spa_ld_load_dedup_tables(spa);
5472 	if (error != 0)
5473 		goto fail;
5474 
5475 	spa_import_progress_set_notes(spa, "Loading BRT");
5476 	error = spa_ld_load_brt(spa);
5477 	if (error != 0)
5478 		goto fail;
5479 
5480 	/*
5481 	 * Verify the logs now to make sure we don't have any unexpected errors
5482 	 * when we claim log blocks later.
5483 	 */
5484 	spa_import_progress_set_notes(spa, "Verifying Log Devices");
5485 	error = spa_ld_verify_logs(spa, type, ereport);
5486 	if (error != 0)
5487 		goto fail;
5488 
5489 	if (missing_feat_write) {
5490 		ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT);
5491 
5492 		/*
5493 		 * At this point, we know that we can open the pool in
5494 		 * read-only mode but not read-write mode. We now have enough
5495 		 * information and can return to userland.
5496 		 */
5497 		error = spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT,
5498 		    ENOTSUP);
5499 		goto fail;
5500 	}
5501 
5502 	/*
5503 	 * Traverse the last txgs to make sure the pool was left off in a safe
5504 	 * state. When performing an extreme rewind, we verify the whole pool,
5505 	 * which can take a very long time.
5506 	 */
5507 	spa_import_progress_set_notes(spa, "Verifying pool data");
5508 	error = spa_ld_verify_pool_data(spa);
5509 	if (error != 0)
5510 		goto fail;
5511 
5512 	/*
5513 	 * Calculate the deflated space for the pool. This must be done before
5514 	 * we write anything to the pool because we'd need to update the space
5515 	 * accounting using the deflated sizes.
5516 	 */
5517 	spa_import_progress_set_notes(spa, "Calculating deflated space");
5518 	spa_update_dspace(spa);
5519 
5520 	/*
5521 	 * We have now retrieved all the information we needed to open the
5522 	 * pool. If we are importing the pool in read-write mode, a few
5523 	 * additional steps must be performed to finish the import.
5524 	 */
5525 	spa_import_progress_set_notes(spa, "Starting import");
5526 	if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER ||
5527 	    spa->spa_load_max_txg == UINT64_MAX)) {
5528 		uint64_t config_cache_txg = spa->spa_config_txg;
5529 
5530 		ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT);
5531 
5532 		/*
5533 		 * Before we do any zio_write's, complete the raidz expansion
5534 		 * scratch space copying, if necessary.
5535 		 */
5536 		if (RRSS_GET_STATE(&spa->spa_uberblock) == RRSS_SCRATCH_VALID)
5537 			vdev_raidz_reflow_copy_scratch(spa);
5538 
5539 		/*
5540 		 * In case of a checkpoint rewind, log the original txg
5541 		 * of the checkpointed uberblock.
5542 		 */
5543 		if (checkpoint_rewind) {
5544 			spa_history_log_internal(spa, "checkpoint rewind",
5545 			    NULL, "rewound state to txg=%llu",
5546 			    (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg);
5547 		}
5548 
5549 		spa_import_progress_set_notes(spa, "Claiming ZIL blocks");
5550 		/*
5551 		 * Traverse the ZIL and claim all blocks.
5552 		 */
5553 		spa_ld_claim_log_blocks(spa);
5554 
5555 		/*
5556 		 * Kick-off the syncing thread.
5557 		 */
5558 		spa->spa_sync_on = B_TRUE;
5559 		txg_sync_start(spa->spa_dsl_pool);
5560 		mmp_thread_start(spa);
5561 
5562 		/*
5563 		 * Wait for all claims to sync.  We sync up to the highest
5564 		 * claimed log block birth time so that claimed log blocks
5565 		 * don't appear to be from the future.  spa_claim_max_txg
5566 		 * will have been set for us by ZIL traversal operations
5567 		 * performed above.
5568 		 */
5569 		spa_import_progress_set_notes(spa, "Syncing ZIL claims");
5570 		txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
5571 
5572 		/*
5573 		 * Check if we need to request an update of the config. On the
5574 		 * next sync, we would update the config stored in vdev labels
5575 		 * and the cachefile (by default /etc/zfs/zpool.cache).
5576 		 */
5577 		spa_import_progress_set_notes(spa, "Updating configs");
5578 		spa_ld_check_for_config_update(spa, config_cache_txg,
5579 		    update_config_cache);
5580 
5581 		/*
5582 		 * Check if a rebuild was in progress and if so resume it.
5583 		 * Then check all DTLs to see if anything needs resilvering.
5584 		 * The resilver will be deferred if a rebuild was started.
5585 		 */
5586 		spa_import_progress_set_notes(spa, "Starting resilvers");
5587 		if (vdev_rebuild_active(spa->spa_root_vdev)) {
5588 			vdev_rebuild_restart(spa);
5589 		} else if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
5590 		    vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
5591 			spa_async_request(spa, SPA_ASYNC_RESILVER);
5592 		}
5593 
5594 		/*
5595 		 * Log the fact that we booted up (so that we can detect if
5596 		 * we rebooted in the middle of an operation).
5597 		 */
5598 		spa_history_log_version(spa, "open", NULL);
5599 
5600 		spa_import_progress_set_notes(spa,
5601 		    "Restarting device removals");
5602 		spa_restart_removal(spa);
5603 		spa_spawn_aux_threads(spa);
5604 
5605 		/*
5606 		 * Delete any inconsistent datasets.
5607 		 *
5608 		 * Note:
5609 		 * Since we may be issuing deletes for clones here,
5610 		 * we make sure to do so after we've spawned all the
5611 		 * auxiliary threads above (which include the livelist
5612 		 * deletion zthr).
5613 		 */
5614 		spa_import_progress_set_notes(spa,
5615 		    "Cleaning up inconsistent objsets");
5616 		(void) dmu_objset_find(spa_name(spa),
5617 		    dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
5618 
5619 		/*
5620 		 * Clean up any stale temporary dataset userrefs.
5621 		 */
5622 		spa_import_progress_set_notes(spa,
5623 		    "Cleaning up temporary userrefs");
5624 		dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
5625 
5626 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
5627 		spa_import_progress_set_notes(spa, "Restarting initialize");
5628 		vdev_initialize_restart(spa->spa_root_vdev);
5629 		spa_import_progress_set_notes(spa, "Restarting TRIM");
5630 		vdev_trim_restart(spa->spa_root_vdev);
5631 		vdev_autotrim_restart(spa);
5632 		spa_config_exit(spa, SCL_CONFIG, FTAG);
5633 		spa_import_progress_set_notes(spa, "Finished importing");
5634 	}
5635 	zio_handle_import_delay(spa, gethrtime() - load_start);
5636 
5637 	spa_import_progress_remove(spa_guid(spa));
5638 	spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD);
5639 
5640 	spa_load_note(spa, "LOADED");
5641 fail:
5642 	mutex_enter(&spa_namespace_lock);
5643 	spa->spa_load_thread = NULL;
5644 	cv_broadcast(&spa_namespace_cv);
5645 
5646 	return (error);
5647 
5648 }
5649 
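/*
 * Unload the pool and retry spa_load() with spa_load_max_txg capped just
 * below the txg of the uberblock used in the previous attempt.
 */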
5650 static int
5651 spa_load_retry(spa_t *spa, spa_load_state_t state)
5652 {
5653 	spa_mode_t mode = spa->spa_mode;
5654 
5655 	spa_unload(spa);
5656 	spa_deactivate(spa);
5657 
5658 	spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1;
5659 
5660 	spa_activate(spa, mode);
5661 	spa_async_suspend(spa);
5662 
5663 	spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu",
5664 	    (u_longlong_t)spa->spa_load_max_txg);
5665 
5666 	return (spa_load(spa, state, SPA_IMPORT_EXISTING));
5667 }
5668 
5669 /*
5670  * If spa_load() fails this function will try loading prior txg's. If
5671  * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
5672  * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
5673  * function will not rewind the pool and will return the same error as
5674  * spa_load().
5675  */
5676 static int
5677 spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
5678     int rewind_flags)
5679 {
5680 	nvlist_t *loadinfo = NULL;
5681 	nvlist_t *config = NULL;
5682 	int load_error, rewind_error;
5683 	uint64_t safe_rewind_txg;
5684 	uint64_t min_txg;
5685 
5686 	if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
5687 		spa->spa_load_max_txg = spa->spa_load_txg;
5688 		spa_set_log_state(spa, SPA_LOG_CLEAR);
5689 	} else {
5690 		spa->spa_load_max_txg = max_request;
5691 		if (max_request != UINT64_MAX)
5692 			spa->spa_extreme_rewind = B_TRUE;
5693 	}
5694 
5695 	load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING);
5696 	if (load_error == 0)
5697 		return (0);
5698 	if (load_error == ZFS_ERR_NO_CHECKPOINT) {
5699 		/*
5700 		 * When attempting checkpoint-rewind on a pool with no
5701 		 * checkpoint, we should not attempt to load uberblocks
5702 		 * from previous txgs when spa_load fails.
5703 		 */
5704 		ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
5705 		spa_import_progress_remove(spa_guid(spa));
5706 		return (load_error);
5707 	}
5708 
5709 	if (spa->spa_root_vdev != NULL)
5710 		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
5711 
5712 	spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
5713 	spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
5714 
5715 	if (rewind_flags & ZPOOL_NEVER_REWIND) {
5716 		nvlist_free(config);
5717 		spa_import_progress_remove(spa_guid(spa));
5718 		return (load_error);
5719 	}
5720 
5721 	if (state == SPA_LOAD_RECOVER) {
5722 		/* Price of rolling back is discarding txgs, including log */
5723 		spa_set_log_state(spa, SPA_LOG_CLEAR);
5724 	} else {
5725 		/*
5726 		 * If we aren't rolling back save the load info from our first
5727 		 * import attempt so that we can restore it after attempting
5728 		 * to rewind.
5729 		 */
5730 		loadinfo = spa->spa_load_info;
5731 		spa->spa_load_info = fnvlist_alloc();
5732 	}
5733 
5734 	spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
5735 	safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
5736 	min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
5737 	    TXG_INITIAL : safe_rewind_txg;
5738 
5739 	/*
5740 	 * Continue as long as we're finding errors, we're still within
5741 	 * the acceptable rewind range, and we're still finding uberblocks.
5742 	 */
5743 	while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
5744 	    spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
5745 		if (spa->spa_load_max_txg < safe_rewind_txg)
5746 			spa->spa_extreme_rewind = B_TRUE;
5747 		rewind_error = spa_load_retry(spa, state);
5748 	}
5749 
5750 	spa->spa_extreme_rewind = B_FALSE;
5751 	spa->spa_load_max_txg = UINT64_MAX;
5752 
5753 	if (config && (rewind_error || state != SPA_LOAD_RECOVER))
5754 		spa_config_set(spa, config);
5755 	else
5756 		nvlist_free(config);
5757 
5758 	if (state == SPA_LOAD_RECOVER) {
5759 		ASSERT3P(loadinfo, ==, NULL);
5760 		spa_import_progress_remove(spa_guid(spa));
5761 		return (rewind_error);
5762 	} else {
5763 		/* Store the rewind info as part of the initial load info */
5764 		fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
5765 		    spa->spa_load_info);
5766 
5767 		/* Restore the initial load info */
5768 		fnvlist_free(spa->spa_load_info);
5769 		spa->spa_load_info = loadinfo;
5770 
5771 		spa_import_progress_remove(spa_guid(spa));
5772 		return (load_error);
5773 	}
5774 }
5775 
5776 /*
5777  * Pool Open/Import
5778  *
5779  * The import case is identical to an open except that the configuration is sent
5780  * down from userland, instead of grabbed from the configuration cache.  For the
5781  * case of an open, the pool configuration will exist in the
5782  * POOL_STATE_UNINITIALIZED state.
5783  *
5784  * The stats information (gen/count/ustats) is used to gather vdev statistics at
5785  * the same time we open the pool, without having to keep around the spa_t in some
5786  * ambiguous state.
5787  */
5788 static int
5789 spa_open_common(const char *pool, spa_t **spapp, const void *tag,
5790     nvlist_t *nvpolicy, nvlist_t **config)
5791 {
5792 	spa_t *spa;
5793 	spa_load_state_t state = SPA_LOAD_OPEN;
5794 	int error;
5795 	int locked = B_FALSE;
5796 	int firstopen = B_FALSE;
5797 
5798 	*spapp = NULL;
5799 
5800 	/*
5801 	 * As disgusting as this is, we need to support recursive calls to this
5802 	 * function because dsl_dir_open() is called during spa_load(), and ends
5803 	 * up calling spa_open() again.  The real fix is to figure out how to
5804 	 * avoid dsl_dir_open() calling this in the first place.
5805 	 */
5806 	if (MUTEX_NOT_HELD(&spa_namespace_lock)) {
5807 		mutex_enter(&spa_namespace_lock);
5808 		locked = B_TRUE;
5809 	}
5810 
5811 	if ((spa = spa_lookup(pool)) == NULL) {
5812 		if (locked)
5813 			mutex_exit(&spa_namespace_lock);
5814 		return (SET_ERROR(ENOENT));
5815 	}
5816 
5817 	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
5818 		zpool_load_policy_t policy;
5819 
5820 		firstopen = B_TRUE;
5821 
5822 		zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config,
5823 		    &policy);
5824 		if (policy.zlp_rewind & ZPOOL_DO_REWIND)
5825 			state = SPA_LOAD_RECOVER;
5826 
5827 		spa_activate(spa, spa_mode_global);
5828 
5829 		if (state != SPA_LOAD_RECOVER)
5830 			spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
5831 		spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
5832 
5833 		zfs_dbgmsg("spa_open_common: opening %s", pool);
5834 		error = spa_load_best(spa, state, policy.zlp_txg,
5835 		    policy.zlp_rewind);
5836 
5837 		if (error == EBADF) {
5838 			/*
5839 			 * If vdev_validate() returns failure (indicated by
5840 			 * EBADF), it means that one of the vdevs indicates
5841 			 * that the pool has been exported or destroyed.  If
5842 			 * this is the case, the config cache is out of sync and
5843 			 * we should remove the pool from the namespace.
5844 			 */
5845 			spa_unload(spa);
5846 			spa_deactivate(spa);
5847 			spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE);
5848 			spa_remove(spa);
5849 			if (locked)
5850 				mutex_exit(&spa_namespace_lock);
5851 			return (SET_ERROR(ENOENT));
5852 		}
5853 
5854 		if (error) {
5855 			/*
5856 			 * We can't open the pool, but we still have useful
5857 			 * information: the state of each vdev after the
5858 			 * attempted vdev_open().  Return this to the user.
5859 			 */
5860 			if (config != NULL && spa->spa_config) {
5861 				*config = fnvlist_dup(spa->spa_config);
5862 				fnvlist_add_nvlist(*config,
5863 				    ZPOOL_CONFIG_LOAD_INFO,
5864 				    spa->spa_load_info);
5865 			}
5866 			spa_unload(spa);
5867 			spa_deactivate(spa);
5868 			spa->spa_last_open_failed = error;
5869 			if (locked)
5870 				mutex_exit(&spa_namespace_lock);
5871 			*spapp = NULL;
5872 			return (error);
5873 		}
5874 	}
5875 
5876 	spa_open_ref(spa, tag);
5877 
5878 	if (config != NULL)
5879 		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
5880 
5881 	/*
5882 	 * If we've recovered the pool, pass back any information we
5883 	 * gathered while doing the load.
5884 	 */
5885 	if (state == SPA_LOAD_RECOVER && config != NULL) {
5886 		fnvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
5887 		    spa->spa_load_info);
5888 	}
5889 
5890 	if (locked) {
5891 		spa->spa_last_open_failed = 0;
5892 		spa->spa_last_ubsync_txg = 0;
5893 		spa->spa_load_txg = 0;
5894 		mutex_exit(&spa_namespace_lock);
5895 	}
5896 
5897 	if (firstopen)
5898 		zvol_create_minors_recursive(spa_name(spa));
5899 
5900 	*spapp = spa;
5901 
5902 	return (0);
5903 }
5904 
5905 int
5906 spa_open_rewind(const char *name, spa_t **spapp, const void *tag,
5907     nvlist_t *policy, nvlist_t **config)
5908 {
5909 	return (spa_open_common(name, spapp, tag, policy, config));
5910 }
5911 
5912 int
5913 spa_open(const char *name, spa_t **spapp, const void *tag)
5914 {
5915 	return (spa_open_common(name, spapp, tag, NULL, NULL));
5916 }
5917 
5918 /*
5919  * Look up the given spa_t, incrementing the inject count in the process,
5920  * preventing it from being exported or destroyed.
5921  */
5922 spa_t *
5923 spa_inject_addref(char *name)
5924 {
5925 	spa_t *spa;
5926 
5927 	mutex_enter(&spa_namespace_lock);
5928 	if ((spa = spa_lookup(name)) == NULL) {
5929 		mutex_exit(&spa_namespace_lock);
5930 		return (NULL);
5931 	}
5932 	spa->spa_inject_ref++;
5933 	mutex_exit(&spa_namespace_lock);
5934 
5935 	return (spa);
5936 }
5937 
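/*
 * Drop the inject reference taken by spa_inject_addref().
 */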
5938 void
5939 spa_inject_delref(spa_t *spa)
5940 {
5941 	mutex_enter(&spa_namespace_lock);
5942 	spa->spa_inject_ref--;
5943 	mutex_exit(&spa_namespace_lock);
5944 }
5945 
5946 /*
5947  * Add spare device information to the nvlist.
5948  */
5949 static void
5950 spa_add_spares(spa_t *spa, nvlist_t *config)
5951 {
5952 	nvlist_t **spares;
5953 	uint_t i, nspares;
5954 	nvlist_t *nvroot;
5955 	uint64_t guid;
5956 	vdev_stat_t *vs;
5957 	uint_t vsc;
5958 	uint64_t pool;
5959 
5960 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
5961 
5962 	if (spa->spa_spares.sav_count == 0)
5963 		return;
5964 
5965 	nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
5966 	VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
5967 	    ZPOOL_CONFIG_SPARES, &spares, &nspares));
5968 	if (nspares != 0) {
5969 		fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
5970 		    (const nvlist_t * const *)spares, nspares);
5971 		VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
5972 		    &spares, &nspares));
5973 
5974 		/*
5975 		 * Go through and find any spares which have since been
5976 		 * repurposed as an active spare.  If this is the case, update
5977 		 * their status appropriately.
5978 		 */
5979 		for (i = 0; i < nspares; i++) {
5980 			guid = fnvlist_lookup_uint64(spares[i],
5981 			    ZPOOL_CONFIG_GUID);
5982 			VERIFY0(nvlist_lookup_uint64_array(spares[i],
5983 			    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc));
5984 			if (spa_spare_exists(guid, &pool, NULL) &&
5985 			    pool != 0ULL) {
5986 				vs->vs_state = VDEV_STATE_CANT_OPEN;
5987 				vs->vs_aux = VDEV_AUX_SPARED;
5988 			} else {
5989 				vs->vs_state =
5990 				    spa->spa_spares.sav_vdevs[i]->vdev_state;
5991 			}
5992 		}
5993 	}
5994 }
5995 
5996 /*
5997  * Add l2cache device information to the nvlist, including vdev stats.
5998  */
5999 static void
6000 spa_add_l2cache(spa_t *spa, nvlist_t *config)
6001 {
6002 	nvlist_t **l2cache;
6003 	uint_t i, j, nl2cache;
6004 	nvlist_t *nvroot;
6005 	uint64_t guid;
6006 	vdev_t *vd;
6007 	vdev_stat_t *vs;
6008 	uint_t vsc;
6009 
6010 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
6011 
6012 	if (spa->spa_l2cache.sav_count == 0)
6013 		return;
6014 
6015 	nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
6016 	VERIFY0(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
6017 	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache));
6018 	if (nl2cache != 0) {
6019 		fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
6020 		    (const nvlist_t * const *)l2cache, nl2cache);
6021 		VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
6022 		    &l2cache, &nl2cache));
6023 
6024 		/*
6025 		 * Update level 2 cache device stats.
6026 		 */
6027 
6028 		for (i = 0; i < nl2cache; i++) {
6029 			guid = fnvlist_lookup_uint64(l2cache[i],
6030 			    ZPOOL_CONFIG_GUID);
6031 
6032 			vd = NULL;
6033 			for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
6034 				if (guid ==
6035 				    spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
6036 					vd = spa->spa_l2cache.sav_vdevs[j];
6037 					break;
6038 				}
6039 			}
6040 			ASSERT(vd != NULL);
6041 
6042 			VERIFY0(nvlist_lookup_uint64_array(l2cache[i],
6043 			    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc));
6044 			vdev_get_stats(vd, vs);
6045 			vdev_config_generate_stats(vd, l2cache[i]);
6046 
6047 		}
6048 	}
6049 }
6050 
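/*
 * Walk the MOS feature-for-read and feature-for-write ZAP objects and add
 * each feature's reference count to the 'features' nvlist.
 */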
6051 static void
6052 spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features)
6053 {
6054 	zap_cursor_t zc;
6055 	zap_attribute_t *za = zap_attribute_alloc();
6056 
6057 	if (spa->spa_feat_for_read_obj != 0) {
6058 		for (zap_cursor_init(&zc, spa->spa_meta_objset,
6059 		    spa->spa_feat_for_read_obj);
6060 		    zap_cursor_retrieve(&zc, za) == 0;
6061 		    zap_cursor_advance(&zc)) {
6062 			ASSERT(za->za_integer_length == sizeof (uint64_t) &&
6063 			    za->za_num_integers == 1);
6064 			VERIFY0(nvlist_add_uint64(features, za->za_name,
6065 			    za->za_first_integer));
6066 		}
6067 		zap_cursor_fini(&zc);
6068 	}
6069 
6070 	if (spa->spa_feat_for_write_obj != 0) {
6071 		for (zap_cursor_init(&zc, spa->spa_meta_objset,
6072 		    spa->spa_feat_for_write_obj);
6073 		    zap_cursor_retrieve(&zc, za) == 0;
6074 		    zap_cursor_advance(&zc)) {
6075 			ASSERT(za->za_integer_length == sizeof (uint64_t) &&
6076 			    za->za_num_integers == 1);
6077 			VERIFY0(nvlist_add_uint64(features, za->za_name,
6078 			    za->za_first_integer));
6079 		}
6080 		zap_cursor_fini(&zc);
6081 	}
6082 	zap_attribute_free(za);
6083 }
6084 
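/*
 * Refresh the 'features' nvlist from the in-core feature reference
 * counts, without issuing any I/O.
 */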
6085 static void
6086 spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features)
6087 {
6088 	int i;
6089 
6090 	for (i = 0; i < SPA_FEATURES; i++) {
6091 		zfeature_info_t feature = spa_feature_table[i];
6092 		uint64_t refcount;
6093 
6094 		if (feature_get_refcount(spa, &feature, &refcount) != 0)
6095 			continue;
6096 
6097 		VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount));
6098 	}
6099 }
6100 
6101 /*
6102  * Store a list of pool features and their reference counts in the
6103  * config.
6104  *
6105  * The first time this is called on a spa, allocate a new nvlist, fetch
6106  * the pool features and reference counts from disk, then save the list
6107  * in the spa. In subsequent calls on the same spa, use the saved nvlist
6108  * and refresh its values from the cached reference counts.  This
6109  * ensures we don't block here on I/O if the pool is suspended, so
6110  * 'zpool clear' can resume the pool.
6111  */
6112 static void
6113 spa_add_feature_stats(spa_t *spa, nvlist_t *config)
6114 {
6115 	nvlist_t *features;
6116 
6117 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
6118 
6119 	mutex_enter(&spa->spa_feat_stats_lock);
6120 	features = spa->spa_feat_stats;
6121 
6122 	if (features != NULL) {
6123 		spa_feature_stats_from_cache(spa, features);
6124 	} else {
6125 		VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP));
6126 		spa->spa_feat_stats = features;
6127 		spa_feature_stats_from_disk(spa, features);
6128 	}
6129 
6130 	VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
6131 	    features));
6132 
6133 	mutex_exit(&spa->spa_feat_stats_lock);
6134 }
6135 
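/*
 * Open the named pool (if possible) and generate its config, including
 * spare, l2cache and feature statistics, plus the alternate root.
 */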
6136 int
6137 spa_get_stats(const char *name, nvlist_t **config,
6138     char *altroot, size_t buflen)
6139 {
6140 	int error;
6141 	spa_t *spa;
6142 
6143 	*config = NULL;
6144 	error = spa_open_common(name, &spa, FTAG, NULL, config);
6145 
6146 	if (spa != NULL) {
6147 		/*
6148 		 * This still leaves a window of inconsistency where the spares
6149 		 * or l2cache devices could change and the config would be
6150 		 * self-inconsistent.
6151 		 */
6152 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
6153 
6154 		if (*config != NULL) {
6155 			uint64_t loadtimes[2];
6156 
6157 			loadtimes[0] = spa->spa_loaded_ts.tv_sec;
6158 			loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
6159 			fnvlist_add_uint64_array(*config,
6160 			    ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2);
6161 
6162 			fnvlist_add_uint64(*config,
6163 			    ZPOOL_CONFIG_ERRCOUNT,
6164 			    spa_approx_errlog_size(spa));
6165 
6166 			if (spa_suspended(spa)) {
6167 				fnvlist_add_uint64(*config,
6168 				    ZPOOL_CONFIG_SUSPENDED,
6169 				    spa->spa_failmode);
6170 				fnvlist_add_uint64(*config,
6171 				    ZPOOL_CONFIG_SUSPENDED_REASON,
6172 				    spa->spa_suspended);
6173 			}
6174 
6175 			spa_add_spares(spa, *config);
6176 			spa_add_l2cache(spa, *config);
6177 			spa_add_feature_stats(spa, *config);
6178 		}
6179 	}
6180 
6181 	/*
6182 	 * We want to get the alternate root even for faulted pools, so we cheat
6183 	 * and call spa_lookup() directly.
6184 	 */
6185 	if (altroot) {
6186 		if (spa == NULL) {
6187 			mutex_enter(&spa_namespace_lock);
6188 			spa = spa_lookup(name);
6189 			if (spa)
6190 				spa_altroot(spa, altroot, buflen);
6191 			else
6192 				altroot[0] = '\0';
6193 			spa = NULL;
6194 			mutex_exit(&spa_namespace_lock);
6195 		} else {
6196 			spa_altroot(spa, altroot, buflen);
6197 		}
6198 	}
6199 
6200 	if (spa != NULL) {
6201 		spa_config_exit(spa, SCL_CONFIG, FTAG);
6202 		spa_close(spa, FTAG);
6203 	}
6204 
6205 	return (error);
6206 }
6207 
6208 /*
6209  * Validate that the auxiliary device array is well formed.  We must have an
6210  * array of nvlists, each of which describes a valid leaf vdev.  If this is an
6211  * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
6212  * specified, as long as they are well-formed.
6213  */
6214 static int
6215 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
6216     spa_aux_vdev_t *sav, const char *config, uint64_t version,
6217     vdev_labeltype_t label)
6218 {
6219 	nvlist_t **dev;
6220 	uint_t i, ndev;
6221 	vdev_t *vd;
6222 	int error;
6223 
6224 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
6225 
6226 	/*
6227 	 * It's acceptable to have no devs specified.
6228 	 */
6229 	if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
6230 		return (0);
6231 
6232 	if (ndev == 0)
6233 		return (SET_ERROR(EINVAL));
6234 
6235 	/*
6236 	 * Make sure the pool is formatted with a version that supports this
6237 	 * device type.
6238 	 */
6239 	if (spa_version(spa) < version)
6240 		return (SET_ERROR(ENOTSUP));
6241 
6242 	/*
6243 	 * Set the pending device list so we correctly handle device in-use
6244 	 * checking.
6245 	 */
6246 	sav->sav_pending = dev;
6247 	sav->sav_npending = ndev;
6248 
6249 	for (i = 0; i < ndev; i++) {
6250 		if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
6251 		    mode)) != 0)
6252 			goto out;
6253 
6254 		if (!vd->vdev_ops->vdev_op_leaf) {
6255 			vdev_free(vd);
6256 			error = SET_ERROR(EINVAL);
6257 			goto out;
6258 		}
6259 
6260 		vd->vdev_top = vd;
6261 
6262 		if ((error = vdev_open(vd)) == 0 &&
6263 		    (error = vdev_label_init(vd, crtxg, label)) == 0) {
6264 			fnvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
6265 			    vd->vdev_guid);
6266 		}
6267 
6268 		vdev_free(vd);
6269 
6270 		if (error &&
6271 		    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
6272 			goto out;
6273 		else
6274 			error = 0;
6275 	}
6276 
6277 out:
6278 	sav->sav_pending = NULL;
6279 	sav->sav_npending = 0;
6280 	return (error);
6281 }
6282 
6283 static int
6284 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
6285 {
6286 	int error;
6287 
6288 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
6289 
6290 	if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
6291 	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
6292 	    VDEV_LABEL_SPARE)) != 0) {
6293 		return (error);
6294 	}
6295 
6296 	return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
6297 	    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
6298 	    VDEV_LABEL_L2CACHE));
6299 }
6300 
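/*
 * Set the aux device (spare or l2cache) list under 'config' in sav_config,
 * merging the given devs with any devices already present.
 */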
6301 static void
6302 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
6303     const char *config)
6304 {
6305 	int i;
6306 
6307 	if (sav->sav_config != NULL) {
6308 		nvlist_t **olddevs;
6309 		uint_t oldndevs;
6310 		nvlist_t **newdevs;
6311 
6312 		/*
6313 		 * Generate new dev list by concatenating with the
6314 		 * current dev list.
6315 		 */
6316 		VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, config,
6317 		    &olddevs, &oldndevs));
6318 
6319 		newdevs = kmem_alloc(sizeof (void *) *
6320 		    (ndevs + oldndevs), KM_SLEEP);
6321 		for (i = 0; i < oldndevs; i++)
6322 			newdevs[i] = fnvlist_dup(olddevs[i]);
6323 		for (i = 0; i < ndevs; i++)
6324 			newdevs[i + oldndevs] = fnvlist_dup(devs[i]);
6325 
6326 		fnvlist_remove(sav->sav_config, config);
6327 
6328 		fnvlist_add_nvlist_array(sav->sav_config, config,
6329 		    (const nvlist_t * const *)newdevs, ndevs + oldndevs);
6330 		for (i = 0; i < oldndevs + ndevs; i++)
6331 			nvlist_free(newdevs[i]);
6332 		kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
6333 	} else {
6334 		/*
6335 		 * Generate a new dev list.
6336 		 */
6337 		sav->sav_config = fnvlist_alloc();
6338 		fnvlist_add_nvlist_array(sav->sav_config, config,
6339 		    (const nvlist_t * const *)devs, ndevs);
6340 	}
6341 }
6342 
6343 /*
6344  * Stop and drop level 2 ARC devices
6345  */
6346 void
6347 spa_l2cache_drop(spa_t *spa)
6348 {
6349 	vdev_t *vd;
6350 	int i;
6351 	spa_aux_vdev_t *sav = &spa->spa_l2cache;
6352 
6353 	for (i = 0; i < sav->sav_count; i++) {
6354 		uint64_t pool;
6355 
6356 		vd = sav->sav_vdevs[i];
6357 		ASSERT(vd != NULL);
6358 
6359 		if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
6360 		    pool != 0ULL && l2arc_vdev_present(vd))
6361 			l2arc_remove_vdev(vd);
6362 	}
6363 }
6364 
6365 /*
6366  * Verify encryption parameters for spa creation. If we are encrypting, we must
6367  * have the encryption feature flag enabled.
6368  */
6369 static int
6370 spa_create_check_encryption_params(dsl_crypto_params_t *dcp,
6371     boolean_t has_encryption)
6372 {
6373 	if (dcp->cp_crypt != ZIO_CRYPT_OFF &&
6374 	    dcp->cp_crypt != ZIO_CRYPT_INHERIT &&
6375 	    !has_encryption)
6376 		return (SET_ERROR(ENOTSUP));
6377 
6378 	return (dmu_objset_create_crypt_check(NULL, dcp, NULL));
6379 }
6380 
6381 /*
6382  * Pool Creation
6383  */
6384 int
6385 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
6386     nvlist_t *zplprops, dsl_crypto_params_t *dcp)
6387 {
6388 	spa_t *spa;
6389 	const char *altroot = NULL;
6390 	vdev_t *rvd;
6391 	dsl_pool_t *dp;
6392 	dmu_tx_t *tx;
6393 	int error = 0;
6394 	uint64_t txg = TXG_INITIAL;
6395 	nvlist_t **spares, **l2cache;
6396 	uint_t nspares, nl2cache;
6397 	uint64_t version, obj, ndraid = 0;
6398 	boolean_t has_features;
6399 	boolean_t has_encryption;
6400 	boolean_t has_allocclass;
6401 	spa_feature_t feat;
6402 	const char *feat_name;
6403 	const char *poolname;
6404 	nvlist_t *nvl;
6405 
6406 	if (props == NULL ||
6407 	    nvlist_lookup_string(props,
6408 	    zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0)
6409 		poolname = (char *)pool;
6410 
6411 	/*
6412 	 * If this pool already exists, return failure.
6413 	 */
6414 	mutex_enter(&spa_namespace_lock);
6415 	if (spa_lookup(poolname) != NULL) {
6416 		mutex_exit(&spa_namespace_lock);
6417 		return (SET_ERROR(EEXIST));
6418 	}
6419 
6420 	/*
6421 	 * Allocate a new spa_t structure.
6422 	 */
6423 	nvl = fnvlist_alloc();
6424 	fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool);
6425 	(void) nvlist_lookup_string(props,
6426 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
6427 	spa = spa_add(poolname, nvl, altroot);
6428 	fnvlist_free(nvl);
6429 	spa_activate(spa, spa_mode_global);
6430 
6431 	if (props && (error = spa_prop_validate(spa, props))) {
6432 		spa_deactivate(spa);
6433 		spa_remove(spa);
6434 		mutex_exit(&spa_namespace_lock);
6435 		return (error);
6436 	}
6437 
6438 	/*
6439 	 * Temporary pool names should never be written to disk.
6440 	 */
6441 	if (poolname != pool)
6442 		spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME;
6443 
6444 	has_features = B_FALSE;
6445 	has_encryption = B_FALSE;
6446 	has_allocclass = B_FALSE;
6447 	for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
6448 	    elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
6449 		if (zpool_prop_feature(nvpair_name(elem))) {
6450 			has_features = B_TRUE;
6451 
6452 			feat_name = strchr(nvpair_name(elem), '@') + 1;
6453 			VERIFY0(zfeature_lookup_name(feat_name, &feat));
6454 			if (feat == SPA_FEATURE_ENCRYPTION)
6455 				has_encryption = B_TRUE;
6456 			if (feat == SPA_FEATURE_ALLOCATION_CLASSES)
6457 				has_allocclass = B_TRUE;
6458 		}
6459 	}
6460 
6461 	/* verify encryption params, if they were provided */
6462 	if (dcp != NULL) {
6463 		error = spa_create_check_encryption_params(dcp, has_encryption);
6464 		if (error != 0) {
6465 			spa_deactivate(spa);
6466 			spa_remove(spa);
6467 			mutex_exit(&spa_namespace_lock);
6468 			return (error);
6469 		}
6470 	}
6471 	if (!has_allocclass && zfs_special_devs(nvroot, NULL)) {
6472 		spa_deactivate(spa);
6473 		spa_remove(spa);
6474 		mutex_exit(&spa_namespace_lock);
6475 		return (ENOTSUP);
6476 	}
6477 
6478 	if (has_features || nvlist_lookup_uint64(props,
6479 	    zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
6480 		version = SPA_VERSION;
6481 	}
6482 	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
6483 
6484 	spa->spa_first_txg = txg;
6485 	spa->spa_uberblock.ub_txg = txg - 1;
6486 	spa->spa_uberblock.ub_version = version;
6487 	spa->spa_ubsync = spa->spa_uberblock;
6488 	spa->spa_load_state = SPA_LOAD_CREATE;
6489 	spa->spa_removing_phys.sr_state = DSS_NONE;
6490 	spa->spa_removing_phys.sr_removing_vdev = -1;
6491 	spa->spa_removing_phys.sr_prev_indirect_vdev = -1;
6492 	spa->spa_indirect_vdevs_loaded = B_TRUE;
6493 
6494 	/*
6495 	 * Create "The Godfather" zio to hold all async IOs
6496 	 */
6497 	spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
6498 	    KM_SLEEP);
6499 	for (int i = 0; i < max_ncpus; i++) {
6500 		spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
6501 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
6502 		    ZIO_FLAG_GODFATHER);
6503 	}
6504 
6505 	/*
6506 	 * Create the root vdev.
6507 	 */
6508 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
6509 
6510 	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
6511 
6512 	ASSERT(error != 0 || rvd != NULL);
6513 	ASSERT(error != 0 || spa->spa_root_vdev == rvd);
6514 
6515 	if (error == 0 && !zfs_allocatable_devs(nvroot))
6516 		error = SET_ERROR(EINVAL);
6517 
6518 	if (error == 0 &&
6519 	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
6520 	    (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 &&
6521 	    (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) {
6522 		/*
6523 		 * Instantiate the metaslab groups (this will dirty the vdevs);
6524 		 * we can no longer error exit past this point.
6525 		 */
6526 		for (int c = 0; error == 0 && c < rvd->vdev_children; c++) {
6527 			vdev_t *vd = rvd->vdev_child[c];
6528 
6529 			vdev_metaslab_set_size(vd);
6530 			vdev_expand(vd, txg);
6531 		}
6532 	}
6533 
6534 	spa_config_exit(spa, SCL_ALL, FTAG);
6535 
6536 	if (error != 0) {
6537 		spa_unload(spa);
6538 		spa_deactivate(spa);
6539 		spa_remove(spa);
6540 		mutex_exit(&spa_namespace_lock);
6541 		return (error);
6542 	}
6543 
6544 	/*
6545 	 * Get the list of spares, if specified.
6546 	 */
6547 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
6548 	    &spares, &nspares) == 0) {
6549 		spa->spa_spares.sav_config = fnvlist_alloc();
6550 		fnvlist_add_nvlist_array(spa->spa_spares.sav_config,
6551 		    ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares,
6552 		    nspares);
6553 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
6554 		spa_load_spares(spa);
6555 		spa_config_exit(spa, SCL_ALL, FTAG);
6556 		spa->spa_spares.sav_sync = B_TRUE;
6557 	}
6558 
6559 	/*
6560 	 * Get the list of level 2 cache devices, if specified.
6561 	 */
6562 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
6563 	    &l2cache, &nl2cache) == 0) {
6564 		VERIFY0(nvlist_alloc(&spa->spa_l2cache.sav_config,
6565 		    NV_UNIQUE_NAME, KM_SLEEP));
6566 		fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
6567 		    ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache,
6568 		    nl2cache);
6569 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
6570 		spa_load_l2cache(spa);
6571 		spa_config_exit(spa, SCL_ALL, FTAG);
6572 		spa->spa_l2cache.sav_sync = B_TRUE;
6573 	}
6574 
6575 	spa->spa_is_initializing = B_TRUE;
6576 	spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, dcp, txg);
6577 	spa->spa_is_initializing = B_FALSE;
6578 
6579 	/*
6580 	 * Create DDTs (dedup tables).
6581 	 */
6582 	ddt_create(spa);
6583 	/*
6584 	 * Create BRT table and BRT table object.
6585 	 */
6586 	brt_create(spa);
6587 
6588 	spa_update_dspace(spa);
6589 
6590 	tx = dmu_tx_create_assigned(dp, txg);
6591 
6592 	/*
6593 	 * Create the pool's history object.
6594 	 */
6595 	if (version >= SPA_VERSION_ZPOOL_HISTORY && !spa->spa_history)
6596 		spa_history_create_obj(spa, tx);
6597 
6598 	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE);
6599 	spa_history_log_version(spa, "create", tx);
6600 
6601 	/*
6602 	 * Create the pool config object.
6603 	 */
6604 	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
6605 	    DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
6606 	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
6607 
6608 	if (zap_add(spa->spa_meta_objset,
6609 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
6610 	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
6611 		cmn_err(CE_PANIC, "failed to add pool config");
6612 	}
6613 
6614 	if (zap_add(spa->spa_meta_objset,
6615 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
6616 	    sizeof (uint64_t), 1, &version, tx) != 0) {
6617 		cmn_err(CE_PANIC, "failed to add pool version");
6618 	}
6619 
6620 	/* Newly created pools with the right version are always deflated. */
6621 	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
6622 		spa->spa_deflate = TRUE;
6623 		if (zap_add(spa->spa_meta_objset,
6624 		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
6625 		    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
6626 			cmn_err(CE_PANIC, "failed to add deflate");
6627 		}
6628 	}
6629 
6630 	/*
6631 	 * Create the deferred-free bpobj.  Turn off compression
6632 	 * because sync-to-convergence takes longer if the blocksize
6633 	 * keeps changing.
6634 	 */
6635 	obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
6636 	dmu_object_set_compress(spa->spa_meta_objset, obj,
6637 	    ZIO_COMPRESS_OFF, tx);
6638 	if (zap_add(spa->spa_meta_objset,
6639 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
6640 	    sizeof (uint64_t), 1, &obj, tx) != 0) {
6641 		cmn_err(CE_PANIC, "failed to add bpobj");
6642 	}
6643 	VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
6644 	    spa->spa_meta_objset, obj));
6645 
6646 	/*
6647 	 * Generate some random noise for salted checksums to operate on.
6648 	 */
6649 	(void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
6650 	    sizeof (spa->spa_cksum_salt.zcs_bytes));
6651 
6652 	/*
6653 	 * Set pool properties.
6654 	 */
6655 	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
6656 	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
6657 	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
6658 	spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
6659 	spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST);
6660 	spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM);
6661 	spa->spa_dedup_table_quota =
6662 	    zpool_prop_default_numeric(ZPOOL_PROP_DEDUP_TABLE_QUOTA);
6663 
6664 	if (props != NULL) {
6665 		spa_configfile_set(spa, props, B_FALSE);
6666 		spa_sync_props(props, tx);
6667 	}
6668 
6669 	for (int i = 0; i < ndraid; i++)
6670 		spa_feature_incr(spa, SPA_FEATURE_DRAID, tx);
6671 
6672 	dmu_tx_commit(tx);
6673 
6674 	spa->spa_sync_on = B_TRUE;
6675 	txg_sync_start(dp);
6676 	mmp_thread_start(spa);
6677 	txg_wait_synced(dp, txg);
6678 
6679 	spa_spawn_aux_threads(spa);
6680 
6681 	spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE);
6682 
6683 	/*
6684 	 * Don't count references from objsets that are already closed
6685 	 * and are making their way through the eviction process.
6686 	 */
6687 	spa_evicting_os_wait(spa);
6688 	spa->spa_minref = zfs_refcount_count(&spa->spa_refcount);
6689 	spa->spa_load_state = SPA_LOAD_NONE;
6690 
6691 	spa_import_os(spa);
6692 
6693 	mutex_exit(&spa_namespace_lock);
6694 
6695 	return (0);
6696 }
6697 
6698 /*
6699  * Import a non-root pool into the system.
6700  */
6701 int
6702 spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
6703 {
6704 	spa_t *spa;
6705 	const char *altroot = NULL;
6706 	spa_load_state_t state = SPA_LOAD_IMPORT;
6707 	zpool_load_policy_t policy;
6708 	spa_mode_t mode = spa_mode_global;
6709 	uint64_t readonly = B_FALSE;
6710 	int error;
6711 	nvlist_t *nvroot;
6712 	nvlist_t **spares, **l2cache;
6713 	uint_t nspares, nl2cache;
6714 
6715 	/*
6716 	 * If a pool with this name exists, return failure.
6717 	 */
6718 	mutex_enter(&spa_namespace_lock);
6719 	if (spa_lookup(pool) != NULL) {
6720 		mutex_exit(&spa_namespace_lock);
6721 		return (SET_ERROR(EEXIST));
6722 	}
6723 
6724 	/*
6725 	 * Create and initialize the spa structure.
6726 	 */
6727 	(void) nvlist_lookup_string(props,
6728 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
6729 	(void) nvlist_lookup_uint64(props,
6730 	    zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
6731 	if (readonly)
6732 		mode = SPA_MODE_READ;
6733 	spa = spa_add(pool, config, altroot);
6734 	spa->spa_import_flags = flags;
6735 
6736 	/*
6737 	 * Verbatim import - Take a pool and insert it into the namespace
6738 	 * as if it had been loaded at boot.
6739 	 */
6740 	if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
6741 		if (props != NULL)
6742 			spa_configfile_set(spa, props, B_FALSE);
6743 
6744 		spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE);
6745 		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
6746 		zfs_dbgmsg("spa_import: verbatim import of %s", pool);
6747 		mutex_exit(&spa_namespace_lock);
6748 		return (0);
6749 	}
6750 
6751 	spa_activate(spa, mode);
6752 
6753 	/*
6754 	 * Don't start async tasks until we know everything is healthy.
6755 	 */
6756 	spa_async_suspend(spa);
6757 
6758 	zpool_get_load_policy(config, &policy);
6759 	if (policy.zlp_rewind & ZPOOL_DO_REWIND)
6760 		state = SPA_LOAD_RECOVER;
6761 
6762 	spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT;
6763 
6764 	if (state != SPA_LOAD_RECOVER) {
6765 		spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
6766 		zfs_dbgmsg("spa_import: importing %s", pool);
6767 	} else {
6768 		zfs_dbgmsg("spa_import: importing %s, max_txg=%lld "
6769 		    "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg);
6770 	}
6771 	error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind);
6772 
6773 	/*
6774 	 * Propagate anything learned while loading the pool and pass it
6775 	 * back to caller (i.e. rewind info, missing devices, etc).
6776 	 */
6777 	fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info);
6778 
6779 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
6780 	/*
6781 	 * Toss any existing spare list, as it is no longer valid and
6782 	 * conflicts with spa_has_spare().
6783 	 */
6784 	if (spa->spa_spares.sav_config) {
6785 		nvlist_free(spa->spa_spares.sav_config);
6786 		spa->spa_spares.sav_config = NULL;
6787 		spa_load_spares(spa);
6788 	}
6789 	if (spa->spa_l2cache.sav_config) {
6790 		nvlist_free(spa->spa_l2cache.sav_config);
6791 		spa->spa_l2cache.sav_config = NULL;
6792 		spa_load_l2cache(spa);
6793 	}
6794 
6795 	nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
6796 	spa_config_exit(spa, SCL_ALL, FTAG);
6797 
6798 	if (props != NULL)
6799 		spa_configfile_set(spa, props, B_FALSE);
6800 
6801 	if (error != 0 || (props && spa_writeable(spa) &&
6802 	    (error = spa_prop_set(spa, props)))) {
6803 		spa_unload(spa);
6804 		spa_deactivate(spa);
6805 		spa_remove(spa);
6806 		mutex_exit(&spa_namespace_lock);
6807 		return (error);
6808 	}
6809 
6810 	spa_async_resume(spa);
6811 
6812 	/*
6813 	 * Override any spares and level 2 cache devices as specified by
6814 	 * the user, as these may have correct device names/devids, etc.
6815 	 */
6816 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
6817 	    &spares, &nspares) == 0) {
6818 		if (spa->spa_spares.sav_config)
6819 			fnvlist_remove(spa->spa_spares.sav_config,
6820 			    ZPOOL_CONFIG_SPARES);
6821 		else
6822 			spa->spa_spares.sav_config = fnvlist_alloc();
6823 		fnvlist_add_nvlist_array(spa->spa_spares.sav_config,
6824 		    ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares,
6825 		    nspares);
6826 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
6827 		spa_load_spares(spa);
6828 		spa_config_exit(spa, SCL_ALL, FTAG);
6829 		spa->spa_spares.sav_sync = B_TRUE;
6830 		spa->spa_spares.sav_label_sync = B_TRUE;
6831 	}
6832 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
6833 	    &l2cache, &nl2cache) == 0) {
6834 		if (spa->spa_l2cache.sav_config)
6835 			fnvlist_remove(spa->spa_l2cache.sav_config,
6836 			    ZPOOL_CONFIG_L2CACHE);
6837 		else
6838 			spa->spa_l2cache.sav_config = fnvlist_alloc();
6839 		fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
6840 		    ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache,
6841 		    nl2cache);
6842 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
6843 		spa_load_l2cache(spa);
6844 		spa_config_exit(spa, SCL_ALL, FTAG);
6845 		spa->spa_l2cache.sav_sync = B_TRUE;
6846 		spa->spa_l2cache.sav_label_sync = B_TRUE;
6847 	}
6848 
6849 	/*
6850 	 * Check for any removed devices.
6851 	 */
6852 	if (spa->spa_autoreplace) {
6853 		spa_aux_check_removed(&spa->spa_spares);
6854 		spa_aux_check_removed(&spa->spa_l2cache);
6855 	}
6856 
6857 	if (spa_writeable(spa)) {
6858 		/*
6859 		 * Update the config cache to include the newly-imported pool.
6860 		 */
6861 		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
6862 	}
6863 
6864 	/*
6865 	 * It's possible that the pool was expanded while it was exported.
6866 	 * We kick off an async task to handle this for us.
6867 	 */
6868 	spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
6869 
6870 	spa_history_log_version(spa, "import", NULL);
6871 
6872 	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
6873 
6874 	mutex_exit(&spa_namespace_lock);
6875 
6876 	zvol_create_minors_recursive(pool);
6877 
6878 	spa_import_os(spa);
6879 
6880 	return (0);
6881 }
6882 
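/*
 * Probe the pool described by 'tryconfig' without importing it: load it
 * read-only under a temporary TRYIMPORT name and return the resulting
 * config, or NULL if the config could not be parsed.
 */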
6883 nvlist_t *
6884 spa_tryimport(nvlist_t *tryconfig)
6885 {
6886 	nvlist_t *config = NULL;
6887 	const char *poolname, *cachefile;
6888 	spa_t *spa;
6889 	uint64_t state;
6890 	int error;
6891 	zpool_load_policy_t policy;
6892 
6893 	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
6894 		return (NULL);
6895 
6896 	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
6897 		return (NULL);
6898 
6899 	/*
6900 	 * Create and initialize the spa structure.
6901 	 */
6902 	char *name = kmem_alloc(MAXPATHLEN, KM_SLEEP);
6903 	(void) snprintf(name, MAXPATHLEN, "%s-%llx-%s",
6904 	    TRYIMPORT_NAME, (u_longlong_t)(uintptr_t)curthread, poolname);
6905 
6906 	mutex_enter(&spa_namespace_lock);
6907 	spa = spa_add(name, tryconfig, NULL);
6908 	spa_activate(spa, SPA_MODE_READ);
6909 	kmem_free(name, MAXPATHLEN);
6910 
6911 	/*
6912 	 * Rewind pool if a max txg was provided.
6913 	 */
6914 	zpool_get_load_policy(spa->spa_config, &policy);
6915 	if (policy.zlp_txg != UINT64_MAX) {
6916 		spa->spa_load_max_txg = policy.zlp_txg;
6917 		spa->spa_extreme_rewind = B_TRUE;
6918 		zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld",
6919 		    poolname, (longlong_t)policy.zlp_txg);
6920 	} else {
6921 		zfs_dbgmsg("spa_tryimport: importing %s", poolname);
6922 	}
6923 
6924 	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile)
6925 	    == 0) {
6926 		zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile);
6927 		spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
6928 	} else {
6929 		spa->spa_config_source = SPA_CONFIG_SRC_SCAN;
6930 	}
6931 
6932 	/*
6933 	 * spa_import() relies on the pool config fetched by spa_tryimport()
6934 	 * for spare/cache devices. Import flags are not passed to
6935 	 * spa_tryimport(), so a missing log device would make it return
6936 	 * early and never retrieve the cache and spare devices.
6937 	 * Passing ZFS_IMPORT_MISSING_LOG to spa_tryimport() makes it fetch
6938 	 * the correct configuration even when a log device is missing.
6939 	 */
6940 	spa->spa_import_flags |= ZFS_IMPORT_MISSING_LOG;
6941 
6942 	error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING);
6943 
6944 	/*
6945 	 * If 'tryconfig' was at least parsable, return the current config.
6946 	 */
6947 	if (spa->spa_root_vdev != NULL) {
6948 		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
6949 		fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, poolname);
6950 		fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state);
6951 		fnvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
6952 		    spa->spa_uberblock.ub_timestamp);
6953 		fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
6954 		    spa->spa_load_info);
6955 		fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA,
6956 		    spa->spa_errata);
6957 
6958 		/*
6959 		 * If the bootfs property exists on this pool then we
6960 		 * copy it out so that external consumers can tell which
6961 		 * pools are bootable.
6962 		 */
6963 		if ((!error || error == EEXIST) && spa->spa_bootfs) {
6964 			char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
6965 
6966 			/*
6967 			 * We have to play games with the name since the
6968 			 * pool was opened as TRYIMPORT_NAME.
6969 			 */
6970 			if (dsl_dsobj_to_dsname(spa_name(spa),
6971 			    spa->spa_bootfs, tmpname) == 0) {
6972 				char *cp;
6973 				char *dsname;
6974 
6975 				dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
6976 
6977 				cp = strchr(tmpname, '/');
6978 				if (cp == NULL) {
6979 					(void) strlcpy(dsname, tmpname,
6980 					    MAXPATHLEN);
6981 				} else {
6982 					(void) snprintf(dsname, MAXPATHLEN,
6983 					    "%s/%s", poolname, ++cp);
6984 				}
6985 				fnvlist_add_string(config, ZPOOL_CONFIG_BOOTFS,
6986 				    dsname);
6987 				kmem_free(dsname, MAXPATHLEN);
6988 			}
6989 			kmem_free(tmpname, MAXPATHLEN);
6990 		}
6991 
6992 		/*
6993 		 * Add the list of hot spares and level 2 cache devices.
6994 		 */
6995 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
6996 		spa_add_spares(spa, config);
6997 		spa_add_l2cache(spa, config);
6998 		spa_config_exit(spa, SCL_CONFIG, FTAG);
6999 	}
7000 
7001 	spa_unload(spa);
7002 	spa_deactivate(spa);
7003 	spa_remove(spa);
7004 	mutex_exit(&spa_namespace_lock);
7005 
7006 	return (config);
7007 }
7008 
7009 /*
7010  * Pool export/destroy
7011  *
7012  * The act of destroying or exporting a pool is very simple.  We make sure there
7013  * is no more pending I/O and any references to the pool are gone.  Then, we
7014  * update the pool state and sync all the labels to disk, removing the
7015  * configuration from the cache afterwards. If the 'hardforce' flag is set, then
7016  * we don't sync the labels or remove the configuration cache.
7017  */
7018 static int
7019 spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
7020     boolean_t force, boolean_t hardforce)
7021 {
7022 	int error = 0;
7023 	spa_t *spa;
7024 	hrtime_t export_start = gethrtime();
7025 
7026 	if (oldconfig)
7027 		*oldconfig = NULL;
7028 
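	/* Pools cannot be exported or destroyed when ZFS is read-only. */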
7029 	if (!(spa_mode_global & SPA_MODE_WRITE))
7030 		return (SET_ERROR(EROFS));
7031 
7032 	mutex_enter(&spa_namespace_lock);
7033 	if ((spa = spa_lookup(pool)) == NULL) {
7034 		mutex_exit(&spa_namespace_lock);
7035 		return (SET_ERROR(ENOENT));
7036 	}
7037 
7038 	if (spa->spa_is_exporting) {
7039 		/* the pool is being exported by another thread */
7040 		mutex_exit(&spa_namespace_lock);
7041 		return (SET_ERROR(ZFS_ERR_EXPORT_IN_PROGRESS));
7042 	}
7043 	spa->spa_is_exporting = B_TRUE;
7044 
7045 	/*
7046 	 * Put a hold on the pool, drop the namespace lock, stop async tasks
7047 	 * and see if we can export.
7048 	 */
7049 	spa_open_ref(spa, FTAG);
7050 	mutex_exit(&spa_namespace_lock);
7051 	spa_async_suspend(spa);
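	/*
	 * Remove the pool's zvol minor nodes and wait for any outstanding
	 * zvol tasks to complete before proceeding.
	 */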
7052 	if (spa->spa_zvol_taskq) {
7053 		zvol_remove_minors(spa, spa_name(spa), B_TRUE);
7054 		taskq_wait(spa->spa_zvol_taskq);
7055 	}
7056 	mutex_enter(&spa_namespace_lock);
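	/*
	 * Record this thread as the active exporter so that concurrent
	 * spa_lookup() callers wait until the export completes or fails.
	 */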
7057 	spa->spa_export_thread = curthread;
7058 	spa_close(spa, FTAG);
7059 
7060 	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
7061 		mutex_exit(&spa_namespace_lock);
7062 		goto export_spa;
7063 	}
7064 
7065 	/*
7066 	 * The pool will be in core if it's openable, in which case we can
7067 	 * modify its state.  Objsets may be open only because they're dirty,
7068 	 * so we have to force it to sync before checking spa_refcnt.
7069 	 */
7070 	if (spa->spa_sync_on) {
7071 		txg_wait_synced(spa->spa_dsl_pool, 0);
7072 		spa_evicting_os_wait(spa);
7073 	}
7074 
7075 	/*
7076 	 * A pool cannot be exported or destroyed if there are active
7077 	 * references.  If we are resetting a pool, allow references by
7078 	 * fault injection handlers.
7079 	 */
7080 	if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0)) {
7081 		error = SET_ERROR(EBUSY);
7082 		goto fail;
7083 	}
7084 
7085 	mutex_exit(&spa_namespace_lock);
7086 	/*
7087 	 * At this point we no longer hold the spa_namespace_lock and
7088 	 * there were no references on the spa. Future spa_lookups will
7089 	 * there were no references on the spa. Future spa_lookup() calls
7090 	 * will notice spa->spa_export_thread and wait until we signal
7091 	 * that we are finished.
7092 
7093 	if (spa->spa_sync_on) {
7094 		vdev_t *rvd = spa->spa_root_vdev;
7095 		/*
7096 		 * A pool cannot be exported if it has an active shared spare.
7097 		 * This prevents other pools from stealing the active spare
7098 		 * from an exported pool. The user may still force the export
7099 		 * if desired.
7100 		 */
7101 		if (!force && new_state == POOL_STATE_EXPORTED &&
7102 		    spa_has_active_shared_spare(spa)) {
7103 			error = SET_ERROR(EXDEV);
7104 			mutex_enter(&spa_namespace_lock);
7105 			goto fail;
7106 		}
7107 
7108 		/*
7109 		 * We're about to export or destroy this pool. Make sure
7110 		 * we stop all initialization and trim activity here before
7111 		 * we set the spa_final_txg. This will ensure that all
7112 		 * dirty data resulting from the initialization is
7113 		 * committed to disk before we unload the pool.
7114 		 */
7115 		vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE);
7116 		vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE);
7117 		vdev_autotrim_stop_all(spa);
7118 		vdev_rebuild_stop_all(spa);
7119 		l2arc_spa_rebuild_stop(spa);
7120 
7121 		/*
7122 		 * We want this to be reflected on every label,
7123 		 * so mark them all dirty.  spa_unload() will do the
7124 		 * final sync that pushes these changes out.
7125 		 */
7126 		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
7127 			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
7128 			spa->spa_state = new_state;
7129 			vdev_config_dirty(rvd);
7130 			spa_config_exit(spa, SCL_ALL, FTAG);
7131 		}
7132 
7133 		/*
7134 		 * If the log space map feature is enabled and the pool is
7135 		 * getting exported (but not destroyed), we want to spend some
7136 		 * time flushing as many metaslabs as we can in an attempt to
7137 		 * destroy log space maps and save import time. This has to be
7138 		 * done before we set the spa_final_txg, otherwise
7139 		 * spa_sync() -> spa_flush_metaslabs() may dirty the final TXGs.
7140 		 * spa_should_flush_logs_on_unload() should be called after
7141 		 * spa_state has been set to the new_state.
7142 		 */
7143 		if (spa_should_flush_logs_on_unload(spa))
7144 			spa_unload_log_sm_flush_all(spa);
7145 
7146 		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
7147 			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
7148 			spa->spa_final_txg = spa_last_synced_txg(spa) +
7149 			    TXG_DEFER_SIZE + 1;
7150 			spa_config_exit(spa, SCL_ALL, FTAG);
7151 		}
7152 	}
7153 
7154 export_spa:
7155 	spa_export_os(spa);
7156 
7157 	if (new_state == POOL_STATE_DESTROYED)
7158 		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY);
7159 	else if (new_state == POOL_STATE_EXPORTED)
7160 		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_EXPORT);
7161 
7162 	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
7163 		spa_unload(spa);
7164 		spa_deactivate(spa);
7165 	}
7166 
7167 	if (oldconfig && spa->spa_config)
7168 		*oldconfig = fnvlist_dup(spa->spa_config);
7169 
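	/*
	 * Give fault injection a chance to delay the export, so tests
	 * can exercise slow-export code paths.
	 */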
7170 	if (new_state == POOL_STATE_EXPORTED)
7171 		zio_handle_export_delay(spa, gethrtime() - export_start);
7172 
7173 	/*
7174 	 * Take the namespace lock for the actual spa_t removal
7175 	 */
7176 	mutex_enter(&spa_namespace_lock);
7177 	if (new_state != POOL_STATE_UNINITIALIZED) {
7178 		if (!hardforce)
7179 			spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE);
7180 		spa_remove(spa);
7181 	} else {
7182 		/*
7183 		 * If spa_remove() is not called for this spa_t and
7184 		 * there is any possibility that it can be reused,
7185 		 * we make sure to reset the exporting flag.
7186 		 */
7187 		spa->spa_is_exporting = B_FALSE;
7188 		spa->spa_export_thread = NULL;
7189 	}
7190 
7191 	/*
7192 	 * Wake up any waiters in spa_lookup()
7193 	 */
7194 	cv_broadcast(&spa_namespace_cv);
7195 	mutex_exit(&spa_namespace_lock);
7196 	return (0);
7197 
7198 fail:
7199 	spa->spa_is_exporting = B_FALSE;
7200 	spa->spa_export_thread = NULL;
7201 
7202 	spa_async_resume(spa);
7203 	/*
7204 	 * Wake up any waiters in spa_lookup()
7205 	 */
7206 	cv_broadcast(&spa_namespace_cv);
7207 	mutex_exit(&spa_namespace_lock);
7208 	return (error);
7209 }
7210 
7211 /*
7212  * Destroy a storage pool.
7213  */
7214 int
7215 spa_destroy(const char *pool)
7216 {
7217 	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
7218 	    B_FALSE, B_FALSE));
7219 }
7220 
7221 /*
7222  * Export a storage pool.
7223  */
7224 int
7225 spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force,
7226     boolean_t hardforce)
7227 {
7228 	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
7229 	    force, hardforce));
7230 }
7231 
7232 /*
7233  * Similar to spa_export(), this unloads the spa_t without actually removing it
7234  * from the namespace in any way.
7235  */
7236 int
7237 spa_reset(const char *pool)
7238 {
7239 	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
7240 	    B_FALSE, B_FALSE));
7241 }
7242 
7243 /*
7244  * ==========================================================================
7245  * Device manipulation
7246  * ==========================================================================
7247  */
7248 
7249 /*
7250  * This is called as a synctask to increment the draid feature flag
7251  */
7252 static void
7253 spa_draid_feature_incr(void *arg, dmu_tx_t *tx)
7254 {
7255 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
7256 	int draid = (int)(uintptr_t)arg;
7257 
7258 	for (int c = 0; c < draid; c++)
7259 		spa_feature_incr(spa, SPA_FEATURE_DRAID, tx);
7260 }
7261 
7262 /*
7263  * Add a device to a storage pool.
7264  */
7265 int
7266 spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t check_ashift)
7267 {
7268 	uint64_t txg, ndraid = 0;
7269 	int error;
7270 	vdev_t *rvd = spa->spa_root_vdev;
7271 	vdev_t *vd, *tvd;
7272 	nvlist_t **spares, **l2cache;
7273 	uint_t nspares, nl2cache;
7274 
7275 	ASSERT(spa_writeable(spa));
7276 
7277 	txg = spa_vdev_enter(spa);
7278 
7279 	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
7280 	    VDEV_ALLOC_ADD)) != 0)
7281 		return (spa_vdev_exit(spa, NULL, txg, error));
7282 
7283 	spa->spa_pending_vdev = vd;	/* spa_vdev_exit() will clear this */
7284 
7285 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
7286 	    &nspares) != 0)
7287 		nspares = 0;
7288 
7289 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
7290 	    &nl2cache) != 0)
7291 		nl2cache = 0;
7292 
7293 	if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
7294 		return (spa_vdev_exit(spa, vd, txg, EINVAL));
7295 
7296 	if (vd->vdev_children != 0 &&
7297 	    (error = vdev_create(vd, txg, B_FALSE)) != 0) {
7298 		return (spa_vdev_exit(spa, vd, txg, error));
7299 	}
7300 
7301 	/*
7302 	 * The virtual dRAID spares must be added after the vdev tree is created
7303 	 * and the vdev guids are generated.  The guid of their associated
7304 	 * dRAID is stored in the config and used when opening the spare.
7305 	 */
7306 	if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid,
7307 	    rvd->vdev_children)) == 0) {
7308 		if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot,
7309 		    ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0)
7310 			nspares = 0;
7311 	} else {
7312 		return (spa_vdev_exit(spa, vd, txg, error));
7313 	}
7314 
7315 	/*
7316 	 * We must validate the spares and l2cache devices after checking the
7317 	 * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
7318 	 */
7319 	if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
7320 		return (spa_vdev_exit(spa, vd, txg, error));
7321 
7322 	/*
7323 	 * If we are in the middle of a device removal, we can only add
7324 	 * devices that match the existing devices in the pool.
7325 	 * If we are in the middle of a removal, or have some indirect
7326 	 * vdevs, we cannot add raidz or dRAID top-level vdevs.
7327 	 */
7328 	if (spa->spa_vdev_removal != NULL ||
7329 	    spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
7330 		for (int c = 0; c < vd->vdev_children; c++) {
7331 			tvd = vd->vdev_child[c];
7332 			if (spa->spa_vdev_removal != NULL &&
7333 			    tvd->vdev_ashift != spa->spa_max_ashift) {
7334 				return (spa_vdev_exit(spa, vd, txg, EINVAL));
7335 			}
7336 			/* Fail if top level vdev is raidz or a dRAID */
7337 			if (vdev_get_nparity(tvd) != 0)
7338 				return (spa_vdev_exit(spa, vd, txg, EINVAL));
7339 
7340 			/*
7341 			 * The top-level mirror must consist of
7342 			 * leaf vdevs only.
7343 			 */
7344 			if (tvd->vdev_ops == &vdev_mirror_ops) {
7345 				for (uint64_t cid = 0;
7346 				    cid < tvd->vdev_children; cid++) {
7347 					vdev_t *cvd = tvd->vdev_child[cid];
7348 					if (!cvd->vdev_ops->vdev_op_leaf) {
7349 						return (spa_vdev_exit(spa, vd,
7350 						    txg, EINVAL));
7351 					}
7352 				}
7353 			}
7354 		}
7355 	}
7356 
7357 	if (check_ashift && spa->spa_max_ashift == spa->spa_min_ashift) {
7358 		for (int c = 0; c < vd->vdev_children; c++) {
7359 			tvd = vd->vdev_child[c];
7360 			if (tvd->vdev_ashift != spa->spa_max_ashift) {
7361 				return (spa_vdev_exit(spa, vd, txg,
7362 				    ZFS_ERR_ASHIFT_MISMATCH));
7363 			}
7364 		}
7365 	}
7366 
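	/*
	 * Move each new top-level vdev from the temporary parse tree onto
	 * the pool's root vdev and mark it dirty so the addition is synced.
	 */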
7367 	for (int c = 0; c < vd->vdev_children; c++) {
7368 		tvd = vd->vdev_child[c];
7369 		vdev_remove_child(vd, tvd);
7370 		tvd->vdev_id = rvd->vdev_children;
7371 		vdev_add_child(rvd, tvd);
7372 		vdev_config_dirty(tvd);
7373 	}
7374 
7375 	if (nspares != 0) {
7376 		spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
7377 		    ZPOOL_CONFIG_SPARES);
7378 		spa_load_spares(spa);
7379 		spa->spa_spares.sav_sync = B_TRUE;
7380 	}
7381 
7382 	if (nl2cache != 0) {
7383 		spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
7384 		    ZPOOL_CONFIG_L2CACHE);
7385 		spa_load_l2cache(spa);
7386 		spa->spa_l2cache.sav_sync = B_TRUE;
7387 	}
7388 
7389 	/*
7390 	 * We can't increment a feature while holding spa_vdev so we
7391 	 * have to do it in a synctask.
7392 	 */
7393 	if (ndraid != 0) {
7394 		dmu_tx_t *tx;
7395 
7396 		tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
7397 		dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr,
7398 		    (void *)(uintptr_t)ndraid, tx);
7399 		dmu_tx_commit(tx);
7400 	}
7401 
7402 	/*
7403 	 * We have to be careful when adding new vdevs to an existing pool.
7404 	 * If other threads start allocating from these vdevs before we
7405 	 * sync the config cache, and we lose power, then upon reboot we may
7406 	 * fail to open the pool because there are DVAs that the config cache
7407 	 * can't translate.  Therefore, we first add the vdevs without
7408 	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
7409 	 * and then let spa_config_update() initialize the new metaslabs.
7410 	 *
7411 	 * spa_load() checks for added-but-not-initialized vdevs, so that
7412 	 * if we lose power at any point in this sequence, the remaining
7413 	 * steps will be completed the next time we load the pool.
7414 	 */
7415 	(void) spa_vdev_exit(spa, vd, txg, 0);
7416 
7417 	mutex_enter(&spa_namespace_lock);
7418 	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
7419 	spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD);
7420 	mutex_exit(&spa_namespace_lock);
7421 
7422 	return (0);
7423 }
7424 
7425 /*
7426  * Given a vdev to be replaced and its parent, check for a possible
7427  * "double spare" condition if a vdev is to be replaced by a spare.  When this
7428  * happens, you can get two spares assigned to one failed vdev.
7429  *
7430  * To trigger a double spare condition:
7431  *
7432  * 1. disk1 fails
7433  * 2. 1st spare is kicked in for disk1 and it resilvers
7434  * 3. Someone replaces disk1 with a new blank disk
7435  * 4. New blank disk starts resilvering
7436  * 5. While resilvering, new blank disk has IO errors and faults
7437  * 6. 2nd spare is kicked in for new blank disk
7438  * 7. At this point two spares are kicked in for the original disk1.
7439  *
7440  * It looks like this:
7441  *
7442  * NAME                                            STATE     READ WRITE CKSUM
7443  * tank2                                           DEGRADED     0     0     0
7444  *   draid2:6d:10c:2s-0                            DEGRADED     0     0     0
7445  *     scsi-0QEMU_QEMU_HARDDISK_d1                 ONLINE       0     0     0
7446  *     scsi-0QEMU_QEMU_HARDDISK_d2                 ONLINE       0     0     0
7447  *     scsi-0QEMU_QEMU_HARDDISK_d3                 ONLINE       0     0     0
7448  *     scsi-0QEMU_QEMU_HARDDISK_d4                 ONLINE       0     0     0
7449  *     scsi-0QEMU_QEMU_HARDDISK_d5                 ONLINE       0     0     0
7450  *     scsi-0QEMU_QEMU_HARDDISK_d6                 ONLINE       0     0     0
7451  *     scsi-0QEMU_QEMU_HARDDISK_d7                 ONLINE       0     0     0
7452  *     scsi-0QEMU_QEMU_HARDDISK_d8                 ONLINE       0     0     0
7453  *     scsi-0QEMU_QEMU_HARDDISK_d9                 ONLINE       0     0     0
7454  *     spare-9                                     DEGRADED     0     0     0
7455  *       replacing-0                               DEGRADED     0    93     0
7456  *         scsi-0QEMU_QEMU_HARDDISK_d10-part1/old  UNAVAIL      0     0     0
7457  *         spare-1                                 DEGRADED     0     0     0
7458  *           scsi-0QEMU_QEMU_HARDDISK_d10          REMOVED      0     0     0
7459  *           draid2-0-0                            ONLINE       0     0     0
7460  *       draid2-0-1                                ONLINE       0     0     0
7461  * spares
7462  *   draid2-0-0                                    INUSE     currently in use
7463  *   draid2-0-1                                    INUSE     currently in use
7464  *
7465  * ARGS:
7466  *
7467  * newvd:  New spare disk
7468  * pvd:    Parent vdev_t the spare should attach to
7469  *
7470  * This function returns B_TRUE if adding the new vdev would create a double
7471  * spare condition, B_FALSE otherwise.
7472  */
7473 static boolean_t
7474 spa_vdev_new_spare_would_cause_double_spares(vdev_t *newvd, vdev_t *pvd)
7475 {
7476 	vdev_t *ppvd;
7477 
7478 	ppvd = pvd->vdev_parent;
7479 	if (ppvd == NULL)
7480 		return (B_FALSE);
7481 
7482 	/*
7483 	 * To determine if this configuration would cause a double spare, we
7484 	 * look at the vdev_ops of the parent vdev, and of the parent's parent
7485 	 * vdev.  We also look at vdev_isspare on the new disk.  A double spare
7486 	 * condition looks like this:
7487 	 *
7488 	 * 1. parent of parent's op is a spare or draid spare
7489 	 * 2. parent's op is replacing
7490 	 * 3. new disk is a spare
7491 	 */
7492 	if ((ppvd->vdev_ops == &vdev_spare_ops) ||
7493 	    (ppvd->vdev_ops == &vdev_draid_spare_ops))
7494 		if (pvd->vdev_ops == &vdev_replacing_ops)
7495 			if (newvd->vdev_isspare)
7496 				return (B_TRUE);
7497 
7498 	return (B_FALSE);
7499 }
7500 
7501 /*
7502  * Attach a device to a vdev specified by its guid.  The vdev type can be
7503  * a mirror, a raidz, or a leaf device that is also a top-level (e.g. a
7504  * single device). When the vdev is a single device, a mirror vdev will be
7505  * automatically inserted.
7506  *
7507  * If 'replacing' is specified, the new device is intended to replace the
7508  * existing device; in this case the two devices are made into their own
7509  * mirror using the 'replacing' vdev, which is functionally identical to
7510  * the mirror vdev (it actually reuses all the same ops) but has a few
7511  * extra rules: you can't attach to it after it's been created, and upon
7512  * completion of resilvering, the first disk (the one being replaced)
7513  * is automatically detached.
7514  *
7515  * If 'rebuild' is specified, then sequential reconstruction (a.k.a. rebuild)
7516  * should be performed instead of traditional healing reconstruction.  From
7517  * an administrator's perspective these are both resilver operations.
7518  */
7519 int
7520 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
7521     int rebuild)
7522 {
7523 	uint64_t txg, dtl_max_txg;
7524 	vdev_t *rvd = spa->spa_root_vdev;
7525 	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
7526 	vdev_ops_t *pvops;
7527 	char *oldvdpath, *newvdpath;
7528 	int newvd_isspare = B_FALSE;
7529 	int error;
7530 
7531 	ASSERT(spa_writeable(spa));
7532 
7533 	txg = spa_vdev_enter(spa);
7534 
7535 	oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
7536 
7537 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
7538 	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
7539 		error = (spa_has_checkpoint(spa)) ?
7540 		    ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
7541 		return (spa_vdev_exit(spa, NULL, txg, error));
7542 	}
7543 
7544 	if (rebuild) {
7545 		if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD))
7546 			return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
7547 
7548 		if (dsl_scan_resilvering(spa_get_dsl(spa)) ||
7549 		    dsl_scan_resilver_scheduled(spa_get_dsl(spa))) {
7550 			return (spa_vdev_exit(spa, NULL, txg,
7551 			    ZFS_ERR_RESILVER_IN_PROGRESS));
7552 		}
7553 	} else {
7554 		if (vdev_rebuild_active(rvd))
7555 			return (spa_vdev_exit(spa, NULL, txg,
7556 			    ZFS_ERR_REBUILD_IN_PROGRESS));
7557 	}
7558 
7559 	if (spa->spa_vdev_removal != NULL) {
7560 		return (spa_vdev_exit(spa, NULL, txg,
7561 		    ZFS_ERR_DEVRM_IN_PROGRESS));
7562 	}
7563 
7564 	if (oldvd == NULL)
7565 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
7566 
7567 	boolean_t raidz = oldvd->vdev_ops == &vdev_raidz_ops;
7568 
7569 	if (raidz) {
7570 		if (!spa_feature_is_enabled(spa, SPA_FEATURE_RAIDZ_EXPANSION))
7571 			return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
7572 
7573 		/*
7574 		 * Can't expand a raidz while prior expand is in progress.
7575 		 */
7576 		if (spa->spa_raidz_expand != NULL) {
7577 			return (spa_vdev_exit(spa, NULL, txg,
7578 			    ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS));
7579 		}
7580 	} else if (!oldvd->vdev_ops->vdev_op_leaf) {
7581 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
7582 	}
7583 
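	/*
	 * For raidz expansion the new device is attached directly under the
	 * raidz vdev; otherwise the attach point is oldvd's parent (a new
	 * mirror/replacing/spare parent may be inserted below if needed).
	 */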
7584 	if (raidz)
7585 		pvd = oldvd;
7586 	else
7587 		pvd = oldvd->vdev_parent;
7588 
7589 	if (spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
7590 	    VDEV_ALLOC_ATTACH) != 0)
7591 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
7592 
7593 	if (newrootvd->vdev_children != 1)
7594 		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
7595 
7596 	newvd = newrootvd->vdev_child[0];
7597 
7598 	if (!newvd->vdev_ops->vdev_op_leaf)
7599 		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
7600 
7601 	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
7602 		return (spa_vdev_exit(spa, newrootvd, txg, error));
7603 
7604 	/*
7605 	 * Log, dedup, and special vdevs should not be replaced by spares.
7606 	 */
7607 	if ((oldvd->vdev_top->vdev_alloc_bias != VDEV_BIAS_NONE ||
7608 	    oldvd->vdev_top->vdev_islog) && newvd->vdev_isspare) {
7609 		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
7610 	}
7611 
7612 	/*
7613 	 * A dRAID spare can only replace a child of its parent dRAID vdev.
7614 	 */
7615 	if (newvd->vdev_ops == &vdev_draid_spare_ops &&
7616 	    oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) {
7617 		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
7618 	}
7619 
7620 	if (rebuild) {
7621 		/*
7622 		 * For rebuilds, the top vdev must support reconstruction
7623 		 * using only space maps.  This means the only allowable
7624 		 * vdev types are the root vdev, a mirror, or dRAID.
7625 		 */
7626 		tvd = pvd;
7627 		if (pvd->vdev_top != NULL)
7628 			tvd = pvd->vdev_top;
7629 
7630 		if (tvd->vdev_ops != &vdev_mirror_ops &&
7631 		    tvd->vdev_ops != &vdev_root_ops &&
7632 		    tvd->vdev_ops != &vdev_draid_ops) {
7633 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
7634 		}
7635 	}
7636 
7637 	if (!replacing) {
7638 		/*
7639 		 * For attach, the only allowable parent is a mirror or
7640 		 * the root vdev. A raidz vdev can be attached to, but
7641 		 * you cannot attach to a raidz child.
7642 		 */
7643 		if (pvd->vdev_ops != &vdev_mirror_ops &&
7644 		    pvd->vdev_ops != &vdev_root_ops &&
7645 		    !raidz)
7646 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
7647 
7648 		pvops = &vdev_mirror_ops;
7649 	} else {
7650 		/*
7651 		 * Active hot spares can only be replaced by inactive hot
7652 		 * spares.
7653 		 */
7654 		if (pvd->vdev_ops == &vdev_spare_ops &&
7655 		    oldvd->vdev_isspare &&
7656 		    !spa_has_spare(spa, newvd->vdev_guid))
7657 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
7658 
7659 		/*
7660 		 * If the source is a hot spare, and the parent isn't already a
7661 		 * spare, then we want to create a new hot spare.  Otherwise, we
7662 		 * want to create a replacing vdev.  The user is not allowed to
7663 		 * attach to a spared vdev child unless the 'isspare' state is
7664 		 * the same (spare replaces spare, non-spare replaces
7665 		 * non-spare).
7666 		 */
7667 		if (pvd->vdev_ops == &vdev_replacing_ops &&
7668 		    spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
7669 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
7670 		} else if (pvd->vdev_ops == &vdev_spare_ops &&
7671 		    newvd->vdev_isspare != oldvd->vdev_isspare) {
7672 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
7673 		}
7674 
7675 		if (spa_vdev_new_spare_would_cause_double_spares(newvd, pvd)) {
7676 			vdev_dbgmsg(newvd,
7677 			    "disk would create double spares, ignore.");
7678 			return (spa_vdev_exit(spa, newrootvd, txg, EEXIST));
7679 		}
7680 
7681 		if (newvd->vdev_isspare)
7682 			pvops = &vdev_spare_ops;
7683 		else
7684 			pvops = &vdev_replacing_ops;
7685 	}
7686 
7687 	/*
7688 	 * Make sure the new device is big enough.
7689 	 */
7690 	vdev_t *min_vdev = raidz ? oldvd->vdev_child[0] : oldvd;
7691 	if (newvd->vdev_asize < vdev_get_min_asize(min_vdev))
7692 		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
7693 
7694 	/*
7695 	 * The new device cannot have a higher alignment requirement
7696 	 * than the top-level vdev.
7697 	 */
7698 	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) {
7699 		return (spa_vdev_exit(spa, newrootvd, txg,
7700 		    ZFS_ERR_ASHIFT_MISMATCH));
7701 	}
7702 
7703 	/*
7704 	 * RAIDZ-expansion-specific checks.
7705 	 */
7706 	if (raidz) {
7707 		if (vdev_raidz_attach_check(newvd) != 0)
7708 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
7709 
7710 		/*
7711 		 * Fail early if a child is not healthy or being replaced
7712 		 */
7713 		for (int i = 0; i < oldvd->vdev_children; i++) {
7714 			if (vdev_is_dead(oldvd->vdev_child[i]) ||
7715 			    !oldvd->vdev_child[i]->vdev_ops->vdev_op_leaf) {
7716 				return (spa_vdev_exit(spa, newrootvd, txg,
7717 				    ENXIO));
7718 			}
7719 			/* Also fail if reserved boot area is in-use */
7720 			if (vdev_check_boot_reserve(spa, oldvd->vdev_child[i])
7721 			    != 0) {
7722 				return (spa_vdev_exit(spa, newrootvd, txg,
7723 				    EADDRINUSE));
7724 			}
7725 		}
7726 	}
7727 
7728 	if (raidz) {
7729 		/*
7730 		 * Note: oldvdpath is freed by spa_strfree(), but strings
7731 		 * returned by kmem_asprintf() must be freed by kmem_strfree(),
7732 		 * so we have to move it to a spa_strdup()-ed string.
7733 		 */
7734 		char *tmp = kmem_asprintf("raidz%u-%u",
7735 		    (uint_t)vdev_get_nparity(oldvd), (uint_t)oldvd->vdev_id);
7736 		oldvdpath = spa_strdup(tmp);
7737 		kmem_strfree(tmp);
7738 	} else {
7739 		oldvdpath = spa_strdup(oldvd->vdev_path);
7740 	}
7741 	newvdpath = spa_strdup(newvd->vdev_path);
7742 
7743 	/*
7744 	 * If this is an in-place replacement, update oldvd's path and devid
7745 	 * to make it distinguishable from newvd, and unopenable from now on.
7746 	 */
7747 	if (strcmp(oldvdpath, newvdpath) == 0) {
7748 		spa_strfree(oldvd->vdev_path);
7749 		oldvd->vdev_path = kmem_alloc(strlen(newvdpath) + 5,
7750 		    KM_SLEEP);
7751 		(void) sprintf(oldvd->vdev_path, "%s/old",
7752 		    newvdpath);
7753 		if (oldvd->vdev_devid != NULL) {
7754 			spa_strfree(oldvd->vdev_devid);
7755 			oldvd->vdev_devid = NULL;
7756 		}
7757 		spa_strfree(oldvdpath);
7758 		oldvdpath = spa_strdup(oldvd->vdev_path);
7759 	}
7760 
7761 	/*
7762 	 * If the parent is not a mirror, or if we're replacing, insert the new
7763 	 * mirror/replacing/spare vdev above oldvd.
7764 	 */
7765 	if (!raidz && pvd->vdev_ops != pvops) {
7766 		pvd = vdev_add_parent(oldvd, pvops);
7767 		ASSERT(pvd->vdev_ops == pvops);
7768 		ASSERT(oldvd->vdev_parent == pvd);
7769 	}
7770 
7771 	ASSERT(pvd->vdev_top->vdev_parent == rvd);
7772 
7773 	/*
7774 	 * Extract the new device from its root and add it to pvd.
7775 	 */
7776 	vdev_remove_child(newrootvd, newvd);
7777 	newvd->vdev_id = pvd->vdev_children;
7778 	newvd->vdev_crtxg = oldvd->vdev_crtxg;
7779 	vdev_add_child(pvd, newvd);
7780 
7781 	/*
7782 	 * Reevaluate the parent vdev state.
7783 	 */
7784 	vdev_propagate_state(pvd);
7785 
7786 	tvd = newvd->vdev_top;
7787 	ASSERT(pvd->vdev_top == tvd);
7788 	ASSERT(tvd->vdev_parent == rvd);
7789 
7790 	vdev_config_dirty(tvd);
7791 
7792 	/*
7793 	 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
7794 	 * for any dmu_sync-ed blocks.  It will propagate upward when
7795 	 * spa_vdev_exit() calls vdev_dtl_reassess().
7796 	 */
7797 	dtl_max_txg = txg + TXG_CONCURRENT_STATES;
7798 
7799 	if (raidz) {
7800 		/*
7801 		 * Wait for the youngest allocations and frees to sync,
7802 		 * and then wait for the deferral of those frees to finish.
7803 		 */
7804 		spa_vdev_config_exit(spa, NULL,
7805 		    txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
7806 
7807 		vdev_initialize_stop_all(tvd, VDEV_INITIALIZE_ACTIVE);
7808 		vdev_trim_stop_all(tvd, VDEV_TRIM_ACTIVE);
7809 		vdev_autotrim_stop_wait(tvd);
7810 
7811 		dtl_max_txg = spa_vdev_config_enter(spa);
7812 
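		/*
		 * Flag the top-level vdev as expanding; initialize and TRIM
		 * requests check this flag and refuse to start until the
		 * expansion has completed.
		 */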
7813 		tvd->vdev_rz_expanding = B_TRUE;
7814 
7815 		vdev_dirty_leaves(tvd, VDD_DTL, dtl_max_txg);
7816 		vdev_config_dirty(tvd);
7817 
7818 		dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool,
7819 		    dtl_max_txg);
7820 		dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_raidz_attach_sync,
7821 		    newvd, tx);
7822 		dmu_tx_commit(tx);
7823 	} else {
7824 		vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
7825 		    dtl_max_txg - TXG_INITIAL);
7826 
7827 		if (newvd->vdev_isspare) {
7828 			spa_spare_activate(newvd);
7829 			spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE);
7830 		}
7831 
7832 		newvd_isspare = newvd->vdev_isspare;
7833 
7834 		/*
7835 		 * Mark newvd's DTL dirty in this txg.
7836 		 */
7837 		vdev_dirty(tvd, VDD_DTL, newvd, txg);
7838 
7839 		/*
7840 		 * Schedule the resilver or rebuild to restart in the future.
7841 		 * We do this to ensure that dmu_sync-ed blocks have been
7842 		 * stitched into the respective datasets.
7843 		 */
7844 		if (rebuild) {
7845 			newvd->vdev_rebuild_txg = txg;
7846 
7847 			vdev_rebuild(tvd);
7848 		} else {
7849 			newvd->vdev_resilver_txg = txg;
7850 
7851 			if (dsl_scan_resilvering(spa_get_dsl(spa)) &&
7852 			    spa_feature_is_enabled(spa,
7853 			    SPA_FEATURE_RESILVER_DEFER)) {
7854 				vdev_defer_resilver(newvd);
7855 			} else {
7856 				dsl_scan_restart_resilver(spa->spa_dsl_pool,
7857 				    dtl_max_txg);
7858 			}
7859 		}
7860 	}
7861 
7862 	if (spa->spa_bootfs)
7863 		spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH);
7864 
7865 	spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH);
7866 
7867 	/*
7868 	 * Commit the config
7869 	 */
7870 	(void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
7871 
7872 	spa_history_log_internal(spa, "vdev attach", NULL,
7873 	    "%s vdev=%s %s vdev=%s",
7874 	    replacing && newvd_isspare ? "spare in" :
7875 	    replacing ? "replace" : "attach", newvdpath,
7876 	    replacing ? "for" : "to", oldvdpath);
7877 
7878 	spa_strfree(oldvdpath);
7879 	spa_strfree(newvdpath);
7880 
7881 	return (0);
7882 }
7883 
7884 /*
7885  * Detach a device from a mirror or replacing vdev.
7886  *
7887  * If 'replace_done' is specified, only detach if the parent
7888  * is a replacing or a spare vdev.
7889  */
7890 int
7891 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
7892 {
7893 	uint64_t txg;
7894 	int error;
7895 	vdev_t *rvd __maybe_unused = spa->spa_root_vdev;
7896 	vdev_t *vd, *pvd, *cvd, *tvd;
7897 	boolean_t unspare = B_FALSE;
7898 	uint64_t unspare_guid = 0;
7899 	char *vdpath;
7900 
7901 	ASSERT(spa_writeable(spa));
7902 
7903 	txg = spa_vdev_detach_enter(spa, guid);
7904 
7905 	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
7906 
7907 	/*
7908 	 * Besides being called directly from userland through the
7909 	 * ioctl interface, spa_vdev_detach() can potentially be called
7910 	 * at the end of spa_vdev_resilver_done().
7911 	 *
7912 	 * In the regular case, when we have a checkpoint this shouldn't
7913 	 * happen, as we never empty the DTLs of a vdev during the scrub
7914 	 * [see comment in dsl_scan_done()]. Thus spa_vdev_resilver_done()
7915 	 * should never get here when we have a checkpoint.
7916 	 *
7917 	 * That said, even if we checkpoint the pool exactly as
7918 	 * spa_vdev_resilver_done() calls this function, everything
7919 	 * should be fine as the resilver will return right away.
7920 	 */
7921 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
7922 	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
7923 		error = (spa_has_checkpoint(spa)) ?
7924 		    ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
7925 		return (spa_vdev_exit(spa, NULL, txg, error));
7926 	}
7927 
7928 	if (vd == NULL)
7929 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
7930 
7931 	if (!vd->vdev_ops->vdev_op_leaf)
7932 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
7933 
7934 	pvd = vd->vdev_parent;
7935 
7936 	/*
7937 	 * If the parent/child relationship is not as expected, don't do it.
7938 	 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
7939 	 * vdev that's replacing B with C.  The user's intent in replacing
7940 	 * is to go from M(A,B) to M(A,C).  If the user decides to cancel
7941 	 * the replace by detaching C, the expected behavior is to end up
7942 	 * M(A,B).  But suppose that right after deciding to detach C,
7943 	 * the replacement of B completes.  We would have M(A,C), and then
7944 	 * ask to detach C, which would leave us with just A -- not what
7945 	 * the user wanted.  To prevent this, we make sure that the
7946 	 * parent/child relationship hasn't changed -- in this example,
7947 	 * that C's parent is still the replacing vdev R.
7948 	 */
7949 	if (pvd->vdev_guid != pguid && pguid != 0)
7950 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
7951 
7952 	/*
7953 	 * If 'replace_done' is set, the parent must be 'replacing' or 'spare'.
7954 	 */
7955 	if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
7956 	    pvd->vdev_ops != &vdev_spare_ops)
7957 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
7958 
7959 	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
7960 	    spa_version(spa) >= SPA_VERSION_SPARES);
7961 
7962 	/*
7963 	 * Only mirror, replacing, and spare vdevs support detach.
7964 	 */
7965 	if (pvd->vdev_ops != &vdev_replacing_ops &&
7966 	    pvd->vdev_ops != &vdev_mirror_ops &&
7967 	    pvd->vdev_ops != &vdev_spare_ops)
7968 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
7969 
7970 	/*
7971 	 * If this device has the only valid copy of some data,
7972 	 * we cannot safely detach it.
7973 	 */
7974 	if (vdev_dtl_required(vd))
7975 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
7976 
7977 	ASSERT(pvd->vdev_children >= 2);
7978 
7979 	/*
7980 	 * If we are detaching the second disk from a replacing vdev, then
7981 	 * check to see if we changed the original vdev's path to have "/old"
7982 	 * at the end in spa_vdev_attach().  If so, undo that change now.
7983 	 */
7984 	if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
7985 	    vd->vdev_path != NULL) {
7986 		size_t len = strlen(vd->vdev_path);
7987 
7988 		for (int c = 0; c < pvd->vdev_children; c++) {
7989 			cvd = pvd->vdev_child[c];
7990 
7991 			if (cvd == vd || cvd->vdev_path == NULL)
7992 				continue;
7993 
7994 			if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
7995 			    strcmp(cvd->vdev_path + len, "/old") == 0) {
7996 				spa_strfree(cvd->vdev_path);
7997 				cvd->vdev_path = spa_strdup(vd->vdev_path);
7998 				break;
7999 			}
8000 		}
8001 	}
8002 
8003 	/*
8004 	 * If we are detaching the original disk from a normal spare, then it
8005 	 * implies that the spare should become a real disk, and be removed
8006 	 * from the active spare list for the pool.  dRAID spares on the
8007 	 * other hand are coupled to the pool and thus should never be removed
8008 	 * from the spares list.
8009 	 */
8010 	if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) {
8011 		vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1];
8012 
8013 		if (last_cvd->vdev_isspare &&
8014 		    last_cvd->vdev_ops != &vdev_draid_spare_ops) {
8015 			unspare = B_TRUE;
8016 		}
8017 	}
8018 
8019 	/*
8020 	 * Erase the disk labels so the disk can be used for other things.
8021 	 * This must be done after all other error cases are handled,
8022 	 * but before we disembowel vd (so we can still do I/O to it).
8023 	 * But if we can't do it, don't treat the error as fatal --
8024 	 * it may be that the unwritability of the disk is the reason
8025 	 * it's being detached!
8026 	 */
8027 	(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
8028 
8029 	/*
8030 	 * Remove vd from its parent and compact the parent's children.
8031 	 */
8032 	vdev_remove_child(pvd, vd);
8033 	vdev_compact_children(pvd);
8034 
8035 	/*
8036 	 * Remember one of the remaining children so we can get tvd below.
8037 	 */
8038 	cvd = pvd->vdev_child[pvd->vdev_children - 1];
8039 
8040 	/*
8041 	 * If we need to remove the remaining child from the list of hot spares,
8042 	 * do it now, marking the vdev as no longer a spare in the process.
8043 	 * We must do this before vdev_remove_parent(), because that can
8044 	 * change the GUID if it creates a new toplevel GUID.  For a similar
8045 	 * reason, we must remove the spare now, in the same txg as the detach;
8046 	 * otherwise someone could attach a new sibling, change the GUID, and
8047 	 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
8048 	 */
8049 	if (unspare) {
8050 		ASSERT(cvd->vdev_isspare);
8051 		spa_spare_remove(cvd);
8052 		unspare_guid = cvd->vdev_guid;
8053 		(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
8054 		cvd->vdev_unspare = B_TRUE;
8055 	}
8056 
8057 	/*
8058 	 * If the parent mirror/replacing vdev only has one child,
8059 	 * the parent is no longer needed.  Remove it from the tree.
8060 	 */
8061 	if (pvd->vdev_children == 1) {
8062 		if (pvd->vdev_ops == &vdev_spare_ops)
8063 			cvd->vdev_unspare = B_FALSE;
8064 		vdev_remove_parent(cvd);
8065 	}
8066 
8067 	/*
8068 	 * We don't set tvd until now because the parent we just removed
8069 	 * may have been the previous top-level vdev.
8070 	 */
8071 	tvd = cvd->vdev_top;
8072 	ASSERT(tvd->vdev_parent == rvd);
8073 
8074 	/*
8075 	 * Reevaluate the parent vdev state.
8076 	 */
8077 	vdev_propagate_state(cvd);
8078 
8079 	/*
8080 	 * If the 'autoexpand' property is set on the pool then automatically
8081 	 * try to expand the size of the pool. For example if the device we
8082 	 * just detached was smaller than the others, it may be possible to
8083 	 * add metaslabs (i.e. grow the pool). We need to reopen the vdev
8084 	 * first so that we can obtain the updated sizes of the leaf vdevs.
8085 	 */
8086 	if (spa->spa_autoexpand) {
8087 		vdev_reopen(tvd);
8088 		vdev_expand(tvd, txg);
8089 	}
8090 
8091 	vdev_config_dirty(tvd);
8092 
8093 	/*
8094 	 * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
8095 	 * vd->vdev_detached is set and free vd's DTL object in syncing context.
8096 	 * But first make sure we're not on any *other* txg's DTL list, to
8097 	 * prevent vd from being accessed after it's freed.
8098 	 */
8099 	vdpath = spa_strdup(vd->vdev_path ? vd->vdev_path : "none");
8100 	for (int t = 0; t < TXG_SIZE; t++)
8101 		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
8102 	vd->vdev_detached = B_TRUE;
8103 	vdev_dirty(tvd, VDD_DTL, vd, txg);
8104 
8105 	spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE);
8106 	spa_notify_waiters(spa);
8107 
8108 	/* hang on to the spa before we release the lock */
8109 	spa_open_ref(spa, FTAG);
8110 
8111 	error = spa_vdev_exit(spa, vd, txg, 0);
8112 
8113 	spa_history_log_internal(spa, "detach", NULL,
8114 	    "vdev=%s", vdpath);
8115 	spa_strfree(vdpath);
8116 
8117 	/*
8118 	 * If this was the removal of the original device in a hot spare vdev,
8119 	 * then we want to go through and remove the device from the hot spare
8120 	 * list of every other pool.
8121 	 */
8122 	if (unspare) {
8123 		spa_t *altspa = NULL;
8124 
8125 		mutex_enter(&spa_namespace_lock);
8126 		while ((altspa = spa_next(altspa)) != NULL) {
8127 			if (altspa->spa_state != POOL_STATE_ACTIVE ||
8128 			    altspa == spa)
8129 				continue;
8130 
8131 			spa_open_ref(altspa, FTAG);
8132 			mutex_exit(&spa_namespace_lock);
8133 			(void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
8134 			mutex_enter(&spa_namespace_lock);
8135 			spa_close(altspa, FTAG);
8136 		}
8137 		mutex_exit(&spa_namespace_lock);
8138 
8139 		/* search the rest of the vdevs for spares to remove */
8140 		spa_vdev_resilver_done(spa);
8141 	}
8142 
8143 	/* all done with the spa; OK to release */
8144 	mutex_enter(&spa_namespace_lock);
8145 	spa_close(spa, FTAG);
8146 	mutex_exit(&spa_namespace_lock);
8147 
8148 	return (error);
8149 }
8150 
8151 static int
8152 spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
8153     list_t *vd_list)
8154 {
8155 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
8156 
8157 	spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
8158 
8159 	/* Look up vdev and ensure it's a leaf. */
8160 	vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
8161 	if (vd == NULL || vd->vdev_detached) {
8162 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
8163 		return (SET_ERROR(ENODEV));
8164 	} else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
8165 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
8166 		return (SET_ERROR(EINVAL));
8167 	} else if (!vdev_writeable(vd)) {
8168 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
8169 		return (SET_ERROR(EROFS));
8170 	}
8171 	mutex_enter(&vd->vdev_initialize_lock);
8172 	spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
8173 
8174 	/*
8175 	 * When we activate an initialize action we check to see
8176 	 * if the vdev_initialize_thread is NULL. We do this instead
8177 	 * of using the vdev_initialize_state since there might be
8178 	 * a previous initialization process which has completed but
8179 	 * whose thread has not yet exited.
8180 	 */
8181 	if (cmd_type == POOL_INITIALIZE_START &&
8182 	    (vd->vdev_initialize_thread != NULL ||
8183 	    vd->vdev_top->vdev_removing || vd->vdev_top->vdev_rz_expanding)) {
8184 		mutex_exit(&vd->vdev_initialize_lock);
8185 		return (SET_ERROR(EBUSY));
8186 	} else if (cmd_type == POOL_INITIALIZE_CANCEL &&
8187 	    (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE &&
8188 	    vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) {
8189 		mutex_exit(&vd->vdev_initialize_lock);
8190 		return (SET_ERROR(ESRCH));
8191 	} else if (cmd_type == POOL_INITIALIZE_SUSPEND &&
8192 	    vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) {
8193 		mutex_exit(&vd->vdev_initialize_lock);
8194 		return (SET_ERROR(ESRCH));
8195 	} else if (cmd_type == POOL_INITIALIZE_UNINIT &&
8196 	    vd->vdev_initialize_thread != NULL) {
8197 		mutex_exit(&vd->vdev_initialize_lock);
8198 		return (SET_ERROR(EBUSY));
8199 	}
8200 
8201 	switch (cmd_type) {
8202 	case POOL_INITIALIZE_START:
8203 		vdev_initialize(vd);
8204 		break;
8205 	case POOL_INITIALIZE_CANCEL:
8206 		vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, vd_list);
8207 		break;
8208 	case POOL_INITIALIZE_SUSPEND:
8209 		vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list);
8210 		break;
8211 	case POOL_INITIALIZE_UNINIT:
8212 		vdev_uninitialize(vd);
8213 		break;
8214 	default:
8215 		panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
8216 	}
8217 	mutex_exit(&vd->vdev_initialize_lock);
8218 
8219 	return (0);
8220 }
8221 
8222 int
8223 spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type,
8224     nvlist_t *vdev_errlist)
8225 {
8226 	int total_errors = 0;
8227 	list_t vd_list;
8228 
8229 	list_create(&vd_list, sizeof (vdev_t),
8230 	    offsetof(vdev_t, vdev_initialize_node));
8231 
8232 	/*
8233 	 * We hold the namespace lock through the whole function
8234 	 * to prevent any changes to the pool while we're starting or
8235 	 * stopping initialization. The config and state locks are held so that
8236 	 * we can properly assess the vdev state before we commit to
8237 	 * the initializing operation.
8238 	 */
8239 	mutex_enter(&spa_namespace_lock);
8240 
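	/*
	 * Each nvpair in 'nv' names a target device; its value is the
	 * vdev guid to start, cancel, suspend, or uninitialize.
	 */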
8241 	for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL);
8242 	    pair != NULL; pair = nvlist_next_nvpair(nv, pair)) {
8243 		uint64_t vdev_guid = fnvpair_value_uint64(pair);
8244 
8245 		int error = spa_vdev_initialize_impl(spa, vdev_guid, cmd_type,
8246 		    &vd_list);
8247 		if (error != 0) {
8248 			char guid_as_str[MAXNAMELEN];
8249 
8250 			(void) snprintf(guid_as_str, sizeof (guid_as_str),
8251 			    "%llu", (unsigned long long)vdev_guid);
8252 			fnvlist_add_int64(vdev_errlist, guid_as_str, error);
8253 			total_errors++;
8254 		}
8255 	}
8256 
8257 	/* Wait for all initialize threads to stop. */
8258 	vdev_initialize_stop_wait(spa, &vd_list);
8259 
8260 	/* Sync out the initializing state */
8261 	txg_wait_synced(spa->spa_dsl_pool, 0);
8262 	mutex_exit(&spa_namespace_lock);
8263 
8264 	list_destroy(&vd_list);
8265 
8266 	return (total_errors);
8267 }
8268 
8269 static int
8270 spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
8271     uint64_t rate, boolean_t partial, boolean_t secure, list_t *vd_list)
8272 {
8273 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
8274 
8275 	spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
8276 
8277 	/* Look up vdev and ensure it's a leaf. */
8278 	vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
8279 	if (vd == NULL || vd->vdev_detached) {
8280 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
8281 		return (SET_ERROR(ENODEV));
8282 	} else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
8283 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
8284 		return (SET_ERROR(EINVAL));
8285 	} else if (!vdev_writeable(vd)) {
8286 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
8287 		return (SET_ERROR(EROFS));
8288 	} else if (!vd->vdev_has_trim) {
8289 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
8290 		return (SET_ERROR(EOPNOTSUPP));
8291 	} else if (secure && !vd->vdev_has_securetrim) {
8292 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
8293 		return (SET_ERROR(EOPNOTSUPP));
8294 	}
8295 	mutex_enter(&vd->vdev_trim_lock);
8296 	spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
8297 
8298 	/*
8299 	 * When we activate a TRIM action we check to see if the
8300 	 * vdev_trim_thread is NULL. We do this instead of using the
8301 	 * vdev_trim_state since there might be a previous TRIM process
8302 	 * which has completed but whose thread has not yet exited.
8303 	 */
8304 	if (cmd_type == POOL_TRIM_START &&
8305 	    (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing ||
8306 	    vd->vdev_top->vdev_rz_expanding)) {
8307 		mutex_exit(&vd->vdev_trim_lock);
8308 		return (SET_ERROR(EBUSY));
8309 	} else if (cmd_type == POOL_TRIM_CANCEL &&
8310 	    (vd->vdev_trim_state != VDEV_TRIM_ACTIVE &&
8311 	    vd->vdev_trim_state != VDEV_TRIM_SUSPENDED)) {
8312 		mutex_exit(&vd->vdev_trim_lock);
8313 		return (SET_ERROR(ESRCH));
8314 	} else if (cmd_type == POOL_TRIM_SUSPEND &&
8315 	    vd->vdev_trim_state != VDEV_TRIM_ACTIVE) {
8316 		mutex_exit(&vd->vdev_trim_lock);
8317 		return (SET_ERROR(ESRCH));
8318 	}
8319 
8320 	switch (cmd_type) {
8321 	case POOL_TRIM_START:
8322 		vdev_trim(vd, rate, partial, secure);
8323 		break;
8324 	case POOL_TRIM_CANCEL:
8325 		vdev_trim_stop(vd, VDEV_TRIM_CANCELED, vd_list);
8326 		break;
8327 	case POOL_TRIM_SUSPEND:
8328 		vdev_trim_stop(vd, VDEV_TRIM_SUSPENDED, vd_list);
8329 		break;
8330 	default:
8331 		panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
8332 	}
8333 	mutex_exit(&vd->vdev_trim_lock);
8334 
8335 	return (0);
8336 }
8337 
8338 /*
8339  * Initiates a manual TRIM for the requested vdevs. This kicks off individual
8340  * TRIM threads for each child vdev.  These threads pass over all of the free
8341  * space in the vdev's metaslabs and issue TRIM commands for that space.
8342  */
8343 int
8344 spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate,
8345     boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist)
8346 {
8347 	int total_errors = 0;
8348 	list_t vd_list;
8349 
8350 	list_create(&vd_list, sizeof (vdev_t),
8351 	    offsetof(vdev_t, vdev_trim_node));
8352 
8353 	/*
8354 	 * We hold the namespace lock through the whole function
8355 	 * to prevent any changes to the pool while we're starting or
8356 	 * stopping TRIM. The config and state locks are held so that
8357 	 * we can properly assess the vdev state before we commit to
8358 	 * the TRIM operation.
8359 	 */
8360 	mutex_enter(&spa_namespace_lock);
8361 
8362 	for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL);
8363 	    pair != NULL; pair = nvlist_next_nvpair(nv, pair)) {
8364 		uint64_t vdev_guid = fnvpair_value_uint64(pair);
8365 
8366 		int error = spa_vdev_trim_impl(spa, vdev_guid, cmd_type,
8367 		    rate, partial, secure, &vd_list);
8368 		if (error != 0) {
8369 			char guid_as_str[MAXNAMELEN];
8370 
8371 			(void) snprintf(guid_as_str, sizeof (guid_as_str),
8372 			    "%llu", (unsigned long long)vdev_guid);
8373 			fnvlist_add_int64(vdev_errlist, guid_as_str, error);
8374 			total_errors++;
8375 		}
8376 	}
8377 
8378 	/* Wait for all TRIM threads to stop. */
8379 	vdev_trim_stop_wait(spa, &vd_list);
8380 
8381 	/* Sync out the TRIM state */
8382 	txg_wait_synced(spa->spa_dsl_pool, 0);
8383 	mutex_exit(&spa_namespace_lock);
8384 
8385 	list_destroy(&vd_list);
8386 
8387 	return (total_errors);
8388 }
8389 
8390 /*
8391  * Split a set of devices from their mirrors, and create a new pool from them.
8392  */
8393 int
8394 spa_vdev_split_mirror(spa_t *spa, const char *newname, nvlist_t *config,
8395     nvlist_t *props, boolean_t exp)
8396 {
8397 	int error = 0;
8398 	uint64_t txg, *glist;
8399 	spa_t *newspa;
8400 	uint_t c, children, lastlog;
8401 	nvlist_t **child, *nvl, *tmp;
8402 	dmu_tx_t *tx;
8403 	const char *altroot = NULL;
8404 	vdev_t *rvd, **vml = NULL;			/* vdev modify list */
8405 	boolean_t activate_slog;
8406 
8407 	ASSERT(spa_writeable(spa));
8408 
8409 	txg = spa_vdev_enter(spa);
8410 
8411 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
8412 	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
8413 		error = (spa_has_checkpoint(spa)) ?
8414 		    ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
8415 		return (spa_vdev_exit(spa, NULL, txg, error));
8416 	}
8417 
8418 	/* clear the log and flush everything up to now */
8419 	activate_slog = spa_passivate_log(spa);
8420 	(void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
8421 	error = spa_reset_logs(spa);
8422 	txg = spa_vdev_config_enter(spa);
8423 
8424 	if (activate_slog)
8425 		spa_activate_log(spa);
8426 
8427 	if (error != 0)
8428 		return (spa_vdev_exit(spa, NULL, txg, error));
8429 
8430 	/* check new spa name before going any further */
8431 	if (spa_lookup(newname) != NULL)
8432 		return (spa_vdev_exit(spa, NULL, txg, EEXIST));
8433 
8434 	/*
8435 	 * scan through all the children to ensure they're all mirrors
8436 	 */
8437 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
8438 	    nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
8439 	    &children) != 0)
8440 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
8441 
8442 	/* first, check to ensure we've got the right child count */
8443 	rvd = spa->spa_root_vdev;
8444 	lastlog = 0;
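	/*
	 * 'lastlog' tracks the index of the first vdev in a trailing run of
	 * log/hole vdevs; the split config must supply one child for every
	 * top-level vdev that precedes that run.
	 */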
8445 	for (c = 0; c < rvd->vdev_children; c++) {
8446 		vdev_t *vd = rvd->vdev_child[c];
8447 
8448 		/* don't count the holes & logs as children */
8449 		if (vd->vdev_islog || (vd->vdev_ops != &vdev_indirect_ops &&
8450 		    !vdev_is_concrete(vd))) {
8451 			if (lastlog == 0)
8452 				lastlog = c;
8453 			continue;
8454 		}
8455 
8456 		lastlog = 0;
8457 	}
8458 	if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
8459 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
8460 
8461 	/* next, ensure no spare or cache devices are part of the split */
8462 	if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
8463 	    nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
8464 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
8465 
8466 	vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
8467 	glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
8468 
8469 	/* then, loop over each vdev and validate it */
8470 	for (c = 0; c < children; c++) {
8471 		uint64_t is_hole = 0;
8472 
8473 		(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
8474 		    &is_hole);
8475 
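		/*
		 * A hole in the split config is only legitimate where the
		 * source pool itself has a hole or log vdev at that slot.
		 */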
8476 		if (is_hole != 0) {
8477 			if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
8478 			    spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
8479 				continue;
8480 			} else {
8481 				error = SET_ERROR(EINVAL);
8482 				break;
8483 			}
8484 		}
8485 
8486 		/* deal with indirect vdevs */
8487 		if (spa->spa_root_vdev->vdev_child[c]->vdev_ops ==
8488 		    &vdev_indirect_ops)
8489 			continue;
8490 
8491 		/* which disk is going to be split? */
8492 		if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
8493 		    &glist[c]) != 0) {
8494 			error = SET_ERROR(EINVAL);
8495 			break;
8496 		}
8497 
8498 		/* look it up in the spa */
8499 		vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
8500 		if (vml[c] == NULL) {
8501 			error = SET_ERROR(ENODEV);
8502 			break;
8503 		}
8504 
8505 		/* make sure there's nothing stopping the split */
8506 		if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
8507 		    vml[c]->vdev_islog ||
8508 		    !vdev_is_concrete(vml[c]) ||
8509 		    vml[c]->vdev_isspare ||
8510 		    vml[c]->vdev_isl2cache ||
8511 		    !vdev_writeable(vml[c]) ||
8512 		    vml[c]->vdev_children != 0 ||
8513 		    vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
8514 		    c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
8515 			error = SET_ERROR(EINVAL);
8516 			break;
8517 		}
8518 
8519 		if (vdev_dtl_required(vml[c]) ||
8520 		    vdev_resilver_needed(vml[c], NULL, NULL)) {
8521 			error = SET_ERROR(EBUSY);
8522 			break;
8523 		}
8524 
8525 		/* we need certain info from the top level */
8526 		fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
8527 		    vml[c]->vdev_top->vdev_ms_array);
8528 		fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
8529 		    vml[c]->vdev_top->vdev_ms_shift);
8530 		fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
8531 		    vml[c]->vdev_top->vdev_asize);
8532 		fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
8533 		    vml[c]->vdev_top->vdev_ashift);
8534 
8535 		/* transfer per-vdev ZAPs */
8536 		ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0);
8537 		VERIFY0(nvlist_add_uint64(child[c],
8538 		    ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap));
8539 
8540 		ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0);
8541 		VERIFY0(nvlist_add_uint64(child[c],
8542 		    ZPOOL_CONFIG_VDEV_TOP_ZAP,
8543 		    vml[c]->vdev_parent->vdev_top_zap));
8544 	}
8545 
8546 	if (error != 0) {
8547 		kmem_free(vml, children * sizeof (vdev_t *));
8548 		kmem_free(glist, children * sizeof (uint64_t));
8549 		return (spa_vdev_exit(spa, NULL, txg, error));
8550 	}
8551 
8552 	/* stop writers from using the disks */
8553 	for (c = 0; c < children; c++) {
8554 		if (vml[c] != NULL)
8555 			vml[c]->vdev_offline = B_TRUE;
8556 	}
8557 	vdev_reopen(spa->spa_root_vdev);
8558 
8559 	/*
8560 	 * Temporarily record the splitting vdevs in the spa config.  This
8561 	 * will disappear once the config is regenerated.
8562 	 */
8563 	nvl = fnvlist_alloc();
8564 	fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, glist, children);
8565 	kmem_free(glist, children * sizeof (uint64_t));
8566 
8567 	mutex_enter(&spa->spa_props_lock);
8568 	fnvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, nvl);
8569 	mutex_exit(&spa->spa_props_lock);
8570 	spa->spa_config_splitting = nvl;
8571 	vdev_config_dirty(spa->spa_root_vdev);
8572 
8573 	/* configure and create the new pool */
8574 	fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname);
8575 	fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
8576 	    exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE);
8577 	fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa));
8578 	fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg);
8579 	fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
8580 	    spa_generate_guid(NULL));
8581 	VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
8582 	(void) nvlist_lookup_string(props,
8583 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
8584 
8585 	/* add the new pool to the namespace */
8586 	newspa = spa_add(newname, config, altroot);
8587 	newspa->spa_avz_action = AVZ_ACTION_REBUILD;
8588 	newspa->spa_config_txg = spa->spa_config_txg;
8589 	spa_set_log_state(newspa, SPA_LOG_CLEAR);
8590 
8591 	/* release the spa config lock, retaining the namespace lock */
8592 	spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
8593 
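	/*
	 * Fault-injection point: when injection is enabled, a matching
	 * panic handler crashes here so that recovery from a partially
	 * completed split can be exercised.
	 */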
8594 	if (zio_injection_enabled)
8595 		zio_handle_panic_injection(spa, FTAG, 1);
8596 
8597 	spa_activate(newspa, spa_mode_global);
8598 	spa_async_suspend(newspa);
8599 
8600 	/*
8601 	 * Temporarily stop the initializing and TRIM activity.  We set the
8602 	 * state to ACTIVE so that we know to resume initializing or TRIM
8603 	 * once the split has completed.
8604 	 */
8605 	list_t vd_initialize_list;
8606 	list_create(&vd_initialize_list, sizeof (vdev_t),
8607 	    offsetof(vdev_t, vdev_initialize_node));
8608 
8609 	list_t vd_trim_list;
8610 	list_create(&vd_trim_list, sizeof (vdev_t),
8611 	    offsetof(vdev_t, vdev_trim_node));
8612 
8613 	for (c = 0; c < children; c++) {
8614 		if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) {
8615 			mutex_enter(&vml[c]->vdev_initialize_lock);
8616 			vdev_initialize_stop(vml[c],
8617 			    VDEV_INITIALIZE_ACTIVE, &vd_initialize_list);
8618 			mutex_exit(&vml[c]->vdev_initialize_lock);
8619 
8620 			mutex_enter(&vml[c]->vdev_trim_lock);
8621 			vdev_trim_stop(vml[c], VDEV_TRIM_ACTIVE, &vd_trim_list);
8622 			mutex_exit(&vml[c]->vdev_trim_lock);
8623 		}
8624 	}
8625 
8626 	vdev_initialize_stop_wait(spa, &vd_initialize_list);
8627 	vdev_trim_stop_wait(spa, &vd_trim_list);
8628 
8629 	list_destroy(&vd_initialize_list);
8630 	list_destroy(&vd_trim_list);
8631 
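	/*
	 * Mark the new pool's config as coming from a split and note that
	 * the split is still in progress; spa_load() distinguishes locally
	 * assembled configs from those read from disk or a cachefile.
	 */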
8632 	newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT;
8633 	newspa->spa_is_splitting = B_TRUE;
8634 
8635 	/* create the new pool from the disks of the original pool */
8636 	error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE);
8637 	if (error)
8638 		goto out;
8639 
8640 	/* if that worked, generate a real config for the new pool */
8641 	if (newspa->spa_root_vdev != NULL) {
8642 		newspa->spa_config_splitting = fnvlist_alloc();
8643 		fnvlist_add_uint64(newspa->spa_config_splitting,
8644 		    ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa));
8645 		spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
8646 		    B_TRUE));
8647 	}
8648 
8649 	/* set the props */
8650 	if (props != NULL) {
8651 		spa_configfile_set(newspa, props, B_FALSE);
8652 		error = spa_prop_set(newspa, props);
8653 		if (error)
8654 			goto out;
8655 	}
8656 
8657 	/* flush everything */
8658 	txg = spa_vdev_config_enter(newspa);
8659 	vdev_config_dirty(newspa->spa_root_vdev);
8660 	(void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
8661 
8662 	if (zio_injection_enabled)
8663 		zio_handle_panic_injection(spa, FTAG, 2);
8664 
8665 	spa_async_resume(newspa);
8666 
8667 	/* finally, update the original pool's config */
8668 	txg = spa_vdev_config_enter(spa);
8669 	tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
8670 	error = dmu_tx_assign(tx, DMU_TX_WAIT);
8671 	if (error != 0)
8672 		dmu_tx_abort(tx);
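	/*
	 * The tx is used only for the "detach" history records below; if it
	 * cannot be assigned, the split still proceeds but the records and
	 * the final commit are skipped.
	 */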
8673 	for (c = 0; c < children; c++) {
8674 		if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) {
8675 			vdev_t *tvd = vml[c]->vdev_top;
8676 
8677 			/*
8678 			 * Need to be sure the detachable VDEV is not
8679 			 * on any *other* txg's DTL list to prevent it
8680 			 * from being accessed after it's freed.
8681 			 */
8682 			for (int t = 0; t < TXG_SIZE; t++) {
8683 				(void) txg_list_remove_this(
8684 				    &tvd->vdev_dtl_list, vml[c], t);
8685 			}
8686 
8687 			vdev_split(vml[c]);
8688 			if (error == 0)
8689 				spa_history_log_internal(spa, "detach", tx,
8690 				    "vdev=%s", vml[c]->vdev_path);
8691 
8692 			vdev_free(vml[c]);
8693 		}
8694 	}
8695 	spa->spa_avz_action = AVZ_ACTION_REBUILD;
8696 	vdev_config_dirty(spa->spa_root_vdev);
8697 	spa->spa_config_splitting = NULL;
8698 	nvlist_free(nvl);
8699 	if (error == 0)
8700 		dmu_tx_commit(tx);
8701 	(void) spa_vdev_exit(spa, NULL, txg, 0);
8702 
8703 	if (zio_injection_enabled)
8704 		zio_handle_panic_injection(spa, FTAG, 3);
8705 
8706 	/* split is complete; log a history record */
8707 	spa_history_log_internal(newspa, "split", NULL,
8708 	    "from pool %s", spa_name(spa));
8709 
8710 	newspa->spa_is_splitting = B_FALSE;
8711 	kmem_free(vml, children * sizeof (vdev_t *));
8712 
8713 	/* if we're not going to mount the filesystems in userland, export */
8714 	if (exp)
8715 		error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
8716 		    B_FALSE, B_FALSE);
8717 
8718 	return (error);
8719 
8720 out:
8721 	spa_unload(newspa);
8722 	spa_deactivate(newspa);
8723 	spa_remove(newspa);
8724 
8725 	txg = spa_vdev_config_enter(spa);
8726 
8727 	/* re-online all offlined disks */
8728 	for (c = 0; c < children; c++) {
8729 		if (vml[c] != NULL)
8730 			vml[c]->vdev_offline = B_FALSE;
8731 	}
8732 
8733 	/* restart initializing or trimming disks as necessary */
8734 	spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
8735 	spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
8736 	spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
8737 
8738 	vdev_reopen(spa->spa_root_vdev);
8739 
8740 	nvlist_free(spa->spa_config_splitting);
8741 	spa->spa_config_splitting = NULL;
8742 	(void) spa_vdev_exit(spa, NULL, txg, error);
8743 
8744 	kmem_free(vml, children * sizeof (vdev_t *));
8745 	return (error);
8746 }
8747 
8748 /*
8749  * Find any device that's done replacing, or a vdev marked 'unspare' that's
8750  * currently spared, so we can detach it.
8751  */
8752 static vdev_t *
8753 spa_vdev_resilver_done_hunt(vdev_t *vd)
8754 {
8755 	vdev_t *newvd, *oldvd;
8756 
8757 	for (int c = 0; c < vd->vdev_children; c++) {
8758 		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
8759 		if (oldvd != NULL)
8760 			return (oldvd);
8761 	}
8762 
8763 	/*
8764 	 * Check for a completed replacement.  We always consider the first
8765 	 * vdev in the list to be the oldest vdev, and the last one to be
8766 	 * the newest (see spa_vdev_attach() for how that works).  In
8767 	 * the case where the newest vdev is faulted, we will not automatically
8768 	 * remove it after a resilver completes.  This is OK as it will require
8769 	 * user intervention to determine which disk the admin wishes to keep.
8770 	 */
8771 	if (vd->vdev_ops == &vdev_replacing_ops) {
8772 		ASSERT(vd->vdev_children > 1);
8773 
8774 		newvd = vd->vdev_child[vd->vdev_children - 1];
8775 		oldvd = vd->vdev_child[0];
8776 
8777 		if (vdev_dtl_empty(newvd, DTL_MISSING) &&
8778 		    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
8779 		    !vdev_dtl_required(oldvd))
8780 			return (oldvd);
8781 	}
8782 
8783 	/*
8784 	 * Check for a completed resilver with the 'unspare' flag set.
8785 	 * Also potentially update faulted state.
8786 	 */
8787 	if (vd->vdev_ops == &vdev_spare_ops) {
8788 		vdev_t *first = vd->vdev_child[0];
8789 		vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
8790 
8791 		if (last->vdev_unspare) {
8792 			oldvd = first;
8793 			newvd = last;
8794 		} else if (first->vdev_unspare) {
8795 			oldvd = last;
8796 			newvd = first;
8797 		} else {
8798 			oldvd = NULL;
8799 		}
8800 
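		/*
		 * The child with 'unspare' set (newvd) is the one being
		 * kept; the other child (oldvd) is detached once newvd's
		 * DTLs are clean and oldvd is no longer required.
		 */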
8801 		if (oldvd != NULL &&
8802 		    vdev_dtl_empty(newvd, DTL_MISSING) &&
8803 		    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
8804 		    !vdev_dtl_required(oldvd))
8805 			return (oldvd);
8806 
8807 		vdev_propagate_state(vd);
8808 
8809 		/*
8810 		 * If there are more than two spares attached to a disk,
8811 		 * and those spares are not required, then we want to
8812 		 * attempt to free them up now so that they can be used
8813 		 * by other pools.  Once we're back down to a single
8814 		 * disk+spare, we stop removing them.
8815 		 */
8816 		if (vd->vdev_children > 2) {
8817 			newvd = vd->vdev_child[1];
8818 
8819 			if (newvd->vdev_isspare && last->vdev_isspare &&
8820 			    vdev_dtl_empty(last, DTL_MISSING) &&
8821 			    vdev_dtl_empty(last, DTL_OUTAGE) &&
8822 			    !vdev_dtl_required(newvd))
8823 				return (newvd);
8824 		}
8825 	}
8826 
8827 	return (NULL);
8828 }
8829 
8830 static void
8831 spa_vdev_resilver_done(spa_t *spa)
8832 {
8833 	vdev_t *vd, *pvd, *ppvd;
8834 	uint64_t guid, sguid, pguid, ppguid;
8835 
8836 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
8837 
8838 	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
8839 		pvd = vd->vdev_parent;
8840 		ppvd = pvd->vdev_parent;
8841 		guid = vd->vdev_guid;
8842 		pguid = pvd->vdev_guid;
8843 		ppguid = ppvd->vdev_guid;
8844 		sguid = 0;
8845 		/*
8846 		 * If we have just finished replacing a hot spared device, then
8847 		 * we need to detach the parent's first child (the original hot
8848 		 * spare) as well.
8849 		 */
8850 		if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
8851 		    ppvd->vdev_children == 2) {
8852 			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
8853 			sguid = ppvd->vdev_child[1]->vdev_guid;
8854 		}
8855 		ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));
8856 
8857 		spa_config_exit(spa, SCL_ALL, FTAG);
8858 		if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
8859 			return;
8860 		if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
8861 			return;
8862 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
8863 	}
8864 
8865 	spa_config_exit(spa, SCL_ALL, FTAG);
8866 
8867 	/*
8868 	 * If a detach was not performed above, replace waiters will not have
8869 	 * been notified, so we must notify them now.
8870 	 */
8871 	spa_notify_waiters(spa);
8872 }
8873 
8874 /*
8875  * Update the stored path or FRU for this vdev.
8876  */
8877 static int
8878 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
8879     boolean_t ispath)
8880 {
8881 	vdev_t *vd;
8882 	boolean_t sync = B_FALSE;
8883 
8884 	ASSERT(spa_writeable(spa));
8885 
8886 	spa_vdev_state_enter(spa, SCL_ALL);
8887 
8888 	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
8889 		return (spa_vdev_state_exit(spa, NULL, ENOENT));
8890 
8891 	if (!vd->vdev_ops->vdev_op_leaf)
8892 		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
8893 
8894 	if (ispath) {
8895 		if (strcmp(value, vd->vdev_path) != 0) {
8896 			spa_strfree(vd->vdev_path);
8897 			vd->vdev_path = spa_strdup(value);
8898 			sync = B_TRUE;
8899 		}
8900 	} else {
8901 		if (vd->vdev_fru == NULL) {
8902 			vd->vdev_fru = spa_strdup(value);
8903 			sync = B_TRUE;
8904 		} else if (strcmp(value, vd->vdev_fru) != 0) {
8905 			spa_strfree(vd->vdev_fru);
8906 			vd->vdev_fru = spa_strdup(value);
8907 			sync = B_TRUE;
8908 		}
8909 	}
8910 
8911 	return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
8912 }
8913 
8914 int
8915 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
8916 {
8917 	return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
8918 }
8919 
8920 int
8921 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
8922 {
8923 	return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
8924 }
8925 
8926 /*
8927  * ==========================================================================
8928  * SPA Scanning
8929  * ==========================================================================
8930  */
8931 int
8932 spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd)
8933 {
8934 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
8935 
8936 	if (dsl_scan_resilvering(spa->spa_dsl_pool))
8937 		return (SET_ERROR(EBUSY));
8938 
8939 	return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd));
8940 }
8941 
8942 int
8943 spa_scan_stop(spa_t *spa)
8944 {
8945 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
8946 	if (dsl_scan_resilvering(spa->spa_dsl_pool))
8947 		return (SET_ERROR(EBUSY));
8948 
8949 	return (dsl_scan_cancel(spa->spa_dsl_pool));
8950 }
8951 
8952 int
8953 spa_scan(spa_t *spa, pool_scan_func_t func)
8954 {
8955 	return (spa_scan_range(spa, func, 0, 0));
8956 }
8957 
8958 int
8959 spa_scan_range(spa_t *spa, pool_scan_func_t func, uint64_t txgstart,
8960     uint64_t txgend)
8961 {
8962 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
8963 
8964 	if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
8965 		return (SET_ERROR(ENOTSUP));
8966 
8967 	if (func == POOL_SCAN_RESILVER &&
8968 	    !spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER))
8969 		return (SET_ERROR(ENOTSUP));
8970 
8971 	if (func != POOL_SCAN_SCRUB && (txgstart != 0 || txgend != 0))
8972 		return (SET_ERROR(ENOTSUP));
8973 
8974 	/*
8975 	 * If a resilver was requested, but there is no DTL on a
8976 	 * writeable leaf device, we have nothing to do.
8977 	 */
8978 	if (func == POOL_SCAN_RESILVER &&
8979 	    !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
8980 		spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
8981 		return (0);
8982 	}
8983 
8984 	if (func == POOL_SCAN_ERRORSCRUB &&
8985 	    !spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG))
8986 		return (SET_ERROR(ENOTSUP));
8987 
8988 	return (dsl_scan(spa->spa_dsl_pool, func, txgstart, txgend));
8989 }
8990 
8991 /*
8992  * ==========================================================================
8993  * SPA async task processing
8994  * ==========================================================================
8995  */
8996 
8997 static void
8998 spa_async_remove(spa_t *spa, vdev_t *vd, boolean_t by_kernel)
8999 {
9000 	if (vd->vdev_remove_wanted) {
9001 		vd->vdev_remove_wanted = B_FALSE;
9002 		vd->vdev_delayed_close = B_FALSE;
9003 		vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
9004 
9005 		/*
9006 		 * We want to clear the stats, but we don't want to do a full
9007 		 * vdev_clear() as that will cause us to throw away
9008 		 * degraded/faulted state as well as attempt to reopen the
9009 		 * device, all of which is a waste.
9010 		 */
9011 		vd->vdev_stat.vs_read_errors = 0;
9012 		vd->vdev_stat.vs_write_errors = 0;
9013 		vd->vdev_stat.vs_checksum_errors = 0;
9014 
9015 		vdev_state_dirty(vd->vdev_top);
9016 
9017 		/* Tell userspace that the vdev is gone. */
9018 		zfs_post_remove(spa, vd, by_kernel);
9019 	}
9020 
9021 	for (int c = 0; c < vd->vdev_children; c++)
9022 		spa_async_remove(spa, vd->vdev_child[c], by_kernel);
9023 }
9024 
9025 static void
9026 spa_async_fault_vdev(vdev_t *vd, boolean_t *suspend)
9027 {
9028 	if (vd->vdev_fault_wanted) {
9029 		vdev_state_t newstate = VDEV_STATE_FAULTED;
9030 		vd->vdev_fault_wanted = B_FALSE;
9031 
9032 		/*
9033 		 * If this device has the only valid copy of the data, then
9034 		 * back off and simply mark the vdev as degraded instead.
9035 		 */
9036 		if (!vd->vdev_top->vdev_islog && vd->vdev_aux == NULL &&
9037 		    vdev_dtl_required(vd)) {
9038 			newstate = VDEV_STATE_DEGRADED;
9039 			/* A required disk is missing so suspend the pool */
9040 			*suspend = B_TRUE;
9041 		}
9042 		vdev_set_state(vd, B_TRUE, newstate, VDEV_AUX_ERR_EXCEEDED);
9043 	}
9044 	for (int c = 0; c < vd->vdev_children; c++)
9045 		spa_async_fault_vdev(vd->vdev_child[c], suspend);
9046 }
9047 
9048 static void
9049 spa_async_autoexpand(spa_t *spa, vdev_t *vd)
9050 {
9051 	if (!spa->spa_autoexpand)
9052 		return;
9053 
9054 	for (int c = 0; c < vd->vdev_children; c++) {
9055 		vdev_t *cvd = vd->vdev_child[c];
9056 		spa_async_autoexpand(spa, cvd);
9057 	}
9058 
9059 	if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
9060 		return;
9061 
9062 	spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_AUTOEXPAND);
9063 }
9064 
9065 static __attribute__((noreturn)) void
9066 spa_async_thread(void *arg)
9067 {
9068 	spa_t *spa = (spa_t *)arg;
9069 	dsl_pool_t *dp = spa->spa_dsl_pool;
9070 	int tasks;
9071 
9072 	ASSERT(spa->spa_sync_on);
9073 
9074 	mutex_enter(&spa->spa_async_lock);
9075 	tasks = spa->spa_async_tasks;
9076 	spa->spa_async_tasks = 0;
9077 	mutex_exit(&spa->spa_async_lock);
9078 
9079 	/*
9080 	 * See if the config needs to be updated.
9081 	 */
9082 	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
9083 		uint64_t old_space, new_space;
9084 
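		/*
		 * Sample the total space of the normal, special, dedup and
		 * embedded log classes before and after the config update so
		 * that any growth can be logged below.
		 */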
9085 		mutex_enter(&spa_namespace_lock);
9086 		old_space = metaslab_class_get_space(spa_normal_class(spa));
9087 		old_space += metaslab_class_get_space(spa_special_class(spa));
9088 		old_space += metaslab_class_get_space(spa_dedup_class(spa));
9089 		old_space += metaslab_class_get_space(
9090 		    spa_embedded_log_class(spa));
9091 
9092 		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
9093 
9094 		new_space = metaslab_class_get_space(spa_normal_class(spa));
9095 		new_space += metaslab_class_get_space(spa_special_class(spa));
9096 		new_space += metaslab_class_get_space(spa_dedup_class(spa));
9097 		new_space += metaslab_class_get_space(
9098 		    spa_embedded_log_class(spa));
9099 		mutex_exit(&spa_namespace_lock);
9100 
9101 		/*
9102 		 * If the pool grew as a result of the config update,
9103 		 * then log an internal history event.
9104 		 */
9105 		if (new_space != old_space) {
9106 			spa_history_log_internal(spa, "vdev online", NULL,
9107 			    "pool '%s' size: %llu(+%llu)",
9108 			    spa_name(spa), (u_longlong_t)new_space,
9109 			    (u_longlong_t)(new_space - old_space));
9110 		}
9111 	}
9112 
9113 	/*
9114 	 * See if any devices need to be marked REMOVED.
9115 	 */
9116 	if (tasks & (SPA_ASYNC_REMOVE | SPA_ASYNC_REMOVE_BY_USER)) {
9117 		boolean_t by_kernel = B_TRUE;
9118 		if (tasks & SPA_ASYNC_REMOVE_BY_USER)
9119 			by_kernel = B_FALSE;
9120 		spa_vdev_state_enter(spa, SCL_NONE);
9121 		spa_async_remove(spa, spa->spa_root_vdev, by_kernel);
9122 		for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
9123 			spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i],
9124 			    by_kernel);
9125 		for (int i = 0; i < spa->spa_spares.sav_count; i++)
9126 			spa_async_remove(spa, spa->spa_spares.sav_vdevs[i],
9127 			    by_kernel);
9128 		(void) spa_vdev_state_exit(spa, NULL, 0);
9129 	}
9130 
9131 	if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
9132 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
9133 		spa_async_autoexpand(spa, spa->spa_root_vdev);
9134 		spa_config_exit(spa, SCL_CONFIG, FTAG);
9135 	}
9136 
9137 	/*
9138 	 * See if any devices need to be marked faulted.
9139 	 */
9140 	if (tasks & SPA_ASYNC_FAULT_VDEV) {
9141 		spa_vdev_state_enter(spa, SCL_NONE);
9142 		boolean_t suspend = B_FALSE;
9143 		spa_async_fault_vdev(spa->spa_root_vdev, &suspend);
9144 		(void) spa_vdev_state_exit(spa, NULL, 0);
9145 		if (suspend)
9146 			zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR);
9147 	}
9148 
9149 	/*
9150 	 * If any devices are done replacing, detach them.
9151 	 */
9152 	if (tasks & SPA_ASYNC_RESILVER_DONE ||
9153 	    tasks & SPA_ASYNC_REBUILD_DONE ||
9154 	    tasks & SPA_ASYNC_DETACH_SPARE) {
9155 		spa_vdev_resilver_done(spa);
9156 	}
9157 
9158 	/*
9159 	 * Kick off a resilver.
9160 	 */
9161 	if (tasks & SPA_ASYNC_RESILVER &&
9162 	    !vdev_rebuild_active(spa->spa_root_vdev) &&
9163 	    (!dsl_scan_resilvering(dp) ||
9164 	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER)))
9165 		dsl_scan_restart_resilver(dp, 0);
9166 
9167 	if (tasks & SPA_ASYNC_INITIALIZE_RESTART) {
9168 		mutex_enter(&spa_namespace_lock);
9169 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
9170 		vdev_initialize_restart(spa->spa_root_vdev);
9171 		spa_config_exit(spa, SCL_CONFIG, FTAG);
9172 		mutex_exit(&spa_namespace_lock);
9173 	}
9174 
9175 	if (tasks & SPA_ASYNC_TRIM_RESTART) {
9176 		mutex_enter(&spa_namespace_lock);
9177 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
9178 		vdev_trim_restart(spa->spa_root_vdev);
9179 		spa_config_exit(spa, SCL_CONFIG, FTAG);
9180 		mutex_exit(&spa_namespace_lock);
9181 	}
9182 
9183 	if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) {
9184 		mutex_enter(&spa_namespace_lock);
9185 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
9186 		vdev_autotrim_restart(spa);
9187 		spa_config_exit(spa, SCL_CONFIG, FTAG);
9188 		mutex_exit(&spa_namespace_lock);
9189 	}
9190 
9191 	/*
9192 	 * Kick off L2 cache whole device TRIM.
9193 	 */
9194 	if (tasks & SPA_ASYNC_L2CACHE_TRIM) {
9195 		mutex_enter(&spa_namespace_lock);
9196 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
9197 		vdev_trim_l2arc(spa);
9198 		spa_config_exit(spa, SCL_CONFIG, FTAG);
9199 		mutex_exit(&spa_namespace_lock);
9200 	}
9201 
9202 	/*
9203 	 * Kick off L2 cache rebuilding.
9204 	 */
9205 	if (tasks & SPA_ASYNC_L2CACHE_REBUILD) {
9206 		mutex_enter(&spa_namespace_lock);
9207 		spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER);
9208 		l2arc_spa_rebuild_start(spa);
9209 		spa_config_exit(spa, SCL_L2ARC, FTAG);
9210 		mutex_exit(&spa_namespace_lock);
9211 	}
9212 
9213 	/*
9214 	 * Let the world know that we're done.
9215 	 */
9216 	mutex_enter(&spa->spa_async_lock);
9217 	spa->spa_async_thread = NULL;
9218 	cv_broadcast(&spa->spa_async_cv);
9219 	mutex_exit(&spa->spa_async_lock);
9220 	thread_exit();
9221 }
9222 
9223 void
9224 spa_async_suspend(spa_t *spa)
9225 {
9226 	mutex_enter(&spa->spa_async_lock);
9227 	spa->spa_async_suspended++;
9228 	while (spa->spa_async_thread != NULL)
9229 		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
9230 	mutex_exit(&spa->spa_async_lock);
9231 
9232 	spa_vdev_remove_suspend(spa);
9233 
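	/*
	 * Stop the background zthrs as well (indirect condense, raidz
	 * expansion, checkpoint discard, livelist delete/condense) so that
	 * none of them run while async activity is suspended, e.g. across
	 * an export or a pool configuration change.
	 */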
9234 	zthr_t *condense_thread = spa->spa_condense_zthr;
9235 	if (condense_thread != NULL)
9236 		zthr_cancel(condense_thread);
9237 
9238 	zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr;
9239 	if (raidz_expand_thread != NULL)
9240 		zthr_cancel(raidz_expand_thread);
9241 
9242 	zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
9243 	if (discard_thread != NULL)
9244 		zthr_cancel(discard_thread);
9245 
9246 	zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr;
9247 	if (ll_delete_thread != NULL)
9248 		zthr_cancel(ll_delete_thread);
9249 
9250 	zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
9251 	if (ll_condense_thread != NULL)
9252 		zthr_cancel(ll_condense_thread);
9253 }
9254 
9255 void
9256 spa_async_resume(spa_t *spa)
9257 {
9258 	mutex_enter(&spa->spa_async_lock);
9259 	ASSERT(spa->spa_async_suspended != 0);
9260 	spa->spa_async_suspended--;
9261 	mutex_exit(&spa->spa_async_lock);
9262 	spa_restart_removal(spa);
9263 
9264 	zthr_t *condense_thread = spa->spa_condense_zthr;
9265 	if (condense_thread != NULL)
9266 		zthr_resume(condense_thread);
9267 
9268 	zthr_t *raidz_expand_thread = spa->spa_raidz_expand_zthr;
9269 	if (raidz_expand_thread != NULL)
9270 		zthr_resume(raidz_expand_thread);
9271 
9272 	zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
9273 	if (discard_thread != NULL)
9274 		zthr_resume(discard_thread);
9275 
9276 	zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr;
9277 	if (ll_delete_thread != NULL)
9278 		zthr_resume(ll_delete_thread);
9279 
9280 	zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr;
9281 	if (ll_condense_thread != NULL)
9282 		zthr_resume(ll_condense_thread);
9283 }
9284 
9285 static boolean_t
9286 spa_async_tasks_pending(spa_t *spa)
9287 {
9288 	uint_t non_config_tasks;
9289 	uint_t config_task;
9290 	boolean_t config_task_suspended;
9291 
9292 	non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE;
9293 	config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
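	/*
	 * spa_ccw_fail_time records the time of the last failed config
	 * cache write; while it is set, the CONFIG_UPDATE task is held
	 * back until zfs_ccw_retry_interval has elapsed since the failure.
	 */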
9294 	if (spa->spa_ccw_fail_time == 0) {
9295 		config_task_suspended = B_FALSE;
9296 	} else {
9297 		config_task_suspended =
9298 		    (gethrtime() - spa->spa_ccw_fail_time) <
9299 		    ((hrtime_t)zfs_ccw_retry_interval * NANOSEC);
9300 	}
9301 
9302 	return (non_config_tasks || (config_task && !config_task_suspended));
9303 }
9304 
9305 static void
9306 spa_async_dispatch(spa_t *spa)
9307 {
9308 	mutex_enter(&spa->spa_async_lock);
9309 	if (spa_async_tasks_pending(spa) &&
9310 	    !spa->spa_async_suspended &&
9311 	    spa->spa_async_thread == NULL)
9312 		spa->spa_async_thread = thread_create(NULL, 0,
9313 		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
9314 	mutex_exit(&spa->spa_async_lock);
9315 }
9316 
9317 void
9318 spa_async_request(spa_t *spa, int task)
9319 {
9320 	zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
9321 	mutex_enter(&spa->spa_async_lock);
9322 	spa->spa_async_tasks |= task;
9323 	mutex_exit(&spa->spa_async_lock);
9324 }
9325 
9326 int
9327 spa_async_tasks(spa_t *spa)
9328 {
9329 	return (spa->spa_async_tasks);
9330 }
9331 
9332 /*
9333  * ==========================================================================
9334  * SPA syncing routines
9335  * ==========================================================================
9336  */
9337 
9338 
9339 static int
9340 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
9341     dmu_tx_t *tx)
9342 {
9343 	bpobj_t *bpo = arg;
9344 	bpobj_enqueue(bpo, bp, bp_freed, tx);
9345 	return (0);
9346 }
9347 
9348 int
9349 bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
9350 {
9351 	return (bpobj_enqueue_cb(arg, bp, B_FALSE, tx));
9352 }
9353 
9354 int
9355 bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
9356 {
9357 	return (bpobj_enqueue_cb(arg, bp, B_TRUE, tx));
9358 }
9359 
9360 static int
9361 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
9362 {
9363 	zio_t *pio = arg;
9364 
9365 	zio_nowait(zio_free_sync(pio, pio->io_spa, dmu_tx_get_txg(tx), bp,
9366 	    pio->io_flags));
9367 	return (0);
9368 }
9369 
9370 static int
9371 bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
9372     dmu_tx_t *tx)
9373 {
9374 	ASSERT(!bp_freed);
9375 	return (spa_free_sync_cb(arg, bp, tx));
9376 }
9377 
9378 /*
9379  * Note: this simple function is not inlined to make it easier to dtrace the
9380  * amount of time spent syncing frees.
9381  */
9382 static void
9383 spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
9384 {
9385 	zio_t *zio = zio_root(spa, NULL, NULL, 0);
9386 	bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
9387 	VERIFY(zio_wait(zio) == 0);
9388 }
9389 
9390 /*
9391  * Note: this simple function is not inlined to make it easier to dtrace the
9392  * amount of time spent syncing deferred frees.
9393  */
9394 static void
9395 spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
9396 {
9397 	if (spa_sync_pass(spa) != 1)
9398 		return;
9399 
9400 	/*
9401 	 * Note:
9402 	 * If the log space map feature is active, we stop deferring
9403 	 * frees to the next TXG and therefore running this function
9404 	 * would be considered a no-op as spa_deferred_bpobj should
9405 	 * not have any entries.
9406 	 *
9407 	 * That said we run this function anyway (instead of returning
9408 	 * immediately) for the edge-case scenario where we just
9409 	 * activated the log space map feature in this TXG but we have
9410 	 * deferred frees from the previous TXG.
9411 	 */
9412 	zio_t *zio = zio_root(spa, NULL, NULL, 0);
9413 	VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
9414 	    bpobj_spa_free_sync_cb, zio, tx), ==, 0);
9415 	VERIFY0(zio_wait(zio));
9416 }
9417 
9418 static void
9419 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
9420 {
9421 	char *packed = NULL;
9422 	size_t bufsize;
9423 	size_t nvsize = 0;
9424 	dmu_buf_t *db;
9425 
9426 	VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
9427 
9428 	/*
9429 	 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
9430 	 * information.  This avoids the dmu_buf_will_dirty() path and
9431 	 * saves us a pre-read to get data we don't actually care about.
9432 	 */
9433 	bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
9434 	packed = vmem_alloc(bufsize, KM_SLEEP);
9435 
9436 	VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
9437 	    KM_SLEEP) == 0);
9438 	memset(packed + nvsize, 0, bufsize - nvsize);
9439 
9440 	dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
9441 
9442 	vmem_free(packed, bufsize);
9443 
9444 	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
9445 	dmu_buf_will_dirty(db, tx);
9446 	*(uint64_t *)db->db_data = nvsize;
9447 	dmu_buf_rele(db, FTAG);
9448 }
9449 
9450 static void
9451 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
9452     const char *config, const char *entry)
9453 {
9454 	nvlist_t *nvroot;
9455 	nvlist_t **list;
9456 	int i;
9457 
9458 	if (!sav->sav_sync)
9459 		return;
9460 
9461 	/*
9462 	 * Update the MOS nvlist describing the list of available devices.
9463 	 * spa_validate_aux() will have already made sure this nvlist is
9464 	 * valid and the vdevs are labeled appropriately.
9465 	 */
9466 	if (sav->sav_object == 0) {
9467 		sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
9468 		    DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
9469 		    sizeof (uint64_t), tx);
9470 		VERIFY(zap_update(spa->spa_meta_objset,
9471 		    DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
9472 		    &sav->sav_object, tx) == 0);
9473 	}
9474 
9475 	nvroot = fnvlist_alloc();
9476 	if (sav->sav_count == 0) {
9477 		fnvlist_add_nvlist_array(nvroot, config,
9478 		    (const nvlist_t * const *)NULL, 0);
9479 	} else {
9480 		list = kmem_alloc(sav->sav_count*sizeof (void *), KM_SLEEP);
9481 		for (i = 0; i < sav->sav_count; i++)
9482 			list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
9483 			    B_FALSE, VDEV_CONFIG_L2CACHE);
9484 		fnvlist_add_nvlist_array(nvroot, config,
9485 		    (const nvlist_t * const *)list, sav->sav_count);
9486 		for (i = 0; i < sav->sav_count; i++)
9487 			nvlist_free(list[i]);
9488 		kmem_free(list, sav->sav_count * sizeof (void *));
9489 	}
9490 
9491 	spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
9492 	nvlist_free(nvroot);
9493 
9494 	sav->sav_sync = B_FALSE;
9495 }
9496 
9497 /*
9498  * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t.
9499  * The all-vdev ZAP must be empty.
9500  */
9501 static void
9502 spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx)
9503 {
9504 	spa_t *spa = vd->vdev_spa;
9505 
9506 	if (vd->vdev_root_zap != 0 &&
9507 	    spa_feature_is_active(spa, SPA_FEATURE_AVZ_V2)) {
9508 		VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
9509 		    vd->vdev_root_zap, tx));
9510 	}
9511 	if (vd->vdev_top_zap != 0) {
9512 		VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
9513 		    vd->vdev_top_zap, tx));
9514 	}
9515 	if (vd->vdev_leaf_zap != 0) {
9516 		VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
9517 		    vd->vdev_leaf_zap, tx));
9518 	}
9519 	for (uint64_t i = 0; i < vd->vdev_children; i++) {
9520 		spa_avz_build(vd->vdev_child[i], avz, tx);
9521 	}
9522 }
9523 
9524 static void
9525 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
9526 {
9527 	nvlist_t *config;
9528 
9529 	/*
9530 	 * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS,
9531 	 * its config may not be dirty but we still need to build per-vdev ZAPs.
9532 	 * Similarly, if the pool is being assembled (e.g. after a split), we
9533 	 * need to rebuild the AVZ although the config may not be dirty.
9534 	 */
9535 	if (list_is_empty(&spa->spa_config_dirty_list) &&
9536 	    spa->spa_avz_action == AVZ_ACTION_NONE)
9537 		return;
9538 
9539 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
9540 
9541 	ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE ||
9542 	    spa->spa_avz_action == AVZ_ACTION_INITIALIZE ||
9543 	    spa->spa_all_vdev_zaps != 0);
9544 
9545 	if (spa->spa_avz_action == AVZ_ACTION_REBUILD) {
9546 		/* Make and build the new AVZ */
9547 		uint64_t new_avz = zap_create(spa->spa_meta_objset,
9548 		    DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
9549 		spa_avz_build(spa->spa_root_vdev, new_avz, tx);
9550 
9551 		/* Diff old AVZ with new one */
9552 		zap_cursor_t zc;
9553 		zap_attribute_t *za = zap_attribute_alloc();
9554 
9555 		for (zap_cursor_init(&zc, spa->spa_meta_objset,
9556 		    spa->spa_all_vdev_zaps);
9557 		    zap_cursor_retrieve(&zc, za) == 0;
9558 		    zap_cursor_advance(&zc)) {
9559 			uint64_t vdzap = za->za_first_integer;
9560 			if (zap_lookup_int(spa->spa_meta_objset, new_avz,
9561 			    vdzap) == ENOENT) {
9562 				/*
9563 				 * ZAP is listed in old AVZ but not in new one;
9564 				 * destroy it
9565 				 */
9566 				VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap,
9567 				    tx));
9568 			}
9569 		}
9570 
9571 		zap_cursor_fini(&zc);
9572 		zap_attribute_free(za);
9573 
9574 		/* Destroy the old AVZ */
9575 		VERIFY0(zap_destroy(spa->spa_meta_objset,
9576 		    spa->spa_all_vdev_zaps, tx));
9577 
9578 		/* Replace the old AVZ in the dir obj with the new one */
9579 		VERIFY0(zap_update(spa->spa_meta_objset,
9580 		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP,
9581 		    sizeof (new_avz), 1, &new_avz, tx));
9582 
9583 		spa->spa_all_vdev_zaps = new_avz;
9584 	} else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) {
9585 		zap_cursor_t zc;
9586 		zap_attribute_t *za = zap_attribute_alloc();
9587 
9588 		/* Walk through the AVZ and destroy all listed ZAPs */
9589 		for (zap_cursor_init(&zc, spa->spa_meta_objset,
9590 		    spa->spa_all_vdev_zaps);
9591 		    zap_cursor_retrieve(&zc, za) == 0;
9592 		    zap_cursor_advance(&zc)) {
9593 			uint64_t zap = za->za_first_integer;
9594 			VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx));
9595 		}
9596 
9597 		zap_cursor_fini(&zc);
9598 		zap_attribute_free(za);
9599 
9600 		/* Destroy and unlink the AVZ itself */
9601 		VERIFY0(zap_destroy(spa->spa_meta_objset,
9602 		    spa->spa_all_vdev_zaps, tx));
9603 		VERIFY0(zap_remove(spa->spa_meta_objset,
9604 		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx));
9605 		spa->spa_all_vdev_zaps = 0;
9606 	}
9607 
9608 	if (spa->spa_all_vdev_zaps == 0) {
9609 		spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset,
9610 		    DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
9611 		    DMU_POOL_VDEV_ZAP_MAP, tx);
9612 	}
9613 	spa->spa_avz_action = AVZ_ACTION_NONE;
9614 
9615 	/* Create ZAPs for vdevs that don't have them. */
9616 	vdev_construct_zaps(spa->spa_root_vdev, tx);
9617 
9618 	config = spa_config_generate(spa, spa->spa_root_vdev,
9619 	    dmu_tx_get_txg(tx), B_FALSE);
9620 
9621 	/*
9622 	 * If we're upgrading the spa version then make sure that
9623 	 * the config object gets updated with the correct version.
9624 	 */
9625 	if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version)
9626 		fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
9627 		    spa->spa_uberblock.ub_version);
9628 
9629 	spa_config_exit(spa, SCL_STATE, FTAG);
9630 
9631 	nvlist_free(spa->spa_config_syncing);
9632 	spa->spa_config_syncing = config;
9633 
9634 	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
9635 }
9636 
9637 static void
9638 spa_sync_version(void *arg, dmu_tx_t *tx)
9639 {
9640 	uint64_t *versionp = arg;
9641 	uint64_t version = *versionp;
9642 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
9643 
9644 	/*
9645 	 * Setting the version is special cased when first creating the pool.
9646 	 */
9647 	ASSERT(tx->tx_txg != TXG_INITIAL);
9648 
9649 	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
9650 	ASSERT(version >= spa_version(spa));
9651 
9652 	spa->spa_uberblock.ub_version = version;
9653 	vdev_config_dirty(spa->spa_root_vdev);
9654 	spa_history_log_internal(spa, "set", tx, "version=%lld",
9655 	    (longlong_t)version);
9656 }
9657 
9658 /*
9659  * Set zpool properties.
9660  */
9661 static void
9662 spa_sync_props(void *arg, dmu_tx_t *tx)
9663 {
9664 	nvlist_t *nvp = arg;
9665 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
9666 	objset_t *mos = spa->spa_meta_objset;
9667 	nvpair_t *elem = NULL;
9668 
9669 	mutex_enter(&spa->spa_props_lock);
9670 
9671 	while ((elem = nvlist_next_nvpair(nvp, elem))) {
9672 		uint64_t intval;
9673 		const char *strval, *fname;
9674 		zpool_prop_t prop;
9675 		const char *propname;
9676 		const char *elemname = nvpair_name(elem);
9677 		zprop_type_t proptype;
9678 		spa_feature_t fid;
9679 
9680 		switch (prop = zpool_name_to_prop(elemname)) {
9681 		case ZPOOL_PROP_VERSION:
9682 			intval = fnvpair_value_uint64(elem);
9683 			/*
9684 			 * The version is synced separately before other
9685 			 * properties and should be correct by now.
9686 			 */
9687 			ASSERT3U(spa_version(spa), >=, intval);
9688 			break;
9689 
9690 		case ZPOOL_PROP_ALTROOT:
9691 			/*
9692 			 * 'altroot' is a non-persistent property. It should
9693 			 * have been set temporarily at creation or import time.
9694 			 */
9695 			ASSERT(spa->spa_root != NULL);
9696 			break;
9697 
9698 		case ZPOOL_PROP_READONLY:
9699 		case ZPOOL_PROP_CACHEFILE:
9700 			/*
9701 			 * 'readonly' and 'cachefile' are also non-persistent
9702 			 * properties.
9703 			 */
9704 			break;
9705 		case ZPOOL_PROP_COMMENT:
9706 			strval = fnvpair_value_string(elem);
9707 			if (spa->spa_comment != NULL)
9708 				spa_strfree(spa->spa_comment);
9709 			spa->spa_comment = spa_strdup(strval);
9710 			/*
9711 			 * We need to dirty the configuration on all the vdevs
9712 			 * so that their labels get updated.  We also need to
9713 			 * update the cache file to keep it in sync with the
9714 			 * MOS version. It's unnecessary to do this for pool
9715 			 * creation since the vdev's configuration has already
9716 			 * been dirtied.
9717 			 */
9718 			if (tx->tx_txg != TXG_INITIAL) {
9719 				vdev_config_dirty(spa->spa_root_vdev);
9720 				spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
9721 			}
9722 			spa_history_log_internal(spa, "set", tx,
9723 			    "%s=%s", elemname, strval);
9724 			break;
9725 		case ZPOOL_PROP_COMPATIBILITY:
9726 			strval = fnvpair_value_string(elem);
9727 			if (spa->spa_compatibility != NULL)
9728 				spa_strfree(spa->spa_compatibility);
9729 			spa->spa_compatibility = spa_strdup(strval);
9730 			/*
9731 			 * Dirty the configuration on vdevs as above.
9732 			 */
9733 			if (tx->tx_txg != TXG_INITIAL) {
9734 				vdev_config_dirty(spa->spa_root_vdev);
9735 				spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
9736 			}
9737 
9738 			spa_history_log_internal(spa, "set", tx,
9739 			    "%s=%s", nvpair_name(elem), strval);
9740 			break;
9741 
9742 		case ZPOOL_PROP_INVAL:
9743 			if (zpool_prop_feature(elemname)) {
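				/*
				 * Feature properties are named
				 * "feature@<name>"; the feature name is
				 * everything after the '@'.
				 */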
9744 				fname = strchr(elemname, '@') + 1;
9745 				VERIFY0(zfeature_lookup_name(fname, &fid));
9746 
9747 				spa_feature_enable(spa, fid, tx);
9748 				spa_history_log_internal(spa, "set", tx,
9749 				    "%s=enabled", elemname);
9750 				break;
9751 			} else if (!zfs_prop_user(elemname)) {
9752 				ASSERT(zpool_prop_feature(elemname));
9753 				break;
9754 			}
9755 			zfs_fallthrough;
9756 		default:
9757 			/*
9758 			 * Set pool property values in the poolprops mos object.
9759 			 */
9760 			if (spa->spa_pool_props_object == 0) {
9761 				spa->spa_pool_props_object =
9762 				    zap_create_link(mos, DMU_OT_POOL_PROPS,
9763 				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
9764 				    tx);
9765 			}
9766 
9767 			/* normalize the property name */
9768 			if (prop == ZPOOL_PROP_INVAL) {
9769 				propname = elemname;
9770 				proptype = PROP_TYPE_STRING;
9771 			} else {
9772 				propname = zpool_prop_to_name(prop);
9773 				proptype = zpool_prop_get_type(prop);
9774 			}
9775 
9776 			if (nvpair_type(elem) == DATA_TYPE_STRING) {
9777 				ASSERT(proptype == PROP_TYPE_STRING);
9778 				strval = fnvpair_value_string(elem);
9779 				if (strlen(strval) == 0) {
9780 					/* remove the property if value == "" */
9781 					(void) zap_remove(mos,
9782 					    spa->spa_pool_props_object,
9783 					    propname, tx);
9784 				} else {
9785 					VERIFY0(zap_update(mos,
9786 					    spa->spa_pool_props_object,
9787 					    propname, 1, strlen(strval) + 1,
9788 					    strval, tx));
9789 				}
9790 				spa_history_log_internal(spa, "set", tx,
9791 				    "%s=%s", elemname, strval);
9792 			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
9793 				intval = fnvpair_value_uint64(elem);
9794 
9795 				if (proptype == PROP_TYPE_INDEX) {
9796 					const char *unused;
9797 					VERIFY0(zpool_prop_index_to_string(
9798 					    prop, intval, &unused));
9799 				}
9800 				VERIFY0(zap_update(mos,
9801 				    spa->spa_pool_props_object, propname,
9802 				    8, 1, &intval, tx));
9803 				spa_history_log_internal(spa, "set", tx,
9804 				    "%s=%lld", elemname,
9805 				    (longlong_t)intval);
9806 
9807 				switch (prop) {
9808 				case ZPOOL_PROP_DELEGATION:
9809 					spa->spa_delegation = intval;
9810 					break;
9811 				case ZPOOL_PROP_BOOTFS:
9812 					spa->spa_bootfs = intval;
9813 					break;
9814 				case ZPOOL_PROP_FAILUREMODE:
9815 					spa->spa_failmode = intval;
9816 					break;
9817 				case ZPOOL_PROP_AUTOTRIM:
9818 					spa->spa_autotrim = intval;
9819 					spa_async_request(spa,
9820 					    SPA_ASYNC_AUTOTRIM_RESTART);
9821 					break;
9822 				case ZPOOL_PROP_AUTOEXPAND:
9823 					spa->spa_autoexpand = intval;
9824 					if (tx->tx_txg != TXG_INITIAL)
9825 						spa_async_request(spa,
9826 						    SPA_ASYNC_AUTOEXPAND);
9827 					break;
9828 				case ZPOOL_PROP_MULTIHOST:
9829 					spa->spa_multihost = intval;
9830 					break;
9831 				case ZPOOL_PROP_DEDUP_TABLE_QUOTA:
9832 					spa->spa_dedup_table_quota = intval;
9833 					break;
9834 				default:
9835 					break;
9836 				}
9837 			} else {
9838 				ASSERT(0); /* not allowed */
9839 			}
9840 		}
9841 
9842 	}
9843 
9844 	mutex_exit(&spa->spa_props_lock);
9845 }
9846 
9847 /*
9848  * Perform one-time upgrade on-disk changes.  spa_version() does not
9849  * reflect the new version this txg, so there must be no changes this
9850  * txg to anything that the upgrade code depends on after it executes.
9851  * Therefore this must be called after dsl_pool_sync() does the sync
9852  * tasks.
9853  */
9854 static void
9855 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
9856 {
9857 	if (spa_sync_pass(spa) != 1)
9858 		return;
9859 
9860 	dsl_pool_t *dp = spa->spa_dsl_pool;
9861 	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
9862 
9863 	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
9864 	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
9865 		dsl_pool_create_origin(dp, tx);
9866 
9867 		/* Keeping the origin open increases spa_minref */
9868 		spa->spa_minref += 3;
9869 	}
9870 
9871 	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
9872 	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
9873 		dsl_pool_upgrade_clones(dp, tx);
9874 	}
9875 
9876 	if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
9877 	    spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
9878 		dsl_pool_upgrade_dir_clones(dp, tx);
9879 
9880 		/* Keeping the freedir open increases spa_minref */
9881 		spa->spa_minref += 3;
9882 	}
9883 
9884 	if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
9885 	    spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
9886 		spa_feature_create_zap_objects(spa, tx);
9887 	}
9888 
9889 	/*
9890 	 * The LZ4_COMPRESS feature's behaviour was changed to
9891 	 * activate_on_enable when the ability to use lz4 compression for
9892 	 * metadata was added.  Old pools that have this feature enabled
9893 	 * must be upgraded to have this feature active.
9894 	 */
9895 	if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
9896 		boolean_t lz4_en = spa_feature_is_enabled(spa,
9897 		    SPA_FEATURE_LZ4_COMPRESS);
9898 		boolean_t lz4_ac = spa_feature_is_active(spa,
9899 		    SPA_FEATURE_LZ4_COMPRESS);
9900 
9901 		if (lz4_en && !lz4_ac)
9902 			spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
9903 	}
9904 
9905 	/*
9906 	 * If we haven't written the salt, do so now.  Note that the
9907 	 * feature may not be activated yet, but that's fine since
9908 	 * the presence of this ZAP entry is backwards compatible.
9909 	 */
9910 	if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
9911 	    DMU_POOL_CHECKSUM_SALT) == ENOENT) {
9912 		VERIFY0(zap_add(spa->spa_meta_objset,
9913 		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1,
9914 		    sizeof (spa->spa_cksum_salt.zcs_bytes),
9915 		    spa->spa_cksum_salt.zcs_bytes, tx));
9916 	}
9917 
9918 	rrw_exit(&dp->dp_config_rwlock, FTAG);
9919 }
9920 
9921 static void
9922 vdev_indirect_state_sync_verify(vdev_t *vd)
9923 {
9924 	vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping;
9925 	vdev_indirect_births_t *vib __maybe_unused = vd->vdev_indirect_births;
9926 
9927 	if (vd->vdev_ops == &vdev_indirect_ops) {
9928 		ASSERT(vim != NULL);
9929 		ASSERT(vib != NULL);
9930 	}
9931 
9932 	uint64_t obsolete_sm_object = 0;
9933 	ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
9934 	if (obsolete_sm_object != 0) {
9935 		ASSERT(vd->vdev_obsolete_sm != NULL);
9936 		ASSERT(vd->vdev_removing ||
9937 		    vd->vdev_ops == &vdev_indirect_ops);
9938 		ASSERT(vdev_indirect_mapping_num_entries(vim) > 0);
9939 		ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0);
9940 		ASSERT3U(obsolete_sm_object, ==,
9941 		    space_map_object(vd->vdev_obsolete_sm));
9942 		ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=,
9943 		    space_map_allocated(vd->vdev_obsolete_sm));
9944 	}
9945 	ASSERT(vd->vdev_obsolete_segments != NULL);
9946 
9947 	/*
9948 	 * Since frees / remaps to an indirect vdev can only
9949 	 * happen in syncing context, the obsolete segments
9950 	 * tree must be empty when we start syncing.
9951 	 */
9952 	ASSERT0(zfs_range_tree_space(vd->vdev_obsolete_segments));
9953 }
9954 
9955 /*
9956  * Set the top-level vdev's max queue depth. Evaluate each top-level's
9957  * async write queue depth in case it changed. The max queue depth will
9958  * not change in the middle of syncing out this txg.
9959  */
9960 static void
9961 spa_sync_adjust_vdev_max_queue_depth(spa_t *spa)
9962 {
9963 	ASSERT(spa_writeable(spa));
9964 
9965 	metaslab_class_balance(spa_normal_class(spa), B_TRUE);
9966 	metaslab_class_balance(spa_special_class(spa), B_TRUE);
9967 	metaslab_class_balance(spa_dedup_class(spa), B_TRUE);
9968 }
9969 
9970 static void
9971 spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx)
9972 {
9973 	ASSERT(spa_writeable(spa));
9974 
9975 	vdev_t *rvd = spa->spa_root_vdev;
9976 	for (int c = 0; c < rvd->vdev_children; c++) {
9977 		vdev_t *vd = rvd->vdev_child[c];
9978 		vdev_indirect_state_sync_verify(vd);
9979 
9980 		if (vdev_indirect_should_condense(vd)) {
9981 			spa_condense_indirect_start_sync(vd, tx);
9982 			break;
9983 		}
9984 	}
9985 }
9986 
9987 static void
9988 spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
9989 {
9990 	objset_t *mos = spa->spa_meta_objset;
9991 	dsl_pool_t *dp = spa->spa_dsl_pool;
9992 	uint64_t txg = tx->tx_txg;
9993 	bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
9994 
9995 	do {
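		/*
		 * Each iteration is one sync pass.  Later passes only write
		 * out what was dirtied while syncing the previous pass, so
		 * the loop runs until the MOS stops changing (converges).
		 */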
9996 		int pass = ++spa->spa_sync_pass;
9997 
9998 		spa_sync_config_object(spa, tx);
9999 		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
10000 		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
10001 		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
10002 		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
10003 		spa_errlog_sync(spa, txg);
10004 		dsl_pool_sync(dp, txg);
10005 
10006 		if (pass < zfs_sync_pass_deferred_free ||
10007 		    spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
10008 			/*
10009 			 * If the log space map feature is active we don't
10010 			 * care about deferred frees and the deferred bpobj
10011 			 * as the log space map should effectively have the
10012 			 * same results (i.e. appending only to one object).
10013 			 */
10014 			spa_sync_frees(spa, free_bpl, tx);
10015 		} else {
10016 			/*
10017 			 * We cannot defer frees in pass 1, because
10018 			 * we sync the deferred frees later in pass 1.
10019 			 */
10020 			ASSERT3U(pass, >, 1);
10021 			bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb,
10022 			    &spa->spa_deferred_bpobj, tx);
10023 		}
10024 
10025 		brt_sync(spa, txg);
10026 		ddt_sync(spa, txg);
10027 		dsl_scan_sync(dp, tx);
10028 		dsl_errorscrub_sync(dp, tx);
10029 		svr_sync(spa, tx);
10030 		spa_sync_upgrades(spa, tx);
10031 
10032 		spa_flush_metaslabs(spa, tx);
10033 
10034 		vdev_t *vd = NULL;
10035 		while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
10036 		    != NULL)
10037 			vdev_sync(vd, txg);
10038 
10039 		if (pass == 1) {
10040 			/*
10041 			 * dsl_pool_sync() -> dp_sync_tasks may have dirtied
10042 			 * the config. If that happens, this txg should not
10043 			 * be a no-op. So we must sync the config to the MOS
10044 			 * before checking for no-op.
10045 			 *
10046 			 * Note that when the config is dirty, it will
10047 			 * be written to the MOS (i.e. the MOS will be
10048 			 * dirtied) every time we call spa_sync_config_object()
10049 			 * in this txg.  Therefore we can't call this after
10050 			 * dsl_pool_sync() every pass, because it would
10051 			 * prevent us from converging, since we'd dirty
10052 			 * the MOS every pass.
10053 			 *
10054 			 * Sync tasks can only be processed in pass 1, so
10055 			 * there's no need to do this in later passes.
10056 			 */
10057 			spa_sync_config_object(spa, tx);
10058 		}
10059 
10060 		/*
10061 		 * Note: We need to check if the MOS is dirty because we could
10062 		 * have marked the MOS dirty without updating the uberblock
10063 		 * (e.g. if we have sync tasks but no dirty user data). We need
10064 		 * to check the uberblock's rootbp because it is updated if we
10065 		 * have synced out dirty data (though in this case the MOS will
10066 		 * most likely also be dirty due to second order effects, we
10067 		 * don't want to rely on that here).
10068 		 */
10069 		if (pass == 1 &&
10070 		    BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp) < txg &&
10071 		    !dmu_objset_is_dirty(mos, txg)) {
10072 			/*
10073 			 * Nothing changed on the first pass, therefore this
10074 			 * TXG is a no-op. Avoid syncing deferred frees, so
10075 			 * that we can keep this TXG as a no-op.
10076 			 */
10077 			ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
10078 			ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
10079 			ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
10080 			ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg));
10081 			break;
10082 		}
10083 
10084 		spa_sync_deferred_frees(spa, tx);
10085 	} while (dmu_objset_is_dirty(mos, txg));
10086 }
10087 
10088 /*
10089  * Rewrite the vdev configuration (which includes the uberblock) to
10090  * commit the transaction group.
10091  *
10092  * If there are no dirty vdevs, we sync the uberblock to a few random
10093  * top-level vdevs that are known to be visible in the config cache
10094  * (see spa_vdev_add() for a complete description). If there *are* dirty
10095  * vdevs, sync the uberblock to all vdevs.
10096  */
10097 static void
10098 spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx)
10099 {
10100 	vdev_t *rvd = spa->spa_root_vdev;
10101 	uint64_t txg = tx->tx_txg;
10102 
10103 	for (;;) {
10104 		int error = 0;
10105 
10106 		/*
10107 		 * We hold SCL_STATE to prevent vdev open/close/etc.
10108 		 * while we're attempting to write the vdev labels.
10109 		 */
10110 		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
10111 
10112 		if (list_is_empty(&spa->spa_config_dirty_list)) {
10113 			vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
10114 			int svdcount = 0;
10115 			int children = rvd->vdev_children;
10116 			int c0 = random_in_range(children);
10117 
10118 			for (int c = 0; c < children; c++) {
10119 				vdev_t *vd =
10120 				    rvd->vdev_child[(c0 + c) % children];
10121 
10122 				/* Stop when revisiting the first vdev */
10123 				if (c > 0 && svd[0] == vd)
10124 					break;
10125 
10126 				if (vd->vdev_ms_array == 0 ||
10127 				    vd->vdev_islog ||
10128 				    !vdev_is_concrete(vd))
10129 					continue;
10130 
10131 				svd[svdcount++] = vd;
10132 				if (svdcount == SPA_SYNC_MIN_VDEVS)
10133 					break;
10134 			}
10135 			error = vdev_config_sync(svd, svdcount, txg);
10136 		} else {
10137 			error = vdev_config_sync(rvd->vdev_child,
10138 			    rvd->vdev_children, txg);
10139 		}
10140 
10141 		if (error == 0)
10142 			spa->spa_last_synced_guid = rvd->vdev_guid;
10143 
10144 		spa_config_exit(spa, SCL_STATE, FTAG);
10145 
10146 		if (error == 0)
10147 			break;
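		/*
		 * The label writes failed.  Suspend the pool, wait for I/O
		 * to resume, and then retry the config sync.
		 */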
10148 		zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR);
10149 		zio_resume_wait(spa);
10150 	}
10151 }
10152 
10153 /*
10154  * Sync the specified transaction group.  New blocks may be dirtied as
10155  * part of the process, so we iterate until it converges.
10156  */
10157 void
10158 spa_sync(spa_t *spa, uint64_t txg)
10159 {
10160 	vdev_t *vd = NULL;
10161 
10162 	VERIFY(spa_writeable(spa));
10163 
10164 	/*
10165 	 * Wait for i/os issued in open context that need to complete
10166 	 * before this txg syncs.
10167 	 */
10168 	(void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]);
10169 	spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL,
10170 	    ZIO_FLAG_CANFAIL);
10171 
10172 	/*
10173 	 * Now that there can be no more cloning in this transaction group,
10174 	 * and before any frees are issued, we can process the pending BRT
10175 	 * updates.
10176 	 */
10177 	brt_pending_apply(spa, txg);
10178 
10179 	/*
10180 	 * Lock out configuration changes.
10181 	 */
10182 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
10183 
10184 	spa->spa_syncing_txg = txg;
10185 	spa->spa_sync_pass = 0;
10186 
10187 	/*
10188 	 * If there are any pending vdev state changes, convert them
10189 	 * into config changes that go out with this transaction group.
10190 	 */
10191 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
10192 	while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
10193 		/* Avoid holding the write lock unless actually necessary */
10194 		if (vd->vdev_aux == NULL) {
10195 			vdev_state_clean(vd);
10196 			vdev_config_dirty(vd);
10197 			continue;
10198 		}
10199 		/*
10200 		 * We need the write lock here because, for aux vdevs,
10201 		 * calling vdev_config_dirty() modifies sav_config.
10202 		 * This is ugly and will become unnecessary when we
10203 		 * eliminate the aux vdev wart by integrating all vdevs
10204 		 * into the root vdev tree.
10205 		 */
10206 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
10207 		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
10208 		while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
10209 			vdev_state_clean(vd);
10210 			vdev_config_dirty(vd);
10211 		}
10212 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
10213 		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
10214 	}
10215 	spa_config_exit(spa, SCL_STATE, FTAG);
10216 
10217 	dsl_pool_t *dp = spa->spa_dsl_pool;
10218 	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
10219 
10220 	spa->spa_sync_starttime = gethrtime();
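	/*
	 * Arm (or re-arm) the sync deadman: spa_deadman() will fire if this
	 * sync has not completed within spa_deadman_synctime.
	 */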
10221 	taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
10222 	spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
10223 	    spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
10224 	    NSEC_TO_TICK(spa->spa_deadman_synctime));
10225 
10226 	/*
10227 	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
10228 	 * set spa_deflate if we have no raid-z vdevs.
10229 	 */
10230 	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
10231 	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
10232 		vdev_t *rvd = spa->spa_root_vdev;
10233 
10234 		int i;
10235 		for (i = 0; i < rvd->vdev_children; i++) {
10236 			vd = rvd->vdev_child[i];
10237 			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
10238 				break;
10239 		}
10240 		if (i == rvd->vdev_children) {
10241 			spa->spa_deflate = TRUE;
10242 			VERIFY0(zap_add(spa->spa_meta_objset,
10243 			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
10244 			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
10245 		}
10246 	}
10247 
10248 	spa_sync_adjust_vdev_max_queue_depth(spa);
10249 
10250 	spa_sync_condense_indirect(spa, tx);
10251 
10252 	spa_sync_iterate_to_convergence(spa, tx);
10253 
10254 #ifdef ZFS_DEBUG
10255 	if (!list_is_empty(&spa->spa_config_dirty_list)) {
10256 		/*
10257 		 * Make sure that the number of ZAPs for all the vdevs matches
10258 		 * the number of ZAPs in the per-vdev ZAP list. This check is
10259 		 * only performed when the config is dirty; otherwise there
10260 		 * may be outstanding AVZ operations that weren't completed in
10261 		 * spa_sync_config_object.
10262 		 */
10263 		uint64_t all_vdev_zap_entry_count;
10264 		ASSERT0(zap_count(spa->spa_meta_objset,
10265 		    spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
10266 		ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
10267 		    all_vdev_zap_entry_count);
10268 	}
10269 #endif
10270 
10271 	if (spa->spa_vdev_removal != NULL) {
10272 		ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]);
10273 	}
10274 
10275 	spa_sync_rewrite_vdev_config(spa, tx);
10276 	dmu_tx_commit(tx);
10277 
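	/*
	 * The vdev config and uberblock for this txg are on disk; disarm the
	 * sync deadman armed at the start of spa_sync().
	 */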
10278 	taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
10279 	spa->spa_deadman_tqid = 0;
10280 
10281 	/*
10282 	 * Clear the dirty config list.
10283 	 */
10284 	while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
10285 		vdev_config_clean(vd);
10286 
10287 	/*
10288 	 * Now that the new config has synced transactionally,
10289 	 * let it become visible to the config cache.
10290 	 */
10291 	if (spa->spa_config_syncing != NULL) {
10292 		spa_config_set(spa, spa->spa_config_syncing);
10293 		spa->spa_config_txg = txg;
10294 		spa->spa_config_syncing = NULL;
10295 	}
10296 
10297 	dsl_pool_sync_done(dp, txg);
10298 
10299 	/*
10300 	 * Update usable space statistics.
10301 	 */
10302 	while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
10303 	    != NULL)
10304 		vdev_sync_done(vd, txg);
10305 
10306 	metaslab_class_evict_old(spa->spa_normal_class, txg);
10307 	metaslab_class_evict_old(spa->spa_log_class, txg);
10308 	/* spa_embedded_log_class has only one metaslab per vdev, so it is not evicted here. */
10309 	metaslab_class_evict_old(spa->spa_special_class, txg);
10310 	metaslab_class_evict_old(spa->spa_dedup_class, txg);
10311 
10312 	spa_sync_close_syncing_log_sm(spa);
10313 
10314 	spa_update_dspace(spa);
10315 
10316 	if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON)
10317 		vdev_autotrim_kick(spa);
10318 
10319 	/*
10320 	 * It had better be the case that we didn't dirty anything
10321 	 * since vdev_config_sync().
10322 	 */
10323 	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
10324 	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
10325 	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
10326 
10327 	while (zfs_pause_spa_sync)
10328 		delay(1);
10329 
10330 	spa->spa_sync_pass = 0;
10331 
10332 	/*
10333 	 * Update the last synced uberblock here. We want to do this at
10334 	 * the end of spa_sync() so that consumers of spa_last_synced_txg()
10335 	 * will be guaranteed that all the processing associated with
10336 	 * that txg has been completed.
10337 	 */
10338 	spa->spa_ubsync = spa->spa_uberblock;
10339 	spa_config_exit(spa, SCL_CONFIG, FTAG);
10340 
10341 	spa_handle_ignored_writes(spa);
10342 
10343 	/*
10344 	 * If any async tasks have been requested, kick them off.
10345 	 */
10346 	spa_async_dispatch(spa);
10347 }
10348 
10349 /*
10350  * Sync all pools.  We don't want to hold the namespace lock across these
10351  * operations, so we take a reference on the spa_t and drop the lock during the
10352  * sync.
10353  */
10354 void
10355 spa_sync_allpools(void)
10356 {
10357 	spa_t *spa = NULL;
10358 	mutex_enter(&spa_namespace_lock);
10359 	while ((spa = spa_next(spa)) != NULL) {
10360 		if (spa_state(spa) != POOL_STATE_ACTIVE ||
10361 		    !spa_writeable(spa) || spa_suspended(spa))
10362 			continue;
10363 		spa_open_ref(spa, FTAG);
10364 		mutex_exit(&spa_namespace_lock);
10365 		txg_wait_synced(spa_get_dsl(spa), 0);
10366 		mutex_enter(&spa_namespace_lock);
10367 		spa_close(spa, FTAG);
10368 	}
10369 	mutex_exit(&spa_namespace_lock);
10370 }
10371 
10372 taskq_t *
10373 spa_sync_tq_create(spa_t *spa, const char *name)
10374 {
10375 	kthread_t **kthreads;
10376 
10377 	ASSERT(spa->spa_sync_tq == NULL);
10378 	ASSERT3S(spa->spa_alloc_count, <=, boot_ncpus);
10379 
10380 	/*
10381 	 * - do not allow more allocators than cpus.
10382 	 * - there may be more cpus than allocators.
10383 	 * - do not allow more sync taskq threads than allocators or cpus.
10384 	 */
10385 	int nthreads = spa->spa_alloc_count;
10386 	spa->spa_syncthreads = kmem_zalloc(sizeof (spa_syncthread_info_t) *
10387 	    nthreads, KM_SLEEP);
10388 
10389 	spa->spa_sync_tq = taskq_create_synced(name, nthreads, minclsyspri,
10390 	    nthreads, INT_MAX, TASKQ_PREPOPULATE, &kthreads);
10391 	VERIFY(spa->spa_sync_tq != NULL);
10392 	VERIFY(kthreads != NULL);
10393 
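	/*
	 * Record each sync taskq thread and give it an initial allocator;
	 * spa_acq_allocator() may rebind a thread to a different allocator
	 * later.
	 */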
10394 	spa_syncthread_info_t *ti = spa->spa_syncthreads;
10395 	for (int i = 0; i < nthreads; i++, ti++) {
10396 		ti->sti_thread = kthreads[i];
10397 		ti->sti_allocator = i;
10398 	}
10399 
10400 	kmem_free(kthreads, sizeof (*kthreads) * nthreads);
10401 	return (spa->spa_sync_tq);
10402 }
10403 
10404 void
10405 spa_sync_tq_destroy(spa_t *spa)
10406 {
10407 	ASSERT(spa->spa_sync_tq != NULL);
10408 
10409 	taskq_wait(spa->spa_sync_tq);
10410 	taskq_destroy(spa->spa_sync_tq);
10411 	kmem_free(spa->spa_syncthreads,
10412 	    sizeof (spa_syncthread_info_t) * spa->spa_alloc_count);
10413 	spa->spa_sync_tq = NULL;
10414 }
10415 
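/*
 * Claim an allocator that is not currently in use for the calling sync
 * thread.  A rotor spreads successive claims across allocators, and the
 * choice is recorded in the thread's spa_syncthreads entry so that
 * spa_select_allocator() can find it.  Released via spa_rel_allocator().
 */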
10416 uint_t
10417 spa_acq_allocator(spa_t *spa)
10418 {
10419 	int i;
10420 
10421 	if (spa->spa_alloc_count == 1)
10422 		return (0);
10423 
10424 	mutex_enter(&spa->spa_allocs_use->sau_lock);
10425 	uint_t r = spa->spa_allocs_use->sau_rotor;
10426 	do {
10427 		if (++r == spa->spa_alloc_count)
10428 			r = 0;
10429 	} while (spa->spa_allocs_use->sau_inuse[r]);
10430 	spa->spa_allocs_use->sau_inuse[r] = B_TRUE;
10431 	spa->spa_allocs_use->sau_rotor = r;
10432 	mutex_exit(&spa->spa_allocs_use->sau_lock);
10433 
10434 	spa_syncthread_info_t *ti = spa->spa_syncthreads;
10435 	for (i = 0; i < spa->spa_alloc_count; i++, ti++) {
10436 		if (ti->sti_thread == curthread) {
10437 			ti->sti_allocator = r;
10438 			break;
10439 		}
10440 	}
10441 	ASSERT3S(i, <, spa->spa_alloc_count);
10442 	return (r);
10443 }
10444 
10445 void
10446 spa_rel_allocator(spa_t *spa, uint_t allocator)
10447 {
10448 	if (spa->spa_alloc_count > 1)
10449 		spa->spa_allocs_use->sau_inuse[allocator] = B_FALSE;
10450 }
10451 
10452 void
10453 spa_select_allocator(zio_t *zio)
10454 {
10455 	zbookmark_phys_t *bm = &zio->io_bookmark;
10456 	spa_t *spa = zio->io_spa;
10457 
10458 	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
10459 
10460 	/*
10461 	 * A gang block (for example) may have inherited its parent's
10462 	 * allocator, in which case there is nothing further to do here.
10463 	 */
10464 	if (ZIO_HAS_ALLOCATOR(zio))
10465 		return;
10466 
10467 	ASSERT(spa != NULL);
10468 	ASSERT(bm != NULL);
10469 
10470 	/*
10471 	 * First try to use an allocator assigned to the syncthread, and set
10472 	 * the corresponding write issue taskq for the allocator.
10473 	 * Note, we must have an open pool to do this.
10474 	 */
10475 	if (spa->spa_sync_tq != NULL) {
10476 		spa_syncthread_info_t *ti = spa->spa_syncthreads;
10477 		for (int i = 0; i < spa->spa_alloc_count; i++, ti++) {
10478 			if (ti->sti_thread == curthread) {
10479 				zio->io_allocator = ti->sti_allocator;
10480 				return;
10481 			}
10482 		}
10483 	}
10484 
10485 	/*
10486 	 * We want to try to use as many allocators as possible to help improve
10487 	 * performance, but we also want logically adjacent IOs to be physically
10488 	 * adjacent to improve sequential read performance. We chunk each object
10489 	 * into 2^20 block regions, and then hash based on the objset, object,
10490 	 * level, and region to accomplish both of these goals.
10491 	 */
10492 	uint64_t hv = cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level,
10493 	    bm->zb_blkid >> 20);
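	/*
	 * For example (assuming a 128K block size), blkids 0 through
	 * (1 << 20) - 1 of an object (its first 128GB) share a region value
	 * and therefore hash to the same allocator.
	 */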
10494 
10495 	zio->io_allocator = (uint_t)hv % spa->spa_alloc_count;
10496 }
10497 
10498 /*
10499  * ==========================================================================
10500  * Miscellaneous routines
10501  * ==========================================================================
10502  */
10503 
10504 /*
10505  * Remove all pools in the system.
10506  */
10507 void
10508 spa_evict_all(void)
10509 {
10510 	spa_t *spa;
10511 
10512 	/*
10513 	 * Remove all cached state.  All pools should be closed now,
10514 	 * so every spa in the AVL tree should be unreferenced.
10515 	 */
10516 	mutex_enter(&spa_namespace_lock);
10517 	while ((spa = spa_next(NULL)) != NULL) {
10518 		/*
10519 		 * Stop async tasks.  The async thread may need to detach
10520 		 * a device that's been replaced, which requires grabbing
10521 		 * spa_namespace_lock, so we must drop it here.
10522 		 */
10523 		spa_open_ref(spa, FTAG);
10524 		mutex_exit(&spa_namespace_lock);
10525 		spa_async_suspend(spa);
10526 		mutex_enter(&spa_namespace_lock);
10527 		spa_close(spa, FTAG);
10528 
10529 		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
10530 			spa_unload(spa);
10531 			spa_deactivate(spa);
10532 		}
10533 		spa_remove(spa);
10534 	}
10535 	mutex_exit(&spa_namespace_lock);
10536 }
10537 
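/*
 * Look up a vdev by guid in the pool's vdev tree; if 'aux' is set, also
 * search the L2ARC and spare aux vdev lists.
 */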
10538 vdev_t *
10539 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
10540 {
10541 	vdev_t *vd;
10542 	int i;
10543 
10544 	if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
10545 		return (vd);
10546 
10547 	if (aux) {
10548 		for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
10549 			vd = spa->spa_l2cache.sav_vdevs[i];
10550 			if (vd->vdev_guid == guid)
10551 				return (vd);
10552 		}
10553 
10554 		for (i = 0; i < spa->spa_spares.sav_count; i++) {
10555 			vd = spa->spa_spares.sav_vdevs[i];
10556 			if (vd->vdev_guid == guid)
10557 				return (vd);
10558 		}
10559 	}
10560 
10561 	return (NULL);
10562 }
10563 
10564 void
10565 spa_upgrade(spa_t *spa, uint64_t version)
10566 {
10567 	ASSERT(spa_writeable(spa));
10568 
10569 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
10570 
10571 	/*
10572 	 * This should only be called for a non-faulted pool, and since a
10573 	 * pool with an unsupported (future) version could never have been
10574 	 * opened, the version here must be supported.
10575 	 */
10576 	ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
10577 	ASSERT3U(version, >=, spa->spa_uberblock.ub_version);
10578 
10579 	spa->spa_uberblock.ub_version = version;
10580 	vdev_config_dirty(spa->spa_root_vdev);
10581 
10582 	spa_config_exit(spa, SCL_ALL, FTAG);
10583 
10584 	txg_wait_synced(spa_get_dsl(spa), 0);
10585 }
10586 
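/*
 * Check whether 'guid' matches one of the aux vdevs (configured or pending)
 * tracked by 'sav'.
 */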
10587 static boolean_t
10588 spa_has_aux_vdev(spa_t *spa, uint64_t guid, spa_aux_vdev_t *sav)
10589 {
10590 	(void) spa;
10591 	int i;
10592 	uint64_t vdev_guid;
10593 
10594 	for (i = 0; i < sav->sav_count; i++)
10595 		if (sav->sav_vdevs[i]->vdev_guid == guid)
10596 			return (B_TRUE);
10597 
10598 	for (i = 0; i < sav->sav_npending; i++) {
10599 		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
10600 		    &vdev_guid) == 0 && vdev_guid == guid)
10601 			return (B_TRUE);
10602 	}
10603 
10604 	return (B_FALSE);
10605 }
10606 
10607 boolean_t
10608 spa_has_l2cache(spa_t *spa, uint64_t guid)
10609 {
10610 	return (spa_has_aux_vdev(spa, guid, &spa->spa_l2cache));
10611 }
10612 
10613 boolean_t
10614 spa_has_spare(spa_t *spa, uint64_t guid)
10615 {
10616 	return (spa_has_aux_vdev(spa, guid, &spa->spa_spares));
10617 }
10618 
10619 /*
10620  * Check if a pool has an active shared spare device.
10621  * Note: an active spare's reference count is 2: as a spare and as a replacement.
10622  */
10623 static boolean_t
10624 spa_has_active_shared_spare(spa_t *spa)
10625 {
10626 	int i, refcnt;
10627 	uint64_t pool;
10628 	spa_aux_vdev_t *sav = &spa->spa_spares;
10629 
10630 	for (i = 0; i < sav->sav_count; i++) {
10631 		if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
10632 		    &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
10633 		    refcnt > 2)
10634 			return (B_TRUE);
10635 	}
10636 
10637 	return (B_FALSE);
10638 }
10639 
10640 uint64_t
10641 spa_total_metaslabs(spa_t *spa)
10642 {
10643 	vdev_t *rvd = spa->spa_root_vdev;
10644 
10645 	uint64_t m = 0;
10646 	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
10647 		vdev_t *vd = rvd->vdev_child[c];
10648 		if (!vdev_is_concrete(vd))
10649 			continue;
10650 		m += vd->vdev_ms_count;
10651 	}
10652 	return (m);
10653 }
10654 
10655 /*
10656  * Notify any waiting threads that some activity has switched from being in-
10657  * progress to not-in-progress so that the thread can wake up and determine
10658  * whether it is finished waiting.
10659  */
10660 void
10661 spa_notify_waiters(spa_t *spa)
10662 {
10663 	/*
10664 	 * Acquiring spa_activities_lock here prevents the cv_broadcast from
10665 	 * happening between the waiting thread's check and cv_wait.
10666 	 */
10667 	mutex_enter(&spa->spa_activities_lock);
10668 	cv_broadcast(&spa->spa_activities_cv);
10669 	mutex_exit(&spa->spa_activities_lock);
10670 }
10671 
10672 /*
10673  * Notify any waiting threads that the pool is exporting, and then block until
10674  * they are finished using the spa_t.
10675  */
10676 void
10677 spa_wake_waiters(spa_t *spa)
10678 {
10679 	mutex_enter(&spa->spa_activities_lock);
10680 	spa->spa_waiters_cancel = B_TRUE;
10681 	cv_broadcast(&spa->spa_activities_cv);
10682 	while (spa->spa_waiters != 0)
10683 		cv_wait(&spa->spa_waiters_cv, &spa->spa_activities_lock);
10684 	spa->spa_waiters_cancel = B_FALSE;
10685 	mutex_exit(&spa->spa_activities_lock);
10686 }
10687 
10688 /* Whether the vdev or any of its descendants are being initialized/trimmed. */
10689 static boolean_t
10690 spa_vdev_activity_in_progress_impl(vdev_t *vd, zpool_wait_activity_t activity)
10691 {
10692 	spa_t *spa = vd->vdev_spa;
10693 
10694 	ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER));
10695 	ASSERT(MUTEX_HELD(&spa->spa_activities_lock));
10696 	ASSERT(activity == ZPOOL_WAIT_INITIALIZE ||
10697 	    activity == ZPOOL_WAIT_TRIM);
10698 
10699 	kmutex_t *lock = activity == ZPOOL_WAIT_INITIALIZE ?
10700 	    &vd->vdev_initialize_lock : &vd->vdev_trim_lock;
10701 
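	/*
	 * Per the lock ordering described in the "Locking for waiting
	 * threads" comment below, drop spa_activities_lock before taking
	 * the activity-specific vdev lock, then reacquire it.
	 */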
10702 	mutex_exit(&spa->spa_activities_lock);
10703 	mutex_enter(lock);
10704 	mutex_enter(&spa->spa_activities_lock);
10705 
10706 	boolean_t in_progress = (activity == ZPOOL_WAIT_INITIALIZE) ?
10707 	    (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) :
10708 	    (vd->vdev_trim_state == VDEV_TRIM_ACTIVE);
10709 	mutex_exit(lock);
10710 
10711 	if (in_progress)
10712 		return (B_TRUE);
10713 
10714 	for (int i = 0; i < vd->vdev_children; i++) {
10715 		if (spa_vdev_activity_in_progress_impl(vd->vdev_child[i],
10716 		    activity))
10717 			return (B_TRUE);
10718 	}
10719 
10720 	return (B_FALSE);
10721 }
10722 
10723 /*
10724  * If use_guid is true, this checks whether the vdev specified by guid is
10725  * being initialized/trimmed. Otherwise, it checks whether any vdev in the pool
10726  * is being initialized/trimmed. The caller must hold the config lock and
10727  * spa_activities_lock.
10728  */
10729 static int
10730 spa_vdev_activity_in_progress(spa_t *spa, boolean_t use_guid, uint64_t guid,
10731     zpool_wait_activity_t activity, boolean_t *in_progress)
10732 {
10733 	mutex_exit(&spa->spa_activities_lock);
10734 	spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
10735 	mutex_enter(&spa->spa_activities_lock);
10736 
10737 	vdev_t *vd;
10738 	if (use_guid) {
10739 		vd = spa_lookup_by_guid(spa, guid, B_FALSE);
10740 		if (vd == NULL || !vd->vdev_ops->vdev_op_leaf) {
10741 			spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
10742 			return (EINVAL);
10743 		}
10744 	} else {
10745 		vd = spa->spa_root_vdev;
10746 	}
10747 
10748 	*in_progress = spa_vdev_activity_in_progress_impl(vd, activity);
10749 
10750 	spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
10751 	return (0);
10752 }
10753 
10754 /*
10755  * Locking for waiting threads
10756  * ---------------------------
10757  *
10758  * Waiting threads need a way to check whether a given activity is in progress,
10759  * and then, if it is, wait for it to complete. Each activity will have some
10760  * in-memory representation of the relevant on-disk state which can be used to
10761  * determine whether or not the activity is in progress. The in-memory state and
10762  * the locking used to protect it will be different for each activity, and may
10763  * not be suitable for use with a cvar (e.g., some state is protected by the
10764  * config lock). To allow waiting threads to wait without any races, another
10765  * lock, spa_activities_lock, is used.
10766  *
10767  * When the state is checked, both the activity-specific lock (if there is one)
10768  * and spa_activities_lock are held. In some cases, the activity-specific lock
10769  * is acquired explicitly (e.g. the config lock). In others, the locking is
10770  * internal to some check (e.g. bpobj_is_empty). After checking, the waiting
10771  * thread releases the activity-specific lock and, if the activity is in
10772  * progress, then cv_waits using spa_activities_lock.
10773  *
10774  * The waiting thread is woken when another thread, one completing some
10775  * activity, updates the state of the activity and then calls
10776  * spa_notify_waiters, which will cv_broadcast. This 'completing' thread only
10777  * needs to hold its activity-specific lock when updating the state, and this
10778  * lock can (but doesn't have to) be dropped before calling spa_notify_waiters.
10779  *
10780  * Because spa_notify_waiters acquires spa_activities_lock before broadcasting,
10781  * and because it is held when the waiting thread checks the state of the
10782  * activity, it can never be the case that the completing thread both updates
10783  * the activity state and cv_broadcasts in between the waiting thread's check
10784  * and cv_wait. Thus, a waiting thread can never miss a wakeup.
10785  *
10786  * In order to prevent deadlock, when the waiting thread does its check, in some
10787  * cases it will temporarily drop spa_activities_lock in order to acquire the
10788  * activity-specific lock. The order in which spa_activities_lock and the
10789  * activity specific lock are acquired in the waiting thread is determined by
10790  * the order in which they are acquired in the completing thread; if the
10791  * completing thread calls spa_notify_waiters with the activity-specific lock
10792  * held, then the waiting thread must also acquire the activity-specific lock
10793  * first.
10794  */
10795 
10796 static int
10797 spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity,
10798     boolean_t use_tag, uint64_t tag, boolean_t *in_progress)
10799 {
10800 	int error = 0;
10801 
10802 	ASSERT(MUTEX_HELD(&spa->spa_activities_lock));
10803 
10804 	switch (activity) {
10805 	case ZPOOL_WAIT_CKPT_DISCARD:
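		/*
		 * A checkpoint discard is in progress when the checkpoint
		 * feature is still active but the checkpoint entry has
		 * already been removed from the MOS directory.
		 */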
10806 		*in_progress =
10807 		    (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT) &&
10808 		    zap_contains(spa_meta_objset(spa),
10809 		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT) ==
10810 		    ENOENT);
10811 		break;
10812 	case ZPOOL_WAIT_FREE:
10813 		*in_progress = ((spa_version(spa) >= SPA_VERSION_DEADLISTS &&
10814 		    !bpobj_is_empty(&spa->spa_dsl_pool->dp_free_bpobj)) ||
10815 		    spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY) ||
10816 		    spa_livelist_delete_check(spa));
10817 		break;
10818 	case ZPOOL_WAIT_INITIALIZE:
10819 	case ZPOOL_WAIT_TRIM:
10820 		error = spa_vdev_activity_in_progress(spa, use_tag, tag,
10821 		    activity, in_progress);
10822 		break;
10823 	case ZPOOL_WAIT_REPLACE:
10824 		mutex_exit(&spa->spa_activities_lock);
10825 		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
10826 		mutex_enter(&spa->spa_activities_lock);
10827 
10828 		*in_progress = vdev_replace_in_progress(spa->spa_root_vdev);
10829 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
10830 		break;
10831 	case ZPOOL_WAIT_REMOVE:
10832 		*in_progress = (spa->spa_removing_phys.sr_state ==
10833 		    DSS_SCANNING);
10834 		break;
10835 	case ZPOOL_WAIT_RESILVER:
10836 		*in_progress = vdev_rebuild_active(spa->spa_root_vdev);
10837 		if (*in_progress)
10838 			break;
10839 		zfs_fallthrough;
10840 	case ZPOOL_WAIT_SCRUB:
10841 	{
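		/*
		 * Scrub and resilver are both dsl_scan activities; tell them
		 * apart by scn_func, and treat a paused scrub as not in
		 * progress.
		 */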
10842 		boolean_t scanning, paused, is_scrub;
10843 		dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
10844 
10845 		is_scrub = (scn->scn_phys.scn_func == POOL_SCAN_SCRUB);
10846 		scanning = (scn->scn_phys.scn_state == DSS_SCANNING);
10847 		paused = dsl_scan_is_paused_scrub(scn);
10848 		*in_progress = (scanning && !paused &&
10849 		    is_scrub == (activity == ZPOOL_WAIT_SCRUB));
10850 		break;
10851 	}
10852 	case ZPOOL_WAIT_RAIDZ_EXPAND:
10853 	{
10854 		vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
10855 		*in_progress = (vre != NULL && vre->vre_state == DSS_SCANNING);
10856 		break;
10857 	}
10858 	default:
10859 		panic("unrecognized value for activity %d", activity);
10860 	}
10861 
10862 	return (error);
10863 }
10864 
10865 static int
10866 spa_wait_common(const char *pool, zpool_wait_activity_t activity,
10867     boolean_t use_tag, uint64_t tag, boolean_t *waited)
10868 {
10869 	/*
10870 	 * The tag is used to distinguish between instances of an activity.
10871 	 * 'initialize' and 'trim' are the only activities that we use this for.
10872 	 * The other activities can only have a single instance in progress in a
10873 	 * pool at one time, making the tag unnecessary.
10874 	 *
10875 	 * There can be multiple devices being replaced at once, but since they
10876 	 * all finish once resilvering finishes, we don't bother keeping track
10877 	 * of them individually, we just wait for them all to finish.
10878 	 */
10879 	if (use_tag && activity != ZPOOL_WAIT_INITIALIZE &&
10880 	    activity != ZPOOL_WAIT_TRIM)
10881 		return (EINVAL);
10882 
10883 	if (activity < 0 || activity >= ZPOOL_WAIT_NUM_ACTIVITIES)
10884 		return (EINVAL);
10885 
10886 	spa_t *spa;
10887 	int error = spa_open(pool, &spa, FTAG);
10888 	if (error != 0)
10889 		return (error);
10890 
10891 	/*
10892 	 * Increment the spa's waiter count so that we can call spa_close and
10893 	 * still ensure that the spa_t doesn't get freed before this thread is
10894 	 * finished with it when the pool is exported. We want to call spa_close
10895 	 * before we start waiting because otherwise the additional ref would
10896 	 * prevent the pool from being exported or destroyed throughout the
10897 	 * potentially long wait.
10898 	 */
10899 	mutex_enter(&spa->spa_activities_lock);
10900 	spa->spa_waiters++;
10901 	spa_close(spa, FTAG);
10902 
10903 	*waited = B_FALSE;
10904 	for (;;) {
10905 		boolean_t in_progress;
10906 		error = spa_activity_in_progress(spa, activity, use_tag, tag,
10907 		    &in_progress);
10908 
10909 		if (error || !in_progress || spa->spa_waiters_cancel)
10910 			break;
10911 
10912 		*waited = B_TRUE;
10913 
10914 		if (cv_wait_sig(&spa->spa_activities_cv,
10915 		    &spa->spa_activities_lock) == 0) {
10916 			error = EINTR;
10917 			break;
10918 		}
10919 	}
10920 
10921 	spa->spa_waiters--;
10922 	cv_signal(&spa->spa_waiters_cv);
10923 	mutex_exit(&spa->spa_activities_lock);
10924 
10925 	return (error);
10926 }
10927 
10928 /*
10929  * Wait for a particular instance of the specified activity to complete, where
10930  * the instance is identified by 'tag'
10931  * the instance is identified by 'tag'.
10932 int
10933 spa_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag,
10934     boolean_t *waited)
10935 {
10936 	return (spa_wait_common(pool, activity, B_TRUE, tag, waited));
10937 }
10938 
10939 /*
10940  * Wait for all instances of the specified activity to complete.
10941  */
10942 int
10943 spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited)
10944 {
10946 	return (spa_wait_common(pool, activity, B_FALSE, 0, waited));
10947 }
10948 
10949 sysevent_t *
10950 spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
10951 {
10952 	sysevent_t *ev = NULL;
10953 #ifdef _KERNEL
10954 	nvlist_t *resource;
10955 
10956 	resource = zfs_event_create(spa, vd, FM_SYSEVENT_CLASS, name, hist_nvl);
10957 	if (resource) {
10958 		ev = kmem_alloc(sizeof (sysevent_t), KM_SLEEP);
10959 		ev->resource = resource;
10960 	}
10961 #else
10962 	(void) spa, (void) vd, (void) hist_nvl, (void) name;
10963 #endif
10964 	return (ev);
10965 }
10966 
10967 void
10968 spa_event_post(sysevent_t *ev)
10969 {
10970 #ifdef _KERNEL
10971 	if (ev) {
10972 		zfs_zevent_post(ev->resource, NULL, zfs_zevent_post_cb);
10973 		kmem_free(ev, sizeof (*ev));
10974 	}
10975 #else
10976 	(void) ev;
10977 #endif
10978 }
10979 
10980 /*
10981  * Post a zevent corresponding to the given sysevent.  The 'name' must be one
10982  * of the event definitions in sys/sysevent/eventdefs.h.  The payload will be
10983  * filled in from the spa and (optionally) the vdev.  This doesn't do anything
10984  * in the userland libzpool, as we don't want consumers to misinterpret ztest
10985  * or zdb as real changes.
10986  */
10987 void
10988 spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
10989 {
10990 	spa_event_post(spa_event_create(spa, vd, hist_nvl, name));
10991 }
10992 
10993 /* state manipulation functions */
10994 EXPORT_SYMBOL(spa_open);
10995 EXPORT_SYMBOL(spa_open_rewind);
10996 EXPORT_SYMBOL(spa_get_stats);
10997 EXPORT_SYMBOL(spa_create);
10998 EXPORT_SYMBOL(spa_import);
10999 EXPORT_SYMBOL(spa_tryimport);
11000 EXPORT_SYMBOL(spa_destroy);
11001 EXPORT_SYMBOL(spa_export);
11002 EXPORT_SYMBOL(spa_reset);
11003 EXPORT_SYMBOL(spa_async_request);
11004 EXPORT_SYMBOL(spa_async_suspend);
11005 EXPORT_SYMBOL(spa_async_resume);
11006 EXPORT_SYMBOL(spa_inject_addref);
11007 EXPORT_SYMBOL(spa_inject_delref);
11008 EXPORT_SYMBOL(spa_scan_stat_init);
11009 EXPORT_SYMBOL(spa_scan_get_stats);
11010 
11011 /* device manipulation */
11012 EXPORT_SYMBOL(spa_vdev_add);
11013 EXPORT_SYMBOL(spa_vdev_attach);
11014 EXPORT_SYMBOL(spa_vdev_detach);
11015 EXPORT_SYMBOL(spa_vdev_setpath);
11016 EXPORT_SYMBOL(spa_vdev_setfru);
11017 EXPORT_SYMBOL(spa_vdev_split_mirror);
11018 
11019 /* spare state (which is global across all pools) */
11020 EXPORT_SYMBOL(spa_spare_add);
11021 EXPORT_SYMBOL(spa_spare_remove);
11022 EXPORT_SYMBOL(spa_spare_exists);
11023 EXPORT_SYMBOL(spa_spare_activate);
11024 
11025 /* L2ARC state (which is global across all pools) */
11026 EXPORT_SYMBOL(spa_l2cache_add);
11027 EXPORT_SYMBOL(spa_l2cache_remove);
11028 EXPORT_SYMBOL(spa_l2cache_exists);
11029 EXPORT_SYMBOL(spa_l2cache_activate);
11030 EXPORT_SYMBOL(spa_l2cache_drop);
11031 
11032 /* scanning */
11033 EXPORT_SYMBOL(spa_scan);
11034 EXPORT_SYMBOL(spa_scan_range);
11035 EXPORT_SYMBOL(spa_scan_stop);
11036 
11037 /* spa syncing */
11038 EXPORT_SYMBOL(spa_sync); /* only for DMU use */
11039 EXPORT_SYMBOL(spa_sync_allpools);
11040 
11041 /* properties */
11042 EXPORT_SYMBOL(spa_prop_set);
11043 EXPORT_SYMBOL(spa_prop_get);
11044 EXPORT_SYMBOL(spa_prop_clear_bootfs);
11045 
11046 /* asynchronous event notification */
11047 EXPORT_SYMBOL(spa_event_notify);
11048 
11049 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_pct, UINT, ZMOD_RW,
11050 	"Percentage of CPUs to run a metaslab preload taskq");
11051 
11052 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, UINT, ZMOD_RW,
11053 	"log2 fraction of arc that can be used by inflight I/Os when "
11054 	"verifying pool during import");
11055 
11056 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_metadata, INT, ZMOD_RW,
11057 	"Set to traverse metadata on pool import");
11058 
11059 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW,
11060 	"Set to traverse data on pool import");
11061 
11062 ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW,
11063 	"Print vdev tree to zfs_dbgmsg during pool import");
11064 
11065 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RW,
11066 	"Percentage of CPUs to run an IO worker thread");
11067 
11068 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RW,
11069 	"Number of threads per IO worker taskqueue");
11070 
11071 ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, U64, ZMOD_RW,
11072 	"Allow importing pool with up to this number of missing top-level "
11073 	"vdevs (in read-only mode)");
11074 
11075 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT,
11076 	ZMOD_RW, "Set the livelist condense zthr to pause");
11077 
11078 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT,
11079 	ZMOD_RW, "Set the livelist condense synctask to pause");
11080 
11081 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_cancel,
11082 	INT, ZMOD_RW,
11083 	"Whether livelist condensing was canceled in the synctask");
11084 
11085 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel,
11086 	INT, ZMOD_RW,
11087 	"Whether livelist condensing was canceled in the zthr function");
11088 
11089 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT,
11090 	ZMOD_RW,
11091 	"Whether extra ALLOC blkptrs were added to a livelist entry while it "
11092 	"was being condensed");
11093 
11094 #ifdef _KERNEL
11095 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read,
11096 	spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RW,
11097 	"Configure IO queues for read IO");
11098 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write,
11099 	spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RW,
11100 	"Configure IO queues for write IO");
11101 #endif
11102 
11103 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_write_tpq, UINT, ZMOD_RW,
11104 	"Number of CPUs per write issue taskq");
11105