xref: /titanic_51/usr/src/uts/common/fs/zfs/spa.c (revision 5420b8055570e39d22735d20973c2979c9adba94)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * This file contains all the routines used when modifying on-disk SPA state.
29  * This includes opening, importing, destroying, exporting a pool, and syncing a
30  * pool.
31  */
32 
33 #include <sys/zfs_context.h>
34 #include <sys/fm/fs/zfs.h>
35 #include <sys/spa_impl.h>
36 #include <sys/zio.h>
37 #include <sys/zio_checksum.h>
38 #include <sys/zio_compress.h>
39 #include <sys/dmu.h>
40 #include <sys/dmu_tx.h>
41 #include <sys/zap.h>
42 #include <sys/zil.h>
43 #include <sys/vdev_impl.h>
44 #include <sys/metaslab.h>
45 #include <sys/uberblock_impl.h>
46 #include <sys/txg.h>
47 #include <sys/avl.h>
48 #include <sys/dmu_traverse.h>
49 #include <sys/dmu_objset.h>
50 #include <sys/unique.h>
51 #include <sys/dsl_pool.h>
52 #include <sys/dsl_dataset.h>
53 #include <sys/dsl_dir.h>
54 #include <sys/dsl_prop.h>
55 #include <sys/dsl_synctask.h>
56 #include <sys/fs/zfs.h>
57 #include <sys/arc.h>
58 #include <sys/callb.h>
59 #include <sys/systeminfo.h>
60 #include <sys/sunddi.h>
61 #include <sys/spa_boot.h>
62 #include <sys/zfs_ioctl.h>
63 
64 #ifdef	_KERNEL
65 #include <sys/zone.h>
66 #endif	/* _KERNEL */
67 
68 #include "zfs_prop.h"
69 #include "zfs_comutil.h"
70 
71 enum zti_modes {
72 	zti_mode_fixed,			/* value is # of threads (min 1) */
73 	zti_mode_online_percent,	/* value is % of online CPUs */
74 	zti_mode_tune,			/* fill from zio_taskq_tune_* */
75 	zti_nmodes
76 };
77 
78 #define	ZTI_THREAD_FIX(n)	{ zti_mode_fixed, (n) }
79 #define	ZTI_THREAD_PCT(n)	{ zti_mode_online_percent, (n) }
80 #define	ZTI_THREAD_TUNE		{ zti_mode_tune, 0 }
81 
82 #define	ZTI_THREAD_ONE		ZTI_THREAD_FIX(1)
83 
84 typedef struct zio_taskq_info {
85 	const char *zti_name;
86 	struct {
87 		enum zti_modes zti_mode;
88 		uint_t zti_value;
89 	} zti_nthreads[ZIO_TASKQ_TYPES];
90 } zio_taskq_info_t;
91 
92 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
93 				"issue",		"intr"
94 };
95 
96 const zio_taskq_info_t zio_taskqs[ZIO_TYPES] = {
97 	/*			ISSUE			INTR		*/
98 	{ "spa_zio_null",	{ ZTI_THREAD_ONE,	ZTI_THREAD_ONE } },
99 	{ "spa_zio_read",	{ ZTI_THREAD_FIX(8),	ZTI_THREAD_TUNE } },
100 	{ "spa_zio_write",	{ ZTI_THREAD_TUNE,	ZTI_THREAD_FIX(8) } },
101 	{ "spa_zio_free",	{ ZTI_THREAD_ONE,	ZTI_THREAD_ONE } },
102 	{ "spa_zio_claim",	{ ZTI_THREAD_ONE,	ZTI_THREAD_ONE } },
103 	{ "spa_zio_ioctl",	{ ZTI_THREAD_ONE,	ZTI_THREAD_ONE } },
104 };
105 
106 enum zti_modes zio_taskq_tune_mode = zti_mode_online_percent;
107 uint_t zio_taskq_tune_value = 80;	/* #threads = 80% of # online CPUs */
108 
109 static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);
110 static boolean_t spa_has_active_shared_spare(spa_t *spa);
111 
112 /*
113  * ==========================================================================
114  * SPA properties routines
115  * ==========================================================================
116  */
117 
118 /*
119  * Add a (source=src, propname=propval) list to an nvlist.
120  */
121 static void
122 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
123     uint64_t intval, zprop_source_t src)
124 {
125 	const char *propname = zpool_prop_to_name(prop);
126 	nvlist_t *propval;
127 
128 	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
129 	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
130 
131 	if (strval != NULL)
132 		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
133 	else
134 		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);
135 
136 	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
137 	nvlist_free(propval);
138 }
139 
140 /*
141  * Get property values from the spa configuration.
142  */
143 static void
144 spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
145 {
146 	uint64_t size;
147 	uint64_t used;
148 	uint64_t cap, version;
149 	zprop_source_t src = ZPROP_SRC_NONE;
150 	spa_config_dirent_t *dp;
151 
152 	ASSERT(MUTEX_HELD(&spa->spa_props_lock));
153 
154 	if (spa->spa_root_vdev != NULL) {
155 		size = spa_get_space(spa);
156 		used = spa_get_alloc(spa);
157 		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
158 		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
159 		spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src);
160 		spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL,
161 		    size - used, src);
162 
163 		cap = (size == 0) ? 0 : (used * 100 / size);
164 		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
165 
166 		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
167 		    spa->spa_root_vdev->vdev_state, src);
168 
169 		version = spa_version(spa);
170 		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
171 			src = ZPROP_SRC_DEFAULT;
172 		else
173 			src = ZPROP_SRC_LOCAL;
174 		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
175 	}
176 
177 	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
178 
179 	if (spa->spa_root != NULL)
180 		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
181 		    0, ZPROP_SRC_LOCAL);
182 
183 	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
184 		if (dp->scd_path == NULL) {
185 			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
186 			    "none", 0, ZPROP_SRC_LOCAL);
187 		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
188 			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
189 			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
190 		}
191 	}
192 }
193 
194 /*
195  * Get zpool property values.
196  */
197 int
198 spa_prop_get(spa_t *spa, nvlist_t **nvp)
199 {
200 	zap_cursor_t zc;
201 	zap_attribute_t za;
202 	objset_t *mos = spa->spa_meta_objset;
203 	int err;
204 
205 	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
206 
207 	mutex_enter(&spa->spa_props_lock);
208 
209 	/*
210 	 * Get properties from the spa config.
211 	 */
212 	spa_prop_get_config(spa, nvp);
213 
214 	/* If no pool property object, no more props to get. */
215 	if (spa->spa_pool_props_object == 0) {
216 		mutex_exit(&spa->spa_props_lock);
217 		return (0);
218 	}
219 
220 	/*
221 	 * Get properties from the MOS pool property object.
222 	 */
223 	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
224 	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
225 	    zap_cursor_advance(&zc)) {
226 		uint64_t intval = 0;
227 		char *strval = NULL;
228 		zprop_source_t src = ZPROP_SRC_DEFAULT;
229 		zpool_prop_t prop;
230 
231 		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
232 			continue;
233 
234 		switch (za.za_integer_length) {
235 		case 8:
236 			/* integer property */
237 			if (za.za_first_integer !=
238 			    zpool_prop_default_numeric(prop))
239 				src = ZPROP_SRC_LOCAL;
240 
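			/*
			 * bootfs is stored on disk as a dataset object number;
			 * translate it back into a dataset name for the caller.
			 */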
241 			if (prop == ZPOOL_PROP_BOOTFS) {
242 				dsl_pool_t *dp;
243 				dsl_dataset_t *ds = NULL;
244 
245 				dp = spa_get_dsl(spa);
246 				rw_enter(&dp->dp_config_rwlock, RW_READER);
247 				if (err = dsl_dataset_hold_obj(dp,
248 				    za.za_first_integer, FTAG, &ds)) {
249 					rw_exit(&dp->dp_config_rwlock);
250 					break;
251 				}
252 
253 				strval = kmem_alloc(
254 				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
255 				    KM_SLEEP);
256 				dsl_dataset_name(ds, strval);
257 				dsl_dataset_rele(ds, FTAG);
258 				rw_exit(&dp->dp_config_rwlock);
259 			} else {
260 				strval = NULL;
261 				intval = za.za_first_integer;
262 			}
263 
264 			spa_prop_add_list(*nvp, prop, strval, intval, src);
265 
266 			if (strval != NULL)
267 				kmem_free(strval,
268 				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);
269 
270 			break;
271 
272 		case 1:
273 			/* string property */
274 			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
275 			err = zap_lookup(mos, spa->spa_pool_props_object,
276 			    za.za_name, 1, za.za_num_integers, strval);
277 			if (err) {
278 				kmem_free(strval, za.za_num_integers);
279 				break;
280 			}
281 			spa_prop_add_list(*nvp, prop, strval, 0, src);
282 			kmem_free(strval, za.za_num_integers);
283 			break;
284 
285 		default:
286 			break;
287 		}
288 	}
289 	zap_cursor_fini(&zc);
290 	mutex_exit(&spa->spa_props_lock);
291 out:
292 	if (err && err != ENOENT) {
293 		nvlist_free(*nvp);
294 		*nvp = NULL;
295 		return (err);
296 	}
297 
298 	return (0);
299 }
300 
301 /*
302  * Validate the given pool properties nvlist and modify the list
303  * for the property values to be set.
304  */
305 static int
306 spa_prop_validate(spa_t *spa, nvlist_t *props)
307 {
308 	nvpair_t *elem;
309 	int error = 0, reset_bootfs = 0;
310 	uint64_t objnum;
311 
312 	elem = NULL;
313 	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
314 		zpool_prop_t prop;
315 		char *propname, *strval;
316 		uint64_t intval;
317 		objset_t *os;
318 		char *slash;
319 
320 		propname = nvpair_name(elem);
321 
322 		if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL)
323 			return (EINVAL);
324 
325 		switch (prop) {
326 		case ZPOOL_PROP_VERSION:
327 			error = nvpair_value_uint64(elem, &intval);
328 			if (!error &&
329 			    (intval < spa_version(spa) || intval > SPA_VERSION))
330 				error = EINVAL;
331 			break;
332 
333 		case ZPOOL_PROP_DELEGATION:
334 		case ZPOOL_PROP_AUTOREPLACE:
335 		case ZPOOL_PROP_LISTSNAPS:
336 		case ZPOOL_PROP_AUTOEXPAND:
337 			error = nvpair_value_uint64(elem, &intval);
338 			if (!error && intval > 1)
339 				error = EINVAL;
340 			break;
341 
342 		case ZPOOL_PROP_BOOTFS:
343 			/*
344 			 * If the pool version is less than SPA_VERSION_BOOTFS,
345 			 * or the pool is still being created (version == 0),
346 			 * the bootfs property cannot be set.
347 			 */
348 			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
349 				error = ENOTSUP;
350 				break;
351 			}
352 
353 			/*
354 			 * Make sure the vdev config is bootable
355 			 */
356 			if (!vdev_is_bootable(spa->spa_root_vdev)) {
357 				error = ENOTSUP;
358 				break;
359 			}
360 
361 			reset_bootfs = 1;
362 
363 			error = nvpair_value_string(elem, &strval);
364 
365 			if (!error) {
366 				uint64_t compress;
367 
368 				if (strval == NULL || strval[0] == '\0') {
369 					objnum = zpool_prop_default_numeric(
370 					    ZPOOL_PROP_BOOTFS);
371 					break;
372 				}
373 
374 				if (error = dmu_objset_hold(strval, FTAG, &os))
375 					break;
376 
377 				/* Must be ZPL and not gzip compressed. */
378 
379 				if (dmu_objset_type(os) != DMU_OST_ZFS) {
380 					error = ENOTSUP;
381 				} else if ((error = dsl_prop_get_integer(strval,
382 				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
383 				    &compress, NULL)) == 0 &&
384 				    !BOOTFS_COMPRESS_VALID(compress)) {
385 					error = ENOTSUP;
386 				} else {
387 					objnum = dmu_objset_id(os);
388 				}
389 				dmu_objset_rele(os, FTAG);
390 			}
391 			break;
392 
393 		case ZPOOL_PROP_FAILUREMODE:
394 			error = nvpair_value_uint64(elem, &intval);
395 			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
396 			    intval > ZIO_FAILURE_MODE_PANIC))
397 				error = EINVAL;
398 
399 			/*
400 			 * This is a special case which only occurs when
401 			 * the pool has completely failed. This allows
402 			 * the user to change the in-core failmode property
403 			 * without syncing it out to disk (I/Os might
404 			 * currently be blocked). We do this by returning
405 			 * EIO to the caller (spa_prop_set) to trick it
406 			 * into thinking we encountered a property validation
407 			 * error.
408 			 */
409 			if (!error && spa_suspended(spa)) {
410 				spa->spa_failmode = intval;
411 				error = EIO;
412 			}
413 			break;
414 
415 		case ZPOOL_PROP_CACHEFILE:
416 			if ((error = nvpair_value_string(elem, &strval)) != 0)
417 				break;
418 
419 			if (strval[0] == '\0')
420 				break;
421 
422 			if (strcmp(strval, "none") == 0)
423 				break;
424 
425 			if (strval[0] != '/') {
426 				error = EINVAL;
427 				break;
428 			}
429 
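			/*
			 * The final component of the path must name a real
			 * file: reject values ending in '/', "/." or "/..".
			 */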
430 			slash = strrchr(strval, '/');
431 			ASSERT(slash != NULL);
432 
433 			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
434 			    strcmp(slash, "/..") == 0)
435 				error = EINVAL;
436 			break;
437 		}
438 
439 		if (error)
440 			break;
441 	}
442 
443 	if (!error && reset_bootfs) {
444 		error = nvlist_remove(props,
445 		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
446 
447 		if (!error) {
448 			error = nvlist_add_uint64(props,
449 			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
450 		}
451 	}
452 
453 	return (error);
454 }
455 
456 void
457 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
458 {
459 	char *cachefile;
460 	spa_config_dirent_t *dp;
461 
462 	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
463 	    &cachefile) != 0)
464 		return;
465 
466 	dp = kmem_alloc(sizeof (spa_config_dirent_t),
467 	    KM_SLEEP);
468 
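	/*
	 * Map the cachefile value onto the config dirent: an empty string
	 * means the default cache file, "none" means no cache file at all,
	 * and anything else is taken as an explicit path.
	 */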
469 	if (cachefile[0] == '\0')
470 		dp->scd_path = spa_strdup(spa_config_path);
471 	else if (strcmp(cachefile, "none") == 0)
472 		dp->scd_path = NULL;
473 	else
474 		dp->scd_path = spa_strdup(cachefile);
475 
476 	list_insert_head(&spa->spa_config_list, dp);
477 	if (need_sync)
478 		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
479 }
480 
481 int
482 spa_prop_set(spa_t *spa, nvlist_t *nvp)
483 {
484 	int error;
485 	nvpair_t *elem;
486 	boolean_t need_sync = B_FALSE;
487 	zpool_prop_t prop;
488 
489 	if ((error = spa_prop_validate(spa, nvp)) != 0)
490 		return (error);
491 
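	/*
	 * Only properties stored in the MOS need a sync task; cachefile and
	 * altroot are handled entirely in-core.
	 */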
492 	elem = NULL;
493 	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
494 		if ((prop = zpool_name_to_prop(
495 		    nvpair_name(elem))) == ZPROP_INVAL)
496 			return (EINVAL);
497 
498 		if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT)
499 			continue;
500 
501 		need_sync = B_TRUE;
502 		break;
503 	}
504 
505 	if (need_sync)
506 		return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
507 		    spa, nvp, 3));
508 	else
509 		return (0);
510 }
511 
512 /*
513  * If the bootfs property value is dsobj, clear it.
514  */
515 void
516 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
517 {
518 	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
519 		VERIFY(zap_remove(spa->spa_meta_objset,
520 		    spa->spa_pool_props_object,
521 		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
522 		spa->spa_bootfs = 0;
523 	}
524 }
525 
526 /*
527  * ==========================================================================
528  * SPA state manipulation (open/create/destroy/import/export)
529  * ==========================================================================
530  */
531 
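/*
 * Comparison routine for the AVL error lists: order entries by a raw byte
 * comparison of their bookmarks.
 */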
532 static int
533 spa_error_entry_compare(const void *a, const void *b)
534 {
535 	spa_error_entry_t *sa = (spa_error_entry_t *)a;
536 	spa_error_entry_t *sb = (spa_error_entry_t *)b;
537 	int ret;
538 
539 	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
540 	    sizeof (zbookmark_t));
541 
542 	if (ret < 0)
543 		return (-1);
544 	else if (ret > 0)
545 		return (1);
546 	else
547 		return (0);
548 }
549 
550 /*
551  * Utility function which retrieves copies of the current logs and
552  * re-initializes them in the process.
553  */
554 void
555 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
556 {
557 	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
558 
559 	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
560 	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
561 
562 	avl_create(&spa->spa_errlist_scrub,
563 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
564 	    offsetof(spa_error_entry_t, se_avl));
565 	avl_create(&spa->spa_errlist_last,
566 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
567 	    offsetof(spa_error_entry_t, se_avl));
568 }
569 
570 /*
571  * Activate an uninitialized pool.
572  */
573 static void
574 spa_activate(spa_t *spa, int mode)
575 {
576 	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
577 
578 	spa->spa_state = POOL_STATE_ACTIVE;
579 	spa->spa_mode = mode;
580 
581 	spa->spa_normal_class = metaslab_class_create(zfs_metaslab_ops);
582 	spa->spa_log_class = metaslab_class_create(zfs_metaslab_ops);
583 
584 	for (int t = 0; t < ZIO_TYPES; t++) {
585 		const zio_taskq_info_t *ztip = &zio_taskqs[t];
586 		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
587 			enum zti_modes mode = ztip->zti_nthreads[q].zti_mode;
588 			uint_t value = ztip->zti_nthreads[q].zti_value;
589 			char name[32];
590 
591 			(void) snprintf(name, sizeof (name),
592 			    "%s_%s", ztip->zti_name, zio_taskq_types[q]);
593 
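			/*
			 * Resolve the tunable mode: pick up the current
			 * zio_taskq_tune_* settings, and if the tunable itself
			 * is (mis)set to zti_mode_tune, fall back to
			 * online-percent.
			 */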
594 			if (mode == zti_mode_tune) {
595 				mode = zio_taskq_tune_mode;
596 				value = zio_taskq_tune_value;
597 				if (mode == zti_mode_tune)
598 					mode = zti_mode_online_percent;
599 			}
600 
601 			switch (mode) {
602 			case zti_mode_fixed:
603 				ASSERT3U(value, >=, 1);
604 				value = MAX(value, 1);
605 
606 				spa->spa_zio_taskq[t][q] = taskq_create(name,
607 				    value, maxclsyspri, 50, INT_MAX,
608 				    TASKQ_PREPOPULATE);
609 				break;
610 
611 			case zti_mode_online_percent:
612 				spa->spa_zio_taskq[t][q] = taskq_create(name,
613 				    value, maxclsyspri, 50, INT_MAX,
614 				    TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);
615 				break;
616 
617 			case zti_mode_tune:
618 			default:
619 				panic("unrecognized mode for "
620 				    "zio_taskqs[%u]->zti_nthreads[%u] (%u:%u) "
621 				    "in spa_activate()",
622 				    t, q, mode, value);
623 				break;
624 			}
625 		}
626 	}
627 
628 	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
629 	    offsetof(vdev_t, vdev_config_dirty_node));
630 	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
631 	    offsetof(vdev_t, vdev_state_dirty_node));
632 
633 	txg_list_create(&spa->spa_vdev_txg_list,
634 	    offsetof(struct vdev, vdev_txg_node));
635 
636 	avl_create(&spa->spa_errlist_scrub,
637 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
638 	    offsetof(spa_error_entry_t, se_avl));
639 	avl_create(&spa->spa_errlist_last,
640 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
641 	    offsetof(spa_error_entry_t, se_avl));
642 }
643 
644 /*
645  * Opposite of spa_activate().
646  */
647 static void
648 spa_deactivate(spa_t *spa)
649 {
650 	ASSERT(spa->spa_sync_on == B_FALSE);
651 	ASSERT(spa->spa_dsl_pool == NULL);
652 	ASSERT(spa->spa_root_vdev == NULL);
653 	ASSERT(spa->spa_async_zio_root == NULL);
654 	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
655 
656 	txg_list_destroy(&spa->spa_vdev_txg_list);
657 
658 	list_destroy(&spa->spa_config_dirty_list);
659 	list_destroy(&spa->spa_state_dirty_list);
660 
661 	for (int t = 0; t < ZIO_TYPES; t++) {
662 		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
663 			taskq_destroy(spa->spa_zio_taskq[t][q]);
664 			spa->spa_zio_taskq[t][q] = NULL;
665 		}
666 	}
667 
668 	metaslab_class_destroy(spa->spa_normal_class);
669 	spa->spa_normal_class = NULL;
670 
671 	metaslab_class_destroy(spa->spa_log_class);
672 	spa->spa_log_class = NULL;
673 
674 	/*
675 	 * If this was part of an import or the open otherwise failed, we may
676 	 * still have errors left in the queues.  Empty them just in case.
677 	 */
678 	spa_errlog_drain(spa);
679 
680 	avl_destroy(&spa->spa_errlist_scrub);
681 	avl_destroy(&spa->spa_errlist_last);
682 
683 	spa->spa_state = POOL_STATE_UNINITIALIZED;
684 }
685 
686 /*
687  * Verify a pool configuration, and construct the vdev tree appropriately.  This
688  * will create all the necessary vdevs in the appropriate layout, with each vdev
689  * in the CLOSED state.  This will prep the pool before open/creation/import.
690  * All vdev validation is done by the vdev_alloc() routine.
691  */
692 static int
693 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
694     uint_t id, int atype)
695 {
696 	nvlist_t **child;
697 	uint_t children;
698 	int error;
699 
700 	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
701 		return (error);
702 
703 	if ((*vdp)->vdev_ops->vdev_op_leaf)
704 		return (0);
705 
706 	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
707 	    &child, &children);
708 
709 	if (error == ENOENT)
710 		return (0);
711 
712 	if (error) {
713 		vdev_free(*vdp);
714 		*vdp = NULL;
715 		return (EINVAL);
716 	}
717 
718 	for (int c = 0; c < children; c++) {
719 		vdev_t *vd;
720 		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
721 		    atype)) != 0) {
722 			vdev_free(*vdp);
723 			*vdp = NULL;
724 			return (error);
725 		}
726 	}
727 
728 	ASSERT(*vdp != NULL);
729 
730 	return (0);
731 }
732 
733 /*
734  * Opposite of spa_load().
735  */
736 static void
737 spa_unload(spa_t *spa)
738 {
739 	int i;
740 
741 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
742 
743 	/*
744 	 * Stop async tasks.
745 	 */
746 	spa_async_suspend(spa);
747 
748 	/*
749 	 * Stop syncing.
750 	 */
751 	if (spa->spa_sync_on) {
752 		txg_sync_stop(spa->spa_dsl_pool);
753 		spa->spa_sync_on = B_FALSE;
754 	}
755 
756 	/*
757 	 * Wait for any outstanding async I/O to complete.
758 	 */
759 	if (spa->spa_async_zio_root != NULL) {
760 		(void) zio_wait(spa->spa_async_zio_root);
761 		spa->spa_async_zio_root = NULL;
762 	}
763 
764 	/*
765 	 * Close the dsl pool.
766 	 */
767 	if (spa->spa_dsl_pool) {
768 		dsl_pool_close(spa->spa_dsl_pool);
769 		spa->spa_dsl_pool = NULL;
770 	}
771 
772 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
773 
774 	/*
775 	 * Drop and purge level 2 cache
776 	 */
777 	spa_l2cache_drop(spa);
778 
779 	/*
780 	 * Close all vdevs.
781 	 */
782 	if (spa->spa_root_vdev)
783 		vdev_free(spa->spa_root_vdev);
784 	ASSERT(spa->spa_root_vdev == NULL);
785 
786 	for (i = 0; i < spa->spa_spares.sav_count; i++)
787 		vdev_free(spa->spa_spares.sav_vdevs[i]);
788 	if (spa->spa_spares.sav_vdevs) {
789 		kmem_free(spa->spa_spares.sav_vdevs,
790 		    spa->spa_spares.sav_count * sizeof (void *));
791 		spa->spa_spares.sav_vdevs = NULL;
792 	}
793 	if (spa->spa_spares.sav_config) {
794 		nvlist_free(spa->spa_spares.sav_config);
795 		spa->spa_spares.sav_config = NULL;
796 	}
797 	spa->spa_spares.sav_count = 0;
798 
799 	for (i = 0; i < spa->spa_l2cache.sav_count; i++)
800 		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
801 	if (spa->spa_l2cache.sav_vdevs) {
802 		kmem_free(spa->spa_l2cache.sav_vdevs,
803 		    spa->spa_l2cache.sav_count * sizeof (void *));
804 		spa->spa_l2cache.sav_vdevs = NULL;
805 	}
806 	if (spa->spa_l2cache.sav_config) {
807 		nvlist_free(spa->spa_l2cache.sav_config);
808 		spa->spa_l2cache.sav_config = NULL;
809 	}
810 	spa->spa_l2cache.sav_count = 0;
811 
812 	spa->spa_async_suspended = 0;
813 
814 	spa_config_exit(spa, SCL_ALL, FTAG);
815 }
816 
817 /*
818  * Load (or re-load) the current list of vdevs describing the active spares for
819  * this pool.  When this is called, we have some form of basic information in
820  * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
821  * then re-generate a more complete list including status information.
822  */
823 static void
824 spa_load_spares(spa_t *spa)
825 {
826 	nvlist_t **spares;
827 	uint_t nspares;
828 	int i;
829 	vdev_t *vd, *tvd;
830 
831 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
832 
833 	/*
834 	 * First, close and free any existing spare vdevs.
835 	 */
836 	for (i = 0; i < spa->spa_spares.sav_count; i++) {
837 		vd = spa->spa_spares.sav_vdevs[i];
838 
839 		/* Undo the call to spa_activate() below */
840 		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
841 		    B_FALSE)) != NULL && tvd->vdev_isspare)
842 			spa_spare_remove(tvd);
843 		vdev_close(vd);
844 		vdev_free(vd);
845 	}
846 
847 	if (spa->spa_spares.sav_vdevs)
848 		kmem_free(spa->spa_spares.sav_vdevs,
849 		    spa->spa_spares.sav_count * sizeof (void *));
850 
851 	if (spa->spa_spares.sav_config == NULL)
852 		nspares = 0;
853 	else
854 		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
855 		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
856 
857 	spa->spa_spares.sav_count = (int)nspares;
858 	spa->spa_spares.sav_vdevs = NULL;
859 
860 	if (nspares == 0)
861 		return;
862 
863 	/*
864 	 * Construct the array of vdevs, opening them to get status in the
865  * process.  For each spare, there are potentially two different vdev_t
866 	 * structures associated with it: one in the list of spares (used only
867 	 * for basic validation purposes) and one in the active vdev
868 	 * configuration (if it's spared in).  During this phase we open and
869 	 * validate each vdev on the spare list.  If the vdev also exists in the
870 	 * active configuration, then we also mark this vdev as an active spare.
871 	 */
872 	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
873 	    KM_SLEEP);
874 	for (i = 0; i < spa->spa_spares.sav_count; i++) {
875 		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
876 		    VDEV_ALLOC_SPARE) == 0);
877 		ASSERT(vd != NULL);
878 
879 		spa->spa_spares.sav_vdevs[i] = vd;
880 
881 		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
882 		    B_FALSE)) != NULL) {
883 			if (!tvd->vdev_isspare)
884 				spa_spare_add(tvd);
885 
886 			/*
887 			 * We only mark the spare active if we were successfully
888 			 * able to load the vdev.  Otherwise, importing a pool
889 			 * with a bad active spare would result in strange
890  * behavior, because multiple pools would think the spare
891 			 * is actively in use.
892 			 *
893 			 * There is a vulnerability here to an equally bizarre
894 			 * circumstance, where a dead active spare is later
895 			 * brought back to life (onlined or otherwise).  Given
896 			 * the rarity of this scenario, and the extra complexity
897 			 * it adds, we ignore the possibility.
898 			 */
899 			if (!vdev_is_dead(tvd))
900 				spa_spare_activate(tvd);
901 		}
902 
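		/*
		 * Each spare is its own top-level vdev and hangs off the
		 * pool's spare aux list rather than the main vdev tree.
		 */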
903 		vd->vdev_top = vd;
904 		vd->vdev_aux = &spa->spa_spares;
905 
906 		if (vdev_open(vd) != 0)
907 			continue;
908 
909 		if (vdev_validate_aux(vd) == 0)
910 			spa_spare_add(vd);
911 	}
912 
913 	/*
914 	 * Recompute the stashed list of spares, with status information
915 	 * this time.
916 	 */
917 	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
918 	    DATA_TYPE_NVLIST_ARRAY) == 0);
919 
920 	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
921 	    KM_SLEEP);
922 	for (i = 0; i < spa->spa_spares.sav_count; i++)
923 		spares[i] = vdev_config_generate(spa,
924 		    spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE);
925 	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
926 	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
927 	for (i = 0; i < spa->spa_spares.sav_count; i++)
928 		nvlist_free(spares[i]);
929 	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
930 }
931 
932 /*
933  * Load (or re-load) the current list of vdevs describing the active l2cache for
934  * this pool.  When this is called, we have some form of basic information in
935  * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
936  * then re-generate a more complete list including status information.
937  * Devices which are already active have their details maintained, and are
938  * not re-opened.
939  */
940 static void
941 spa_load_l2cache(spa_t *spa)
942 {
943 	nvlist_t **l2cache;
944 	uint_t nl2cache;
945 	int i, j, oldnvdevs;
946 	uint64_t guid;
947 	vdev_t *vd, **oldvdevs, **newvdevs;
948 	spa_aux_vdev_t *sav = &spa->spa_l2cache;
949 
950 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
951 
952 	if (sav->sav_config != NULL) {
953 		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
954 		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
955 		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
956 	} else {
957 		nl2cache = 0;
958 	}
959 
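	/*
	 * Detach the current l2cache vdev list; it is rebuilt below, re-using
	 * any devices that are still present in the new config.
	 */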
960 	oldvdevs = sav->sav_vdevs;
961 	oldnvdevs = sav->sav_count;
962 	sav->sav_vdevs = NULL;
963 	sav->sav_count = 0;
964 
965 	/*
966 	 * Process new nvlist of vdevs.
967 	 */
968 	for (i = 0; i < nl2cache; i++) {
969 		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
970 		    &guid) == 0);
971 
972 		newvdevs[i] = NULL;
973 		for (j = 0; j < oldnvdevs; j++) {
974 			vd = oldvdevs[j];
975 			if (vd != NULL && guid == vd->vdev_guid) {
976 				/*
977 				 * Retain previous vdev for add/remove ops.
978 				 */
979 				newvdevs[i] = vd;
980 				oldvdevs[j] = NULL;
981 				break;
982 			}
983 		}
984 
985 		if (newvdevs[i] == NULL) {
986 			/*
987 			 * Create new vdev
988 			 */
989 			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
990 			    VDEV_ALLOC_L2CACHE) == 0);
991 			ASSERT(vd != NULL);
992 			newvdevs[i] = vd;
993 
994 			/*
995 			 * Commit this vdev as an l2cache device,
996 			 * even if it fails to open.
997 			 */
998 			spa_l2cache_add(vd);
999 
1000 			vd->vdev_top = vd;
1001 			vd->vdev_aux = sav;
1002 
1003 			spa_l2cache_activate(vd);
1004 
1005 			if (vdev_open(vd) != 0)
1006 				continue;
1007 
1008 			(void) vdev_validate_aux(vd);
1009 
1010 			if (!vdev_is_dead(vd))
1011 				l2arc_add_vdev(spa, vd);
1012 		}
1013 	}
1014 
1015 	/*
1016 	 * Purge vdevs that were dropped
1017 	 */
1018 	for (i = 0; i < oldnvdevs; i++) {
1019 		uint64_t pool;
1020 
1021 		vd = oldvdevs[i];
1022 		if (vd != NULL) {
1023 			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
1024 			    pool != 0ULL && l2arc_vdev_present(vd))
1025 				l2arc_remove_vdev(vd);
1026 			(void) vdev_close(vd);
1027 			spa_l2cache_remove(vd);
1028 		}
1029 	}
1030 
1031 	if (oldvdevs)
1032 		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
1033 
1034 	if (sav->sav_config == NULL)
1035 		goto out;
1036 
1037 	sav->sav_vdevs = newvdevs;
1038 	sav->sav_count = (int)nl2cache;
1039 
1040 	/*
1041 	 * Recompute the stashed list of l2cache devices, with status
1042 	 * information this time.
1043 	 */
1044 	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
1045 	    DATA_TYPE_NVLIST_ARRAY) == 0);
1046 
1047 	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
1048 	for (i = 0; i < sav->sav_count; i++)
1049 		l2cache[i] = vdev_config_generate(spa,
1050 		    sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE);
1051 	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
1052 	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
1053 out:
1054 	for (i = 0; i < sav->sav_count; i++)
1055 		nvlist_free(l2cache[i]);
1056 	if (sav->sav_count)
1057 		kmem_free(l2cache, sav->sav_count * sizeof (void *));
1058 }
1059 
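/*
 * Read a packed nvlist from the MOS: the object's bonus buffer holds the
 * packed size, and the object data holds the packed nvlist itself.
 */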
1060 static int
1061 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
1062 {
1063 	dmu_buf_t *db;
1064 	char *packed = NULL;
1065 	size_t nvsize = 0;
1066 	int error;
1067 	*value = NULL;
1068 
1069 	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
1070 	nvsize = *(uint64_t *)db->db_data;
1071 	dmu_buf_rele(db, FTAG);
1072 
1073 	packed = kmem_alloc(nvsize, KM_SLEEP);
1074 	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
1075 	    DMU_READ_PREFETCH);
1076 	if (error == 0)
1077 		error = nvlist_unpack(packed, nvsize, value, 0);
1078 	kmem_free(packed, nvsize);
1079 
1080 	return (error);
1081 }
1082 
1083 /*
1084  * Checks to see if the given vdev could not be opened, in which case we post a
1085  * sysevent to notify the autoreplace code that the device has been removed.
1086  */
1087 static void
1088 spa_check_removed(vdev_t *vd)
1089 {
1090 	for (int c = 0; c < vd->vdev_children; c++)
1091 		spa_check_removed(vd->vdev_child[c]);
1092 
1093 	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
1094 		zfs_post_autoreplace(vd->vdev_spa, vd);
1095 		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
1096 	}
1097 }
1098 
1099 /*
1100  * Load the slog device state from the config object since it's possible
1101  * that the label does not contain the most up-to-date information.
1102  */
1103 void
1104 spa_load_log_state(spa_t *spa)
1105 {
1106 	nvlist_t *nv, *nvroot, **child;
1107 	uint64_t is_log;
1108 	uint_t children;
1109 	vdev_t *rvd = spa->spa_root_vdev;
1110 
1111 	VERIFY(load_nvlist(spa, spa->spa_config_object, &nv) == 0);
1112 	VERIFY(nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
1113 	VERIFY(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
1114 	    &child, &children) == 0);
1115 
1116 	for (int c = 0; c < children; c++) {
1117 		vdev_t *tvd = rvd->vdev_child[c];
1118 
1119 		if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG,
1120 		    &is_log) == 0 && is_log)
1121 			vdev_load_log_state(tvd, child[c]);
1122 	}
1123 	nvlist_free(nv);
1124 }
1125 
1126 /*
1127  * Check for missing log devices
1128  */
1129 int
1130 spa_check_logs(spa_t *spa)
1131 {
1132 	switch (spa->spa_log_state) {
1133 	case SPA_LOG_MISSING:
1134 		/* need to recheck in case slog has been restored */
1135 	case SPA_LOG_UNKNOWN:
1136 		if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL,
1137 		    DS_FIND_CHILDREN)) {
1138 			spa->spa_log_state = SPA_LOG_MISSING;
1139 			return (1);
1140 		}
1141 		break;
1142 	}
1143 	return (0);
1144 }
1145 
1146 /*
1147  * Load an existing storage pool, using the pool's builtin spa_config as a
1148  * source of configuration information.
1149  */
1150 static int
1151 spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
1152 {
1153 	int error = 0;
1154 	nvlist_t *nvroot = NULL;
1155 	vdev_t *rvd;
1156 	uberblock_t *ub = &spa->spa_uberblock;
1157 	uint64_t config_cache_txg = spa->spa_config_txg;
1158 	uint64_t pool_guid;
1159 	uint64_t version;
1160 	uint64_t autoreplace = 0;
1161 	int orig_mode = spa->spa_mode;
1162 	char *ereport = FM_EREPORT_ZFS_POOL;
1163 
1164 	/*
1165 	 * If this is an untrusted config, access the pool in read-only mode.
1166 	 * This prevents things like resilvering recently removed devices.
1167 	 */
1168 	if (!mosconfig)
1169 		spa->spa_mode = FREAD;
1170 
1171 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
1172 
1173 	spa->spa_load_state = state;
1174 
1175 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
1176 	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
1177 		error = EINVAL;
1178 		goto out;
1179 	}
1180 
1181 	/*
1182 	 * Versioning wasn't explicitly added to the label until later, so if
1183 	 * it's not present treat it as the initial version.
1184 	 */
1185 	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
1186 		version = SPA_VERSION_INITIAL;
1187 
1188 	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
1189 	    &spa->spa_config_txg);
1190 
1191 	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
1192 	    spa_guid_exists(pool_guid, 0)) {
1193 		error = EEXIST;
1194 		goto out;
1195 	}
1196 
1197 	spa->spa_load_guid = pool_guid;
1198 
1199 	/*
1200 	 * Create "The Godfather" zio to hold all async IOs
1201 	 */
1202 	spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
1203 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
1204 
1205 	/*
1206 	 * Parse the configuration into a vdev tree.  We explicitly set the
1207 	 * value that will be returned by spa_version() since parsing the
1208 	 * configuration requires knowing the version number.
1209 	 */
1210 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1211 	spa->spa_ubsync.ub_version = version;
1212 	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
1213 	spa_config_exit(spa, SCL_ALL, FTAG);
1214 
1215 	if (error != 0)
1216 		goto out;
1217 
1218 	ASSERT(spa->spa_root_vdev == rvd);
1219 	ASSERT(spa_guid(spa) == pool_guid);
1220 
1221 	/*
1222 	 * Try to open all vdevs, loading each label in the process.
1223 	 */
1224 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1225 	error = vdev_open(rvd);
1226 	spa_config_exit(spa, SCL_ALL, FTAG);
1227 	if (error != 0)
1228 		goto out;
1229 
1230 	/*
1231 	 * We need to validate the vdev labels against the configuration that
1232 	 * we have in hand, which is dependent on the setting of mosconfig. If
1233 	 * mosconfig is true then we're validating the vdev labels based on
1234 	 * that config. Otherwise, we're validating against the cached config
1235 	 * (zpool.cache) that was read when we loaded the zfs module, and then
1236 	 * later we will recursively call spa_load() and validate against
1237 	 * the vdev config.
1238 	 */
1239 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1240 	error = vdev_validate(rvd);
1241 	spa_config_exit(spa, SCL_ALL, FTAG);
1242 	if (error != 0)
1243 		goto out;
1244 
1245 	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
1246 		error = ENXIO;
1247 		goto out;
1248 	}
1249 
1250 	/*
1251 	 * Find the best uberblock.
1252 	 */
1253 	vdev_uberblock_load(NULL, rvd, ub);
1254 
1255 	/*
1256 	 * If we weren't able to find a single valid uberblock, return failure.
1257 	 */
1258 	if (ub->ub_txg == 0) {
1259 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1260 		    VDEV_AUX_CORRUPT_DATA);
1261 		error = ENXIO;
1262 		goto out;
1263 	}
1264 
1265 	/*
1266 	 * If the pool is newer than the code, we can't open it.
1267 	 */
1268 	if (ub->ub_version > SPA_VERSION) {
1269 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1270 		    VDEV_AUX_VERSION_NEWER);
1271 		error = ENOTSUP;
1272 		goto out;
1273 	}
1274 
1275 	/*
1276 	 * If the vdev guid sum doesn't match the uberblock, we have an
1277 	 * incomplete configuration.
1278 	 */
1279 	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
1280 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1281 		    VDEV_AUX_BAD_GUID_SUM);
1282 		error = ENXIO;
1283 		goto out;
1284 	}
1285 
1286 	/*
1287 	 * Initialize internal SPA structures.
1288 	 */
1289 	spa->spa_state = POOL_STATE_ACTIVE;
1290 	spa->spa_ubsync = spa->spa_uberblock;
1291 	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
1292 	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
1293 	if (error) {
1294 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1295 		    VDEV_AUX_CORRUPT_DATA);
1296 		goto out;
1297 	}
1298 	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
1299 
1300 	if (zap_lookup(spa->spa_meta_objset,
1301 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
1302 	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
1303 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1304 		    VDEV_AUX_CORRUPT_DATA);
1305 		error = EIO;
1306 		goto out;
1307 	}
1308 
1309 	if (!mosconfig) {
1310 		nvlist_t *newconfig;
1311 		uint64_t hostid;
1312 
1313 		if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
1314 			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1315 			    VDEV_AUX_CORRUPT_DATA);
1316 			error = EIO;
1317 			goto out;
1318 		}
1319 
1320 		if (!spa_is_root(spa) && nvlist_lookup_uint64(newconfig,
1321 		    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
1322 			char *hostname;
1323 			unsigned long myhostid = 0;
1324 
1325 			VERIFY(nvlist_lookup_string(newconfig,
1326 			    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
1327 
1328 #ifdef	_KERNEL
1329 			myhostid = zone_get_hostid(NULL);
1330 #else	/* _KERNEL */
1331 			/*
1332 			 * We're emulating the system's hostid in userland, so
1333 			 * we can't use zone_get_hostid().
1334 			 */
1335 			(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
1336 #endif	/* _KERNEL */
1337 			if (hostid != 0 && myhostid != 0 &&
1338 			    hostid != myhostid) {
1339 				cmn_err(CE_WARN, "pool '%s' could not be "
1340 				    "loaded as it was last accessed by "
1341 				    "another system (host: %s hostid: 0x%lx). "
1342 				    "See: http://www.sun.com/msg/ZFS-8000-EY",
1343 				    spa_name(spa), hostname,
1344 				    (unsigned long)hostid);
1345 				error = EBADF;
1346 				goto out;
1347 			}
1348 		}
1349 
1350 		spa_config_set(spa, newconfig);
1351 		spa_unload(spa);
1352 		spa_deactivate(spa);
1353 		spa_activate(spa, orig_mode);
1354 
1355 		return (spa_load(spa, newconfig, state, B_TRUE));
1356 	}
1357 
1358 	if (zap_lookup(spa->spa_meta_objset,
1359 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
1360 	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
1361 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1362 		    VDEV_AUX_CORRUPT_DATA);
1363 		error = EIO;
1364 		goto out;
1365 	}
1366 
1367 	/*
1368 	 * Load the bit that tells us to use the new accounting function
1369 	 * (raid-z deflation).  If we have an older pool, this will not
1370 	 * be present.
1371 	 */
1372 	error = zap_lookup(spa->spa_meta_objset,
1373 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
1374 	    sizeof (uint64_t), 1, &spa->spa_deflate);
1375 	if (error != 0 && error != ENOENT) {
1376 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1377 		    VDEV_AUX_CORRUPT_DATA);
1378 		error = EIO;
1379 		goto out;
1380 	}
1381 
1382 	/*
1383 	 * Load the persistent error log.  If we have an older pool, this will
1384 	 * not be present.
1385 	 */
1386 	error = zap_lookup(spa->spa_meta_objset,
1387 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
1388 	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
1389 	if (error != 0 && error != ENOENT) {
1390 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1391 		    VDEV_AUX_CORRUPT_DATA);
1392 		error = EIO;
1393 		goto out;
1394 	}
1395 
1396 	error = zap_lookup(spa->spa_meta_objset,
1397 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
1398 	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
1399 	if (error != 0 && error != ENOENT) {
1400 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1401 		    VDEV_AUX_CORRUPT_DATA);
1402 		error = EIO;
1403 		goto out;
1404 	}
1405 
1406 	/*
1407 	 * Load the history object.  If we have an older pool, this
1408 	 * will not be present.
1409 	 */
1410 	error = zap_lookup(spa->spa_meta_objset,
1411 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
1412 	    sizeof (uint64_t), 1, &spa->spa_history);
1413 	if (error != 0 && error != ENOENT) {
1414 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1415 		    VDEV_AUX_CORRUPT_DATA);
1416 		error = EIO;
1417 		goto out;
1418 	}
1419 
1420 	/*
1421 	 * Load any hot spares for this pool.
1422 	 */
1423 	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1424 	    DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares.sav_object);
1425 	if (error != 0 && error != ENOENT) {
1426 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1427 		    VDEV_AUX_CORRUPT_DATA);
1428 		error = EIO;
1429 		goto out;
1430 	}
1431 	if (error == 0) {
1432 		ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
1433 		if (load_nvlist(spa, spa->spa_spares.sav_object,
1434 		    &spa->spa_spares.sav_config) != 0) {
1435 			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1436 			    VDEV_AUX_CORRUPT_DATA);
1437 			error = EIO;
1438 			goto out;
1439 		}
1440 
1441 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1442 		spa_load_spares(spa);
1443 		spa_config_exit(spa, SCL_ALL, FTAG);
1444 	}
1445 
1446 	/*
1447 	 * Load any level 2 ARC devices for this pool.
1448 	 */
1449 	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1450 	    DMU_POOL_L2CACHE, sizeof (uint64_t), 1,
1451 	    &spa->spa_l2cache.sav_object);
1452 	if (error != 0 && error != ENOENT) {
1453 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1454 		    VDEV_AUX_CORRUPT_DATA);
1455 		error = EIO;
1456 		goto out;
1457 	}
1458 	if (error == 0) {
1459 		ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
1460 		if (load_nvlist(spa, spa->spa_l2cache.sav_object,
1461 		    &spa->spa_l2cache.sav_config) != 0) {
1462 			vdev_set_state(rvd, B_TRUE,
1463 			    VDEV_STATE_CANT_OPEN,
1464 			    VDEV_AUX_CORRUPT_DATA);
1465 			error = EIO;
1466 			goto out;
1467 		}
1468 
1469 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1470 		spa_load_l2cache(spa);
1471 		spa_config_exit(spa, SCL_ALL, FTAG);
1472 	}
1473 
1474 	spa_load_log_state(spa);
1475 
1476 	if (spa_check_logs(spa)) {
1477 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1478 		    VDEV_AUX_BAD_LOG);
1479 		error = ENXIO;
1480 		ereport = FM_EREPORT_ZFS_LOG_REPLAY;
1481 		goto out;
1482 	}
1483 
1484 
1485 	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
1486 
1487 	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1488 	    DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object);
1489 
1490 	if (error && error != ENOENT) {
1491 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1492 		    VDEV_AUX_CORRUPT_DATA);
1493 		error = EIO;
1494 		goto out;
1495 	}
1496 
1497 	if (error == 0) {
1498 		(void) zap_lookup(spa->spa_meta_objset,
1499 		    spa->spa_pool_props_object,
1500 		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS),
1501 		    sizeof (uint64_t), 1, &spa->spa_bootfs);
1502 		(void) zap_lookup(spa->spa_meta_objset,
1503 		    spa->spa_pool_props_object,
1504 		    zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE),
1505 		    sizeof (uint64_t), 1, &autoreplace);
1506 		(void) zap_lookup(spa->spa_meta_objset,
1507 		    spa->spa_pool_props_object,
1508 		    zpool_prop_to_name(ZPOOL_PROP_DELEGATION),
1509 		    sizeof (uint64_t), 1, &spa->spa_delegation);
1510 		(void) zap_lookup(spa->spa_meta_objset,
1511 		    spa->spa_pool_props_object,
1512 		    zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE),
1513 		    sizeof (uint64_t), 1, &spa->spa_failmode);
1514 		(void) zap_lookup(spa->spa_meta_objset,
1515 		    spa->spa_pool_props_object,
1516 		    zpool_prop_to_name(ZPOOL_PROP_AUTOEXPAND),
1517 		    sizeof (uint64_t), 1, &spa->spa_autoexpand);
1518 	}
1519 
1520 	/*
1521 	 * If the 'autoreplace' property is set, then post a resource notifying
1522 	 * the ZFS DE that it should not issue any faults for unopenable
1523 	 * devices.  We also iterate over the vdevs, and post a sysevent for any
1524 	 * unopenable vdevs so that the normal autoreplace handler can take
1525 	 * over.
1526 	 */
1527 	if (autoreplace && state != SPA_LOAD_TRYIMPORT)
1528 		spa_check_removed(spa->spa_root_vdev);
1529 
1530 	/*
1531 	 * Load the vdev state for all toplevel vdevs.
1532 	 */
1533 	vdev_load(rvd);
1534 
1535 	/*
1536 	 * Propagate the leaf DTLs we just loaded all the way up the tree.
1537 	 */
1538 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1539 	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
1540 	spa_config_exit(spa, SCL_ALL, FTAG);
1541 
1542 	/*
1543 	 * Check the state of the root vdev.  If it can't be opened, it
1544 	 * indicates one or more toplevel vdevs are faulted.
1545 	 */
1546 	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
1547 		error = ENXIO;
1548 		goto out;
1549 	}
1550 
1551 	if (spa_writeable(spa)) {
1552 		dmu_tx_t *tx;
1553 		int need_update = B_FALSE;
1554 
1555 		ASSERT(state != SPA_LOAD_TRYIMPORT);
1556 
1557 		/*
1558 		 * Claim log blocks that haven't been committed yet.
1559 		 * This must all happen in a single txg.
1560 		 */
1561 		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
1562 		    spa_first_txg(spa));
1563 		(void) dmu_objset_find(spa_name(spa),
1564 		    zil_claim, tx, DS_FIND_CHILDREN);
1565 		dmu_tx_commit(tx);
1566 
1567 		spa->spa_log_state = SPA_LOG_GOOD;
1568 		spa->spa_sync_on = B_TRUE;
1569 		txg_sync_start(spa->spa_dsl_pool);
1570 
1571 		/*
1572 		 * Wait for all claims to sync.
1573 		 */
1574 		txg_wait_synced(spa->spa_dsl_pool, 0);
1575 
1576 		/*
1577 		 * If the config cache is stale, or we have uninitialized
1578 		 * metaslabs (see spa_vdev_add()), then update the config.
1579 		 *
1580 		 * If spa_load_verbatim is true, trust the current
1581 		 * in-core spa_config and update the disk labels.
1582 		 */
1583 		if (config_cache_txg != spa->spa_config_txg ||
1584 		    state == SPA_LOAD_IMPORT || spa->spa_load_verbatim)
1585 			need_update = B_TRUE;
1586 
1587 		for (int c = 0; c < rvd->vdev_children; c++)
1588 			if (rvd->vdev_child[c]->vdev_ms_array == 0)
1589 				need_update = B_TRUE;
1590 
1591 		/*
1592 		 * Update the config cache asynchronously in case we're the
1593 		 * root pool, in which case the config cache isn't writable yet.
1594 		 */
1595 		if (need_update)
1596 			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
1597 
1598 		/*
1599 		 * Check all DTLs to see if anything needs resilvering.
1600 		 */
1601 		if (vdev_resilver_needed(rvd, NULL, NULL))
1602 			spa_async_request(spa, SPA_ASYNC_RESILVER);
1603 
1604 		/*
1605 		 * Delete any inconsistent datasets.
1606 		 */
1607 		(void) dmu_objset_find(spa_name(spa),
1608 		    dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
1609 	}
1610 
1611 	error = 0;
1612 out:
1613 	spa->spa_minref = refcount_count(&spa->spa_refcount);
1614 	if (error && error != EBADF)
1615 		zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
1616 	spa->spa_load_state = SPA_LOAD_NONE;
1617 	spa->spa_ena = 0;
1618 
1619 	return (error);
1620 }
1621 
1622 /*
1623  * Pool Open/Import
1624  *
1625  * The import case is identical to an open except that the configuration is sent
1626  * down from userland, instead of grabbed from the configuration cache.  For the
1627  * case of an open, the pool configuration will exist in the
1628  * POOL_STATE_UNINITIALIZED state.
1629  *
1630  * The stats information (gen/count/ustats) is used to gather vdev statistics at
1631  * the same time we open the pool, without having to keep the spa_t around in
1632  * some ambiguous state.
1633  */
1634 static int
1635 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
1636 {
1637 	spa_t *spa;
1638 	int error;
1639 	int locked = B_FALSE;
1640 
1641 	*spapp = NULL;
1642 
1643 	/*
1644 	 * As disgusting as this is, we need to support recursive calls to this
1645 	 * function because dsl_dir_open() is called during spa_load(), and ends
1646 	 * up calling spa_open() again.  The real fix is to figure out how to
1647 	 * avoid dsl_dir_open() calling this in the first place.
1648 	 */
1649 	if (mutex_owner(&spa_namespace_lock) != curthread) {
1650 		mutex_enter(&spa_namespace_lock);
1651 		locked = B_TRUE;
1652 	}
1653 
1654 	if ((spa = spa_lookup(pool)) == NULL) {
1655 		if (locked)
1656 			mutex_exit(&spa_namespace_lock);
1657 		return (ENOENT);
1658 	}
1659 	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
1660 
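		/*
		 * First open of this pool: activate it and load it using the
		 * cached (untrusted) config; spa_load() re-loads from the MOS
		 * config once that can be read.
		 */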
1661 		spa_activate(spa, spa_mode_global);
1662 
1663 		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);
1664 
1665 		if (error == EBADF) {
1666 			/*
1667 			 * If vdev_validate() returns failure (indicated by
1668 			 * EBADF), it indicates that one of the vdevs indicates
1669 			 * that the pool has been exported or destroyed.  If
1670 			 * this is the case, the config cache is out of sync and
1671 			 * we should remove the pool from the namespace.
1672 			 */
1673 			spa_unload(spa);
1674 			spa_deactivate(spa);
1675 			spa_config_sync(spa, B_TRUE, B_TRUE);
1676 			spa_remove(spa);
1677 			if (locked)
1678 				mutex_exit(&spa_namespace_lock);
1679 			return (ENOENT);
1680 		}
1681 
1682 		if (error) {
1683 			/*
1684 			 * We can't open the pool, but we still have useful
1685 			 * information: the state of each vdev after the
1686 			 * attempted vdev_open().  Return this to the user.
1687 			 */
1688 			if (config != NULL && spa->spa_root_vdev != NULL)
1689 				*config = spa_config_generate(spa, NULL, -1ULL,
1690 				    B_TRUE);
1691 			spa_unload(spa);
1692 			spa_deactivate(spa);
1693 			spa->spa_last_open_failed = B_TRUE;
1694 			if (locked)
1695 				mutex_exit(&spa_namespace_lock);
1696 			*spapp = NULL;
1697 			return (error);
1698 		} else {
1699 			spa->spa_last_open_failed = B_FALSE;
1700 		}
1701 	}
1702 
1703 	spa_open_ref(spa, tag);
1704 
1705 	if (locked)
1706 		mutex_exit(&spa_namespace_lock);
1707 
1708 	*spapp = spa;
1709 
1710 	if (config != NULL)
1711 		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
1712 
1713 	return (0);
1714 }
1715 
1716 int
1717 spa_open(const char *name, spa_t **spapp, void *tag)
1718 {
1719 	return (spa_open_common(name, spapp, tag, NULL));
1720 }
1721 
1722 /*
1723  * Lookup the given spa_t, incrementing the inject count in the process,
1724  * preventing it from being exported or destroyed.
1725  */
1726 spa_t *
1727 spa_inject_addref(char *name)
1728 {
1729 	spa_t *spa;
1730 
1731 	mutex_enter(&spa_namespace_lock);
1732 	if ((spa = spa_lookup(name)) == NULL) {
1733 		mutex_exit(&spa_namespace_lock);
1734 		return (NULL);
1735 	}
1736 	spa->spa_inject_ref++;
1737 	mutex_exit(&spa_namespace_lock);
1738 
1739 	return (spa);
1740 }
1741 
1742 void
1743 spa_inject_delref(spa_t *spa)
1744 {
1745 	mutex_enter(&spa_namespace_lock);
1746 	spa->spa_inject_ref--;
1747 	mutex_exit(&spa_namespace_lock);
1748 }
1749 
1750 /*
1751  * Add spares device information to the nvlist.
1752  */
1753 static void
1754 spa_add_spares(spa_t *spa, nvlist_t *config)
1755 {
1756 	nvlist_t **spares;
1757 	uint_t i, nspares;
1758 	nvlist_t *nvroot;
1759 	uint64_t guid;
1760 	vdev_stat_t *vs;
1761 	uint_t vsc;
1762 	uint64_t pool;
1763 
1764 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
1765 
1766 	if (spa->spa_spares.sav_count == 0)
1767 		return;
1768 
1769 	VERIFY(nvlist_lookup_nvlist(config,
1770 	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
1771 	VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
1772 	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
1773 	if (nspares != 0) {
1774 		VERIFY(nvlist_add_nvlist_array(nvroot,
1775 		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
1776 		VERIFY(nvlist_lookup_nvlist_array(nvroot,
1777 		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
1778 
1779 		/*
1780 		 * Go through and find any spares which have since been
1781 		 * repurposed as active spares.  If this is the case, update
1782 		 * their status appropriately.
1783 		 */
1784 		for (i = 0; i < nspares; i++) {
1785 			VERIFY(nvlist_lookup_uint64(spares[i],
1786 			    ZPOOL_CONFIG_GUID, &guid) == 0);
1787 			if (spa_spare_exists(guid, &pool, NULL) &&
1788 			    pool != 0ULL) {
1789 				VERIFY(nvlist_lookup_uint64_array(
1790 				    spares[i], ZPOOL_CONFIG_STATS,
1791 				    (uint64_t **)&vs, &vsc) == 0);
1792 				vs->vs_state = VDEV_STATE_CANT_OPEN;
1793 				vs->vs_aux = VDEV_AUX_SPARED;
1794 			}
1795 		}
1796 	}
1797 }
1798 
1799 /*
1800  * Add l2cache device information to the nvlist, including vdev stats.
1801  */
1802 static void
1803 spa_add_l2cache(spa_t *spa, nvlist_t *config)
1804 {
1805 	nvlist_t **l2cache;
1806 	uint_t i, j, nl2cache;
1807 	nvlist_t *nvroot;
1808 	uint64_t guid;
1809 	vdev_t *vd;
1810 	vdev_stat_t *vs;
1811 	uint_t vsc;
1812 
1813 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
1814 
1815 	if (spa->spa_l2cache.sav_count == 0)
1816 		return;
1817 
1818 	VERIFY(nvlist_lookup_nvlist(config,
1819 	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
1820 	VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
1821 	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
1822 	if (nl2cache != 0) {
1823 		VERIFY(nvlist_add_nvlist_array(nvroot,
1824 		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
1825 		VERIFY(nvlist_lookup_nvlist_array(nvroot,
1826 		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
1827 
1828 		/*
1829 		 * Update level 2 cache device stats.
1830 		 */
1831 
1832 		for (i = 0; i < nl2cache; i++) {
1833 			VERIFY(nvlist_lookup_uint64(l2cache[i],
1834 			    ZPOOL_CONFIG_GUID, &guid) == 0);
1835 
1836 			vd = NULL;
1837 			for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
1838 				if (guid ==
1839 				    spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
1840 					vd = spa->spa_l2cache.sav_vdevs[j];
1841 					break;
1842 				}
1843 			}
1844 			ASSERT(vd != NULL);
1845 
1846 			VERIFY(nvlist_lookup_uint64_array(l2cache[i],
1847 			    ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0);
1848 			vdev_get_stats(vd, vs);
1849 		}
1850 	}
1851 }
1852 
1853 int
1854 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
1855 {
1856 	int error;
1857 	spa_t *spa;
1858 
1859 	*config = NULL;
1860 	error = spa_open_common(name, &spa, FTAG, config);
1861 
1862 	if (spa != NULL) {
1863 		/*
1864 		 * This still leaves a window of inconsistency where the spares
1865 		 * or l2cache devices could change and the config would be
1866 		 * self-inconsistent.
1867 		 */
1868 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
1869 
1870 		if (*config != NULL) {
1871 			VERIFY(nvlist_add_uint64(*config,
1872 			    ZPOOL_CONFIG_ERRCOUNT,
1873 			    spa_get_errlog_size(spa)) == 0);
1874 
1875 			if (spa_suspended(spa))
1876 				VERIFY(nvlist_add_uint64(*config,
1877 				    ZPOOL_CONFIG_SUSPENDED,
1878 				    spa->spa_failmode) == 0);
1879 
1880 			spa_add_spares(spa, *config);
1881 			spa_add_l2cache(spa, *config);
1882 		}
1883 	}
1884 
1885 	/*
1886 	 * We want to get the alternate root even for faulted pools, so we cheat
1887 	 * and call spa_lookup() directly.
1888 	 */
1889 	if (altroot) {
1890 		if (spa == NULL) {
1891 			mutex_enter(&spa_namespace_lock);
1892 			spa = spa_lookup(name);
1893 			if (spa)
1894 				spa_altroot(spa, altroot, buflen);
1895 			else
1896 				altroot[0] = '\0';
1897 			spa = NULL;
1898 			mutex_exit(&spa_namespace_lock);
1899 		} else {
1900 			spa_altroot(spa, altroot, buflen);
1901 		}
1902 	}
1903 
1904 	if (spa != NULL) {
1905 		spa_config_exit(spa, SCL_CONFIG, FTAG);
1906 		spa_close(spa, FTAG);
1907 	}
1908 
1909 	return (error);
1910 }
1911 
1912 /*
1913  * Validate that the auxiliary device array is well formed.  We must have an
1914  * array of nvlists, each of which describes a valid leaf vdev.  If this is an
1915  * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
1916  * specified, as long as they are well-formed.
1917  */
1918 static int
1919 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
1920     spa_aux_vdev_t *sav, const char *config, uint64_t version,
1921     vdev_labeltype_t label)
1922 {
1923 	nvlist_t **dev;
1924 	uint_t i, ndev;
1925 	vdev_t *vd;
1926 	int error;
1927 
1928 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1929 
1930 	/*
1931 	 * It's acceptable to have no devs specified.
1932 	 */
1933 	if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
1934 		return (0);
1935 
1936 	if (ndev == 0)
1937 		return (EINVAL);
1938 
1939 	/*
1940 	 * Make sure the pool is formatted with a version that supports this
1941 	 * device type.
1942 	 */
1943 	if (spa_version(spa) < version)
1944 		return (ENOTSUP);
1945 
1946 	/*
1947 	 * Set the pending device list so we correctly handle device in-use
1948 	 * checking.
1949 	 */
1950 	sav->sav_pending = dev;
1951 	sav->sav_npending = ndev;
1952 
1953 	for (i = 0; i < ndev; i++) {
1954 		if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
1955 		    mode)) != 0)
1956 			goto out;
1957 
1958 		if (!vd->vdev_ops->vdev_op_leaf) {
1959 			vdev_free(vd);
1960 			error = EINVAL;
1961 			goto out;
1962 		}
1963 
1964 		/*
1965 		 * The L2ARC currently only supports disk devices in
1966 		 * kernel context.  For user-level testing, we allow it.
1967 		 */
1968 #ifdef _KERNEL
1969 		if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
1970 		    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
1971 			error = ENOTBLK;
1972 			goto out;
1973 		}
1974 #endif
1975 		vd->vdev_top = vd;
1976 
1977 		if ((error = vdev_open(vd)) == 0 &&
1978 		    (error = vdev_label_init(vd, crtxg, label)) == 0) {
1979 			VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
1980 			    vd->vdev_guid) == 0);
1981 		}
1982 
1983 		vdev_free(vd);
1984 
1985 		if (error &&
1986 		    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
1987 			goto out;
1988 		else
1989 			error = 0;
1990 	}
1991 
1992 out:
1993 	sav->sav_pending = NULL;
1994 	sav->sav_npending = 0;
1995 	return (error);
1996 }
1997 
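/*
 * Validate both the spare and the l2cache device lists in 'nvroot'.
 */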
1998 static int
1999 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
2000 {
2001 	int error;
2002 
2003 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
2004 
2005 	if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
2006 	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
2007 	    VDEV_LABEL_SPARE)) != 0) {
2008 		return (error);
2009 	}
2010 
2011 	return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
2012 	    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
2013 	    VDEV_LABEL_L2CACHE));
2014 }
2015 
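/*
 * Set the aux device list named 'config' in 'sav', concatenating the new
 * devices with any devices already present in sav_config.
 */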
2016 static void
2017 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
2018     const char *config)
2019 {
2020 	int i;
2021 
2022 	if (sav->sav_config != NULL) {
2023 		nvlist_t **olddevs;
2024 		uint_t oldndevs;
2025 		nvlist_t **newdevs;
2026 
2027 		/*
2028 		 * Generate a new dev list by concatenating the new devs
2029 		 * with the current dev list.
2030 		 */
2031 		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
2032 		    &olddevs, &oldndevs) == 0);
2033 
2034 		newdevs = kmem_alloc(sizeof (void *) *
2035 		    (ndevs + oldndevs), KM_SLEEP);
2036 		for (i = 0; i < oldndevs; i++)
2037 			VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
2038 			    KM_SLEEP) == 0);
2039 		for (i = 0; i < ndevs; i++)
2040 			VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
2041 			    KM_SLEEP) == 0);
2042 
2043 		VERIFY(nvlist_remove(sav->sav_config, config,
2044 		    DATA_TYPE_NVLIST_ARRAY) == 0);
2045 
2046 		VERIFY(nvlist_add_nvlist_array(sav->sav_config,
2047 		    config, newdevs, ndevs + oldndevs) == 0);
2048 		for (i = 0; i < oldndevs + ndevs; i++)
2049 			nvlist_free(newdevs[i]);
2050 		kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
2051 	} else {
2052 		/*
2053 		 * Generate a new dev list.
2054 		 */
2055 		VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
2056 		    KM_SLEEP) == 0);
2057 		VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
2058 		    devs, ndevs) == 0);
2059 	}
2060 }
2061 
2062 /*
2063  * Stop and drop level 2 ARC devices
2064  */
2065 void
2066 spa_l2cache_drop(spa_t *spa)
2067 {
2068 	vdev_t *vd;
2069 	int i;
2070 	spa_aux_vdev_t *sav = &spa->spa_l2cache;
2071 
2072 	for (i = 0; i < sav->sav_count; i++) {
2073 		uint64_t pool;
2074 
2075 		vd = sav->sav_vdevs[i];
2076 		ASSERT(vd != NULL);
2077 
2078 		if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
2079 		    pool != 0ULL && l2arc_vdev_present(vd))
2080 			l2arc_remove_vdev(vd);
2081 		if (vd->vdev_isl2cache)
2082 			spa_l2cache_remove(vd);
2083 		vdev_clear_stats(vd);
2084 		(void) vdev_close(vd);
2085 	}
2086 }
2087 
2088 /*
2089  * Pool Creation
2090  */
2091 int
2092 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
2093     const char *history_str, nvlist_t *zplprops)
2094 {
2095 	spa_t *spa;
2096 	char *altroot = NULL;
2097 	vdev_t *rvd;
2098 	dsl_pool_t *dp;
2099 	dmu_tx_t *tx;
2100 	int error = 0;
2101 	uint64_t txg = TXG_INITIAL;
2102 	nvlist_t **spares, **l2cache;
2103 	uint_t nspares, nl2cache;
2104 	uint64_t version;
2105 
2106 	/*
2107 	 * If this pool already exists, return failure.
2108 	 */
2109 	mutex_enter(&spa_namespace_lock);
2110 	if (spa_lookup(pool) != NULL) {
2111 		mutex_exit(&spa_namespace_lock);
2112 		return (EEXIST);
2113 	}
2114 
2115 	/*
2116 	 * Allocate a new spa_t structure.
2117 	 */
2118 	(void) nvlist_lookup_string(props,
2119 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
2120 	spa = spa_add(pool, altroot);
2121 	spa_activate(spa, spa_mode_global);
2122 
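	/*
	 * The first txg we sync will be TXG_INITIAL, so start the in-core
	 * uberblock one txg behind it.
	 */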
2123 	spa->spa_uberblock.ub_txg = txg - 1;
2124 
2125 	if (props && (error = spa_prop_validate(spa, props))) {
2126 		spa_deactivate(spa);
2127 		spa_remove(spa);
2128 		mutex_exit(&spa_namespace_lock);
2129 		return (error);
2130 	}
2131 
2132 	if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION),
2133 	    &version) != 0)
2134 		version = SPA_VERSION;
2135 	ASSERT(version <= SPA_VERSION);
2136 	spa->spa_uberblock.ub_version = version;
2137 	spa->spa_ubsync = spa->spa_uberblock;
2138 
2139 	/*
2140 	 * Create "The Godfather" zio to hold all async IOs
2141 	 */
2142 	spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
2143 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
2144 
2145 	/*
2146 	 * Create the root vdev.
2147 	 */
2148 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2149 
2150 	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
2151 
2152 	ASSERT(error != 0 || rvd != NULL);
2153 	ASSERT(error != 0 || spa->spa_root_vdev == rvd);
2154 
2155 	if (error == 0 && !zfs_allocatable_devs(nvroot))
2156 		error = EINVAL;
2157 
2158 	if (error == 0 &&
2159 	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
2160 	    (error = spa_validate_aux(spa, nvroot, txg,
2161 	    VDEV_ALLOC_ADD)) == 0) {
2162 		for (int c = 0; c < rvd->vdev_children; c++) {
2163 			vdev_metaslab_set_size(rvd->vdev_child[c]);
2164 			vdev_expand(rvd->vdev_child[c], txg);
2165 		}
2166 	}
2167 
2168 	spa_config_exit(spa, SCL_ALL, FTAG);
2169 
2170 	if (error != 0) {
2171 		spa_unload(spa);
2172 		spa_deactivate(spa);
2173 		spa_remove(spa);
2174 		mutex_exit(&spa_namespace_lock);
2175 		return (error);
2176 	}
2177 
2178 	/*
2179 	 * Get the list of spares, if specified.
2180 	 */
2181 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
2182 	    &spares, &nspares) == 0) {
2183 		VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
2184 		    KM_SLEEP) == 0);
2185 		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
2186 		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
2187 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2188 		spa_load_spares(spa);
2189 		spa_config_exit(spa, SCL_ALL, FTAG);
2190 		spa->spa_spares.sav_sync = B_TRUE;
2191 	}
2192 
2193 	/*
2194 	 * Get the list of level 2 cache devices, if specified.
2195 	 */
2196 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
2197 	    &l2cache, &nl2cache) == 0) {
2198 		VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
2199 		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
2200 		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
2201 		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
2202 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2203 		spa_load_l2cache(spa);
2204 		spa_config_exit(spa, SCL_ALL, FTAG);
2205 		spa->spa_l2cache.sav_sync = B_TRUE;
2206 	}
2207 
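	/*
	 * Create the DSL pool (and with it the MOS), then open an assigned
	 * transaction in the initial txg for the on-disk setup below.
	 */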
2208 	spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
2209 	spa->spa_meta_objset = dp->dp_meta_objset;
2210 
2211 	tx = dmu_tx_create_assigned(dp, txg);
2212 
2213 	/*
2214 	 * Create the pool config object.
2215 	 */
2216 	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
2217 	    DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
2218 	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
2219 
2220 	if (zap_add(spa->spa_meta_objset,
2221 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
2222 	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
2223 		cmn_err(CE_PANIC, "failed to add pool config");
2224 	}
2225 
2226 	/* Newly created pools with the right version are always deflated. */
2227 	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
2228 		spa->spa_deflate = TRUE;
2229 		if (zap_add(spa->spa_meta_objset,
2230 		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
2231 		    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
2232 			cmn_err(CE_PANIC, "failed to add deflate");
2233 		}
2234 	}
2235 
2236 	/*
2237 	 * Create the deferred-free bplist object.  Turn off compression
2238 	 * because sync-to-convergence takes longer if the blocksize
2239 	 * keeps changing.
2240 	 */
2241 	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
2242 	    1 << 14, tx);
2243 	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
2244 	    ZIO_COMPRESS_OFF, tx);
2245 
2246 	if (zap_add(spa->spa_meta_objset,
2247 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
2248 	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
2249 		cmn_err(CE_PANIC, "failed to add bplist");
2250 	}
2251 
2252 	/*
2253 	 * Create the pool's history object.
2254 	 */
2255 	if (version >= SPA_VERSION_ZPOOL_HISTORY)
2256 		spa_history_create_obj(spa, tx);
2257 
2258 	/*
2259 	 * Set pool properties.
2260 	 */
2261 	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
2262 	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
2263 	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
2264 	spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
2265 	if (props != NULL) {
2266 		spa_configfile_set(spa, props, B_FALSE);
2267 		spa_sync_props(spa, props, CRED(), tx);
2268 	}
2269 
2270 	dmu_tx_commit(tx);
2271 
2272 	spa->spa_sync_on = B_TRUE;
2273 	txg_sync_start(spa->spa_dsl_pool);
2274 
2275 	/*
2276 	 * We explicitly wait for the first transaction to complete so that our
2277 	 * bean counters are appropriately updated.
2278 	 */
2279 	txg_wait_synced(spa->spa_dsl_pool, txg);
2280 
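	/*
	 * Add the new pool's configuration to the config cache file.
	 */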
2281 	spa_config_sync(spa, B_FALSE, B_TRUE);
2282 
2283 	if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL)
2284 		(void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE);
2285 	spa_history_log_version(spa, LOG_POOL_CREATE);
2286 
2287 	spa->spa_minref = refcount_count(&spa->spa_refcount);
2288 
2289 	mutex_exit(&spa_namespace_lock);
2290 
2291 	return (0);
2292 }
2293 
2294 #ifdef _KERNEL
2295 /*
2296  * Get the root pool information from the root disk, then import the root pool
2297  * at system boot time.
2298  */
2299 extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
2300 
2301 static nvlist_t *
2302 spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
2303 {
2304 	nvlist_t *config;
2305 	nvlist_t *nvtop, *nvroot;
2306 	uint64_t pgid;
2307 
2308 	if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
2309 		return (NULL);
2310 
2311 	/*
2312 	 * Add this top-level vdev to the child array.
2313 	 */
2314 	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
2315 	    &nvtop) == 0);
2316 	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
2317 	    &pgid) == 0);
2318 	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);
2319 
2320 	/*
2321 	 * Put this pool's top-level vdevs into a root vdev.
2322 	 */
2323 	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2324 	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
2325 	    VDEV_TYPE_ROOT) == 0);
2326 	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
2327 	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
2328 	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
2329 	    &nvtop, 1) == 0);
2330 
2331 	/*
2332 	 * Replace the existing vdev_tree with the new root vdev in
2333 	 * this pool's configuration (remove the old, add the new).
2334 	 */
2335 	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
2336 	nvlist_free(nvroot);
2337 	return (config);
2338 }
2339 
2340 /*
2341  * Walk the vdev tree and see if we can find a device with "better"
2342  * configuration. A configuration is "better" if the label on that
2343  * device has a more recent txg.
2344  */
2345 static void
2346 spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
2347 {
2348 	for (int c = 0; c < vd->vdev_children; c++)
2349 		spa_alt_rootvdev(vd->vdev_child[c], avd, txg);
2350 
2351 	if (vd->vdev_ops->vdev_op_leaf) {
2352 		nvlist_t *label;
2353 		uint64_t label_txg;
2354 
2355 		if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
2356 		    &label) != 0)
2357 			return;
2358 
2359 		VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
2360 		    &label_txg) == 0);
2361 
2362 		/*
2363 		 * Do we have a better boot device?
2364 		 */
2365 		if (label_txg > *txg) {
2366 			*txg = label_txg;
2367 			*avd = vd;
2368 		}
2369 		nvlist_free(label);
2370 	}
2371 }
2372 
2373 /*
2374  * Import a root pool.
2375  *
2376  * For x86, devpath_list will consist of the devid and/or physpath name of
2377  * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
2378  * The GRUB "findroot" command will return the vdev we should boot.
2379  *
2380  * For SPARC, devpath_list consists of the physpath name of the booting device,
2381  * whether the root pool is a single-device pool or a mirrored pool.
2382  * e.g.
2383  *	"/pci@1f,0/ide@d/disk@0,0:a"
2384  */
2385 int
2386 spa_import_rootpool(char *devpath, char *devid)
2387 {
2388 	spa_t *spa;
2389 	vdev_t *rvd, *bvd, *avd = NULL;
2390 	nvlist_t *config, *nvtop;
2391 	uint64_t guid, txg;
2392 	char *pname;
2393 	int error;
2394 
2395 	/*
2396 	 * Read the label from the boot device and generate a configuration.
2397 	 */
2398 	if ((config = spa_generate_rootconf(devpath, devid, &guid)) == NULL) {
2399 		cmn_err(CE_NOTE, "Can not read the pool label from '%s'",
2400 		    devpath);
2401 		return (EIO);
2402 	}
2403 
2404 	VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
2405 	    &pname) == 0);
2406 	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
2407 
2408 	mutex_enter(&spa_namespace_lock);
2409 	if ((spa = spa_lookup(pname)) != NULL) {
2410 		/*
2411 		 * Remove the existing root pool from the namespace so that we
2412 		 * can replace it with the correct config we just read in.
2413 		 */
2414 		spa_remove(spa);
2415 	}
2416 
2417 	spa = spa_add(pname, NULL);
2418 	spa->spa_is_root = B_TRUE;
2419 	spa->spa_load_verbatim = B_TRUE;
2420 
2421 	/*
2422 	 * Build up a vdev tree based on the boot device's label config.
2423 	 */
2424 	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
2425 	    &nvtop) == 0);
2426 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2427 	error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
2428 	    VDEV_ALLOC_ROOTPOOL);
2429 	spa_config_exit(spa, SCL_ALL, FTAG);
2430 	if (error) {
2431 		mutex_exit(&spa_namespace_lock);
2432 		nvlist_free(config);
2433 		cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
2434 		    pname);
2435 		return (error);
2436 	}
2437 
2438 	/*
2439 	 * Get the boot vdev.
2440 	 */
2441 	if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
2442 		cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
2443 		    (u_longlong_t)guid);
2444 		error = ENOENT;
2445 		goto out;
2446 	}
2447 
2448 	/*
2449 	 * Determine if there is a better boot device.
2450 	 */
2451 	avd = bvd;
2452 	spa_alt_rootvdev(rvd, &avd, &txg);
2453 	if (avd != bvd) {
2454 		cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
2455 		    "try booting from '%s'", avd->vdev_path);
2456 		error = EINVAL;
2457 		goto out;
2458 	}
2459 
2460 	/*
2461 	 * If the boot device is part of a spare vdev then ensure that
2462 	 * we're booting off the active spare.
2463 	 */
2464 	if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
2465 	    !bvd->vdev_isspare) {
2466 		cmn_err(CE_NOTE, "The boot device is currently spared. Please "
2467 		    "try booting from '%s'",
2468 		    bvd->vdev_parent->vdev_child[1]->vdev_path);
2469 		error = EINVAL;
2470 		goto out;
2471 	}
2472 
2473 	VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
2474 	error = 0;
2475 	spa_history_log_version(spa, LOG_POOL_IMPORT);
2476 out:
2477 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2478 	vdev_free(rvd);
2479 	spa_config_exit(spa, SCL_ALL, FTAG);
2480 	mutex_exit(&spa_namespace_lock);
2481 
2482 	nvlist_free(config);
2483 	return (error);
2484 }
2485 
2486 #endif
2487 
2488 /*
2489  * Take a pool and insert it into the namespace as if it had been loaded at
2490  * boot.
2491  */
2492 int
2493 spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props)
2494 {
2495 	spa_t *spa;
2496 	char *altroot = NULL;
2497 
2498 	mutex_enter(&spa_namespace_lock);
2499 	if (spa_lookup(pool) != NULL) {
2500 		mutex_exit(&spa_namespace_lock);
2501 		return (EEXIST);
2502 	}
2503 
2504 	(void) nvlist_lookup_string(props,
2505 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
2506 	spa = spa_add(pool, altroot);
2507 
2508 	spa->spa_load_verbatim = B_TRUE;
2509 
2510 	VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0);
2511 
2512 	if (props != NULL)
2513 		spa_configfile_set(spa, props, B_FALSE);
2514 
2515 	spa_config_sync(spa, B_FALSE, B_TRUE);
2516 
2517 	mutex_exit(&spa_namespace_lock);
2518 	spa_history_log_version(spa, LOG_POOL_IMPORT);
2519 
2520 	return (0);
2521 }
2522 
2523 /*
2524  * Import a non-root pool into the system.
2525  */
2526 int
2527 spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
2528 {
2529 	spa_t *spa;
2530 	char *altroot = NULL;
2531 	int error;
2532 	nvlist_t *nvroot;
2533 	nvlist_t **spares, **l2cache;
2534 	uint_t nspares, nl2cache;
2535 
2536 	/*
2537 	 * If a pool with this name exists, return failure.
2538 	 */
2539 	mutex_enter(&spa_namespace_lock);
2540 	if ((spa = spa_lookup(pool)) != NULL) {
2541 		mutex_exit(&spa_namespace_lock);
2542 		return (EEXIST);
2543 	}
2544 
2545 	/*
2546 	 * Create and initialize the spa structure.
2547 	 */
2548 	(void) nvlist_lookup_string(props,
2549 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
2550 	spa = spa_add(pool, altroot);
2551 	spa_activate(spa, spa_mode_global);
2552 
2553 	/*
2554 	 * Don't start async tasks until we know everything is healthy.
2555 	 */
2556 	spa_async_suspend(spa);
2557 
2558 	/*
2559 	 * Pass off the heavy lifting to spa_load().  Pass TRUE for mosconfig
2560 	 * because the user-supplied config is actually the one to trust when
2561 	 * doing an import.
2562 	 */
2563 	error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);
2564 
2565 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2566 	/*
2567 	 * Toss any existing sparelist, as it doesn't have any validity
2568 	 * anymore, and conflicts with spa_has_spare().
2569 	 */
2570 	if (spa->spa_spares.sav_config) {
2571 		nvlist_free(spa->spa_spares.sav_config);
2572 		spa->spa_spares.sav_config = NULL;
2573 		spa_load_spares(spa);
2574 	}
2575 	if (spa->spa_l2cache.sav_config) {
2576 		nvlist_free(spa->spa_l2cache.sav_config);
2577 		spa->spa_l2cache.sav_config = NULL;
2578 		spa_load_l2cache(spa);
2579 	}
2580 
2581 	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
2582 	    &nvroot) == 0);
2583 	if (error == 0)
2584 		error = spa_validate_aux(spa, nvroot, -1ULL,
2585 		    VDEV_ALLOC_SPARE);
2586 	if (error == 0)
2587 		error = spa_validate_aux(spa, nvroot, -1ULL,
2588 		    VDEV_ALLOC_L2CACHE);
2589 	spa_config_exit(spa, SCL_ALL, FTAG);
2590 
2591 	if (props != NULL)
2592 		spa_configfile_set(spa, props, B_FALSE);
2593 
2594 	if (error != 0 || (props && spa_writeable(spa) &&
2595 	    (error = spa_prop_set(spa, props)))) {
2596 		spa_unload(spa);
2597 		spa_deactivate(spa);
2598 		spa_remove(spa);
2599 		mutex_exit(&spa_namespace_lock);
2600 		return (error);
2601 	}
2602 
2603 	spa_async_resume(spa);
2604 
2605 	/*
2606 	 * Override any spares and level 2 cache devices as specified by
2607 	 * the user, as these may have correct device names/devids, etc.
2608 	 */
2609 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
2610 	    &spares, &nspares) == 0) {
2611 		if (spa->spa_spares.sav_config)
2612 			VERIFY(nvlist_remove(spa->spa_spares.sav_config,
2613 			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
2614 		else
2615 			VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
2616 			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
2617 		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
2618 		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
2619 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2620 		spa_load_spares(spa);
2621 		spa_config_exit(spa, SCL_ALL, FTAG);
2622 		spa->spa_spares.sav_sync = B_TRUE;
2623 	}
2624 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
2625 	    &l2cache, &nl2cache) == 0) {
2626 		if (spa->spa_l2cache.sav_config)
2627 			VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
2628 			    ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
2629 		else
2630 			VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
2631 			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
2632 		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
2633 		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
2634 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2635 		spa_load_l2cache(spa);
2636 		spa_config_exit(spa, SCL_ALL, FTAG);
2637 		spa->spa_l2cache.sav_sync = B_TRUE;
2638 	}
2639 
2640 	if (spa_writeable(spa)) {
2641 		/*
2642 		 * Update the config cache to include the newly-imported pool.
2643 		 */
2644 		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
2645 	}
2646 
2647 	/*
2648 	 * It's possible that the pool was expanded while it was exported.
2649 	 * We kick off an async task to handle this for us.
2650 	 */
2651 	spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
2652 
2653 	mutex_exit(&spa_namespace_lock);
2654 	spa_history_log_version(spa, LOG_POOL_IMPORT);
2655 
2656 	return (0);
2657 }
2658 
2659 
2660 /*
2661  * This (illegal) pool name is used when temporarily importing a spa_t in order
2662  * to get the vdev stats associated with the imported devices.
2663  */
2664 #define	TRYIMPORT_NAME	"$import"
2665 
2666 nvlist_t *
2667 spa_tryimport(nvlist_t *tryconfig)
2668 {
2669 	nvlist_t *config = NULL;
2670 	char *poolname;
2671 	spa_t *spa;
2672 	uint64_t state;
2673 	int error;
2674 
2675 	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
2676 		return (NULL);
2677 
2678 	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
2679 		return (NULL);
2680 
2681 	/*
2682 	 * Create and initialize the spa structure.
2683 	 */
2684 	mutex_enter(&spa_namespace_lock);
2685 	spa = spa_add(TRYIMPORT_NAME, NULL);
2686 	spa_activate(spa, FREAD);
2687 
2688 	/*
2689 	 * Pass off the heavy lifting to spa_load().
2690 	 * Pass TRUE for mosconfig because the user-supplied config
2691 	 * is actually the one to trust when doing an import.
2692 	 */
2693 	error = spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);
2694 
2695 	/*
2696 	 * If 'tryconfig' was at least parsable, return the current config.
2697 	 */
2698 	if (spa->spa_root_vdev != NULL) {
2699 		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
2700 		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
2701 		    poolname) == 0);
2702 		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
2703 		    state) == 0);
2704 		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
2705 		    spa->spa_uberblock.ub_timestamp) == 0);
2706 
2707 		/*
2708 		 * If the bootfs property exists on this pool then we
2709 		 * copy it out so that external consumers can tell which
2710 		 * pools are bootable.
2711 		 */
2712 		if ((!error || error == EEXIST) && spa->spa_bootfs) {
2713 			char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
2714 
2715 			/*
2716 			 * We have to play games with the name since the
2717 			 * pool was opened as TRYIMPORT_NAME.
2718 			 */
2719 			if (dsl_dsobj_to_dsname(spa_name(spa),
2720 			    spa->spa_bootfs, tmpname) == 0) {
2721 				char *cp;
2722 				char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
2723 
2724 				cp = strchr(tmpname, '/');
2725 				if (cp == NULL) {
2726 					(void) strlcpy(dsname, tmpname,
2727 					    MAXPATHLEN);
2728 				} else {
2729 					(void) snprintf(dsname, MAXPATHLEN,
2730 					    "%s/%s", poolname, ++cp);
2731 				}
2732 				VERIFY(nvlist_add_string(config,
2733 				    ZPOOL_CONFIG_BOOTFS, dsname) == 0);
2734 				kmem_free(dsname, MAXPATHLEN);
2735 			}
2736 			kmem_free(tmpname, MAXPATHLEN);
2737 		}
2738 
2739 		/*
2740 		 * Add the list of hot spares and level 2 cache devices.
2741 		 */
2742 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
2743 		spa_add_spares(spa, config);
2744 		spa_add_l2cache(spa, config);
2745 		spa_config_exit(spa, SCL_CONFIG, FTAG);
2746 	}
2747 
2748 	spa_unload(spa);
2749 	spa_deactivate(spa);
2750 	spa_remove(spa);
2751 	mutex_exit(&spa_namespace_lock);
2752 
2753 	return (config);
2754 }
2755 
2756 /*
2757  * Pool export/destroy
2758  *
2759  * The act of destroying or exporting a pool is very simple.  We make sure there
2760  * is no more pending I/O and any references to the pool are gone.  Then, we
2761  * update the pool state and sync all the labels to disk, removing the
2762  * configuration from the cache afterwards.  If the 'hardforce' flag is set, then
2763  * we don't sync the labels or remove the configuration from the cache.
2764  */
2765 static int
2766 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
2767     boolean_t force, boolean_t hardforce)
2768 {
2769 	spa_t *spa;
2770 
2771 	if (oldconfig)
2772 		*oldconfig = NULL;
2773 
2774 	if (!(spa_mode_global & FWRITE))
2775 		return (EROFS);
2776 
2777 	mutex_enter(&spa_namespace_lock);
2778 	if ((spa = spa_lookup(pool)) == NULL) {
2779 		mutex_exit(&spa_namespace_lock);
2780 		return (ENOENT);
2781 	}
2782 
2783 	/*
2784 	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
2785 	 * reacquire the namespace lock, and see if we can export.
2786 	 */
2787 	spa_open_ref(spa, FTAG);
2788 	mutex_exit(&spa_namespace_lock);
2789 	spa_async_suspend(spa);
2790 	mutex_enter(&spa_namespace_lock);
2791 	spa_close(spa, FTAG);
2792 
2793 	/*
2794 	 * The pool will be in core if it's openable,
2795 	 * in which case we can modify its state.
2796 	 */
2797 	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
2798 		/*
2799 		 * Objsets may be open only because they're dirty, so we
2800 		 * have to force it to sync before checking spa_refcnt.
2801 		 */
2802 		txg_wait_synced(spa->spa_dsl_pool, 0);
2803 
2804 		/*
2805 		 * A pool cannot be exported or destroyed if there are active
2806 		 * references.  If we are resetting a pool, allow references by
2807 		 * fault injection handlers.
2808 		 */
2809 		if (!spa_refcount_zero(spa) ||
2810 		    (spa->spa_inject_ref != 0 &&
2811 		    new_state != POOL_STATE_UNINITIALIZED)) {
2812 			spa_async_resume(spa);
2813 			mutex_exit(&spa_namespace_lock);
2814 			return (EBUSY);
2815 		}
2816 
2817 		/*
2818 		 * A pool cannot be exported if it has an active shared spare.
2819 		 * This is to prevent other pools stealing the active spare
2820 		 * from an exported pool.  The user can, however, force such
2821 		 * a pool to be exported.
2822 		 */
2823 		if (!force && new_state == POOL_STATE_EXPORTED &&
2824 		    spa_has_active_shared_spare(spa)) {
2825 			spa_async_resume(spa);
2826 			mutex_exit(&spa_namespace_lock);
2827 			return (EXDEV);
2828 		}
2829 
2830 		/*
2831 		 * We want this to be reflected on every label,
2832 		 * so mark them all dirty.  spa_unload() will do the
2833 		 * final sync that pushes these changes out.
2834 		 */
2835 		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
2836 			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2837 			spa->spa_state = new_state;
2838 			spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
2839 			vdev_config_dirty(spa->spa_root_vdev);
2840 			spa_config_exit(spa, SCL_ALL, FTAG);
2841 		}
2842 	}
2843 
2844 	spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);
2845 
2846 	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
2847 		spa_unload(spa);
2848 		spa_deactivate(spa);
2849 	}
2850 
2851 	if (oldconfig && spa->spa_config)
2852 		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
2853 
2854 	if (new_state != POOL_STATE_UNINITIALIZED) {
2855 		if (!hardforce)
2856 			spa_config_sync(spa, B_TRUE, B_TRUE);
2857 		spa_remove(spa);
2858 	}
2859 	mutex_exit(&spa_namespace_lock);
2860 
2861 	return (0);
2862 }
2863 
2864 /*
2865  * Destroy a storage pool.
2866  */
2867 int
2868 spa_destroy(char *pool)
2869 {
2870 	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
2871 	    B_FALSE, B_FALSE));
2872 }
2873 
2874 /*
2875  * Export a storage pool.
2876  */
2877 int
2878 spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
2879     boolean_t hardforce)
2880 {
2881 	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
2882 	    force, hardforce));
2883 }
2884 
2885 /*
2886  * Similar to spa_export(), this unloads the spa_t without actually removing it
2887  * from the namespace in any way.
2888  */
2889 int
2890 spa_reset(char *pool)
2891 {
2892 	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
2893 	    B_FALSE, B_FALSE));
2894 }
2895 
2896 /*
2897  * ==========================================================================
2898  * Device manipulation
2899  * ==========================================================================
2900  */
2901 
2902 /*
2903  * Add a device to a storage pool.
2904  */
2905 int
2906 spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
2907 {
2908 	uint64_t txg;
2909 	int error;
2910 	vdev_t *rvd = spa->spa_root_vdev;
2911 	vdev_t *vd, *tvd;
2912 	nvlist_t **spares, **l2cache;
2913 	uint_t nspares, nl2cache;
2914 
2915 	txg = spa_vdev_enter(spa);
2916 
2917 	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
2918 	    VDEV_ALLOC_ADD)) != 0)
2919 		return (spa_vdev_exit(spa, NULL, txg, error));
2920 
2921 	spa->spa_pending_vdev = vd;	/* spa_vdev_exit() will clear this */
2922 
2923 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
2924 	    &nspares) != 0)
2925 		nspares = 0;
2926 
2927 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
2928 	    &nl2cache) != 0)
2929 		nl2cache = 0;
2930 
2931 	if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
2932 		return (spa_vdev_exit(spa, vd, txg, EINVAL));
2933 
2934 	if (vd->vdev_children != 0 &&
2935 	    (error = vdev_create(vd, txg, B_FALSE)) != 0)
2936 		return (spa_vdev_exit(spa, vd, txg, error));
2937 
2938 	/*
2939 	 * We must validate the spares and l2cache devices after checking the
2940 	 * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
2941 	 */
2942 	if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
2943 		return (spa_vdev_exit(spa, vd, txg, error));
2944 
2945 	/*
2946 	 * Transfer each new top-level vdev from vd to rvd.
2947 	 */
2948 	for (int c = 0; c < vd->vdev_children; c++) {
2949 		tvd = vd->vdev_child[c];
2950 		vdev_remove_child(vd, tvd);
2951 		tvd->vdev_id = rvd->vdev_children;
2952 		vdev_add_child(rvd, tvd);
2953 		vdev_config_dirty(tvd);
2954 	}
2955 
2956 	if (nspares != 0) {
2957 		spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
2958 		    ZPOOL_CONFIG_SPARES);
2959 		spa_load_spares(spa);
2960 		spa->spa_spares.sav_sync = B_TRUE;
2961 	}
2962 
2963 	if (nl2cache != 0) {
2964 		spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
2965 		    ZPOOL_CONFIG_L2CACHE);
2966 		spa_load_l2cache(spa);
2967 		spa->spa_l2cache.sav_sync = B_TRUE;
2968 	}
2969 
2970 	/*
2971 	 * We have to be careful when adding new vdevs to an existing pool.
2972 	 * If other threads start allocating from these vdevs before we
2973 	 * sync the config cache, and we lose power, then upon reboot we may
2974 	 * fail to open the pool because there are DVAs that the config cache
2975 	 * can't translate.  Therefore, we first add the vdevs without
2976 	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
2977 	 * and then let spa_config_update() initialize the new metaslabs.
2978 	 *
2979 	 * spa_load() checks for added-but-not-initialized vdevs, so that
2980 	 * if we lose power at any point in this sequence, the remaining
2981 	 * steps will be completed the next time we load the pool.
2982 	 */
2983 	(void) spa_vdev_exit(spa, vd, txg, 0);
2984 
2985 	mutex_enter(&spa_namespace_lock);
2986 	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
2987 	mutex_exit(&spa_namespace_lock);
2988 
2989 	return (0);
2990 }
2991 
2992 /*
2993  * Attach a device to a mirror.  The arguments are the path to any device
2994  * in the mirror, and the nvroot for the new device.  If the path specifies
2995  * a device that is not mirrored, we automatically insert the mirror vdev.
2996  *
2997  * If 'replacing' is specified, the new device is intended to replace the
2998  * existing device; in this case the two devices are made into their own
2999  * mirror using the 'replacing' vdev, which is functionally identical to
3000  * the mirror vdev (it actually reuses all the same ops) but has a few
3001  * extra rules: you can't attach to it after it's been created, and upon
3002  * completion of resilvering, the first disk (the one being replaced)
3003  * is automatically detached.
3004  */
3005 int
3006 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
3007 {
3008 	uint64_t txg, open_txg;
3009 	vdev_t *rvd = spa->spa_root_vdev;
3010 	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
3011 	vdev_ops_t *pvops;
3012 	char *oldvdpath, *newvdpath;
3013 	int newvd_isspare;
3014 	int error;
3015 
3016 	txg = spa_vdev_enter(spa);
3017 
3018 	oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
3019 
3020 	if (oldvd == NULL)
3021 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
3022 
3023 	if (!oldvd->vdev_ops->vdev_op_leaf)
3024 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
3025 
3026 	pvd = oldvd->vdev_parent;
3027 
3028 	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
3029 	    VDEV_ALLOC_ADD)) != 0)
3030 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
3031 
3032 	if (newrootvd->vdev_children != 1)
3033 		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
3034 
3035 	newvd = newrootvd->vdev_child[0];
3036 
3037 	if (!newvd->vdev_ops->vdev_op_leaf)
3038 		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
3039 
3040 	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
3041 		return (spa_vdev_exit(spa, newrootvd, txg, error));
3042 
3043 	/*
3044 	 * Spares can't replace logs
3045 	 */
3046 	if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
3047 		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
3048 
3049 	if (!replacing) {
3050 		/*
3051 		 * For attach, the only allowable parent is a mirror or the root
3052 		 * vdev.
3053 		 */
3054 		if (pvd->vdev_ops != &vdev_mirror_ops &&
3055 		    pvd->vdev_ops != &vdev_root_ops)
3056 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
3057 
3058 		pvops = &vdev_mirror_ops;
3059 	} else {
3060 		/*
3061 		 * Active hot spares can only be replaced by inactive hot
3062 		 * spares.
3063 		 */
3064 		if (pvd->vdev_ops == &vdev_spare_ops &&
3065 		    pvd->vdev_child[1] == oldvd &&
3066 		    !spa_has_spare(spa, newvd->vdev_guid))
3067 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
3068 
3069 		/*
3070 		 * If the source is a hot spare, and the parent isn't already a
3071 		 * spare, then we want to create a new hot spare.  Otherwise, we
3072 		 * want to create a replacing vdev.  The user is not allowed to
3073 		 * attach to a spared vdev child unless the 'isspare' state is
3074 		 * the same (spare replaces spare, non-spare replaces
3075 		 * non-spare).
3076 		 */
3077 		if (pvd->vdev_ops == &vdev_replacing_ops)
3078 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
3079 		else if (pvd->vdev_ops == &vdev_spare_ops &&
3080 		    newvd->vdev_isspare != oldvd->vdev_isspare)
3081 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
3082 		else if (pvd->vdev_ops != &vdev_spare_ops &&
3083 		    newvd->vdev_isspare)
3084 			pvops = &vdev_spare_ops;
3085 		else
3086 			pvops = &vdev_replacing_ops;
3087 	}
3088 
3089 	/*
3090 	 * Make sure the new device is big enough.
3091 	 */
3092 	if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
3093 		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
3094 
3095 	/*
3096 	 * The new device cannot have a higher alignment requirement
3097 	 * than the top-level vdev.
3098 	 */
3099 	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
3100 		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
3101 
3102 	/*
3103 	 * If this is an in-place replacement, update oldvd's path and devid
3104 	 * to make it distinguishable from newvd, and unopenable from now on.
3105 	 */
3106 	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
3107 		spa_strfree(oldvd->vdev_path);
3108 		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
3109 		    KM_SLEEP);
3110 		(void) sprintf(oldvd->vdev_path, "%s/%s",
3111 		    newvd->vdev_path, "old");
3112 		if (oldvd->vdev_devid != NULL) {
3113 			spa_strfree(oldvd->vdev_devid);
3114 			oldvd->vdev_devid = NULL;
3115 		}
3116 	}
3117 
3118 	/*
3119 	 * If the parent is not a mirror, or if we're replacing, insert the new
3120 	 * mirror/replacing/spare vdev above oldvd.
3121 	 */
3122 	if (pvd->vdev_ops != pvops)
3123 		pvd = vdev_add_parent(oldvd, pvops);
3124 
3125 	ASSERT(pvd->vdev_top->vdev_parent == rvd);
3126 	ASSERT(pvd->vdev_ops == pvops);
3127 	ASSERT(oldvd->vdev_parent == pvd);
3128 
3129 	/*
3130 	 * Extract the new device from its root and add it to pvd.
3131 	 */
3132 	vdev_remove_child(newrootvd, newvd);
3133 	newvd->vdev_id = pvd->vdev_children;
3134 	vdev_add_child(pvd, newvd);
3135 
3136 	tvd = newvd->vdev_top;
3137 	ASSERT(pvd->vdev_top == tvd);
3138 	ASSERT(tvd->vdev_parent == rvd);
3139 
3140 	vdev_config_dirty(tvd);
3141 
3142 	/*
3143 	 * Set newvd's DTL to [TXG_INITIAL, open_txg].  It will propagate
3144 	 * upward when spa_vdev_exit() calls vdev_dtl_reassess().
3145 	 */
3146 	open_txg = txg + TXG_CONCURRENT_STATES - 1;
3147 
3148 	vdev_dtl_dirty(newvd, DTL_MISSING,
3149 	    TXG_INITIAL, open_txg - TXG_INITIAL + 1);
3150 
3151 	if (newvd->vdev_isspare) {
3152 		spa_spare_activate(newvd);
3153 		spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE);
3154 	}
3155 
3156 	oldvdpath = spa_strdup(oldvd->vdev_path);
3157 	newvdpath = spa_strdup(newvd->vdev_path);
3158 	newvd_isspare = newvd->vdev_isspare;
3159 
3160 	/*
3161 	 * Mark newvd's DTL dirty in this txg.
3162 	 */
3163 	vdev_dirty(tvd, VDD_DTL, newvd, txg);
3164 
3165 	(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);
3166 
3167 	spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, NULL,
3168 	    CRED(),  "%s vdev=%s %s vdev=%s",
3169 	    replacing && newvd_isspare ? "spare in" :
3170 	    replacing ? "replace" : "attach", newvdpath,
3171 	    replacing ? "for" : "to", oldvdpath);
3172 
3173 	spa_strfree(oldvdpath);
3174 	spa_strfree(newvdpath);
3175 
3176 	/*
3177 	 * Kick off a resilver to update newvd.
3178 	 */
3179 	VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0);
3180 
3181 	return (0);
3182 }
3183 
3184 /*
3185  * Detach a device from a mirror or replacing vdev.
3186  * If 'replace_done' is specified, only detach if the parent
3187  * is a replacing vdev.
3188  */
3189 int
3190 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
3191 {
3192 	uint64_t txg;
3193 	int error;
3194 	vdev_t *rvd = spa->spa_root_vdev;
3195 	vdev_t *vd, *pvd, *cvd, *tvd;
3196 	boolean_t unspare = B_FALSE;
3197 	uint64_t unspare_guid;
3198 	size_t len;
3199 
3200 	txg = spa_vdev_enter(spa);
3201 
3202 	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
3203 
3204 	if (vd == NULL)
3205 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
3206 
3207 	if (!vd->vdev_ops->vdev_op_leaf)
3208 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
3209 
3210 	pvd = vd->vdev_parent;
3211 
3212 	/*
3213 	 * If the parent/child relationship is not as expected, don't do it.
3214 	 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
3215 	 * vdev that's replacing B with C.  The user's intent in replacing
3216 	 * is to go from M(A,B) to M(A,C).  If the user decides to cancel
3217 	 * the replace by detaching C, the expected behavior is to end up
3218 	 * M(A,B).  But suppose that right after deciding to detach C,
3219 	 * the replacement of B completes.  We would have M(A,C), and then
3220 	 * ask to detach C, which would leave us with just A -- not what
3221 	 * the user wanted.  To prevent this, we make sure that the
3222 	 * parent/child relationship hasn't changed -- in this example,
3223 	 * that C's parent is still the replacing vdev R.
3224 	 */
3225 	if (pvd->vdev_guid != pguid && pguid != 0)
3226 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
3227 
3228 	/*
3229 	 * If replace_done is specified, only remove this device if it's
3230 	 * the first child of a replacing vdev.  For the 'spare' vdev, either
3231 	 * disk can be removed.
3232 	 */
3233 	if (replace_done) {
3234 		if (pvd->vdev_ops == &vdev_replacing_ops) {
3235 			if (vd->vdev_id != 0)
3236 				return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
3237 		} else if (pvd->vdev_ops != &vdev_spare_ops) {
3238 			return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
3239 		}
3240 	}
3241 
3242 	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
3243 	    spa_version(spa) >= SPA_VERSION_SPARES);
3244 
3245 	/*
3246 	 * Only mirror, replacing, and spare vdevs support detach.
3247 	 */
3248 	if (pvd->vdev_ops != &vdev_replacing_ops &&
3249 	    pvd->vdev_ops != &vdev_mirror_ops &&
3250 	    pvd->vdev_ops != &vdev_spare_ops)
3251 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
3252 
3253 	/*
3254 	 * If this device has the only valid copy of some data,
3255 	 * we cannot safely detach it.
3256 	 */
3257 	if (vdev_dtl_required(vd))
3258 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
3259 
3260 	ASSERT(pvd->vdev_children >= 2);
3261 
3262 	/*
3263 	 * If we are detaching the second disk from a replacing vdev, then
3264 	 * check to see if we changed the original vdev's path to have "/old"
3265 	 * at the end in spa_vdev_attach().  If so, undo that change now.
3266 	 */
3267 	if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id == 1 &&
3268 	    pvd->vdev_child[0]->vdev_path != NULL &&
3269 	    pvd->vdev_child[1]->vdev_path != NULL) {
3270 		ASSERT(pvd->vdev_child[1] == vd);
3271 		cvd = pvd->vdev_child[0];
3272 		len = strlen(vd->vdev_path);
3273 		if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
3274 		    strcmp(cvd->vdev_path + len, "/old") == 0) {
3275 			spa_strfree(cvd->vdev_path);
3276 			cvd->vdev_path = spa_strdup(vd->vdev_path);
3277 		}
3278 	}
3279 
3280 	/*
3281 	 * If we are detaching the original disk from a spare, then it implies
3282 	 * that the spare should become a real disk, and be removed from the
3283 	 * active spare list for the pool.
3284 	 */
3285 	if (pvd->vdev_ops == &vdev_spare_ops &&
3286 	    vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare)
3287 		unspare = B_TRUE;
3288 
3289 	/*
3290 	 * Erase the disk labels so the disk can be used for other things.
3291 	 * This must be done after all other error cases are handled,
3292 	 * but before we disembowel vd (so we can still do I/O to it).
3293 	 * But if we can't do it, don't treat the error as fatal --
3294 	 * it may be that the unwritability of the disk is the reason
3295 	 * it's being detached!
3296 	 */
3297 	error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
3298 
3299 	/*
3300 	 * Remove vd from its parent and compact the parent's children.
3301 	 */
3302 	vdev_remove_child(pvd, vd);
3303 	vdev_compact_children(pvd);
3304 
3305 	/*
3306 	 * Remember one of the remaining children so we can get tvd below.
3307 	 */
3308 	cvd = pvd->vdev_child[0];
3309 
3310 	/*
3311 	 * If we need to remove the remaining child from the list of hot spares,
3312 	 * do it now, marking the vdev as no longer a spare in the process.
3313 	 * We must do this before vdev_remove_parent(), because that can
3314 	 * change the GUID if it creates a new toplevel GUID.  For a similar
3315 	 * reason, we must remove the spare now, in the same txg as the detach;
3316 	 * otherwise someone could attach a new sibling, change the GUID, and
3317 	 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
3318 	 */
3319 	if (unspare) {
3320 		ASSERT(cvd->vdev_isspare);
3321 		spa_spare_remove(cvd);
3322 		unspare_guid = cvd->vdev_guid;
3323 		(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
3324 	}
3325 
3326 	/*
3327 	 * If the parent mirror/replacing vdev only has one child,
3328 	 * the parent is no longer needed.  Remove it from the tree.
3329 	 */
3330 	if (pvd->vdev_children == 1)
3331 		vdev_remove_parent(cvd);
3332 
3333 	/*
3334 	 * We don't set tvd until now because the parent we just removed
3335 	 * may have been the previous top-level vdev.
3336 	 */
3337 	tvd = cvd->vdev_top;
3338 	ASSERT(tvd->vdev_parent == rvd);
3339 
3340 	/*
3341 	 * Reevaluate the parent vdev state.
3342 	 */
3343 	vdev_propagate_state(cvd);
3344 
3345 	/*
3346 	 * If the 'autoexpand' property is set on the pool then automatically
3347 	 * try to expand the size of the pool. For example if the device we
3348 	 * just detached was smaller than the others, it may be possible to
3349 	 * add metaslabs (i.e. grow the pool). We need to reopen the vdev
3350 	 * first so that we can obtain the updated sizes of the leaf vdevs.
3351 	 */
3352 	if (spa->spa_autoexpand) {
3353 		vdev_reopen(tvd);
3354 		vdev_expand(tvd, txg);
3355 	}
3356 
3357 	vdev_config_dirty(tvd);
3358 
3359 	/*
3360 	 * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
3361 	 * vd->vdev_detached is set and free vd's DTL object in syncing context.
3362 	 * But first make sure we're not on any *other* txg's DTL list, to
3363 	 * prevent vd from being accessed after it's freed.
3364 	 */
3365 	for (int t = 0; t < TXG_SIZE; t++)
3366 		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
3367 	vd->vdev_detached = B_TRUE;
3368 	vdev_dirty(tvd, VDD_DTL, vd, txg);
3369 
3370 	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
3371 
3372 	error = spa_vdev_exit(spa, vd, txg, 0);
3373 
3374 	/*
3375 	 * If this was the removal of the original device in a hot spare vdev,
3376 	 * then we want to go through and remove the device from the hot spare
3377 	 * list of every other pool.
3378 	 */
3379 	if (unspare) {
3380 		spa_t *myspa = spa;
3381 		spa = NULL;
3382 		mutex_enter(&spa_namespace_lock);
3383 		while ((spa = spa_next(spa)) != NULL) {
3384 			if (spa->spa_state != POOL_STATE_ACTIVE)
3385 				continue;
3386 			if (spa == myspa)
3387 				continue;
3388 			spa_open_ref(spa, FTAG);
3389 			mutex_exit(&spa_namespace_lock);
3390 			(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
3391 			mutex_enter(&spa_namespace_lock);
3392 			spa_close(spa, FTAG);
3393 		}
3394 		mutex_exit(&spa_namespace_lock);
3395 	}
3396 
3397 	return (error);
3398 }
3399 
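/*
 * Return the nvlist in 'nvpp' whose guid matches 'target_guid', or NULL
 * if there is no match.
 */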
3400 static nvlist_t *
3401 spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
3402 {
3403 	for (int i = 0; i < count; i++) {
3404 		uint64_t guid;
3405 
3406 		VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID,
3407 		    &guid) == 0);
3408 
3409 		if (guid == target_guid)
3410 			return (nvpp[i]);
3411 	}
3412 
3413 	return (NULL);
3414 }
3415 
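/*
 * Rewrite the nvlist array 'name' in 'config', omitting 'dev_to_remove'.
 */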
3416 static void
3417 spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
3418 	nvlist_t *dev_to_remove)
3419 {
3420 	nvlist_t **newdev = NULL;
3421 
3422 	if (count > 1)
3423 		newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);
3424 
3425 	for (int i = 0, j = 0; i < count; i++) {
3426 		if (dev[i] == dev_to_remove)
3427 			continue;
3428 		VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
3429 	}
3430 
3431 	VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
3432 	VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);
3433 
3434 	for (int i = 0; i < count - 1; i++)
3435 		nvlist_free(newdev[i]);
3436 
3437 	if (count > 1)
3438 		kmem_free(newdev, (count - 1) * sizeof (void *));
3439 }
3440 
3441 /*
3442  * Remove a device from the pool.  Currently, this supports removing only hot
3443  * spares and level 2 ARC devices.
3444  */
3445 int
3446 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
3447 {
3448 	vdev_t *vd;
3449 	nvlist_t **spares, **l2cache, *nv;
3450 	uint_t nspares, nl2cache;
3451 	uint64_t txg = 0;
3452 	int error = 0;
3453 	boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
3454 
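	/*
	 * Only do the spa_vdev_enter()/spa_vdev_exit() dance ourselves if
	 * the caller doesn't already hold the namespace lock (and thus isn't
	 * already inside a vdev configuration transaction).
	 */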
3455 	if (!locked)
3456 		txg = spa_vdev_enter(spa);
3457 
3458 	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
3459 
3460 	if (spa->spa_spares.sav_vdevs != NULL &&
3461 	    nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
3462 	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
3463 	    (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
3464 		/*
3465 		 * Only remove the hot spare if it's not currently in use
3466 		 * in this pool.
3467 		 */
3468 		if (vd == NULL || unspare) {
3469 			spa_vdev_remove_aux(spa->spa_spares.sav_config,
3470 			    ZPOOL_CONFIG_SPARES, spares, nspares, nv);
3471 			spa_load_spares(spa);
3472 			spa->spa_spares.sav_sync = B_TRUE;
3473 		} else {
3474 			error = EBUSY;
3475 		}
3476 	} else if (spa->spa_l2cache.sav_vdevs != NULL &&
3477 	    nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
3478 	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
3479 	    (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
3480 		/*
3481 		 * Cache devices can always be removed.
3482 		 */
3483 		spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
3484 		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
3485 		spa_load_l2cache(spa);
3486 		spa->spa_l2cache.sav_sync = B_TRUE;
3487 	} else if (vd != NULL) {
3488 		/*
3489 		 * Normal vdevs cannot be removed (yet).
3490 		 */
3491 		error = ENOTSUP;
3492 	} else {
3493 		/*
3494 		 * There is no vdev of any kind with the specified guid.
3495 		 */
3496 		error = ENOENT;
3497 	}
3498 
3499 	if (!locked)
3500 		return (spa_vdev_exit(spa, NULL, txg, error));
3501 
3502 	return (error);
3503 }
3504 
3505 /*
3506  * Find any device that's done replacing, or a vdev marked 'unspare' that's
3507  * currently spared, so we can detach it.
3508  */
3509 static vdev_t *
3510 spa_vdev_resilver_done_hunt(vdev_t *vd)
3511 {
3512 	vdev_t *newvd, *oldvd;
3513 
3514 	for (int c = 0; c < vd->vdev_children; c++) {
3515 		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
3516 		if (oldvd != NULL)
3517 			return (oldvd);
3518 	}
3519 
3520 	/*
3521 	 * Check for a completed replacement.
3522 	 */
3523 	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
3524 		oldvd = vd->vdev_child[0];
3525 		newvd = vd->vdev_child[1];
3526 
3527 		if (vdev_dtl_empty(newvd, DTL_MISSING) &&
3528 		    !vdev_dtl_required(oldvd))
3529 			return (oldvd);
3530 	}
3531 
3532 	/*
3533 	 * Check for a completed resilver with the 'unspare' flag set.
3534 	 */
3535 	if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) {
3536 		newvd = vd->vdev_child[0];
3537 		oldvd = vd->vdev_child[1];
3538 
3539 		if (newvd->vdev_unspare &&
3540 		    vdev_dtl_empty(newvd, DTL_MISSING) &&
3541 		    !vdev_dtl_required(oldvd)) {
3542 			newvd->vdev_unspare = 0;
3543 			return (oldvd);
3544 		}
3545 	}
3546 
3547 	return (NULL);
3548 }
3549 
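/*
 * Detach each device found by spa_vdev_resilver_done_hunt().  When a hot
 * spared device has just finished being replaced, the spare is detached
 * as well.
 */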
3550 static void
3551 spa_vdev_resilver_done(spa_t *spa)
3552 {
3553 	vdev_t *vd, *pvd, *ppvd;
3554 	uint64_t guid, sguid, pguid, ppguid;
3555 
3556 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3557 
3558 	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
3559 		pvd = vd->vdev_parent;
3560 		ppvd = pvd->vdev_parent;
3561 		guid = vd->vdev_guid;
3562 		pguid = pvd->vdev_guid;
3563 		ppguid = ppvd->vdev_guid;
3564 		sguid = 0;
3565 		/*
3566 		 * If we have just finished replacing a hot spared device, then
3567 		 * we need to detach the parent's first child (the original hot
3568 		 * spare) as well.
3569 		 */
3570 		if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) {
3571 			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
3572 			ASSERT(ppvd->vdev_children == 2);
3573 			sguid = ppvd->vdev_child[1]->vdev_guid;
3574 		}
3575 		spa_config_exit(spa, SCL_ALL, FTAG);
3576 		if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
3577 			return;
3578 		if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
3579 			return;
3580 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3581 	}
3582 
3583 	spa_config_exit(spa, SCL_ALL, FTAG);
3584 }
3585 
3586 /*
3587  * Update the stored path or FRU for this vdev.  Dirty the vdev configuration,
3588  * relying on spa_vdev_enter/exit() to synchronize the labels and cache.
3589  */
3590 int
3591 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
3592     boolean_t ispath)
3593 {
3594 	vdev_t *vd;
3595 	uint64_t txg;
3596 
3597 	txg = spa_vdev_enter(spa);
3598 
3599 	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
3600 		return (spa_vdev_exit(spa, NULL, txg, ENOENT));
3601 
3602 	if (!vd->vdev_ops->vdev_op_leaf)
3603 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
3604 
3605 	if (ispath) {
3606 		spa_strfree(vd->vdev_path);
3607 		vd->vdev_path = spa_strdup(value);
3608 	} else {
3609 		if (vd->vdev_fru != NULL)
3610 			spa_strfree(vd->vdev_fru);
3611 		vd->vdev_fru = spa_strdup(value);
3612 	}
3613 
3614 	vdev_config_dirty(vd->vdev_top);
3615 
3616 	return (spa_vdev_exit(spa, NULL, txg, 0));
3617 }
3618 
3619 int
3620 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
3621 {
3622 	return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
3623 }
3624 
3625 int
3626 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
3627 {
3628 	return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
3629 }
3630 
3631 /*
3632  * ==========================================================================
3633  * SPA Scrubbing
3634  * ==========================================================================
3635  */
3636 
3637 int
3638 spa_scrub(spa_t *spa, pool_scrub_type_t type)
3639 {
3640 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
3641 
3642 	if ((uint_t)type >= POOL_SCRUB_TYPES)
3643 		return (ENOTSUP);
3644 
3645 	/*
3646 	 * If a resilver was requested, but there is no DTL on a
3647 	 * writeable leaf device, we have nothing to do.
3648 	 */
3649 	if (type == POOL_SCRUB_RESILVER &&
3650 	    !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
3651 		spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
3652 		return (0);
3653 	}
3654 
3655 	if (type == POOL_SCRUB_EVERYTHING &&
3656 	    spa->spa_dsl_pool->dp_scrub_func != SCRUB_FUNC_NONE &&
3657 	    spa->spa_dsl_pool->dp_scrub_isresilver)
3658 		return (EBUSY);
3659 
3660 	if (type == POOL_SCRUB_EVERYTHING || type == POOL_SCRUB_RESILVER) {
3661 		return (dsl_pool_scrub_clean(spa->spa_dsl_pool));
3662 	} else if (type == POOL_SCRUB_NONE) {
3663 		return (dsl_pool_scrub_cancel(spa->spa_dsl_pool));
3664 	} else {
3665 		return (EINVAL);
3666 	}
3667 }
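
/*
 * Illustrative usage (hypothetical caller, not part of this file):
 *
 *	error = spa_scrub(spa, POOL_SCRUB_EVERYTHING);
 *	if (error == EBUSY)
 *		a resilver is already in progress; retry later
 *
 *	(void) spa_scrub(spa, POOL_SCRUB_NONE);	cancels an active scrub
 *
 * Both requests return immediately; the scrub itself is driven by the DSL
 * pool scrub code (dsl_pool_scrub_clean()/dsl_pool_scrub_cancel()).
 */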
3668 
3669 /*
3670  * ==========================================================================
3671  * SPA async task processing
3672  * ==========================================================================
3673  */
3674 
3675 static void
3676 spa_async_remove(spa_t *spa, vdev_t *vd)
3677 {
3678 	if (vd->vdev_remove_wanted) {
3679 		vd->vdev_remove_wanted = 0;
3680 		vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
3681 		vdev_clear(spa, vd);
3682 		vdev_state_dirty(vd->vdev_top);
3683 	}
3684 
3685 	for (int c = 0; c < vd->vdev_children; c++)
3686 		spa_async_remove(spa, vd->vdev_child[c]);
3687 }
3688 
3689 static void
3690 spa_async_probe(spa_t *spa, vdev_t *vd)
3691 {
3692 	if (vd->vdev_probe_wanted) {
3693 		vd->vdev_probe_wanted = 0;
3694 		vdev_reopen(vd);	/* vdev_open() does the actual probe */
3695 	}
3696 
3697 	for (int c = 0; c < vd->vdev_children; c++)
3698 		spa_async_probe(spa, vd->vdev_child[c]);
3699 }
3700 
3701 static void
3702 spa_async_autoexpand(spa_t *spa, vdev_t *vd)
3703 {
3704 	sysevent_id_t eid;
3705 	nvlist_t *attr;
3706 	char *physpath;
3707 
3708 	if (!spa->spa_autoexpand)
3709 		return;
3710 
3711 	for (int c = 0; c < vd->vdev_children; c++) {
3712 		vdev_t *cvd = vd->vdev_child[c];
3713 		spa_async_autoexpand(spa, cvd);
3714 	}
3715 
3716 	if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
3717 		return;
3718 
3719 	physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
3720 	(void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath);
3721 
3722 	VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3723 	VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
3724 
3725 	(void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
3726 	    ESC_DEV_DLE, attr, &eid, DDI_SLEEP);
3727 
3728 	nvlist_free(attr);
3729 	kmem_free(physpath, MAXPATHLEN);
3730 }
3731 
3732 static void
3733 spa_async_thread(spa_t *spa)
3734 {
3735 	int tasks;
3736 
3737 	ASSERT(spa->spa_sync_on);
3738 
3739 	mutex_enter(&spa->spa_async_lock);
3740 	tasks = spa->spa_async_tasks;
3741 	spa->spa_async_tasks = 0;
3742 	mutex_exit(&spa->spa_async_lock);
3743 
3744 	/*
3745 	 * See if the config needs to be updated.
3746 	 */
3747 	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
3748 		uint64_t oldsz, space_update;
3749 
3750 		mutex_enter(&spa_namespace_lock);
3751 		oldsz = spa_get_space(spa);
3752 		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
3753 		space_update = spa_get_space(spa) - oldsz;
3754 		mutex_exit(&spa_namespace_lock);
3755 
3756 		/*
3757 		 * If the pool grew as a result of the config update,
3758 		 * then log an internal history event.
3759 		 */
3760 		if (space_update) {
3761 			spa_history_internal_log(LOG_POOL_VDEV_ONLINE,
3762 			    spa, NULL, CRED(),
3763 			    "pool '%s' size: %llu(+%llu)",
3764 			    spa_name(spa), spa_get_space(spa),
3765 			    space_update);
3766 		}
3767 	}
3768 
3769 	/*
3770 	 * See if any devices need to be marked REMOVED.
3771 	 */
3772 	if (tasks & SPA_ASYNC_REMOVE) {
3773 		spa_vdev_state_enter(spa);
3774 		spa_async_remove(spa, spa->spa_root_vdev);
3775 		for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
3776 			spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
3777 		for (int i = 0; i < spa->spa_spares.sav_count; i++)
3778 			spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
3779 		(void) spa_vdev_state_exit(spa, NULL, 0);
3780 	}
3781 
3782 	if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
3783 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
3784 		spa_async_autoexpand(spa, spa->spa_root_vdev);
3785 		spa_config_exit(spa, SCL_CONFIG, FTAG);
3786 	}
3787 
3788 	/*
3789 	 * See if any devices need to be probed.
3790 	 */
3791 	if (tasks & SPA_ASYNC_PROBE) {
3792 		spa_vdev_state_enter(spa);
3793 		spa_async_probe(spa, spa->spa_root_vdev);
3794 		(void) spa_vdev_state_exit(spa, NULL, 0);
3795 	}
3796 
3797 	/*
3798 	 * If any devices are done replacing, detach them.
3799 	 */
3800 	if (tasks & SPA_ASYNC_RESILVER_DONE)
3801 		spa_vdev_resilver_done(spa);
3802 
3803 	/*
3804 	 * Kick off a resilver.
3805 	 */
3806 	if (tasks & SPA_ASYNC_RESILVER)
3807 		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER) == 0);
3808 
3809 	/*
3810 	 * Let the world know that we're done.
3811 	 */
3812 	mutex_enter(&spa->spa_async_lock);
3813 	spa->spa_async_thread = NULL;
3814 	cv_broadcast(&spa->spa_async_cv);
3815 	mutex_exit(&spa->spa_async_lock);
3816 	thread_exit();
3817 }
3818 
3819 void
3820 spa_async_suspend(spa_t *spa)
3821 {
3822 	mutex_enter(&spa->spa_async_lock);
3823 	spa->spa_async_suspended++;
3824 	while (spa->spa_async_thread != NULL)
3825 		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
3826 	mutex_exit(&spa->spa_async_lock);
3827 }
3828 
3829 void
3830 spa_async_resume(spa_t *spa)
3831 {
3832 	mutex_enter(&spa->spa_async_lock);
3833 	ASSERT(spa->spa_async_suspended != 0);
3834 	spa->spa_async_suspended--;
3835 	mutex_exit(&spa->spa_async_lock);
3836 }
3837 
3838 static void
3839 spa_async_dispatch(spa_t *spa)
3840 {
3841 	mutex_enter(&spa->spa_async_lock);
3842 	if (spa->spa_async_tasks && !spa->spa_async_suspended &&
3843 	    spa->spa_async_thread == NULL &&
3844 	    rootdir != NULL && !vn_is_readonly(rootdir))
3845 		spa->spa_async_thread = thread_create(NULL, 0,
3846 		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
3847 	mutex_exit(&spa->spa_async_lock);
3848 }
3849 
3850 void
3851 spa_async_request(spa_t *spa, int task)
3852 {
3853 	mutex_enter(&spa->spa_async_lock);
3854 	spa->spa_async_tasks |= task;
3855 	mutex_exit(&spa->spa_async_lock);
3856 }
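
/*
 * Illustrative usage (hypothetical caller): async work is requested by
 * OR-ing a task bit into spa_async_tasks; it is picked up the next time
 * spa_async_dispatch() runs, at the end of spa_sync() below:
 *
 *	spa_async_request(spa, SPA_ASYNC_RESILVER);
 *
 * The request is cheap and idempotent.  If async processing is suspended,
 * or the root filesystem is read-only, the task bits simply remain set
 * until dispatch becomes possible again.
 */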
3857 
3858 /*
3859  * ==========================================================================
3860  * SPA syncing routines
3861  * ==========================================================================
3862  */
3863 
3864 static void
3865 spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
3866 {
3867 	bplist_t *bpl = &spa->spa_sync_bplist;
3868 	dmu_tx_t *tx;
3869 	blkptr_t blk;
3870 	uint64_t itor = 0;
3871 	zio_t *zio;
3872 	int error;
3873 	uint8_t c = 1;
3874 
3875 	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
3876 
3877 	while (bplist_iterate(bpl, &itor, &blk) == 0) {
3878 		ASSERT(blk.blk_birth < txg);
3879 		zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL,
3880 		    ZIO_FLAG_MUSTSUCCEED));
3881 	}
3882 
3883 	error = zio_wait(zio);
3884 	ASSERT3U(error, ==, 0);
3885 
3886 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
3887 	bplist_vacate(bpl, tx);
3888 
3889 	/*
3890 	 * Pre-dirty the first block so we sync to convergence faster.
3891 	 * (Usually only the first block is needed.)
3892 	 */
3893 	dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
3894 	dmu_tx_commit(tx);
3895 }
3896 
3897 static void
3898 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
3899 {
3900 	char *packed = NULL;
3901 	size_t bufsize;
3902 	size_t nvsize = 0;
3903 	dmu_buf_t *db;
3904 
3905 	VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
3906 
3907 	/*
3908 	 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
3909 	 * information.  This avoids the dbuf_will_dirty() path and
3910 	 * saves us a pre-read to get data we don't actually care about.
3911 	 */
3912 	bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE);
3913 	packed = kmem_alloc(bufsize, KM_SLEEP);
3914 
3915 	VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
3916 	    KM_SLEEP) == 0);
3917 	bzero(packed + nvsize, bufsize - nvsize);
3918 
3919 	dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
3920 
3921 	kmem_free(packed, bufsize);
3922 
3923 	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
3924 	dmu_buf_will_dirty(db, tx);
3925 	*(uint64_t *)db->db_data = nvsize;
3926 	dmu_buf_rele(db, FTAG);
3927 }
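
/*
 * Worked example of the rounding above, assuming (for illustration only)
 * that SPA_CONFIG_BLOCKSIZE is 16K: with nvsize == 5000,
 * bufsize == P2ROUNDUP(5000, 16384) == 16384, so exactly one full block is
 * written and the trailing 11384 bytes are zeroed by the bzero() before the
 * dmu_write(), keeping the write block-aligned and avoiding a pre-read of
 * the existing object contents.
 */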
3928 
3929 static void
3930 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
3931     const char *config, const char *entry)
3932 {
3933 	nvlist_t *nvroot;
3934 	nvlist_t **list;
3935 	int i;
3936 
3937 	if (!sav->sav_sync)
3938 		return;
3939 
3940 	/*
3941 	 * Update the MOS nvlist describing the list of available devices.
3942 	 * spa_validate_aux() will have already made sure this nvlist is
3943 	 * valid and the vdevs are labeled appropriately.
3944 	 */
3945 	if (sav->sav_object == 0) {
3946 		sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
3947 		    DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
3948 		    sizeof (uint64_t), tx);
3949 		VERIFY(zap_update(spa->spa_meta_objset,
3950 		    DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
3951 		    &sav->sav_object, tx) == 0);
3952 	}
3953 
3954 	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3955 	if (sav->sav_count == 0) {
3956 		VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
3957 	} else {
3958 		list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
3959 		for (i = 0; i < sav->sav_count; i++)
3960 			list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
3961 			    B_FALSE, B_FALSE, B_TRUE);
3962 		VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
3963 		    sav->sav_count) == 0);
3964 		for (i = 0; i < sav->sav_count; i++)
3965 			nvlist_free(list[i]);
3966 		kmem_free(list, sav->sav_count * sizeof (void *));
3967 	}
3968 
3969 	spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
3970 	nvlist_free(nvroot);
3971 
3972 	sav->sav_sync = B_FALSE;
3973 }
3974 
3975 static void
3976 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
3977 {
3978 	nvlist_t *config;
3979 
3980 	if (list_is_empty(&spa->spa_config_dirty_list))
3981 		return;
3982 
3983 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
3984 
3985 	config = spa_config_generate(spa, spa->spa_root_vdev,
3986 	    dmu_tx_get_txg(tx), B_FALSE);
3987 
3988 	spa_config_exit(spa, SCL_STATE, FTAG);
3989 
3990 	if (spa->spa_config_syncing)
3991 		nvlist_free(spa->spa_config_syncing);
3992 	spa->spa_config_syncing = config;
3993 
3994 	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
3995 }
3996 
3997 /*
3998  * Set zpool properties.
3999  */
4000 static void
4001 spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
4002 {
4003 	spa_t *spa = arg1;
4004 	objset_t *mos = spa->spa_meta_objset;
4005 	nvlist_t *nvp = arg2;
4006 	nvpair_t *elem;
4007 	uint64_t intval;
4008 	char *strval;
4009 	zpool_prop_t prop;
4010 	const char *propname;
4011 	zprop_type_t proptype;
4012 
4013 	mutex_enter(&spa->spa_props_lock);
4014 
4015 	elem = NULL;
4016 	while ((elem = nvlist_next_nvpair(nvp, elem))) {
4017 		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
4018 		case ZPOOL_PROP_VERSION:
4019 			/*
4020 			 * Only set version for non-zpool-creation cases
4021 			 * (set/import). spa_create() needs special care
4022 			 * for version setting.
4023 			 */
4024 			if (tx->tx_txg != TXG_INITIAL) {
4025 				VERIFY(nvpair_value_uint64(elem,
4026 				    &intval) == 0);
4027 				ASSERT(intval <= SPA_VERSION);
4028 				ASSERT(intval >= spa_version(spa));
4029 				spa->spa_uberblock.ub_version = intval;
4030 				vdev_config_dirty(spa->spa_root_vdev);
4031 			}
4032 			break;
4033 
4034 		case ZPOOL_PROP_ALTROOT:
4035 			/*
4036 			 * 'altroot' is a non-persistent property. It should
4037 			 * have been set temporarily at creation or import time.
4038 			 */
4039 			ASSERT(spa->spa_root != NULL);
4040 			break;
4041 
4042 		case ZPOOL_PROP_CACHEFILE:
4043 			/*
4044 			 * 'cachefile' is also a non-persistent property.
4045 			 */
4046 			break;
4047 		default:
4048 			/*
4049 			 * Set pool property values in the poolprops mos object.
4050 			 */
4051 			if (spa->spa_pool_props_object == 0) {
4052 				objset_t *mos = spa->spa_meta_objset;
4053 
4054 				VERIFY((spa->spa_pool_props_object =
4055 				    zap_create(mos, DMU_OT_POOL_PROPS,
4056 				    DMU_OT_NONE, 0, tx)) > 0);
4057 
4058 				VERIFY(zap_update(mos,
4059 				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
4060 				    8, 1, &spa->spa_pool_props_object, tx)
4061 				    == 0);
4062 			}
4063 
4064 			/* normalize the property name */
4065 			propname = zpool_prop_to_name(prop);
4066 			proptype = zpool_prop_get_type(prop);
4067 
4068 			if (nvpair_type(elem) == DATA_TYPE_STRING) {
4069 				ASSERT(proptype == PROP_TYPE_STRING);
4070 				VERIFY(nvpair_value_string(elem, &strval) == 0);
4071 				VERIFY(zap_update(mos,
4072 				    spa->spa_pool_props_object, propname,
4073 				    1, strlen(strval) + 1, strval, tx) == 0);
4074 
4075 			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
4076 				VERIFY(nvpair_value_uint64(elem, &intval) == 0);
4077 
4078 				if (proptype == PROP_TYPE_INDEX) {
4079 					const char *unused;
4080 					VERIFY(zpool_prop_index_to_string(
4081 					    prop, intval, &unused) == 0);
4082 				}
4083 				VERIFY(zap_update(mos,
4084 				    spa->spa_pool_props_object, propname,
4085 				    8, 1, &intval, tx) == 0);
4086 			} else {
4087 				ASSERT(0); /* not allowed */
4088 			}
4089 
4090 			switch (prop) {
4091 			case ZPOOL_PROP_DELEGATION:
4092 				spa->spa_delegation = intval;
4093 				break;
4094 			case ZPOOL_PROP_BOOTFS:
4095 				spa->spa_bootfs = intval;
4096 				break;
4097 			case ZPOOL_PROP_FAILUREMODE:
4098 				spa->spa_failmode = intval;
4099 				break;
4100 			case ZPOOL_PROP_AUTOEXPAND:
4101 				spa->spa_autoexpand = intval;
4102 				spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
4103 				break;
4104 			default:
4105 				break;
4106 			}
4107 		}
4108 
4109 		/* log internal history if this is not a zpool create */
4110 		if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY &&
4111 		    tx->tx_txg != TXG_INITIAL) {
4112 			spa_history_internal_log(LOG_POOL_PROPSET,
4113 			    spa, tx, cr, "%s %lld %s",
4114 			    nvpair_name(elem), intval, spa_name(spa));
4115 		}
4116 	}
4117 
4118 	mutex_exit(&spa->spa_props_lock);
4119 }
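
/*
 * Illustrative sketch (hypothetical caller, not part of this file) of the
 * props nvlist this sync task consumes; names come from zpool_prop_to_name()
 * and values from the usual nvlist_add_* routines:
 *
 *	nvlist_t *nvp;
 *	VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *	VERIFY(nvlist_add_uint64(nvp,
 *	    zpool_prop_to_name(ZPOOL_PROP_AUTOEXPAND), 1) == 0);
 *
 * When such a pair reaches spa_sync_props(), it takes the DATA_TYPE_UINT64
 * branch, is recorded in the poolprops ZAP object, sets spa->spa_autoexpand,
 * and queues SPA_ASYNC_AUTOEXPAND.
 */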
4120 
4121 /*
4122  * Sync the specified transaction group.  New blocks may be dirtied as
4123  * part of the process, so we iterate until it converges.
4124  */
4125 void
4126 spa_sync(spa_t *spa, uint64_t txg)
4127 {
4128 	dsl_pool_t *dp = spa->spa_dsl_pool;
4129 	objset_t *mos = spa->spa_meta_objset;
4130 	bplist_t *bpl = &spa->spa_sync_bplist;
4131 	vdev_t *rvd = spa->spa_root_vdev;
4132 	vdev_t *vd;
4133 	dmu_tx_t *tx;
4134 	int dirty_vdevs;
4135 	int error;
4136 
4137 	/*
4138 	 * Lock out configuration changes.
4139 	 */
4140 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4141 
4142 	spa->spa_syncing_txg = txg;
4143 	spa->spa_sync_pass = 0;
4144 
4145 	/*
4146 	 * If there are any pending vdev state changes, convert them
4147 	 * into config changes that go out with this transaction group.
4148 	 */
4149 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
4150 	while (list_head(&spa->spa_state_dirty_list) != NULL) {
4151 		/*
4152 		 * We need the write lock here because, for aux vdevs,
4153 		 * calling vdev_config_dirty() modifies sav_config.
4154 		 * This is ugly and will become unnecessary when we
4155 		 * eliminate the aux vdev wart by integrating all vdevs
4156 		 * into the root vdev tree.
4157 		 */
4158 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
4159 		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
4160 		while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
4161 			vdev_state_clean(vd);
4162 			vdev_config_dirty(vd);
4163 		}
4164 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
4165 		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
4166 	}
4167 	spa_config_exit(spa, SCL_STATE, FTAG);
4168 
4169 	VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));
4170 
4171 	tx = dmu_tx_create_assigned(dp, txg);
4172 
4173 	/*
4174 	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
4175 	 * set spa_deflate if we have no raid-z vdevs.
4176 	 */
4177 	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
4178 	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
4179 		int i;
4180 
4181 		for (i = 0; i < rvd->vdev_children; i++) {
4182 			vd = rvd->vdev_child[i];
4183 			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
4184 				break;
4185 		}
4186 		if (i == rvd->vdev_children) {
4187 			spa->spa_deflate = TRUE;
4188 			VERIFY(0 == zap_add(spa->spa_meta_objset,
4189 			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
4190 			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
4191 		}
4192 	}
4193 
4194 	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
4195 	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
4196 		dsl_pool_create_origin(dp, tx);
4197 
4198 		/* Keeping the origin open increases spa_minref */
4199 		spa->spa_minref += 3;
4200 	}
4201 
4202 	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
4203 	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
4204 		dsl_pool_upgrade_clones(dp, tx);
4205 	}
4206 
4207 	/*
4208 	 * If anything has changed in this txg, push the deferred frees
4209 	 * from the previous txg.  If not, leave them alone so that we
4210 	 * don't generate work on an otherwise idle system.
4211 	 */
4212 	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
4213 	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
4214 	    !txg_list_empty(&dp->dp_sync_tasks, txg))
4215 		spa_sync_deferred_frees(spa, txg);
4216 
4217 	/*
4218 	 * Iterate to convergence.
4219 	 */
4220 	do {
4221 		spa->spa_sync_pass++;
4222 
4223 		spa_sync_config_object(spa, tx);
4224 		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
4225 		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
4226 		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
4227 		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
4228 		spa_errlog_sync(spa, txg);
4229 		dsl_pool_sync(dp, txg);
4230 
4231 		dirty_vdevs = 0;
4232 		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
4233 			vdev_sync(vd, txg);
4234 			dirty_vdevs++;
4235 		}
4236 
4237 		bplist_sync(bpl, tx);
4238 	} while (dirty_vdevs);
4239 
4240 	bplist_close(bpl);
4241 
4242 	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);
4243 
4244 	/*
4245 	 * Rewrite the vdev configuration (which includes the uberblock)
4246 	 * to commit the transaction group.
4247 	 *
4248 	 * If there are no dirty vdevs, we sync the uberblock to a few
4249 	 * random top-level vdevs that are known to be visible in the
4250 	 * config cache (see spa_vdev_add() for a complete description).
4251 	 * If there *are* dirty vdevs, sync the uberblock to all vdevs.
4252 	 */
4253 	for (;;) {
4254 		/*
4255 		 * We hold SCL_STATE to prevent vdev open/close/etc.
4256 		 * while we're attempting to write the vdev labels.
4257 		 */
4258 		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
4259 
4260 		if (list_is_empty(&spa->spa_config_dirty_list)) {
4261 			vdev_t *svd[SPA_DVAS_PER_BP];
4262 			int svdcount = 0;
4263 			int children = rvd->vdev_children;
4264 			int c0 = spa_get_random(children);
4265 
4266 			for (int c = 0; c < children; c++) {
4267 				vd = rvd->vdev_child[(c0 + c) % children];
4268 				if (vd->vdev_ms_array == 0 || vd->vdev_islog)
4269 					continue;
4270 				svd[svdcount++] = vd;
4271 				if (svdcount == SPA_DVAS_PER_BP)
4272 					break;
4273 			}
4274 			error = vdev_config_sync(svd, svdcount, txg, B_FALSE);
4275 			if (error != 0)
4276 				error = vdev_config_sync(svd, svdcount, txg,
4277 				    B_TRUE);
4278 		} else {
4279 			error = vdev_config_sync(rvd->vdev_child,
4280 			    rvd->vdev_children, txg, B_FALSE);
4281 			if (error != 0)
4282 				error = vdev_config_sync(rvd->vdev_child,
4283 				    rvd->vdev_children, txg, B_TRUE);
4284 		}
4285 
4286 		spa_config_exit(spa, SCL_STATE, FTAG);
4287 
4288 		if (error == 0)
4289 			break;
4290 		zio_suspend(spa, NULL);
4291 		zio_resume_wait(spa);
4292 	}
4293 	dmu_tx_commit(tx);
4294 
4295 	/*
4296 	 * Clear the dirty config list.
4297 	 */
4298 	while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
4299 		vdev_config_clean(vd);
4300 
4301 	/*
4302 	 * Now that the new config has synced transactionally,
4303 	 * let it become visible to the config cache.
4304 	 */
4305 	if (spa->spa_config_syncing != NULL) {
4306 		spa_config_set(spa, spa->spa_config_syncing);
4307 		spa->spa_config_txg = txg;
4308 		spa->spa_config_syncing = NULL;
4309 	}
4310 
4311 	spa->spa_ubsync = spa->spa_uberblock;
4312 
4313 	/*
4314 	 * Clean up the ZIL records for the synced txg.
4315 	 */
4316 	dsl_pool_zil_clean(dp);
4317 
4318 	/*
4319 	 * Update usable space statistics.
4320 	 */
4321 	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
4322 		vdev_sync_done(vd, txg);
4323 
4324 	/*
4325 	 * It had better be the case that we didn't dirty anything
4326 	 * since vdev_config_sync().
4327 	 */
4328 	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
4329 	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
4330 	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
4331 	ASSERT(bpl->bpl_queue == NULL);
4332 
4333 	spa_config_exit(spa, SCL_CONFIG, FTAG);
4334 
4335 	/*
4336 	 * If any async tasks have been requested, kick them off.
4337 	 */
4338 	spa_async_dispatch(spa);
4339 }
4340 
4341 /*
4342  * Sync all pools.  We don't want to hold the namespace lock across these
4343  * operations, so we take a reference on the spa_t and drop the lock during the
4344  * sync.
4345  */
4346 void
4347 spa_sync_allpools(void)
4348 {
4349 	spa_t *spa = NULL;
4350 	mutex_enter(&spa_namespace_lock);
4351 	while ((spa = spa_next(spa)) != NULL) {
4352 		if (spa_state(spa) != POOL_STATE_ACTIVE || spa_suspended(spa))
4353 			continue;
4354 		spa_open_ref(spa, FTAG);
4355 		mutex_exit(&spa_namespace_lock);
4356 		txg_wait_synced(spa_get_dsl(spa), 0);
4357 		mutex_enter(&spa_namespace_lock);
4358 		spa_close(spa, FTAG);
4359 	}
4360 	mutex_exit(&spa_namespace_lock);
4361 }
4362 
4363 /*
4364  * ==========================================================================
4365  * Miscellaneous routines
4366  * ==========================================================================
4367  */
4368 
4369 /*
4370  * Remove all pools in the system.
4371  */
4372 void
4373 spa_evict_all(void)
4374 {
4375 	spa_t *spa;
4376 
4377 	/*
4378 	 * Remove all cached state.  All pools should be closed now,
4379 	 * so every spa in the AVL tree should be unreferenced.
4380 	 */
4381 	mutex_enter(&spa_namespace_lock);
4382 	while ((spa = spa_next(NULL)) != NULL) {
4383 		/*
4384 		 * Stop async tasks.  The async thread may need to detach
4385 		 * a device that's been replaced, which requires grabbing
4386 		 * spa_namespace_lock, so we must drop it here.
4387 		 */
4388 		spa_open_ref(spa, FTAG);
4389 		mutex_exit(&spa_namespace_lock);
4390 		spa_async_suspend(spa);
4391 		mutex_enter(&spa_namespace_lock);
4392 		spa_close(spa, FTAG);
4393 
4394 		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
4395 			spa_unload(spa);
4396 			spa_deactivate(spa);
4397 		}
4398 		spa_remove(spa);
4399 	}
4400 	mutex_exit(&spa_namespace_lock);
4401 }
4402 
4403 vdev_t *
4404 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
4405 {
4406 	vdev_t *vd;
4407 	int i;
4408 
4409 	if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
4410 		return (vd);
4411 
4412 	if (aux) {
4413 		for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
4414 			vd = spa->spa_l2cache.sav_vdevs[i];
4415 			if (vd->vdev_guid == guid)
4416 				return (vd);
4417 		}
4418 
4419 		for (i = 0; i < spa->spa_spares.sav_count; i++) {
4420 			vd = spa->spa_spares.sav_vdevs[i];
4421 			if (vd->vdev_guid == guid)
4422 				return (vd);
4423 		}
4424 	}
4425 
4426 	return (NULL);
4427 }
4428 
4429 void
4430 spa_upgrade(spa_t *spa, uint64_t version)
4431 {
4432 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4433 
4434 	/*
4435 	 * This should only be called for a non-faulted pool, and since a
4436 	 * future version would result in an unopenable pool, this shouldn't be
4437 	 * possible.
4438 	 */
4439 	ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
4440 	ASSERT(version >= spa->spa_uberblock.ub_version);
4441 
4442 	spa->spa_uberblock.ub_version = version;
4443 	vdev_config_dirty(spa->spa_root_vdev);
4444 
4445 	spa_config_exit(spa, SCL_ALL, FTAG);
4446 
4447 	txg_wait_synced(spa_get_dsl(spa), 0);
4448 }
4449 
4450 boolean_t
4451 spa_has_spare(spa_t *spa, uint64_t guid)
4452 {
4453 	int i;
4454 	uint64_t spareguid;
4455 	spa_aux_vdev_t *sav = &spa->spa_spares;
4456 
4457 	for (i = 0; i < sav->sav_count; i++)
4458 		if (sav->sav_vdevs[i]->vdev_guid == guid)
4459 			return (B_TRUE);
4460 
4461 	for (i = 0; i < sav->sav_npending; i++) {
4462 		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
4463 		    &spareguid) == 0 && spareguid == guid)
4464 			return (B_TRUE);
4465 	}
4466 
4467 	return (B_FALSE);
4468 }
4469 
4470 /*
4471  * Check if a pool has an active shared spare device.
4472  * Note: an active spare is referenced twice, as a spare and as a replacing vdev.
4473  */
4474 static boolean_t
4475 spa_has_active_shared_spare(spa_t *spa)
4476 {
4477 	int i, refcnt;
4478 	uint64_t pool;
4479 	spa_aux_vdev_t *sav = &spa->spa_spares;
4480 
4481 	for (i = 0; i < sav->sav_count; i++) {
4482 		if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
4483 		    &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
4484 		    refcnt > 2)
4485 			return (B_TRUE);
4486 	}
4487 
4488 	return (B_FALSE);
4489 }
4490 
4491 /*
4492  * Post a sysevent corresponding to the given event.  The 'name' must be one of
4493  * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
4494  * filled in from the spa and (optionally) the vdev.  This doesn't do anything
4495  * in the userland libzpool, as we don't want consumers to misinterpret ztest
4496  * or zdb as real changes.
4497  */
4498 void
4499 spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
4500 {
4501 #ifdef _KERNEL
4502 	sysevent_t		*ev;
4503 	sysevent_attr_list_t	*attr = NULL;
4504 	sysevent_value_t	value;
4505 	sysevent_id_t		eid;
4506 
4507 	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
4508 	    SE_SLEEP);
4509 
4510 	value.value_type = SE_DATA_TYPE_STRING;
4511 	value.value.sv_string = spa_name(spa);
4512 	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
4513 		goto done;
4514 
4515 	value.value_type = SE_DATA_TYPE_UINT64;
4516 	value.value.sv_uint64 = spa_guid(spa);
4517 	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
4518 		goto done;
4519 
4520 	if (vd) {
4521 		value.value_type = SE_DATA_TYPE_UINT64;
4522 		value.value.sv_uint64 = vd->vdev_guid;
4523 		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
4524 		    SE_SLEEP) != 0)
4525 			goto done;
4526 
4527 		if (vd->vdev_path) {
4528 			value.value_type = SE_DATA_TYPE_STRING;
4529 			value.value.sv_string = vd->vdev_path;
4530 			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
4531 			    &value, SE_SLEEP) != 0)
4532 				goto done;
4533 		}
4534 	}
4535 
4536 	if (sysevent_attach_attributes(ev, attr) != 0)
4537 		goto done;
4538 	attr = NULL;
4539 
4540 	(void) log_sysevent(ev, SE_SLEEP, &eid);
4541 
4542 done:
4543 	if (attr)
4544 		sysevent_free_attr(attr);
4545 	sysevent_free(ev);
4546 #endif
4547 }
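
/*
 * Illustrative usage (hypothetical caller, not part of this file): posting
 * a device event for a vdev, using one of the EC_ZFS event names defined in
 * sys/sysevent/eventdefs.h (ESC_ZFS_VDEV_REMOVE is assumed here as an
 * example):
 *
 *	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
 *
 * In the userland libzpool build this is a no-op, as noted above.
 */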
4548