xref: /titanic_41/usr/src/uts/common/fs/zfs/spa.c (revision 69dc090717e863b0d20b53b7b1e576d25aa0b28d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * This file contains all the routines used when modifying on-disk SPA state.
31  * This includes opening, importing, destroying, exporting a pool, and syncing a
32  * pool.
33  */
34 
35 #include <sys/zfs_context.h>
36 #include <sys/fm/fs/zfs.h>
37 #include <sys/spa_impl.h>
38 #include <sys/zio.h>
39 #include <sys/zio_checksum.h>
40 #include <sys/zio_compress.h>
41 #include <sys/dmu.h>
42 #include <sys/dmu_tx.h>
43 #include <sys/zap.h>
44 #include <sys/zil.h>
45 #include <sys/vdev_impl.h>
46 #include <sys/metaslab.h>
47 #include <sys/uberblock_impl.h>
48 #include <sys/txg.h>
49 #include <sys/avl.h>
50 #include <sys/dmu_traverse.h>
51 #include <sys/dmu_objset.h>
52 #include <sys/unique.h>
53 #include <sys/dsl_pool.h>
54 #include <sys/dsl_dataset.h>
55 #include <sys/dsl_dir.h>
56 #include <sys/dsl_prop.h>
57 #include <sys/dsl_synctask.h>
58 #include <sys/fs/zfs.h>
59 #include <sys/arc.h>
60 #include <sys/callb.h>
61 #include <sys/systeminfo.h>
62 #include <sys/sunddi.h>
63 
64 #include "zfs_prop.h"
65 
/*
 * Thread count passed to taskq_create() for every per-ZIO-type issue and
 * interrupt taskq that spa_activate() sets up.
 */
int zio_taskq_threads = 8;

/*
 * Forward declaration of the sync-task callback that spa_prop_set() hands
 * to dsl_sync_task_do().
 */
static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);
69 
70 /*
71  * ==========================================================================
72  * SPA properties routines
73  * ==========================================================================
74  */
75 
76 /*
77  * Add a (source=src, propname=propval) list to an nvlist.
78  */
79 static int
80 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
81     uint64_t intval, zprop_source_t src)
82 {
83 	const char *propname = zpool_prop_to_name(prop);
84 	nvlist_t *propval;
85 	int err = 0;
86 
87 	if (err = nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP))
88 		return (err);
89 
90 	if (err = nvlist_add_uint64(propval, ZPROP_SOURCE, src))
91 		goto out;
92 
93 	if (strval != NULL) {
94 		if (err = nvlist_add_string(propval, ZPROP_VALUE, strval))
95 			goto out;
96 	} else {
97 		if (err = nvlist_add_uint64(propval, ZPROP_VALUE, intval))
98 			goto out;
99 	}
100 
101 	err = nvlist_add_nvlist(nvl, propname, propval);
102 out:
103 	nvlist_free(propval);
104 	return (err);
105 }
106 
107 /*
108  * Get property values from the spa configuration.
109  */
110 static int
111 spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
112 {
113 	uint64_t size = spa_get_space(spa);
114 	uint64_t used = spa_get_alloc(spa);
115 	uint64_t cap, version;
116 	zprop_source_t src = ZPROP_SRC_NONE;
117 	int err;
118 	char *cachefile;
119 	size_t len;
120 
121 	/*
122 	 * readonly properties
123 	 */
124 	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa->spa_name,
125 	    0, src))
126 		return (err);
127 
128 	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src))
129 		return (err);
130 
131 	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src))
132 		return (err);
133 
134 	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL,
135 	    size - used, src))
136 		return (err);
137 
138 	cap = (size == 0) ? 0 : (used * 100 / size);
139 	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src))
140 		return (err);
141 
142 	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL,
143 	    spa_guid(spa), src))
144 		return (err);
145 
146 	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
147 	    spa->spa_root_vdev->vdev_state, src))
148 		return (err);
149 
150 	/*
151 	 * settable properties that are not stored in the pool property object.
152 	 */
153 	version = spa_version(spa);
154 	if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
155 		src = ZPROP_SRC_DEFAULT;
156 	else
157 		src = ZPROP_SRC_LOCAL;
158 	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL,
159 	    version, src))
160 		return (err);
161 
162 	if (spa->spa_root != NULL) {
163 		src = ZPROP_SRC_LOCAL;
164 		if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT,
165 		    spa->spa_root, 0, src))
166 			return (err);
167 	}
168 
169 	if (spa->spa_config_dir != NULL) {
170 		if (strcmp(spa->spa_config_dir, "none") == 0) {
171 			err = spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
172 			    spa->spa_config_dir, 0, ZPROP_SRC_LOCAL);
173 		} else {
174 			len = strlen(spa->spa_config_dir) +
175 			    strlen(spa->spa_config_file) + 2;
176 			cachefile = kmem_alloc(len, KM_SLEEP);
177 			(void) snprintf(cachefile, len, "%s/%s",
178 			    spa->spa_config_dir, spa->spa_config_file);
179 			err = spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
180 			    cachefile, 0, ZPROP_SRC_LOCAL);
181 			kmem_free(cachefile, len);
182 		}
183 
184 		if (err)
185 			return (err);
186 	}
187 
188 	return (0);
189 }
190 
191 /*
192  * Get zpool property values.
193  */
194 int
195 spa_prop_get(spa_t *spa, nvlist_t **nvp)
196 {
197 	zap_cursor_t zc;
198 	zap_attribute_t za;
199 	objset_t *mos = spa->spa_meta_objset;
200 	int err;
201 
202 	if (err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP))
203 		return (err);
204 
205 	/*
206 	 * Get properties from the spa config.
207 	 */
208 	if (err = spa_prop_get_config(spa, nvp))
209 		goto out;
210 
211 	mutex_enter(&spa->spa_props_lock);
212 	/* If no pool property object, no more prop to get. */
213 	if (spa->spa_pool_props_object == 0) {
214 		mutex_exit(&spa->spa_props_lock);
215 		return (0);
216 	}
217 
218 	/*
219 	 * Get properties from the MOS pool property object.
220 	 */
221 	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
222 	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
223 	    zap_cursor_advance(&zc)) {
224 		uint64_t intval = 0;
225 		char *strval = NULL;
226 		zprop_source_t src = ZPROP_SRC_DEFAULT;
227 		zpool_prop_t prop;
228 
229 		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
230 			continue;
231 
232 		switch (za.za_integer_length) {
233 		case 8:
234 			/* integer property */
235 			if (za.za_first_integer !=
236 			    zpool_prop_default_numeric(prop))
237 				src = ZPROP_SRC_LOCAL;
238 
239 			if (prop == ZPOOL_PROP_BOOTFS) {
240 				dsl_pool_t *dp;
241 				dsl_dataset_t *ds = NULL;
242 
243 				dp = spa_get_dsl(spa);
244 				rw_enter(&dp->dp_config_rwlock, RW_READER);
245 				if (err = dsl_dataset_open_obj(dp,
246 				    za.za_first_integer, NULL, DS_MODE_NONE,
247 				    FTAG, &ds)) {
248 					rw_exit(&dp->dp_config_rwlock);
249 					break;
250 				}
251 
252 				strval = kmem_alloc(
253 				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
254 				    KM_SLEEP);
255 				dsl_dataset_name(ds, strval);
256 				dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
257 				rw_exit(&dp->dp_config_rwlock);
258 			} else {
259 				strval = NULL;
260 				intval = za.za_first_integer;
261 			}
262 
263 			err = spa_prop_add_list(*nvp, prop, strval,
264 			    intval, src);
265 
266 			if (strval != NULL)
267 				kmem_free(strval,
268 				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);
269 
270 			break;
271 
272 		case 1:
273 			/* string property */
274 			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
275 			err = zap_lookup(mos, spa->spa_pool_props_object,
276 			    za.za_name, 1, za.za_num_integers, strval);
277 			if (err) {
278 				kmem_free(strval, za.za_num_integers);
279 				break;
280 			}
281 			err = spa_prop_add_list(*nvp, prop, strval, 0, src);
282 			kmem_free(strval, za.za_num_integers);
283 			break;
284 
285 		default:
286 			break;
287 		}
288 	}
289 	zap_cursor_fini(&zc);
290 	mutex_exit(&spa->spa_props_lock);
291 out:
292 	if (err && err != ENOENT) {
293 		nvlist_free(*nvp);
294 		return (err);
295 	}
296 
297 	return (0);
298 }
299 
300 /*
301  * Validate the given pool properties nvlist and modify the list
302  * for the property values to be set.
303  */
304 static int
305 spa_prop_validate(spa_t *spa, nvlist_t *props)
306 {
307 	nvpair_t *elem;
308 	int error = 0, reset_bootfs = 0;
309 	uint64_t objnum;
310 
311 	elem = NULL;
312 	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
313 		zpool_prop_t prop;
314 		char *propname, *strval;
315 		uint64_t intval;
316 		vdev_t *rvdev;
317 		char *vdev_type;
318 		objset_t *os;
319 		char *slash;
320 
321 		propname = nvpair_name(elem);
322 
323 		if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL)
324 			return (EINVAL);
325 
326 		switch (prop) {
327 		case ZPOOL_PROP_VERSION:
328 			error = nvpair_value_uint64(elem, &intval);
329 			if (!error &&
330 			    (intval < spa_version(spa) || intval > SPA_VERSION))
331 				error = EINVAL;
332 			break;
333 
334 		case ZPOOL_PROP_DELEGATION:
335 		case ZPOOL_PROP_AUTOREPLACE:
336 			error = nvpair_value_uint64(elem, &intval);
337 			if (!error && intval > 1)
338 				error = EINVAL;
339 			break;
340 
341 		case ZPOOL_PROP_BOOTFS:
342 			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
343 				error = ENOTSUP;
344 				break;
345 			}
346 
347 			/*
348 			 * A bootable filesystem can not be on a RAIDZ pool
349 			 * nor a striped pool with more than 1 device.
350 			 */
351 			rvdev = spa->spa_root_vdev;
352 			vdev_type =
353 			    rvdev->vdev_child[0]->vdev_ops->vdev_op_type;
354 			if (rvdev->vdev_children > 1 ||
355 			    strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 ||
356 			    strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) {
357 				error = ENOTSUP;
358 				break;
359 			}
360 
361 			reset_bootfs = 1;
362 
363 			error = nvpair_value_string(elem, &strval);
364 
365 			if (!error) {
366 				if (strval == NULL || strval[0] == '\0') {
367 					objnum = zpool_prop_default_numeric(
368 					    ZPOOL_PROP_BOOTFS);
369 					break;
370 				}
371 
372 				if (error = dmu_objset_open(strval, DMU_OST_ZFS,
373 				    DS_MODE_STANDARD | DS_MODE_READONLY, &os))
374 					break;
375 				objnum = dmu_objset_id(os);
376 				dmu_objset_close(os);
377 			}
378 			break;
379 		case ZPOOL_PROP_FAILUREMODE:
380 			error = nvpair_value_uint64(elem, &intval);
381 			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
382 			    intval > ZIO_FAILURE_MODE_PANIC))
383 				error = EINVAL;
384 
385 			/*
386 			 * This is a special case which only occurs when
387 			 * the pool has completely failed. This allows
388 			 * the user to change the in-core failmode property
389 			 * without syncing it out to disk (I/Os might
390 			 * currently be blocked). We do this by returning
391 			 * EIO to the caller (spa_prop_set) to trick it
392 			 * into thinking we encountered a property validation
393 			 * error.
394 			 */
395 			if (!error && spa_state(spa) == POOL_STATE_IO_FAILURE) {
396 				spa->spa_failmode = intval;
397 				error = EIO;
398 			}
399 			break;
400 
401 		case ZPOOL_PROP_CACHEFILE:
402 			if ((error = nvpair_value_string(elem, &strval)) != 0)
403 				break;
404 
405 			if (strval[0] == '\0')
406 				break;
407 
408 			if (strcmp(strval, "none") == 0)
409 				break;
410 
411 			if (strval[0] != '/') {
412 				error = EINVAL;
413 				break;
414 			}
415 
416 			slash = strrchr(strval, '/');
417 			ASSERT(slash != NULL);
418 
419 			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
420 			    strcmp(slash, "/..") == 0)
421 				error = EINVAL;
422 			break;
423 		}
424 
425 		if (error)
426 			break;
427 	}
428 
429 	if (!error && reset_bootfs) {
430 		error = nvlist_remove(props,
431 		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
432 
433 		if (!error) {
434 			error = nvlist_add_uint64(props,
435 			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
436 		}
437 	}
438 
439 	return (error);
440 }
441 
442 int
443 spa_prop_set(spa_t *spa, nvlist_t *nvp)
444 {
445 	int error;
446 
447 	if ((error = spa_prop_validate(spa, nvp)) != 0)
448 		return (error);
449 
450 	return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
451 	    spa, nvp, 3));
452 }
453 
454 /*
455  * If the bootfs property value is dsobj, clear it.
456  */
457 void
458 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
459 {
460 	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
461 		VERIFY(zap_remove(spa->spa_meta_objset,
462 		    spa->spa_pool_props_object,
463 		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
464 		spa->spa_bootfs = 0;
465 	}
466 }
467 
468 /*
469  * ==========================================================================
470  * SPA state manipulation (open/create/destroy/import/export)
471  * ==========================================================================
472  */
473 
474 static int
475 spa_error_entry_compare(const void *a, const void *b)
476 {
477 	spa_error_entry_t *sa = (spa_error_entry_t *)a;
478 	spa_error_entry_t *sb = (spa_error_entry_t *)b;
479 	int ret;
480 
481 	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
482 	    sizeof (zbookmark_t));
483 
484 	if (ret < 0)
485 		return (-1);
486 	else if (ret > 0)
487 		return (1);
488 	else
489 		return (0);
490 }
491 
492 /*
493  * Utility function which retrieves copies of the current logs and
494  * re-initializes them in the process.
495  */
496 void
497 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
498 {
499 	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
500 
501 	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
502 	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
503 
504 	avl_create(&spa->spa_errlist_scrub,
505 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
506 	    offsetof(spa_error_entry_t, se_avl));
507 	avl_create(&spa->spa_errlist_last,
508 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
509 	    offsetof(spa_error_entry_t, se_avl));
510 }
511 
512 /*
513  * Activate an uninitialized pool.
514  */
515 static void
516 spa_activate(spa_t *spa)
517 {
518 	int t;
519 
520 	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
521 
522 	spa->spa_state = POOL_STATE_ACTIVE;
523 
524 	spa->spa_normal_class = metaslab_class_create();
525 	spa->spa_log_class = metaslab_class_create();
526 
527 	for (t = 0; t < ZIO_TYPES; t++) {
528 		spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
529 		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
530 		    TASKQ_PREPOPULATE);
531 		spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
532 		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
533 		    TASKQ_PREPOPULATE);
534 	}
535 
536 	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
537 	    offsetof(vdev_t, vdev_dirty_node));
538 	list_create(&spa->spa_zio_list, sizeof (zio_t),
539 	    offsetof(zio_t, zio_link_node));
540 
541 	txg_list_create(&spa->spa_vdev_txg_list,
542 	    offsetof(struct vdev, vdev_txg_node));
543 
544 	avl_create(&spa->spa_errlist_scrub,
545 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
546 	    offsetof(spa_error_entry_t, se_avl));
547 	avl_create(&spa->spa_errlist_last,
548 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
549 	    offsetof(spa_error_entry_t, se_avl));
550 }
551 
/*
 * Opposite of spa_activate().
 *
 * Destroys the vdev txg list, the dirty-vdev and zio lists, every per-ZIO-type
 * issue/interrupt taskq, both metaslab classes and the two error-list AVL
 * trees, then returns the pool to POOL_STATE_UNINITIALIZED.  On entry the
 * pool must no longer be syncing and must have no DSL pool or root vdev
 * attached; this is asserted.
 */
static void
spa_deactivate(spa_t *spa)
{
	int t;

	/* Preconditions: syncing stopped, DSL pool and vdev tree gone. */
	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);

	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_dirty_list);
	list_destroy(&spa->spa_zio_list);

	/* Tear down the taskqs and clear the stale handles. */
	for (t = 0; t < ZIO_TYPES; t++) {
		taskq_destroy(spa->spa_zio_issue_taskq[t]);
		taskq_destroy(spa->spa_zio_intr_taskq[t]);
		spa->spa_zio_issue_taskq[t] = NULL;
		spa->spa_zio_intr_taskq[t] = NULL;
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	/* The trees are destroyed only after the drain above. */
	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;
}
595 
596 /*
597  * Verify a pool configuration, and construct the vdev tree appropriately.  This
598  * will create all the necessary vdevs in the appropriate layout, with each vdev
599  * in the CLOSED state.  This will prep the pool before open/creation/import.
600  * All vdev validation is done by the vdev_alloc() routine.
601  */
602 static int
603 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
604     uint_t id, int atype)
605 {
606 	nvlist_t **child;
607 	uint_t c, children;
608 	int error;
609 
610 	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
611 		return (error);
612 
613 	if ((*vdp)->vdev_ops->vdev_op_leaf)
614 		return (0);
615 
616 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
617 	    &child, &children) != 0) {
618 		vdev_free(*vdp);
619 		*vdp = NULL;
620 		return (EINVAL);
621 	}
622 
623 	for (c = 0; c < children; c++) {
624 		vdev_t *vd;
625 		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
626 		    atype)) != 0) {
627 			vdev_free(*vdp);
628 			*vdp = NULL;
629 			return (error);
630 		}
631 	}
632 
633 	ASSERT(*vdp != NULL);
634 
635 	return (0);
636 }
637 
638 /*
639  * Opposite of spa_load().
640  */
641 static void
642 spa_unload(spa_t *spa)
643 {
644 	int i;
645 
646 	/*
647 	 * Stop async tasks.
648 	 */
649 	spa_async_suspend(spa);
650 
651 	/*
652 	 * Stop syncing.
653 	 */
654 	if (spa->spa_sync_on) {
655 		txg_sync_stop(spa->spa_dsl_pool);
656 		spa->spa_sync_on = B_FALSE;
657 	}
658 
659 	/*
660 	 * Wait for any outstanding prefetch I/O to complete.
661 	 */
662 	spa_config_enter(spa, RW_WRITER, FTAG);
663 	spa_config_exit(spa, FTAG);
664 
665 	/*
666 	 * Drop and purge level 2 cache
667 	 */
668 	spa_l2cache_drop(spa);
669 
670 	/*
671 	 * Close the dsl pool.
672 	 */
673 	if (spa->spa_dsl_pool) {
674 		dsl_pool_close(spa->spa_dsl_pool);
675 		spa->spa_dsl_pool = NULL;
676 	}
677 
678 	/*
679 	 * Close all vdevs.
680 	 */
681 	if (spa->spa_root_vdev)
682 		vdev_free(spa->spa_root_vdev);
683 	ASSERT(spa->spa_root_vdev == NULL);
684 
685 	for (i = 0; i < spa->spa_spares.sav_count; i++)
686 		vdev_free(spa->spa_spares.sav_vdevs[i]);
687 	if (spa->spa_spares.sav_vdevs) {
688 		kmem_free(spa->spa_spares.sav_vdevs,
689 		    spa->spa_spares.sav_count * sizeof (void *));
690 		spa->spa_spares.sav_vdevs = NULL;
691 	}
692 	if (spa->spa_spares.sav_config) {
693 		nvlist_free(spa->spa_spares.sav_config);
694 		spa->spa_spares.sav_config = NULL;
695 	}
696 
697 	for (i = 0; i < spa->spa_l2cache.sav_count; i++)
698 		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
699 	if (spa->spa_l2cache.sav_vdevs) {
700 		kmem_free(spa->spa_l2cache.sav_vdevs,
701 		    spa->spa_l2cache.sav_count * sizeof (void *));
702 		spa->spa_l2cache.sav_vdevs = NULL;
703 	}
704 	if (spa->spa_l2cache.sav_config) {
705 		nvlist_free(spa->spa_l2cache.sav_config);
706 		spa->spa_l2cache.sav_config = NULL;
707 	}
708 
709 	spa->spa_async_suspended = 0;
710 }
711 
712 /*
713  * Load (or re-load) the current list of vdevs describing the active spares for
714  * this pool.  When this is called, we have some form of basic information in
715  * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
716  * then re-generate a more complete list including status information.
717  */
718 static void
719 spa_load_spares(spa_t *spa)
720 {
721 	nvlist_t **spares;
722 	uint_t nspares;
723 	int i;
724 	vdev_t *vd, *tvd;
725 
726 	/*
727 	 * First, close and free any existing spare vdevs.
728 	 */
729 	for (i = 0; i < spa->spa_spares.sav_count; i++) {
730 		vd = spa->spa_spares.sav_vdevs[i];
731 
732 		/* Undo the call to spa_activate() below */
733 		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL &&
734 		    tvd->vdev_isspare)
735 			spa_spare_remove(tvd);
736 		vdev_close(vd);
737 		vdev_free(vd);
738 	}
739 
740 	if (spa->spa_spares.sav_vdevs)
741 		kmem_free(spa->spa_spares.sav_vdevs,
742 		    spa->spa_spares.sav_count * sizeof (void *));
743 
744 	if (spa->spa_spares.sav_config == NULL)
745 		nspares = 0;
746 	else
747 		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
748 		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
749 
750 	spa->spa_spares.sav_count = (int)nspares;
751 	spa->spa_spares.sav_vdevs = NULL;
752 
753 	if (nspares == 0)
754 		return;
755 
756 	/*
757 	 * Construct the array of vdevs, opening them to get status in the
758 	 * process.   For each spare, there is potentially two different vdev_t
759 	 * structures associated with it: one in the list of spares (used only
760 	 * for basic validation purposes) and one in the active vdev
761 	 * configuration (if it's spared in).  During this phase we open and
762 	 * validate each vdev on the spare list.  If the vdev also exists in the
763 	 * active configuration, then we also mark this vdev as an active spare.
764 	 */
765 	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
766 	    KM_SLEEP);
767 	for (i = 0; i < spa->spa_spares.sav_count; i++) {
768 		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
769 		    VDEV_ALLOC_SPARE) == 0);
770 		ASSERT(vd != NULL);
771 
772 		spa->spa_spares.sav_vdevs[i] = vd;
773 
774 		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) {
775 			if (!tvd->vdev_isspare)
776 				spa_spare_add(tvd);
777 
778 			/*
779 			 * We only mark the spare active if we were successfully
780 			 * able to load the vdev.  Otherwise, importing a pool
781 			 * with a bad active spare would result in strange
782 			 * behavior, because multiple pool would think the spare
783 			 * is actively in use.
784 			 *
785 			 * There is a vulnerability here to an equally bizarre
786 			 * circumstance, where a dead active spare is later
787 			 * brought back to life (onlined or otherwise).  Given
788 			 * the rarity of this scenario, and the extra complexity
789 			 * it adds, we ignore the possibility.
790 			 */
791 			if (!vdev_is_dead(tvd))
792 				spa_spare_activate(tvd);
793 		}
794 
795 		if (vdev_open(vd) != 0)
796 			continue;
797 
798 		vd->vdev_top = vd;
799 		if (vdev_validate_aux(vd) == 0)
800 			spa_spare_add(vd);
801 	}
802 
803 	/*
804 	 * Recompute the stashed list of spares, with status information
805 	 * this time.
806 	 */
807 	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
808 	    DATA_TYPE_NVLIST_ARRAY) == 0);
809 
810 	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
811 	    KM_SLEEP);
812 	for (i = 0; i < spa->spa_spares.sav_count; i++)
813 		spares[i] = vdev_config_generate(spa,
814 		    spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE);
815 	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
816 	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
817 	for (i = 0; i < spa->spa_spares.sav_count; i++)
818 		nvlist_free(spares[i]);
819 	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
820 }
821 
/*
 * Load (or re-load) the current list of vdevs describing the active l2cache for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 *
 * Old and new device lists are matched by vdev guid: a device present in both
 * keeps its existing vdev_t, a device only in the new list gets a freshly
 * parsed vdev, and a device only in the old list is closed and dropped from
 * the l2cache bookkeeping.
 */
static void
spa_load_l2cache(spa_t *spa)
{
	nvlist_t **l2cache;
	uint_t nl2cache;
	int i, j, oldnvdevs;
	uint64_t guid;
	vdev_t *vd, **oldvdevs, **newvdevs;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	/*
	 * Without a config there are no new devices; 'l2cache' and 'newvdevs'
	 * stay unset in that case and are not read, because every later use is
	 * bounded by nl2cache or by sav->sav_count, both of which are zero.
	 */
	if (sav->sav_config != NULL) {
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
	} else {
		nl2cache = 0;
	}

	/* Detach the old array so it can be compared against the new list. */
	oldvdevs = sav->sav_vdevs;
	oldnvdevs = sav->sav_count;
	sav->sav_vdevs = NULL;
	sav->sav_count = 0;

	/*
	 * Process new nvlist of vdevs.
	 */
	for (i = 0; i < nl2cache; i++) {
		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
		    &guid) == 0);

		newvdevs[i] = NULL;
		for (j = 0; j < oldnvdevs; j++) {
			vd = oldvdevs[j];
			if (vd != NULL && guid == vd->vdev_guid) {
				/*
				 * Retain previous vdev for add/remove ops.
				 * Clearing the old slot keeps it out of the
				 * purge loop below.
				 */
				newvdevs[i] = vd;
				oldvdevs[j] = NULL;
				break;
			}
		}

		if (newvdevs[i] == NULL) {
			/*
			 * Create new vdev
			 */
			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
			    VDEV_ALLOC_L2CACHE) == 0);
			ASSERT(vd != NULL);
			newvdevs[i] = vd;

			/*
			 * Commit this vdev as an l2cache device,
			 * even if it fails to open.
			 */
			spa_l2cache_add(vd);

			if (vdev_open(vd) != 0)
				continue;

			vd->vdev_top = vd;
			(void) vdev_validate_aux(vd);

			/*
			 * A healthy device is activated; it is handed to the
			 * L2ARC only when the pool is open for writing, with
			 * the first VDEV_LABEL_START_SIZE bytes excluded.
			 */
			if (!vdev_is_dead(vd)) {
				uint64_t size;
				size = vdev_get_rsize(vd);
				ASSERT3U(size, >, 0);
				if (spa_mode & FWRITE) {
					l2arc_add_vdev(spa, vd,
					    VDEV_LABEL_START_SIZE,
					    size - VDEV_LABEL_START_SIZE);
				}
				spa_l2cache_activate(vd);
			}
		}
	}

	/*
	 * Purge vdevs that were dropped
	 */
	for (i = 0; i < oldnvdevs; i++) {
		uint64_t pool;

		vd = oldvdevs[i];
		if (vd != NULL) {
			/*
			 * Detach from the L2ARC only when writable and the
			 * device is recorded with a non-zero pool value.
			 */
			if (spa_mode & FWRITE &&
			    spa_l2cache_exists(vd->vdev_guid, &pool) &&
			    pool != 0ULL) {
				l2arc_remove_vdev(vd);
			}
			(void) vdev_close(vd);
			spa_l2cache_remove(vd);
		}
	}

	if (oldvdevs)
		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));

	/* sav_count is still zero here, so the cleanup at 'out' is a no-op. */
	if (sav->sav_config == NULL)
		goto out;

	sav->sav_vdevs = newvdevs;
	sav->sav_count = (int)nl2cache;

	/*
	 * Recompute the stashed list of l2cache devices, with status
	 * information this time.
	 */
	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	/* From here on 'l2cache' is a temporary array owned by this function. */
	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
	for (i = 0; i < sav->sav_count; i++)
		l2cache[i] = vdev_config_generate(spa,
		    sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE);
	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
out:
	/* Free the temporary per-device nvlists and the array holding them. */
	for (i = 0; i < sav->sav_count; i++)
		nvlist_free(l2cache[i]);
	if (sav->sav_count)
		kmem_free(l2cache, sav->sav_count * sizeof (void *));
}
954 
955 static int
956 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
957 {
958 	dmu_buf_t *db;
959 	char *packed = NULL;
960 	size_t nvsize = 0;
961 	int error;
962 	*value = NULL;
963 
964 	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
965 	nvsize = *(uint64_t *)db->db_data;
966 	dmu_buf_rele(db, FTAG);
967 
968 	packed = kmem_alloc(nvsize, KM_SLEEP);
969 	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed);
970 	if (error == 0)
971 		error = nvlist_unpack(packed, nvsize, value, 0);
972 	kmem_free(packed, nvsize);
973 
974 	return (error);
975 }
976 
977 /*
978  * Checks to see if the given vdev could not be opened, in which case we post a
979  * sysevent to notify the autoreplace code that the device has been removed.
980  */
981 static void
982 spa_check_removed(vdev_t *vd)
983 {
984 	int c;
985 
986 	for (c = 0; c < vd->vdev_children; c++)
987 		spa_check_removed(vd->vdev_child[c]);
988 
989 	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
990 		zfs_post_autoreplace(vd->vdev_spa, vd);
991 		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
992 	}
993 }
994 
995 /*
996  * Load an existing storage pool, using the pool's builtin spa_config as a
997  * source of configuration information.
998  */
999 static int
1000 spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
1001 {
1002 	int error = 0;
1003 	nvlist_t *nvroot = NULL;
1004 	vdev_t *rvd;
1005 	uberblock_t *ub = &spa->spa_uberblock;
1006 	uint64_t config_cache_txg = spa->spa_config_txg;
1007 	uint64_t pool_guid;
1008 	uint64_t version;
1009 	zio_t *zio;
1010 	uint64_t autoreplace = 0;
1011 
1012 	spa->spa_load_state = state;
1013 
1014 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
1015 	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
1016 		error = EINVAL;
1017 		goto out;
1018 	}
1019 
1020 	/*
1021 	 * Versioning wasn't explicitly added to the label until later, so if
1022 	 * it's not present treat it as the initial version.
1023 	 */
1024 	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
1025 		version = SPA_VERSION_INITIAL;
1026 
1027 	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
1028 	    &spa->spa_config_txg);
1029 
1030 	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
1031 	    spa_guid_exists(pool_guid, 0)) {
1032 		error = EEXIST;
1033 		goto out;
1034 	}
1035 
1036 	spa->spa_load_guid = pool_guid;
1037 
1038 	/*
1039 	 * Parse the configuration into a vdev tree.  We explicitly set the
1040 	 * value that will be returned by spa_version() since parsing the
1041 	 * configuration requires knowing the version number.
1042 	 */
1043 	spa_config_enter(spa, RW_WRITER, FTAG);
1044 	spa->spa_ubsync.ub_version = version;
1045 	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
1046 	spa_config_exit(spa, FTAG);
1047 
1048 	if (error != 0)
1049 		goto out;
1050 
1051 	ASSERT(spa->spa_root_vdev == rvd);
1052 	ASSERT(spa_guid(spa) == pool_guid);
1053 
1054 	/*
1055 	 * Try to open all vdevs, loading each label in the process.
1056 	 */
1057 	error = vdev_open(rvd);
1058 	if (error != 0)
1059 		goto out;
1060 
1061 	/*
1062 	 * Validate the labels for all leaf vdevs.  We need to grab the config
1063 	 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD
1064 	 * flag.
1065 	 */
1066 	spa_config_enter(spa, RW_READER, FTAG);
1067 	error = vdev_validate(rvd);
1068 	spa_config_exit(spa, FTAG);
1069 
1070 	if (error != 0)
1071 		goto out;
1072 
1073 	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
1074 		error = ENXIO;
1075 		goto out;
1076 	}
1077 
1078 	/*
1079 	 * Find the best uberblock.
1080 	 */
1081 	bzero(ub, sizeof (uberblock_t));
1082 
1083 	zio = zio_root(spa, NULL, NULL,
1084 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
1085 	vdev_uberblock_load(zio, rvd, ub);
1086 	error = zio_wait(zio);
1087 
1088 	/*
1089 	 * If we weren't able to find a single valid uberblock, return failure.
1090 	 */
1091 	if (ub->ub_txg == 0) {
1092 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1093 		    VDEV_AUX_CORRUPT_DATA);
1094 		error = ENXIO;
1095 		goto out;
1096 	}
1097 
1098 	/*
1099 	 * If the pool is newer than the code, we can't open it.
1100 	 */
1101 	if (ub->ub_version > SPA_VERSION) {
1102 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1103 		    VDEV_AUX_VERSION_NEWER);
1104 		error = ENOTSUP;
1105 		goto out;
1106 	}
1107 
1108 	/*
1109 	 * If the vdev guid sum doesn't match the uberblock, we have an
1110 	 * incomplete configuration.
1111 	 */
1112 	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
1113 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1114 		    VDEV_AUX_BAD_GUID_SUM);
1115 		error = ENXIO;
1116 		goto out;
1117 	}
1118 
1119 	/*
1120 	 * Initialize internal SPA structures.
1121 	 */
1122 	spa->spa_state = POOL_STATE_ACTIVE;
1123 	spa->spa_ubsync = spa->spa_uberblock;
1124 	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
1125 	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
1126 	if (error) {
1127 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1128 		    VDEV_AUX_CORRUPT_DATA);
1129 		goto out;
1130 	}
1131 	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
1132 
1133 	if (zap_lookup(spa->spa_meta_objset,
1134 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
1135 	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
1136 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1137 		    VDEV_AUX_CORRUPT_DATA);
1138 		error = EIO;
1139 		goto out;
1140 	}
1141 
1142 	if (!mosconfig) {
1143 		nvlist_t *newconfig;
1144 		uint64_t hostid;
1145 
1146 		if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
1147 			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1148 			    VDEV_AUX_CORRUPT_DATA);
1149 			error = EIO;
1150 			goto out;
1151 		}
1152 
1153 		if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID,
1154 		    &hostid) == 0) {
1155 			char *hostname;
1156 			unsigned long myhostid = 0;
1157 
1158 			VERIFY(nvlist_lookup_string(newconfig,
1159 			    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
1160 
1161 			(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
1162 			if (hostid != 0 && myhostid != 0 &&
1163 			    (unsigned long)hostid != myhostid) {
1164 				cmn_err(CE_WARN, "pool '%s' could not be "
1165 				    "loaded as it was last accessed by "
1166 				    "another system (host: %s hostid: 0x%lx).  "
1167 				    "See: http://www.sun.com/msg/ZFS-8000-EY",
1168 				    spa->spa_name, hostname,
1169 				    (unsigned long)hostid);
1170 				error = EBADF;
1171 				goto out;
1172 			}
1173 		}
1174 
1175 		spa_config_set(spa, newconfig);
1176 		spa_unload(spa);
1177 		spa_deactivate(spa);
1178 		spa_activate(spa);
1179 
1180 		return (spa_load(spa, newconfig, state, B_TRUE));
1181 	}
1182 
1183 	if (zap_lookup(spa->spa_meta_objset,
1184 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
1185 	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
1186 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1187 		    VDEV_AUX_CORRUPT_DATA);
1188 		error = EIO;
1189 		goto out;
1190 	}
1191 
1192 	/*
1193 	 * Load the bit that tells us to use the new accounting function
1194 	 * (raid-z deflation).  If we have an older pool, this will not
1195 	 * be present.
1196 	 */
1197 	error = zap_lookup(spa->spa_meta_objset,
1198 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
1199 	    sizeof (uint64_t), 1, &spa->spa_deflate);
1200 	if (error != 0 && error != ENOENT) {
1201 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1202 		    VDEV_AUX_CORRUPT_DATA);
1203 		error = EIO;
1204 		goto out;
1205 	}
1206 
1207 	/*
1208 	 * Load the persistent error log.  If we have an older pool, this will
1209 	 * not be present.
1210 	 */
1211 	error = zap_lookup(spa->spa_meta_objset,
1212 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
1213 	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
1214 	if (error != 0 && error != ENOENT) {
1215 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1216 		    VDEV_AUX_CORRUPT_DATA);
1217 		error = EIO;
1218 		goto out;
1219 	}
1220 
1221 	error = zap_lookup(spa->spa_meta_objset,
1222 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
1223 	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
1224 	if (error != 0 && error != ENOENT) {
1225 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1226 		    VDEV_AUX_CORRUPT_DATA);
1227 		error = EIO;
1228 		goto out;
1229 	}
1230 
1231 	/*
1232 	 * Load the history object.  If we have an older pool, this
1233 	 * will not be present.
1234 	 */
1235 	error = zap_lookup(spa->spa_meta_objset,
1236 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
1237 	    sizeof (uint64_t), 1, &spa->spa_history);
1238 	if (error != 0 && error != ENOENT) {
1239 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1240 		    VDEV_AUX_CORRUPT_DATA);
1241 		error = EIO;
1242 		goto out;
1243 	}
1244 
1245 	/*
1246 	 * Load any hot spares for this pool.
1247 	 */
1248 	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1249 	    DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares.sav_object);
1250 	if (error != 0 && error != ENOENT) {
1251 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1252 		    VDEV_AUX_CORRUPT_DATA);
1253 		error = EIO;
1254 		goto out;
1255 	}
1256 	if (error == 0) {
1257 		ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
1258 		if (load_nvlist(spa, spa->spa_spares.sav_object,
1259 		    &spa->spa_spares.sav_config) != 0) {
1260 			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1261 			    VDEV_AUX_CORRUPT_DATA);
1262 			error = EIO;
1263 			goto out;
1264 		}
1265 
1266 		spa_config_enter(spa, RW_WRITER, FTAG);
1267 		spa_load_spares(spa);
1268 		spa_config_exit(spa, FTAG);
1269 	}
1270 
1271 	/*
1272 	 * Load any level 2 ARC devices for this pool.
1273 	 */
1274 	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1275 	    DMU_POOL_L2CACHE, sizeof (uint64_t), 1,
1276 	    &spa->spa_l2cache.sav_object);
1277 	if (error != 0 && error != ENOENT) {
1278 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1279 		    VDEV_AUX_CORRUPT_DATA);
1280 		error = EIO;
1281 		goto out;
1282 	}
1283 	if (error == 0) {
1284 		ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
1285 		if (load_nvlist(spa, spa->spa_l2cache.sav_object,
1286 		    &spa->spa_l2cache.sav_config) != 0) {
1287 			vdev_set_state(rvd, B_TRUE,
1288 			    VDEV_STATE_CANT_OPEN,
1289 			    VDEV_AUX_CORRUPT_DATA);
1290 			error = EIO;
1291 			goto out;
1292 		}
1293 
1294 		spa_config_enter(spa, RW_WRITER, FTAG);
1295 		spa_load_l2cache(spa);
1296 		spa_config_exit(spa, FTAG);
1297 	}
1298 
1299 	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
1300 
1301 	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1302 	    DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object);
1303 
1304 	if (error && error != ENOENT) {
1305 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1306 		    VDEV_AUX_CORRUPT_DATA);
1307 		error = EIO;
1308 		goto out;
1309 	}
1310 
1311 	if (error == 0) {
1312 		(void) zap_lookup(spa->spa_meta_objset,
1313 		    spa->spa_pool_props_object,
1314 		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS),
1315 		    sizeof (uint64_t), 1, &spa->spa_bootfs);
1316 		(void) zap_lookup(spa->spa_meta_objset,
1317 		    spa->spa_pool_props_object,
1318 		    zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE),
1319 		    sizeof (uint64_t), 1, &autoreplace);
1320 		(void) zap_lookup(spa->spa_meta_objset,
1321 		    spa->spa_pool_props_object,
1322 		    zpool_prop_to_name(ZPOOL_PROP_DELEGATION),
1323 		    sizeof (uint64_t), 1, &spa->spa_delegation);
1324 		(void) zap_lookup(spa->spa_meta_objset,
1325 		    spa->spa_pool_props_object,
1326 		    zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE),
1327 		    sizeof (uint64_t), 1, &spa->spa_failmode);
1328 	}
1329 
1330 	/*
1331 	 * If the 'autoreplace' property is set, then post a resource notifying
1332 	 * the ZFS DE that it should not issue any faults for unopenable
1333 	 * devices.  We also iterate over the vdevs, and post a sysevent for any
1334 	 * unopenable vdevs so that the normal autoreplace handler can take
1335 	 * over.
1336 	 */
1337 	if (autoreplace)
1338 		spa_check_removed(spa->spa_root_vdev);
1339 
1340 	/*
1341 	 * Load the vdev state for all toplevel vdevs.
1342 	 */
1343 	vdev_load(rvd);
1344 
1345 	/*
1346 	 * Propagate the leaf DTLs we just loaded all the way up the tree.
1347 	 */
1348 	spa_config_enter(spa, RW_WRITER, FTAG);
1349 	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
1350 	spa_config_exit(spa, FTAG);
1351 
1352 	/*
1353 	 * Check the state of the root vdev.  If it can't be opened, it
1354 	 * indicates one or more toplevel vdevs are faulted.
1355 	 */
1356 	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
1357 		error = ENXIO;
1358 		goto out;
1359 	}
1360 
1361 	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
1362 		dmu_tx_t *tx;
1363 		int need_update = B_FALSE;
1364 		int c;
1365 
1366 		/*
1367 		 * Claim log blocks that haven't been committed yet.
1368 		 * This must all happen in a single txg.
1369 		 */
1370 		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
1371 		    spa_first_txg(spa));
1372 		(void) dmu_objset_find(spa->spa_name,
1373 		    zil_claim, tx, DS_FIND_CHILDREN);
1374 		dmu_tx_commit(tx);
1375 
1376 		spa->spa_sync_on = B_TRUE;
1377 		txg_sync_start(spa->spa_dsl_pool);
1378 
1379 		/*
1380 		 * Wait for all claims to sync.
1381 		 */
1382 		txg_wait_synced(spa->spa_dsl_pool, 0);
1383 
1384 		/*
1385 		 * If the config cache is stale, or we have uninitialized
1386 		 * metaslabs (see spa_vdev_add()), then update the config.
1387 		 */
1388 		if (config_cache_txg != spa->spa_config_txg ||
1389 		    state == SPA_LOAD_IMPORT)
1390 			need_update = B_TRUE;
1391 
1392 		for (c = 0; c < rvd->vdev_children; c++)
1393 			if (rvd->vdev_child[c]->vdev_ms_array == 0)
1394 				need_update = B_TRUE;
1395 
1396 		/*
1397 		 * Update the config cache asychronously in case we're the
1398 		 * root pool, in which case the config cache isn't writable yet.
1399 		 */
1400 		if (need_update)
1401 			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
1402 	}
1403 
1404 	error = 0;
1405 out:
1406 	if (error && error != EBADF)
1407 		zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
1408 	spa->spa_load_state = SPA_LOAD_NONE;
1409 	spa->spa_ena = 0;
1410 
1411 	return (error);
1412 }
1413 
1414 /*
1415  * Pool Open/Import
1416  *
1417  * The import case is identical to an open except that the configuration is sent
1418  * down from userland, instead of grabbed from the configuration cache.  For the
1419  * case of an open, the pool configuration will exist in the
1420  * POOL_STATE_UNINITIALIZED state.
1421  *
1422  * The stats information (gen/count/ustats) is used to gather vdev statistics at
1423  * the same time open the pool, without having to keep around the spa_t in some
1424  * ambiguous state.
1425  */
1426 static int
1427 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
1428 {
1429 	spa_t *spa;
1430 	int error;
1431 	int loaded = B_FALSE;
1432 	int locked = B_FALSE;
1433 
1434 	*spapp = NULL;
1435 
1436 	/*
1437 	 * As disgusting as this is, we need to support recursive calls to this
1438 	 * function because dsl_dir_open() is called during spa_load(), and ends
1439 	 * up calling spa_open() again.  The real fix is to figure out how to
1440 	 * avoid dsl_dir_open() calling this in the first place.
1441 	 */
1442 	if (mutex_owner(&spa_namespace_lock) != curthread) {
1443 		mutex_enter(&spa_namespace_lock);
1444 		locked = B_TRUE;
1445 	}
1446 
1447 	if ((spa = spa_lookup(pool)) == NULL) {
1448 		if (locked)
1449 			mutex_exit(&spa_namespace_lock);
1450 		return (ENOENT);
1451 	}
1452 	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
1453 
1454 		spa_activate(spa);
1455 
1456 		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);
1457 
1458 		if (error == EBADF) {
1459 			/*
1460 			 * If vdev_validate() returns failure (indicated by
1461 			 * EBADF), it indicates that one of the vdevs indicates
1462 			 * that the pool has been exported or destroyed.  If
1463 			 * this is the case, the config cache is out of sync and
1464 			 * we should remove the pool from the namespace.
1465 			 */
1466 			zfs_post_ok(spa, NULL);
1467 			spa_unload(spa);
1468 			spa_deactivate(spa);
1469 			spa_remove(spa);
1470 			spa_config_sync();
1471 			if (locked)
1472 				mutex_exit(&spa_namespace_lock);
1473 			return (ENOENT);
1474 		}
1475 
1476 		if (error) {
1477 			/*
1478 			 * We can't open the pool, but we still have useful
1479 			 * information: the state of each vdev after the
1480 			 * attempted vdev_open().  Return this to the user.
1481 			 */
1482 			if (config != NULL && spa->spa_root_vdev != NULL) {
1483 				spa_config_enter(spa, RW_READER, FTAG);
1484 				*config = spa_config_generate(spa, NULL, -1ULL,
1485 				    B_TRUE);
1486 				spa_config_exit(spa, FTAG);
1487 			}
1488 			spa_unload(spa);
1489 			spa_deactivate(spa);
1490 			spa->spa_last_open_failed = B_TRUE;
1491 			if (locked)
1492 				mutex_exit(&spa_namespace_lock);
1493 			*spapp = NULL;
1494 			return (error);
1495 		} else {
1496 			zfs_post_ok(spa, NULL);
1497 			spa->spa_last_open_failed = B_FALSE;
1498 		}
1499 
1500 		loaded = B_TRUE;
1501 	}
1502 
1503 	spa_open_ref(spa, tag);
1504 
1505 	/*
1506 	 * If we just loaded the pool, resilver anything that's out of date.
1507 	 */
1508 	if (loaded && (spa_mode & FWRITE))
1509 		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
1510 
1511 	if (locked)
1512 		mutex_exit(&spa_namespace_lock);
1513 
1514 	*spapp = spa;
1515 
1516 	if (config != NULL) {
1517 		spa_config_enter(spa, RW_READER, FTAG);
1518 		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
1519 		spa_config_exit(spa, FTAG);
1520 	}
1521 
1522 	return (0);
1523 }
1524 
1525 int
1526 spa_open(const char *name, spa_t **spapp, void *tag)
1527 {
1528 	return (spa_open_common(name, spapp, tag, NULL));
1529 }
1530 
1531 /*
1532  * Lookup the given spa_t, incrementing the inject count in the process,
1533  * preventing it from being exported or destroyed.
1534  */
1535 spa_t *
1536 spa_inject_addref(char *name)
1537 {
1538 	spa_t *spa;
1539 
1540 	mutex_enter(&spa_namespace_lock);
1541 	if ((spa = spa_lookup(name)) == NULL) {
1542 		mutex_exit(&spa_namespace_lock);
1543 		return (NULL);
1544 	}
1545 	spa->spa_inject_ref++;
1546 	mutex_exit(&spa_namespace_lock);
1547 
1548 	return (spa);
1549 }
1550 
1551 void
1552 spa_inject_delref(spa_t *spa)
1553 {
1554 	mutex_enter(&spa_namespace_lock);
1555 	spa->spa_inject_ref--;
1556 	mutex_exit(&spa_namespace_lock);
1557 }
1558 
1559 /*
1560  * Add spares device information to the nvlist.
1561  */
1562 static void
1563 spa_add_spares(spa_t *spa, nvlist_t *config)
1564 {
1565 	nvlist_t **spares;
1566 	uint_t i, nspares;
1567 	nvlist_t *nvroot;
1568 	uint64_t guid;
1569 	vdev_stat_t *vs;
1570 	uint_t vsc;
1571 	uint64_t pool;
1572 
1573 	if (spa->spa_spares.sav_count == 0)
1574 		return;
1575 
1576 	VERIFY(nvlist_lookup_nvlist(config,
1577 	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
1578 	VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
1579 	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
1580 	if (nspares != 0) {
1581 		VERIFY(nvlist_add_nvlist_array(nvroot,
1582 		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
1583 		VERIFY(nvlist_lookup_nvlist_array(nvroot,
1584 		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
1585 
1586 		/*
1587 		 * Go through and find any spares which have since been
1588 		 * repurposed as an active spare.  If this is the case, update
1589 		 * their status appropriately.
1590 		 */
1591 		for (i = 0; i < nspares; i++) {
1592 			VERIFY(nvlist_lookup_uint64(spares[i],
1593 			    ZPOOL_CONFIG_GUID, &guid) == 0);
1594 			if (spa_spare_exists(guid, &pool) && pool != 0ULL) {
1595 				VERIFY(nvlist_lookup_uint64_array(
1596 				    spares[i], ZPOOL_CONFIG_STATS,
1597 				    (uint64_t **)&vs, &vsc) == 0);
1598 				vs->vs_state = VDEV_STATE_CANT_OPEN;
1599 				vs->vs_aux = VDEV_AUX_SPARED;
1600 			}
1601 		}
1602 	}
1603 }
1604 
1605 /*
1606  * Add l2cache device information to the nvlist, including vdev stats.
1607  */
1608 static void
1609 spa_add_l2cache(spa_t *spa, nvlist_t *config)
1610 {
1611 	nvlist_t **l2cache;
1612 	uint_t i, j, nl2cache;
1613 	nvlist_t *nvroot;
1614 	uint64_t guid;
1615 	vdev_t *vd;
1616 	vdev_stat_t *vs;
1617 	uint_t vsc;
1618 
1619 	if (spa->spa_l2cache.sav_count == 0)
1620 		return;
1621 
1622 	spa_config_enter(spa, RW_READER, FTAG);
1623 
1624 	VERIFY(nvlist_lookup_nvlist(config,
1625 	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
1626 	VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
1627 	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
1628 	if (nl2cache != 0) {
1629 		VERIFY(nvlist_add_nvlist_array(nvroot,
1630 		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
1631 		VERIFY(nvlist_lookup_nvlist_array(nvroot,
1632 		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
1633 
1634 		/*
1635 		 * Update level 2 cache device stats.
1636 		 */
1637 
1638 		for (i = 0; i < nl2cache; i++) {
1639 			VERIFY(nvlist_lookup_uint64(l2cache[i],
1640 			    ZPOOL_CONFIG_GUID, &guid) == 0);
1641 
1642 			vd = NULL;
1643 			for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
1644 				if (guid ==
1645 				    spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
1646 					vd = spa->spa_l2cache.sav_vdevs[j];
1647 					break;
1648 				}
1649 			}
1650 			ASSERT(vd != NULL);
1651 
1652 			VERIFY(nvlist_lookup_uint64_array(l2cache[i],
1653 			    ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0);
1654 			vdev_get_stats(vd, vs);
1655 		}
1656 	}
1657 
1658 	spa_config_exit(spa, FTAG);
1659 }
1660 
1661 int
1662 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
1663 {
1664 	int error;
1665 	spa_t *spa;
1666 
1667 	*config = NULL;
1668 	error = spa_open_common(name, &spa, FTAG, config);
1669 
1670 	if (spa && *config != NULL) {
1671 		VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
1672 		    spa_get_errlog_size(spa)) == 0);
1673 
1674 		spa_add_spares(spa, *config);
1675 		spa_add_l2cache(spa, *config);
1676 	}
1677 
1678 	/*
1679 	 * We want to get the alternate root even for faulted pools, so we cheat
1680 	 * and call spa_lookup() directly.
1681 	 */
1682 	if (altroot) {
1683 		if (spa == NULL) {
1684 			mutex_enter(&spa_namespace_lock);
1685 			spa = spa_lookup(name);
1686 			if (spa)
1687 				spa_altroot(spa, altroot, buflen);
1688 			else
1689 				altroot[0] = '\0';
1690 			spa = NULL;
1691 			mutex_exit(&spa_namespace_lock);
1692 		} else {
1693 			spa_altroot(spa, altroot, buflen);
1694 		}
1695 	}
1696 
1697 	if (spa != NULL)
1698 		spa_close(spa, FTAG);
1699 
1700 	return (error);
1701 }
1702 
1703 /*
1704  * Validate that the auxiliary device array is well formed.  We must have an
1705  * array of nvlists, each which describes a valid leaf vdev.  If this is an
1706  * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
1707  * specified, as long as they are well-formed.
1708  */
1709 static int
1710 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
1711     spa_aux_vdev_t *sav, const char *config, uint64_t version,
1712     vdev_labeltype_t label)
1713 {
1714 	nvlist_t **dev;
1715 	uint_t i, ndev;
1716 	vdev_t *vd;
1717 	int error;
1718 
1719 	/*
1720 	 * It's acceptable to have no devs specified.
1721 	 */
1722 	if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
1723 		return (0);
1724 
1725 	if (ndev == 0)
1726 		return (EINVAL);
1727 
1728 	/*
1729 	 * Make sure the pool is formatted with a version that supports this
1730 	 * device type.
1731 	 */
1732 	if (spa_version(spa) < version)
1733 		return (ENOTSUP);
1734 
1735 	/*
1736 	 * Set the pending device list so we correctly handle device in-use
1737 	 * checking.
1738 	 */
1739 	sav->sav_pending = dev;
1740 	sav->sav_npending = ndev;
1741 
1742 	for (i = 0; i < ndev; i++) {
1743 		if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
1744 		    mode)) != 0)
1745 			goto out;
1746 
1747 		if (!vd->vdev_ops->vdev_op_leaf) {
1748 			vdev_free(vd);
1749 			error = EINVAL;
1750 			goto out;
1751 		}
1752 
1753 		/*
1754 		 * The L2ARC currently only supports disk devices.
1755 		 */
1756 		if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
1757 		    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
1758 			error = ENOTBLK;
1759 			goto out;
1760 		}
1761 
1762 		vd->vdev_top = vd;
1763 
1764 		if ((error = vdev_open(vd)) == 0 &&
1765 		    (error = vdev_label_init(vd, crtxg, label)) == 0) {
1766 			VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
1767 			    vd->vdev_guid) == 0);
1768 		}
1769 
1770 		vdev_free(vd);
1771 
1772 		if (error &&
1773 		    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
1774 			goto out;
1775 		else
1776 			error = 0;
1777 	}
1778 
1779 out:
1780 	sav->sav_pending = NULL;
1781 	sav->sav_npending = 0;
1782 	return (error);
1783 }
1784 
1785 static int
1786 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
1787 {
1788 	int error;
1789 
1790 	if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
1791 	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
1792 	    VDEV_LABEL_SPARE)) != 0) {
1793 		return (error);
1794 	}
1795 
1796 	return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
1797 	    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
1798 	    VDEV_LABEL_L2CACHE));
1799 }
1800 
1801 static void
1802 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
1803     const char *config)
1804 {
1805 	int i;
1806 
1807 	if (sav->sav_config != NULL) {
1808 		nvlist_t **olddevs;
1809 		uint_t oldndevs;
1810 		nvlist_t **newdevs;
1811 
1812 		/*
1813 		 * Generate new dev list by concatentating with the
1814 		 * current dev list.
1815 		 */
1816 		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
1817 		    &olddevs, &oldndevs) == 0);
1818 
1819 		newdevs = kmem_alloc(sizeof (void *) *
1820 		    (ndevs + oldndevs), KM_SLEEP);
1821 		for (i = 0; i < oldndevs; i++)
1822 			VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
1823 			    KM_SLEEP) == 0);
1824 		for (i = 0; i < ndevs; i++)
1825 			VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
1826 			    KM_SLEEP) == 0);
1827 
1828 		VERIFY(nvlist_remove(sav->sav_config, config,
1829 		    DATA_TYPE_NVLIST_ARRAY) == 0);
1830 
1831 		VERIFY(nvlist_add_nvlist_array(sav->sav_config,
1832 		    config, newdevs, ndevs + oldndevs) == 0);
1833 		for (i = 0; i < oldndevs + ndevs; i++)
1834 			nvlist_free(newdevs[i]);
1835 		kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
1836 	} else {
1837 		/*
1838 		 * Generate a new dev list.
1839 		 */
1840 		VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
1841 		    KM_SLEEP) == 0);
1842 		VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
1843 		    devs, ndevs) == 0);
1844 	}
1845 }
1846 
1847 /*
1848  * Stop and drop level 2 ARC devices
1849  */
1850 void
1851 spa_l2cache_drop(spa_t *spa)
1852 {
1853 	vdev_t *vd;
1854 	int i;
1855 	spa_aux_vdev_t *sav = &spa->spa_l2cache;
1856 
1857 	for (i = 0; i < sav->sav_count; i++) {
1858 		uint64_t pool;
1859 
1860 		vd = sav->sav_vdevs[i];
1861 		ASSERT(vd != NULL);
1862 
1863 		if (spa_mode & FWRITE &&
1864 		    spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL) {
1865 			l2arc_remove_vdev(vd);
1866 		}
1867 		if (vd->vdev_isl2cache)
1868 			spa_l2cache_remove(vd);
1869 		vdev_clear_stats(vd);
1870 		(void) vdev_close(vd);
1871 	}
1872 }
1873 
1874 /*
1875  * Pool Creation
1876  */
1877 int
1878 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
1879     const char *history_str)
1880 {
1881 	spa_t *spa;
1882 	char *altroot = NULL;
1883 	vdev_t *rvd;
1884 	dsl_pool_t *dp;
1885 	dmu_tx_t *tx;
1886 	int c, error = 0;
1887 	uint64_t txg = TXG_INITIAL;
1888 	nvlist_t **spares, **l2cache;
1889 	uint_t nspares, nl2cache;
1890 	uint64_t version;
1891 
1892 	/*
1893 	 * If this pool already exists, return failure.
1894 	 */
1895 	mutex_enter(&spa_namespace_lock);
1896 	if (spa_lookup(pool) != NULL) {
1897 		mutex_exit(&spa_namespace_lock);
1898 		return (EEXIST);
1899 	}
1900 
1901 	/*
1902 	 * Allocate a new spa_t structure.
1903 	 */
1904 	(void) nvlist_lookup_string(props,
1905 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
1906 	spa = spa_add(pool, altroot);
1907 	spa_activate(spa);
1908 
1909 	spa->spa_uberblock.ub_txg = txg - 1;
1910 
1911 	if (props && (error = spa_prop_validate(spa, props))) {
1912 		spa_unload(spa);
1913 		spa_deactivate(spa);
1914 		spa_remove(spa);
1915 		return (error);
1916 	}
1917 
1918 	if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION),
1919 	    &version) != 0)
1920 		version = SPA_VERSION;
1921 	ASSERT(version <= SPA_VERSION);
1922 	spa->spa_uberblock.ub_version = version;
1923 	spa->spa_ubsync = spa->spa_uberblock;
1924 
1925 	/*
1926 	 * Create the root vdev.
1927 	 */
1928 	spa_config_enter(spa, RW_WRITER, FTAG);
1929 
1930 	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
1931 
1932 	ASSERT(error != 0 || rvd != NULL);
1933 	ASSERT(error != 0 || spa->spa_root_vdev == rvd);
1934 
1935 	if (error == 0 && rvd->vdev_children == 0)
1936 		error = EINVAL;
1937 
1938 	if (error == 0 &&
1939 	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
1940 	    (error = spa_validate_aux(spa, nvroot, txg,
1941 	    VDEV_ALLOC_ADD)) == 0) {
1942 		for (c = 0; c < rvd->vdev_children; c++)
1943 			vdev_init(rvd->vdev_child[c], txg);
1944 		vdev_config_dirty(rvd);
1945 	}
1946 
1947 	spa_config_exit(spa, FTAG);
1948 
1949 	if (error != 0) {
1950 		spa_unload(spa);
1951 		spa_deactivate(spa);
1952 		spa_remove(spa);
1953 		mutex_exit(&spa_namespace_lock);
1954 		return (error);
1955 	}
1956 
1957 	/*
1958 	 * Get the list of spares, if specified.
1959 	 */
1960 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1961 	    &spares, &nspares) == 0) {
1962 		VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
1963 		    KM_SLEEP) == 0);
1964 		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
1965 		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
1966 		spa_config_enter(spa, RW_WRITER, FTAG);
1967 		spa_load_spares(spa);
1968 		spa_config_exit(spa, FTAG);
1969 		spa->spa_spares.sav_sync = B_TRUE;
1970 	}
1971 
1972 	/*
1973 	 * Get the list of level 2 cache devices, if specified.
1974 	 */
1975 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
1976 	    &l2cache, &nl2cache) == 0) {
1977 		VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
1978 		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
1979 		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
1980 		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
1981 		spa_config_enter(spa, RW_WRITER, FTAG);
1982 		spa_load_l2cache(spa);
1983 		spa_config_exit(spa, FTAG);
1984 		spa->spa_l2cache.sav_sync = B_TRUE;
1985 	}
1986 
1987 	spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
1988 	spa->spa_meta_objset = dp->dp_meta_objset;
1989 
1990 	tx = dmu_tx_create_assigned(dp, txg);
1991 
1992 	/*
1993 	 * Create the pool config object.
1994 	 */
1995 	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
1996 	    DMU_OT_PACKED_NVLIST, 1 << 14,
1997 	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
1998 
1999 	if (zap_add(spa->spa_meta_objset,
2000 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
2001 	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
2002 		cmn_err(CE_PANIC, "failed to add pool config");
2003 	}
2004 
2005 	/* Newly created pools with the right version are always deflated. */
2006 	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
2007 		spa->spa_deflate = TRUE;
2008 		if (zap_add(spa->spa_meta_objset,
2009 		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
2010 		    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
2011 			cmn_err(CE_PANIC, "failed to add deflate");
2012 		}
2013 	}
2014 
2015 	/*
2016 	 * Create the deferred-free bplist object.  Turn off compression
2017 	 * because sync-to-convergence takes longer if the blocksize
2018 	 * keeps changing.
2019 	 */
2020 	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
2021 	    1 << 14, tx);
2022 	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
2023 	    ZIO_COMPRESS_OFF, tx);
2024 
2025 	if (zap_add(spa->spa_meta_objset,
2026 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
2027 	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
2028 		cmn_err(CE_PANIC, "failed to add bplist");
2029 	}
2030 
2031 	/*
2032 	 * Create the pool's history object.
2033 	 */
2034 	if (version >= SPA_VERSION_ZPOOL_HISTORY)
2035 		spa_history_create_obj(spa, tx);
2036 
2037 	/*
2038 	 * Set pool properties.
2039 	 */
2040 	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
2041 	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
2042 	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
2043 	if (props)
2044 		spa_sync_props(spa, props, CRED(), tx);
2045 
2046 	dmu_tx_commit(tx);
2047 
2048 	spa->spa_sync_on = B_TRUE;
2049 	txg_sync_start(spa->spa_dsl_pool);
2050 
2051 	/*
2052 	 * We explicitly wait for the first transaction to complete so that our
2053 	 * bean counters are appropriately updated.
2054 	 */
2055 	txg_wait_synced(spa->spa_dsl_pool, txg);
2056 
2057 	spa_config_sync();
2058 
2059 	if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL)
2060 		(void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE);
2061 
2062 	mutex_exit(&spa_namespace_lock);
2063 
2064 	return (0);
2065 }
2066 
2067 /*
2068  * Import the given pool into the system.  We set up the necessary spa_t and
2069  * then call spa_load() to do the dirty work.
2070  */
2071 int
2072 spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
2073 {
2074 	spa_t *spa;
2075 	char *altroot = NULL;
2076 	int error;
2077 	nvlist_t *nvroot;
2078 	nvlist_t **spares, **l2cache;
2079 	uint_t nspares, nl2cache;
2080 
2081 	/*
2082 	 * If a pool with this name exists, return failure.
2083 	 */
2084 	mutex_enter(&spa_namespace_lock);
2085 	if (spa_lookup(pool) != NULL) {
2086 		mutex_exit(&spa_namespace_lock);
2087 		return (EEXIST);
2088 	}
2089 
2090 	/*
2091 	 * Create and initialize the spa structure.
2092 	 */
2093 	(void) nvlist_lookup_string(props,
2094 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
2095 	spa = spa_add(pool, altroot);
2096 	spa_activate(spa);
2097 
2098 	/*
2099 	 * Pass off the heavy lifting to spa_load().
2100 	 * Pass TRUE for mosconfig because the user-supplied config
2101 	 * is actually the one to trust when doing an import.
2102 	 */
2103 	error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);
2104 
2105 	spa_config_enter(spa, RW_WRITER, FTAG);
2106 	/*
2107 	 * Toss any existing sparelist, as it doesn't have any validity anymore,
2108 	 * and conflicts with spa_has_spare().
2109 	 */
2110 	if (spa->spa_spares.sav_config) {
2111 		nvlist_free(spa->spa_spares.sav_config);
2112 		spa->spa_spares.sav_config = NULL;
2113 		spa_load_spares(spa);
2114 	}
2115 	if (spa->spa_l2cache.sav_config) {
2116 		nvlist_free(spa->spa_l2cache.sav_config);
2117 		spa->spa_l2cache.sav_config = NULL;
2118 		spa_load_l2cache(spa);
2119 	}
2120 
2121 	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
2122 	    &nvroot) == 0);
2123 	if (error == 0)
2124 		error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_SPARE);
2125 	if (error == 0)
2126 		error = spa_validate_aux(spa, nvroot, -1ULL,
2127 		    VDEV_ALLOC_L2CACHE);
2128 	spa_config_exit(spa, FTAG);
2129 
2130 	if (error != 0 || (props && (error = spa_prop_set(spa, props)))) {
2131 		spa_unload(spa);
2132 		spa_deactivate(spa);
2133 		spa_remove(spa);
2134 		mutex_exit(&spa_namespace_lock);
2135 		return (error);
2136 	}
2137 
2138 	/*
2139 	 * Override any spares and level 2 cache devices as specified by
2140 	 * the user, as these may have correct device names/devids, etc.
2141 	 */
2142 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
2143 	    &spares, &nspares) == 0) {
2144 		if (spa->spa_spares.sav_config)
2145 			VERIFY(nvlist_remove(spa->spa_spares.sav_config,
2146 			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
2147 		else
2148 			VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
2149 			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
2150 		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
2151 		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
2152 		spa_config_enter(spa, RW_WRITER, FTAG);
2153 		spa_load_spares(spa);
2154 		spa_config_exit(spa, FTAG);
2155 		spa->spa_spares.sav_sync = B_TRUE;
2156 	}
2157 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
2158 	    &l2cache, &nl2cache) == 0) {
2159 		if (spa->spa_l2cache.sav_config)
2160 			VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
2161 			    ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
2162 		else
2163 			VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
2164 			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
2165 		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
2166 		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
2167 		spa_config_enter(spa, RW_WRITER, FTAG);
2168 		spa_load_l2cache(spa);
2169 		spa_config_exit(spa, FTAG);
2170 		spa->spa_l2cache.sav_sync = B_TRUE;
2171 	}
2172 
2173 	/*
2174 	 * Update the config cache to include the newly-imported pool.
2175 	 */
2176 	if (spa_mode & FWRITE)
2177 		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
2178 
2179 	/*
2180 	 * Resilver anything that's out of date.
2181 	 */
2182 	if (spa_mode & FWRITE)
2183 		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
2184 
2185 	mutex_exit(&spa_namespace_lock);
2186 
2187 	return (0);
2188 }
2189 
2190 /*
2191  * This (illegal) pool name is used when temporarily importing a spa_t in order
2192  * to get the vdev stats associated with the imported devices.
2193  */
2194 #define	TRYIMPORT_NAME	"$import"
2195 
2196 nvlist_t *
2197 spa_tryimport(nvlist_t *tryconfig)
2198 {
2199 	nvlist_t *config = NULL;
2200 	char *poolname;
2201 	spa_t *spa;
2202 	uint64_t state;
2203 
2204 	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
2205 		return (NULL);
2206 
2207 	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
2208 		return (NULL);
2209 
2210 	/*
2211 	 * Create and initialize the spa structure.
2212 	 */
2213 	mutex_enter(&spa_namespace_lock);
2214 	spa = spa_add(TRYIMPORT_NAME, NULL);
2215 	spa_activate(spa);
2216 
2217 	/*
2218 	 * Pass off the heavy lifting to spa_load().
2219 	 * Pass TRUE for mosconfig because the user-supplied config
2220 	 * is actually the one to trust when doing an import.
2221 	 */
2222 	(void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);
2223 
2224 	/*
2225 	 * If 'tryconfig' was at least parsable, return the current config.
2226 	 */
2227 	if (spa->spa_root_vdev != NULL) {
2228 		spa_config_enter(spa, RW_READER, FTAG);
2229 		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
2230 		spa_config_exit(spa, FTAG);
2231 		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
2232 		    poolname) == 0);
2233 		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
2234 		    state) == 0);
2235 		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
2236 		    spa->spa_uberblock.ub_timestamp) == 0);
2237 
2238 		/*
2239 		 * Add the list of hot spares and level 2 cache devices.
2240 		 */
2241 		spa_add_spares(spa, config);
2242 		spa_add_l2cache(spa, config);
2243 	}
2244 
2245 	spa_unload(spa);
2246 	spa_deactivate(spa);
2247 	spa_remove(spa);
2248 	mutex_exit(&spa_namespace_lock);
2249 
2250 	return (config);
2251 }
2252 
2253 /*
2254  * Pool export/destroy
2255  *
2256  * The act of destroying or exporting a pool is very simple.  We make sure there
2257  * is no more pending I/O and any references to the pool are gone.  Then, we
2258  * update the pool state and sync all the labels to disk, removing the
2259  * configuration from the cache afterwards.
2260  */
2261 static int
2262 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig)
2263 {
2264 	spa_t *spa;
2265 
2266 	if (oldconfig)
2267 		*oldconfig = NULL;
2268 
2269 	if (!(spa_mode & FWRITE))
2270 		return (EROFS);
2271 
2272 	mutex_enter(&spa_namespace_lock);
2273 	if ((spa = spa_lookup(pool)) == NULL) {
2274 		mutex_exit(&spa_namespace_lock);
2275 		return (ENOENT);
2276 	}
2277 
2278 	/*
2279 	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
2280 	 * reacquire the namespace lock, and see if we can export.
2281 	 */
2282 	spa_open_ref(spa, FTAG);
2283 	mutex_exit(&spa_namespace_lock);
2284 	spa_async_suspend(spa);
2285 	mutex_enter(&spa_namespace_lock);
2286 	spa_close(spa, FTAG);
2287 
2288 	/*
2289 	 * The pool will be in core if it's openable,
2290 	 * in which case we can modify its state.
2291 	 */
2292 	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
2293 		/*
2294 		 * Objsets may be open only because they're dirty, so we
2295 		 * have to force it to sync before checking spa_refcnt.
2296 		 */
2297 		spa_scrub_suspend(spa);
2298 		txg_wait_synced(spa->spa_dsl_pool, 0);
2299 
2300 		/*
2301 		 * A pool cannot be exported or destroyed if there are active
2302 		 * references.  If we are resetting a pool, allow references by
2303 		 * fault injection handlers.
2304 		 */
2305 		if (!spa_refcount_zero(spa) ||
2306 		    (spa->spa_inject_ref != 0 &&
2307 		    new_state != POOL_STATE_UNINITIALIZED)) {
2308 			spa_scrub_resume(spa);
2309 			spa_async_resume(spa);
2310 			mutex_exit(&spa_namespace_lock);
2311 			return (EBUSY);
2312 		}
2313 
2314 		spa_scrub_resume(spa);
2315 		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
2316 
2317 		/*
2318 		 * We want this to be reflected on every label,
2319 		 * so mark them all dirty.  spa_unload() will do the
2320 		 * final sync that pushes these changes out.
2321 		 */
2322 		if (new_state != POOL_STATE_UNINITIALIZED) {
2323 			spa_config_enter(spa, RW_WRITER, FTAG);
2324 			spa->spa_state = new_state;
2325 			spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
2326 			vdev_config_dirty(spa->spa_root_vdev);
2327 			spa_config_exit(spa, FTAG);
2328 		}
2329 	}
2330 
2331 	spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);
2332 
2333 	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
2334 		spa_unload(spa);
2335 		spa_deactivate(spa);
2336 	}
2337 
2338 	if (oldconfig && spa->spa_config)
2339 		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
2340 
2341 	if (new_state != POOL_STATE_UNINITIALIZED) {
2342 		spa_config_check(spa->spa_config_dir,
2343 		    spa->spa_config_file);
2344 		spa_remove(spa);
2345 		spa_config_sync();
2346 	}
2347 	mutex_exit(&spa_namespace_lock);
2348 
2349 	return (0);
2350 }
2351 
2352 /*
2353  * Destroy a storage pool.
2354  */
2355 int
2356 spa_destroy(char *pool)
2357 {
2358 	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL));
2359 }
2360 
2361 /*
2362  * Export a storage pool.
2363  */
2364 int
2365 spa_export(char *pool, nvlist_t **oldconfig)
2366 {
2367 	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig));
2368 }
2369 
2370 /*
2371  * Similar to spa_export(), this unloads the spa_t without actually removing it
2372  * from the namespace in any way.
2373  */
2374 int
2375 spa_reset(char *pool)
2376 {
2377 	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL));
2378 }
2379 
2380 
2381 /*
2382  * ==========================================================================
2383  * Device manipulation
2384  * ==========================================================================
2385  */
2386 
2387 /*
2388  * Add a device to a storage pool.
2389  */
2390 int
2391 spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
2392 {
2393 	uint64_t txg;
2394 	int c, error;
2395 	vdev_t *rvd = spa->spa_root_vdev;
2396 	vdev_t *vd, *tvd;
2397 	nvlist_t **spares, **l2cache;
2398 	uint_t nspares, nl2cache;
2399 
2400 	txg = spa_vdev_enter(spa);
2401 
2402 	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
2403 	    VDEV_ALLOC_ADD)) != 0)
2404 		return (spa_vdev_exit(spa, NULL, txg, error));
2405 
2406 	spa->spa_pending_vdev = vd;
2407 
2408 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
2409 	    &nspares) != 0)
2410 		nspares = 0;
2411 
2412 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
2413 	    &nl2cache) != 0)
2414 		nl2cache = 0;
2415 
2416 	if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) {
2417 		spa->spa_pending_vdev = NULL;
2418 		return (spa_vdev_exit(spa, vd, txg, EINVAL));
2419 	}
2420 
2421 	if (vd->vdev_children != 0) {
2422 		if ((error = vdev_create(vd, txg, B_FALSE)) != 0) {
2423 			spa->spa_pending_vdev = NULL;
2424 			return (spa_vdev_exit(spa, vd, txg, error));
2425 		}
2426 	}
2427 
2428 	/*
2429 	 * We must validate the spares and l2cache devices after checking the
2430 	 * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
2431 	 */
2432 	if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) {
2433 		spa->spa_pending_vdev = NULL;
2434 		return (spa_vdev_exit(spa, vd, txg, error));
2435 	}
2436 
2437 	spa->spa_pending_vdev = NULL;
2438 
2439 	/*
2440 	 * Transfer each new top-level vdev from vd to rvd.
2441 	 */
2442 	for (c = 0; c < vd->vdev_children; c++) {
2443 		tvd = vd->vdev_child[c];
2444 		vdev_remove_child(vd, tvd);
2445 		tvd->vdev_id = rvd->vdev_children;
2446 		vdev_add_child(rvd, tvd);
2447 		vdev_config_dirty(tvd);
2448 	}
2449 
2450 	if (nspares != 0) {
2451 		spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
2452 		    ZPOOL_CONFIG_SPARES);
2453 		spa_load_spares(spa);
2454 		spa->spa_spares.sav_sync = B_TRUE;
2455 	}
2456 
2457 	if (nl2cache != 0) {
2458 		spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
2459 		    ZPOOL_CONFIG_L2CACHE);
2460 		spa_load_l2cache(spa);
2461 		spa->spa_l2cache.sav_sync = B_TRUE;
2462 	}
2463 
2464 	/*
2465 	 * We have to be careful when adding new vdevs to an existing pool.
2466 	 * If other threads start allocating from these vdevs before we
2467 	 * sync the config cache, and we lose power, then upon reboot we may
2468 	 * fail to open the pool because there are DVAs that the config cache
2469 	 * can't translate.  Therefore, we first add the vdevs without
2470 	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
2471 	 * and then let spa_config_update() initialize the new metaslabs.
2472 	 *
2473 	 * spa_load() checks for added-but-not-initialized vdevs, so that
2474 	 * if we lose power at any point in this sequence, the remaining
2475 	 * steps will be completed the next time we load the pool.
2476 	 */
2477 	(void) spa_vdev_exit(spa, vd, txg, 0);
2478 
2479 	mutex_enter(&spa_namespace_lock);
2480 	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
2481 	mutex_exit(&spa_namespace_lock);
2482 
2483 	return (0);
2484 }
2485 
2486 /*
2487  * Attach a device to a mirror.  The arguments are the guid of any leaf device
2488  * in the mirror, and the nvroot for the new device.  If the guid specifies
2489  * a device that is not mirrored, we automatically insert the mirror vdev.
2490  *
2491  * If 'replacing' is specified, the new device is intended to replace the
2492  * existing device; in this case the two devices are made into their own
2493  * mirror using the 'replacing' vdev, which is functionally identical to
2494  * the mirror vdev (it actually reuses all the same ops) but has a few
2495  * extra rules: you can't attach to it after it's been created, and upon
2496  * completion of resilvering, the first disk (the one being replaced)
2497  * is automatically detached.
 *
 * Every failure path leaves through spa_vdev_exit(), passing one of:
 * ENODEV (no vdev with 'guid'), ENOTSUP (the target is not a leaf, a spare
 * would replace a log, or the parent vdev type does not allow the
 * operation), EINVAL ('nvroot' fails to parse or does not describe exactly
 * one leaf child), the vdev_create() error, EOVERFLOW (new device smaller
 * than vdev_get_rsize() of the old one) or EDOM (new device ashift larger
 * than that of the old device's top-level vdev).  On success a resilver is
 * started and 0 is returned.
2498  */
2499 int
2500 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
2501 {
2502 	uint64_t txg, open_txg;
2503 	int error;
2504 	vdev_t *rvd = spa->spa_root_vdev;
2505 	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
2506 	vdev_ops_t *pvops;
2507 	int is_log;
2508 
2509 	txg = spa_vdev_enter(spa);
2510 
2511 	oldvd = vdev_lookup_by_guid(rvd, guid);
2512 
2513 	if (oldvd == NULL)
2514 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
2515 
2516 	if (!oldvd->vdev_ops->vdev_op_leaf)
2517 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
2518 
2519 	pvd = oldvd->vdev_parent;
2520 
	/*
	 * NOTE: the specific spa_config_parse() error is not propagated;
	 * any parse failure is reported as EINVAL.
	 */
2521 	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
2522 	    VDEV_ALLOC_ADD)) != 0)
2523 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
2524 
	/*
	 * The parsed tree must be a root with exactly one child, and that
	 * child must be a leaf.
	 */
2525 	if (newrootvd->vdev_children != 1)
2526 		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
2527 
2528 	newvd = newrootvd->vdev_child[0];
2529 
2530 	if (!newvd->vdev_ops->vdev_op_leaf)
2531 		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
2532 
2533 	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
2534 		return (spa_vdev_exit(spa, newrootvd, txg, error));
2535 
2536 	/*
2537 	 * Spares can't replace logs
2538 	 */
2539 	is_log = oldvd->vdev_islog;
2540 	if (is_log && newvd->vdev_isspare)
2541 		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
2542 
	/*
	 * Choose pvops, the ops vector the parent of oldvd and newvd must
	 * have once the attach is complete.
	 */
2543 	if (!replacing) {
2544 		/*
2545 		 * For attach, the only allowable parent is a mirror or the root
2546 		 * vdev.
2547 		 */
2548 		if (pvd->vdev_ops != &vdev_mirror_ops &&
2549 		    pvd->vdev_ops != &vdev_root_ops)
2550 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
2551 
2552 		pvops = &vdev_mirror_ops;
2553 	} else {
2554 		/*
2555 		 * Active hot spares can only be replaced by inactive hot
2556 		 * spares.
2557 		 */
2558 		if (pvd->vdev_ops == &vdev_spare_ops &&
2559 		    pvd->vdev_child[1] == oldvd &&
2560 		    !spa_has_spare(spa, newvd->vdev_guid))
2561 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
2562 
2563 		/*
2564 		 * If the source is a hot spare, and the parent isn't already a
2565 		 * spare, then we want to create a new hot spare.  Otherwise, we
2566 		 * want to create a replacing vdev.  The user is not allowed to
2567 		 * attach to a spared vdev child unless the 'isspare' state is
2568 		 * the same (spare replaces spare, non-spare replaces
2569 		 * non-spare).
2570 		 */
2571 		if (pvd->vdev_ops == &vdev_replacing_ops)
2572 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
2573 		else if (pvd->vdev_ops == &vdev_spare_ops &&
2574 		    newvd->vdev_isspare != oldvd->vdev_isspare)
2575 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
2576 		else if (pvd->vdev_ops != &vdev_spare_ops &&
2577 		    newvd->vdev_isspare)
2578 			pvops = &vdev_spare_ops;
2579 		else
2580 			pvops = &vdev_replacing_ops;
2581 	}
2582 
2583 	/*
2584 	 * Compare the new device size with the replaceable/attachable
2585 	 * device size.
2586 	 */
2587 	if (newvd->vdev_psize < vdev_get_rsize(oldvd))
2588 		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
2589 
2590 	/*
2591 	 * The new device cannot have a higher alignment requirement
2592 	 * than the top-level vdev.
2593 	 */
2594 	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
2595 		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
2596 
2597 	/*
2598 	 * If this is an in-place replacement, update oldvd's path and devid
2599 	 * to make it distinguishable from newvd, and unopenable from now on.
	 * The new path is newvd's path with "/old" appended; the "+ 5" in
	 * the allocation covers those four characters and the terminator.
2600 	 */
2601 	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
2602 		spa_strfree(oldvd->vdev_path);
2603 		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
2604 		    KM_SLEEP);
2605 		(void) sprintf(oldvd->vdev_path, "%s/%s",
2606 		    newvd->vdev_path, "old");
2607 		if (oldvd->vdev_devid != NULL) {
2608 			spa_strfree(oldvd->vdev_devid);
2609 			oldvd->vdev_devid = NULL;
2610 		}
2611 	}
2612 
2613 	/*
2614 	 * If the parent is not a mirror, or if we're replacing, insert the new
2615 	 * mirror/replacing/spare vdev above oldvd.
2616 	 */
2617 	if (pvd->vdev_ops != pvops)
2618 		pvd = vdev_add_parent(oldvd, pvops);
2619 
2620 	ASSERT(pvd->vdev_top->vdev_parent == rvd);
2621 	ASSERT(pvd->vdev_ops == pvops);
2622 	ASSERT(oldvd->vdev_parent == pvd);
2623 
2624 	/*
2625 	 * Extract the new device from its root and add it to pvd.
2626 	 */
2627 	vdev_remove_child(newrootvd, newvd);
2628 	newvd->vdev_id = pvd->vdev_children;
2629 	vdev_add_child(pvd, newvd);
2630 
2631 	/*
2632 	 * If newvd is smaller than oldvd, but larger than its rsize,
2633 	 * the addition of newvd may have decreased our parent's asize.
2634 	 */
2635 	pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize);
2636 
2637 	tvd = newvd->vdev_top;
2638 	ASSERT(pvd->vdev_top == tvd);
2639 	ASSERT(tvd->vdev_parent == rvd);
2640 
2641 	vdev_config_dirty(tvd);
2642 
2643 	/*
2644 	 * Set newvd's DTL to [TXG_INITIAL, open_txg].  It will propagate
2645 	 * upward when spa_vdev_exit() calls vdev_dtl_reassess().
2646 	 */
2647 	open_txg = txg + TXG_CONCURRENT_STATES - 1;
2648 
2649 	mutex_enter(&newvd->vdev_dtl_lock);
2650 	space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
2651 	    open_txg - TXG_INITIAL + 1);
2652 	mutex_exit(&newvd->vdev_dtl_lock);
2653 
2654 	if (newvd->vdev_isspare)
2655 		spa_spare_activate(newvd);
2656 
2657 	/*
2658 	 * Mark newvd's DTL dirty in this txg.
2659 	 */
2660 	vdev_dirty(tvd, VDD_DTL, newvd, txg);
2661 
	/*
	 * Unlike the error paths above, the success path hands open_txg
	 * (not txg) to spa_vdev_exit(), and its return value is ignored.
	 */
2662 	(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);
2663 
2664 	/*
2665 	 * Kick off a resilver to update newvd.  We need to grab the namespace
2666 	 * lock because spa_scrub() needs to post a sysevent with the pool name.
2667 	 */
2668 	mutex_enter(&spa_namespace_lock);
2669 	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
2670 	mutex_exit(&spa_namespace_lock);
2671 
2672 	return (0);
2673 }
2674 
2675 /*
2676  * Detach a device from a mirror, replacing, or spare vdev.
2677  * If 'replace_done' is specified, only detach if the parent is a
2678  * replacing vdev (and 'guid' is its first child) or a spare vdev.
 *
 * Failures leave through spa_vdev_exit() with ENODEV (no vdev with 'guid'),
 * ENOTSUP (not a leaf, or the parent type does not support the detach) or
 * EBUSY (last replica, or no other live child has an empty DTL).  On the
 * success path the value returned is whatever spa_vdev_exit() returns.
2679  */
2680 int
2681 spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
2682 {
2683 	uint64_t txg;
2684 	int c, t, error;
2685 	vdev_t *rvd = spa->spa_root_vdev;
2686 	vdev_t *vd, *pvd, *cvd, *tvd;
2687 	boolean_t unspare = B_FALSE;
	/* Assigned and read only when 'unspare' is B_TRUE. */
2688 	uint64_t unspare_guid;
2689 
2690 	txg = spa_vdev_enter(spa);
2691 
2692 	vd = vdev_lookup_by_guid(rvd, guid);
2693 
2694 	if (vd == NULL)
2695 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
2696 
2697 	if (!vd->vdev_ops->vdev_op_leaf)
2698 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
2699 
2700 	pvd = vd->vdev_parent;
2701 
2702 	/*
2703 	 * If replace_done is specified, only remove this device if it's
2704 	 * the first child of a replacing vdev.  For the 'spare' vdev, either
2705 	 * disk can be removed.
2706 	 */
2707 	if (replace_done) {
2708 		if (pvd->vdev_ops == &vdev_replacing_ops) {
2709 			if (vd->vdev_id != 0)
2710 				return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
2711 		} else if (pvd->vdev_ops != &vdev_spare_ops) {
2712 			return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
2713 		}
2714 	}
2715 
2716 	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
2717 	    spa_version(spa) >= SPA_VERSION_SPARES);
2718 
2719 	/*
2720 	 * Only mirror, replacing, and spare vdevs support detach.
2721 	 */
2722 	if (pvd->vdev_ops != &vdev_replacing_ops &&
2723 	    pvd->vdev_ops != &vdev_mirror_ops &&
2724 	    pvd->vdev_ops != &vdev_spare_ops)
2725 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
2726 
2727 	/*
2728 	 * If there's only one replica, you can't detach it.
2729 	 */
2730 	if (pvd->vdev_children <= 1)
2731 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
2732 
2733 	/*
2734 	 * If all siblings have non-empty DTLs, this device may have the only
2735 	 * valid copy of the data, which means we cannot safely detach it.
2736 	 *
2737 	 * XXX -- as in the vdev_offline() case, we really want a more
2738 	 * precise DTL check.
	 *
	 * The loop stops at the first live sibling whose DTLs are both
	 * empty; c == pvd->vdev_children afterwards means none was found.
2739 	 */
2740 	for (c = 0; c < pvd->vdev_children; c++) {
2741 		uint64_t dirty;
2742 
2743 		cvd = pvd->vdev_child[c];
2744 		if (cvd == vd)
2745 			continue;
2746 		if (vdev_is_dead(cvd))
2747 			continue;
2748 		mutex_enter(&cvd->vdev_dtl_lock);
2749 		dirty = cvd->vdev_dtl_map.sm_space |
2750 		    cvd->vdev_dtl_scrub.sm_space;
2751 		mutex_exit(&cvd->vdev_dtl_lock);
2752 		if (!dirty)
2753 			break;
2754 	}
2755 
2756 	/*
2757 	 * If we are a replacing or spare vdev, then we can always detach the
2758 	 * latter child, as that is how one cancels the operation.
2759 	 */
2760 	if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) &&
2761 	    c == pvd->vdev_children)
2762 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
2763 
2764 	/*
2765 	 * If we are detaching the original disk from a spare, then it implies
2766 	 * that the spare should become a real disk, and be removed from the
2767 	 * active spare list for the pool.
2768 	 */
2769 	if (pvd->vdev_ops == &vdev_spare_ops &&
2770 	    vd->vdev_id == 0)
2771 		unspare = B_TRUE;
2772 
2773 	/*
2774 	 * Erase the disk labels so the disk can be used for other things.
2775 	 * This must be done after all other error cases are handled,
2776 	 * but before we disembowel vd (so we can still do I/O to it).
2777 	 * But if we can't do it, don't treat the error as fatal --
2778 	 * it may be that the unwritability of the disk is the reason
2779 	 * it's being detached!
	 *
	 * The value stored in 'error' here is never examined; it is
	 * overwritten by the spa_vdev_exit() result below.
2780 	 */
2781 	error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
2782 
2783 	/*
2784 	 * Remove vd from its parent and compact the parent's children.
2785 	 */
2786 	vdev_remove_child(pvd, vd);
2787 	vdev_compact_children(pvd);
2788 
2789 	/*
2790 	 * Remember one of the remaining children so we can get tvd below.
2791 	 */
2792 	cvd = pvd->vdev_child[0];
2793 
2794 	/*
2795 	 * If we need to remove the remaining child from the list of hot spares,
2796 	 * do it now, marking the vdev as no longer a spare in the process.  We
2797 	 * must do this before vdev_remove_parent(), because that can change the
2798 	 * GUID if it creates a new toplevel GUID.
2799 	 */
2800 	if (unspare) {
2801 		ASSERT(cvd->vdev_isspare);
2802 		spa_spare_remove(cvd);
2803 		unspare_guid = cvd->vdev_guid;
2804 	}
2805 
2806 	/*
2807 	 * If the parent mirror/replacing vdev only has one child,
2808 	 * the parent is no longer needed.  Remove it from the tree.
2809 	 */
2810 	if (pvd->vdev_children == 1)
2811 		vdev_remove_parent(cvd);
2812 
2813 	/*
2814 	 * We don't set tvd until now because the parent we just removed
2815 	 * may have been the previous top-level vdev.
2816 	 */
2817 	tvd = cvd->vdev_top;
2818 	ASSERT(tvd->vdev_parent == rvd);
2819 
2820 	/*
2821 	 * Reevaluate the parent vdev state.
2822 	 */
2823 	vdev_propagate_state(cvd);
2824 
2825 	/*
2826 	 * If the device we just detached was smaller than the others, it may be
2827 	 * possible to add metaslabs (i.e. grow the pool).  vdev_metaslab_init()
2828 	 * can't fail because the existing metaslabs are already in core, so
2829 	 * there's nothing to read from disk.
2830 	 */
2831 	VERIFY(vdev_metaslab_init(tvd, txg) == 0);
2832 
2833 	vdev_config_dirty(tvd);
2834 
2835 	/*
2836 	 * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
2837 	 * vd->vdev_detached is set and free vd's DTL object in syncing context.
2838 	 * But first make sure we're not on any *other* txg's DTL list, to
2839 	 * prevent vd from being accessed after it's freed.
2840 	 */
2841 	for (t = 0; t < TXG_SIZE; t++)
2842 		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
2843 	vd->vdev_detached = B_TRUE;
2844 	vdev_dirty(tvd, VDD_DTL, vd, txg);
2845 
2846 	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
2847 
2848 	error = spa_vdev_exit(spa, vd, txg, 0);
2849 
2850 	/*
2851 	 * If this was the removal of the original device in a hot spare vdev,
2852 	 * then we want to go through and remove the device from the hot spare
2853 	 * list of every other pool.
	 *
	 * 'spa' is reused as the iteration cursor from here on, and the
	 * result of each spa_vdev_remove() call is deliberately ignored.
2854 	 */
2855 	if (unspare) {
2856 		spa = NULL;
2857 		mutex_enter(&spa_namespace_lock);
2858 		while ((spa = spa_next(spa)) != NULL) {
2859 			if (spa->spa_state != POOL_STATE_ACTIVE)
2860 				continue;
2861 
2862 			(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
2863 		}
2864 		mutex_exit(&spa_namespace_lock);
2865 	}
2866 
2867 	return (error);
2868 }
2869 
2870 /*
2871  * Remove a spares vdev from the nvlist config.
2872  */
2873 static int
2874 spa_remove_spares(spa_aux_vdev_t *sav, uint64_t guid, boolean_t unspare,
2875     nvlist_t **spares, int nspares, vdev_t *vd)
2876 {
2877 	nvlist_t *nv, **newspares;
2878 	int i, j;
2879 
2880 	nv = NULL;
2881 	for (i = 0; i < nspares; i++) {
2882 		uint64_t theguid;
2883 
2884 		VERIFY(nvlist_lookup_uint64(spares[i],
2885 		    ZPOOL_CONFIG_GUID, &theguid) == 0);
2886 		if (theguid == guid) {
2887 			nv = spares[i];
2888 			break;
2889 		}
2890 	}
2891 
2892 	/*
2893 	 * Only remove the hot spare if it's not currently in use in this pool.
2894 	 */
2895 	if (nv == NULL && vd == NULL)
2896 		return (ENOENT);
2897 
2898 	if (nv == NULL && vd != NULL)
2899 		return (ENOTSUP);
2900 
2901 	if (!unspare && nv != NULL && vd != NULL)
2902 		return (EBUSY);
2903 
2904 	if (nspares == 1) {
2905 		newspares = NULL;
2906 	} else {
2907 		newspares = kmem_alloc((nspares - 1) * sizeof (void *),
2908 		    KM_SLEEP);
2909 		for (i = 0, j = 0; i < nspares; i++) {
2910 			if (spares[i] != nv)
2911 				VERIFY(nvlist_dup(spares[i],
2912 				    &newspares[j++], KM_SLEEP) == 0);
2913 		}
2914 	}
2915 
2916 	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_SPARES,
2917 	    DATA_TYPE_NVLIST_ARRAY) == 0);
2918 	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
2919 	    ZPOOL_CONFIG_SPARES, newspares, nspares - 1) == 0);
2920 	for (i = 0; i < nspares - 1; i++)
2921 		nvlist_free(newspares[i]);
2922 	kmem_free(newspares, (nspares - 1) * sizeof (void *));
2923 
2924 	return (0);
2925 }
2926 
2927 /*
2928  * Remove an l2cache vdev from the nvlist config.
2929  */
2930 static int
2931 spa_remove_l2cache(spa_aux_vdev_t *sav, uint64_t guid, nvlist_t **l2cache,
2932     int nl2cache, vdev_t *vd)
2933 {
2934 	nvlist_t *nv, **newl2cache;
2935 	int i, j;
2936 
2937 	nv = NULL;
2938 	for (i = 0; i < nl2cache; i++) {
2939 		uint64_t theguid;
2940 
2941 		VERIFY(nvlist_lookup_uint64(l2cache[i],
2942 		    ZPOOL_CONFIG_GUID, &theguid) == 0);
2943 		if (theguid == guid) {
2944 			nv = l2cache[i];
2945 			break;
2946 		}
2947 	}
2948 
2949 	if (vd == NULL) {
2950 		for (i = 0; i < nl2cache; i++) {
2951 			if (sav->sav_vdevs[i]->vdev_guid == guid) {
2952 				vd = sav->sav_vdevs[i];
2953 				break;
2954 			}
2955 		}
2956 	}
2957 
2958 	if (nv == NULL && vd == NULL)
2959 		return (ENOENT);
2960 
2961 	if (nv == NULL && vd != NULL)
2962 		return (ENOTSUP);
2963 
2964 	if (nl2cache == 1) {
2965 		newl2cache = NULL;
2966 	} else {
2967 		newl2cache = kmem_alloc((nl2cache - 1) * sizeof (void *),
2968 		    KM_SLEEP);
2969 		for (i = 0, j = 0; i < nl2cache; i++) {
2970 			if (l2cache[i] != nv)
2971 				VERIFY(nvlist_dup(l2cache[i],
2972 				    &newl2cache[j++], KM_SLEEP) == 0);
2973 		}
2974 	}
2975 
2976 	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
2977 	    DATA_TYPE_NVLIST_ARRAY) == 0);
2978 	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
2979 	    ZPOOL_CONFIG_L2CACHE, newl2cache, nl2cache - 1) == 0);
2980 	for (i = 0; i < nl2cache - 1; i++)
2981 		nvlist_free(newl2cache[i]);
2982 	kmem_free(newl2cache, (nl2cache - 1) * sizeof (void *));
2983 
2984 	return (0);
2985 }
2986 
2987 /*
2988  * Remove a device from the pool.  Currently, this supports removing only hot
2989  * spares and level 2 ARC devices.
2990  */
2991 int
2992 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
2993 {
2994 	vdev_t *vd;
2995 	nvlist_t **spares, **l2cache;
2996 	uint_t nspares, nl2cache;
2997 	int error = 0;
2998 
2999 	spa_config_enter(spa, RW_WRITER, FTAG);
3000 
3001 	vd = spa_lookup_by_guid(spa, guid);
3002 
3003 	if (spa->spa_spares.sav_vdevs != NULL &&
3004 	    spa_spare_exists(guid, NULL) &&
3005 	    nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
3006 	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) {
3007 		if ((error = spa_remove_spares(&spa->spa_spares, guid, unspare,
3008 		    spares, nspares, vd)) != 0)
3009 			goto out;
3010 		spa_load_spares(spa);
3011 		spa->spa_spares.sav_sync = B_TRUE;
3012 		goto out;
3013 	}
3014 
3015 	if (spa->spa_l2cache.sav_vdevs != NULL &&
3016 	    spa_l2cache_exists(guid, NULL) &&
3017 	    nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
3018 	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) {
3019 		if ((error = spa_remove_l2cache(&spa->spa_l2cache, guid,
3020 		    l2cache, nl2cache, vd)) != 0)
3021 			goto out;
3022 		spa_load_l2cache(spa);
3023 		spa->spa_l2cache.sav_sync = B_TRUE;
3024 	}
3025 
3026 out:
3027 	spa_config_exit(spa, FTAG);
3028 	return (error);
3029 }
3030 
3031 /*
3032  * Find any device that's done replacing, or a vdev marked 'unspare' that's
3033  * current spared, so we can detach it.
3034  */
3035 static vdev_t *
3036 spa_vdev_resilver_done_hunt(vdev_t *vd)
3037 {
3038 	vdev_t *newvd, *oldvd;
3039 	int c;
3040 
3041 	for (c = 0; c < vd->vdev_children; c++) {
3042 		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
3043 		if (oldvd != NULL)
3044 			return (oldvd);
3045 	}
3046 
3047 	/*
3048 	 * Check for a completed replacement.
3049 	 */
3050 	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
3051 		oldvd = vd->vdev_child[0];
3052 		newvd = vd->vdev_child[1];
3053 
3054 		mutex_enter(&newvd->vdev_dtl_lock);
3055 		if (newvd->vdev_dtl_map.sm_space == 0 &&
3056 		    newvd->vdev_dtl_scrub.sm_space == 0) {
3057 			mutex_exit(&newvd->vdev_dtl_lock);
3058 			return (oldvd);
3059 		}
3060 		mutex_exit(&newvd->vdev_dtl_lock);
3061 	}
3062 
3063 	/*
3064 	 * Check for a completed resilver with the 'unspare' flag set.
3065 	 */
3066 	if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) {
3067 		newvd = vd->vdev_child[0];
3068 		oldvd = vd->vdev_child[1];
3069 
3070 		mutex_enter(&newvd->vdev_dtl_lock);
3071 		if (newvd->vdev_unspare &&
3072 		    newvd->vdev_dtl_map.sm_space == 0 &&
3073 		    newvd->vdev_dtl_scrub.sm_space == 0) {
3074 			newvd->vdev_unspare = 0;
3075 			mutex_exit(&newvd->vdev_dtl_lock);
3076 			return (oldvd);
3077 		}
3078 		mutex_exit(&newvd->vdev_dtl_lock);
3079 	}
3080 
3081 	return (NULL);
3082 }
3083 
3084 static void
3085 spa_vdev_resilver_done(spa_t *spa)
3086 {
3087 	vdev_t *vd;
3088 	vdev_t *pvd;
3089 	uint64_t guid;
3090 	uint64_t pguid = 0;
3091 
3092 	spa_config_enter(spa, RW_READER, FTAG);
3093 
3094 	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
3095 		guid = vd->vdev_guid;
3096 		/*
3097 		 * If we have just finished replacing a hot spared device, then
3098 		 * we need to detach the parent's first child (the original hot
3099 		 * spare) as well.
3100 		 */
3101 		pvd = vd->vdev_parent;
3102 		if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
3103 		    pvd->vdev_id == 0) {
3104 			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
3105 			ASSERT(pvd->vdev_parent->vdev_children == 2);
3106 			pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid;
3107 		}
3108 		spa_config_exit(spa, FTAG);
3109 		if (spa_vdev_detach(spa, guid, B_TRUE) != 0)
3110 			return;
3111 		if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0)
3112 			return;
3113 		spa_config_enter(spa, RW_READER, FTAG);
3114 	}
3115 
3116 	spa_config_exit(spa, FTAG);
3117 }
3118 
3119 /*
3120  * Update the stored path for this vdev.  Dirty the vdev configuration, relying
3121  * on spa_vdev_enter/exit() to synchronize the labels and cache.
3122  */
3123 int
3124 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
3125 {
3126 	vdev_t *rvd, *vd;
3127 	uint64_t txg;
3128 
3129 	rvd = spa->spa_root_vdev;
3130 
3131 	txg = spa_vdev_enter(spa);
3132 
3133 	if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
3134 		/*
3135 		 * Determine if this is a reference to a hot spare or l2cache
3136 		 * device.  If it is, update the path as stored in their
3137 		 * device list.
3138 		 */
3139 		nvlist_t **spares, **l2cache;
3140 		uint_t i, nspares, nl2cache;
3141 
3142 		if (spa->spa_spares.sav_config != NULL) {
3143 			VERIFY(nvlist_lookup_nvlist_array(
3144 			    spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
3145 			    &spares, &nspares) == 0);
3146 			for (i = 0; i < nspares; i++) {
3147 				uint64_t theguid;
3148 				VERIFY(nvlist_lookup_uint64(spares[i],
3149 				    ZPOOL_CONFIG_GUID, &theguid) == 0);
3150 				if (theguid == guid) {
3151 					VERIFY(nvlist_add_string(spares[i],
3152 					    ZPOOL_CONFIG_PATH, newpath) == 0);
3153 					spa_load_spares(spa);
3154 					spa->spa_spares.sav_sync = B_TRUE;
3155 					return (spa_vdev_exit(spa, NULL, txg,
3156 					    0));
3157 				}
3158 			}
3159 		}
3160 
3161 		if (spa->spa_l2cache.sav_config != NULL) {
3162 			VERIFY(nvlist_lookup_nvlist_array(
3163 			    spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE,
3164 			    &l2cache, &nl2cache) == 0);
3165 			for (i = 0; i < nl2cache; i++) {
3166 				uint64_t theguid;
3167 				VERIFY(nvlist_lookup_uint64(l2cache[i],
3168 				    ZPOOL_CONFIG_GUID, &theguid) == 0);
3169 				if (theguid == guid) {
3170 					VERIFY(nvlist_add_string(l2cache[i],
3171 					    ZPOOL_CONFIG_PATH, newpath) == 0);
3172 					spa_load_l2cache(spa);
3173 					spa->spa_l2cache.sav_sync = B_TRUE;
3174 					return (spa_vdev_exit(spa, NULL, txg,
3175 					    0));
3176 				}
3177 			}
3178 		}
3179 
3180 		return (spa_vdev_exit(spa, NULL, txg, ENOENT));
3181 	}
3182 
3183 	if (!vd->vdev_ops->vdev_op_leaf)
3184 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
3185 
3186 	spa_strfree(vd->vdev_path);
3187 	vd->vdev_path = spa_strdup(newpath);
3188 
3189 	vdev_config_dirty(vd->vdev_top);
3190 
3191 	return (spa_vdev_exit(spa, NULL, txg, 0));
3192 }
3193 
3194 /*
3195  * ==========================================================================
3196  * SPA Scrubbing
3197  * ==========================================================================
3198  */
3199 
3200 static void
3201 spa_scrub_io_done(zio_t *zio)
3202 {
3203 	spa_t *spa = zio->io_spa;
3204 
3205 	arc_data_buf_free(zio->io_data, zio->io_size);
3206 
3207 	mutex_enter(&spa->spa_scrub_lock);
3208 	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
3209 		vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev;
3210 		spa->spa_scrub_errors++;
3211 		mutex_enter(&vd->vdev_stat_lock);
3212 		vd->vdev_stat.vs_scrub_errors++;
3213 		mutex_exit(&vd->vdev_stat_lock);
3214 	}
3215 
3216 	if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight)
3217 		cv_broadcast(&spa->spa_scrub_io_cv);
3218 
3219 	ASSERT(spa->spa_scrub_inflight >= 0);
3220 
3221 	mutex_exit(&spa->spa_scrub_lock);
3222 }
3223 
3224 static void
3225 spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags,
3226     zbookmark_t *zb)
3227 {
3228 	size_t size = BP_GET_LSIZE(bp);
3229 	void *data;
3230 
3231 	mutex_enter(&spa->spa_scrub_lock);
3232 	/*
3233 	 * Do not give too much work to vdev(s).
3234 	 */
3235 	while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) {
3236 		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
3237 	}
3238 	spa->spa_scrub_inflight++;
3239 	mutex_exit(&spa->spa_scrub_lock);
3240 
3241 	data = arc_data_buf_alloc(size);
3242 
3243 	if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
3244 		flags |= ZIO_FLAG_SPECULATIVE;	/* intent log block */
3245 
3246 	flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL;
3247 
3248 	zio_nowait(zio_read(NULL, spa, bp, data, size,
3249 	    spa_scrub_io_done, NULL, priority, flags, zb));
3250 }
3251 
3252 /* ARGSUSED */
3253 static int
3254 spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
3255 {
3256 	blkptr_t *bp = &bc->bc_blkptr;
3257 	vdev_t *vd = spa->spa_root_vdev;
3258 	dva_t *dva = bp->blk_dva;
3259 	int needs_resilver = B_FALSE;
3260 	int d;
3261 
3262 	if (bc->bc_errno) {
3263 		/*
3264 		 * We can't scrub this block, but we can continue to scrub
3265 		 * the rest of the pool.  Note the error and move along.
3266 		 */
3267 		mutex_enter(&spa->spa_scrub_lock);
3268 		spa->spa_scrub_errors++;
3269 		mutex_exit(&spa->spa_scrub_lock);
3270 
3271 		mutex_enter(&vd->vdev_stat_lock);
3272 		vd->vdev_stat.vs_scrub_errors++;
3273 		mutex_exit(&vd->vdev_stat_lock);
3274 
3275 		return (ERESTART);
3276 	}
3277 
3278 	ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);
3279 
3280 	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
3281 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]));
3282 
3283 		ASSERT(vd != NULL);
3284 
3285 		/*
3286 		 * Keep track of how much data we've examined so that
3287 		 * zpool(1M) status can make useful progress reports.
3288 		 */
3289 		mutex_enter(&vd->vdev_stat_lock);
3290 		vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]);
3291 		mutex_exit(&vd->vdev_stat_lock);
3292 
3293 		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
3294 			if (DVA_GET_GANG(&dva[d])) {
3295 				/*
3296 				 * Gang members may be spread across multiple
3297 				 * vdevs, so the best we can do is look at the
3298 				 * pool-wide DTL.
3299 				 * XXX -- it would be better to change our
3300 				 * allocation policy to ensure that this can't
3301 				 * happen.
3302 				 */
3303 				vd = spa->spa_root_vdev;
3304 			}
3305 			if (vdev_dtl_contains(&vd->vdev_dtl_map,
3306 			    bp->blk_birth, 1))
3307 				needs_resilver = B_TRUE;
3308 		}
3309 	}
3310 
3311 	if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING)
3312 		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
3313 		    ZIO_FLAG_SCRUB, &bc->bc_bookmark);
3314 	else if (needs_resilver)
3315 		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
3316 		    ZIO_FLAG_RESILVER, &bc->bc_bookmark);
3317 
3318 	return (0);
3319 }
3320 
/*
 * Body of the scrub/resilver thread created by spa_scrub().
 *
 * Drives the traversal in spa_scrub_th via traverse_more() until it returns
 * something other than EAGAIN, a stop is requested (spa_scrub_stop), or a
 * restart is requested (spa_scrub_restart_txg != 0).  It then waits for all
 * in-flight scrub I/O to drain, decides whether the pass completed, updates
 * DTLs and scrub statistics accordingly, posts any follow-up async requests,
 * clears the scrub state and exits.
 */
static void
spa_scrub_thread(spa_t *spa)
{
	callb_cpr_t cprinfo;
	traverse_handle_t *th = spa->spa_scrub_th;
	vdev_t *rvd = spa->spa_root_vdev;
	pool_scrub_type_t scrub_type = spa->spa_scrub_type;
	int error = 0;
	boolean_t complete;

	CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);

	/*
	 * If we're restarting due to a snapshot create/delete,
	 * wait for that to complete.
	 */
	txg_wait_synced(spa_get_dsl(spa), 0);

	dprintf("start %s mintxg=%llu maxtxg=%llu\n",
	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
	    spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg);

	spa_config_enter(spa, RW_WRITER, FTAG);
	vdev_reopen(rvd);		/* purge all vdev caches */
	vdev_config_dirty(rvd);		/* rewrite all disk labels */
	vdev_scrub_stat_update(rvd, scrub_type, B_FALSE);
	spa_config_exit(spa, FTAG);

	mutex_enter(&spa->spa_scrub_lock);
	spa->spa_scrub_errors = 0;
	spa->spa_scrub_active = 1;
	ASSERT(spa->spa_scrub_inflight == 0);

	/*
	 * Main loop.  spa_scrub_lock is held except across traverse_more().
	 */
	while (!spa->spa_scrub_stop) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		/*
		 * While suspended, advertise that we are inactive (waking
		 * anyone in spa_scrub_suspend()) and sleep until resumed.
		 */
		while (spa->spa_scrub_suspended) {
			spa->spa_scrub_active = 0;
			cv_broadcast(&spa->spa_scrub_cv);
			cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
			spa->spa_scrub_active = 1;
		}
		CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock);

		if (spa->spa_scrub_restart_txg != 0)
			break;

		mutex_exit(&spa->spa_scrub_lock);
		error = traverse_more(th);
		mutex_enter(&spa->spa_scrub_lock);
		/* EAGAIN means keep going; anything else ends the loop. */
		if (error != EAGAIN)
			break;
	}

	/* Let every I/O issued by spa_scrub_io_start() finish. */
	while (spa->spa_scrub_inflight)
		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);

	spa->spa_scrub_active = 0;
	cv_broadcast(&spa->spa_scrub_cv);

	mutex_exit(&spa->spa_scrub_lock);

	spa_config_enter(spa, RW_WRITER, FTAG);

	mutex_enter(&spa->spa_scrub_lock);

	/*
	 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock
	 * AND the spa config lock to synchronize with any config changes
	 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit().
	 */
	if (spa->spa_scrub_restart_txg != 0)
		error = ERESTART;

	/* A stop request takes precedence over a restart request. */
	if (spa->spa_scrub_stop)
		error = EINTR;

	/*
	 * Even if there were uncorrectable errors, we consider the scrub
	 * completed.  The downside is that if there is a transient error during
	 * a resilver, we won't resilver the data properly to the target.  But
	 * if the damage is permanent (more likely) we will resilver forever,
	 * which isn't really acceptable.  Since there is enough information for
	 * the user to know what has failed and why, this seems like a more
	 * tractable approach.
	 */
	complete = (error == 0);

	dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
	    spa->spa_scrub_maxtxg, complete ? "done" : "FAILED",
	    error, spa->spa_scrub_errors, spa->spa_scrub_stop);

	mutex_exit(&spa->spa_scrub_lock);

	/*
	 * If the scrub/resilver completed, update all DTLs to reflect this.
	 * Whether it succeeded or not, vacate all temporary scrub DTLs.
	 */
	vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1,
	    complete ? spa->spa_scrub_maxtxg : 0, B_TRUE);
	vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete);
	spa_errlog_rotate(spa);

	if (scrub_type == POOL_SCRUB_RESILVER && complete)
		spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_FINISH);

	spa_config_exit(spa, FTAG);

	mutex_enter(&spa->spa_scrub_lock);

	/*
	 * We may have finished replacing a device.
	 * Let the async thread assess this and handle the detach.
	 */
	spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);

	/*
	 * If we were told to restart, our final act is to start a new scrub.
	 */
	if (error == ERESTART)
		spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ?
		    SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB);

	/*
	 * Clear the scrub state and wake anyone waiting on spa_scrub_cv
	 * (e.g. spa_scrub() waiting for spa_scrub_thread to become NULL).
	 */
	spa->spa_scrub_type = POOL_SCRUB_NONE;
	spa->spa_scrub_active = 0;
	spa->spa_scrub_thread = NULL;
	cv_broadcast(&spa->spa_scrub_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops &spa->spa_scrub_lock */
	thread_exit();
}
3451 
3452 void
3453 spa_scrub_suspend(spa_t *spa)
3454 {
3455 	mutex_enter(&spa->spa_scrub_lock);
3456 	spa->spa_scrub_suspended++;
3457 	while (spa->spa_scrub_active) {
3458 		cv_broadcast(&spa->spa_scrub_cv);
3459 		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
3460 	}
3461 	while (spa->spa_scrub_inflight)
3462 		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
3463 	mutex_exit(&spa->spa_scrub_lock);
3464 }
3465 
3466 void
3467 spa_scrub_resume(spa_t *spa)
3468 {
3469 	mutex_enter(&spa->spa_scrub_lock);
3470 	ASSERT(spa->spa_scrub_suspended != 0);
3471 	if (--spa->spa_scrub_suspended == 0)
3472 		cv_broadcast(&spa->spa_scrub_cv);
3473 	mutex_exit(&spa->spa_scrub_lock);
3474 }
3475 
3476 void
3477 spa_scrub_restart(spa_t *spa, uint64_t txg)
3478 {
3479 	/*
3480 	 * Something happened (e.g. snapshot create/delete) that means
3481 	 * we must restart any in-progress scrubs.  The itinerary will
3482 	 * fix this properly.
3483 	 */
3484 	mutex_enter(&spa->spa_scrub_lock);
3485 	spa->spa_scrub_restart_txg = txg;
3486 	mutex_exit(&spa->spa_scrub_lock);
3487 }
3488 
/*
 * Stop any scrub/resilver in progress and, unless 'type' ends up as
 * POOL_SCRUB_NONE, start a new one of the requested type.
 *
 *	type	requested scrub type; an EVERYTHING scrub is upgraded to a
 *		resilver when the pool-wide DTL is non-empty, and a RESILVER
 *		is reduced to NONE when that DTL is empty
 *	force	if B_FALSE, a resilver already in progress is not interrupted
 *
 * The caller must hold spa_namespace_lock and must not hold the config lock
 * as writer (both are asserted).
 *
 * Returns ENOTSUP for an out-of-range type, EBUSY if a resilver is running
 * and 'force' is not set, and 0 otherwise.
 */
int
spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
{
	space_seg_t *ss;
	uint64_t mintxg, maxtxg;
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	ASSERT(!spa_config_held(spa, RW_WRITER));

	if ((uint_t)type >= POOL_SCRUB_TYPES)
		return (ENOTSUP);

	mutex_enter(&spa->spa_scrub_lock);

	/*
	 * If there's a scrub or resilver already in progress, stop it.
	 */
	while (spa->spa_scrub_thread != NULL) {
		/*
		 * Don't stop a resilver unless forced.
		 */
		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) {
			mutex_exit(&spa->spa_scrub_lock);
			return (EBUSY);
		}
		/*
		 * Ask the thread to stop, wake it in case it is sleeping on
		 * spa_scrub_cv, and wait for it to signal back.
		 */
		spa->spa_scrub_stop = 1;
		cv_broadcast(&spa->spa_scrub_cv);
		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
	}

	/*
	 * Terminate the previous traverse.
	 */
	if (spa->spa_scrub_th != NULL) {
		traverse_fini(spa->spa_scrub_th);
		spa->spa_scrub_th = NULL;
	}

	/*
	 * With no root vdev there is nothing to scrub; the asserts check
	 * that the scrub state is already consistent with the request.
	 */
	if (rvd == NULL) {
		ASSERT(spa->spa_scrub_stop == 0);
		ASSERT(spa->spa_scrub_type == type);
		ASSERT(spa->spa_scrub_restart_txg == 0);
		mutex_exit(&spa->spa_scrub_lock);
		return (0);
	}

	/* Default range; narrowed below for a resilver. */
	mintxg = TXG_INITIAL - 1;
	maxtxg = spa_last_synced_txg(spa) + 1;

	mutex_enter(&rvd->vdev_dtl_lock);

	if (rvd->vdev_dtl_map.sm_space == 0) {
		/*
		 * The pool-wide DTL is empty.
		 * If this is a resilver, there's nothing to do except
		 * check whether any in-progress replacements have completed.
		 */
		if (type == POOL_SCRUB_RESILVER) {
			type = POOL_SCRUB_NONE;
			spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
		}
	} else {
		/*
		 * The pool-wide DTL is non-empty.
		 * If this is a normal scrub, upgrade to a resilver instead.
		 */
		if (type == POOL_SCRUB_EVERYTHING)
			type = POOL_SCRUB_RESILVER;
	}

	if (type == POOL_SCRUB_RESILVER) {
		/*
		 * Determine the resilvering boundaries.
		 *
		 * Note: (mintxg, maxtxg) is an open interval,
		 * i.e. mintxg and maxtxg themselves are not included.
		 *
		 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
		 * so we don't claim to resilver a txg that's still changing.
		 */
		ss = avl_first(&rvd->vdev_dtl_map.sm_root);
		mintxg = ss->ss_start - 1;
		ss = avl_last(&rvd->vdev_dtl_map.sm_root);
		maxtxg = MIN(ss->ss_end, maxtxg);

		spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START);
	}

	mutex_exit(&rvd->vdev_dtl_lock);

	/* Reset the control state for the new (or absent) scrub. */
	spa->spa_scrub_stop = 0;
	spa->spa_scrub_type = type;
	spa->spa_scrub_restart_txg = 0;

	if (type != POOL_SCRUB_NONE) {
		spa->spa_scrub_mintxg = mintxg;
		spa->spa_scrub_maxtxg = maxtxg;
		spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
		    ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL,
		    ZIO_FLAG_CANFAIL);
		traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg);
		spa->spa_scrub_thread = thread_create(NULL, 0,
		    spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
	}

	mutex_exit(&spa->spa_scrub_lock);

	return (0);
}
3599 
3600 /*
3601  * ==========================================================================
3602  * SPA async task processing
3603  * ==========================================================================
3604  */
3605 
3606 static void
3607 spa_async_remove(spa_t *spa, vdev_t *vd)
3608 {
3609 	vdev_t *tvd;
3610 	int c;
3611 
3612 	for (c = 0; c < vd->vdev_children; c++) {
3613 		tvd = vd->vdev_child[c];
3614 		if (tvd->vdev_remove_wanted) {
3615 			tvd->vdev_remove_wanted = 0;
3616 			vdev_set_state(tvd, B_FALSE, VDEV_STATE_REMOVED,
3617 			    VDEV_AUX_NONE);
3618 			vdev_clear(spa, tvd, B_TRUE);
3619 			vdev_config_dirty(tvd->vdev_top);
3620 		}
3621 		spa_async_remove(spa, tvd);
3622 	}
3623 }
3624 
3625 static void
3626 spa_async_thread(spa_t *spa)
3627 {
3628 	int tasks;
3629 	uint64_t txg;
3630 
3631 	ASSERT(spa->spa_sync_on);
3632 
3633 	mutex_enter(&spa->spa_async_lock);
3634 	tasks = spa->spa_async_tasks;
3635 	spa->spa_async_tasks = 0;
3636 	mutex_exit(&spa->spa_async_lock);
3637 
3638 	/*
3639 	 * See if the config needs to be updated.
3640 	 */
3641 	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
3642 		mutex_enter(&spa_namespace_lock);
3643 		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
3644 		mutex_exit(&spa_namespace_lock);
3645 	}
3646 
3647 	/*
3648 	 * See if any devices need to be marked REMOVED.
3649 	 *
3650 	 * XXX - We avoid doing this when we are in
3651 	 * I/O failure state since spa_vdev_enter() grabs
3652 	 * the namespace lock and would not be able to obtain
3653 	 * the writer config lock.
3654 	 */
3655 	if (tasks & SPA_ASYNC_REMOVE &&
3656 	    spa_state(spa) != POOL_STATE_IO_FAILURE) {
3657 		txg = spa_vdev_enter(spa);
3658 		spa_async_remove(spa, spa->spa_root_vdev);
3659 		(void) spa_vdev_exit(spa, NULL, txg, 0);
3660 	}
3661 
3662 	/*
3663 	 * If any devices are done replacing, detach them.
3664 	 */
3665 	if (tasks & SPA_ASYNC_RESILVER_DONE)
3666 		spa_vdev_resilver_done(spa);
3667 
3668 	/*
3669 	 * Kick off a scrub.  When starting a RESILVER scrub (or an EVERYTHING
3670 	 * scrub which can become a resilver), we need to hold
3671 	 * spa_namespace_lock() because the sysevent we post via
3672 	 * spa_event_notify() needs to get the name of the pool.
3673 	 */
3674 	if (tasks & SPA_ASYNC_SCRUB) {
3675 		mutex_enter(&spa_namespace_lock);
3676 		VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0);
3677 		mutex_exit(&spa_namespace_lock);
3678 	}
3679 
3680 	/*
3681 	 * Kick off a resilver.
3682 	 */
3683 	if (tasks & SPA_ASYNC_RESILVER) {
3684 		mutex_enter(&spa_namespace_lock);
3685 		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
3686 		mutex_exit(&spa_namespace_lock);
3687 	}
3688 
3689 	/*
3690 	 * Let the world know that we're done.
3691 	 */
3692 	mutex_enter(&spa->spa_async_lock);
3693 	spa->spa_async_thread = NULL;
3694 	cv_broadcast(&spa->spa_async_cv);
3695 	mutex_exit(&spa->spa_async_lock);
3696 	thread_exit();
3697 }
3698 
3699 void
3700 spa_async_suspend(spa_t *spa)
3701 {
3702 	mutex_enter(&spa->spa_async_lock);
3703 	spa->spa_async_suspended++;
3704 	while (spa->spa_async_thread != NULL)
3705 		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
3706 	mutex_exit(&spa->spa_async_lock);
3707 }
3708 
3709 void
3710 spa_async_resume(spa_t *spa)
3711 {
3712 	mutex_enter(&spa->spa_async_lock);
3713 	ASSERT(spa->spa_async_suspended != 0);
3714 	spa->spa_async_suspended--;
3715 	mutex_exit(&spa->spa_async_lock);
3716 }
3717 
3718 static void
3719 spa_async_dispatch(spa_t *spa)
3720 {
3721 	mutex_enter(&spa->spa_async_lock);
3722 	if (spa->spa_async_tasks && !spa->spa_async_suspended &&
3723 	    spa->spa_async_thread == NULL &&
3724 	    rootdir != NULL && !vn_is_readonly(rootdir))
3725 		spa->spa_async_thread = thread_create(NULL, 0,
3726 		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
3727 	mutex_exit(&spa->spa_async_lock);
3728 }
3729 
3730 void
3731 spa_async_request(spa_t *spa, int task)
3732 {
3733 	mutex_enter(&spa->spa_async_lock);
3734 	spa->spa_async_tasks |= task;
3735 	mutex_exit(&spa->spa_async_lock);
3736 }
3737 
3738 /*
3739  * ==========================================================================
3740  * SPA syncing routines
3741  * ==========================================================================
3742  */
3743 
3744 static void
3745 spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
3746 {
3747 	bplist_t *bpl = &spa->spa_sync_bplist;
3748 	dmu_tx_t *tx;
3749 	blkptr_t blk;
3750 	uint64_t itor = 0;
3751 	zio_t *zio;
3752 	int error;
3753 	uint8_t c = 1;
3754 
3755 	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD);
3756 
3757 	while (bplist_iterate(bpl, &itor, &blk) == 0)
3758 		zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL));
3759 
3760 	error = zio_wait(zio);
3761 	ASSERT3U(error, ==, 0);
3762 
3763 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
3764 	bplist_vacate(bpl, tx);
3765 
3766 	/*
3767 	 * Pre-dirty the first block so we sync to convergence faster.
3768 	 * (Usually only the first block is needed.)
3769 	 */
3770 	dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
3771 	dmu_tx_commit(tx);
3772 }
3773 
3774 static void
3775 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
3776 {
3777 	char *packed = NULL;
3778 	size_t nvsize = 0;
3779 	dmu_buf_t *db;
3780 
3781 	VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
3782 
3783 	packed = kmem_alloc(nvsize, KM_SLEEP);
3784 
3785 	VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
3786 	    KM_SLEEP) == 0);
3787 
3788 	dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx);
3789 
3790 	kmem_free(packed, nvsize);
3791 
3792 	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
3793 	dmu_buf_will_dirty(db, tx);
3794 	*(uint64_t *)db->db_data = nvsize;
3795 	dmu_buf_rele(db, FTAG);
3796 }
3797 
3798 static void
3799 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
3800     const char *config, const char *entry)
3801 {
3802 	nvlist_t *nvroot;
3803 	nvlist_t **list;
3804 	int i;
3805 
3806 	if (!sav->sav_sync)
3807 		return;
3808 
3809 	/*
3810 	 * Update the MOS nvlist describing the list of available devices.
3811 	 * spa_validate_aux() will have already made sure this nvlist is
3812 	 * valid and the vdevs are labeled appropriately.
3813 	 */
3814 	if (sav->sav_object == 0) {
3815 		sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
3816 		    DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
3817 		    sizeof (uint64_t), tx);
3818 		VERIFY(zap_update(spa->spa_meta_objset,
3819 		    DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
3820 		    &sav->sav_object, tx) == 0);
3821 	}
3822 
3823 	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3824 	if (sav->sav_count == 0) {
3825 		VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
3826 	} else {
3827 		list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
3828 		for (i = 0; i < sav->sav_count; i++)
3829 			list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
3830 			    B_FALSE, B_FALSE, B_TRUE);
3831 		VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
3832 		    sav->sav_count) == 0);
3833 		for (i = 0; i < sav->sav_count; i++)
3834 			nvlist_free(list[i]);
3835 		kmem_free(list, sav->sav_count * sizeof (void *));
3836 	}
3837 
3838 	spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
3839 	nvlist_free(nvroot);
3840 
3841 	sav->sav_sync = B_FALSE;
3842 }
3843 
3844 static void
3845 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
3846 {
3847 	nvlist_t *config;
3848 
3849 	if (list_is_empty(&spa->spa_dirty_list))
3850 		return;
3851 
3852 	config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);
3853 
3854 	if (spa->spa_config_syncing)
3855 		nvlist_free(spa->spa_config_syncing);
3856 	spa->spa_config_syncing = config;
3857 
3858 	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
3859 }
3860 
3861 /*
3862  * Set zpool properties.
3863  */
3864 static void
3865 spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
3866 {
3867 	spa_t *spa = arg1;
3868 	objset_t *mos = spa->spa_meta_objset;
3869 	nvlist_t *nvp = arg2;
3870 	nvpair_t *elem;
3871 	uint64_t intval;
3872 	char *strval, *slash;
3873 	zpool_prop_t prop;
3874 	const char *propname;
3875 	zprop_type_t proptype;
3876 
3877 	elem = NULL;
3878 	while ((elem = nvlist_next_nvpair(nvp, elem))) {
3879 		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
3880 		case ZPOOL_PROP_VERSION:
3881 			/*
3882 			 * Only set version for non-zpool-creation cases
3883 			 * (set/import). spa_create() needs special care
3884 			 * for version setting.
3885 			 */
3886 			if (tx->tx_txg != TXG_INITIAL) {
3887 				VERIFY(nvpair_value_uint64(elem,
3888 				    &intval) == 0);
3889 				ASSERT(intval <= SPA_VERSION);
3890 				ASSERT(intval >= spa_version(spa));
3891 				spa->spa_uberblock.ub_version = intval;
3892 				vdev_config_dirty(spa->spa_root_vdev);
3893 			}
3894 			break;
3895 
3896 		case ZPOOL_PROP_ALTROOT:
3897 			/*
3898 			 * 'altroot' is a non-persistent property. It should
3899 			 * have been set temporarily at creation or import time.
3900 			 */
3901 			ASSERT(spa->spa_root != NULL);
3902 			break;
3903 
3904 		case ZPOOL_PROP_CACHEFILE:
3905 			/*
3906 			 * 'cachefile' is a non-persistent property, but note
3907 			 * an async request that the config cache needs to be
3908 			 * udpated.
3909 			 */
3910 			VERIFY(nvpair_value_string(elem, &strval) == 0);
3911 			if (spa->spa_config_dir)
3912 				spa_strfree(spa->spa_config_dir);
3913 			if (spa->spa_config_file)
3914 				spa_strfree(spa->spa_config_file);
3915 
3916 			if (strval[0] == '\0') {
3917 				spa->spa_config_dir = NULL;
3918 				spa->spa_config_file = NULL;
3919 			} else if (strcmp(strval, "none") == 0) {
3920 				spa->spa_config_dir = spa_strdup(strval);
3921 				spa->spa_config_file = NULL;
3922 			} else {
3923 				slash = strrchr(strval, '/');
3924 				ASSERT(slash != NULL);
3925 				*slash = '\0';
3926 				spa->spa_config_dir = spa_strdup(strval);
3927 				spa->spa_config_file = spa_strdup(slash + 1);
3928 			}
3929 			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
3930 			break;
3931 		default:
3932 			/*
3933 			 * Set pool property values in the poolprops mos object.
3934 			 */
3935 			mutex_enter(&spa->spa_props_lock);
3936 			if (spa->spa_pool_props_object == 0) {
3937 				objset_t *mos = spa->spa_meta_objset;
3938 
3939 				VERIFY((spa->spa_pool_props_object =
3940 				    zap_create(mos, DMU_OT_POOL_PROPS,
3941 				    DMU_OT_NONE, 0, tx)) > 0);
3942 
3943 				VERIFY(zap_update(mos,
3944 				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
3945 				    8, 1, &spa->spa_pool_props_object, tx)
3946 				    == 0);
3947 			}
3948 			mutex_exit(&spa->spa_props_lock);
3949 
3950 			/* normalize the property name */
3951 			propname = zpool_prop_to_name(prop);
3952 			proptype = zpool_prop_get_type(prop);
3953 
3954 			if (nvpair_type(elem) == DATA_TYPE_STRING) {
3955 				ASSERT(proptype == PROP_TYPE_STRING);
3956 				VERIFY(nvpair_value_string(elem, &strval) == 0);
3957 				VERIFY(zap_update(mos,
3958 				    spa->spa_pool_props_object, propname,
3959 				    1, strlen(strval) + 1, strval, tx) == 0);
3960 
3961 			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
3962 				VERIFY(nvpair_value_uint64(elem, &intval) == 0);
3963 
3964 				if (proptype == PROP_TYPE_INDEX) {
3965 					const char *unused;
3966 					VERIFY(zpool_prop_index_to_string(
3967 					    prop, intval, &unused) == 0);
3968 				}
3969 				VERIFY(zap_update(mos,
3970 				    spa->spa_pool_props_object, propname,
3971 				    8, 1, &intval, tx) == 0);
3972 			} else {
3973 				ASSERT(0); /* not allowed */
3974 			}
3975 
3976 			switch (prop) {
3977 			case ZPOOL_PROP_DELEGATION:
3978 				spa->spa_delegation = intval;
3979 				break;
3980 			case ZPOOL_PROP_BOOTFS:
3981 				spa->spa_bootfs = intval;
3982 				break;
3983 			case ZPOOL_PROP_FAILUREMODE:
3984 				spa->spa_failmode = intval;
3985 				break;
3986 			default:
3987 				break;
3988 			}
3989 		}
3990 
3991 		/* log internal history if this is not a zpool create */
3992 		if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY &&
3993 		    tx->tx_txg != TXG_INITIAL) {
3994 			spa_history_internal_log(LOG_POOL_PROPSET,
3995 			    spa, tx, cr, "%s %lld %s",
3996 			    nvpair_name(elem), intval, spa->spa_name);
3997 		}
3998 	}
3999 }
4000 
/*
 * Sync the specified transaction group.  New blocks may be dirtied as
 * part of the process, so we iterate until it converges.
 *
 * The config lock is held as reader for the whole sync.  Once the txg has
 * been committed, the synced uberblock is copied to spa_ubsync (with any
 * scrub suspended around the copy), and finally any requested async tasks
 * are dispatched.
 */
void
spa_sync(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp = spa->spa_dsl_pool;
	objset_t *mos = spa->spa_meta_objset;
	bplist_t *bpl = &spa->spa_sync_bplist;
	vdev_t *rvd = spa->spa_root_vdev;
	vdev_t *vd;
	dmu_tx_t *tx;
	int dirty_vdevs;

	/*
	 * Lock out configuration changes.
	 */
	spa_config_enter(spa, RW_READER, FTAG);

	spa->spa_syncing_txg = txg;
	spa->spa_sync_pass = 0;

	VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));

	tx = dmu_tx_create_assigned(dp, txg);

	/*
	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
	 * set spa_deflate if we have no raid-z vdevs.
	 */
	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
		int i;

		/* Stop at the first top-level vdev with a different ratio. */
		for (i = 0; i < rvd->vdev_children; i++) {
			vd = rvd->vdev_child[i];
			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
				break;
		}
		if (i == rvd->vdev_children) {
			spa->spa_deflate = TRUE;
			VERIFY(0 == zap_add(spa->spa_meta_objset,
			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
		}
	}

	/*
	 * If anything has changed in this txg, push the deferred frees
	 * from the previous txg.  If not, leave them alone so that we
	 * don't generate work on an otherwise idle system.
	 */
	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
	    !txg_list_empty(&dp->dp_sync_tasks, txg))
		spa_sync_deferred_frees(spa, txg);

	/*
	 * Iterate to convergence.
	 */
	do {
		spa->spa_sync_pass++;

		spa_sync_config_object(spa, tx);
		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
		spa_errlog_sync(spa, txg);
		dsl_pool_sync(dp, txg);

		/* Another pass is needed if any vdev was dirty in this one. */
		dirty_vdevs = 0;
		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
			vdev_sync(vd, txg);
			dirty_vdevs++;
		}

		bplist_sync(bpl, tx);
	} while (dirty_vdevs);

	bplist_close(bpl);

	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);

	/*
	 * Rewrite the vdev configuration (which includes the uberblock)
	 * to commit the transaction group.
	 *
	 * If there are any dirty vdevs, sync the uberblock to all vdevs.
	 * Otherwise, pick a random top-level vdev that's known to be
	 * visible in the config cache (see spa_vdev_add() for details).
	 * If the write fails, try the next vdev until we've tried them all.
	 */
	if (!list_is_empty(&spa->spa_dirty_list)) {
		VERIFY(vdev_config_sync(rvd, txg) == 0);
	} else {
		int children = rvd->vdev_children;
		int c0 = spa_get_random(children);
		int c;

		for (c = 0; c < children; c++) {
			vd = rvd->vdev_child[(c0 + c) % children];
			if (vd->vdev_ms_array == 0)
				continue;
			if (vdev_config_sync(vd, txg) == 0)
				break;
		}
		/* No single vdev worked; fall back to syncing the root. */
		if (c == children)
			VERIFY(vdev_config_sync(rvd, txg) == 0);
	}

	dmu_tx_commit(tx);

	/*
	 * Clear the dirty config list.
	 */
	while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
		vdev_config_clean(vd);

	/*
	 * Now that the new config has synced transactionally,
	 * let it become visible to the config cache.
	 */
	if (spa->spa_config_syncing != NULL) {
		spa_config_set(spa, spa->spa_config_syncing);
		spa->spa_config_txg = txg;
		spa->spa_config_syncing = NULL;
	}

	/*
	 * Make a stable copy of the fully synced uberblock.
	 * We use this as the root for pool traversals.
	 */
	spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */

	spa_scrub_suspend(spa);		/* stop scrubbing and finish I/Os */

	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
	spa->spa_traverse_wanted = 0;
	spa->spa_ubsync = spa->spa_uberblock;
	rw_exit(&spa->spa_traverse_lock);

	spa_scrub_resume(spa);		/* resume scrub with new ubsync */

	/*
	 * Clean up the ZIL records for the synced txg.
	 */
	dsl_pool_zil_clean(dp);

	/*
	 * Update usable space statistics.
	 */
	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
		vdev_sync_done(vd, txg);

	/*
	 * It had better be the case that we didn't dirty anything
	 * since vdev_config_sync().
	 */
	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
	ASSERT(bpl->bpl_queue == NULL);

	spa_config_exit(spa, FTAG);

	/*
	 * If any async tasks have been requested, kick them off.
	 */
	spa_async_dispatch(spa);
}
4173 
4174 /*
4175  * Sync all pools.  We don't want to hold the namespace lock across these
4176  * operations, so we take a reference on the spa_t and drop the lock during the
4177  * sync.
4178  */
4179 void
4180 spa_sync_allpools(void)
4181 {
4182 	spa_t *spa = NULL;
4183 	mutex_enter(&spa_namespace_lock);
4184 	while ((spa = spa_next(spa)) != NULL) {
4185 		if (spa_state(spa) != POOL_STATE_ACTIVE)
4186 			continue;
4187 		spa_open_ref(spa, FTAG);
4188 		mutex_exit(&spa_namespace_lock);
4189 		txg_wait_synced(spa_get_dsl(spa), 0);
4190 		mutex_enter(&spa_namespace_lock);
4191 		spa_close(spa, FTAG);
4192 	}
4193 	mutex_exit(&spa_namespace_lock);
4194 }
4195 
4196 /*
4197  * ==========================================================================
4198  * Miscellaneous routines
4199  * ==========================================================================
4200  */
4201 
4202 /*
4203  * Remove all pools in the system.
4204  */
4205 void
4206 spa_evict_all(void)
4207 {
4208 	spa_t *spa;
4209 
4210 	/*
4211 	 * Remove all cached state.  All pools should be closed now,
4212 	 * so every spa in the AVL tree should be unreferenced.
4213 	 */
4214 	mutex_enter(&spa_namespace_lock);
4215 	while ((spa = spa_next(NULL)) != NULL) {
4216 		/*
4217 		 * Stop async tasks.  The async thread may need to detach
4218 		 * a device that's been replaced, which requires grabbing
4219 		 * spa_namespace_lock, so we must drop it here.
4220 		 */
4221 		spa_open_ref(spa, FTAG);
4222 		mutex_exit(&spa_namespace_lock);
4223 		spa_async_suspend(spa);
4224 		mutex_enter(&spa_namespace_lock);
4225 		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
4226 		spa_close(spa, FTAG);
4227 
4228 		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
4229 			spa_unload(spa);
4230 			spa_deactivate(spa);
4231 		}
4232 		spa_remove(spa);
4233 	}
4234 	mutex_exit(&spa_namespace_lock);
4235 }
4236 
4237 vdev_t *
4238 spa_lookup_by_guid(spa_t *spa, uint64_t guid)
4239 {
4240 	return (vdev_lookup_by_guid(spa->spa_root_vdev, guid));
4241 }
4242 
4243 void
4244 spa_upgrade(spa_t *spa, uint64_t version)
4245 {
4246 	spa_config_enter(spa, RW_WRITER, FTAG);
4247 
4248 	/*
4249 	 * This should only be called for a non-faulted pool, and since a
4250 	 * future version would result in an unopenable pool, this shouldn't be
4251 	 * possible.
4252 	 */
4253 	ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
4254 	ASSERT(version >= spa->spa_uberblock.ub_version);
4255 
4256 	spa->spa_uberblock.ub_version = version;
4257 	vdev_config_dirty(spa->spa_root_vdev);
4258 
4259 	spa_config_exit(spa, FTAG);
4260 
4261 	txg_wait_synced(spa_get_dsl(spa), 0);
4262 }
4263 
4264 boolean_t
4265 spa_has_spare(spa_t *spa, uint64_t guid)
4266 {
4267 	int i;
4268 	uint64_t spareguid;
4269 	spa_aux_vdev_t *sav = &spa->spa_spares;
4270 
4271 	for (i = 0; i < sav->sav_count; i++)
4272 		if (sav->sav_vdevs[i]->vdev_guid == guid)
4273 			return (B_TRUE);
4274 
4275 	for (i = 0; i < sav->sav_npending; i++) {
4276 		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
4277 		    &spareguid) == 0 && spareguid == guid)
4278 			return (B_TRUE);
4279 	}
4280 
4281 	return (B_FALSE);
4282 }
4283 
4284 /*
4285  * Post a sysevent corresponding to the given event.  The 'name' must be one of
4286  * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
4287  * filled in from the spa and (optionally) the vdev.  This doesn't do anything
4288  * in the userland libzpool, as we don't want consumers to misinterpret ztest
4289  * or zdb as real changes.
4290  */
4291 void
4292 spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
4293 {
4294 #ifdef _KERNEL
4295 	sysevent_t		*ev;
4296 	sysevent_attr_list_t	*attr = NULL;
4297 	sysevent_value_t	value;
4298 	sysevent_id_t		eid;
4299 
4300 	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
4301 	    SE_SLEEP);
4302 
4303 	value.value_type = SE_DATA_TYPE_STRING;
4304 	value.value.sv_string = spa_name(spa);
4305 	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
4306 		goto done;
4307 
4308 	value.value_type = SE_DATA_TYPE_UINT64;
4309 	value.value.sv_uint64 = spa_guid(spa);
4310 	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
4311 		goto done;
4312 
4313 	if (vd) {
4314 		value.value_type = SE_DATA_TYPE_UINT64;
4315 		value.value.sv_uint64 = vd->vdev_guid;
4316 		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
4317 		    SE_SLEEP) != 0)
4318 			goto done;
4319 
4320 		if (vd->vdev_path) {
4321 			value.value_type = SE_DATA_TYPE_STRING;
4322 			value.value.sv_string = vd->vdev_path;
4323 			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
4324 			    &value, SE_SLEEP) != 0)
4325 				goto done;
4326 		}
4327 	}
4328 
4329 	(void) log_sysevent(ev, SE_SLEEP, &eid);
4330 
4331 done:
4332 	if (attr)
4333 		sysevent_free_attr(attr);
4334 	sysevent_free(ev);
4335 #endif
4336 }
4337