xref: /illumos-gate/usr/src/uts/common/fs/zfs/spa.c (revision 6ede7bac3e9a6dfa53f3115dca1cf401bfa21a36)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * This file contains all the routines used when modifying on-disk SPA state.
31  * This includes opening, importing, destroying, exporting a pool, and syncing a
32  * pool.
33  */
34 
35 #include <sys/zfs_context.h>
36 #include <sys/fm/fs/zfs.h>
37 #include <sys/spa_impl.h>
38 #include <sys/zio.h>
39 #include <sys/zio_checksum.h>
40 #include <sys/zio_compress.h>
41 #include <sys/dmu.h>
42 #include <sys/dmu_tx.h>
43 #include <sys/zap.h>
44 #include <sys/zil.h>
45 #include <sys/vdev_impl.h>
46 #include <sys/metaslab.h>
47 #include <sys/uberblock_impl.h>
48 #include <sys/txg.h>
49 #include <sys/avl.h>
50 #include <sys/dmu_traverse.h>
51 #include <sys/dmu_objset.h>
52 #include <sys/unique.h>
53 #include <sys/dsl_pool.h>
54 #include <sys/dsl_dataset.h>
55 #include <sys/dsl_dir.h>
56 #include <sys/dsl_prop.h>
57 #include <sys/dsl_synctask.h>
58 #include <sys/fs/zfs.h>
59 #include <sys/arc.h>
60 #include <sys/callb.h>
61 #include <sys/systeminfo.h>
62 #include <sys/sunddi.h>
63 
64 #include "zfs_prop.h"
65 #include "zfs_comutil.h"
66 
/*
 * Thread count handed to taskq_create() for every per-ZIO-type issue and
 * interrupt taskq that spa_activate() creates.
 */
int zio_taskq_threads = 8;

/* Sync-task callback; spa_prop_set() passes it to dsl_sync_task_do(). */
static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx);
70 
71 /*
72  * ==========================================================================
73  * SPA properties routines
74  * ==========================================================================
75  */
76 
77 /*
78  * Add a (source=src, propname=propval) list to an nvlist.
79  */
80 static int
81 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
82     uint64_t intval, zprop_source_t src)
83 {
84 	const char *propname = zpool_prop_to_name(prop);
85 	nvlist_t *propval;
86 	int err = 0;
87 
88 	if (err = nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP))
89 		return (err);
90 
91 	if (err = nvlist_add_uint64(propval, ZPROP_SOURCE, src))
92 		goto out;
93 
94 	if (strval != NULL) {
95 		if (err = nvlist_add_string(propval, ZPROP_VALUE, strval))
96 			goto out;
97 	} else {
98 		if (err = nvlist_add_uint64(propval, ZPROP_VALUE, intval))
99 			goto out;
100 	}
101 
102 	err = nvlist_add_nvlist(nvl, propname, propval);
103 out:
104 	nvlist_free(propval);
105 	return (err);
106 }
107 
108 /*
109  * Get property values from the spa configuration.
110  */
111 static int
112 spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
113 {
114 	uint64_t size = spa_get_space(spa);
115 	uint64_t used = spa_get_alloc(spa);
116 	uint64_t cap, version;
117 	zprop_source_t src = ZPROP_SRC_NONE;
118 	int err;
119 	char *cachefile;
120 	size_t len;
121 
122 	/*
123 	 * readonly properties
124 	 */
125 	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa->spa_name,
126 	    0, src))
127 		return (err);
128 
129 	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src))
130 		return (err);
131 
132 	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src))
133 		return (err);
134 
135 	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL,
136 	    size - used, src))
137 		return (err);
138 
139 	cap = (size == 0) ? 0 : (used * 100 / size);
140 	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src))
141 		return (err);
142 
143 	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL,
144 	    spa_guid(spa), src))
145 		return (err);
146 
147 	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
148 	    spa->spa_root_vdev->vdev_state, src))
149 		return (err);
150 
151 	/*
152 	 * settable properties that are not stored in the pool property object.
153 	 */
154 	version = spa_version(spa);
155 	if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
156 		src = ZPROP_SRC_DEFAULT;
157 	else
158 		src = ZPROP_SRC_LOCAL;
159 	if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL,
160 	    version, src))
161 		return (err);
162 
163 	if (spa->spa_root != NULL) {
164 		src = ZPROP_SRC_LOCAL;
165 		if (err = spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT,
166 		    spa->spa_root, 0, src))
167 			return (err);
168 	}
169 
170 	if (spa->spa_config_dir != NULL) {
171 		if (strcmp(spa->spa_config_dir, "none") == 0) {
172 			err = spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
173 			    spa->spa_config_dir, 0, ZPROP_SRC_LOCAL);
174 		} else {
175 			len = strlen(spa->spa_config_dir) +
176 			    strlen(spa->spa_config_file) + 2;
177 			cachefile = kmem_alloc(len, KM_SLEEP);
178 			(void) snprintf(cachefile, len, "%s/%s",
179 			    spa->spa_config_dir, spa->spa_config_file);
180 			err = spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
181 			    cachefile, 0, ZPROP_SRC_LOCAL);
182 			kmem_free(cachefile, len);
183 		}
184 
185 		if (err)
186 			return (err);
187 	}
188 
189 	return (0);
190 }
191 
192 /*
193  * Get zpool property values.
194  */
195 int
196 spa_prop_get(spa_t *spa, nvlist_t **nvp)
197 {
198 	zap_cursor_t zc;
199 	zap_attribute_t za;
200 	objset_t *mos = spa->spa_meta_objset;
201 	int err;
202 
203 	if (err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP))
204 		return (err);
205 
206 	/*
207 	 * Get properties from the spa config.
208 	 */
209 	if (err = spa_prop_get_config(spa, nvp))
210 		goto out;
211 
212 	mutex_enter(&spa->spa_props_lock);
213 	/* If no pool property object, no more prop to get. */
214 	if (spa->spa_pool_props_object == 0) {
215 		mutex_exit(&spa->spa_props_lock);
216 		return (0);
217 	}
218 
219 	/*
220 	 * Get properties from the MOS pool property object.
221 	 */
222 	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
223 	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
224 	    zap_cursor_advance(&zc)) {
225 		uint64_t intval = 0;
226 		char *strval = NULL;
227 		zprop_source_t src = ZPROP_SRC_DEFAULT;
228 		zpool_prop_t prop;
229 
230 		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
231 			continue;
232 
233 		switch (za.za_integer_length) {
234 		case 8:
235 			/* integer property */
236 			if (za.za_first_integer !=
237 			    zpool_prop_default_numeric(prop))
238 				src = ZPROP_SRC_LOCAL;
239 
240 			if (prop == ZPOOL_PROP_BOOTFS) {
241 				dsl_pool_t *dp;
242 				dsl_dataset_t *ds = NULL;
243 
244 				dp = spa_get_dsl(spa);
245 				rw_enter(&dp->dp_config_rwlock, RW_READER);
246 				if (err = dsl_dataset_open_obj(dp,
247 				    za.za_first_integer, NULL, DS_MODE_NONE,
248 				    FTAG, &ds)) {
249 					rw_exit(&dp->dp_config_rwlock);
250 					break;
251 				}
252 
253 				strval = kmem_alloc(
254 				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
255 				    KM_SLEEP);
256 				dsl_dataset_name(ds, strval);
257 				dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
258 				rw_exit(&dp->dp_config_rwlock);
259 			} else {
260 				strval = NULL;
261 				intval = za.za_first_integer;
262 			}
263 
264 			err = spa_prop_add_list(*nvp, prop, strval,
265 			    intval, src);
266 
267 			if (strval != NULL)
268 				kmem_free(strval,
269 				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);
270 
271 			break;
272 
273 		case 1:
274 			/* string property */
275 			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
276 			err = zap_lookup(mos, spa->spa_pool_props_object,
277 			    za.za_name, 1, za.za_num_integers, strval);
278 			if (err) {
279 				kmem_free(strval, za.za_num_integers);
280 				break;
281 			}
282 			err = spa_prop_add_list(*nvp, prop, strval, 0, src);
283 			kmem_free(strval, za.za_num_integers);
284 			break;
285 
286 		default:
287 			break;
288 		}
289 	}
290 	zap_cursor_fini(&zc);
291 	mutex_exit(&spa->spa_props_lock);
292 out:
293 	if (err && err != ENOENT) {
294 		nvlist_free(*nvp);
295 		return (err);
296 	}
297 
298 	return (0);
299 }
300 
/*
 * Validate the given pool properties nvlist and modify the list
 * for the property values to be set.
 *
 * Every pair in 'props' must name a known pool property, otherwise EINVAL
 * is returned.  Properties with a case below get value checks; any other
 * known property is accepted as-is.  If a bootfs property passes validation,
 * its string value in 'props' is replaced by the uint64 object number of the
 * named dataset (or the default numeric value for an empty string).
 *
 * Returns 0 if all properties are acceptable, otherwise the first error
 * hit.  Note the failmode special case below, which deliberately returns
 * EIO after updating in-core state.
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
	nvpair_t *elem;
	int error = 0, reset_bootfs = 0;
	uint64_t objnum;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
		zpool_prop_t prop;
		char *propname, *strval;
		uint64_t intval;
		vdev_t *rvdev;
		char *vdev_type;
		objset_t *os;
		char *slash;

		propname = nvpair_name(elem);

		if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL)
			return (EINVAL);

		switch (prop) {
		case ZPOOL_PROP_VERSION:
			/*
			 * The new version may not be lower than the pool's
			 * current version nor higher than SPA_VERSION.
			 */
			error = nvpair_value_uint64(elem, &intval);
			if (!error &&
			    (intval < spa_version(spa) || intval > SPA_VERSION))
				error = EINVAL;
			break;

		case ZPOOL_PROP_DELEGATION:
		case ZPOOL_PROP_AUTOREPLACE:
			/* Only the values 0 and 1 are accepted. */
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = EINVAL;
			break;

		case ZPOOL_PROP_BOOTFS:
			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
				error = ENOTSUP;
				break;
			}

			/*
			 * A bootable filesystem can not be on a RAIDZ pool
			 * nor a striped pool with more than 1 device.
			 */
			rvdev = spa->spa_root_vdev;
			vdev_type =
			    rvdev->vdev_child[0]->vdev_ops->vdev_op_type;
			if (rvdev->vdev_children > 1 ||
			    strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 ||
			    strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) {
				error = ENOTSUP;
				break;
			}

			/*
			 * Remember to swap the string for an object number
			 * once the whole list has validated (see below).
			 */
			reset_bootfs = 1;

			error = nvpair_value_string(elem, &strval);

			if (!error) {
				/* Empty name: use the default value. */
				if (strval == NULL || strval[0] == '\0') {
					objnum = zpool_prop_default_numeric(
					    ZPOOL_PROP_BOOTFS);
					break;
				}

				/*
				 * Open the named objset only long enough to
				 * learn its object id.
				 */
				if (error = dmu_objset_open(strval, DMU_OST_ZFS,
				    DS_MODE_STANDARD | DS_MODE_READONLY, &os))
					break;
				objnum = dmu_objset_id(os);
				dmu_objset_close(os);
			}
			break;
		case ZPOOL_PROP_FAILUREMODE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
			    intval > ZIO_FAILURE_MODE_PANIC))
				error = EINVAL;

			/*
			 * This is a special case which only occurs when
			 * the pool has completely failed. This allows
			 * the user to change the in-core failmode property
			 * without syncing it out to disk (I/Os might
			 * currently be blocked). We do this by returning
			 * EIO to the caller (spa_prop_set) to trick it
			 * into thinking we encountered a property validation
			 * error.
			 */
			if (!error && spa_state(spa) == POOL_STATE_IO_FAILURE) {
				spa->spa_failmode = intval;
				error = EIO;
			}
			break;

		case ZPOOL_PROP_CACHEFILE:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;

			/* The empty string and "none" are always accepted. */
			if (strval[0] == '\0')
				break;

			if (strcmp(strval, "none") == 0)
				break;

			/* Anything else must be an absolute path ... */
			if (strval[0] != '/') {
				error = EINVAL;
				break;
			}

			slash = strrchr(strval, '/');
			ASSERT(slash != NULL);

			/*
			 * ... whose final component is neither empty, "."
			 * nor "..".
			 */
			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
			    strcmp(slash, "/..") == 0)
				error = EINVAL;
			break;
		}

		if (error)
			break;
	}

	/*
	 * Replace the bootfs string with the object number computed above.
	 * objnum is always set when we get here: reset_bootfs is only set on
	 * the bootfs path, and every non-error exit from that path assigns
	 * objnum.
	 */
	if (!error && reset_bootfs) {
		error = nvlist_remove(props,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

		if (!error) {
			error = nvlist_add_uint64(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
		}
	}

	return (error);
}
442 
443 int
444 spa_prop_set(spa_t *spa, nvlist_t *nvp)
445 {
446 	int error;
447 
448 	if ((error = spa_prop_validate(spa, nvp)) != 0)
449 		return (error);
450 
451 	return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
452 	    spa, nvp, 3));
453 }
454 
455 /*
456  * If the bootfs property value is dsobj, clear it.
457  */
458 void
459 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
460 {
461 	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
462 		VERIFY(zap_remove(spa->spa_meta_objset,
463 		    spa->spa_pool_props_object,
464 		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
465 		spa->spa_bootfs = 0;
466 	}
467 }
468 
469 /*
470  * ==========================================================================
471  * SPA state manipulation (open/create/destroy/import/export)
472  * ==========================================================================
473  */
474 
475 static int
476 spa_error_entry_compare(const void *a, const void *b)
477 {
478 	spa_error_entry_t *sa = (spa_error_entry_t *)a;
479 	spa_error_entry_t *sb = (spa_error_entry_t *)b;
480 	int ret;
481 
482 	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
483 	    sizeof (zbookmark_t));
484 
485 	if (ret < 0)
486 		return (-1);
487 	else if (ret > 0)
488 		return (1);
489 	else
490 		return (0);
491 }
492 
493 /*
494  * Utility function which retrieves copies of the current logs and
495  * re-initializes them in the process.
496  */
497 void
498 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
499 {
500 	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
501 
502 	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
503 	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
504 
505 	avl_create(&spa->spa_errlist_scrub,
506 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
507 	    offsetof(spa_error_entry_t, se_avl));
508 	avl_create(&spa->spa_errlist_last,
509 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
510 	    offsetof(spa_error_entry_t, se_avl));
511 }
512 
513 /*
514  * Activate an uninitialized pool.
515  */
516 static void
517 spa_activate(spa_t *spa)
518 {
519 	int t;
520 
521 	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
522 
523 	spa->spa_state = POOL_STATE_ACTIVE;
524 
525 	spa->spa_normal_class = metaslab_class_create();
526 	spa->spa_log_class = metaslab_class_create();
527 
528 	for (t = 0; t < ZIO_TYPES; t++) {
529 		spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
530 		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
531 		    TASKQ_PREPOPULATE);
532 		spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
533 		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
534 		    TASKQ_PREPOPULATE);
535 	}
536 
537 	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
538 	    offsetof(vdev_t, vdev_dirty_node));
539 	list_create(&spa->spa_zio_list, sizeof (zio_t),
540 	    offsetof(zio_t, zio_link_node));
541 
542 	txg_list_create(&spa->spa_vdev_txg_list,
543 	    offsetof(struct vdev, vdev_txg_node));
544 
545 	avl_create(&spa->spa_errlist_scrub,
546 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
547 	    offsetof(spa_error_entry_t, se_avl));
548 	avl_create(&spa->spa_errlist_last,
549 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
550 	    offsetof(spa_error_entry_t, se_avl));
551 }
552 
553 /*
554  * Opposite of spa_activate().
555  */
556 static void
557 spa_deactivate(spa_t *spa)
558 {
559 	int t;
560 
561 	ASSERT(spa->spa_sync_on == B_FALSE);
562 	ASSERT(spa->spa_dsl_pool == NULL);
563 	ASSERT(spa->spa_root_vdev == NULL);
564 
565 	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
566 
567 	txg_list_destroy(&spa->spa_vdev_txg_list);
568 
569 	list_destroy(&spa->spa_dirty_list);
570 	list_destroy(&spa->spa_zio_list);
571 
572 	for (t = 0; t < ZIO_TYPES; t++) {
573 		taskq_destroy(spa->spa_zio_issue_taskq[t]);
574 		taskq_destroy(spa->spa_zio_intr_taskq[t]);
575 		spa->spa_zio_issue_taskq[t] = NULL;
576 		spa->spa_zio_intr_taskq[t] = NULL;
577 	}
578 
579 	metaslab_class_destroy(spa->spa_normal_class);
580 	spa->spa_normal_class = NULL;
581 
582 	metaslab_class_destroy(spa->spa_log_class);
583 	spa->spa_log_class = NULL;
584 
585 	/*
586 	 * If this was part of an import or the open otherwise failed, we may
587 	 * still have errors left in the queues.  Empty them just in case.
588 	 */
589 	spa_errlog_drain(spa);
590 
591 	avl_destroy(&spa->spa_errlist_scrub);
592 	avl_destroy(&spa->spa_errlist_last);
593 
594 	spa->spa_state = POOL_STATE_UNINITIALIZED;
595 }
596 
597 /*
598  * Verify a pool configuration, and construct the vdev tree appropriately.  This
599  * will create all the necessary vdevs in the appropriate layout, with each vdev
600  * in the CLOSED state.  This will prep the pool before open/creation/import.
601  * All vdev validation is done by the vdev_alloc() routine.
602  */
603 static int
604 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
605     uint_t id, int atype)
606 {
607 	nvlist_t **child;
608 	uint_t c, children;
609 	int error;
610 
611 	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
612 		return (error);
613 
614 	if ((*vdp)->vdev_ops->vdev_op_leaf)
615 		return (0);
616 
617 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
618 	    &child, &children) != 0) {
619 		vdev_free(*vdp);
620 		*vdp = NULL;
621 		return (EINVAL);
622 	}
623 
624 	for (c = 0; c < children; c++) {
625 		vdev_t *vd;
626 		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
627 		    atype)) != 0) {
628 			vdev_free(*vdp);
629 			*vdp = NULL;
630 			return (error);
631 		}
632 	}
633 
634 	ASSERT(*vdp != NULL);
635 
636 	return (0);
637 }
638 
639 /*
640  * Opposite of spa_load().
641  */
642 static void
643 spa_unload(spa_t *spa)
644 {
645 	int i;
646 
647 	/*
648 	 * Stop async tasks.
649 	 */
650 	spa_async_suspend(spa);
651 
652 	/*
653 	 * Stop syncing.
654 	 */
655 	if (spa->spa_sync_on) {
656 		txg_sync_stop(spa->spa_dsl_pool);
657 		spa->spa_sync_on = B_FALSE;
658 	}
659 
660 	/*
661 	 * Wait for any outstanding prefetch I/O to complete.
662 	 */
663 	spa_config_enter(spa, RW_WRITER, FTAG);
664 	spa_config_exit(spa, FTAG);
665 
666 	/*
667 	 * Drop and purge level 2 cache
668 	 */
669 	spa_l2cache_drop(spa);
670 
671 	/*
672 	 * Close the dsl pool.
673 	 */
674 	if (spa->spa_dsl_pool) {
675 		dsl_pool_close(spa->spa_dsl_pool);
676 		spa->spa_dsl_pool = NULL;
677 	}
678 
679 	/*
680 	 * Close all vdevs.
681 	 */
682 	if (spa->spa_root_vdev)
683 		vdev_free(spa->spa_root_vdev);
684 	ASSERT(spa->spa_root_vdev == NULL);
685 
686 	for (i = 0; i < spa->spa_spares.sav_count; i++)
687 		vdev_free(spa->spa_spares.sav_vdevs[i]);
688 	if (spa->spa_spares.sav_vdevs) {
689 		kmem_free(spa->spa_spares.sav_vdevs,
690 		    spa->spa_spares.sav_count * sizeof (void *));
691 		spa->spa_spares.sav_vdevs = NULL;
692 	}
693 	if (spa->spa_spares.sav_config) {
694 		nvlist_free(spa->spa_spares.sav_config);
695 		spa->spa_spares.sav_config = NULL;
696 	}
697 
698 	for (i = 0; i < spa->spa_l2cache.sav_count; i++)
699 		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
700 	if (spa->spa_l2cache.sav_vdevs) {
701 		kmem_free(spa->spa_l2cache.sav_vdevs,
702 		    spa->spa_l2cache.sav_count * sizeof (void *));
703 		spa->spa_l2cache.sav_vdevs = NULL;
704 	}
705 	if (spa->spa_l2cache.sav_config) {
706 		nvlist_free(spa->spa_l2cache.sav_config);
707 		spa->spa_l2cache.sav_config = NULL;
708 	}
709 
710 	spa->spa_async_suspended = 0;
711 }
712 
/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 *
 * On return spa_spares.sav_vdevs/sav_count describe the newly parsed spares
 * (NULL/0 if there are none) and, when there is at least one spare, the
 * ZPOOL_CONFIG_SPARES array in sav_config has been replaced by one generated
 * from those vdevs.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		vd = spa->spa_spares.sav_vdevs[i];

		/*
		 * Undo the spare registration of the matching in-pool vdev
		 * (presumably made by spa_spare_add() in the loop below on a
		 * previous load).
		 */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL &&
		    tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	/* Free the old array using the old count, before it is replaced. */
	if (spa->spa_spares.sav_vdevs)
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));

	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process.   For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in).  During this phase we open and
	 * validate each vdev on the spare list.  If the vdev also exists in the
	 * active configuration, then we also mark this vdev as an active spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev.  Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise).  Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		/* A spare that cannot be opened stays in the array as-is. */
		if (vdev_open(vd) != 0)
			continue;

		vd->vdev_top = vd;
		if (vdev_validate_aux(vd) == 0)
			spa_spare_add(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	/*
	 * 'spares' is reused here for a temporary array of freshly generated
	 * configs; the array and its elements are freed again once they have
	 * been added to sav_config.
	 */
	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		spares[i] = vdev_config_generate(spa,
		    spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}
822 
/*
 * Load (or re-load) the current list of vdevs describing the active l2cache for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 *
 * Old vdevs are matched to new config entries by guid.  Matched vdevs are
 * carried over, unmatched config entries get a new vdev, and unmatched old
 * vdevs are closed and removed.
 */
static void
spa_load_l2cache(spa_t *spa)
{
	nvlist_t **l2cache;
	uint_t nl2cache;
	int i, j, oldnvdevs;
	uint64_t guid;
	vdev_t *vd, **oldvdevs, **newvdevs;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	/*
	 * Without a config, 'l2cache' and 'newvdevs' stay uninitialized.
	 * That is safe only because nl2cache and sav->sav_count are then both
	 * zero, so no loop below ever touches them.
	 */
	if (sav->sav_config != NULL) {
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
	} else {
		nl2cache = 0;
	}

	/* Detach the old array; it is consumed and freed below. */
	oldvdevs = sav->sav_vdevs;
	oldnvdevs = sav->sav_count;
	sav->sav_vdevs = NULL;
	sav->sav_count = 0;

	/*
	 * Process new nvlist of vdevs.
	 */
	for (i = 0; i < nl2cache; i++) {
		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
		    &guid) == 0);

		newvdevs[i] = NULL;
		for (j = 0; j < oldnvdevs; j++) {
			vd = oldvdevs[j];
			if (vd != NULL && guid == vd->vdev_guid) {
				/*
				 * Retain previous vdev for add/remove ops.
				 * Clearing the old slot keeps it out of the
				 * purge loop below.
				 */
				newvdevs[i] = vd;
				oldvdevs[j] = NULL;
				break;
			}
		}

		if (newvdevs[i] == NULL) {
			/*
			 * Create new vdev
			 */
			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
			    VDEV_ALLOC_L2CACHE) == 0);
			ASSERT(vd != NULL);
			newvdevs[i] = vd;

			/*
			 * Commit this vdev as an l2cache device,
			 * even if it fails to open.
			 */
			spa_l2cache_add(vd);

			if (vdev_open(vd) != 0)
				continue;

			vd->vdev_top = vd;
			(void) vdev_validate_aux(vd);

			if (!vdev_is_dead(vd)) {
				uint64_t size;
				size = vdev_get_rsize(vd);
				ASSERT3U(size, >, 0);
				/*
				 * Only hand the device to the L2ARC when the
				 * pool is open for writing; the region given
				 * starts after VDEV_LABEL_START_SIZE.
				 */
				if (spa_mode & FWRITE) {
					l2arc_add_vdev(spa, vd,
					    VDEV_LABEL_START_SIZE,
					    size - VDEV_LABEL_START_SIZE);
				}
				spa_l2cache_activate(vd);
			}
		}
	}

	/*
	 * Purge vdevs that were dropped
	 */
	for (i = 0; i < oldnvdevs; i++) {
		uint64_t pool;

		vd = oldvdevs[i];
		if (vd != NULL) {
			if (spa_mode & FWRITE &&
			    spa_l2cache_exists(vd->vdev_guid, &pool) &&
			    pool != 0ULL) {
				l2arc_remove_vdev(vd);
			}
			(void) vdev_close(vd);
			spa_l2cache_remove(vd);
		}
	}

	if (oldvdevs)
		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));

	/* No config: sav_vdevs/sav_count remain NULL/0 as set above. */
	if (sav->sav_config == NULL)
		goto out;

	sav->sav_vdevs = newvdevs;
	sav->sav_count = (int)nl2cache;

	/*
	 * Recompute the stashed list of l2cache devices, with status
	 * information this time.
	 */
	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	/* 'l2cache' is reused for a temporary array of generated configs. */
	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
	for (i = 0; i < sav->sav_count; i++)
		l2cache[i] = vdev_config_generate(spa,
		    sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE);
	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
out:
	/*
	 * Free the temporary configs and their array.  When we arrive via
	 * the goto, sav_count is 0 and nothing is touched.
	 */
	for (i = 0; i < sav->sav_count; i++)
		nvlist_free(l2cache[i]);
	if (sav->sav_count)
		kmem_free(l2cache, sav->sav_count * sizeof (void *));
}
955 
956 static int
957 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
958 {
959 	dmu_buf_t *db;
960 	char *packed = NULL;
961 	size_t nvsize = 0;
962 	int error;
963 	*value = NULL;
964 
965 	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
966 	nvsize = *(uint64_t *)db->db_data;
967 	dmu_buf_rele(db, FTAG);
968 
969 	packed = kmem_alloc(nvsize, KM_SLEEP);
970 	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed);
971 	if (error == 0)
972 		error = nvlist_unpack(packed, nvsize, value, 0);
973 	kmem_free(packed, nvsize);
974 
975 	return (error);
976 }
977 
978 /*
979  * Checks to see if the given vdev could not be opened, in which case we post a
980  * sysevent to notify the autoreplace code that the device has been removed.
981  */
982 static void
983 spa_check_removed(vdev_t *vd)
984 {
985 	int c;
986 
987 	for (c = 0; c < vd->vdev_children; c++)
988 		spa_check_removed(vd->vdev_child[c]);
989 
990 	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
991 		zfs_post_autoreplace(vd->vdev_spa, vd);
992 		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
993 	}
994 }
995 
996 /*
997  * Load an existing storage pool, using the pool's builtin spa_config as a
998  * source of configuration information.
999  */
1000 static int
1001 spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
1002 {
1003 	int error = 0;
1004 	nvlist_t *nvroot = NULL;
1005 	vdev_t *rvd;
1006 	uberblock_t *ub = &spa->spa_uberblock;
1007 	uint64_t config_cache_txg = spa->spa_config_txg;
1008 	uint64_t pool_guid;
1009 	uint64_t version;
1010 	zio_t *zio;
1011 	uint64_t autoreplace = 0;
1012 
1013 	spa->spa_load_state = state;
1014 
1015 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
1016 	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
1017 		error = EINVAL;
1018 		goto out;
1019 	}
1020 
1021 	/*
1022 	 * Versioning wasn't explicitly added to the label until later, so if
1023 	 * it's not present treat it as the initial version.
1024 	 */
1025 	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
1026 		version = SPA_VERSION_INITIAL;
1027 
1028 	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
1029 	    &spa->spa_config_txg);
1030 
1031 	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
1032 	    spa_guid_exists(pool_guid, 0)) {
1033 		error = EEXIST;
1034 		goto out;
1035 	}
1036 
1037 	spa->spa_load_guid = pool_guid;
1038 
1039 	/*
1040 	 * Parse the configuration into a vdev tree.  We explicitly set the
1041 	 * value that will be returned by spa_version() since parsing the
1042 	 * configuration requires knowing the version number.
1043 	 */
1044 	spa_config_enter(spa, RW_WRITER, FTAG);
1045 	spa->spa_ubsync.ub_version = version;
1046 	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
1047 	spa_config_exit(spa, FTAG);
1048 
1049 	if (error != 0)
1050 		goto out;
1051 
1052 	ASSERT(spa->spa_root_vdev == rvd);
1053 	ASSERT(spa_guid(spa) == pool_guid);
1054 
1055 	/*
1056 	 * Try to open all vdevs, loading each label in the process.
1057 	 */
1058 	error = vdev_open(rvd);
1059 	if (error != 0)
1060 		goto out;
1061 
1062 	/*
1063 	 * Validate the labels for all leaf vdevs.  We need to grab the config
1064 	 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD
1065 	 * flag.
1066 	 */
1067 	spa_config_enter(spa, RW_READER, FTAG);
1068 	error = vdev_validate(rvd);
1069 	spa_config_exit(spa, FTAG);
1070 
1071 	if (error != 0)
1072 		goto out;
1073 
1074 	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
1075 		error = ENXIO;
1076 		goto out;
1077 	}
1078 
1079 	/*
1080 	 * Find the best uberblock.
1081 	 */
1082 	bzero(ub, sizeof (uberblock_t));
1083 
1084 	zio = zio_root(spa, NULL, NULL,
1085 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
1086 	vdev_uberblock_load(zio, rvd, ub);
1087 	error = zio_wait(zio);
1088 
1089 	/*
1090 	 * If we weren't able to find a single valid uberblock, return failure.
1091 	 */
1092 	if (ub->ub_txg == 0) {
1093 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1094 		    VDEV_AUX_CORRUPT_DATA);
1095 		error = ENXIO;
1096 		goto out;
1097 	}
1098 
1099 	/*
1100 	 * If the pool is newer than the code, we can't open it.
1101 	 */
1102 	if (ub->ub_version > SPA_VERSION) {
1103 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1104 		    VDEV_AUX_VERSION_NEWER);
1105 		error = ENOTSUP;
1106 		goto out;
1107 	}
1108 
1109 	/*
1110 	 * If the vdev guid sum doesn't match the uberblock, we have an
1111 	 * incomplete configuration.
1112 	 */
1113 	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
1114 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1115 		    VDEV_AUX_BAD_GUID_SUM);
1116 		error = ENXIO;
1117 		goto out;
1118 	}
1119 
1120 	/*
1121 	 * Initialize internal SPA structures.
1122 	 */
1123 	spa->spa_state = POOL_STATE_ACTIVE;
1124 	spa->spa_ubsync = spa->spa_uberblock;
1125 	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
1126 	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
1127 	if (error) {
1128 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1129 		    VDEV_AUX_CORRUPT_DATA);
1130 		goto out;
1131 	}
1132 	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
1133 
1134 	if (zap_lookup(spa->spa_meta_objset,
1135 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
1136 	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
1137 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1138 		    VDEV_AUX_CORRUPT_DATA);
1139 		error = EIO;
1140 		goto out;
1141 	}
1142 
1143 	if (!mosconfig) {
1144 		nvlist_t *newconfig;
1145 		uint64_t hostid;
1146 
1147 		if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
1148 			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1149 			    VDEV_AUX_CORRUPT_DATA);
1150 			error = EIO;
1151 			goto out;
1152 		}
1153 
1154 		if (nvlist_lookup_uint64(newconfig, ZPOOL_CONFIG_HOSTID,
1155 		    &hostid) == 0) {
1156 			char *hostname;
1157 			unsigned long myhostid = 0;
1158 
1159 			VERIFY(nvlist_lookup_string(newconfig,
1160 			    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
1161 
1162 			(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
1163 			if (hostid != 0 && myhostid != 0 &&
1164 			    (unsigned long)hostid != myhostid) {
1165 				cmn_err(CE_WARN, "pool '%s' could not be "
1166 				    "loaded as it was last accessed by "
1167 				    "another system (host: %s hostid: 0x%lx).  "
1168 				    "See: http://www.sun.com/msg/ZFS-8000-EY",
1169 				    spa->spa_name, hostname,
1170 				    (unsigned long)hostid);
1171 				error = EBADF;
1172 				goto out;
1173 			}
1174 		}
1175 
1176 		spa_config_set(spa, newconfig);
1177 		spa_unload(spa);
1178 		spa_deactivate(spa);
1179 		spa_activate(spa);
1180 
1181 		return (spa_load(spa, newconfig, state, B_TRUE));
1182 	}
1183 
1184 	if (zap_lookup(spa->spa_meta_objset,
1185 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
1186 	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
1187 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1188 		    VDEV_AUX_CORRUPT_DATA);
1189 		error = EIO;
1190 		goto out;
1191 	}
1192 
1193 	/*
1194 	 * Load the bit that tells us to use the new accounting function
1195 	 * (raid-z deflation).  If we have an older pool, this will not
1196 	 * be present.
1197 	 */
1198 	error = zap_lookup(spa->spa_meta_objset,
1199 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
1200 	    sizeof (uint64_t), 1, &spa->spa_deflate);
1201 	if (error != 0 && error != ENOENT) {
1202 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1203 		    VDEV_AUX_CORRUPT_DATA);
1204 		error = EIO;
1205 		goto out;
1206 	}
1207 
1208 	/*
1209 	 * Load the persistent error log.  If we have an older pool, this will
1210 	 * not be present.
1211 	 */
1212 	error = zap_lookup(spa->spa_meta_objset,
1213 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
1214 	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
1215 	if (error != 0 && error != ENOENT) {
1216 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1217 		    VDEV_AUX_CORRUPT_DATA);
1218 		error = EIO;
1219 		goto out;
1220 	}
1221 
1222 	error = zap_lookup(spa->spa_meta_objset,
1223 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
1224 	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
1225 	if (error != 0 && error != ENOENT) {
1226 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1227 		    VDEV_AUX_CORRUPT_DATA);
1228 		error = EIO;
1229 		goto out;
1230 	}
1231 
1232 	/*
1233 	 * Load the history object.  If we have an older pool, this
1234 	 * will not be present.
1235 	 */
1236 	error = zap_lookup(spa->spa_meta_objset,
1237 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
1238 	    sizeof (uint64_t), 1, &spa->spa_history);
1239 	if (error != 0 && error != ENOENT) {
1240 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1241 		    VDEV_AUX_CORRUPT_DATA);
1242 		error = EIO;
1243 		goto out;
1244 	}
1245 
1246 	/*
1247 	 * Load any hot spares for this pool.
1248 	 */
1249 	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1250 	    DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares.sav_object);
1251 	if (error != 0 && error != ENOENT) {
1252 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1253 		    VDEV_AUX_CORRUPT_DATA);
1254 		error = EIO;
1255 		goto out;
1256 	}
1257 	if (error == 0) {
1258 		ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
1259 		if (load_nvlist(spa, spa->spa_spares.sav_object,
1260 		    &spa->spa_spares.sav_config) != 0) {
1261 			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1262 			    VDEV_AUX_CORRUPT_DATA);
1263 			error = EIO;
1264 			goto out;
1265 		}
1266 
1267 		spa_config_enter(spa, RW_WRITER, FTAG);
1268 		spa_load_spares(spa);
1269 		spa_config_exit(spa, FTAG);
1270 	}
1271 
1272 	/*
1273 	 * Load any level 2 ARC devices for this pool.
1274 	 */
1275 	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1276 	    DMU_POOL_L2CACHE, sizeof (uint64_t), 1,
1277 	    &spa->spa_l2cache.sav_object);
1278 	if (error != 0 && error != ENOENT) {
1279 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1280 		    VDEV_AUX_CORRUPT_DATA);
1281 		error = EIO;
1282 		goto out;
1283 	}
1284 	if (error == 0) {
1285 		ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
1286 		if (load_nvlist(spa, spa->spa_l2cache.sav_object,
1287 		    &spa->spa_l2cache.sav_config) != 0) {
1288 			vdev_set_state(rvd, B_TRUE,
1289 			    VDEV_STATE_CANT_OPEN,
1290 			    VDEV_AUX_CORRUPT_DATA);
1291 			error = EIO;
1292 			goto out;
1293 		}
1294 
1295 		spa_config_enter(spa, RW_WRITER, FTAG);
1296 		spa_load_l2cache(spa);
1297 		spa_config_exit(spa, FTAG);
1298 	}
1299 
1300 	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
1301 
1302 	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1303 	    DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object);
1304 
1305 	if (error && error != ENOENT) {
1306 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
1307 		    VDEV_AUX_CORRUPT_DATA);
1308 		error = EIO;
1309 		goto out;
1310 	}
1311 
1312 	if (error == 0) {
1313 		(void) zap_lookup(spa->spa_meta_objset,
1314 		    spa->spa_pool_props_object,
1315 		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS),
1316 		    sizeof (uint64_t), 1, &spa->spa_bootfs);
1317 		(void) zap_lookup(spa->spa_meta_objset,
1318 		    spa->spa_pool_props_object,
1319 		    zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE),
1320 		    sizeof (uint64_t), 1, &autoreplace);
1321 		(void) zap_lookup(spa->spa_meta_objset,
1322 		    spa->spa_pool_props_object,
1323 		    zpool_prop_to_name(ZPOOL_PROP_DELEGATION),
1324 		    sizeof (uint64_t), 1, &spa->spa_delegation);
1325 		(void) zap_lookup(spa->spa_meta_objset,
1326 		    spa->spa_pool_props_object,
1327 		    zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE),
1328 		    sizeof (uint64_t), 1, &spa->spa_failmode);
1329 	}
1330 
1331 	/*
1332 	 * If the 'autoreplace' property is set, then post a resource notifying
1333 	 * the ZFS DE that it should not issue any faults for unopenable
1334 	 * devices.  We also iterate over the vdevs, and post a sysevent for any
1335 	 * unopenable vdevs so that the normal autoreplace handler can take
1336 	 * over.
1337 	 */
1338 	if (autoreplace && state != SPA_LOAD_TRYIMPORT)
1339 		spa_check_removed(spa->spa_root_vdev);
1340 
1341 	/*
1342 	 * Load the vdev state for all toplevel vdevs.
1343 	 */
1344 	vdev_load(rvd);
1345 
1346 	/*
1347 	 * Propagate the leaf DTLs we just loaded all the way up the tree.
1348 	 */
1349 	spa_config_enter(spa, RW_WRITER, FTAG);
1350 	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
1351 	spa_config_exit(spa, FTAG);
1352 
1353 	/*
1354 	 * Check the state of the root vdev.  If it can't be opened, it
1355 	 * indicates one or more toplevel vdevs are faulted.
1356 	 */
1357 	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
1358 		error = ENXIO;
1359 		goto out;
1360 	}
1361 
1362 	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
1363 		dmu_tx_t *tx;
1364 		int need_update = B_FALSE;
1365 		int c;
1366 
1367 		/*
1368 		 * Claim log blocks that haven't been committed yet.
1369 		 * This must all happen in a single txg.
1370 		 */
1371 		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
1372 		    spa_first_txg(spa));
1373 		(void) dmu_objset_find(spa->spa_name,
1374 		    zil_claim, tx, DS_FIND_CHILDREN);
1375 		dmu_tx_commit(tx);
1376 
1377 		spa->spa_sync_on = B_TRUE;
1378 		txg_sync_start(spa->spa_dsl_pool);
1379 
1380 		/*
1381 		 * Wait for all claims to sync.
1382 		 */
1383 		txg_wait_synced(spa->spa_dsl_pool, 0);
1384 
1385 		/*
1386 		 * If the config cache is stale, or we have uninitialized
1387 		 * metaslabs (see spa_vdev_add()), then update the config.
1388 		 */
1389 		if (config_cache_txg != spa->spa_config_txg ||
1390 		    state == SPA_LOAD_IMPORT)
1391 			need_update = B_TRUE;
1392 
1393 		for (c = 0; c < rvd->vdev_children; c++)
1394 			if (rvd->vdev_child[c]->vdev_ms_array == 0)
1395 				need_update = B_TRUE;
1396 
1397 		/*
1398 		 * Update the config cache asychronously in case we're the
1399 		 * root pool, in which case the config cache isn't writable yet.
1400 		 */
1401 		if (need_update)
1402 			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
1403 	}
1404 
1405 	error = 0;
1406 out:
1407 	if (error && error != EBADF)
1408 		zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
1409 	spa->spa_load_state = SPA_LOAD_NONE;
1410 	spa->spa_ena = 0;
1411 
1412 	return (error);
1413 }
1414 
1415 /*
1416  * Pool Open/Import
1417  *
1418  * The import case is identical to an open except that the configuration is sent
1419  * down from userland, instead of grabbed from the configuration cache.  For the
1420  * case of an open, the pool configuration will exist in the
1421  * POOL_STATE_UNINITIALIZED state.
1422  *
1423  * The stats information (gen/count/ustats) is used to gather vdev statistics at
1424  * the same time open the pool, without having to keep around the spa_t in some
1425  * ambiguous state.
1426  */
1427 static int
1428 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
1429 {
1430 	spa_t *spa;
1431 	int error;
1432 	int loaded = B_FALSE;
1433 	int locked = B_FALSE;
1434 
1435 	*spapp = NULL;
1436 
1437 	/*
1438 	 * As disgusting as this is, we need to support recursive calls to this
1439 	 * function because dsl_dir_open() is called during spa_load(), and ends
1440 	 * up calling spa_open() again.  The real fix is to figure out how to
1441 	 * avoid dsl_dir_open() calling this in the first place.
1442 	 */
1443 	if (mutex_owner(&spa_namespace_lock) != curthread) {
1444 		mutex_enter(&spa_namespace_lock);
1445 		locked = B_TRUE;
1446 	}
1447 
1448 	if ((spa = spa_lookup(pool)) == NULL) {
1449 		if (locked)
1450 			mutex_exit(&spa_namespace_lock);
1451 		return (ENOENT);
1452 	}
1453 	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
1454 
1455 		spa_activate(spa);
1456 
1457 		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);
1458 
1459 		if (error == EBADF) {
1460 			/*
1461 			 * If vdev_validate() returns failure (indicated by
1462 			 * EBADF), it indicates that one of the vdevs indicates
1463 			 * that the pool has been exported or destroyed.  If
1464 			 * this is the case, the config cache is out of sync and
1465 			 * we should remove the pool from the namespace.
1466 			 */
1467 			zfs_post_ok(spa, NULL);
1468 			spa_unload(spa);
1469 			spa_deactivate(spa);
1470 			spa_remove(spa);
1471 			spa_config_sync();
1472 			if (locked)
1473 				mutex_exit(&spa_namespace_lock);
1474 			return (ENOENT);
1475 		}
1476 
1477 		if (error) {
1478 			/*
1479 			 * We can't open the pool, but we still have useful
1480 			 * information: the state of each vdev after the
1481 			 * attempted vdev_open().  Return this to the user.
1482 			 */
1483 			if (config != NULL && spa->spa_root_vdev != NULL) {
1484 				spa_config_enter(spa, RW_READER, FTAG);
1485 				*config = spa_config_generate(spa, NULL, -1ULL,
1486 				    B_TRUE);
1487 				spa_config_exit(spa, FTAG);
1488 			}
1489 			spa_unload(spa);
1490 			spa_deactivate(spa);
1491 			spa->spa_last_open_failed = B_TRUE;
1492 			if (locked)
1493 				mutex_exit(&spa_namespace_lock);
1494 			*spapp = NULL;
1495 			return (error);
1496 		} else {
1497 			zfs_post_ok(spa, NULL);
1498 			spa->spa_last_open_failed = B_FALSE;
1499 		}
1500 
1501 		loaded = B_TRUE;
1502 	}
1503 
1504 	spa_open_ref(spa, tag);
1505 
1506 	/*
1507 	 * If we just loaded the pool, resilver anything that's out of date.
1508 	 */
1509 	if (loaded && (spa_mode & FWRITE))
1510 		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
1511 
1512 	if (locked)
1513 		mutex_exit(&spa_namespace_lock);
1514 
1515 	*spapp = spa;
1516 
1517 	if (config != NULL) {
1518 		spa_config_enter(spa, RW_READER, FTAG);
1519 		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
1520 		spa_config_exit(spa, FTAG);
1521 	}
1522 
1523 	return (0);
1524 }
1525 
1526 int
1527 spa_open(const char *name, spa_t **spapp, void *tag)
1528 {
1529 	return (spa_open_common(name, spapp, tag, NULL));
1530 }
1531 
1532 /*
1533  * Lookup the given spa_t, incrementing the inject count in the process,
1534  * preventing it from being exported or destroyed.
1535  */
1536 spa_t *
1537 spa_inject_addref(char *name)
1538 {
1539 	spa_t *spa;
1540 
1541 	mutex_enter(&spa_namespace_lock);
1542 	if ((spa = spa_lookup(name)) == NULL) {
1543 		mutex_exit(&spa_namespace_lock);
1544 		return (NULL);
1545 	}
1546 	spa->spa_inject_ref++;
1547 	mutex_exit(&spa_namespace_lock);
1548 
1549 	return (spa);
1550 }
1551 
1552 void
1553 spa_inject_delref(spa_t *spa)
1554 {
1555 	mutex_enter(&spa_namespace_lock);
1556 	spa->spa_inject_ref--;
1557 	mutex_exit(&spa_namespace_lock);
1558 }
1559 
1560 /*
1561  * Add spares device information to the nvlist.
1562  */
1563 static void
1564 spa_add_spares(spa_t *spa, nvlist_t *config)
1565 {
1566 	nvlist_t **spares;
1567 	uint_t i, nspares;
1568 	nvlist_t *nvroot;
1569 	uint64_t guid;
1570 	vdev_stat_t *vs;
1571 	uint_t vsc;
1572 	uint64_t pool;
1573 
1574 	if (spa->spa_spares.sav_count == 0)
1575 		return;
1576 
1577 	VERIFY(nvlist_lookup_nvlist(config,
1578 	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
1579 	VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
1580 	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
1581 	if (nspares != 0) {
1582 		VERIFY(nvlist_add_nvlist_array(nvroot,
1583 		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
1584 		VERIFY(nvlist_lookup_nvlist_array(nvroot,
1585 		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
1586 
1587 		/*
1588 		 * Go through and find any spares which have since been
1589 		 * repurposed as an active spare.  If this is the case, update
1590 		 * their status appropriately.
1591 		 */
1592 		for (i = 0; i < nspares; i++) {
1593 			VERIFY(nvlist_lookup_uint64(spares[i],
1594 			    ZPOOL_CONFIG_GUID, &guid) == 0);
1595 			if (spa_spare_exists(guid, &pool) && pool != 0ULL) {
1596 				VERIFY(nvlist_lookup_uint64_array(
1597 				    spares[i], ZPOOL_CONFIG_STATS,
1598 				    (uint64_t **)&vs, &vsc) == 0);
1599 				vs->vs_state = VDEV_STATE_CANT_OPEN;
1600 				vs->vs_aux = VDEV_AUX_SPARED;
1601 			}
1602 		}
1603 	}
1604 }
1605 
1606 /*
1607  * Add l2cache device information to the nvlist, including vdev stats.
1608  */
1609 static void
1610 spa_add_l2cache(spa_t *spa, nvlist_t *config)
1611 {
1612 	nvlist_t **l2cache;
1613 	uint_t i, j, nl2cache;
1614 	nvlist_t *nvroot;
1615 	uint64_t guid;
1616 	vdev_t *vd;
1617 	vdev_stat_t *vs;
1618 	uint_t vsc;
1619 
1620 	if (spa->spa_l2cache.sav_count == 0)
1621 		return;
1622 
1623 	spa_config_enter(spa, RW_READER, FTAG);
1624 
1625 	VERIFY(nvlist_lookup_nvlist(config,
1626 	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
1627 	VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
1628 	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
1629 	if (nl2cache != 0) {
1630 		VERIFY(nvlist_add_nvlist_array(nvroot,
1631 		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
1632 		VERIFY(nvlist_lookup_nvlist_array(nvroot,
1633 		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
1634 
1635 		/*
1636 		 * Update level 2 cache device stats.
1637 		 */
1638 
1639 		for (i = 0; i < nl2cache; i++) {
1640 			VERIFY(nvlist_lookup_uint64(l2cache[i],
1641 			    ZPOOL_CONFIG_GUID, &guid) == 0);
1642 
1643 			vd = NULL;
1644 			for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
1645 				if (guid ==
1646 				    spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
1647 					vd = spa->spa_l2cache.sav_vdevs[j];
1648 					break;
1649 				}
1650 			}
1651 			ASSERT(vd != NULL);
1652 
1653 			VERIFY(nvlist_lookup_uint64_array(l2cache[i],
1654 			    ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0);
1655 			vdev_get_stats(vd, vs);
1656 		}
1657 	}
1658 
1659 	spa_config_exit(spa, FTAG);
1660 }
1661 
1662 int
1663 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
1664 {
1665 	int error;
1666 	spa_t *spa;
1667 
1668 	*config = NULL;
1669 	error = spa_open_common(name, &spa, FTAG, config);
1670 
1671 	if (spa && *config != NULL) {
1672 		VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
1673 		    spa_get_errlog_size(spa)) == 0);
1674 
1675 		spa_add_spares(spa, *config);
1676 		spa_add_l2cache(spa, *config);
1677 	}
1678 
1679 	/*
1680 	 * We want to get the alternate root even for faulted pools, so we cheat
1681 	 * and call spa_lookup() directly.
1682 	 */
1683 	if (altroot) {
1684 		if (spa == NULL) {
1685 			mutex_enter(&spa_namespace_lock);
1686 			spa = spa_lookup(name);
1687 			if (spa)
1688 				spa_altroot(spa, altroot, buflen);
1689 			else
1690 				altroot[0] = '\0';
1691 			spa = NULL;
1692 			mutex_exit(&spa_namespace_lock);
1693 		} else {
1694 			spa_altroot(spa, altroot, buflen);
1695 		}
1696 	}
1697 
1698 	if (spa != NULL)
1699 		spa_close(spa, FTAG);
1700 
1701 	return (error);
1702 }
1703 
1704 /*
1705  * Validate that the auxiliary device array is well formed.  We must have an
1706  * array of nvlists, each which describes a valid leaf vdev.  If this is an
1707  * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
1708  * specified, as long as they are well-formed.
1709  */
1710 static int
1711 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
1712     spa_aux_vdev_t *sav, const char *config, uint64_t version,
1713     vdev_labeltype_t label)
1714 {
1715 	nvlist_t **dev;
1716 	uint_t i, ndev;
1717 	vdev_t *vd;
1718 	int error;
1719 
1720 	/*
1721 	 * It's acceptable to have no devs specified.
1722 	 */
1723 	if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
1724 		return (0);
1725 
1726 	if (ndev == 0)
1727 		return (EINVAL);
1728 
1729 	/*
1730 	 * Make sure the pool is formatted with a version that supports this
1731 	 * device type.
1732 	 */
1733 	if (spa_version(spa) < version)
1734 		return (ENOTSUP);
1735 
1736 	/*
1737 	 * Set the pending device list so we correctly handle device in-use
1738 	 * checking.
1739 	 */
1740 	sav->sav_pending = dev;
1741 	sav->sav_npending = ndev;
1742 
1743 	for (i = 0; i < ndev; i++) {
1744 		if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
1745 		    mode)) != 0)
1746 			goto out;
1747 
1748 		if (!vd->vdev_ops->vdev_op_leaf) {
1749 			vdev_free(vd);
1750 			error = EINVAL;
1751 			goto out;
1752 		}
1753 
1754 		/*
1755 		 * The L2ARC currently only supports disk devices.
1756 		 */
1757 		if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
1758 		    strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
1759 			error = ENOTBLK;
1760 			goto out;
1761 		}
1762 
1763 		vd->vdev_top = vd;
1764 
1765 		if ((error = vdev_open(vd)) == 0 &&
1766 		    (error = vdev_label_init(vd, crtxg, label)) == 0) {
1767 			VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
1768 			    vd->vdev_guid) == 0);
1769 		}
1770 
1771 		vdev_free(vd);
1772 
1773 		if (error &&
1774 		    (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
1775 			goto out;
1776 		else
1777 			error = 0;
1778 	}
1779 
1780 out:
1781 	sav->sav_pending = NULL;
1782 	sav->sav_npending = 0;
1783 	return (error);
1784 }
1785 
1786 static int
1787 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
1788 {
1789 	int error;
1790 
1791 	if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
1792 	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
1793 	    VDEV_LABEL_SPARE)) != 0) {
1794 		return (error);
1795 	}
1796 
1797 	return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
1798 	    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
1799 	    VDEV_LABEL_L2CACHE));
1800 }
1801 
1802 static void
1803 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
1804     const char *config)
1805 {
1806 	int i;
1807 
1808 	if (sav->sav_config != NULL) {
1809 		nvlist_t **olddevs;
1810 		uint_t oldndevs;
1811 		nvlist_t **newdevs;
1812 
1813 		/*
1814 		 * Generate new dev list by concatentating with the
1815 		 * current dev list.
1816 		 */
1817 		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
1818 		    &olddevs, &oldndevs) == 0);
1819 
1820 		newdevs = kmem_alloc(sizeof (void *) *
1821 		    (ndevs + oldndevs), KM_SLEEP);
1822 		for (i = 0; i < oldndevs; i++)
1823 			VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
1824 			    KM_SLEEP) == 0);
1825 		for (i = 0; i < ndevs; i++)
1826 			VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
1827 			    KM_SLEEP) == 0);
1828 
1829 		VERIFY(nvlist_remove(sav->sav_config, config,
1830 		    DATA_TYPE_NVLIST_ARRAY) == 0);
1831 
1832 		VERIFY(nvlist_add_nvlist_array(sav->sav_config,
1833 		    config, newdevs, ndevs + oldndevs) == 0);
1834 		for (i = 0; i < oldndevs + ndevs; i++)
1835 			nvlist_free(newdevs[i]);
1836 		kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
1837 	} else {
1838 		/*
1839 		 * Generate a new dev list.
1840 		 */
1841 		VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
1842 		    KM_SLEEP) == 0);
1843 		VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
1844 		    devs, ndevs) == 0);
1845 	}
1846 }
1847 
1848 /*
1849  * Stop and drop level 2 ARC devices
1850  */
1851 void
1852 spa_l2cache_drop(spa_t *spa)
1853 {
1854 	vdev_t *vd;
1855 	int i;
1856 	spa_aux_vdev_t *sav = &spa->spa_l2cache;
1857 
1858 	for (i = 0; i < sav->sav_count; i++) {
1859 		uint64_t pool;
1860 
1861 		vd = sav->sav_vdevs[i];
1862 		ASSERT(vd != NULL);
1863 
1864 		if (spa_mode & FWRITE &&
1865 		    spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL) {
1866 			l2arc_remove_vdev(vd);
1867 		}
1868 		if (vd->vdev_isl2cache)
1869 			spa_l2cache_remove(vd);
1870 		vdev_clear_stats(vd);
1871 		(void) vdev_close(vd);
1872 	}
1873 }
1874 
1875 /*
1876  * Pool Creation
1877  */
1878 int
1879 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
1880     const char *history_str)
1881 {
1882 	spa_t *spa;
1883 	char *altroot = NULL;
1884 	vdev_t *rvd;
1885 	dsl_pool_t *dp;
1886 	dmu_tx_t *tx;
1887 	int c, error = 0;
1888 	uint64_t txg = TXG_INITIAL;
1889 	nvlist_t **spares, **l2cache;
1890 	uint_t nspares, nl2cache;
1891 	uint64_t version;
1892 
1893 	/*
1894 	 * If this pool already exists, return failure.
1895 	 */
1896 	mutex_enter(&spa_namespace_lock);
1897 	if (spa_lookup(pool) != NULL) {
1898 		mutex_exit(&spa_namespace_lock);
1899 		return (EEXIST);
1900 	}
1901 
1902 	/*
1903 	 * Allocate a new spa_t structure.
1904 	 */
1905 	(void) nvlist_lookup_string(props,
1906 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
1907 	spa = spa_add(pool, altroot);
1908 	spa_activate(spa);
1909 
1910 	spa->spa_uberblock.ub_txg = txg - 1;
1911 
1912 	if (props && (error = spa_prop_validate(spa, props))) {
1913 		spa_unload(spa);
1914 		spa_deactivate(spa);
1915 		spa_remove(spa);
1916 		return (error);
1917 	}
1918 
1919 	if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION),
1920 	    &version) != 0)
1921 		version = SPA_VERSION;
1922 	ASSERT(version <= SPA_VERSION);
1923 	spa->spa_uberblock.ub_version = version;
1924 	spa->spa_ubsync = spa->spa_uberblock;
1925 
1926 	/*
1927 	 * Create the root vdev.
1928 	 */
1929 	spa_config_enter(spa, RW_WRITER, FTAG);
1930 
1931 	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
1932 
1933 	ASSERT(error != 0 || rvd != NULL);
1934 	ASSERT(error != 0 || spa->spa_root_vdev == rvd);
1935 
1936 	if (error == 0 && !zfs_allocatable_devs(nvroot))
1937 		error = EINVAL;
1938 
1939 	if (error == 0 &&
1940 	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
1941 	    (error = spa_validate_aux(spa, nvroot, txg,
1942 	    VDEV_ALLOC_ADD)) == 0) {
1943 		for (c = 0; c < rvd->vdev_children; c++)
1944 			vdev_init(rvd->vdev_child[c], txg);
1945 		vdev_config_dirty(rvd);
1946 	}
1947 
1948 	spa_config_exit(spa, FTAG);
1949 
1950 	if (error != 0) {
1951 		spa_unload(spa);
1952 		spa_deactivate(spa);
1953 		spa_remove(spa);
1954 		mutex_exit(&spa_namespace_lock);
1955 		return (error);
1956 	}
1957 
1958 	/*
1959 	 * Get the list of spares, if specified.
1960 	 */
1961 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1962 	    &spares, &nspares) == 0) {
1963 		VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
1964 		    KM_SLEEP) == 0);
1965 		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
1966 		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
1967 		spa_config_enter(spa, RW_WRITER, FTAG);
1968 		spa_load_spares(spa);
1969 		spa_config_exit(spa, FTAG);
1970 		spa->spa_spares.sav_sync = B_TRUE;
1971 	}
1972 
1973 	/*
1974 	 * Get the list of level 2 cache devices, if specified.
1975 	 */
1976 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
1977 	    &l2cache, &nl2cache) == 0) {
1978 		VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
1979 		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
1980 		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
1981 		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
1982 		spa_config_enter(spa, RW_WRITER, FTAG);
1983 		spa_load_l2cache(spa);
1984 		spa_config_exit(spa, FTAG);
1985 		spa->spa_l2cache.sav_sync = B_TRUE;
1986 	}
1987 
1988 	spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
1989 	spa->spa_meta_objset = dp->dp_meta_objset;
1990 
1991 	tx = dmu_tx_create_assigned(dp, txg);
1992 
1993 	/*
1994 	 * Create the pool config object.
1995 	 */
1996 	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
1997 	    DMU_OT_PACKED_NVLIST, 1 << 14,
1998 	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
1999 
2000 	if (zap_add(spa->spa_meta_objset,
2001 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
2002 	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
2003 		cmn_err(CE_PANIC, "failed to add pool config");
2004 	}
2005 
2006 	/* Newly created pools with the right version are always deflated. */
2007 	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
2008 		spa->spa_deflate = TRUE;
2009 		if (zap_add(spa->spa_meta_objset,
2010 		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
2011 		    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
2012 			cmn_err(CE_PANIC, "failed to add deflate");
2013 		}
2014 	}
2015 
2016 	/*
2017 	 * Create the deferred-free bplist object.  Turn off compression
2018 	 * because sync-to-convergence takes longer if the blocksize
2019 	 * keeps changing.
2020 	 */
2021 	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
2022 	    1 << 14, tx);
2023 	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
2024 	    ZIO_COMPRESS_OFF, tx);
2025 
2026 	if (zap_add(spa->spa_meta_objset,
2027 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
2028 	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
2029 		cmn_err(CE_PANIC, "failed to add bplist");
2030 	}
2031 
2032 	/*
2033 	 * Create the pool's history object.
2034 	 */
2035 	if (version >= SPA_VERSION_ZPOOL_HISTORY)
2036 		spa_history_create_obj(spa, tx);
2037 
2038 	/*
2039 	 * Set pool properties.
2040 	 */
2041 	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
2042 	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
2043 	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
2044 	if (props)
2045 		spa_sync_props(spa, props, CRED(), tx);
2046 
2047 	dmu_tx_commit(tx);
2048 
2049 	spa->spa_sync_on = B_TRUE;
2050 	txg_sync_start(spa->spa_dsl_pool);
2051 
2052 	/*
2053 	 * We explicitly wait for the first transaction to complete so that our
2054 	 * bean counters are appropriately updated.
2055 	 */
2056 	txg_wait_synced(spa->spa_dsl_pool, txg);
2057 
2058 	spa_config_sync();
2059 
2060 	if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL)
2061 		(void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE);
2062 
2063 	mutex_exit(&spa_namespace_lock);
2064 
2065 	return (0);
2066 }
2067 
2068 /*
2069  * Import the given pool into the system.  We set up the necessary spa_t and
2070  * then call spa_load() to do the dirty work.
2071  */
2072 int
2073 spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
2074 {
2075 	spa_t *spa;
2076 	char *altroot = NULL;
2077 	int error;
2078 	nvlist_t *nvroot;
2079 	nvlist_t **spares, **l2cache;
2080 	uint_t nspares, nl2cache;
2081 
2082 	/*
2083 	 * If a pool with this name exists, return failure.
2084 	 */
2085 	mutex_enter(&spa_namespace_lock);
2086 	if (spa_lookup(pool) != NULL) {
2087 		mutex_exit(&spa_namespace_lock);
2088 		return (EEXIST);
2089 	}
2090 
2091 	/*
2092 	 * Create and initialize the spa structure.
2093 	 */
2094 	(void) nvlist_lookup_string(props,
2095 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
2096 	spa = spa_add(pool, altroot);
2097 	spa_activate(spa);
2098 
2099 	/*
2100 	 * Pass off the heavy lifting to spa_load().
2101 	 * Pass TRUE for mosconfig because the user-supplied config
2102 	 * is actually the one to trust when doing an import.
2103 	 */
2104 	error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);
2105 
2106 	spa_config_enter(spa, RW_WRITER, FTAG);
2107 	/*
2108 	 * Toss any existing sparelist, as it doesn't have any validity anymore,
2109 	 * and conflicts with spa_has_spare().
2110 	 */
2111 	if (spa->spa_spares.sav_config) {
2112 		nvlist_free(spa->spa_spares.sav_config);
2113 		spa->spa_spares.sav_config = NULL;
2114 		spa_load_spares(spa);
2115 	}
2116 	if (spa->spa_l2cache.sav_config) {
2117 		nvlist_free(spa->spa_l2cache.sav_config);
2118 		spa->spa_l2cache.sav_config = NULL;
2119 		spa_load_l2cache(spa);
2120 	}
2121 
2122 	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
2123 	    &nvroot) == 0);
2124 	if (error == 0)
2125 		error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_SPARE);
2126 	if (error == 0)
2127 		error = spa_validate_aux(spa, nvroot, -1ULL,
2128 		    VDEV_ALLOC_L2CACHE);
2129 	spa_config_exit(spa, FTAG);
2130 
2131 	if (error != 0 || (props && (error = spa_prop_set(spa, props)))) {
2132 		spa_unload(spa);
2133 		spa_deactivate(spa);
2134 		spa_remove(spa);
2135 		mutex_exit(&spa_namespace_lock);
2136 		return (error);
2137 	}
2138 
2139 	/*
2140 	 * Override any spares and level 2 cache devices as specified by
2141 	 * the user, as these may have correct device names/devids, etc.
2142 	 */
2143 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
2144 	    &spares, &nspares) == 0) {
2145 		if (spa->spa_spares.sav_config)
2146 			VERIFY(nvlist_remove(spa->spa_spares.sav_config,
2147 			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
2148 		else
2149 			VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
2150 			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
2151 		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
2152 		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
2153 		spa_config_enter(spa, RW_WRITER, FTAG);
2154 		spa_load_spares(spa);
2155 		spa_config_exit(spa, FTAG);
2156 		spa->spa_spares.sav_sync = B_TRUE;
2157 	}
2158 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
2159 	    &l2cache, &nl2cache) == 0) {
2160 		if (spa->spa_l2cache.sav_config)
2161 			VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
2162 			    ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
2163 		else
2164 			VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
2165 			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
2166 		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
2167 		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
2168 		spa_config_enter(spa, RW_WRITER, FTAG);
2169 		spa_load_l2cache(spa);
2170 		spa_config_exit(spa, FTAG);
2171 		spa->spa_l2cache.sav_sync = B_TRUE;
2172 	}
2173 
2174 	/*
2175 	 * Update the config cache to include the newly-imported pool.
2176 	 */
2177 	if (spa_mode & FWRITE)
2178 		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
2179 
2180 	/*
2181 	 * Resilver anything that's out of date.
2182 	 */
2183 	if (spa_mode & FWRITE)
2184 		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
2185 
2186 	mutex_exit(&spa_namespace_lock);
2187 
2188 	return (0);
2189 }
2190 
2191 /*
2192  * This (illegal) pool name is used when temporarily importing a spa_t in order
2193  * to get the vdev stats associated with the imported devices.
2194  */
2195 #define	TRYIMPORT_NAME	"$import"
2196 
2197 nvlist_t *
2198 spa_tryimport(nvlist_t *tryconfig)
2199 {
2200 	nvlist_t *config = NULL;
2201 	char *poolname;
2202 	spa_t *spa;
2203 	uint64_t state;
2204 
2205 	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
2206 		return (NULL);
2207 
2208 	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
2209 		return (NULL);
2210 
2211 	/*
2212 	 * Create and initialize the spa structure.
2213 	 */
2214 	mutex_enter(&spa_namespace_lock);
2215 	spa = spa_add(TRYIMPORT_NAME, NULL);
2216 	spa_activate(spa);
2217 
2218 	/*
2219 	 * Pass off the heavy lifting to spa_load().
2220 	 * Pass TRUE for mosconfig because the user-supplied config
2221 	 * is actually the one to trust when doing an import.
2222 	 */
2223 	(void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);
2224 
2225 	/*
2226 	 * If 'tryconfig' was at least parsable, return the current config.
2227 	 */
2228 	if (spa->spa_root_vdev != NULL) {
2229 		spa_config_enter(spa, RW_READER, FTAG);
2230 		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
2231 		spa_config_exit(spa, FTAG);
2232 		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
2233 		    poolname) == 0);
2234 		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
2235 		    state) == 0);
2236 		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
2237 		    spa->spa_uberblock.ub_timestamp) == 0);
2238 
2239 		/*
2240 		 * Add the list of hot spares and level 2 cache devices.
2241 		 */
2242 		spa_add_spares(spa, config);
2243 		spa_add_l2cache(spa, config);
2244 	}
2245 
2246 	spa_unload(spa);
2247 	spa_deactivate(spa);
2248 	spa_remove(spa);
2249 	mutex_exit(&spa_namespace_lock);
2250 
2251 	return (config);
2252 }
2253 
2254 /*
2255  * Pool export/destroy
2256  *
2257  * The act of destroying or exporting a pool is very simple.  We make sure there
2258  * is no more pending I/O and any references to the pool are gone.  Then, we
2259  * update the pool state and sync all the labels to disk, removing the
2260  * configuration from the cache afterwards.
2261  */
2262 static int
2263 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig)
2264 {
2265 	spa_t *spa;
2266 
2267 	if (oldconfig)
2268 		*oldconfig = NULL;
2269 
2270 	if (!(spa_mode & FWRITE))
2271 		return (EROFS);
2272 
2273 	mutex_enter(&spa_namespace_lock);
2274 	if ((spa = spa_lookup(pool)) == NULL) {
2275 		mutex_exit(&spa_namespace_lock);
2276 		return (ENOENT);
2277 	}
2278 
2279 	/*
2280 	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
2281 	 * reacquire the namespace lock, and see if we can export.
2282 	 */
2283 	spa_open_ref(spa, FTAG);
2284 	mutex_exit(&spa_namespace_lock);
2285 	spa_async_suspend(spa);
2286 	mutex_enter(&spa_namespace_lock);
2287 	spa_close(spa, FTAG);
2288 
2289 	/*
2290 	 * The pool will be in core if it's openable,
2291 	 * in which case we can modify its state.
2292 	 */
2293 	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
2294 		/*
2295 		 * Objsets may be open only because they're dirty, so we
2296 		 * have to force it to sync before checking spa_refcnt.
2297 		 */
2298 		spa_scrub_suspend(spa);
2299 		txg_wait_synced(spa->spa_dsl_pool, 0);
2300 
2301 		/*
2302 		 * A pool cannot be exported or destroyed if there are active
2303 		 * references.  If we are resetting a pool, allow references by
2304 		 * fault injection handlers.
2305 		 */
2306 		if (!spa_refcount_zero(spa) ||
2307 		    (spa->spa_inject_ref != 0 &&
2308 		    new_state != POOL_STATE_UNINITIALIZED)) {
2309 			spa_scrub_resume(spa);
2310 			spa_async_resume(spa);
2311 			mutex_exit(&spa_namespace_lock);
2312 			return (EBUSY);
2313 		}
2314 
2315 		spa_scrub_resume(spa);
2316 		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
2317 
2318 		/*
2319 		 * We want this to be reflected on every label,
2320 		 * so mark them all dirty.  spa_unload() will do the
2321 		 * final sync that pushes these changes out.
2322 		 */
2323 		if (new_state != POOL_STATE_UNINITIALIZED) {
2324 			spa_config_enter(spa, RW_WRITER, FTAG);
2325 			spa->spa_state = new_state;
2326 			spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
2327 			vdev_config_dirty(spa->spa_root_vdev);
2328 			spa_config_exit(spa, FTAG);
2329 		}
2330 	}
2331 
2332 	spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);
2333 
2334 	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
2335 		spa_unload(spa);
2336 		spa_deactivate(spa);
2337 	}
2338 
2339 	if (oldconfig && spa->spa_config)
2340 		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
2341 
2342 	if (new_state != POOL_STATE_UNINITIALIZED) {
2343 		spa_config_check(spa->spa_config_dir,
2344 		    spa->spa_config_file);
2345 		spa_remove(spa);
2346 		spa_config_sync();
2347 	}
2348 	mutex_exit(&spa_namespace_lock);
2349 
2350 	return (0);
2351 }
2352 
2353 /*
2354  * Destroy a storage pool.
2355  */
2356 int
2357 spa_destroy(char *pool)
2358 {
2359 	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL));
2360 }
2361 
2362 /*
2363  * Export a storage pool.
2364  */
2365 int
2366 spa_export(char *pool, nvlist_t **oldconfig)
2367 {
2368 	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig));
2369 }
2370 
2371 /*
2372  * Similar to spa_export(), this unloads the spa_t without actually removing it
2373  * from the namespace in any way.
2374  */
2375 int
2376 spa_reset(char *pool)
2377 {
2378 	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL));
2379 }
2380 
2381 
2382 /*
2383  * ==========================================================================
2384  * Device manipulation
2385  * ==========================================================================
2386  */
2387 
2388 /*
2389  * Add a device to a storage pool.
2390  */
2391 int
2392 spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
2393 {
2394 	uint64_t txg;
2395 	int c, error;
2396 	vdev_t *rvd = spa->spa_root_vdev;
2397 	vdev_t *vd, *tvd;
2398 	nvlist_t **spares, **l2cache;
2399 	uint_t nspares, nl2cache;
2400 
2401 	txg = spa_vdev_enter(spa);
2402 
2403 	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
2404 	    VDEV_ALLOC_ADD)) != 0)
2405 		return (spa_vdev_exit(spa, NULL, txg, error));
2406 
2407 	spa->spa_pending_vdev = vd;
2408 
2409 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
2410 	    &nspares) != 0)
2411 		nspares = 0;
2412 
2413 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
2414 	    &nl2cache) != 0)
2415 		nl2cache = 0;
2416 
2417 	if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) {
2418 		spa->spa_pending_vdev = NULL;
2419 		return (spa_vdev_exit(spa, vd, txg, EINVAL));
2420 	}
2421 
2422 	if (vd->vdev_children != 0) {
2423 		if ((error = vdev_create(vd, txg, B_FALSE)) != 0) {
2424 			spa->spa_pending_vdev = NULL;
2425 			return (spa_vdev_exit(spa, vd, txg, error));
2426 		}
2427 	}
2428 
2429 	/*
2430 	 * We must validate the spares and l2cache devices after checking the
2431 	 * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
2432 	 */
2433 	if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) {
2434 		spa->spa_pending_vdev = NULL;
2435 		return (spa_vdev_exit(spa, vd, txg, error));
2436 	}
2437 
2438 	spa->spa_pending_vdev = NULL;
2439 
2440 	/*
2441 	 * Transfer each new top-level vdev from vd to rvd.
2442 	 */
2443 	for (c = 0; c < vd->vdev_children; c++) {
2444 		tvd = vd->vdev_child[c];
2445 		vdev_remove_child(vd, tvd);
2446 		tvd->vdev_id = rvd->vdev_children;
2447 		vdev_add_child(rvd, tvd);
2448 		vdev_config_dirty(tvd);
2449 	}
2450 
2451 	if (nspares != 0) {
2452 		spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
2453 		    ZPOOL_CONFIG_SPARES);
2454 		spa_load_spares(spa);
2455 		spa->spa_spares.sav_sync = B_TRUE;
2456 	}
2457 
2458 	if (nl2cache != 0) {
2459 		spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
2460 		    ZPOOL_CONFIG_L2CACHE);
2461 		spa_load_l2cache(spa);
2462 		spa->spa_l2cache.sav_sync = B_TRUE;
2463 	}
2464 
2465 	/*
2466 	 * We have to be careful when adding new vdevs to an existing pool.
2467 	 * If other threads start allocating from these vdevs before we
2468 	 * sync the config cache, and we lose power, then upon reboot we may
2469 	 * fail to open the pool because there are DVAs that the config cache
2470 	 * can't translate.  Therefore, we first add the vdevs without
2471 	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
2472 	 * and then let spa_config_update() initialize the new metaslabs.
2473 	 *
2474 	 * spa_load() checks for added-but-not-initialized vdevs, so that
2475 	 * if we lose power at any point in this sequence, the remaining
2476 	 * steps will be completed the next time we load the pool.
2477 	 */
2478 	(void) spa_vdev_exit(spa, vd, txg, 0);
2479 
2480 	mutex_enter(&spa_namespace_lock);
2481 	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
2482 	mutex_exit(&spa_namespace_lock);
2483 
2484 	return (0);
2485 }
2486 
2487 /*
2488  * Attach a device to a mirror.  The arguments are the path to any device
2489  * in the mirror, and the nvroot for the new device.  If the path specifies
2490  * a device that is not mirrored, we automatically insert the mirror vdev.
2491  *
2492  * If 'replacing' is specified, the new device is intended to replace the
2493  * existing device; in this case the two devices are made into their own
2494  * mirror using the 'replacing' vdev, which is functionally identical to
2495  * the mirror vdev (it actually reuses all the same ops) but has a few
2496  * extra rules: you can't attach to it after it's been created, and upon
2497  * completion of resilvering, the first disk (the one being replaced)
2498  * is automatically detached.
2499  */
2500 int
2501 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
2502 {
2503 	uint64_t txg, open_txg;
2504 	int error;
2505 	vdev_t *rvd = spa->spa_root_vdev;
2506 	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
2507 	vdev_ops_t *pvops;
2508 	int is_log;
2509 
2510 	txg = spa_vdev_enter(spa);
2511 
2512 	oldvd = vdev_lookup_by_guid(rvd, guid);
2513 
2514 	if (oldvd == NULL)
2515 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
2516 
2517 	if (!oldvd->vdev_ops->vdev_op_leaf)
2518 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
2519 
2520 	pvd = oldvd->vdev_parent;
2521 
2522 	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
2523 	    VDEV_ALLOC_ADD)) != 0)
2524 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
2525 
2526 	if (newrootvd->vdev_children != 1)
2527 		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
2528 
2529 	newvd = newrootvd->vdev_child[0];
2530 
2531 	if (!newvd->vdev_ops->vdev_op_leaf)
2532 		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
2533 
2534 	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
2535 		return (spa_vdev_exit(spa, newrootvd, txg, error));
2536 
2537 	/*
2538 	 * Spares can't replace logs
2539 	 */
2540 	is_log = oldvd->vdev_islog;
2541 	if (is_log && newvd->vdev_isspare)
2542 		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
2543 
2544 	if (!replacing) {
2545 		/*
2546 		 * For attach, the only allowable parent is a mirror or the root
2547 		 * vdev.
2548 		 */
2549 		if (pvd->vdev_ops != &vdev_mirror_ops &&
2550 		    pvd->vdev_ops != &vdev_root_ops)
2551 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
2552 
2553 		pvops = &vdev_mirror_ops;
2554 	} else {
2555 		/*
2556 		 * Active hot spares can only be replaced by inactive hot
2557 		 * spares.
2558 		 */
2559 		if (pvd->vdev_ops == &vdev_spare_ops &&
2560 		    pvd->vdev_child[1] == oldvd &&
2561 		    !spa_has_spare(spa, newvd->vdev_guid))
2562 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
2563 
2564 		/*
2565 		 * If the source is a hot spare, and the parent isn't already a
2566 		 * spare, then we want to create a new hot spare.  Otherwise, we
2567 		 * want to create a replacing vdev.  The user is not allowed to
2568 		 * attach to a spared vdev child unless the 'isspare' state is
2569 		 * the same (spare replaces spare, non-spare replaces
2570 		 * non-spare).
2571 		 */
2572 		if (pvd->vdev_ops == &vdev_replacing_ops)
2573 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
2574 		else if (pvd->vdev_ops == &vdev_spare_ops &&
2575 		    newvd->vdev_isspare != oldvd->vdev_isspare)
2576 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
2577 		else if (pvd->vdev_ops != &vdev_spare_ops &&
2578 		    newvd->vdev_isspare)
2579 			pvops = &vdev_spare_ops;
2580 		else
2581 			pvops = &vdev_replacing_ops;
2582 	}
2583 
2584 	/*
2585 	 * Compare the new device size with the replaceable/attachable
2586 	 * device size.
2587 	 */
2588 	if (newvd->vdev_psize < vdev_get_rsize(oldvd))
2589 		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
2590 
2591 	/*
2592 	 * The new device cannot have a higher alignment requirement
2593 	 * than the top-level vdev.
2594 	 */
2595 	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
2596 		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
2597 
2598 	/*
2599 	 * If this is an in-place replacement, update oldvd's path and devid
2600 	 * to make it distinguishable from newvd, and unopenable from now on.
2601 	 */
2602 	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
2603 		spa_strfree(oldvd->vdev_path);
2604 		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
2605 		    KM_SLEEP);
2606 		(void) sprintf(oldvd->vdev_path, "%s/%s",
2607 		    newvd->vdev_path, "old");
2608 		if (oldvd->vdev_devid != NULL) {
2609 			spa_strfree(oldvd->vdev_devid);
2610 			oldvd->vdev_devid = NULL;
2611 		}
2612 	}
2613 
2614 	/*
2615 	 * If the parent is not a mirror, or if we're replacing, insert the new
2616 	 * mirror/replacing/spare vdev above oldvd.
2617 	 */
2618 	if (pvd->vdev_ops != pvops)
2619 		pvd = vdev_add_parent(oldvd, pvops);
2620 
2621 	ASSERT(pvd->vdev_top->vdev_parent == rvd);
2622 	ASSERT(pvd->vdev_ops == pvops);
2623 	ASSERT(oldvd->vdev_parent == pvd);
2624 
2625 	/*
2626 	 * Extract the new device from its root and add it to pvd.
2627 	 */
2628 	vdev_remove_child(newrootvd, newvd);
2629 	newvd->vdev_id = pvd->vdev_children;
2630 	vdev_add_child(pvd, newvd);
2631 
2632 	/*
2633 	 * If newvd is smaller than oldvd, but larger than its rsize,
2634 	 * the addition of newvd may have decreased our parent's asize.
2635 	 */
2636 	pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize);
2637 
2638 	tvd = newvd->vdev_top;
2639 	ASSERT(pvd->vdev_top == tvd);
2640 	ASSERT(tvd->vdev_parent == rvd);
2641 
2642 	vdev_config_dirty(tvd);
2643 
2644 	/*
2645 	 * Set newvd's DTL to [TXG_INITIAL, open_txg].  It will propagate
2646 	 * upward when spa_vdev_exit() calls vdev_dtl_reassess().
2647 	 */
2648 	open_txg = txg + TXG_CONCURRENT_STATES - 1;
2649 
2650 	mutex_enter(&newvd->vdev_dtl_lock);
2651 	space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
2652 	    open_txg - TXG_INITIAL + 1);
2653 	mutex_exit(&newvd->vdev_dtl_lock);
2654 
2655 	if (newvd->vdev_isspare)
2656 		spa_spare_activate(newvd);
2657 
2658 	/*
2659 	 * Mark newvd's DTL dirty in this txg.
2660 	 */
2661 	vdev_dirty(tvd, VDD_DTL, newvd, txg);
2662 
2663 	(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);
2664 
2665 	/*
2666 	 * Kick off a resilver to update newvd.  We need to grab the namespace
2667 	 * lock because spa_scrub() needs to post a sysevent with the pool name.
2668 	 */
2669 	mutex_enter(&spa_namespace_lock);
2670 	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
2671 	mutex_exit(&spa_namespace_lock);
2672 
2673 	return (0);
2674 }
2675 
2676 /*
2677  * Detach a device from a mirror or replacing vdev.
2678  * If 'replace_done' is specified, only detach if the parent
2679  * is a replacing vdev.
2680  */
2681 int
2682 spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
2683 {
2684 	uint64_t txg;
2685 	int c, t, error;
2686 	vdev_t *rvd = spa->spa_root_vdev;
2687 	vdev_t *vd, *pvd, *cvd, *tvd;
2688 	boolean_t unspare = B_FALSE;
2689 	uint64_t unspare_guid;
2690 
2691 	txg = spa_vdev_enter(spa);
2692 
2693 	vd = vdev_lookup_by_guid(rvd, guid);
2694 
2695 	if (vd == NULL)
2696 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
2697 
2698 	if (!vd->vdev_ops->vdev_op_leaf)
2699 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
2700 
2701 	pvd = vd->vdev_parent;
2702 
2703 	/*
2704 	 * If replace_done is specified, only remove this device if it's
2705 	 * the first child of a replacing vdev.  For the 'spare' vdev, either
2706 	 * disk can be removed.
2707 	 */
2708 	if (replace_done) {
2709 		if (pvd->vdev_ops == &vdev_replacing_ops) {
2710 			if (vd->vdev_id != 0)
2711 				return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
2712 		} else if (pvd->vdev_ops != &vdev_spare_ops) {
2713 			return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
2714 		}
2715 	}
2716 
2717 	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
2718 	    spa_version(spa) >= SPA_VERSION_SPARES);
2719 
2720 	/*
2721 	 * Only mirror, replacing, and spare vdevs support detach.
2722 	 */
2723 	if (pvd->vdev_ops != &vdev_replacing_ops &&
2724 	    pvd->vdev_ops != &vdev_mirror_ops &&
2725 	    pvd->vdev_ops != &vdev_spare_ops)
2726 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
2727 
2728 	/*
2729 	 * If there's only one replica, you can't detach it.
2730 	 */
2731 	if (pvd->vdev_children <= 1)
2732 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
2733 
2734 	/*
2735 	 * If all siblings have non-empty DTLs, this device may have the only
2736 	 * valid copy of the data, which means we cannot safely detach it.
2737 	 *
2738 	 * XXX -- as in the vdev_offline() case, we really want a more
2739 	 * precise DTL check.
2740 	 */
2741 	for (c = 0; c < pvd->vdev_children; c++) {
2742 		uint64_t dirty;
2743 
2744 		cvd = pvd->vdev_child[c];
2745 		if (cvd == vd)
2746 			continue;
2747 		if (vdev_is_dead(cvd))
2748 			continue;
2749 		mutex_enter(&cvd->vdev_dtl_lock);
2750 		dirty = cvd->vdev_dtl_map.sm_space |
2751 		    cvd->vdev_dtl_scrub.sm_space;
2752 		mutex_exit(&cvd->vdev_dtl_lock);
2753 		if (!dirty)
2754 			break;
2755 	}
2756 
2757 	/*
2758 	 * If we are a replacing or spare vdev, then we can always detach the
2759 	 * latter child, as that is how one cancels the operation.
2760 	 */
2761 	if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) &&
2762 	    c == pvd->vdev_children)
2763 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
2764 
2765 	/*
2766 	 * If we are detaching the original disk from a spare, then it implies
2767 	 * that the spare should become a real disk, and be removed from the
2768 	 * active spare list for the pool.
2769 	 */
2770 	if (pvd->vdev_ops == &vdev_spare_ops &&
2771 	    vd->vdev_id == 0)
2772 		unspare = B_TRUE;
2773 
2774 	/*
2775 	 * Erase the disk labels so the disk can be used for other things.
2776 	 * This must be done after all other error cases are handled,
2777 	 * but before we disembowel vd (so we can still do I/O to it).
2778 	 * But if we can't do it, don't treat the error as fatal --
2779 	 * it may be that the unwritability of the disk is the reason
2780 	 * it's being detached!
2781 	 */
2782 	error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
2783 
2784 	/*
2785 	 * Remove vd from its parent and compact the parent's children.
2786 	 */
2787 	vdev_remove_child(pvd, vd);
2788 	vdev_compact_children(pvd);
2789 
2790 	/*
2791 	 * Remember one of the remaining children so we can get tvd below.
2792 	 */
2793 	cvd = pvd->vdev_child[0];
2794 
2795 	/*
2796 	 * If we need to remove the remaining child from the list of hot spares,
2797 	 * do it now, marking the vdev as no longer a spare in the process.  We
2798 	 * must do this before vdev_remove_parent(), because that can change the
2799 	 * GUID if it creates a new toplevel GUID.
2800 	 */
2801 	if (unspare) {
2802 		ASSERT(cvd->vdev_isspare);
2803 		spa_spare_remove(cvd);
2804 		unspare_guid = cvd->vdev_guid;
2805 	}
2806 
2807 	/*
2808 	 * If the parent mirror/replacing vdev only has one child,
2809 	 * the parent is no longer needed.  Remove it from the tree.
2810 	 */
2811 	if (pvd->vdev_children == 1)
2812 		vdev_remove_parent(cvd);
2813 
2814 	/*
2815 	 * We don't set tvd until now because the parent we just removed
2816 	 * may have been the previous top-level vdev.
2817 	 */
2818 	tvd = cvd->vdev_top;
2819 	ASSERT(tvd->vdev_parent == rvd);
2820 
2821 	/*
2822 	 * Reevaluate the parent vdev state.
2823 	 */
2824 	vdev_propagate_state(cvd);
2825 
2826 	/*
2827 	 * If the device we just detached was smaller than the others, it may be
2828 	 * possible to add metaslabs (i.e. grow the pool).  vdev_metaslab_init()
2829 	 * can't fail because the existing metaslabs are already in core, so
2830 	 * there's nothing to read from disk.
2831 	 */
2832 	VERIFY(vdev_metaslab_init(tvd, txg) == 0);
2833 
2834 	vdev_config_dirty(tvd);
2835 
2836 	/*
2837 	 * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
2838 	 * vd->vdev_detached is set and free vd's DTL object in syncing context.
2839 	 * But first make sure we're not on any *other* txg's DTL list, to
2840 	 * prevent vd from being accessed after it's freed.
2841 	 */
2842 	for (t = 0; t < TXG_SIZE; t++)
2843 		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
2844 	vd->vdev_detached = B_TRUE;
2845 	vdev_dirty(tvd, VDD_DTL, vd, txg);
2846 
2847 	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
2848 
2849 	error = spa_vdev_exit(spa, vd, txg, 0);
2850 
2851 	/*
2852 	 * If this was the removal of the original device in a hot spare vdev,
2853 	 * then we want to go through and remove the device from the hot spare
2854 	 * list of every other pool.
2855 	 */
2856 	if (unspare) {
2857 		spa = NULL;
2858 		mutex_enter(&spa_namespace_lock);
2859 		while ((spa = spa_next(spa)) != NULL) {
2860 			if (spa->spa_state != POOL_STATE_ACTIVE)
2861 				continue;
2862 
2863 			(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
2864 		}
2865 		mutex_exit(&spa_namespace_lock);
2866 	}
2867 
2868 	return (error);
2869 }
2870 
2871 /*
2872  * Remove a spares vdev from the nvlist config.
2873  */
2874 static int
2875 spa_remove_spares(spa_aux_vdev_t *sav, uint64_t guid, boolean_t unspare,
2876     nvlist_t **spares, int nspares, vdev_t *vd)
2877 {
2878 	nvlist_t *nv, **newspares;
2879 	int i, j;
2880 
2881 	nv = NULL;
2882 	for (i = 0; i < nspares; i++) {
2883 		uint64_t theguid;
2884 
2885 		VERIFY(nvlist_lookup_uint64(spares[i],
2886 		    ZPOOL_CONFIG_GUID, &theguid) == 0);
2887 		if (theguid == guid) {
2888 			nv = spares[i];
2889 			break;
2890 		}
2891 	}
2892 
2893 	/*
2894 	 * Only remove the hot spare if it's not currently in use in this pool.
2895 	 */
2896 	if (nv == NULL && vd == NULL)
2897 		return (ENOENT);
2898 
2899 	if (nv == NULL && vd != NULL)
2900 		return (ENOTSUP);
2901 
2902 	if (!unspare && nv != NULL && vd != NULL)
2903 		return (EBUSY);
2904 
2905 	if (nspares == 1) {
2906 		newspares = NULL;
2907 	} else {
2908 		newspares = kmem_alloc((nspares - 1) * sizeof (void *),
2909 		    KM_SLEEP);
2910 		for (i = 0, j = 0; i < nspares; i++) {
2911 			if (spares[i] != nv)
2912 				VERIFY(nvlist_dup(spares[i],
2913 				    &newspares[j++], KM_SLEEP) == 0);
2914 		}
2915 	}
2916 
2917 	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_SPARES,
2918 	    DATA_TYPE_NVLIST_ARRAY) == 0);
2919 	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
2920 	    ZPOOL_CONFIG_SPARES, newspares, nspares - 1) == 0);
2921 	for (i = 0; i < nspares - 1; i++)
2922 		nvlist_free(newspares[i]);
2923 	kmem_free(newspares, (nspares - 1) * sizeof (void *));
2924 
2925 	return (0);
2926 }
2927 
2928 /*
2929  * Remove an l2cache vdev from the nvlist config.
2930  */
2931 static int
2932 spa_remove_l2cache(spa_aux_vdev_t *sav, uint64_t guid, nvlist_t **l2cache,
2933     int nl2cache, vdev_t *vd)
2934 {
2935 	nvlist_t *nv, **newl2cache;
2936 	int i, j;
2937 
2938 	nv = NULL;
2939 	for (i = 0; i < nl2cache; i++) {
2940 		uint64_t theguid;
2941 
2942 		VERIFY(nvlist_lookup_uint64(l2cache[i],
2943 		    ZPOOL_CONFIG_GUID, &theguid) == 0);
2944 		if (theguid == guid) {
2945 			nv = l2cache[i];
2946 			break;
2947 		}
2948 	}
2949 
2950 	if (vd == NULL) {
2951 		for (i = 0; i < nl2cache; i++) {
2952 			if (sav->sav_vdevs[i]->vdev_guid == guid) {
2953 				vd = sav->sav_vdevs[i];
2954 				break;
2955 			}
2956 		}
2957 	}
2958 
2959 	if (nv == NULL && vd == NULL)
2960 		return (ENOENT);
2961 
2962 	if (nv == NULL && vd != NULL)
2963 		return (ENOTSUP);
2964 
2965 	if (nl2cache == 1) {
2966 		newl2cache = NULL;
2967 	} else {
2968 		newl2cache = kmem_alloc((nl2cache - 1) * sizeof (void *),
2969 		    KM_SLEEP);
2970 		for (i = 0, j = 0; i < nl2cache; i++) {
2971 			if (l2cache[i] != nv)
2972 				VERIFY(nvlist_dup(l2cache[i],
2973 				    &newl2cache[j++], KM_SLEEP) == 0);
2974 		}
2975 	}
2976 
2977 	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
2978 	    DATA_TYPE_NVLIST_ARRAY) == 0);
2979 	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
2980 	    ZPOOL_CONFIG_L2CACHE, newl2cache, nl2cache - 1) == 0);
2981 	for (i = 0; i < nl2cache - 1; i++)
2982 		nvlist_free(newl2cache[i]);
2983 	kmem_free(newl2cache, (nl2cache - 1) * sizeof (void *));
2984 
2985 	return (0);
2986 }
2987 
2988 /*
2989  * Remove a device from the pool.  Currently, this supports removing only hot
2990  * spares and level 2 ARC devices.
2991  */
2992 int
2993 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
2994 {
2995 	vdev_t *vd;
2996 	nvlist_t **spares, **l2cache;
2997 	uint_t nspares, nl2cache;
2998 	int error = 0;
2999 
3000 	spa_config_enter(spa, RW_WRITER, FTAG);
3001 
3002 	vd = spa_lookup_by_guid(spa, guid);
3003 
3004 	if (spa->spa_spares.sav_vdevs != NULL &&
3005 	    spa_spare_exists(guid, NULL) &&
3006 	    nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
3007 	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) {
3008 		if ((error = spa_remove_spares(&spa->spa_spares, guid, unspare,
3009 		    spares, nspares, vd)) != 0)
3010 			goto out;
3011 		spa_load_spares(spa);
3012 		spa->spa_spares.sav_sync = B_TRUE;
3013 		goto out;
3014 	}
3015 
3016 	if (spa->spa_l2cache.sav_vdevs != NULL &&
3017 	    spa_l2cache_exists(guid, NULL) &&
3018 	    nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
3019 	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) {
3020 		if ((error = spa_remove_l2cache(&spa->spa_l2cache, guid,
3021 		    l2cache, nl2cache, vd)) != 0)
3022 			goto out;
3023 		spa_load_l2cache(spa);
3024 		spa->spa_l2cache.sav_sync = B_TRUE;
3025 	}
3026 
3027 out:
3028 	spa_config_exit(spa, FTAG);
3029 	return (error);
3030 }
3031 
3032 /*
3033  * Find any device that's done replacing, or a vdev marked 'unspare' that's
3034  * current spared, so we can detach it.
3035  */
3036 static vdev_t *
3037 spa_vdev_resilver_done_hunt(vdev_t *vd)
3038 {
3039 	vdev_t *newvd, *oldvd;
3040 	int c;
3041 
3042 	for (c = 0; c < vd->vdev_children; c++) {
3043 		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
3044 		if (oldvd != NULL)
3045 			return (oldvd);
3046 	}
3047 
3048 	/*
3049 	 * Check for a completed replacement.
3050 	 */
3051 	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
3052 		oldvd = vd->vdev_child[0];
3053 		newvd = vd->vdev_child[1];
3054 
3055 		mutex_enter(&newvd->vdev_dtl_lock);
3056 		if (newvd->vdev_dtl_map.sm_space == 0 &&
3057 		    newvd->vdev_dtl_scrub.sm_space == 0) {
3058 			mutex_exit(&newvd->vdev_dtl_lock);
3059 			return (oldvd);
3060 		}
3061 		mutex_exit(&newvd->vdev_dtl_lock);
3062 	}
3063 
3064 	/*
3065 	 * Check for a completed resilver with the 'unspare' flag set.
3066 	 */
3067 	if (vd->vdev_ops == &vdev_spare_ops && vd->vdev_children == 2) {
3068 		newvd = vd->vdev_child[0];
3069 		oldvd = vd->vdev_child[1];
3070 
3071 		mutex_enter(&newvd->vdev_dtl_lock);
3072 		if (newvd->vdev_unspare &&
3073 		    newvd->vdev_dtl_map.sm_space == 0 &&
3074 		    newvd->vdev_dtl_scrub.sm_space == 0) {
3075 			newvd->vdev_unspare = 0;
3076 			mutex_exit(&newvd->vdev_dtl_lock);
3077 			return (oldvd);
3078 		}
3079 		mutex_exit(&newvd->vdev_dtl_lock);
3080 	}
3081 
3082 	return (NULL);
3083 }
3084 
3085 static void
3086 spa_vdev_resilver_done(spa_t *spa)
3087 {
3088 	vdev_t *vd;
3089 	vdev_t *pvd;
3090 	uint64_t guid;
3091 	uint64_t pguid = 0;
3092 
3093 	spa_config_enter(spa, RW_READER, FTAG);
3094 
3095 	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
3096 		guid = vd->vdev_guid;
3097 		/*
3098 		 * If we have just finished replacing a hot spared device, then
3099 		 * we need to detach the parent's first child (the original hot
3100 		 * spare) as well.
3101 		 */
3102 		pvd = vd->vdev_parent;
3103 		if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
3104 		    pvd->vdev_id == 0) {
3105 			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
3106 			ASSERT(pvd->vdev_parent->vdev_children == 2);
3107 			pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid;
3108 		}
3109 		spa_config_exit(spa, FTAG);
3110 		if (spa_vdev_detach(spa, guid, B_TRUE) != 0)
3111 			return;
3112 		if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0)
3113 			return;
3114 		spa_config_enter(spa, RW_READER, FTAG);
3115 	}
3116 
3117 	spa_config_exit(spa, FTAG);
3118 }
3119 
3120 /*
3121  * Update the stored path for this vdev.  Dirty the vdev configuration, relying
3122  * on spa_vdev_enter/exit() to synchronize the labels and cache.
3123  */
3124 int
3125 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
3126 {
3127 	vdev_t *rvd, *vd;
3128 	uint64_t txg;
3129 
3130 	rvd = spa->spa_root_vdev;
3131 
3132 	txg = spa_vdev_enter(spa);
3133 
3134 	if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
3135 		/*
3136 		 * Determine if this is a reference to a hot spare or l2cache
3137 		 * device.  If it is, update the path as stored in their
3138 		 * device list.
3139 		 */
3140 		nvlist_t **spares, **l2cache;
3141 		uint_t i, nspares, nl2cache;
3142 
3143 		if (spa->spa_spares.sav_config != NULL) {
3144 			VERIFY(nvlist_lookup_nvlist_array(
3145 			    spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
3146 			    &spares, &nspares) == 0);
3147 			for (i = 0; i < nspares; i++) {
3148 				uint64_t theguid;
3149 				VERIFY(nvlist_lookup_uint64(spares[i],
3150 				    ZPOOL_CONFIG_GUID, &theguid) == 0);
3151 				if (theguid == guid) {
3152 					VERIFY(nvlist_add_string(spares[i],
3153 					    ZPOOL_CONFIG_PATH, newpath) == 0);
3154 					spa_load_spares(spa);
3155 					spa->spa_spares.sav_sync = B_TRUE;
3156 					return (spa_vdev_exit(spa, NULL, txg,
3157 					    0));
3158 				}
3159 			}
3160 		}
3161 
3162 		if (spa->spa_l2cache.sav_config != NULL) {
3163 			VERIFY(nvlist_lookup_nvlist_array(
3164 			    spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE,
3165 			    &l2cache, &nl2cache) == 0);
3166 			for (i = 0; i < nl2cache; i++) {
3167 				uint64_t theguid;
3168 				VERIFY(nvlist_lookup_uint64(l2cache[i],
3169 				    ZPOOL_CONFIG_GUID, &theguid) == 0);
3170 				if (theguid == guid) {
3171 					VERIFY(nvlist_add_string(l2cache[i],
3172 					    ZPOOL_CONFIG_PATH, newpath) == 0);
3173 					spa_load_l2cache(spa);
3174 					spa->spa_l2cache.sav_sync = B_TRUE;
3175 					return (spa_vdev_exit(spa, NULL, txg,
3176 					    0));
3177 				}
3178 			}
3179 		}
3180 
3181 		return (spa_vdev_exit(spa, NULL, txg, ENOENT));
3182 	}
3183 
3184 	if (!vd->vdev_ops->vdev_op_leaf)
3185 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
3186 
3187 	spa_strfree(vd->vdev_path);
3188 	vd->vdev_path = spa_strdup(newpath);
3189 
3190 	vdev_config_dirty(vd->vdev_top);
3191 
3192 	return (spa_vdev_exit(spa, NULL, txg, 0));
3193 }
3194 
3195 /*
3196  * ==========================================================================
3197  * SPA Scrubbing
3198  * ==========================================================================
3199  */
3200 
3201 static void
3202 spa_scrub_io_done(zio_t *zio)
3203 {
3204 	spa_t *spa = zio->io_spa;
3205 
3206 	arc_data_buf_free(zio->io_data, zio->io_size);
3207 
3208 	mutex_enter(&spa->spa_scrub_lock);
3209 	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
3210 		vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev;
3211 		spa->spa_scrub_errors++;
3212 		mutex_enter(&vd->vdev_stat_lock);
3213 		vd->vdev_stat.vs_scrub_errors++;
3214 		mutex_exit(&vd->vdev_stat_lock);
3215 	}
3216 
3217 	if (--spa->spa_scrub_inflight < spa->spa_scrub_maxinflight)
3218 		cv_broadcast(&spa->spa_scrub_io_cv);
3219 
3220 	ASSERT(spa->spa_scrub_inflight >= 0);
3221 
3222 	mutex_exit(&spa->spa_scrub_lock);
3223 }
3224 
3225 static void
3226 spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags,
3227     zbookmark_t *zb)
3228 {
3229 	size_t size = BP_GET_LSIZE(bp);
3230 	void *data;
3231 
3232 	mutex_enter(&spa->spa_scrub_lock);
3233 	/*
3234 	 * Do not give too much work to vdev(s).
3235 	 */
3236 	while (spa->spa_scrub_inflight >= spa->spa_scrub_maxinflight) {
3237 		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
3238 	}
3239 	spa->spa_scrub_inflight++;
3240 	mutex_exit(&spa->spa_scrub_lock);
3241 
3242 	data = arc_data_buf_alloc(size);
3243 
3244 	if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
3245 		flags |= ZIO_FLAG_SPECULATIVE;	/* intent log block */
3246 
3247 	flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL;
3248 
3249 	zio_nowait(zio_read(NULL, spa, bp, data, size,
3250 	    spa_scrub_io_done, NULL, priority, flags, zb));
3251 }
3252 
3253 /* ARGSUSED */
3254 static int
3255 spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
3256 {
3257 	blkptr_t *bp = &bc->bc_blkptr;
3258 	vdev_t *vd = spa->spa_root_vdev;
3259 	dva_t *dva = bp->blk_dva;
3260 	int needs_resilver = B_FALSE;
3261 	int d;
3262 
3263 	if (bc->bc_errno) {
3264 		/*
3265 		 * We can't scrub this block, but we can continue to scrub
3266 		 * the rest of the pool.  Note the error and move along.
3267 		 */
3268 		mutex_enter(&spa->spa_scrub_lock);
3269 		spa->spa_scrub_errors++;
3270 		mutex_exit(&spa->spa_scrub_lock);
3271 
3272 		mutex_enter(&vd->vdev_stat_lock);
3273 		vd->vdev_stat.vs_scrub_errors++;
3274 		mutex_exit(&vd->vdev_stat_lock);
3275 
3276 		return (ERESTART);
3277 	}
3278 
3279 	ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);
3280 
3281 	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
3282 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]));
3283 
3284 		ASSERT(vd != NULL);
3285 
3286 		/*
3287 		 * Keep track of how much data we've examined so that
3288 		 * zpool(1M) status can make useful progress reports.
3289 		 */
3290 		mutex_enter(&vd->vdev_stat_lock);
3291 		vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]);
3292 		mutex_exit(&vd->vdev_stat_lock);
3293 
3294 		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
3295 			if (DVA_GET_GANG(&dva[d])) {
3296 				/*
3297 				 * Gang members may be spread across multiple
3298 				 * vdevs, so the best we can do is look at the
3299 				 * pool-wide DTL.
3300 				 * XXX -- it would be better to change our
3301 				 * allocation policy to ensure that this can't
3302 				 * happen.
3303 				 */
3304 				vd = spa->spa_root_vdev;
3305 			}
3306 			if (vdev_dtl_contains(&vd->vdev_dtl_map,
3307 			    bp->blk_birth, 1))
3308 				needs_resilver = B_TRUE;
3309 		}
3310 	}
3311 
3312 	if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING)
3313 		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
3314 		    ZIO_FLAG_SCRUB, &bc->bc_bookmark);
3315 	else if (needs_resilver)
3316 		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
3317 		    ZIO_FLAG_RESILVER, &bc->bc_bookmark);
3318 
3319 	return (0);
3320 }
3321 
3322 static void
3323 spa_scrub_thread(spa_t *spa)
3324 {
3325 	callb_cpr_t cprinfo;
3326 	traverse_handle_t *th = spa->spa_scrub_th;
3327 	vdev_t *rvd = spa->spa_root_vdev;
3328 	pool_scrub_type_t scrub_type = spa->spa_scrub_type;
3329 	int error = 0;
3330 	boolean_t complete;
3331 
3332 	CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);
3333 
3334 	/*
3335 	 * If we're restarting due to a snapshot create/delete,
3336 	 * wait for that to complete.
3337 	 */
3338 	txg_wait_synced(spa_get_dsl(spa), 0);
3339 
3340 	dprintf("start %s mintxg=%llu maxtxg=%llu\n",
3341 	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
3342 	    spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg);
3343 
3344 	spa_config_enter(spa, RW_WRITER, FTAG);
3345 	vdev_reopen(rvd);		/* purge all vdev caches */
3346 	vdev_config_dirty(rvd);		/* rewrite all disk labels */
3347 	vdev_scrub_stat_update(rvd, scrub_type, B_FALSE);
3348 	spa_config_exit(spa, FTAG);
3349 
3350 	mutex_enter(&spa->spa_scrub_lock);
3351 	spa->spa_scrub_errors = 0;
3352 	spa->spa_scrub_active = 1;
3353 	ASSERT(spa->spa_scrub_inflight == 0);
3354 
3355 	while (!spa->spa_scrub_stop) {
3356 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
3357 		while (spa->spa_scrub_suspended) {
3358 			spa->spa_scrub_active = 0;
3359 			cv_broadcast(&spa->spa_scrub_cv);
3360 			cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
3361 			spa->spa_scrub_active = 1;
3362 		}
3363 		CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock);
3364 
3365 		if (spa->spa_scrub_restart_txg != 0)
3366 			break;
3367 
3368 		mutex_exit(&spa->spa_scrub_lock);
3369 		error = traverse_more(th);
3370 		mutex_enter(&spa->spa_scrub_lock);
3371 		if (error != EAGAIN)
3372 			break;
3373 	}
3374 
3375 	while (spa->spa_scrub_inflight)
3376 		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
3377 
3378 	spa->spa_scrub_active = 0;
3379 	cv_broadcast(&spa->spa_scrub_cv);
3380 
3381 	mutex_exit(&spa->spa_scrub_lock);
3382 
3383 	spa_config_enter(spa, RW_WRITER, FTAG);
3384 
3385 	mutex_enter(&spa->spa_scrub_lock);
3386 
3387 	/*
3388 	 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock
3389 	 * AND the spa config lock to synchronize with any config changes
3390 	 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit().
3391 	 */
3392 	if (spa->spa_scrub_restart_txg != 0)
3393 		error = ERESTART;
3394 
3395 	if (spa->spa_scrub_stop)
3396 		error = EINTR;
3397 
3398 	/*
3399 	 * Even if there were uncorrectable errors, we consider the scrub
3400 	 * completed.  The downside is that if there is a transient error during
3401 	 * a resilver, we won't resilver the data properly to the target.  But
3402 	 * if the damage is permanent (more likely) we will resilver forever,
3403 	 * which isn't really acceptable.  Since there is enough information for
3404 	 * the user to know what has failed and why, this seems like a more
3405 	 * tractable approach.
3406 	 */
3407 	complete = (error == 0);
3408 
3409 	dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
3410 	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
3411 	    spa->spa_scrub_maxtxg, complete ? "done" : "FAILED",
3412 	    error, spa->spa_scrub_errors, spa->spa_scrub_stop);
3413 
3414 	mutex_exit(&spa->spa_scrub_lock);
3415 
3416 	/*
3417 	 * If the scrub/resilver completed, update all DTLs to reflect this.
3418 	 * Whether it succeeded or not, vacate all temporary scrub DTLs.
3419 	 */
3420 	vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1,
3421 	    complete ? spa->spa_scrub_maxtxg : 0, B_TRUE);
3422 	vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete);
3423 	spa_errlog_rotate(spa);
3424 
3425 	if (scrub_type == POOL_SCRUB_RESILVER && complete)
3426 		spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_FINISH);
3427 
3428 	spa_config_exit(spa, FTAG);
3429 
3430 	mutex_enter(&spa->spa_scrub_lock);
3431 
3432 	/*
3433 	 * We may have finished replacing a device.
3434 	 * Let the async thread assess this and handle the detach.
3435 	 */
3436 	spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
3437 
3438 	/*
3439 	 * If we were told to restart, our final act is to start a new scrub.
3440 	 */
3441 	if (error == ERESTART)
3442 		spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ?
3443 		    SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB);
3444 
3445 	spa->spa_scrub_type = POOL_SCRUB_NONE;
3446 	spa->spa_scrub_active = 0;
3447 	spa->spa_scrub_thread = NULL;
3448 	cv_broadcast(&spa->spa_scrub_cv);
3449 	CALLB_CPR_EXIT(&cprinfo);	/* drops &spa->spa_scrub_lock */
3450 	thread_exit();
3451 }
3452 
3453 void
3454 spa_scrub_suspend(spa_t *spa)
3455 {
3456 	mutex_enter(&spa->spa_scrub_lock);
3457 	spa->spa_scrub_suspended++;
3458 	while (spa->spa_scrub_active) {
3459 		cv_broadcast(&spa->spa_scrub_cv);
3460 		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
3461 	}
3462 	while (spa->spa_scrub_inflight)
3463 		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
3464 	mutex_exit(&spa->spa_scrub_lock);
3465 }
3466 
3467 void
3468 spa_scrub_resume(spa_t *spa)
3469 {
3470 	mutex_enter(&spa->spa_scrub_lock);
3471 	ASSERT(spa->spa_scrub_suspended != 0);
3472 	if (--spa->spa_scrub_suspended == 0)
3473 		cv_broadcast(&spa->spa_scrub_cv);
3474 	mutex_exit(&spa->spa_scrub_lock);
3475 }
3476 
3477 void
3478 spa_scrub_restart(spa_t *spa, uint64_t txg)
3479 {
3480 	/*
3481 	 * Something happened (e.g. snapshot create/delete) that means
3482 	 * we must restart any in-progress scrubs.  The itinerary will
3483 	 * fix this properly.
3484 	 */
3485 	mutex_enter(&spa->spa_scrub_lock);
3486 	spa->spa_scrub_restart_txg = txg;
3487 	mutex_exit(&spa->spa_scrub_lock);
3488 }
3489 
3490 int
3491 spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
3492 {
3493 	space_seg_t *ss;
3494 	uint64_t mintxg, maxtxg;
3495 	vdev_t *rvd = spa->spa_root_vdev;
3496 
3497 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
3498 	ASSERT(!spa_config_held(spa, RW_WRITER));
3499 
3500 	if ((uint_t)type >= POOL_SCRUB_TYPES)
3501 		return (ENOTSUP);
3502 
3503 	mutex_enter(&spa->spa_scrub_lock);
3504 
3505 	/*
3506 	 * If there's a scrub or resilver already in progress, stop it.
3507 	 */
3508 	while (spa->spa_scrub_thread != NULL) {
3509 		/*
3510 		 * Don't stop a resilver unless forced.
3511 		 */
3512 		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) {
3513 			mutex_exit(&spa->spa_scrub_lock);
3514 			return (EBUSY);
3515 		}
3516 		spa->spa_scrub_stop = 1;
3517 		cv_broadcast(&spa->spa_scrub_cv);
3518 		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
3519 	}
3520 
3521 	/*
3522 	 * Terminate the previous traverse.
3523 	 */
3524 	if (spa->spa_scrub_th != NULL) {
3525 		traverse_fini(spa->spa_scrub_th);
3526 		spa->spa_scrub_th = NULL;
3527 	}
3528 
3529 	if (rvd == NULL) {
3530 		ASSERT(spa->spa_scrub_stop == 0);
3531 		ASSERT(spa->spa_scrub_type == type);
3532 		ASSERT(spa->spa_scrub_restart_txg == 0);
3533 		mutex_exit(&spa->spa_scrub_lock);
3534 		return (0);
3535 	}
3536 
3537 	mintxg = TXG_INITIAL - 1;
3538 	maxtxg = spa_last_synced_txg(spa) + 1;
3539 
3540 	mutex_enter(&rvd->vdev_dtl_lock);
3541 
3542 	if (rvd->vdev_dtl_map.sm_space == 0) {
3543 		/*
3544 		 * The pool-wide DTL is empty.
3545 		 * If this is a resilver, there's nothing to do except
3546 		 * check whether any in-progress replacements have completed.
3547 		 */
3548 		if (type == POOL_SCRUB_RESILVER) {
3549 			type = POOL_SCRUB_NONE;
3550 			spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
3551 		}
3552 	} else {
3553 		/*
3554 		 * The pool-wide DTL is non-empty.
3555 		 * If this is a normal scrub, upgrade to a resilver instead.
3556 		 */
3557 		if (type == POOL_SCRUB_EVERYTHING)
3558 			type = POOL_SCRUB_RESILVER;
3559 	}
3560 
3561 	if (type == POOL_SCRUB_RESILVER) {
3562 		/*
3563 		 * Determine the resilvering boundaries.
3564 		 *
3565 		 * Note: (mintxg, maxtxg) is an open interval,
3566 		 * i.e. mintxg and maxtxg themselves are not included.
3567 		 *
3568 		 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
3569 		 * so we don't claim to resilver a txg that's still changing.
3570 		 */
3571 		ss = avl_first(&rvd->vdev_dtl_map.sm_root);
3572 		mintxg = ss->ss_start - 1;
3573 		ss = avl_last(&rvd->vdev_dtl_map.sm_root);
3574 		maxtxg = MIN(ss->ss_end, maxtxg);
3575 
3576 		spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START);
3577 	}
3578 
3579 	mutex_exit(&rvd->vdev_dtl_lock);
3580 
3581 	spa->spa_scrub_stop = 0;
3582 	spa->spa_scrub_type = type;
3583 	spa->spa_scrub_restart_txg = 0;
3584 
3585 	if (type != POOL_SCRUB_NONE) {
3586 		spa->spa_scrub_mintxg = mintxg;
3587 		spa->spa_scrub_maxtxg = maxtxg;
3588 		spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
3589 		    ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL,
3590 		    ZIO_FLAG_CANFAIL);
3591 		traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg);
3592 		spa->spa_scrub_thread = thread_create(NULL, 0,
3593 		    spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
3594 	}
3595 
3596 	mutex_exit(&spa->spa_scrub_lock);
3597 
3598 	return (0);
3599 }
3600 
3601 /*
3602  * ==========================================================================
3603  * SPA async task processing
3604  * ==========================================================================
3605  */
3606 
3607 static void
3608 spa_async_remove(spa_t *spa, vdev_t *vd)
3609 {
3610 	vdev_t *tvd;
3611 	int c;
3612 
3613 	for (c = 0; c < vd->vdev_children; c++) {
3614 		tvd = vd->vdev_child[c];
3615 		if (tvd->vdev_remove_wanted) {
3616 			tvd->vdev_remove_wanted = 0;
3617 			vdev_set_state(tvd, B_FALSE, VDEV_STATE_REMOVED,
3618 			    VDEV_AUX_NONE);
3619 			vdev_clear(spa, tvd, B_TRUE);
3620 			vdev_config_dirty(tvd->vdev_top);
3621 		}
3622 		spa_async_remove(spa, tvd);
3623 	}
3624 }
3625 
3626 static void
3627 spa_async_thread(spa_t *spa)
3628 {
3629 	int tasks;
3630 	uint64_t txg;
3631 
3632 	ASSERT(spa->spa_sync_on);
3633 
3634 	mutex_enter(&spa->spa_async_lock);
3635 	tasks = spa->spa_async_tasks;
3636 	spa->spa_async_tasks = 0;
3637 	mutex_exit(&spa->spa_async_lock);
3638 
3639 	/*
3640 	 * See if the config needs to be updated.
3641 	 */
3642 	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
3643 		mutex_enter(&spa_namespace_lock);
3644 		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
3645 		mutex_exit(&spa_namespace_lock);
3646 	}
3647 
3648 	/*
3649 	 * See if any devices need to be marked REMOVED.
3650 	 *
3651 	 * XXX - We avoid doing this when we are in
3652 	 * I/O failure state since spa_vdev_enter() grabs
3653 	 * the namespace lock and would not be able to obtain
3654 	 * the writer config lock.
3655 	 */
3656 	if (tasks & SPA_ASYNC_REMOVE &&
3657 	    spa_state(spa) != POOL_STATE_IO_FAILURE) {
3658 		txg = spa_vdev_enter(spa);
3659 		spa_async_remove(spa, spa->spa_root_vdev);
3660 		(void) spa_vdev_exit(spa, NULL, txg, 0);
3661 	}
3662 
3663 	/*
3664 	 * If any devices are done replacing, detach them.
3665 	 */
3666 	if (tasks & SPA_ASYNC_RESILVER_DONE)
3667 		spa_vdev_resilver_done(spa);
3668 
3669 	/*
3670 	 * Kick off a scrub.  When starting a RESILVER scrub (or an EVERYTHING
3671 	 * scrub which can become a resilver), we need to hold
3672 	 * spa_namespace_lock() because the sysevent we post via
3673 	 * spa_event_notify() needs to get the name of the pool.
3674 	 */
3675 	if (tasks & SPA_ASYNC_SCRUB) {
3676 		mutex_enter(&spa_namespace_lock);
3677 		VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0);
3678 		mutex_exit(&spa_namespace_lock);
3679 	}
3680 
3681 	/*
3682 	 * Kick off a resilver.
3683 	 */
3684 	if (tasks & SPA_ASYNC_RESILVER) {
3685 		mutex_enter(&spa_namespace_lock);
3686 		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
3687 		mutex_exit(&spa_namespace_lock);
3688 	}
3689 
3690 	/*
3691 	 * Let the world know that we're done.
3692 	 */
3693 	mutex_enter(&spa->spa_async_lock);
3694 	spa->spa_async_thread = NULL;
3695 	cv_broadcast(&spa->spa_async_cv);
3696 	mutex_exit(&spa->spa_async_lock);
3697 	thread_exit();
3698 }
3699 
3700 void
3701 spa_async_suspend(spa_t *spa)
3702 {
3703 	mutex_enter(&spa->spa_async_lock);
3704 	spa->spa_async_suspended++;
3705 	while (spa->spa_async_thread != NULL)
3706 		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
3707 	mutex_exit(&spa->spa_async_lock);
3708 }
3709 
3710 void
3711 spa_async_resume(spa_t *spa)
3712 {
3713 	mutex_enter(&spa->spa_async_lock);
3714 	ASSERT(spa->spa_async_suspended != 0);
3715 	spa->spa_async_suspended--;
3716 	mutex_exit(&spa->spa_async_lock);
3717 }
3718 
3719 static void
3720 spa_async_dispatch(spa_t *spa)
3721 {
3722 	mutex_enter(&spa->spa_async_lock);
3723 	if (spa->spa_async_tasks && !spa->spa_async_suspended &&
3724 	    spa->spa_async_thread == NULL &&
3725 	    rootdir != NULL && !vn_is_readonly(rootdir))
3726 		spa->spa_async_thread = thread_create(NULL, 0,
3727 		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
3728 	mutex_exit(&spa->spa_async_lock);
3729 }
3730 
3731 void
3732 spa_async_request(spa_t *spa, int task)
3733 {
3734 	mutex_enter(&spa->spa_async_lock);
3735 	spa->spa_async_tasks |= task;
3736 	mutex_exit(&spa->spa_async_lock);
3737 }
3738 
3739 /*
3740  * ==========================================================================
3741  * SPA syncing routines
3742  * ==========================================================================
3743  */
3744 
3745 static void
3746 spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
3747 {
3748 	bplist_t *bpl = &spa->spa_sync_bplist;
3749 	dmu_tx_t *tx;
3750 	blkptr_t blk;
3751 	uint64_t itor = 0;
3752 	zio_t *zio;
3753 	int error;
3754 	uint8_t c = 1;
3755 
3756 	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD);
3757 
3758 	while (bplist_iterate(bpl, &itor, &blk) == 0)
3759 		zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL));
3760 
3761 	error = zio_wait(zio);
3762 	ASSERT3U(error, ==, 0);
3763 
3764 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
3765 	bplist_vacate(bpl, tx);
3766 
3767 	/*
3768 	 * Pre-dirty the first block so we sync to convergence faster.
3769 	 * (Usually only the first block is needed.)
3770 	 */
3771 	dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
3772 	dmu_tx_commit(tx);
3773 }
3774 
3775 static void
3776 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
3777 {
3778 	char *packed = NULL;
3779 	size_t nvsize = 0;
3780 	dmu_buf_t *db;
3781 
3782 	VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
3783 
3784 	packed = kmem_alloc(nvsize, KM_SLEEP);
3785 
3786 	VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
3787 	    KM_SLEEP) == 0);
3788 
3789 	dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx);
3790 
3791 	kmem_free(packed, nvsize);
3792 
3793 	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
3794 	dmu_buf_will_dirty(db, tx);
3795 	*(uint64_t *)db->db_data = nvsize;
3796 	dmu_buf_rele(db, FTAG);
3797 }
3798 
3799 static void
3800 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
3801     const char *config, const char *entry)
3802 {
3803 	nvlist_t *nvroot;
3804 	nvlist_t **list;
3805 	int i;
3806 
3807 	if (!sav->sav_sync)
3808 		return;
3809 
3810 	/*
3811 	 * Update the MOS nvlist describing the list of available devices.
3812 	 * spa_validate_aux() will have already made sure this nvlist is
3813 	 * valid and the vdevs are labeled appropriately.
3814 	 */
3815 	if (sav->sav_object == 0) {
3816 		sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
3817 		    DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
3818 		    sizeof (uint64_t), tx);
3819 		VERIFY(zap_update(spa->spa_meta_objset,
3820 		    DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
3821 		    &sav->sav_object, tx) == 0);
3822 	}
3823 
3824 	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3825 	if (sav->sav_count == 0) {
3826 		VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
3827 	} else {
3828 		list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
3829 		for (i = 0; i < sav->sav_count; i++)
3830 			list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
3831 			    B_FALSE, B_FALSE, B_TRUE);
3832 		VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
3833 		    sav->sav_count) == 0);
3834 		for (i = 0; i < sav->sav_count; i++)
3835 			nvlist_free(list[i]);
3836 		kmem_free(list, sav->sav_count * sizeof (void *));
3837 	}
3838 
3839 	spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
3840 	nvlist_free(nvroot);
3841 
3842 	sav->sav_sync = B_FALSE;
3843 }
3844 
3845 static void
3846 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
3847 {
3848 	nvlist_t *config;
3849 
3850 	if (list_is_empty(&spa->spa_dirty_list))
3851 		return;
3852 
3853 	config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);
3854 
3855 	if (spa->spa_config_syncing)
3856 		nvlist_free(spa->spa_config_syncing);
3857 	spa->spa_config_syncing = config;
3858 
3859 	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
3860 }
3861 
3862 /*
3863  * Set zpool properties.
3864  */
3865 static void
3866 spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
3867 {
3868 	spa_t *spa = arg1;
3869 	objset_t *mos = spa->spa_meta_objset;
3870 	nvlist_t *nvp = arg2;
3871 	nvpair_t *elem;
3872 	uint64_t intval;
3873 	char *strval, *slash;
3874 	zpool_prop_t prop;
3875 	const char *propname;
3876 	zprop_type_t proptype;
3877 
3878 	elem = NULL;
3879 	while ((elem = nvlist_next_nvpair(nvp, elem))) {
3880 		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
3881 		case ZPOOL_PROP_VERSION:
3882 			/*
3883 			 * Only set version for non-zpool-creation cases
3884 			 * (set/import). spa_create() needs special care
3885 			 * for version setting.
3886 			 */
3887 			if (tx->tx_txg != TXG_INITIAL) {
3888 				VERIFY(nvpair_value_uint64(elem,
3889 				    &intval) == 0);
3890 				ASSERT(intval <= SPA_VERSION);
3891 				ASSERT(intval >= spa_version(spa));
3892 				spa->spa_uberblock.ub_version = intval;
3893 				vdev_config_dirty(spa->spa_root_vdev);
3894 			}
3895 			break;
3896 
3897 		case ZPOOL_PROP_ALTROOT:
3898 			/*
3899 			 * 'altroot' is a non-persistent property. It should
3900 			 * have been set temporarily at creation or import time.
3901 			 */
3902 			ASSERT(spa->spa_root != NULL);
3903 			break;
3904 
3905 		case ZPOOL_PROP_CACHEFILE:
3906 			/*
3907 			 * 'cachefile' is a non-persistent property, but note
3908 			 * an async request that the config cache needs to be
3909 			 * udpated.
3910 			 */
3911 			VERIFY(nvpair_value_string(elem, &strval) == 0);
3912 			if (spa->spa_config_dir)
3913 				spa_strfree(spa->spa_config_dir);
3914 			if (spa->spa_config_file)
3915 				spa_strfree(spa->spa_config_file);
3916 
3917 			if (strval[0] == '\0') {
3918 				spa->spa_config_dir = NULL;
3919 				spa->spa_config_file = NULL;
3920 			} else if (strcmp(strval, "none") == 0) {
3921 				spa->spa_config_dir = spa_strdup(strval);
3922 				spa->spa_config_file = NULL;
3923 			} else {
3924 				/*
3925 				 * If the cachefile is in the root directory,
3926 				 * we will end up with an empty string for
3927 				 * spa_config_dir.  This value is only ever
3928 				 * used when concatenated with '/', so an empty
3929 				 * string still behaves correctly and keeps the
3930 				 * rest of the code simple.
3931 				 */
3932 				slash = strrchr(strval, '/');
3933 				ASSERT(slash != NULL);
3934 				*slash = '\0';
3935 				if (strcmp(strval, spa_config_dir) == 0 &&
3936 				    strcmp(slash + 1, ZPOOL_CACHE_FILE) == 0) {
3937 					spa->spa_config_dir = NULL;
3938 					spa->spa_config_file = NULL;
3939 				} else {
3940 					spa->spa_config_dir =
3941 					    spa_strdup(strval);
3942 					spa->spa_config_file =
3943 					    spa_strdup(slash + 1);
3944 				}
3945 			}
3946 			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
3947 			break;
3948 		default:
3949 			/*
3950 			 * Set pool property values in the poolprops mos object.
3951 			 */
3952 			mutex_enter(&spa->spa_props_lock);
3953 			if (spa->spa_pool_props_object == 0) {
3954 				objset_t *mos = spa->spa_meta_objset;
3955 
3956 				VERIFY((spa->spa_pool_props_object =
3957 				    zap_create(mos, DMU_OT_POOL_PROPS,
3958 				    DMU_OT_NONE, 0, tx)) > 0);
3959 
3960 				VERIFY(zap_update(mos,
3961 				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
3962 				    8, 1, &spa->spa_pool_props_object, tx)
3963 				    == 0);
3964 			}
3965 			mutex_exit(&spa->spa_props_lock);
3966 
3967 			/* normalize the property name */
3968 			propname = zpool_prop_to_name(prop);
3969 			proptype = zpool_prop_get_type(prop);
3970 
3971 			if (nvpair_type(elem) == DATA_TYPE_STRING) {
3972 				ASSERT(proptype == PROP_TYPE_STRING);
3973 				VERIFY(nvpair_value_string(elem, &strval) == 0);
3974 				VERIFY(zap_update(mos,
3975 				    spa->spa_pool_props_object, propname,
3976 				    1, strlen(strval) + 1, strval, tx) == 0);
3977 
3978 			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
3979 				VERIFY(nvpair_value_uint64(elem, &intval) == 0);
3980 
3981 				if (proptype == PROP_TYPE_INDEX) {
3982 					const char *unused;
3983 					VERIFY(zpool_prop_index_to_string(
3984 					    prop, intval, &unused) == 0);
3985 				}
3986 				VERIFY(zap_update(mos,
3987 				    spa->spa_pool_props_object, propname,
3988 				    8, 1, &intval, tx) == 0);
3989 			} else {
3990 				ASSERT(0); /* not allowed */
3991 			}
3992 
3993 			switch (prop) {
3994 			case ZPOOL_PROP_DELEGATION:
3995 				spa->spa_delegation = intval;
3996 				break;
3997 			case ZPOOL_PROP_BOOTFS:
3998 				spa->spa_bootfs = intval;
3999 				break;
4000 			case ZPOOL_PROP_FAILUREMODE:
4001 				spa->spa_failmode = intval;
4002 				break;
4003 			default:
4004 				break;
4005 			}
4006 		}
4007 
4008 		/* log internal history if this is not a zpool create */
4009 		if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY &&
4010 		    tx->tx_txg != TXG_INITIAL) {
4011 			spa_history_internal_log(LOG_POOL_PROPSET,
4012 			    spa, tx, cr, "%s %lld %s",
4013 			    nvpair_name(elem), intval, spa->spa_name);
4014 		}
4015 	}
4016 }
4017 
4018 /*
4019  * Sync the specified transaction group.  New blocks may be dirtied as
4020  * part of the process, so we iterate until it converges.
4021  */
4022 void
4023 spa_sync(spa_t *spa, uint64_t txg)
4024 {
4025 	dsl_pool_t *dp = spa->spa_dsl_pool;
4026 	objset_t *mos = spa->spa_meta_objset;
4027 	bplist_t *bpl = &spa->spa_sync_bplist;
4028 	vdev_t *rvd = spa->spa_root_vdev;
4029 	vdev_t *vd;
4030 	vdev_t *svd[SPA_DVAS_PER_BP];
4031 	int svdcount = 0;
4032 	dmu_tx_t *tx;
4033 	int dirty_vdevs;
4034 
4035 	/*
4036 	 * Lock out configuration changes.
4037 	 */
4038 	spa_config_enter(spa, RW_READER, FTAG);
4039 
4040 	spa->spa_syncing_txg = txg;
4041 	spa->spa_sync_pass = 0;
4042 
4043 	VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));
4044 
4045 	tx = dmu_tx_create_assigned(dp, txg);
4046 
4047 	/*
4048 	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
4049 	 * set spa_deflate if we have no raid-z vdevs.
4050 	 */
4051 	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
4052 	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
4053 		int i;
4054 
4055 		for (i = 0; i < rvd->vdev_children; i++) {
4056 			vd = rvd->vdev_child[i];
4057 			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
4058 				break;
4059 		}
4060 		if (i == rvd->vdev_children) {
4061 			spa->spa_deflate = TRUE;
4062 			VERIFY(0 == zap_add(spa->spa_meta_objset,
4063 			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
4064 			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
4065 		}
4066 	}
4067 
4068 	/*
4069 	 * If anything has changed in this txg, push the deferred frees
4070 	 * from the previous txg.  If not, leave them alone so that we
4071 	 * don't generate work on an otherwise idle system.
4072 	 */
4073 	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
4074 	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
4075 	    !txg_list_empty(&dp->dp_sync_tasks, txg))
4076 		spa_sync_deferred_frees(spa, txg);
4077 
4078 	/*
4079 	 * Iterate to convergence.
4080 	 */
4081 	do {
4082 		spa->spa_sync_pass++;
4083 
4084 		spa_sync_config_object(spa, tx);
4085 		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
4086 		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
4087 		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
4088 		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
4089 		spa_errlog_sync(spa, txg);
4090 		dsl_pool_sync(dp, txg);
4091 
4092 		dirty_vdevs = 0;
4093 		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
4094 			vdev_sync(vd, txg);
4095 			dirty_vdevs++;
4096 		}
4097 
4098 		bplist_sync(bpl, tx);
4099 	} while (dirty_vdevs);
4100 
4101 	bplist_close(bpl);
4102 
4103 	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);
4104 
4105 	/*
4106 	 * Rewrite the vdev configuration (which includes the uberblock)
4107 	 * to commit the transaction group.
4108 	 *
4109 	 * If there are no dirty vdevs, we sync the uberblock to a few
4110 	 * random top-level vdevs that are known to be visible in the
4111 	 * config cache (see spa_vdev_add() for details).  If there *are*
4112 	 * dirty vdevs -- or if the sync to our random subset fails --
4113 	 * then sync the uberblock to all vdevs.
4114 	 */
4115 	if (list_is_empty(&spa->spa_dirty_list)) {
4116 		int children = rvd->vdev_children;
4117 		int c0 = spa_get_random(children);
4118 		int c;
4119 
4120 		for (c = 0; c < children; c++) {
4121 			vd = rvd->vdev_child[(c0 + c) % children];
4122 			if (vd->vdev_ms_array == 0 || vd->vdev_islog)
4123 				continue;
4124 			svd[svdcount++] = vd;
4125 			if (svdcount == SPA_DVAS_PER_BP)
4126 				break;
4127 		}
4128 	}
4129 	if (svdcount == 0 || vdev_config_sync(svd, svdcount, txg) != 0)
4130 		VERIFY3U(vdev_config_sync(rvd->vdev_child,
4131 		    rvd->vdev_children, txg), ==, 0);
4132 
4133 	dmu_tx_commit(tx);
4134 
4135 	/*
4136 	 * Clear the dirty config list.
4137 	 */
4138 	while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
4139 		vdev_config_clean(vd);
4140 
4141 	/*
4142 	 * Now that the new config has synced transactionally,
4143 	 * let it become visible to the config cache.
4144 	 */
4145 	if (spa->spa_config_syncing != NULL) {
4146 		spa_config_set(spa, spa->spa_config_syncing);
4147 		spa->spa_config_txg = txg;
4148 		spa->spa_config_syncing = NULL;
4149 	}
4150 
4151 	/*
4152 	 * Make a stable copy of the fully synced uberblock.
4153 	 * We use this as the root for pool traversals.
4154 	 */
4155 	spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */
4156 
4157 	spa_scrub_suspend(spa);		/* stop scrubbing and finish I/Os */
4158 
4159 	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
4160 	spa->spa_traverse_wanted = 0;
4161 	spa->spa_ubsync = spa->spa_uberblock;
4162 	rw_exit(&spa->spa_traverse_lock);
4163 
4164 	spa_scrub_resume(spa);		/* resume scrub with new ubsync */
4165 
4166 	/*
4167 	 * Clean up the ZIL records for the synced txg.
4168 	 */
4169 	dsl_pool_zil_clean(dp);
4170 
4171 	/*
4172 	 * Update usable space statistics.
4173 	 */
4174 	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
4175 		vdev_sync_done(vd, txg);
4176 
4177 	/*
4178 	 * It had better be the case that we didn't dirty anything
4179 	 * since vdev_config_sync().
4180 	 */
4181 	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
4182 	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
4183 	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
4184 	ASSERT(bpl->bpl_queue == NULL);
4185 
4186 	spa_config_exit(spa, FTAG);
4187 
4188 	/*
4189 	 * If any async tasks have been requested, kick them off.
4190 	 */
4191 	spa_async_dispatch(spa);
4192 }
4193 
4194 /*
4195  * Sync all pools.  We don't want to hold the namespace lock across these
4196  * operations, so we take a reference on the spa_t and drop the lock during the
4197  * sync.
4198  */
4199 void
4200 spa_sync_allpools(void)
4201 {
4202 	spa_t *spa = NULL;
4203 	mutex_enter(&spa_namespace_lock);
4204 	while ((spa = spa_next(spa)) != NULL) {
4205 		if (spa_state(spa) != POOL_STATE_ACTIVE)
4206 			continue;
4207 		spa_open_ref(spa, FTAG);
4208 		mutex_exit(&spa_namespace_lock);
4209 		txg_wait_synced(spa_get_dsl(spa), 0);
4210 		mutex_enter(&spa_namespace_lock);
4211 		spa_close(spa, FTAG);
4212 	}
4213 	mutex_exit(&spa_namespace_lock);
4214 }
4215 
4216 /*
4217  * ==========================================================================
4218  * Miscellaneous routines
4219  * ==========================================================================
4220  */
4221 
4222 /*
4223  * Remove all pools in the system.
4224  */
4225 void
4226 spa_evict_all(void)
4227 {
4228 	spa_t *spa;
4229 
4230 	/*
4231 	 * Remove all cached state.  All pools should be closed now,
4232 	 * so every spa in the AVL tree should be unreferenced.
4233 	 */
4234 	mutex_enter(&spa_namespace_lock);
4235 	while ((spa = spa_next(NULL)) != NULL) {
4236 		/*
4237 		 * Stop async tasks.  The async thread may need to detach
4238 		 * a device that's been replaced, which requires grabbing
4239 		 * spa_namespace_lock, so we must drop it here.
4240 		 */
4241 		spa_open_ref(spa, FTAG);
4242 		mutex_exit(&spa_namespace_lock);
4243 		spa_async_suspend(spa);
4244 		mutex_enter(&spa_namespace_lock);
4245 		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
4246 		spa_close(spa, FTAG);
4247 
4248 		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
4249 			spa_unload(spa);
4250 			spa_deactivate(spa);
4251 		}
4252 		spa_remove(spa);
4253 	}
4254 	mutex_exit(&spa_namespace_lock);
4255 }
4256 
4257 vdev_t *
4258 spa_lookup_by_guid(spa_t *spa, uint64_t guid)
4259 {
4260 	return (vdev_lookup_by_guid(spa->spa_root_vdev, guid));
4261 }
4262 
4263 void
4264 spa_upgrade(spa_t *spa, uint64_t version)
4265 {
4266 	spa_config_enter(spa, RW_WRITER, FTAG);
4267 
4268 	/*
4269 	 * This should only be called for a non-faulted pool, and since a
4270 	 * future version would result in an unopenable pool, this shouldn't be
4271 	 * possible.
4272 	 */
4273 	ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
4274 	ASSERT(version >= spa->spa_uberblock.ub_version);
4275 
4276 	spa->spa_uberblock.ub_version = version;
4277 	vdev_config_dirty(spa->spa_root_vdev);
4278 
4279 	spa_config_exit(spa, FTAG);
4280 
4281 	txg_wait_synced(spa_get_dsl(spa), 0);
4282 }
4283 
4284 boolean_t
4285 spa_has_spare(spa_t *spa, uint64_t guid)
4286 {
4287 	int i;
4288 	uint64_t spareguid;
4289 	spa_aux_vdev_t *sav = &spa->spa_spares;
4290 
4291 	for (i = 0; i < sav->sav_count; i++)
4292 		if (sav->sav_vdevs[i]->vdev_guid == guid)
4293 			return (B_TRUE);
4294 
4295 	for (i = 0; i < sav->sav_npending; i++) {
4296 		if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
4297 		    &spareguid) == 0 && spareguid == guid)
4298 			return (B_TRUE);
4299 	}
4300 
4301 	return (B_FALSE);
4302 }
4303 
4304 /*
4305  * Post a sysevent corresponding to the given event.  The 'name' must be one of
4306  * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
4307  * filled in from the spa and (optionally) the vdev.  This doesn't do anything
4308  * in the userland libzpool, as we don't want consumers to misinterpret ztest
4309  * or zdb as real changes.
4310  */
4311 void
4312 spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
4313 {
4314 #ifdef _KERNEL
4315 	sysevent_t		*ev;
4316 	sysevent_attr_list_t	*attr = NULL;
4317 	sysevent_value_t	value;
4318 	sysevent_id_t		eid;
4319 
4320 	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
4321 	    SE_SLEEP);
4322 
4323 	value.value_type = SE_DATA_TYPE_STRING;
4324 	value.value.sv_string = spa_name(spa);
4325 	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
4326 		goto done;
4327 
4328 	value.value_type = SE_DATA_TYPE_UINT64;
4329 	value.value.sv_uint64 = spa_guid(spa);
4330 	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
4331 		goto done;
4332 
4333 	if (vd) {
4334 		value.value_type = SE_DATA_TYPE_UINT64;
4335 		value.value.sv_uint64 = vd->vdev_guid;
4336 		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
4337 		    SE_SLEEP) != 0)
4338 			goto done;
4339 
4340 		if (vd->vdev_path) {
4341 			value.value_type = SE_DATA_TYPE_STRING;
4342 			value.value.sv_string = vd->vdev_path;
4343 			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
4344 			    &value, SE_SLEEP) != 0)
4345 				goto done;
4346 		}
4347 	}
4348 
4349 	if (sysevent_attach_attributes(ev, attr) != 0)
4350 		goto done;
4351 	attr = NULL;
4352 
4353 	(void) log_sysevent(ev, SE_SLEEP, &eid);
4354 
4355 done:
4356 	if (attr)
4357 		sysevent_free_attr(attr);
4358 	sysevent_free(ev);
4359 #endif
4360 }
4361