xref: /illumos-gate/usr/src/uts/common/fs/zfs/spa.c (revision 23fe25137099961a024bcef84f1daf96b05f04ee)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * This file contains all the routines used when modifying on-disk SPA state.
31  * This includes opening, importing, destroying, exporting a pool, and syncing a
32  * pool.
33  */
34 
35 #include <sys/zfs_context.h>
36 #include <sys/fm/fs/zfs.h>
37 #include <sys/spa_impl.h>
38 #include <sys/zio.h>
39 #include <sys/zio_checksum.h>
40 #include <sys/zio_compress.h>
41 #include <sys/dmu.h>
42 #include <sys/dmu_tx.h>
43 #include <sys/zap.h>
44 #include <sys/zil.h>
45 #include <sys/vdev_impl.h>
46 #include <sys/metaslab.h>
47 #include <sys/uberblock_impl.h>
48 #include <sys/txg.h>
49 #include <sys/avl.h>
50 #include <sys/dmu_traverse.h>
51 #include <sys/unique.h>
52 #include <sys/dsl_pool.h>
53 #include <sys/dsl_dir.h>
54 #include <sys/dsl_prop.h>
55 #include <sys/fs/zfs.h>
56 #include <sys/callb.h>
57 
/*
 * Thread count handed to taskq_create() for each per-zio-type issue and
 * interrupt taskq that spa_activate() creates.
 */
58 int zio_taskq_threads = 8;
59 
60 /*
61  * ==========================================================================
62  * SPA state manipulation (open/create/destroy/import/export)
63  * ==========================================================================
64  */
65 
66 static int
67 spa_error_entry_compare(const void *a, const void *b)
68 {
69 	spa_error_entry_t *sa = (spa_error_entry_t *)a;
70 	spa_error_entry_t *sb = (spa_error_entry_t *)b;
71 	int ret;
72 
73 	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
74 	    sizeof (zbookmark_t));
75 
76 	if (ret < 0)
77 		return (-1);
78 	else if (ret > 0)
79 		return (1);
80 	else
81 		return (0);
82 }
83 
84 /*
85  * Utility function which retrieves copies of the current logs and
86  * re-initializes them in the process.
87  */
88 void
89 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
90 {
91 	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
92 
93 	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
94 	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
95 
96 	avl_create(&spa->spa_errlist_scrub,
97 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
98 	    offsetof(spa_error_entry_t, se_avl));
99 	avl_create(&spa->spa_errlist_last,
100 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
101 	    offsetof(spa_error_entry_t, se_avl));
102 }
103 
104 /*
105  * Activate an uninitialized pool.
106  */
107 static void
108 spa_activate(spa_t *spa)
109 {
110 	int t;
111 
112 	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
113 
114 	spa->spa_state = POOL_STATE_ACTIVE;
115 
116 	spa->spa_normal_class = metaslab_class_create();
117 
118 	for (t = 0; t < ZIO_TYPES; t++) {
119 		spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
120 		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
121 		    TASKQ_PREPOPULATE);
122 		spa->spa_zio_intr_taskq[t] = taskq_create("spa_zio_intr",
123 		    zio_taskq_threads, maxclsyspri, 50, INT_MAX,
124 		    TASKQ_PREPOPULATE);
125 	}
126 
127 	rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);
128 
129 	mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
130 	mutex_init(&spa->spa_config_cache_lock, NULL, MUTEX_DEFAULT, NULL);
131 	mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
132 	mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
133 	mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
134 	mutex_init(&spa->spa_config_lock.scl_lock, NULL, MUTEX_DEFAULT, NULL);
135 	mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL);
136 	mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
137 
138 	list_create(&spa->spa_dirty_list, sizeof (vdev_t),
139 	    offsetof(vdev_t, vdev_dirty_node));
140 
141 	txg_list_create(&spa->spa_vdev_txg_list,
142 	    offsetof(struct vdev, vdev_txg_node));
143 
144 	avl_create(&spa->spa_errlist_scrub,
145 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
146 	    offsetof(spa_error_entry_t, se_avl));
147 	avl_create(&spa->spa_errlist_last,
148 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
149 	    offsetof(spa_error_entry_t, se_avl));
150 }
151 
152 /*
153  * Opposite of spa_activate().
154  */
155 static void
156 spa_deactivate(spa_t *spa)
157 {
158 	int t;
159 
160 	ASSERT(spa->spa_sync_on == B_FALSE);
161 	ASSERT(spa->spa_dsl_pool == NULL);
162 	ASSERT(spa->spa_root_vdev == NULL);
163 
164 	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
165 
166 	txg_list_destroy(&spa->spa_vdev_txg_list);
167 
168 	list_destroy(&spa->spa_dirty_list);
169 
170 	rw_destroy(&spa->spa_traverse_lock);
171 
172 	for (t = 0; t < ZIO_TYPES; t++) {
173 		taskq_destroy(spa->spa_zio_issue_taskq[t]);
174 		taskq_destroy(spa->spa_zio_intr_taskq[t]);
175 		spa->spa_zio_issue_taskq[t] = NULL;
176 		spa->spa_zio_intr_taskq[t] = NULL;
177 	}
178 
179 	metaslab_class_destroy(spa->spa_normal_class);
180 	spa->spa_normal_class = NULL;
181 
182 	/*
183 	 * If this was part of an import or the open otherwise failed, we may
184 	 * still have errors left in the queues.  Empty them just in case.
185 	 */
186 	spa_errlog_drain(spa);
187 
188 	avl_destroy(&spa->spa_errlist_scrub);
189 	avl_destroy(&spa->spa_errlist_last);
190 
191 	spa->spa_state = POOL_STATE_UNINITIALIZED;
192 }
193 
194 /*
195  * Verify a pool configuration, and construct the vdev tree appropriately.  This
196  * will create all the necessary vdevs in the appropriate layout, with each vdev
197  * in the CLOSED state.  This will prep the pool before open/creation/import.
198  * All vdev validation is done by the vdev_alloc() routine.
199  */
200 static int
201 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
202     uint_t id, int atype)
203 {
204 	nvlist_t **child;
205 	uint_t c, children;
206 	int error;
207 
208 	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
209 		return (error);
210 
211 	if ((*vdp)->vdev_ops->vdev_op_leaf)
212 		return (0);
213 
214 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
215 	    &child, &children) != 0) {
216 		vdev_free(*vdp);
217 		*vdp = NULL;
218 		return (EINVAL);
219 	}
220 
221 	for (c = 0; c < children; c++) {
222 		vdev_t *vd;
223 		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
224 		    atype)) != 0) {
225 			vdev_free(*vdp);
226 			*vdp = NULL;
227 			return (error);
228 		}
229 	}
230 
231 	ASSERT(*vdp != NULL);
232 
233 	return (0);
234 }
235 
236 /*
237  * Opposite of spa_load().
238  */
239 static void
240 spa_unload(spa_t *spa)
241 {
242 	int i;
243 
244 	/*
245 	 * Stop async tasks.
246 	 */
247 	spa_async_suspend(spa);
248 
249 	/*
250 	 * Stop syncing.
251 	 */
252 	if (spa->spa_sync_on) {
253 		txg_sync_stop(spa->spa_dsl_pool);
254 		spa->spa_sync_on = B_FALSE;
255 	}
256 
257 	/*
258 	 * Wait for any outstanding prefetch I/O to complete.
259 	 */
260 	spa_config_enter(spa, RW_WRITER, FTAG);
261 	spa_config_exit(spa, FTAG);
262 
263 	/*
264 	 * Close the dsl pool.
265 	 */
266 	if (spa->spa_dsl_pool) {
267 		dsl_pool_close(spa->spa_dsl_pool);
268 		spa->spa_dsl_pool = NULL;
269 	}
270 
271 	/*
272 	 * Close all vdevs.
273 	 */
274 	if (spa->spa_root_vdev)
275 		vdev_free(spa->spa_root_vdev);
276 	ASSERT(spa->spa_root_vdev == NULL);
277 
278 	for (i = 0; i < spa->spa_nspares; i++)
279 		vdev_free(spa->spa_spares[i]);
280 	if (spa->spa_spares) {
281 		kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));
282 		spa->spa_spares = NULL;
283 	}
284 	if (spa->spa_sparelist) {
285 		nvlist_free(spa->spa_sparelist);
286 		spa->spa_sparelist = NULL;
287 	}
288 
289 	spa->spa_async_suspended = 0;
290 }
291 
292 /*
293  * Load (or re-load) the current list of vdevs describing the active spares for
294  * this pool.  When this is called, we have some form of basic information in
295  * 'spa_sparelist'.  We parse this into vdevs, try to open them, and then
296  * re-generate a more complete list including status information.
297  */
298 static void
299 spa_load_spares(spa_t *spa)
300 {
301 	nvlist_t **spares;
302 	uint_t nspares;
303 	int i;
304 	vdev_t *vd, *tvd;
305 
306 	/*
307 	 * First, close and free any existing spare vdevs.
308 	 */
309 	for (i = 0; i < spa->spa_nspares; i++) {
310 		vd = spa->spa_spares[i];
311 
312 		/* Undo the call to spa_activate() below */
313 		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL &&
314 		    tvd->vdev_isspare)
315 			spa_spare_remove(tvd);
316 		vdev_close(vd);
317 		vdev_free(vd);
318 	}
319 
320 	if (spa->spa_spares)
321 		kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));
322 
323 	if (spa->spa_sparelist == NULL)
324 		nspares = 0;
325 	else
326 		VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
327 		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
328 
329 	spa->spa_nspares = (int)nspares;
330 	spa->spa_spares = NULL;
331 
332 	if (nspares == 0)
333 		return;
334 
335 	/*
336 	 * Construct the array of vdevs, opening them to get status in the
337 	 * process.   For each spare, there is potentially two different vdev_t
338 	 * structures associated with it: one in the list of spares (used only
339 	 * for basic validation purposes) and one in the active vdev
340 	 * configuration (if it's spared in).  During this phase we open and
341 	 * validate each vdev on the spare list.  If the vdev also exists in the
342 	 * active configuration, then we also mark this vdev as an active spare.
343 	 */
344 	spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP);
345 	for (i = 0; i < spa->spa_nspares; i++) {
346 		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
347 		    VDEV_ALLOC_SPARE) == 0);
348 		ASSERT(vd != NULL);
349 
350 		spa->spa_spares[i] = vd;
351 
352 		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) {
353 			if (!tvd->vdev_isspare)
354 				spa_spare_add(tvd);
355 
356 			/*
357 			 * We only mark the spare active if we were successfully
358 			 * able to load the vdev.  Otherwise, importing a pool
359 			 * with a bad active spare would result in strange
360 			 * behavior, because multiple pool would think the spare
361 			 * is actively in use.
362 			 *
363 			 * There is a vulnerability here to an equally bizarre
364 			 * circumstance, where a dead active spare is later
365 			 * brought back to life (onlined or otherwise).  Given
366 			 * the rarity of this scenario, and the extra complexity
367 			 * it adds, we ignore the possibility.
368 			 */
369 			if (!vdev_is_dead(tvd))
370 				spa_spare_activate(tvd);
371 		}
372 
373 		if (vdev_open(vd) != 0)
374 			continue;
375 
376 		vd->vdev_top = vd;
377 		(void) vdev_validate_spare(vd);
378 	}
379 
380 	/*
381 	 * Recompute the stashed list of spares, with status information
382 	 * this time.
383 	 */
384 	VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
385 	    DATA_TYPE_NVLIST_ARRAY) == 0);
386 
387 	spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP);
388 	for (i = 0; i < spa->spa_nspares; i++)
389 		spares[i] = vdev_config_generate(spa, spa->spa_spares[i],
390 		    B_TRUE, B_TRUE);
391 	VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
392 	    spares, spa->spa_nspares) == 0);
393 	for (i = 0; i < spa->spa_nspares; i++)
394 		nvlist_free(spares[i]);
395 	kmem_free(spares, spa->spa_nspares * sizeof (void *));
396 }
397 
398 static int
399 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
400 {
401 	dmu_buf_t *db;
402 	char *packed = NULL;
403 	size_t nvsize = 0;
404 	int error;
405 	*value = NULL;
406 
407 	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
408 	nvsize = *(uint64_t *)db->db_data;
409 	dmu_buf_rele(db, FTAG);
410 
411 	packed = kmem_alloc(nvsize, KM_SLEEP);
412 	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed);
413 	if (error == 0)
414 		error = nvlist_unpack(packed, nvsize, value, 0);
415 	kmem_free(packed, nvsize);
416 
417 	return (error);
418 }
419 
420 /*
421  * Load an existing storage pool, using the pool's builtin spa_config as a
422  * source of configuration information.
423  */
424 static int
425 spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
426 {
427 	int error = 0;
428 	nvlist_t *nvroot = NULL;
429 	vdev_t *rvd;
430 	uberblock_t *ub = &spa->spa_uberblock;
431 	uint64_t config_cache_txg = spa->spa_config_txg;
432 	uint64_t pool_guid;
433 	uint64_t version;
434 	zio_t *zio;
435 
436 	spa->spa_load_state = state;
437 
438 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
439 	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
440 		error = EINVAL;
441 		goto out;
442 	}
443 
444 	/*
445 	 * Versioning wasn't explicitly added to the label until later, so if
446 	 * it's not present treat it as the initial version.
447 	 */
448 	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0)
449 		version = ZFS_VERSION_INITIAL;
450 
451 	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
452 	    &spa->spa_config_txg);
453 
454 	if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
455 	    spa_guid_exists(pool_guid, 0)) {
456 		error = EEXIST;
457 		goto out;
458 	}
459 
460 	spa->spa_load_guid = pool_guid;
461 
462 	/*
463 	 * Parse the configuration into a vdev tree.  We explicitly set the
464 	 * value that will be returned by spa_version() since parsing the
465 	 * configuration requires knowing the version number.
466 	 */
467 	spa_config_enter(spa, RW_WRITER, FTAG);
468 	spa->spa_ubsync.ub_version = version;
469 	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
470 	spa_config_exit(spa, FTAG);
471 
472 	if (error != 0)
473 		goto out;
474 
475 	ASSERT(spa->spa_root_vdev == rvd);
476 	ASSERT(spa_guid(spa) == pool_guid);
477 
478 	/*
479 	 * Try to open all vdevs, loading each label in the process.
480 	 */
481 	if (vdev_open(rvd) != 0) {
482 		error = ENXIO;
483 		goto out;
484 	}
485 
486 	/*
487 	 * Validate the labels for all leaf vdevs.  We need to grab the config
488 	 * lock because all label I/O is done with the ZIO_FLAG_CONFIG_HELD
489 	 * flag.
490 	 */
491 	spa_config_enter(spa, RW_READER, FTAG);
492 	error = vdev_validate(rvd);
493 	spa_config_exit(spa, FTAG);
494 
495 	if (error != 0) {
496 		error = EBADF;
497 		goto out;
498 	}
499 
500 	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
501 		error = ENXIO;
502 		goto out;
503 	}
504 
505 	/*
506 	 * Find the best uberblock.
507 	 */
508 	bzero(ub, sizeof (uberblock_t));
509 
510 	zio = zio_root(spa, NULL, NULL,
511 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
512 	vdev_uberblock_load(zio, rvd, ub);
513 	error = zio_wait(zio);
514 
515 	/*
516 	 * If we weren't able to find a single valid uberblock, return failure.
517 	 */
518 	if (ub->ub_txg == 0) {
519 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
520 		    VDEV_AUX_CORRUPT_DATA);
521 		error = ENXIO;
522 		goto out;
523 	}
524 
525 	/*
526 	 * If the pool is newer than the code, we can't open it.
527 	 */
528 	if (ub->ub_version > ZFS_VERSION) {
529 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
530 		    VDEV_AUX_VERSION_NEWER);
531 		error = ENOTSUP;
532 		goto out;
533 	}
534 
535 	/*
536 	 * If the vdev guid sum doesn't match the uberblock, we have an
537 	 * incomplete configuration.
538 	 */
539 	if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
540 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
541 		    VDEV_AUX_BAD_GUID_SUM);
542 		error = ENXIO;
543 		goto out;
544 	}
545 
546 	/*
547 	 * Initialize internal SPA structures.
548 	 */
549 	spa->spa_state = POOL_STATE_ACTIVE;
550 	spa->spa_ubsync = spa->spa_uberblock;
551 	spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
552 	error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
553 	if (error) {
554 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
555 		    VDEV_AUX_CORRUPT_DATA);
556 		goto out;
557 	}
558 	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
559 
560 	if (zap_lookup(spa->spa_meta_objset,
561 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
562 	    sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
563 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
564 		    VDEV_AUX_CORRUPT_DATA);
565 		error = EIO;
566 		goto out;
567 	}
568 
569 	if (!mosconfig) {
570 		nvlist_t *newconfig;
571 
572 		if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) {
573 			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
574 			    VDEV_AUX_CORRUPT_DATA);
575 			error = EIO;
576 			goto out;
577 		}
578 
579 		spa_config_set(spa, newconfig);
580 		spa_unload(spa);
581 		spa_deactivate(spa);
582 		spa_activate(spa);
583 
584 		return (spa_load(spa, newconfig, state, B_TRUE));
585 	}
586 
587 	if (zap_lookup(spa->spa_meta_objset,
588 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
589 	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
590 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
591 		    VDEV_AUX_CORRUPT_DATA);
592 		error = EIO;
593 		goto out;
594 	}
595 
596 	/*
597 	 * Load the bit that tells us to use the new accounting function
598 	 * (raid-z deflation).  If we have an older pool, this will not
599 	 * be present.
600 	 */
601 	error = zap_lookup(spa->spa_meta_objset,
602 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
603 	    sizeof (uint64_t), 1, &spa->spa_deflate);
604 	if (error != 0 && error != ENOENT) {
605 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
606 		    VDEV_AUX_CORRUPT_DATA);
607 		error = EIO;
608 		goto out;
609 	}
610 
611 	/*
612 	 * Load the persistent error log.  If we have an older pool, this will
613 	 * not be present.
614 	 */
615 	error = zap_lookup(spa->spa_meta_objset,
616 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
617 	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
618 	if (error != 0 && error != ENOENT) {
619 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
620 		    VDEV_AUX_CORRUPT_DATA);
621 		error = EIO;
622 		goto out;
623 	}
624 
625 	error = zap_lookup(spa->spa_meta_objset,
626 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
627 	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
628 	if (error != 0 && error != ENOENT) {
629 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
630 		    VDEV_AUX_CORRUPT_DATA);
631 		error = EIO;
632 		goto out;
633 	}
634 
635 	/*
636 	 * Load the history object.  If we have an older pool, this
637 	 * will not be present.
638 	 */
639 	error = zap_lookup(spa->spa_meta_objset,
640 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY,
641 	    sizeof (uint64_t), 1, &spa->spa_history);
642 	if (error != 0 && error != ENOENT) {
643 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
644 		    VDEV_AUX_CORRUPT_DATA);
645 		error = EIO;
646 		goto out;
647 	}
648 
649 	/*
650 	 * Load any hot spares for this pool.
651 	 */
652 	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
653 	    DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object);
654 	if (error != 0 && error != ENOENT) {
655 		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
656 		    VDEV_AUX_CORRUPT_DATA);
657 		error = EIO;
658 		goto out;
659 	}
660 	if (error == 0) {
661 		ASSERT(spa_version(spa) >= ZFS_VERSION_SPARES);
662 		if (load_nvlist(spa, spa->spa_spares_object,
663 		    &spa->spa_sparelist) != 0) {
664 			vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
665 			    VDEV_AUX_CORRUPT_DATA);
666 			error = EIO;
667 			goto out;
668 		}
669 
670 		spa_config_enter(spa, RW_WRITER, FTAG);
671 		spa_load_spares(spa);
672 		spa_config_exit(spa, FTAG);
673 	}
674 
675 	/*
676 	 * Load the vdev state for all toplevel vdevs.
677 	 */
678 	vdev_load(rvd);
679 
680 	/*
681 	 * Propagate the leaf DTLs we just loaded all the way up the tree.
682 	 */
683 	spa_config_enter(spa, RW_WRITER, FTAG);
684 	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
685 	spa_config_exit(spa, FTAG);
686 
687 	/*
688 	 * Check the state of the root vdev.  If it can't be opened, it
689 	 * indicates one or more toplevel vdevs are faulted.
690 	 */
691 	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
692 		error = ENXIO;
693 		goto out;
694 	}
695 
696 	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
697 		dmu_tx_t *tx;
698 		int need_update = B_FALSE;
699 		int c;
700 
701 		/*
702 		 * Claim log blocks that haven't been committed yet.
703 		 * This must all happen in a single txg.
704 		 */
705 		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
706 		    spa_first_txg(spa));
707 		(void) dmu_objset_find(spa->spa_name,
708 		    zil_claim, tx, DS_FIND_CHILDREN);
709 		dmu_tx_commit(tx);
710 
711 		spa->spa_sync_on = B_TRUE;
712 		txg_sync_start(spa->spa_dsl_pool);
713 
714 		/*
715 		 * Wait for all claims to sync.
716 		 */
717 		txg_wait_synced(spa->spa_dsl_pool, 0);
718 
719 		/*
720 		 * If the config cache is stale, or we have uninitialized
721 		 * metaslabs (see spa_vdev_add()), then update the config.
722 		 */
723 		if (config_cache_txg != spa->spa_config_txg ||
724 		    state == SPA_LOAD_IMPORT)
725 			need_update = B_TRUE;
726 
727 		for (c = 0; c < rvd->vdev_children; c++)
728 			if (rvd->vdev_child[c]->vdev_ms_array == 0)
729 				need_update = B_TRUE;
730 
731 		/*
732 		 * Update the config cache asychronously in case we're the
733 		 * root pool, in which case the config cache isn't writable yet.
734 		 */
735 		if (need_update)
736 			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
737 	}
738 
739 	error = 0;
740 out:
741 	if (error && error != EBADF)
742 		zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
743 	spa->spa_load_state = SPA_LOAD_NONE;
744 	spa->spa_ena = 0;
745 
746 	return (error);
747 }
748 
749 /*
750  * Pool Open/Import
751  *
752  * The import case is identical to an open except that the configuration is sent
753  * down from userland, instead of grabbed from the configuration cache.  For the
754  * case of an open, the pool configuration will exist in the
755  * POOL_STATE_UNITIALIZED state.
756  *
757  * The stats information (gen/count/ustats) is used to gather vdev statistics at
758  * the same time open the pool, without having to keep around the spa_t in some
759  * ambiguous state.
760  */
761 static int
762 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
763 {
764 	spa_t *spa;
765 	int error;
766 	int loaded = B_FALSE;
767 	int locked = B_FALSE;
768 
769 	*spapp = NULL;
770 
771 	/*
772 	 * As disgusting as this is, we need to support recursive calls to this
773 	 * function because dsl_dir_open() is called during spa_load(), and ends
774 	 * up calling spa_open() again.  The real fix is to figure out how to
775 	 * avoid dsl_dir_open() calling this in the first place.
776 	 */
777 	if (mutex_owner(&spa_namespace_lock) != curthread) {
778 		mutex_enter(&spa_namespace_lock);
779 		locked = B_TRUE;
780 	}
781 
782 	if ((spa = spa_lookup(pool)) == NULL) {
783 		if (locked)
784 			mutex_exit(&spa_namespace_lock);
785 		return (ENOENT);
786 	}
787 	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
788 
789 		spa_activate(spa);
790 
791 		error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE);
792 
793 		if (error == EBADF) {
794 			/*
795 			 * If vdev_validate() returns failure (indicated by
796 			 * EBADF), it indicates that one of the vdevs indicates
797 			 * that the pool has been exported or destroyed.  If
798 			 * this is the case, the config cache is out of sync and
799 			 * we should remove the pool from the namespace.
800 			 */
801 			zfs_post_ok(spa, NULL);
802 			spa_unload(spa);
803 			spa_deactivate(spa);
804 			spa_remove(spa);
805 			spa_config_sync();
806 			if (locked)
807 				mutex_exit(&spa_namespace_lock);
808 			return (ENOENT);
809 		}
810 
811 		if (error) {
812 			/*
813 			 * We can't open the pool, but we still have useful
814 			 * information: the state of each vdev after the
815 			 * attempted vdev_open().  Return this to the user.
816 			 */
817 			if (config != NULL && spa->spa_root_vdev != NULL) {
818 				spa_config_enter(spa, RW_READER, FTAG);
819 				*config = spa_config_generate(spa, NULL, -1ULL,
820 				    B_TRUE);
821 				spa_config_exit(spa, FTAG);
822 			}
823 			spa_unload(spa);
824 			spa_deactivate(spa);
825 			spa->spa_last_open_failed = B_TRUE;
826 			if (locked)
827 				mutex_exit(&spa_namespace_lock);
828 			*spapp = NULL;
829 			return (error);
830 		} else {
831 			zfs_post_ok(spa, NULL);
832 			spa->spa_last_open_failed = B_FALSE;
833 		}
834 
835 		loaded = B_TRUE;
836 	}
837 
838 	spa_open_ref(spa, tag);
839 	if (locked)
840 		mutex_exit(&spa_namespace_lock);
841 
842 	*spapp = spa;
843 
844 	if (config != NULL) {
845 		spa_config_enter(spa, RW_READER, FTAG);
846 		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
847 		spa_config_exit(spa, FTAG);
848 	}
849 
850 	/*
851 	 * If we just loaded the pool, resilver anything that's out of date.
852 	 */
853 	if (loaded && (spa_mode & FWRITE))
854 		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
855 
856 	return (0);
857 }
858 
859 int
860 spa_open(const char *name, spa_t **spapp, void *tag)
861 {
862 	return (spa_open_common(name, spapp, tag, NULL));
863 }
864 
865 /*
866  * Lookup the given spa_t, incrementing the inject count in the process,
867  * preventing it from being exported or destroyed.
868  */
869 spa_t *
870 spa_inject_addref(char *name)
871 {
872 	spa_t *spa;
873 
874 	mutex_enter(&spa_namespace_lock);
875 	if ((spa = spa_lookup(name)) == NULL) {
876 		mutex_exit(&spa_namespace_lock);
877 		return (NULL);
878 	}
879 	spa->spa_inject_ref++;
880 	mutex_exit(&spa_namespace_lock);
881 
882 	return (spa);
883 }
884 
885 void
886 spa_inject_delref(spa_t *spa)
887 {
888 	mutex_enter(&spa_namespace_lock);
889 	spa->spa_inject_ref--;
890 	mutex_exit(&spa_namespace_lock);
891 }
892 
893 static void
894 spa_add_spares(spa_t *spa, nvlist_t *config)
895 {
896 	nvlist_t **spares;
897 	uint_t i, nspares;
898 	nvlist_t *nvroot;
899 	uint64_t guid;
900 	vdev_stat_t *vs;
901 	uint_t vsc;
902 	uint64_t pool;
903 
904 	if (spa->spa_nspares == 0)
905 		return;
906 
907 	VERIFY(nvlist_lookup_nvlist(config,
908 	    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
909 	VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
910 	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
911 	if (nspares != 0) {
912 		VERIFY(nvlist_add_nvlist_array(nvroot,
913 		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
914 		VERIFY(nvlist_lookup_nvlist_array(nvroot,
915 		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
916 
917 		/*
918 		 * Go through and find any spares which have since been
919 		 * repurposed as an active spare.  If this is the case, update
920 		 * their status appropriately.
921 		 */
922 		for (i = 0; i < nspares; i++) {
923 			VERIFY(nvlist_lookup_uint64(spares[i],
924 			    ZPOOL_CONFIG_GUID, &guid) == 0);
925 			if (spa_spare_exists(guid, &pool) && pool != 0ULL) {
926 				VERIFY(nvlist_lookup_uint64_array(
927 				    spares[i], ZPOOL_CONFIG_STATS,
928 				    (uint64_t **)&vs, &vsc) == 0);
929 				vs->vs_state = VDEV_STATE_CANT_OPEN;
930 				vs->vs_aux = VDEV_AUX_SPARED;
931 			}
932 		}
933 	}
934 }
935 
936 int
937 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
938 {
939 	int error;
940 	spa_t *spa;
941 
942 	*config = NULL;
943 	error = spa_open_common(name, &spa, FTAG, config);
944 
945 	if (spa && *config != NULL) {
946 		VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
947 		    spa_get_errlog_size(spa)) == 0);
948 
949 		spa_add_spares(spa, *config);
950 	}
951 
952 	/*
953 	 * We want to get the alternate root even for faulted pools, so we cheat
954 	 * and call spa_lookup() directly.
955 	 */
956 	if (altroot) {
957 		if (spa == NULL) {
958 			mutex_enter(&spa_namespace_lock);
959 			spa = spa_lookup(name);
960 			if (spa)
961 				spa_altroot(spa, altroot, buflen);
962 			else
963 				altroot[0] = '\0';
964 			spa = NULL;
965 			mutex_exit(&spa_namespace_lock);
966 		} else {
967 			spa_altroot(spa, altroot, buflen);
968 		}
969 	}
970 
971 	if (spa != NULL)
972 		spa_close(spa, FTAG);
973 
974 	return (error);
975 }
976 
977 /*
978  * Validate that the 'spares' array is well formed.  We must have an array of
979  * nvlists, each which describes a valid leaf vdev.  If this is an import (mode
980  * is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified, as long
981  * as they are well-formed.
982  */
983 static int
984 spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
985 {
986 	nvlist_t **spares;
987 	uint_t i, nspares;
988 	vdev_t *vd;
989 	int error;
990 
991 	/*
992 	 * It's acceptable to have no spares specified.
993 	 */
994 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
995 	    &spares, &nspares) != 0)
996 		return (0);
997 
998 	if (nspares == 0)
999 		return (EINVAL);
1000 
1001 	/*
1002 	 * Make sure the pool is formatted with a version that supports hot
1003 	 * spares.
1004 	 */
1005 	if (spa_version(spa) < ZFS_VERSION_SPARES)
1006 		return (ENOTSUP);
1007 
1008 	/*
1009 	 * Set the pending spare list so we correctly handle device in-use
1010 	 * checking.
1011 	 */
1012 	spa->spa_pending_spares = spares;
1013 	spa->spa_pending_nspares = nspares;
1014 
1015 	for (i = 0; i < nspares; i++) {
1016 		if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0,
1017 		    mode)) != 0)
1018 			goto out;
1019 
1020 		if (!vd->vdev_ops->vdev_op_leaf) {
1021 			vdev_free(vd);
1022 			error = EINVAL;
1023 			goto out;
1024 		}
1025 
1026 		vd->vdev_top = vd;
1027 
1028 		if ((error = vdev_open(vd)) == 0 &&
1029 		    (error = vdev_label_init(vd, crtxg,
1030 		    VDEV_LABEL_SPARE)) == 0) {
1031 			VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID,
1032 			    vd->vdev_guid) == 0);
1033 		}
1034 
1035 		vdev_free(vd);
1036 
1037 		if (error && mode != VDEV_ALLOC_SPARE)
1038 			goto out;
1039 		else
1040 			error = 0;
1041 	}
1042 
1043 out:
1044 	spa->spa_pending_spares = NULL;
1045 	spa->spa_pending_nspares = 0;
1046 	return (error);
1047 }
1048 
1049 /*
1050  * Pool Creation
 *
 * Create a new pool named 'pool' whose vdev tree is described by 'nvroot';
 * 'altroot' is handed to spa_add() unchanged.  spa_namespace_lock is held
 * from the name check until return.
 *
 * Returns 0 on success, EEXIST if spa_lookup() already finds 'pool', EINVAL
 * if the parsed root vdev has no children, or the error from
 * spa_config_parse(), vdev_create() or spa_validate_spares().  On any of
 * those failures the partially built spa_t is unloaded, deactivated and
 * removed before returning.
1051  */
1052 int
1053 spa_create(const char *pool, nvlist_t *nvroot, const char *altroot)
1054 {
1055 	spa_t *spa;
1056 	vdev_t *rvd;
1057 	dsl_pool_t *dp;
1058 	dmu_tx_t *tx;
1059 	int c, error = 0;
1060 	uint64_t txg = TXG_INITIAL;
1061 	nvlist_t **spares;
1062 	uint_t nspares;
1063 
1064 	/*
1065 	 * If this pool already exists, return failure.
1066 	 */
1067 	mutex_enter(&spa_namespace_lock);
1068 	if (spa_lookup(pool) != NULL) {
1069 		mutex_exit(&spa_namespace_lock);
1070 		return (EEXIST);
1071 	}
1072 
1073 	/*
1074 	 * Allocate a new spa_t structure.
1075 	 */
1076 	spa = spa_add(pool, altroot);
1077 	spa_activate(spa);
1078 
	/*
	 * Seed the in-core uberblock one txg behind the txg used for creation,
	 * stamp it with the current ZFS_VERSION, and copy it to spa_ubsync.
	 */
1079 	spa->spa_uberblock.ub_txg = txg - 1;
1080 	spa->spa_uberblock.ub_version = ZFS_VERSION;
1081 	spa->spa_ubsync = spa->spa_uberblock;
1082 
1083 	/*
1084 	 * Create the root vdev.
1085 	 */
1086 	spa_config_enter(spa, RW_WRITER, FTAG);
1087 
1088 	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
1089 
	/*
	 * A successful parse must have produced a root vdev and installed it
	 * as spa_root_vdev.
	 */
1090 	ASSERT(error != 0 || rvd != NULL);
1091 	ASSERT(error != 0 || spa->spa_root_vdev == rvd);
1092 
	/* A root vdev with no children describes no storage at all. */
1093 	if (error == 0 && rvd->vdev_children == 0)
1094 		error = EINVAL;
1095 
	/*
	 * Create the vdevs, then validate any spares; only if both succeed are
	 * the top-level vdevs initialized and the root config marked dirty.
	 */
1096 	if (error == 0 &&
1097 	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
1098 	    (error = spa_validate_spares(spa, nvroot, txg,
1099 	    VDEV_ALLOC_ADD)) == 0) {
1100 		for (c = 0; c < rvd->vdev_children; c++)
1101 			vdev_init(rvd->vdev_child[c], txg);
1102 		vdev_config_dirty(rvd);
1103 	}
1104 
1105 	spa_config_exit(spa, FTAG);
1106 
	/*
	 * Nothing beyond the in-core spa_t exists yet on failure, so tear it
	 * down and drop the namespace lock.
	 */
1107 	if (error != 0) {
1108 		spa_unload(spa);
1109 		spa_deactivate(spa);
1110 		spa_remove(spa);
1111 		mutex_exit(&spa_namespace_lock);
1112 		return (error);
1113 	}
1114 
1115 	/*
1116 	 * Get the list of spares, if specified.
1117 	 */
1118 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1119 	    &spares, &nspares) == 0) {
1120 		VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME,
1121 		    KM_SLEEP) == 0);
1122 		VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
1123 		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
1124 		spa_config_enter(spa, RW_WRITER, FTAG);
1125 		spa_load_spares(spa);
1126 		spa_config_exit(spa, FTAG);
1127 		spa->spa_sync_spares = B_TRUE;
1128 	}
1129 
	/* The DSL pool's meta objset doubles as the SPA's meta objset. */
1130 	spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
1131 	spa->spa_meta_objset = dp->dp_meta_objset;
1132 
	/* Every object created below is created in a tx assigned to 'txg'. */
1133 	tx = dmu_tx_create_assigned(dp, txg);
1134 
1135 	/*
1136 	 * Create the pool config object.
1137 	 */
1138 	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
1139 	    DMU_OT_PACKED_NVLIST, 1 << 14,
1140 	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
1141 
	/* Record the config object number in the pool directory. */
1142 	if (zap_add(spa->spa_meta_objset,
1143 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
1144 	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
1145 		cmn_err(CE_PANIC, "failed to add pool config");
1146 	}
1147 
1148 	/* Newly created pools are always deflated. */
1149 	spa->spa_deflate = TRUE;
1150 	if (zap_add(spa->spa_meta_objset,
1151 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
1152 	    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
1153 		cmn_err(CE_PANIC, "failed to add deflate");
1154 	}
1155 
1156 	/*
1157 	 * Create the deferred-free bplist object.  Turn off compression
1158 	 * because sync-to-convergence takes longer if the blocksize
1159 	 * keeps changing.
1160 	 */
1161 	spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset,
1162 	    1 << 14, tx);
1163 	dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
1164 	    ZIO_COMPRESS_OFF, tx);
1165 
1166 	if (zap_add(spa->spa_meta_objset,
1167 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
1168 	    sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
1169 		cmn_err(CE_PANIC, "failed to add bplist");
1170 	}
1171 
1172 	/*
1173 	 * Create the pool's history object.
1174 	 */
1175 	spa_history_create_obj(spa, tx);
1176 
1177 	dmu_tx_commit(tx);
1178 
	/* Flag the pool as syncing and call txg_sync_start() for it. */
1179 	spa->spa_sync_on = B_TRUE;
1180 	txg_sync_start(spa->spa_dsl_pool);
1181 
1182 	/*
1183 	 * We explicitly wait for the first transaction to complete so that our
1184 	 * bean counters are appropriately updated.
1185 	 */
1186 	txg_wait_synced(spa->spa_dsl_pool, txg);
1187 
1188 	spa_config_sync();
1189 
1190 	mutex_exit(&spa_namespace_lock);
1191 
1192 	return (0);
1193 }
1194 
1195 /*
1196  * Import the given pool into the system.  We set up the necessary spa_t and
1197  * then call spa_load() to do the dirty work.
1198  */
1199 int
1200 spa_import(const char *pool, nvlist_t *config, const char *altroot)
1201 {
1202 	spa_t *spa;
1203 	int error;
1204 	nvlist_t *nvroot;
1205 	nvlist_t **spares;
1206 	uint_t nspares;
1207 
1208 	if (!(spa_mode & FWRITE))
1209 		return (EROFS);
1210 
1211 	/*
1212 	 * If a pool with this name exists, return failure.
1213 	 */
1214 	mutex_enter(&spa_namespace_lock);
1215 	if (spa_lookup(pool) != NULL) {
1216 		mutex_exit(&spa_namespace_lock);
1217 		return (EEXIST);
1218 	}
1219 
1220 	/*
1221 	 * Create and initialize the spa structure.
1222 	 */
1223 	spa = spa_add(pool, altroot);
1224 	spa_activate(spa);
1225 
1226 	/*
1227 	 * Pass off the heavy lifting to spa_load().
1228 	 * Pass TRUE for mosconfig because the user-supplied config
1229 	 * is actually the one to trust when doing an import.
1230 	 */
1231 	error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);
1232 
1233 	spa_config_enter(spa, RW_WRITER, FTAG);
1234 	/*
1235 	 * Toss any existing sparelist, as it doesn't have any validity anymore,
1236 	 * and conflicts with spa_has_spare().
1237 	 */
1238 	if (spa->spa_sparelist) {
1239 		nvlist_free(spa->spa_sparelist);
1240 		spa->spa_sparelist = NULL;
1241 		spa_load_spares(spa);
1242 	}
1243 
1244 	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
1245 	    &nvroot) == 0);
1246 	if (error == 0)
1247 		error = spa_validate_spares(spa, nvroot, -1ULL,
1248 		    VDEV_ALLOC_SPARE);
1249 	spa_config_exit(spa, FTAG);
1250 
1251 	if (error != 0) {
1252 		spa_unload(spa);
1253 		spa_deactivate(spa);
1254 		spa_remove(spa);
1255 		mutex_exit(&spa_namespace_lock);
1256 		return (error);
1257 	}
1258 
1259 	/*
1260 	 * Override any spares as specified by the user, as these may have
1261 	 * correct device names/devids, etc.
1262 	 */
1263 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1264 	    &spares, &nspares) == 0) {
1265 		if (spa->spa_sparelist)
1266 			VERIFY(nvlist_remove(spa->spa_sparelist,
1267 			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
1268 		else
1269 			VERIFY(nvlist_alloc(&spa->spa_sparelist,
1270 			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
1271 		VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
1272 		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
1273 		spa_config_enter(spa, RW_WRITER, FTAG);
1274 		spa_load_spares(spa);
1275 		spa_config_exit(spa, FTAG);
1276 		spa->spa_sync_spares = B_TRUE;
1277 	}
1278 
1279 	/*
1280 	 * Update the config cache to include the newly-imported pool.
1281 	 */
1282 	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
1283 
1284 	mutex_exit(&spa_namespace_lock);
1285 
1286 	/*
1287 	 * Resilver anything that's out of date.
1288 	 */
1289 	if (spa_mode & FWRITE)
1290 		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
1291 
1292 	return (0);
1293 }
1294 
1295 /*
1296  * This (illegal) pool name is used when temporarily importing a spa_t in order
1297  * to get the vdev stats associated with the imported devices.
1298  */
1299 #define	TRYIMPORT_NAME	"$import"
1300 
1301 nvlist_t *
1302 spa_tryimport(nvlist_t *tryconfig)
1303 {
1304 	nvlist_t *config = NULL;
1305 	char *poolname;
1306 	spa_t *spa;
1307 	uint64_t state;
1308 
1309 	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
1310 		return (NULL);
1311 
1312 	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
1313 		return (NULL);
1314 
1315 	/*
1316 	 * Create and initialize the spa structure.
1317 	 */
1318 	mutex_enter(&spa_namespace_lock);
1319 	spa = spa_add(TRYIMPORT_NAME, NULL);
1320 	spa_activate(spa);
1321 
1322 	/*
1323 	 * Pass off the heavy lifting to spa_load().
1324 	 * Pass TRUE for mosconfig because the user-supplied config
1325 	 * is actually the one to trust when doing an import.
1326 	 */
1327 	(void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);
1328 
1329 	/*
1330 	 * If 'tryconfig' was at least parsable, return the current config.
1331 	 */
1332 	if (spa->spa_root_vdev != NULL) {
1333 		spa_config_enter(spa, RW_READER, FTAG);
1334 		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
1335 		spa_config_exit(spa, FTAG);
1336 		VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
1337 		    poolname) == 0);
1338 		VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
1339 		    state) == 0);
1340 
1341 		/*
1342 		 * Add the list of hot spares.
1343 		 */
1344 		spa_add_spares(spa, config);
1345 	}
1346 
1347 	spa_unload(spa);
1348 	spa_deactivate(spa);
1349 	spa_remove(spa);
1350 	mutex_exit(&spa_namespace_lock);
1351 
1352 	return (config);
1353 }
1354 
1355 /*
1356  * Pool export/destroy
1357  *
1358  * The act of destroying or exporting a pool is very simple.  We make sure there
1359  * is no more pending I/O and any references to the pool are gone.  Then, we
1360  * update the pool state and sync all the labels to disk, removing the
1361  * configuration from the cache afterwards.
 *
 * 'new_state' is the state to record for the pool.  The callers in this file
 * pass POOL_STATE_DESTROYED, POOL_STATE_EXPORTED or POOL_STATE_UNINITIALIZED;
 * in the last case (a reset) the pool state is left alone and the spa_t is
 * not removed from the namespace.
 *
 * If 'oldconfig' is non-NULL it is first set to NULL and, on success,
 * receives a duplicate of spa->spa_config when one exists.
 *
 * Returns EROFS if spa_mode lacks FWRITE, ENOENT if 'pool' is not found,
 * EBUSY if the pool is still referenced, and 0 otherwise.
1362  */
1363 static int
1364 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig)
1365 {
1366 	spa_t *spa;
1367 
1368 	if (oldconfig)
1369 		*oldconfig = NULL;
1370 
1371 	if (!(spa_mode & FWRITE))
1372 		return (EROFS);
1373 
1374 	mutex_enter(&spa_namespace_lock);
1375 	if ((spa = spa_lookup(pool)) == NULL) {
1376 		mutex_exit(&spa_namespace_lock);
1377 		return (ENOENT);
1378 	}
1379 
1380 	/*
1381 	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
1382 	 * reacquire the namespace lock, and see if we can export.
1383 	 */
1384 	spa_open_ref(spa, FTAG);
1385 	mutex_exit(&spa_namespace_lock);
1386 	spa_async_suspend(spa);
1387 	mutex_enter(&spa_namespace_lock);
1388 	spa_close(spa, FTAG);
1389 
1390 	/*
1391 	 * The pool will be in core if it's openable,
1392 	 * in which case we can modify its state.
1393 	 */
1394 	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
1395 		/*
1396 		 * Objsets may be open only because they're dirty, so we
1397 		 * have to force it to sync before checking spa_refcnt.
1398 		 */
1399 		spa_scrub_suspend(spa);
1400 		txg_wait_synced(spa->spa_dsl_pool, 0);
1401 
1402 		/*
1403 		 * A pool cannot be exported or destroyed if there are active
1404 		 * references.  If we are resetting a pool, allow references by
1405 		 * fault injection handlers.
1406 		 */
1407 		if (!spa_refcount_zero(spa) ||
1408 		    (spa->spa_inject_ref != 0 &&
1409 		    new_state != POOL_STATE_UNINITIALIZED)) {
			/* Undo both suspensions before reporting EBUSY. */
1410 			spa_scrub_resume(spa);
1411 			spa_async_resume(spa);
1412 			mutex_exit(&spa_namespace_lock);
1413 			return (EBUSY);
1414 		}
1415 
		/*
		 * Resume scrubbing only so that the POOL_SCRUB_NONE request
		 * that follows can be issued.
		 * NOTE(review): async tasks suspended above are resumed only
		 * on the EBUSY path; confirm that is intended for reset.
		 */
1416 		spa_scrub_resume(spa);
1417 		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
1418 
1419 		/*
1420 		 * We want this to be reflected on every label,
1421 		 * so mark them all dirty.  spa_unload() will do the
1422 		 * final sync that pushes these changes out.
1423 		 */
1424 		if (new_state != POOL_STATE_UNINITIALIZED) {
1425 			spa_config_enter(spa, RW_WRITER, FTAG);
1426 			spa->spa_state = new_state;
1427 			spa->spa_final_txg = spa_last_synced_txg(spa) + 1;
1428 			vdev_config_dirty(spa->spa_root_vdev);
1429 			spa_config_exit(spa, FTAG);
1430 		}
1431 	}
1432 
	/*
	 * Tear down any pool whose state is not UNINITIALIZED, even one that
	 * never had syncing turned on.
	 */
1433 	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
1434 		spa_unload(spa);
1435 		spa_deactivate(spa);
1436 	}
1437 
	/* Give the caller its own copy of spa->spa_config, if requested. */
1438 	if (oldconfig && spa->spa_config)
1439 		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
1440 
	/*
	 * Export and destroy drop the spa_t from the namespace and call
	 * spa_config_sync(); a reset keeps the spa_t in place.
	 */
1441 	if (new_state != POOL_STATE_UNINITIALIZED) {
1442 		spa_remove(spa);
1443 		spa_config_sync();
1444 	}
1445 	mutex_exit(&spa_namespace_lock);
1446 
1447 	return (0);
1448 }
1449 
1450 /*
1451  * Destroy a storage pool.
1452  */
1453 int
1454 spa_destroy(char *pool)
1455 {
1456 	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL));
1457 }
1458 
1459 /*
1460  * Export a storage pool.
1461  */
1462 int
1463 spa_export(char *pool, nvlist_t **oldconfig)
1464 {
1465 	return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig));
1466 }
1467 
1468 /*
1469  * Similar to spa_export(), this unloads the spa_t without actually removing it
1470  * from the namespace in any way.
1471  */
1472 int
1473 spa_reset(char *pool)
1474 {
1475 	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL));
1476 }
1477 
1478 
1479 /*
1480  * ==========================================================================
1481  * Device manipulation
1482  * ==========================================================================
1483  */
1484 
1485 /*
1486  * Add capacity to a storage pool.
 *
 * 'nvroot' may describe new top-level vdevs, new hot spares (under
 * ZPOOL_CONFIG_SPARES), or both.  EINVAL is returned if it contains neither;
 * other failures return the error from spa_config_parse(), vdev_create() or
 * spa_validate_spares().  Returns 0 on success.
1487  */
1488 int
1489 spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
1490 {
1491 	uint64_t txg;
1492 	int c, error;
1493 	vdev_t *rvd = spa->spa_root_vdev;
1494 	vdev_t *vd, *tvd;
1495 	nvlist_t **spares;
1496 	uint_t i, nspares;
1497 
1498 	txg = spa_vdev_enter(spa);
1499 
1500 	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
1501 	    VDEV_ALLOC_ADD)) != 0)
1502 		return (spa_vdev_exit(spa, NULL, txg, error));
1503 
	/*
	 * Record the new tree as pending while it is created and validated;
	 * every exit path below clears spa_pending_vdev again.
	 */
1504 	spa->spa_pending_vdev = vd;
1505 
	/* No spare array in the request simply means zero new spares. */
1506 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1507 	    &spares, &nspares) != 0)
1508 		nspares = 0;
1509 
	/* A request with neither vdevs nor spares adds nothing. */
1510 	if (vd->vdev_children == 0 && nspares == 0) {
1511 		spa->spa_pending_vdev = NULL;
1512 		return (spa_vdev_exit(spa, vd, txg, EINVAL));
1513 	}
1514 
1515 	if (vd->vdev_children != 0) {
1516 		if ((error = vdev_create(vd, txg, B_FALSE)) != 0) {
1517 			spa->spa_pending_vdev = NULL;
1518 			return (spa_vdev_exit(spa, vd, txg, error));
1519 		}
1520 	}
1521 
1522 	/*
1523 	 * We must validate the spares after checking the children.  Otherwise,
1524 	 * vdev_inuse() will blindly overwrite the spare.
1525 	 */
1526 	if ((error = spa_validate_spares(spa, nvroot, txg,
1527 	    VDEV_ALLOC_ADD)) != 0) {
1528 		spa->spa_pending_vdev = NULL;
1529 		return (spa_vdev_exit(spa, vd, txg, error));
1530 	}
1531 
1532 	spa->spa_pending_vdev = NULL;
1533 
1534 	/*
1535 	 * Transfer each new top-level vdev from vd to rvd.
	 * Each moved child takes the next free id in rvd and is marked
	 * config-dirty.
1536 	 */
1537 	for (c = 0; c < vd->vdev_children; c++) {
1538 		tvd = vd->vdev_child[c];
1539 		vdev_remove_child(vd, tvd);
1540 		tvd->vdev_id = rvd->vdev_children;
1541 		vdev_add_child(rvd, tvd);
1542 		vdev_config_dirty(tvd);
1543 	}
1544 
1545 	if (nspares != 0) {
1546 		if (spa->spa_sparelist != NULL) {
1547 			nvlist_t **oldspares;
1548 			uint_t oldnspares;
1549 			nvlist_t **newspares;
1550 
			/*
			 * Append to the existing list: duplicate the old
			 * entries followed by the new ones into a temporary
			 * array, replace the ZPOOL_CONFIG_SPARES array with
			 * it, then free the temporary duplicates.
			 */
1551 			VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
1552 			    ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0);
1553 
1554 			newspares = kmem_alloc(sizeof (void *) *
1555 			    (nspares + oldnspares), KM_SLEEP);
1556 			for (i = 0; i < oldnspares; i++)
1557 				VERIFY(nvlist_dup(oldspares[i],
1558 				    &newspares[i], KM_SLEEP) == 0);
1559 			for (i = 0; i < nspares; i++)
1560 				VERIFY(nvlist_dup(spares[i],
1561 				    &newspares[i + oldnspares],
1562 				    KM_SLEEP) == 0);
1563 
1564 			VERIFY(nvlist_remove(spa->spa_sparelist,
1565 			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
1566 
1567 			VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
1568 			    ZPOOL_CONFIG_SPARES, newspares,
1569 			    nspares + oldnspares) == 0);
1570 			for (i = 0; i < oldnspares + nspares; i++)
1571 				nvlist_free(newspares[i]);
1572 			kmem_free(newspares, (oldnspares + nspares) *
1573 			    sizeof (void *));
1574 		} else {
			/* First spares for this pool: start a fresh list. */
1575 			VERIFY(nvlist_alloc(&spa->spa_sparelist,
1576 			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
1577 			VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
1578 			    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
1579 		}
1580 
1581 		spa_load_spares(spa);
1582 		spa->spa_sync_spares = B_TRUE;
1583 	}
1584 
1585 	/*
1586 	 * We have to be careful when adding new vdevs to an existing pool.
1587 	 * If other threads start allocating from these vdevs before we
1588 	 * sync the config cache, and we lose power, then upon reboot we may
1589 	 * fail to open the pool because there are DVAs that the config cache
1590 	 * can't translate.  Therefore, we first add the vdevs without
1591 	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
1592 	 * and then let spa_config_update() initialize the new metaslabs.
1593 	 *
1594 	 * spa_load() checks for added-but-not-initialized vdevs, so that
1595 	 * if we lose power at any point in this sequence, the remaining
1596 	 * steps will be completed the next time we load the pool.
1597 	 */
1598 	(void) spa_vdev_exit(spa, vd, txg, 0);
1599 
1600 	mutex_enter(&spa_namespace_lock);
1601 	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
1602 	mutex_exit(&spa_namespace_lock);
1603 
1604 	return (0);
1605 }
1606 
1607 /*
1608  * Attach a device to a mirror.  The arguments are the path to any device
1609  * in the mirror, and the nvroot for the new device.  If the path specifies
1610  * a device that is not mirrored, we automatically insert the mirror vdev.
1611  *
1612  * If 'replacing' is specified, the new device is intended to replace the
1613  * existing device; in this case the two devices are made into their own
1614  * mirror using the 'replacing' vdev, which is functionally idendical to
1615  * the mirror vdev (it actually reuses all the same ops) but has a few
1616  * extra rules: you can't attach to it after it's been created, and upon
1617  * completion of resilvering, the first disk (the one being replaced)
1618  * is automatically detached.
1619  */
1620 int
1621 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
1622 {
1623 	uint64_t txg, open_txg;
1624 	int error;
1625 	vdev_t *rvd = spa->spa_root_vdev;
1626 	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
1627 	vdev_ops_t *pvops;
1628 
1629 	txg = spa_vdev_enter(spa);
1630 
1631 	oldvd = vdev_lookup_by_guid(rvd, guid);
1632 
1633 	if (oldvd == NULL)
1634 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
1635 
1636 	if (!oldvd->vdev_ops->vdev_op_leaf)
1637 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
1638 
1639 	pvd = oldvd->vdev_parent;
1640 
1641 	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
1642 	    VDEV_ALLOC_ADD)) != 0 || newrootvd->vdev_children != 1)
1643 		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
1644 
1645 	newvd = newrootvd->vdev_child[0];
1646 
1647 	if (!newvd->vdev_ops->vdev_op_leaf)
1648 		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
1649 
1650 	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
1651 		return (spa_vdev_exit(spa, newrootvd, txg, error));
1652 
1653 	if (!replacing) {
1654 		/*
1655 		 * For attach, the only allowable parent is a mirror or the root
1656 		 * vdev.
1657 		 */
1658 		if (pvd->vdev_ops != &vdev_mirror_ops &&
1659 		    pvd->vdev_ops != &vdev_root_ops)
1660 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
1661 
1662 		pvops = &vdev_mirror_ops;
1663 	} else {
1664 		/*
1665 		 * Active hot spares can only be replaced by inactive hot
1666 		 * spares.
1667 		 */
1668 		if (pvd->vdev_ops == &vdev_spare_ops &&
1669 		    pvd->vdev_child[1] == oldvd &&
1670 		    !spa_has_spare(spa, newvd->vdev_guid))
1671 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
1672 
1673 		/*
1674 		 * If the source is a hot spare, and the parent isn't already a
1675 		 * spare, then we want to create a new hot spare.  Otherwise, we
1676 		 * want to create a replacing vdev.  The user is not allowed to
1677 		 * attach to a spared vdev child unless the 'isspare' state is
1678 		 * the same (spare replaces spare, non-spare replaces
1679 		 * non-spare).
1680 		 */
1681 		if (pvd->vdev_ops == &vdev_replacing_ops)
1682 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
1683 		else if (pvd->vdev_ops == &vdev_spare_ops &&
1684 		    newvd->vdev_isspare != oldvd->vdev_isspare)
1685 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
1686 		else if (pvd->vdev_ops != &vdev_spare_ops &&
1687 		    newvd->vdev_isspare)
1688 			pvops = &vdev_spare_ops;
1689 		else
1690 			pvops = &vdev_replacing_ops;
1691 	}
1692 
1693 	/*
1694 	 * Compare the new device size with the replaceable/attachable
1695 	 * device size.
1696 	 */
1697 	if (newvd->vdev_psize < vdev_get_rsize(oldvd))
1698 		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
1699 
1700 	/*
1701 	 * The new device cannot have a higher alignment requirement
1702 	 * than the top-level vdev.
1703 	 */
1704 	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
1705 		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
1706 
1707 	/*
1708 	 * If this is an in-place replacement, update oldvd's path and devid
1709 	 * to make it distinguishable from newvd, and unopenable from now on.
1710 	 */
1711 	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
1712 		spa_strfree(oldvd->vdev_path);
1713 		oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
1714 		    KM_SLEEP);
1715 		(void) sprintf(oldvd->vdev_path, "%s/%s",
1716 		    newvd->vdev_path, "old");
1717 		if (oldvd->vdev_devid != NULL) {
1718 			spa_strfree(oldvd->vdev_devid);
1719 			oldvd->vdev_devid = NULL;
1720 		}
1721 	}
1722 
1723 	/*
1724 	 * If the parent is not a mirror, or if we're replacing, insert the new
1725 	 * mirror/replacing/spare vdev above oldvd.
1726 	 */
1727 	if (pvd->vdev_ops != pvops)
1728 		pvd = vdev_add_parent(oldvd, pvops);
1729 
1730 	ASSERT(pvd->vdev_top->vdev_parent == rvd);
1731 	ASSERT(pvd->vdev_ops == pvops);
1732 	ASSERT(oldvd->vdev_parent == pvd);
1733 
1734 	/*
1735 	 * Extract the new device from its root and add it to pvd.
1736 	 */
1737 	vdev_remove_child(newrootvd, newvd);
1738 	newvd->vdev_id = pvd->vdev_children;
1739 	vdev_add_child(pvd, newvd);
1740 
1741 	/*
1742 	 * If newvd is smaller than oldvd, but larger than its rsize,
1743 	 * the addition of newvd may have decreased our parent's asize.
1744 	 */
1745 	pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize);
1746 
1747 	tvd = newvd->vdev_top;
1748 	ASSERT(pvd->vdev_top == tvd);
1749 	ASSERT(tvd->vdev_parent == rvd);
1750 
1751 	vdev_config_dirty(tvd);
1752 
1753 	/*
1754 	 * Set newvd's DTL to [TXG_INITIAL, open_txg].  It will propagate
1755 	 * upward when spa_vdev_exit() calls vdev_dtl_reassess().
1756 	 */
1757 	open_txg = txg + TXG_CONCURRENT_STATES - 1;
1758 
1759 	mutex_enter(&newvd->vdev_dtl_lock);
1760 	space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL,
1761 	    open_txg - TXG_INITIAL + 1);
1762 	mutex_exit(&newvd->vdev_dtl_lock);
1763 
1764 	if (newvd->vdev_isspare)
1765 		spa_spare_activate(newvd);
1766 
1767 	/*
1768 	 * Mark newvd's DTL dirty in this txg.
1769 	 */
1770 	vdev_dirty(tvd, VDD_DTL, newvd, txg);
1771 
1772 	(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);
1773 
1774 	/*
1775 	 * Kick off a resilver to update newvd.
1776 	 */
1777 	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
1778 
1779 	return (0);
1780 }
1781 
1782 /*
1783  * Detach a device from a mirror or replacing vdev.
1784  * If 'replace_done' is specified, only detach if the parent
1785  * is a replacing vdev.
 *
 * 'guid' identifies the leaf vdev to detach.  Returns ENODEV if no vdev has
 * that guid; ENOTSUP if it is not a leaf or its parent's type or its
 * position under that parent does not allow the detach; EBUSY if it is the
 * only child, or if no other live sibling has empty DTLs (unless child 1 of a
 * replacing or spare vdev is being detached, which is always allowed);
 * otherwise the value returned by spa_vdev_exit().
1786  */
1787 int
1788 spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
1789 {
1790 	uint64_t txg;
1791 	int c, t, error;
1792 	vdev_t *rvd = spa->spa_root_vdev;
1793 	vdev_t *vd, *pvd, *cvd, *tvd;
1794 	boolean_t unspare = B_FALSE;
	/* Assigned and read only when 'unspare' is set. */
1795 	uint64_t unspare_guid;
1796 
1797 	txg = spa_vdev_enter(spa);
1798 
1799 	vd = vdev_lookup_by_guid(rvd, guid);
1800 
1801 	if (vd == NULL)
1802 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
1803 
1804 	if (!vd->vdev_ops->vdev_op_leaf)
1805 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
1806 
1807 	pvd = vd->vdev_parent;
1808 
1809 	/*
1810 	 * If replace_done is specified, only remove this device if it's
1811 	 * the first child of a replacing vdev.  For the 'spare' vdev, either
1812 	 * disk can be removed.
1813 	 */
1814 	if (replace_done) {
1815 		if (pvd->vdev_ops == &vdev_replacing_ops) {
1816 			if (vd->vdev_id != 0)
1817 				return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
1818 		} else if (pvd->vdev_ops != &vdev_spare_ops) {
1819 			return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
1820 		}
1821 	}
1822 
1823 	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
1824 	    spa_version(spa) >= ZFS_VERSION_SPARES);
1825 
1826 	/*
1827 	 * Only mirror, replacing, and spare vdevs support detach.
1828 	 */
1829 	if (pvd->vdev_ops != &vdev_replacing_ops &&
1830 	    pvd->vdev_ops != &vdev_mirror_ops &&
1831 	    pvd->vdev_ops != &vdev_spare_ops)
1832 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
1833 
1834 	/*
1835 	 * If there's only one replica, you can't detach it.
1836 	 */
1837 	if (pvd->vdev_children <= 1)
1838 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
1839 
1840 	/*
1841 	 * If all siblings have non-empty DTLs, this device may have the only
1842 	 * valid copy of the data, which means we cannot safely detach it.
1843 	 *
1844 	 * XXX -- as in the vdev_offline() case, we really want a more
1845 	 * precise DTL check.
	 *
	 * The loop breaks at the first live sibling whose DTL map and scrub
	 * DTL are both empty, so c == pvd->vdev_children afterwards means no
	 * such sibling was found.
1846 	 */
1847 	for (c = 0; c < pvd->vdev_children; c++) {
1848 		uint64_t dirty;
1849 
1850 		cvd = pvd->vdev_child[c];
1851 		if (cvd == vd)
1852 			continue;
1853 		if (vdev_is_dead(cvd))
1854 			continue;
1855 		mutex_enter(&cvd->vdev_dtl_lock);
1856 		dirty = cvd->vdev_dtl_map.sm_space |
1857 		    cvd->vdev_dtl_scrub.sm_space;
1858 		mutex_exit(&cvd->vdev_dtl_lock);
1859 		if (!dirty)
1860 			break;
1861 	}
1862 
1863 	/*
1864 	 * If we are a replacing or spare vdev, then we can always detach the
1865 	 * latter child, as that is how one cancels the operation.
1866 	 */
1867 	if ((pvd->vdev_ops == &vdev_mirror_ops || vd->vdev_id != 1) &&
1868 	    c == pvd->vdev_children)
1869 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
1870 
1871 	/*
1872 	 * If we are detaching the original disk from a spare, then it implies
1873 	 * that the spare should become a real disk, and be removed from the
1874 	 * active spare list for the pool.
1875 	 */
1876 	if (pvd->vdev_ops == &vdev_spare_ops &&
1877 	    vd->vdev_id == 0)
1878 		unspare = B_TRUE;
1879 
1880 	/*
1881 	 * Erase the disk labels so the disk can be used for other things.
1882 	 * This must be done after all other error cases are handled,
1883 	 * but before we disembowel vd (so we can still do I/O to it).
1884 	 * But if we can't do it, don't treat the error as fatal --
1885 	 * it may be that the unwritability of the disk is the reason
1886 	 * it's being detached!
	 *
	 * The value stored in 'error' here is never examined; it is
	 * overwritten by the spa_vdev_exit() result further down.
1887 	 */
1888 	error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
1889 
1890 	/*
1891 	 * Remove vd from its parent and compact the parent's children.
1892 	 */
1893 	vdev_remove_child(pvd, vd);
1894 	vdev_compact_children(pvd);
1895 
1896 	/*
1897 	 * Remember one of the remaining children so we can get tvd below.
1898 	 */
1899 	cvd = pvd->vdev_child[0];
1900 
1901 	/*
1902 	 * If we need to remove the remaining child from the list of hot spares,
1903 	 * do it now, marking the vdev as no longer a spare in the process.  We
1904 	 * must do this before vdev_remove_parent(), because that can change the
1905 	 * GUID if it creates a new toplevel GUID.
1906 	 */
1907 	if (unspare) {
1908 		ASSERT(cvd->vdev_isspare);
1909 		spa_spare_remove(cvd);
1910 		unspare_guid = cvd->vdev_guid;
1911 	}
1912 
1913 	/*
1914 	 * If the parent mirror/replacing vdev only has one child,
1915 	 * the parent is no longer needed.  Remove it from the tree.
1916 	 */
1917 	if (pvd->vdev_children == 1)
1918 		vdev_remove_parent(cvd);
1919 
1920 	/*
1921 	 * We don't set tvd until now because the parent we just removed
1922 	 * may have been the previous top-level vdev.
1923 	 */
1924 	tvd = cvd->vdev_top;
1925 	ASSERT(tvd->vdev_parent == rvd);
1926 
1927 	/*
1928 	 * Reevaluate the parent vdev state.
1929 	 */
1930 	vdev_propagate_state(cvd->vdev_parent);
1931 
1932 	/*
1933 	 * If the device we just detached was smaller than the others, it may be
1934 	 * possible to add metaslabs (i.e. grow the pool).  vdev_metaslab_init()
1935 	 * can't fail because the existing metaslabs are already in core, so
1936 	 * there's nothing to read from disk.
1937 	 */
1938 	VERIFY(vdev_metaslab_init(tvd, txg) == 0);
1939 
1940 	vdev_config_dirty(tvd);
1941 
1942 	/*
1943 	 * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
1944 	 * vd->vdev_detached is set and free vd's DTL object in syncing context.
1945 	 * But first make sure we're not on any *other* txg's DTL list, to
1946 	 * prevent vd from being accessed after it's freed.
1947 	 */
1948 	for (t = 0; t < TXG_SIZE; t++)
1949 		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
1950 	vd->vdev_detached = B_TRUE;
1951 	vdev_dirty(tvd, VDD_DTL, vd, txg);
1952 
1953 	error = spa_vdev_exit(spa, vd, txg, 0);
1954 
1955 	/*
1956 	 * If this was the removal of the original device in a hot spare vdev,
1957 	 * then we want to go through and remove the device from the hot spare
1958 	 * list of every other pool.
	 *
	 * Note that 'spa' is reused as the iteration cursor from here on, and
	 * that the result of each spa_vdev_remove() is deliberately ignored.
1959 	 */
1960 	if (unspare) {
1961 		spa = NULL;
1962 		mutex_enter(&spa_namespace_lock);
1963 		while ((spa = spa_next(spa)) != NULL) {
1964 			if (spa->spa_state != POOL_STATE_ACTIVE)
1965 				continue;
1966 
1967 			(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
1968 		}
1969 		mutex_exit(&spa_namespace_lock);
1970 	}
1971 
1972 	return (error);
1973 }
1974 
1975 /*
1976  * Remove a device from the pool.  Currently, this supports removing only hot
1977  * spares.
1978  */
1979 int
1980 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
1981 {
1982 	vdev_t *vd;
1983 	nvlist_t **spares, *nv, **newspares;
1984 	uint_t i, j, nspares;
1985 	int ret = 0;
1986 
1987 	spa_config_enter(spa, RW_WRITER, FTAG);
1988 
1989 	vd = spa_lookup_by_guid(spa, guid);
1990 
1991 	nv = NULL;
1992 	if (spa->spa_spares != NULL &&
1993 	    nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
1994 	    &spares, &nspares) == 0) {
1995 		for (i = 0; i < nspares; i++) {
1996 			uint64_t theguid;
1997 
1998 			VERIFY(nvlist_lookup_uint64(spares[i],
1999 			    ZPOOL_CONFIG_GUID, &theguid) == 0);
2000 			if (theguid == guid) {
2001 				nv = spares[i];
2002 				break;
2003 			}
2004 		}
2005 	}
2006 
2007 	/*
2008 	 * We only support removing a hot spare, and only if it's not currently
2009 	 * in use in this pool.
2010 	 */
2011 	if (nv == NULL && vd == NULL) {
2012 		ret = ENOENT;
2013 		goto out;
2014 	}
2015 
2016 	if (nv == NULL && vd != NULL) {
2017 		ret = ENOTSUP;
2018 		goto out;
2019 	}
2020 
2021 	if (!unspare && nv != NULL && vd != NULL) {
2022 		ret = EBUSY;
2023 		goto out;
2024 	}
2025 
2026 	if (nspares == 1) {
2027 		newspares = NULL;
2028 	} else {
2029 		newspares = kmem_alloc((nspares - 1) * sizeof (void *),
2030 		    KM_SLEEP);
2031 		for (i = 0, j = 0; i < nspares; i++) {
2032 			if (spares[i] != nv)
2033 				VERIFY(nvlist_dup(spares[i],
2034 				    &newspares[j++], KM_SLEEP) == 0);
2035 		}
2036 	}
2037 
2038 	VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
2039 	    DATA_TYPE_NVLIST_ARRAY) == 0);
2040 	VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
2041 	    newspares, nspares - 1) == 0);
2042 	for (i = 0; i < nspares - 1; i++)
2043 		nvlist_free(newspares[i]);
2044 	kmem_free(newspares, (nspares - 1) * sizeof (void *));
2045 	spa_load_spares(spa);
2046 	spa->spa_sync_spares = B_TRUE;
2047 
2048 out:
2049 	spa_config_exit(spa, FTAG);
2050 
2051 	return (ret);
2052 }
2053 
2054 /*
2055  * Find any device that's done replacing, so we can detach it.
2056  */
2057 static vdev_t *
2058 spa_vdev_replace_done_hunt(vdev_t *vd)
2059 {
2060 	vdev_t *newvd, *oldvd;
2061 	int c;
2062 
2063 	for (c = 0; c < vd->vdev_children; c++) {
2064 		oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]);
2065 		if (oldvd != NULL)
2066 			return (oldvd);
2067 	}
2068 
2069 	if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
2070 		oldvd = vd->vdev_child[0];
2071 		newvd = vd->vdev_child[1];
2072 
2073 		mutex_enter(&newvd->vdev_dtl_lock);
2074 		if (newvd->vdev_dtl_map.sm_space == 0 &&
2075 		    newvd->vdev_dtl_scrub.sm_space == 0) {
2076 			mutex_exit(&newvd->vdev_dtl_lock);
2077 			return (oldvd);
2078 		}
2079 		mutex_exit(&newvd->vdev_dtl_lock);
2080 	}
2081 
2082 	return (NULL);
2083 }
2084 
2085 static void
2086 spa_vdev_replace_done(spa_t *spa)
2087 {
2088 	vdev_t *vd;
2089 	vdev_t *pvd;
2090 	uint64_t guid;
2091 	uint64_t pguid = 0;
2092 
2093 	spa_config_enter(spa, RW_READER, FTAG);
2094 
2095 	while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) {
2096 		guid = vd->vdev_guid;
2097 		/*
2098 		 * If we have just finished replacing a hot spared device, then
2099 		 * we need to detach the parent's first child (the original hot
2100 		 * spare) as well.
2101 		 */
2102 		pvd = vd->vdev_parent;
2103 		if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
2104 		    pvd->vdev_id == 0) {
2105 			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
2106 			ASSERT(pvd->vdev_parent->vdev_children == 2);
2107 			pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid;
2108 		}
2109 		spa_config_exit(spa, FTAG);
2110 		if (spa_vdev_detach(spa, guid, B_TRUE) != 0)
2111 			return;
2112 		if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0)
2113 			return;
2114 		spa_config_enter(spa, RW_READER, FTAG);
2115 	}
2116 
2117 	spa_config_exit(spa, FTAG);
2118 }
2119 
2120 /*
2121  * Update the stored path for this vdev.  Dirty the vdev configuration, relying
2122  * on spa_vdev_enter/exit() to synchronize the labels and cache.
2123  */
2124 int
2125 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
2126 {
2127 	vdev_t *rvd, *vd;
2128 	uint64_t txg;
2129 
2130 	rvd = spa->spa_root_vdev;
2131 
2132 	txg = spa_vdev_enter(spa);
2133 
2134 	if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
2135 		/*
2136 		 * Determine if this is a reference to a hot spare.  In that
2137 		 * case, update the path as stored in the spare list.
2138 		 */
2139 		nvlist_t **spares;
2140 		uint_t i, nspares;
2141 		if (spa->spa_sparelist != NULL) {
2142 			VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
2143 			    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
2144 			for (i = 0; i < nspares; i++) {
2145 				uint64_t theguid;
2146 				VERIFY(nvlist_lookup_uint64(spares[i],
2147 				    ZPOOL_CONFIG_GUID, &theguid) == 0);
2148 				if (theguid == guid)
2149 					break;
2150 			}
2151 
2152 			if (i == nspares)
2153 				return (spa_vdev_exit(spa, NULL, txg, ENOENT));
2154 
2155 			VERIFY(nvlist_add_string(spares[i],
2156 			    ZPOOL_CONFIG_PATH, newpath) == 0);
2157 			spa_load_spares(spa);
2158 			spa->spa_sync_spares = B_TRUE;
2159 			return (spa_vdev_exit(spa, NULL, txg, 0));
2160 		} else {
2161 			return (spa_vdev_exit(spa, NULL, txg, ENOENT));
2162 		}
2163 	}
2164 
2165 	if (!vd->vdev_ops->vdev_op_leaf)
2166 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
2167 
2168 	spa_strfree(vd->vdev_path);
2169 	vd->vdev_path = spa_strdup(newpath);
2170 
2171 	vdev_config_dirty(vd->vdev_top);
2172 
2173 	return (spa_vdev_exit(spa, NULL, txg, 0));
2174 }
2175 
2176 /*
2177  * ==========================================================================
2178  * SPA Scrubbing
2179  * ==========================================================================
2180  */
2181 
2182 void
2183 spa_scrub_throttle(spa_t *spa, int direction)
2184 {
2185 	mutex_enter(&spa->spa_scrub_lock);
2186 	spa->spa_scrub_throttled += direction;
2187 	ASSERT(spa->spa_scrub_throttled >= 0);
2188 	if (spa->spa_scrub_throttled == 0)
2189 		cv_broadcast(&spa->spa_scrub_io_cv);
2190 	mutex_exit(&spa->spa_scrub_lock);
2191 }
2192 
2193 static void
2194 spa_scrub_io_done(zio_t *zio)
2195 {
2196 	spa_t *spa = zio->io_spa;
2197 
2198 	zio_data_buf_free(zio->io_data, zio->io_size);
2199 
2200 	mutex_enter(&spa->spa_scrub_lock);
2201 	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2202 		vdev_t *vd = zio->io_vd ? zio->io_vd : spa->spa_root_vdev;
2203 		spa->spa_scrub_errors++;
2204 		mutex_enter(&vd->vdev_stat_lock);
2205 		vd->vdev_stat.vs_scrub_errors++;
2206 		mutex_exit(&vd->vdev_stat_lock);
2207 	}
2208 	if (--spa->spa_scrub_inflight == 0) {
2209 		cv_broadcast(&spa->spa_scrub_io_cv);
2210 		ASSERT(spa->spa_scrub_throttled == 0);
2211 	}
2212 	mutex_exit(&spa->spa_scrub_lock);
2213 }
2214 
2215 static void
2216 spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags,
2217     zbookmark_t *zb)
2218 {
2219 	size_t size = BP_GET_LSIZE(bp);
2220 	void *data = zio_data_buf_alloc(size);
2221 
2222 	mutex_enter(&spa->spa_scrub_lock);
2223 	spa->spa_scrub_inflight++;
2224 	mutex_exit(&spa->spa_scrub_lock);
2225 
2226 	if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
2227 		flags |= ZIO_FLAG_SPECULATIVE;	/* intent log block */
2228 
2229 	flags |= ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL;
2230 
2231 	zio_nowait(zio_read(NULL, spa, bp, data, size,
2232 	    spa_scrub_io_done, NULL, priority, flags, zb));
2233 }
2234 
2235 /* ARGSUSED */
2236 static int
2237 spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
2238 {
2239 	blkptr_t *bp = &bc->bc_blkptr;
2240 	vdev_t *vd = spa->spa_root_vdev;
2241 	dva_t *dva = bp->blk_dva;
2242 	int needs_resilver = B_FALSE;
2243 	int d;
2244 
2245 	if (bc->bc_errno) {
2246 		/*
2247 		 * We can't scrub this block, but we can continue to scrub
2248 		 * the rest of the pool.  Note the error and move along.
2249 		 */
2250 		mutex_enter(&spa->spa_scrub_lock);
2251 		spa->spa_scrub_errors++;
2252 		mutex_exit(&spa->spa_scrub_lock);
2253 
2254 		mutex_enter(&vd->vdev_stat_lock);
2255 		vd->vdev_stat.vs_scrub_errors++;
2256 		mutex_exit(&vd->vdev_stat_lock);
2257 
2258 		return (ERESTART);
2259 	}
2260 
2261 	ASSERT(bp->blk_birth < spa->spa_scrub_maxtxg);
2262 
2263 	for (d = 0; d < BP_GET_NDVAS(bp); d++) {
2264 		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]));
2265 
2266 		ASSERT(vd != NULL);
2267 
2268 		/*
2269 		 * Keep track of how much data we've examined so that
2270 		 * zpool(1M) status can make useful progress reports.
2271 		 */
2272 		mutex_enter(&vd->vdev_stat_lock);
2273 		vd->vdev_stat.vs_scrub_examined += DVA_GET_ASIZE(&dva[d]);
2274 		mutex_exit(&vd->vdev_stat_lock);
2275 
2276 		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER) {
2277 			if (DVA_GET_GANG(&dva[d])) {
2278 				/*
2279 				 * Gang members may be spread across multiple
2280 				 * vdevs, so the best we can do is look at the
2281 				 * pool-wide DTL.
2282 				 * XXX -- it would be better to change our
2283 				 * allocation policy to ensure that this can't
2284 				 * happen.
2285 				 */
2286 				vd = spa->spa_root_vdev;
2287 			}
2288 			if (vdev_dtl_contains(&vd->vdev_dtl_map,
2289 			    bp->blk_birth, 1))
2290 				needs_resilver = B_TRUE;
2291 		}
2292 	}
2293 
2294 	if (spa->spa_scrub_type == POOL_SCRUB_EVERYTHING)
2295 		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
2296 		    ZIO_FLAG_SCRUB, &bc->bc_bookmark);
2297 	else if (needs_resilver)
2298 		spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
2299 		    ZIO_FLAG_RESILVER, &bc->bc_bookmark);
2300 
2301 	return (0);
2302 }
2303 
2304 static void
2305 spa_scrub_thread(spa_t *spa)
2306 {
2307 	callb_cpr_t cprinfo;
2308 	traverse_handle_t *th = spa->spa_scrub_th;
2309 	vdev_t *rvd = spa->spa_root_vdev;
2310 	pool_scrub_type_t scrub_type = spa->spa_scrub_type;
2311 	int error = 0;
2312 	boolean_t complete;
2313 
2314 	CALLB_CPR_INIT(&cprinfo, &spa->spa_scrub_lock, callb_generic_cpr, FTAG);
2315 
2316 	/*
2317 	 * If we're restarting due to a snapshot create/delete,
2318 	 * wait for that to complete.
2319 	 */
2320 	txg_wait_synced(spa_get_dsl(spa), 0);
2321 
2322 	dprintf("start %s mintxg=%llu maxtxg=%llu\n",
2323 	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
2324 	    spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg);
2325 
2326 	spa_config_enter(spa, RW_WRITER, FTAG);
2327 	vdev_reopen(rvd);		/* purge all vdev caches */
2328 	vdev_config_dirty(rvd);		/* rewrite all disk labels */
2329 	vdev_scrub_stat_update(rvd, scrub_type, B_FALSE);
2330 	spa_config_exit(spa, FTAG);
2331 
2332 	mutex_enter(&spa->spa_scrub_lock);
2333 	spa->spa_scrub_errors = 0;
2334 	spa->spa_scrub_active = 1;
2335 	ASSERT(spa->spa_scrub_inflight == 0);
2336 	ASSERT(spa->spa_scrub_throttled == 0);
2337 
2338 	while (!spa->spa_scrub_stop) {
2339 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
2340 		while (spa->spa_scrub_suspended) {
2341 			spa->spa_scrub_active = 0;
2342 			cv_broadcast(&spa->spa_scrub_cv);
2343 			cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
2344 			spa->spa_scrub_active = 1;
2345 		}
2346 		CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_scrub_lock);
2347 
2348 		if (spa->spa_scrub_restart_txg != 0)
2349 			break;
2350 
2351 		mutex_exit(&spa->spa_scrub_lock);
2352 		error = traverse_more(th);
2353 		mutex_enter(&spa->spa_scrub_lock);
2354 		if (error != EAGAIN)
2355 			break;
2356 
2357 		while (spa->spa_scrub_throttled > 0)
2358 			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
2359 	}
2360 
2361 	while (spa->spa_scrub_inflight)
2362 		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
2363 
2364 	spa->spa_scrub_active = 0;
2365 	cv_broadcast(&spa->spa_scrub_cv);
2366 
2367 	mutex_exit(&spa->spa_scrub_lock);
2368 
2369 	spa_config_enter(spa, RW_WRITER, FTAG);
2370 
2371 	mutex_enter(&spa->spa_scrub_lock);
2372 
2373 	/*
2374 	 * Note: we check spa_scrub_restart_txg under both spa_scrub_lock
2375 	 * AND the spa config lock to synchronize with any config changes
2376 	 * that revise the DTLs under spa_vdev_enter() / spa_vdev_exit().
2377 	 */
2378 	if (spa->spa_scrub_restart_txg != 0)
2379 		error = ERESTART;
2380 
2381 	if (spa->spa_scrub_stop)
2382 		error = EINTR;
2383 
2384 	/*
2385 	 * Even if there were uncorrectable errors, we consider the scrub
2386 	 * completed.  The downside is that if there is a transient error during
2387 	 * a resilver, we won't resilver the data properly to the target.  But
2388 	 * if the damage is permanent (more likely) we will resilver forever,
2389 	 * which isn't really acceptable.  Since there is enough information for
2390 	 * the user to know what has failed and why, this seems like a more
2391 	 * tractable approach.
2392 	 */
2393 	complete = (error == 0);
2394 
2395 	dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
2396 	    scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
2397 	    spa->spa_scrub_maxtxg, complete ? "done" : "FAILED",
2398 	    error, spa->spa_scrub_errors, spa->spa_scrub_stop);
2399 
2400 	mutex_exit(&spa->spa_scrub_lock);
2401 
2402 	/*
2403 	 * If the scrub/resilver completed, update all DTLs to reflect this.
2404 	 * Whether it succeeded or not, vacate all temporary scrub DTLs.
2405 	 */
2406 	vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1,
2407 	    complete ? spa->spa_scrub_maxtxg : 0, B_TRUE);
2408 	vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete);
2409 	spa_errlog_rotate(spa);
2410 
2411 	spa_config_exit(spa, FTAG);
2412 
2413 	mutex_enter(&spa->spa_scrub_lock);
2414 
2415 	/*
2416 	 * We may have finished replacing a device.
2417 	 * Let the async thread assess this and handle the detach.
2418 	 */
2419 	spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);
2420 
2421 	/*
2422 	 * If we were told to restart, our final act is to start a new scrub.
2423 	 */
2424 	if (error == ERESTART)
2425 		spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ?
2426 		    SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB);
2427 
2428 	spa->spa_scrub_type = POOL_SCRUB_NONE;
2429 	spa->spa_scrub_active = 0;
2430 	spa->spa_scrub_thread = NULL;
2431 	cv_broadcast(&spa->spa_scrub_cv);
2432 	CALLB_CPR_EXIT(&cprinfo);	/* drops &spa->spa_scrub_lock */
2433 	thread_exit();
2434 }
2435 
2436 void
2437 spa_scrub_suspend(spa_t *spa)
2438 {
2439 	mutex_enter(&spa->spa_scrub_lock);
2440 	spa->spa_scrub_suspended++;
2441 	while (spa->spa_scrub_active) {
2442 		cv_broadcast(&spa->spa_scrub_cv);
2443 		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
2444 	}
2445 	while (spa->spa_scrub_inflight)
2446 		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
2447 	mutex_exit(&spa->spa_scrub_lock);
2448 }
2449 
2450 void
2451 spa_scrub_resume(spa_t *spa)
2452 {
2453 	mutex_enter(&spa->spa_scrub_lock);
2454 	ASSERT(spa->spa_scrub_suspended != 0);
2455 	if (--spa->spa_scrub_suspended == 0)
2456 		cv_broadcast(&spa->spa_scrub_cv);
2457 	mutex_exit(&spa->spa_scrub_lock);
2458 }
2459 
2460 void
2461 spa_scrub_restart(spa_t *spa, uint64_t txg)
2462 {
2463 	/*
2464 	 * Something happened (e.g. snapshot create/delete) that means
2465 	 * we must restart any in-progress scrubs.  The itinerary will
2466 	 * fix this properly.
2467 	 */
2468 	mutex_enter(&spa->spa_scrub_lock);
2469 	spa->spa_scrub_restart_txg = txg;
2470 	mutex_exit(&spa->spa_scrub_lock);
2471 }
2472 
2473 int
2474 spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
2475 {
2476 	space_seg_t *ss;
2477 	uint64_t mintxg, maxtxg;
2478 	vdev_t *rvd = spa->spa_root_vdev;
2479 
2480 	if ((uint_t)type >= POOL_SCRUB_TYPES)
2481 		return (ENOTSUP);
2482 
2483 	mutex_enter(&spa->spa_scrub_lock);
2484 
2485 	/*
2486 	 * If there's a scrub or resilver already in progress, stop it.
2487 	 */
2488 	while (spa->spa_scrub_thread != NULL) {
2489 		/*
2490 		 * Don't stop a resilver unless forced.
2491 		 */
2492 		if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) {
2493 			mutex_exit(&spa->spa_scrub_lock);
2494 			return (EBUSY);
2495 		}
2496 		spa->spa_scrub_stop = 1;
2497 		cv_broadcast(&spa->spa_scrub_cv);
2498 		cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
2499 	}
2500 
2501 	/*
2502 	 * Terminate the previous traverse.
2503 	 */
2504 	if (spa->spa_scrub_th != NULL) {
2505 		traverse_fini(spa->spa_scrub_th);
2506 		spa->spa_scrub_th = NULL;
2507 	}
2508 
2509 	if (rvd == NULL) {
2510 		ASSERT(spa->spa_scrub_stop == 0);
2511 		ASSERT(spa->spa_scrub_type == type);
2512 		ASSERT(spa->spa_scrub_restart_txg == 0);
2513 		mutex_exit(&spa->spa_scrub_lock);
2514 		return (0);
2515 	}
2516 
2517 	mintxg = TXG_INITIAL - 1;
2518 	maxtxg = spa_last_synced_txg(spa) + 1;
2519 
2520 	mutex_enter(&rvd->vdev_dtl_lock);
2521 
2522 	if (rvd->vdev_dtl_map.sm_space == 0) {
2523 		/*
2524 		 * The pool-wide DTL is empty.
2525 		 * If this is a resilver, there's nothing to do except
2526 		 * check whether any in-progress replacements have completed.
2527 		 */
2528 		if (type == POOL_SCRUB_RESILVER) {
2529 			type = POOL_SCRUB_NONE;
2530 			spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);
2531 		}
2532 	} else {
2533 		/*
2534 		 * The pool-wide DTL is non-empty.
2535 		 * If this is a normal scrub, upgrade to a resilver instead.
2536 		 */
2537 		if (type == POOL_SCRUB_EVERYTHING)
2538 			type = POOL_SCRUB_RESILVER;
2539 	}
2540 
2541 	if (type == POOL_SCRUB_RESILVER) {
2542 		/*
2543 		 * Determine the resilvering boundaries.
2544 		 *
2545 		 * Note: (mintxg, maxtxg) is an open interval,
2546 		 * i.e. mintxg and maxtxg themselves are not included.
2547 		 *
2548 		 * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
2549 		 * so we don't claim to resilver a txg that's still changing.
2550 		 */
2551 		ss = avl_first(&rvd->vdev_dtl_map.sm_root);
2552 		mintxg = ss->ss_start - 1;
2553 		ss = avl_last(&rvd->vdev_dtl_map.sm_root);
2554 		maxtxg = MIN(ss->ss_end, maxtxg);
2555 	}
2556 
2557 	mutex_exit(&rvd->vdev_dtl_lock);
2558 
2559 	spa->spa_scrub_stop = 0;
2560 	spa->spa_scrub_type = type;
2561 	spa->spa_scrub_restart_txg = 0;
2562 
2563 	if (type != POOL_SCRUB_NONE) {
2564 		spa->spa_scrub_mintxg = mintxg;
2565 		spa->spa_scrub_maxtxg = maxtxg;
2566 		spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
2567 		    ADVANCE_PRE | ADVANCE_PRUNE | ADVANCE_ZIL,
2568 		    ZIO_FLAG_CANFAIL);
2569 		traverse_add_pool(spa->spa_scrub_th, mintxg, maxtxg);
2570 		spa->spa_scrub_thread = thread_create(NULL, 0,
2571 		    spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
2572 	}
2573 
2574 	mutex_exit(&spa->spa_scrub_lock);
2575 
2576 	return (0);
2577 }
2578 
2579 /*
2580  * ==========================================================================
2581  * SPA async task processing
2582  * ==========================================================================
2583  */
2584 
2585 static void
2586 spa_async_reopen(spa_t *spa)
2587 {
2588 	vdev_t *rvd = spa->spa_root_vdev;
2589 	vdev_t *tvd;
2590 	int c;
2591 
2592 	spa_config_enter(spa, RW_WRITER, FTAG);
2593 
2594 	for (c = 0; c < rvd->vdev_children; c++) {
2595 		tvd = rvd->vdev_child[c];
2596 		if (tvd->vdev_reopen_wanted) {
2597 			tvd->vdev_reopen_wanted = 0;
2598 			vdev_reopen(tvd);
2599 		}
2600 	}
2601 
2602 	spa_config_exit(spa, FTAG);
2603 }
2604 
2605 static void
2606 spa_async_thread(spa_t *spa)
2607 {
2608 	int tasks;
2609 
2610 	ASSERT(spa->spa_sync_on);
2611 
2612 	mutex_enter(&spa->spa_async_lock);
2613 	tasks = spa->spa_async_tasks;
2614 	spa->spa_async_tasks = 0;
2615 	mutex_exit(&spa->spa_async_lock);
2616 
2617 	/*
2618 	 * See if the config needs to be updated.
2619 	 */
2620 	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
2621 		mutex_enter(&spa_namespace_lock);
2622 		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
2623 		mutex_exit(&spa_namespace_lock);
2624 	}
2625 
2626 	/*
2627 	 * See if any devices need to be reopened.
2628 	 */
2629 	if (tasks & SPA_ASYNC_REOPEN)
2630 		spa_async_reopen(spa);
2631 
2632 	/*
2633 	 * If any devices are done replacing, detach them.
2634 	 */
2635 	if (tasks & SPA_ASYNC_REPLACE_DONE)
2636 		spa_vdev_replace_done(spa);
2637 
2638 	/*
2639 	 * Kick off a scrub.
2640 	 */
2641 	if (tasks & SPA_ASYNC_SCRUB)
2642 		VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0);
2643 
2644 	/*
2645 	 * Kick off a resilver.
2646 	 */
2647 	if (tasks & SPA_ASYNC_RESILVER)
2648 		VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
2649 
2650 	/*
2651 	 * Let the world know that we're done.
2652 	 */
2653 	mutex_enter(&spa->spa_async_lock);
2654 	spa->spa_async_thread = NULL;
2655 	cv_broadcast(&spa->spa_async_cv);
2656 	mutex_exit(&spa->spa_async_lock);
2657 	thread_exit();
2658 }
2659 
2660 void
2661 spa_async_suspend(spa_t *spa)
2662 {
2663 	mutex_enter(&spa->spa_async_lock);
2664 	spa->spa_async_suspended++;
2665 	while (spa->spa_async_thread != NULL)
2666 		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
2667 	mutex_exit(&spa->spa_async_lock);
2668 }
2669 
2670 void
2671 spa_async_resume(spa_t *spa)
2672 {
2673 	mutex_enter(&spa->spa_async_lock);
2674 	ASSERT(spa->spa_async_suspended != 0);
2675 	spa->spa_async_suspended--;
2676 	mutex_exit(&spa->spa_async_lock);
2677 }
2678 
2679 static void
2680 spa_async_dispatch(spa_t *spa)
2681 {
2682 	mutex_enter(&spa->spa_async_lock);
2683 	if (spa->spa_async_tasks && !spa->spa_async_suspended &&
2684 	    spa->spa_async_thread == NULL &&
2685 	    rootdir != NULL && !vn_is_readonly(rootdir))
2686 		spa->spa_async_thread = thread_create(NULL, 0,
2687 		    spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
2688 	mutex_exit(&spa->spa_async_lock);
2689 }
2690 
2691 void
2692 spa_async_request(spa_t *spa, int task)
2693 {
2694 	mutex_enter(&spa->spa_async_lock);
2695 	spa->spa_async_tasks |= task;
2696 	mutex_exit(&spa->spa_async_lock);
2697 }
2698 
2699 /*
2700  * ==========================================================================
2701  * SPA syncing routines
2702  * ==========================================================================
2703  */
2704 
2705 static void
2706 spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
2707 {
2708 	bplist_t *bpl = &spa->spa_sync_bplist;
2709 	dmu_tx_t *tx;
2710 	blkptr_t blk;
2711 	uint64_t itor = 0;
2712 	zio_t *zio;
2713 	int error;
2714 	uint8_t c = 1;
2715 
2716 	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CONFIG_HELD);
2717 
2718 	while (bplist_iterate(bpl, &itor, &blk) == 0)
2719 		zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL));
2720 
2721 	error = zio_wait(zio);
2722 	ASSERT3U(error, ==, 0);
2723 
2724 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
2725 	bplist_vacate(bpl, tx);
2726 
2727 	/*
2728 	 * Pre-dirty the first block so we sync to convergence faster.
2729 	 * (Usually only the first block is needed.)
2730 	 */
2731 	dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
2732 	dmu_tx_commit(tx);
2733 }
2734 
2735 static void
2736 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
2737 {
2738 	char *packed = NULL;
2739 	size_t nvsize = 0;
2740 	dmu_buf_t *db;
2741 
2742 	VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
2743 
2744 	packed = kmem_alloc(nvsize, KM_SLEEP);
2745 
2746 	VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
2747 	    KM_SLEEP) == 0);
2748 
2749 	dmu_write(spa->spa_meta_objset, obj, 0, nvsize, packed, tx);
2750 
2751 	kmem_free(packed, nvsize);
2752 
2753 	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
2754 	dmu_buf_will_dirty(db, tx);
2755 	*(uint64_t *)db->db_data = nvsize;
2756 	dmu_buf_rele(db, FTAG);
2757 }
2758 
2759 static void
2760 spa_sync_spares(spa_t *spa, dmu_tx_t *tx)
2761 {
2762 	nvlist_t *nvroot;
2763 	nvlist_t **spares;
2764 	int i;
2765 
2766 	if (!spa->spa_sync_spares)
2767 		return;
2768 
2769 	/*
2770 	 * Update the MOS nvlist describing the list of available spares.
2771 	 * spa_validate_spares() will have already made sure this nvlist is
2772 	 * valid and the vdevs are labelled appropriately.
2773 	 */
2774 	if (spa->spa_spares_object == 0) {
2775 		spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset,
2776 		    DMU_OT_PACKED_NVLIST, 1 << 14,
2777 		    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
2778 		VERIFY(zap_update(spa->spa_meta_objset,
2779 		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES,
2780 		    sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0);
2781 	}
2782 
2783 	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
2784 	if (spa->spa_nspares == 0) {
2785 		VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
2786 		    NULL, 0) == 0);
2787 	} else {
2788 		spares = kmem_alloc(spa->spa_nspares * sizeof (void *),
2789 		    KM_SLEEP);
2790 		for (i = 0; i < spa->spa_nspares; i++)
2791 			spares[i] = vdev_config_generate(spa,
2792 			    spa->spa_spares[i], B_FALSE, B_TRUE);
2793 		VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
2794 		    spares, spa->spa_nspares) == 0);
2795 		for (i = 0; i < spa->spa_nspares; i++)
2796 			nvlist_free(spares[i]);
2797 		kmem_free(spares, spa->spa_nspares * sizeof (void *));
2798 	}
2799 
2800 	spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx);
2801 	nvlist_free(nvroot);
2802 
2803 	spa->spa_sync_spares = B_FALSE;
2804 }
2805 
2806 static void
2807 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
2808 {
2809 	nvlist_t *config;
2810 
2811 	if (list_is_empty(&spa->spa_dirty_list))
2812 		return;
2813 
2814 	config = spa_config_generate(spa, NULL, dmu_tx_get_txg(tx), B_FALSE);
2815 
2816 	if (spa->spa_config_syncing)
2817 		nvlist_free(spa->spa_config_syncing);
2818 	spa->spa_config_syncing = config;
2819 
2820 	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
2821 }
2822 
2823 /*
2824  * Sync the specified transaction group.  New blocks may be dirtied as
2825  * part of the process, so we iterate until it converges.
2826  */
2827 void
2828 spa_sync(spa_t *spa, uint64_t txg)
2829 {
2830 	dsl_pool_t *dp = spa->spa_dsl_pool;
2831 	objset_t *mos = spa->spa_meta_objset;
2832 	bplist_t *bpl = &spa->spa_sync_bplist;
2833 	vdev_t *rvd = spa->spa_root_vdev;
2834 	vdev_t *vd;
2835 	dmu_tx_t *tx;
2836 	int dirty_vdevs;
2837 
2838 	/*
2839 	 * Lock out configuration changes.
2840 	 */
2841 	spa_config_enter(spa, RW_READER, FTAG);
2842 
2843 	spa->spa_syncing_txg = txg;
2844 	spa->spa_sync_pass = 0;
2845 
2846 	VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));
2847 
2848 	tx = dmu_tx_create_assigned(dp, txg);
2849 
2850 	/*
2851 	 * If we are upgrading to ZFS_VERSION_RAIDZ_DEFLATE this txg,
2852 	 * set spa_deflate if we have no raid-z vdevs.
2853 	 */
2854 	if (spa->spa_ubsync.ub_version < ZFS_VERSION_RAIDZ_DEFLATE &&
2855 	    spa->spa_uberblock.ub_version >= ZFS_VERSION_RAIDZ_DEFLATE) {
2856 		int i;
2857 
2858 		for (i = 0; i < rvd->vdev_children; i++) {
2859 			vd = rvd->vdev_child[i];
2860 			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
2861 				break;
2862 		}
2863 		if (i == rvd->vdev_children) {
2864 			spa->spa_deflate = TRUE;
2865 			VERIFY(0 == zap_add(spa->spa_meta_objset,
2866 			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
2867 			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
2868 		}
2869 	}
2870 
2871 	/*
2872 	 * If anything has changed in this txg, push the deferred frees
2873 	 * from the previous txg.  If not, leave them alone so that we
2874 	 * don't generate work on an otherwise idle system.
2875 	 */
2876 	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
2877 	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
2878 	    !txg_list_empty(&dp->dp_sync_tasks, txg))
2879 		spa_sync_deferred_frees(spa, txg);
2880 
2881 	/*
2882 	 * Iterate to convergence.
2883 	 */
2884 	do {
2885 		spa->spa_sync_pass++;
2886 
2887 		spa_sync_config_object(spa, tx);
2888 		spa_sync_spares(spa, tx);
2889 		spa_errlog_sync(spa, txg);
2890 		dsl_pool_sync(dp, txg);
2891 
2892 		dirty_vdevs = 0;
2893 		while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) {
2894 			vdev_sync(vd, txg);
2895 			dirty_vdevs++;
2896 		}
2897 
2898 		bplist_sync(bpl, tx);
2899 	} while (dirty_vdevs);
2900 
2901 	bplist_close(bpl);
2902 
2903 	dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass);
2904 
2905 	/*
2906 	 * Rewrite the vdev configuration (which includes the uberblock)
2907 	 * to commit the transaction group.
2908 	 *
2909 	 * If there are any dirty vdevs, sync the uberblock to all vdevs.
2910 	 * Otherwise, pick a random top-level vdev that's known to be
2911 	 * visible in the config cache (see spa_vdev_add() for details).
2912 	 * If the write fails, try the next vdev until we're tried them all.
2913 	 */
2914 	if (!list_is_empty(&spa->spa_dirty_list)) {
2915 		VERIFY(vdev_config_sync(rvd, txg) == 0);
2916 	} else {
2917 		int children = rvd->vdev_children;
2918 		int c0 = spa_get_random(children);
2919 		int c;
2920 
2921 		for (c = 0; c < children; c++) {
2922 			vd = rvd->vdev_child[(c0 + c) % children];
2923 			if (vd->vdev_ms_array == 0)
2924 				continue;
2925 			if (vdev_config_sync(vd, txg) == 0)
2926 				break;
2927 		}
2928 		if (c == children)
2929 			VERIFY(vdev_config_sync(rvd, txg) == 0);
2930 	}
2931 
2932 	dmu_tx_commit(tx);
2933 
2934 	/*
2935 	 * Clear the dirty config list.
2936 	 */
2937 	while ((vd = list_head(&spa->spa_dirty_list)) != NULL)
2938 		vdev_config_clean(vd);
2939 
2940 	/*
2941 	 * Now that the new config has synced transactionally,
2942 	 * let it become visible to the config cache.
2943 	 */
2944 	if (spa->spa_config_syncing != NULL) {
2945 		spa_config_set(spa, spa->spa_config_syncing);
2946 		spa->spa_config_txg = txg;
2947 		spa->spa_config_syncing = NULL;
2948 	}
2949 
2950 	/*
2951 	 * Make a stable copy of the fully synced uberblock.
2952 	 * We use this as the root for pool traversals.
2953 	 */
2954 	spa->spa_traverse_wanted = 1;	/* tells traverse_more() to stop */
2955 
2956 	spa_scrub_suspend(spa);		/* stop scrubbing and finish I/Os */
2957 
2958 	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
2959 	spa->spa_traverse_wanted = 0;
2960 	spa->spa_ubsync = spa->spa_uberblock;
2961 	rw_exit(&spa->spa_traverse_lock);
2962 
2963 	spa_scrub_resume(spa);		/* resume scrub with new ubsync */
2964 
2965 	/*
2966 	 * Clean up the ZIL records for the synced txg.
2967 	 */
2968 	dsl_pool_zil_clean(dp);
2969 
2970 	/*
2971 	 * Update usable space statistics.
2972 	 */
2973 	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
2974 		vdev_sync_done(vd, txg);
2975 
2976 	/*
2977 	 * It had better be the case that we didn't dirty anything
2978 	 * since vdev_config_sync().
2979 	 */
2980 	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
2981 	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
2982 	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
2983 	ASSERT(bpl->bpl_queue == NULL);
2984 
2985 	spa_config_exit(spa, FTAG);
2986 
2987 	/*
2988 	 * If any async tasks have been requested, kick them off.
2989 	 */
2990 	spa_async_dispatch(spa);
2991 }
2992 
2993 /*
2994  * Sync all pools.  We don't want to hold the namespace lock across these
2995  * operations, so we take a reference on the spa_t and drop the lock during the
2996  * sync.
2997  */
2998 void
2999 spa_sync_allpools(void)
3000 {
3001 	spa_t *spa = NULL;
3002 	mutex_enter(&spa_namespace_lock);
3003 	while ((spa = spa_next(spa)) != NULL) {
3004 		if (spa_state(spa) != POOL_STATE_ACTIVE)
3005 			continue;
3006 		spa_open_ref(spa, FTAG);
3007 		mutex_exit(&spa_namespace_lock);
3008 		txg_wait_synced(spa_get_dsl(spa), 0);
3009 		mutex_enter(&spa_namespace_lock);
3010 		spa_close(spa, FTAG);
3011 	}
3012 	mutex_exit(&spa_namespace_lock);
3013 }
3014 
3015 /*
3016  * ==========================================================================
3017  * Miscellaneous routines
3018  * ==========================================================================
3019  */
3020 
3021 /*
3022  * Remove all pools in the system.
3023  */
3024 void
3025 spa_evict_all(void)
3026 {
3027 	spa_t *spa;
3028 
3029 	/*
3030 	 * Remove all cached state.  All pools should be closed now,
3031 	 * so every spa in the AVL tree should be unreferenced.
3032 	 */
3033 	mutex_enter(&spa_namespace_lock);
3034 	while ((spa = spa_next(NULL)) != NULL) {
3035 		/*
3036 		 * Stop async tasks.  The async thread may need to detach
3037 		 * a device that's been replaced, which requires grabbing
3038 		 * spa_namespace_lock, so we must drop it here.
3039 		 */
3040 		spa_open_ref(spa, FTAG);
3041 		mutex_exit(&spa_namespace_lock);
3042 		spa_async_suspend(spa);
3043 		VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
3044 		mutex_enter(&spa_namespace_lock);
3045 		spa_close(spa, FTAG);
3046 
3047 		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
3048 			spa_unload(spa);
3049 			spa_deactivate(spa);
3050 		}
3051 		spa_remove(spa);
3052 	}
3053 	mutex_exit(&spa_namespace_lock);
3054 }
3055 
3056 vdev_t *
3057 spa_lookup_by_guid(spa_t *spa, uint64_t guid)
3058 {
3059 	return (vdev_lookup_by_guid(spa->spa_root_vdev, guid));
3060 }
3061 
3062 void
3063 spa_upgrade(spa_t *spa)
3064 {
3065 	spa_config_enter(spa, RW_WRITER, FTAG);
3066 
3067 	/*
3068 	 * This should only be called for a non-faulted pool, and since a
3069 	 * future version would result in an unopenable pool, this shouldn't be
3070 	 * possible.
3071 	 */
3072 	ASSERT(spa->spa_uberblock.ub_version <= ZFS_VERSION);
3073 
3074 	spa->spa_uberblock.ub_version = ZFS_VERSION;
3075 	vdev_config_dirty(spa->spa_root_vdev);
3076 
3077 	spa_config_exit(spa, FTAG);
3078 
3079 	txg_wait_synced(spa_get_dsl(spa), 0);
3080 }
3081 
3082 boolean_t
3083 spa_has_spare(spa_t *spa, uint64_t guid)
3084 {
3085 	int i;
3086 	uint64_t spareguid;
3087 
3088 	for (i = 0; i < spa->spa_nspares; i++)
3089 		if (spa->spa_spares[i]->vdev_guid == guid)
3090 			return (B_TRUE);
3091 
3092 	for (i = 0; i < spa->spa_pending_nspares; i++) {
3093 		if (nvlist_lookup_uint64(spa->spa_pending_spares[i],
3094 		    ZPOOL_CONFIG_GUID, &spareguid) == 0 &&
3095 		    spareguid == guid)
3096 			return (B_TRUE);
3097 	}
3098 
3099 	return (B_FALSE);
3100 }
3101