xref: /titanic_41/usr/src/uts/common/fs/zfs/vdev.c (revision b509e89b2befbaa42939abad9da1d7f5a8c6aaae)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/zfs_context.h>
28 #include <sys/fm/fs/zfs.h>
29 #include <sys/spa.h>
30 #include <sys/spa_impl.h>
31 #include <sys/dmu.h>
32 #include <sys/dmu_tx.h>
33 #include <sys/vdev_impl.h>
34 #include <sys/uberblock_impl.h>
35 #include <sys/metaslab.h>
36 #include <sys/metaslab_impl.h>
37 #include <sys/space_map.h>
38 #include <sys/zio.h>
39 #include <sys/zap.h>
40 #include <sys/fs/zfs.h>
41 #include <sys/arc.h>
42 #include <sys/zil.h>
43 
44 /*
45  * Virtual device management.
46  */
47 
48 static vdev_ops_t *vdev_ops_table[] = {
49 	&vdev_root_ops,
50 	&vdev_raidz_ops,
51 	&vdev_mirror_ops,
52 	&vdev_replacing_ops,
53 	&vdev_spare_ops,
54 	&vdev_disk_ops,
55 	&vdev_file_ops,
56 	&vdev_missing_ops,
57 	NULL
58 };
59 
60 /* maximum scrub/resilver I/O queue per leaf vdev */
61 int zfs_scrub_limit = 10;
62 
63 /*
64  * Given a vdev type, return the appropriate ops vector.
65  */
66 static vdev_ops_t *
67 vdev_getops(const char *type)
68 {
69 	vdev_ops_t *ops, **opspp;
70 
71 	for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
72 		if (strcmp(ops->vdev_op_type, type) == 0)
73 			break;
74 
75 	return (ops);
76 }
77 
78 /*
79  * Default asize function: return the MAX of psize with the asize of
80  * all children.  This is what's used by anything other than RAID-Z.
81  */
82 uint64_t
83 vdev_default_asize(vdev_t *vd, uint64_t psize)
84 {
85 	uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
86 	uint64_t csize;
87 	uint64_t c;
88 
89 	for (c = 0; c < vd->vdev_children; c++) {
90 		csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
91 		asize = MAX(asize, csize);
92 	}
93 
94 	return (asize);
95 }
96 
97 /*
98  * Get the replaceable or attachable device size.
99  * If the parent is a mirror or raidz, the replaceable size is the minimum
100  * psize of all its children. For the rest, just return our own psize.
101  *
102  * e.g.
103  *			psize	rsize
104  * root			-	-
105  *	mirror/raidz	-	-
106  *	    disk1	20g	20g
107  *	    disk2 	40g	20g
108  *	disk3 		80g	80g
109  */
110 uint64_t
111 vdev_get_rsize(vdev_t *vd)
112 {
113 	vdev_t *pvd, *cvd;
114 	uint64_t c, rsize;
115 
116 	pvd = vd->vdev_parent;
117 
118 	/*
119 	 * If our parent is NULL or the root, just return our own psize.
120 	 */
121 	if (pvd == NULL || pvd->vdev_parent == NULL)
122 		return (vd->vdev_psize);
123 
124 	rsize = 0;
125 
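	/*
	 * Note: rsize starts at 0, so on the first pass MIN(rsize - 1, ...)
	 * wraps to UINT64_MAX and the first child's psize wins; after that,
	 * rsize tracks the minimum psize of all children.
	 */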
126 	for (c = 0; c < pvd->vdev_children; c++) {
127 		cvd = pvd->vdev_child[c];
128 		rsize = MIN(rsize - 1, cvd->vdev_psize - 1) + 1;
129 	}
130 
131 	return (rsize);
132 }
133 
134 vdev_t *
135 vdev_lookup_top(spa_t *spa, uint64_t vdev)
136 {
137 	vdev_t *rvd = spa->spa_root_vdev;
138 
139 	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
140 
141 	if (vdev < rvd->vdev_children) {
142 		ASSERT(rvd->vdev_child[vdev] != NULL);
143 		return (rvd->vdev_child[vdev]);
144 	}
145 
146 	return (NULL);
147 }
148 
149 vdev_t *
150 vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
151 {
152 	int c;
153 	vdev_t *mvd;
154 
155 	if (vd->vdev_guid == guid)
156 		return (vd);
157 
158 	for (c = 0; c < vd->vdev_children; c++)
159 		if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
160 		    NULL)
161 			return (mvd);
162 
163 	return (NULL);
164 }
165 
166 void
167 vdev_add_child(vdev_t *pvd, vdev_t *cvd)
168 {
169 	size_t oldsize, newsize;
170 	uint64_t id = cvd->vdev_id;
171 	vdev_t **newchild;
172 
173 	ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
174 	ASSERT(cvd->vdev_parent == NULL);
175 
176 	cvd->vdev_parent = pvd;
177 
178 	if (pvd == NULL)
179 		return;
180 
181 	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
182 
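	/*
	 * Grow the child array so that slot 'id' exists; any intervening
	 * slots remain NULL (holes are removed later by
	 * vdev_compact_children()).
	 */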
183 	oldsize = pvd->vdev_children * sizeof (vdev_t *);
184 	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
185 	newsize = pvd->vdev_children * sizeof (vdev_t *);
186 
187 	newchild = kmem_zalloc(newsize, KM_SLEEP);
188 	if (pvd->vdev_child != NULL) {
189 		bcopy(pvd->vdev_child, newchild, oldsize);
190 		kmem_free(pvd->vdev_child, oldsize);
191 	}
192 
193 	pvd->vdev_child = newchild;
194 	pvd->vdev_child[id] = cvd;
195 
196 	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
197 	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
198 
199 	/*
200 	 * Walk up all ancestors to update guid sum.
201 	 */
202 	for (; pvd != NULL; pvd = pvd->vdev_parent)
203 		pvd->vdev_guid_sum += cvd->vdev_guid_sum;
204 
205 	if (cvd->vdev_ops->vdev_op_leaf)
206 		cvd->vdev_spa->spa_scrub_maxinflight += zfs_scrub_limit;
207 }
208 
209 void
210 vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
211 {
212 	int c;
213 	uint_t id = cvd->vdev_id;
214 
215 	ASSERT(cvd->vdev_parent == pvd);
216 
217 	if (pvd == NULL)
218 		return;
219 
220 	ASSERT(id < pvd->vdev_children);
221 	ASSERT(pvd->vdev_child[id] == cvd);
222 
223 	pvd->vdev_child[id] = NULL;
224 	cvd->vdev_parent = NULL;
225 
226 	for (c = 0; c < pvd->vdev_children; c++)
227 		if (pvd->vdev_child[c])
228 			break;
229 
230 	if (c == pvd->vdev_children) {
231 		kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
232 		pvd->vdev_child = NULL;
233 		pvd->vdev_children = 0;
234 	}
235 
236 	/*
237 	 * Walk up all ancestors to update guid sum.
238 	 */
239 	for (; pvd != NULL; pvd = pvd->vdev_parent)
240 		pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
241 
242 	if (cvd->vdev_ops->vdev_op_leaf)
243 		cvd->vdev_spa->spa_scrub_maxinflight -= zfs_scrub_limit;
244 }
245 
246 /*
247  * Remove any holes in the child array.
248  */
249 void
250 vdev_compact_children(vdev_t *pvd)
251 {
252 	vdev_t **newchild, *cvd;
253 	int oldc = pvd->vdev_children;
254 	int newc, c;
255 
256 	ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
257 
258 	for (c = newc = 0; c < oldc; c++)
259 		if (pvd->vdev_child[c])
260 			newc++;
261 
262 	newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);
263 
264 	for (c = newc = 0; c < oldc; c++) {
265 		if ((cvd = pvd->vdev_child[c]) != NULL) {
266 			newchild[newc] = cvd;
267 			cvd->vdev_id = newc++;
268 		}
269 	}
270 
271 	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
272 	pvd->vdev_child = newchild;
273 	pvd->vdev_children = newc;
274 }
275 
276 /*
277  * Allocate and minimally initialize a vdev_t.
278  */
279 static vdev_t *
280 vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
281 {
282 	vdev_t *vd;
283 
284 	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
285 
286 	if (spa->spa_root_vdev == NULL) {
287 		ASSERT(ops == &vdev_root_ops);
288 		spa->spa_root_vdev = vd;
289 	}
290 
291 	if (guid == 0) {
292 		if (spa->spa_root_vdev == vd) {
293 			/*
294 			 * The root vdev's guid will also be the pool guid,
295 			 * which must be unique among all pools.
296 			 */
297 			while (guid == 0 || spa_guid_exists(guid, 0))
298 				guid = spa_get_random(-1ULL);
299 		} else {
300 			/*
301 			 * Any other vdev's guid must be unique within the pool.
302 			 */
303 			while (guid == 0 ||
304 			    spa_guid_exists(spa_guid(spa), guid))
305 				guid = spa_get_random(-1ULL);
306 		}
307 		ASSERT(!spa_guid_exists(spa_guid(spa), guid));
308 	}
309 
310 	vd->vdev_spa = spa;
311 	vd->vdev_id = id;
312 	vd->vdev_guid = guid;
313 	vd->vdev_guid_sum = guid;
314 	vd->vdev_ops = ops;
315 	vd->vdev_state = VDEV_STATE_CLOSED;
316 
317 	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
318 	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
319 	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
320 	for (int t = 0; t < DTL_TYPES; t++) {
321 		space_map_create(&vd->vdev_dtl[t], 0, -1ULL, 0,
322 		    &vd->vdev_dtl_lock);
323 	}
324 	txg_list_create(&vd->vdev_ms_list,
325 	    offsetof(struct metaslab, ms_txg_node));
326 	txg_list_create(&vd->vdev_dtl_list,
327 	    offsetof(struct vdev, vdev_dtl_node));
328 	vd->vdev_stat.vs_timestamp = gethrtime();
329 	vdev_queue_init(vd);
330 	vdev_cache_init(vd);
331 
332 	return (vd);
333 }
334 
335 /*
336  * Allocate a new vdev.  The 'alloctype' is used to control whether we are
337  * creating a new vdev or loading an existing one - the behavior is slightly
338  * different for each case.
339  */
340 int
341 vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
342     int alloctype)
343 {
344 	vdev_ops_t *ops;
345 	char *type;
346 	uint64_t guid = 0, islog, nparity;
347 	vdev_t *vd;
348 
349 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
350 
351 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
352 		return (EINVAL);
353 
354 	if ((ops = vdev_getops(type)) == NULL)
355 		return (EINVAL);
356 
357 	/*
358 	 * If this is a load, get the vdev guid from the nvlist.
359 	 * Otherwise, vdev_alloc_common() will generate one for us.
360 	 */
361 	if (alloctype == VDEV_ALLOC_LOAD) {
362 		uint64_t label_id;
363 
364 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
365 		    label_id != id)
366 			return (EINVAL);
367 
368 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
369 			return (EINVAL);
370 	} else if (alloctype == VDEV_ALLOC_SPARE) {
371 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
372 			return (EINVAL);
373 	} else if (alloctype == VDEV_ALLOC_L2CACHE) {
374 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
375 			return (EINVAL);
376 	} else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
377 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
378 			return (EINVAL);
379 	}
380 
381 	/*
382 	 * The first allocated vdev must be of type 'root'.
383 	 */
384 	if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
385 		return (EINVAL);
386 
387 	/*
388 	 * Determine whether we're a log vdev.
389 	 */
390 	islog = 0;
391 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
392 	if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
393 		return (ENOTSUP);
394 
395 	/*
396 	 * Set the nparity property for RAID-Z vdevs.
397 	 */
398 	nparity = -1ULL;
399 	if (ops == &vdev_raidz_ops) {
400 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
401 		    &nparity) == 0) {
402 			/*
403 			 * Currently, we can only support 2 parity devices.
404 			 * Currently, we support at most 2 parity devices.
405 			if (nparity == 0 || nparity > 2)
406 				return (EINVAL);
407 			/*
408 			 * Older versions can only support 1 parity device.
409 			 */
410 			if (nparity == 2 &&
411 			    spa_version(spa) < SPA_VERSION_RAID6)
412 				return (ENOTSUP);
413 		} else {
414 			/*
415 			 * We require the parity to be specified for SPAs that
416 			 * support multiple parity levels.
417 			 */
418 			if (spa_version(spa) >= SPA_VERSION_RAID6)
419 				return (EINVAL);
420 			/*
421 			 * Otherwise, we default to 1 parity device for RAID-Z.
422 			 */
423 			nparity = 1;
424 		}
425 	} else {
426 		nparity = 0;
427 	}
428 	ASSERT(nparity != -1ULL);
429 
430 	vd = vdev_alloc_common(spa, id, guid, ops);
431 
432 	vd->vdev_islog = islog;
433 	vd->vdev_nparity = nparity;
434 
435 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
436 		vd->vdev_path = spa_strdup(vd->vdev_path);
437 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
438 		vd->vdev_devid = spa_strdup(vd->vdev_devid);
439 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
440 	    &vd->vdev_physpath) == 0)
441 		vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
442 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
443 		vd->vdev_fru = spa_strdup(vd->vdev_fru);
444 
445 	/*
446 	 * Set the whole_disk property.  If it's not specified, leave the value
447 	 * as -1.
448 	 */
449 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
450 	    &vd->vdev_wholedisk) != 0)
451 		vd->vdev_wholedisk = -1ULL;
452 
453 	/*
454 	 * Look for the 'not present' flag.  This will only be set if the device
455 	 * was not present at the time of import.
456 	 */
457 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
458 	    &vd->vdev_not_present);
459 
460 	/*
461 	 * Get the alignment requirement.
462 	 */
463 	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);
464 
465 	/*
466 	 * If we're a top-level vdev, try to load the allocation parameters.
467 	 */
468 	if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) {
469 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
470 		    &vd->vdev_ms_array);
471 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
472 		    &vd->vdev_ms_shift);
473 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
474 		    &vd->vdev_asize);
475 	}
476 
477 	/*
478 	 * If we're a leaf vdev, try to load the DTL object and other state.
479 	 */
480 	if (vd->vdev_ops->vdev_op_leaf &&
481 	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
482 	    alloctype == VDEV_ALLOC_ROOTPOOL)) {
483 		if (alloctype == VDEV_ALLOC_LOAD) {
484 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
485 			    &vd->vdev_dtl_smo.smo_object);
486 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
487 			    &vd->vdev_unspare);
488 		}
489 
490 		if (alloctype == VDEV_ALLOC_ROOTPOOL) {
491 			uint64_t spare = 0;
492 
493 			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
494 			    &spare) == 0 && spare)
495 				spa_spare_add(vd);
496 		}
497 
498 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
499 		    &vd->vdev_offline);
500 
501 		/*
502 		 * When importing a pool, we want to ignore the persistent fault
503 		 * state, as the diagnosis made on another system may not be
504 		 * valid in the current context.
505 		 */
506 		if (spa->spa_load_state == SPA_LOAD_OPEN) {
507 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
508 			    &vd->vdev_faulted);
509 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
510 			    &vd->vdev_degraded);
511 			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
512 			    &vd->vdev_removed);
513 		}
514 	}
515 
516 	/*
517 	 * Add ourselves to the parent's list of children.
518 	 */
519 	vdev_add_child(parent, vd);
520 
521 	*vdp = vd;
522 
523 	return (0);
524 }
525 
526 void
527 vdev_free(vdev_t *vd)
528 {
529 	int c;
530 	spa_t *spa = vd->vdev_spa;
531 
532 	/*
533 	 * vdev_free() implies closing the vdev first.  This is simpler than
534 	 * trying to ensure complicated semantics for all callers.
535 	 */
536 	vdev_close(vd);
537 
538 	ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
539 
540 	/*
541 	 * Free all children.
542 	 */
543 	for (c = 0; c < vd->vdev_children; c++)
544 		vdev_free(vd->vdev_child[c]);
545 
546 	ASSERT(vd->vdev_child == NULL);
547 	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
548 
549 	/*
550 	 * Discard allocation state.
551 	 */
552 	if (vd == vd->vdev_top)
553 		vdev_metaslab_fini(vd);
554 
555 	ASSERT3U(vd->vdev_stat.vs_space, ==, 0);
556 	ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0);
557 	ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);
558 
559 	/*
560 	 * Remove this vdev from its parent's child list.
561 	 */
562 	vdev_remove_child(vd->vdev_parent, vd);
563 
564 	ASSERT(vd->vdev_parent == NULL);
565 
566 	/*
567 	 * Clean up vdev structure.
568 	 */
569 	vdev_queue_fini(vd);
570 	vdev_cache_fini(vd);
571 
572 	if (vd->vdev_path)
573 		spa_strfree(vd->vdev_path);
574 	if (vd->vdev_devid)
575 		spa_strfree(vd->vdev_devid);
576 	if (vd->vdev_physpath)
577 		spa_strfree(vd->vdev_physpath);
578 	if (vd->vdev_fru)
579 		spa_strfree(vd->vdev_fru);
580 
581 	if (vd->vdev_isspare)
582 		spa_spare_remove(vd);
583 	if (vd->vdev_isl2cache)
584 		spa_l2cache_remove(vd);
585 
586 	txg_list_destroy(&vd->vdev_ms_list);
587 	txg_list_destroy(&vd->vdev_dtl_list);
588 
589 	mutex_enter(&vd->vdev_dtl_lock);
590 	for (int t = 0; t < DTL_TYPES; t++) {
591 		space_map_unload(&vd->vdev_dtl[t]);
592 		space_map_destroy(&vd->vdev_dtl[t]);
593 	}
594 	mutex_exit(&vd->vdev_dtl_lock);
595 
596 	mutex_destroy(&vd->vdev_dtl_lock);
597 	mutex_destroy(&vd->vdev_stat_lock);
598 	mutex_destroy(&vd->vdev_probe_lock);
599 
600 	if (vd == spa->spa_root_vdev)
601 		spa->spa_root_vdev = NULL;
602 
603 	kmem_free(vd, sizeof (vdev_t));
604 }
605 
606 /*
607  * Transfer top-level vdev state from svd to tvd.
608  */
609 static void
610 vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
611 {
612 	spa_t *spa = svd->vdev_spa;
613 	metaslab_t *msp;
614 	vdev_t *vd;
615 	int t;
616 
617 	ASSERT(tvd == tvd->vdev_top);
618 
619 	tvd->vdev_ms_array = svd->vdev_ms_array;
620 	tvd->vdev_ms_shift = svd->vdev_ms_shift;
621 	tvd->vdev_ms_count = svd->vdev_ms_count;
622 
623 	svd->vdev_ms_array = 0;
624 	svd->vdev_ms_shift = 0;
625 	svd->vdev_ms_count = 0;
626 
627 	tvd->vdev_mg = svd->vdev_mg;
628 	tvd->vdev_ms = svd->vdev_ms;
629 
630 	svd->vdev_mg = NULL;
631 	svd->vdev_ms = NULL;
632 
633 	if (tvd->vdev_mg != NULL)
634 		tvd->vdev_mg->mg_vd = tvd;
635 
636 	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
637 	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
638 	tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;
639 
640 	svd->vdev_stat.vs_alloc = 0;
641 	svd->vdev_stat.vs_space = 0;
642 	svd->vdev_stat.vs_dspace = 0;
643 
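	/*
	 * Migrate any per-txg dirty state (metaslabs, DTLs, and the vdev
	 * itself) from svd's lists to tvd's so it is synced under the new
	 * top-level vdev.
	 */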
644 	for (t = 0; t < TXG_SIZE; t++) {
645 		while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
646 			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
647 		while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
648 			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
649 		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
650 			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
651 	}
652 
653 	if (list_link_active(&svd->vdev_config_dirty_node)) {
654 		vdev_config_clean(svd);
655 		vdev_config_dirty(tvd);
656 	}
657 
658 	if (list_link_active(&svd->vdev_state_dirty_node)) {
659 		vdev_state_clean(svd);
660 		vdev_state_dirty(tvd);
661 	}
662 
663 	tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
664 	svd->vdev_deflate_ratio = 0;
665 
666 	tvd->vdev_islog = svd->vdev_islog;
667 	svd->vdev_islog = 0;
668 }
669 
670 static void
671 vdev_top_update(vdev_t *tvd, vdev_t *vd)
672 {
673 	int c;
674 
675 	if (vd == NULL)
676 		return;
677 
678 	vd->vdev_top = tvd;
679 
680 	for (c = 0; c < vd->vdev_children; c++)
681 		vdev_top_update(tvd, vd->vdev_child[c]);
682 }
683 
684 /*
685  * Add a mirror/replacing vdev above an existing vdev.
686  */
687 vdev_t *
688 vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
689 {
690 	spa_t *spa = cvd->vdev_spa;
691 	vdev_t *pvd = cvd->vdev_parent;
692 	vdev_t *mvd;
693 
694 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
695 
696 	mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
697 
698 	mvd->vdev_asize = cvd->vdev_asize;
699 	mvd->vdev_ashift = cvd->vdev_ashift;
700 	mvd->vdev_state = cvd->vdev_state;
701 
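	/*
	 * Splice mvd into cvd's old slot in the tree, then hang cvd off mvd.
	 */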
702 	vdev_remove_child(pvd, cvd);
703 	vdev_add_child(pvd, mvd);
704 	cvd->vdev_id = mvd->vdev_children;
705 	vdev_add_child(mvd, cvd);
706 	vdev_top_update(cvd->vdev_top, cvd->vdev_top);
707 
708 	if (mvd == mvd->vdev_top)
709 		vdev_top_transfer(cvd, mvd);
710 
711 	return (mvd);
712 }
713 
714 /*
715  * Remove a 1-way mirror/replacing vdev from the tree.
716  */
717 void
718 vdev_remove_parent(vdev_t *cvd)
719 {
720 	vdev_t *mvd = cvd->vdev_parent;
721 	vdev_t *pvd = mvd->vdev_parent;
722 
723 	ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
724 
725 	ASSERT(mvd->vdev_children == 1);
726 	ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
727 	    mvd->vdev_ops == &vdev_replacing_ops ||
728 	    mvd->vdev_ops == &vdev_spare_ops);
729 	cvd->vdev_ashift = mvd->vdev_ashift;
730 
731 	vdev_remove_child(mvd, cvd);
732 	vdev_remove_child(pvd, mvd);
733 
734 	/*
735 	 * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
736 	 * Otherwise, we could have detached an offline device, and when we
737 	 * go to import the pool we'll think we have two top-level vdevs,
738 	 * instead of a different version of the same top-level vdev.
739 	 */
740 	if (mvd->vdev_top == mvd) {
741 		uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
742 		cvd->vdev_guid += guid_delta;
743 		cvd->vdev_guid_sum += guid_delta;
744 	}
745 	cvd->vdev_id = mvd->vdev_id;
746 	vdev_add_child(pvd, cvd);
747 	vdev_top_update(cvd->vdev_top, cvd->vdev_top);
748 
749 	if (cvd == cvd->vdev_top)
750 		vdev_top_transfer(mvd, cvd);
751 
752 	ASSERT(mvd->vdev_children == 0);
753 	vdev_free(mvd);
754 }
755 
756 int
757 vdev_metaslab_init(vdev_t *vd, uint64_t txg)
758 {
759 	spa_t *spa = vd->vdev_spa;
760 	objset_t *mos = spa->spa_meta_objset;
761 	metaslab_class_t *mc;
762 	uint64_t m;
763 	uint64_t oldc = vd->vdev_ms_count;
764 	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
765 	metaslab_t **mspp;
766 	int error;
767 
768 	if (vd->vdev_ms_shift == 0)	/* not being allocated from yet */
769 		return (0);
770 
771 	/*
772 	 * Compute the raidz-deflation ratio.  Note, we hard-code
773 	 * in 128k (1 << 17) because it is the current "typical" blocksize.
774 	 * Even if SPA_MAXBLOCKSIZE changes, this algorithm must never change,
775 	 * or we will inconsistently account for existing bp's.
776 	 */
777 	vd->vdev_deflate_ratio = (1 << 17) /
778 	    (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
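	/*
	 * For a plain disk, psize_to_asize(128K) == 128K, so the ratio is
	 * 128K / (128K >> 9) == 512; RAID-Z vdevs return a larger asize for
	 * the same psize, so their ratio is proportionally smaller.
	 */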
779 
780 	ASSERT(oldc <= newc);
781 
782 	if (vd->vdev_islog)
783 		mc = spa->spa_log_class;
784 	else
785 		mc = spa->spa_normal_class;
786 
787 	if (vd->vdev_mg == NULL)
788 		vd->vdev_mg = metaslab_group_create(mc, vd);
789 
790 	mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
791 
792 	if (oldc != 0) {
793 		bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
794 		kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
795 	}
796 
797 	vd->vdev_ms = mspp;
798 	vd->vdev_ms_count = newc;
799 
800 	for (m = oldc; m < newc; m++) {
801 		space_map_obj_t smo = { 0, 0, 0 };
802 		if (txg == 0) {
803 			uint64_t object = 0;
804 			error = dmu_read(mos, vd->vdev_ms_array,
805 			    m * sizeof (uint64_t), sizeof (uint64_t), &object,
806 			    DMU_READ_PREFETCH);
807 			if (error)
808 				return (error);
809 			if (object != 0) {
810 				dmu_buf_t *db;
811 				error = dmu_bonus_hold(mos, object, FTAG, &db);
812 				if (error)
813 					return (error);
814 				ASSERT3U(db->db_size, >=, sizeof (smo));
815 				bcopy(db->db_data, &smo, sizeof (smo));
816 				ASSERT3U(smo.smo_object, ==, object);
817 				dmu_buf_rele(db, FTAG);
818 			}
819 		}
820 		vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo,
821 		    m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
822 	}
823 
824 	return (0);
825 }
826 
827 void
828 vdev_metaslab_fini(vdev_t *vd)
829 {
830 	uint64_t m;
831 	uint64_t count = vd->vdev_ms_count;
832 
833 	if (vd->vdev_ms != NULL) {
834 		for (m = 0; m < count; m++)
835 			if (vd->vdev_ms[m] != NULL)
836 				metaslab_fini(vd->vdev_ms[m]);
837 		kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
838 		vd->vdev_ms = NULL;
839 	}
840 }
841 
842 typedef struct vdev_probe_stats {
843 	boolean_t	vps_readable;
844 	boolean_t	vps_writeable;
845 	int		vps_flags;
846 } vdev_probe_stats_t;
847 
848 static void
849 vdev_probe_done(zio_t *zio)
850 {
851 	spa_t *spa = zio->io_spa;
852 	vdev_t *vd = zio->io_vd;
853 	vdev_probe_stats_t *vps = zio->io_private;
854 
855 	ASSERT(vd->vdev_probe_zio != NULL);
856 
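	/*
	 * The probe is a NULL zio with one read child per label pad region.
	 * Each successful read spawns a write back to the same location
	 * (when the pool is writable); the final NULL-zio completion below
	 * folds the results into vdev_cant_read/vdev_cant_write.
	 */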
857 	if (zio->io_type == ZIO_TYPE_READ) {
858 		if (zio->io_error == 0)
859 			vps->vps_readable = 1;
860 		if (zio->io_error == 0 && spa_writeable(spa)) {
861 			zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
862 			    zio->io_offset, zio->io_size, zio->io_data,
863 			    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
864 			    ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
865 		} else {
866 			zio_buf_free(zio->io_data, zio->io_size);
867 		}
868 	} else if (zio->io_type == ZIO_TYPE_WRITE) {
869 		if (zio->io_error == 0)
870 			vps->vps_writeable = 1;
871 		zio_buf_free(zio->io_data, zio->io_size);
872 	} else if (zio->io_type == ZIO_TYPE_NULL) {
873 		zio_t *pio;
874 
875 		vd->vdev_cant_read |= !vps->vps_readable;
876 		vd->vdev_cant_write |= !vps->vps_writeable;
877 
878 		if (vdev_readable(vd) &&
879 		    (vdev_writeable(vd) || !spa_writeable(spa))) {
880 			zio->io_error = 0;
881 		} else {
882 			ASSERT(zio->io_error != 0);
883 			zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
884 			    spa, vd, NULL, 0, 0);
885 			zio->io_error = ENXIO;
886 		}
887 
888 		mutex_enter(&vd->vdev_probe_lock);
889 		ASSERT(vd->vdev_probe_zio == zio);
890 		vd->vdev_probe_zio = NULL;
891 		mutex_exit(&vd->vdev_probe_lock);
892 
893 		while ((pio = zio_walk_parents(zio)) != NULL)
894 			if (!vdev_accessible(vd, pio))
895 				pio->io_error = ENXIO;
896 
897 		kmem_free(vps, sizeof (*vps));
898 	}
899 }
900 
901 /*
902  * Determine whether this device is accessible by reading and writing
903  * to several known locations: the pad regions of each vdev label
904  * but the first (which we leave alone in case it contains a VTOC).
905  */
906 zio_t *
907 vdev_probe(vdev_t *vd, zio_t *zio)
908 {
909 	spa_t *spa = vd->vdev_spa;
910 	vdev_probe_stats_t *vps = NULL;
911 	zio_t *pio;
912 
913 	ASSERT(vd->vdev_ops->vdev_op_leaf);
914 
915 	/*
916 	 * Don't probe the probe.
917 	 */
918 	if (zio && (zio->io_flags & ZIO_FLAG_PROBE))
919 		return (NULL);
920 
921 	/*
922 	 * To prevent 'probe storms' when a device fails, we create
923 	 * just one probe i/o at a time.  All zios that want to probe
924 	 * this vdev will become parents of the probe io.
925 	 */
926 	mutex_enter(&vd->vdev_probe_lock);
927 
928 	if ((pio = vd->vdev_probe_zio) == NULL) {
929 		vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);
930 
931 		vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
932 		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
933 		    ZIO_FLAG_TRYHARD;
934 
935 		if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
936 			/*
937 			 * vdev_cant_read and vdev_cant_write can only
938 			 * transition from TRUE to FALSE when we have the
939 			 * SCL_ZIO lock as writer; otherwise they can only
940 			 * transition from FALSE to TRUE.  This ensures that
941 			 * any zio looking at these values can assume that
942 			 * failures persist for the life of the I/O.  That's
943 			 * important because when a device has intermittent
944 			 * connectivity problems, we want to ensure that
945 			 * they're ascribed to the device (ENXIO) and not
946 			 * the zio (EIO).
947 			 *
948 			 * Since we hold SCL_ZIO as writer here, clear both
949 			 * values so the probe can reevaluate from first
950 			 * principles.
951 			 */
952 			vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
953 			vd->vdev_cant_read = B_FALSE;
954 			vd->vdev_cant_write = B_FALSE;
955 		}
956 
957 		vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
958 		    vdev_probe_done, vps,
959 		    vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
960 
961 		if (zio != NULL) {
962 			vd->vdev_probe_wanted = B_TRUE;
963 			spa_async_request(spa, SPA_ASYNC_PROBE);
964 		}
965 	}
966 
967 	if (zio != NULL)
968 		zio_add_child(zio, pio);
969 
970 	mutex_exit(&vd->vdev_probe_lock);
971 
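	/*
	 * If vps is NULL, another thread already owns the probe; our zio was
	 * linked to it above, so there is nothing further to issue here.
	 */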
972 	if (vps == NULL) {
973 		ASSERT(zio != NULL);
974 		return (NULL);
975 	}
976 
977 	for (int l = 1; l < VDEV_LABELS; l++) {
978 		zio_nowait(zio_read_phys(pio, vd,
979 		    vdev_label_offset(vd->vdev_psize, l,
980 		    offsetof(vdev_label_t, vl_pad2)),
981 		    VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE),
982 		    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
983 		    ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
984 	}
985 
986 	if (zio == NULL)
987 		return (pio);
988 
989 	zio_nowait(pio);
990 	return (NULL);
991 }
992 
993 /*
994  * Prepare a virtual device for access.
995  */
996 int
997 vdev_open(vdev_t *vd)
998 {
999 	spa_t *spa = vd->vdev_spa;
1000 	int error;
1001 	int c;
1002 	uint64_t osize = 0;
1003 	uint64_t asize, psize;
1004 	uint64_t ashift = 0;
1005 
1006 	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
1007 
1008 	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
1009 	    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
1010 	    vd->vdev_state == VDEV_STATE_OFFLINE);
1011 
1012 	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
1013 	vd->vdev_cant_read = B_FALSE;
1014 	vd->vdev_cant_write = B_FALSE;
1015 
1016 	if (!vd->vdev_removed && vd->vdev_faulted) {
1017 		ASSERT(vd->vdev_children == 0);
1018 		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
1019 		    VDEV_AUX_ERR_EXCEEDED);
1020 		return (ENXIO);
1021 	} else if (vd->vdev_offline) {
1022 		ASSERT(vd->vdev_children == 0);
1023 		vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
1024 		return (ENXIO);
1025 	}
1026 
1027 	error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);
1028 
1029 	if (zio_injection_enabled && error == 0)
1030 		error = zio_handle_device_injection(vd, NULL, ENXIO);
1031 
1032 	if (error) {
1033 		if (vd->vdev_removed &&
1034 		    vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
1035 			vd->vdev_removed = B_FALSE;
1036 
1037 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1038 		    vd->vdev_stat.vs_aux);
1039 		return (error);
1040 	}
1041 
1042 	vd->vdev_removed = B_FALSE;
1043 
1044 	if (vd->vdev_degraded) {
1045 		ASSERT(vd->vdev_children == 0);
1046 		vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
1047 		    VDEV_AUX_ERR_EXCEEDED);
1048 	} else {
1049 		vd->vdev_state = VDEV_STATE_HEALTHY;
1050 	}
1051 
1052 	for (c = 0; c < vd->vdev_children; c++)
1053 		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
1054 			vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
1055 			    VDEV_AUX_NONE);
1056 			break;
1057 		}
1058 
1059 	osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
1060 
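	/*
	 * Leaf vdevs carve the front and back label regions out of the
	 * usable size; for interior vdevs, osize already reflects the
	 * children's allocatable space, so psize is left at 0.
	 */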
1061 	if (vd->vdev_children == 0) {
1062 		if (osize < SPA_MINDEVSIZE) {
1063 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1064 			    VDEV_AUX_TOO_SMALL);
1065 			return (EOVERFLOW);
1066 		}
1067 		psize = osize;
1068 		asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
1069 	} else {
1070 		if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
1071 		    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
1072 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1073 			    VDEV_AUX_TOO_SMALL);
1074 			return (EOVERFLOW);
1075 		}
1076 		psize = 0;
1077 		asize = osize;
1078 	}
1079 
1080 	vd->vdev_psize = psize;
1081 
1082 	if (vd->vdev_asize == 0) {
1083 		/*
1084 		 * This is the first-ever open, so use the computed values.
1085 		 * For testing purposes, a higher ashift can be requested.
1086 		 */
1087 		vd->vdev_asize = asize;
1088 		vd->vdev_ashift = MAX(ashift, vd->vdev_ashift);
1089 	} else {
1090 		/*
1091 		 * Make sure the alignment requirement hasn't increased.
1092 		 */
1093 		if (ashift > vd->vdev_top->vdev_ashift) {
1094 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1095 			    VDEV_AUX_BAD_LABEL);
1096 			return (EINVAL);
1097 		}
1098 
1099 		/*
1100 		 * Make sure the device hasn't shrunk.
1101 		 */
1102 		if (asize < vd->vdev_asize) {
1103 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1104 			    VDEV_AUX_BAD_LABEL);
1105 			return (EINVAL);
1106 		}
1107 
1108 		/*
1109 		 * If all children are healthy and the asize has increased,
1110 		 * then we've experienced dynamic LUN growth.
1111 		 */
1112 		if (vd->vdev_state == VDEV_STATE_HEALTHY &&
1113 		    asize > vd->vdev_asize) {
1114 			vd->vdev_asize = asize;
1115 		}
1116 	}
1117 
1118 	/*
1119 	 * Ensure we can issue some IO before declaring the
1120 	 * vdev open for business.
1121 	 */
1122 	if (vd->vdev_ops->vdev_op_leaf &&
1123 	    (error = zio_wait(vdev_probe(vd, NULL))) != 0) {
1124 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1125 		    VDEV_AUX_IO_FAILURE);
1126 		return (error);
1127 	}
1128 
1129 	/*
1130 	 * If a leaf vdev has a DTL, and seems healthy, then kick off a
1131 	 * resilver.  But don't do this if we are doing a reopen for a scrub,
1132 	 * since this would just restart the scrub we are already doing.
1133 	 */
1134 	if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen &&
1135 	    vdev_resilver_needed(vd, NULL, NULL))
1136 		spa_async_request(spa, SPA_ASYNC_RESILVER);
1137 
1138 	return (0);
1139 }
1140 
1141 /*
1142  * Called once the vdevs are all opened, this routine validates the label
1143  * contents.  This needs to be done before vdev_load() so that we don't
1144  * inadvertently do repair I/Os to the wrong device.
1145  *
1146  * This function will only return failure if one of the vdevs indicates that it
1147  * has since been destroyed or exported.  This is only possible if
1148  * /etc/zfs/zpool.cache was readonly at the time.  Otherwise, the vdev state
1149  * will be updated but the function will return 0.
1150  */
1151 int
1152 vdev_validate(vdev_t *vd)
1153 {
1154 	spa_t *spa = vd->vdev_spa;
1155 	int c;
1156 	nvlist_t *label;
1157 	uint64_t guid, top_guid;
1158 	uint64_t state;
1159 
1160 	for (c = 0; c < vd->vdev_children; c++)
1161 		if (vdev_validate(vd->vdev_child[c]) != 0)
1162 			return (EBADF);
1163 
1164 	/*
1165 	 * If the device has already failed, or was marked offline, don't do
1166 	 * any further validation.  Otherwise, label I/O will fail and we will
1167 	 * overwrite the previous state.
1168 	 */
1169 	if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
1170 
1171 		if ((label = vdev_label_read_config(vd)) == NULL) {
1172 			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1173 			    VDEV_AUX_BAD_LABEL);
1174 			return (0);
1175 		}
1176 
1177 		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
1178 		    &guid) != 0 || guid != spa_guid(spa)) {
1179 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1180 			    VDEV_AUX_CORRUPT_DATA);
1181 			nvlist_free(label);
1182 			return (0);
1183 		}
1184 
1185 		/*
1186 		 * If this vdev just became a top-level vdev because its
1187 		 * sibling was detached, it will have adopted the parent's
1188 		 * vdev guid -- but the label may or may not be on disk yet.
1189 		 * Fortunately, either version of the label will have the
1190 		 * same top guid, so if we're a top-level vdev, we can
1191 		 * safely compare to that instead.
1192 		 */
1193 		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
1194 		    &guid) != 0 ||
1195 		    nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID,
1196 		    &top_guid) != 0 ||
1197 		    (vd->vdev_guid != guid &&
1198 		    (vd->vdev_guid != top_guid || vd != vd->vdev_top))) {
1199 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1200 			    VDEV_AUX_CORRUPT_DATA);
1201 			nvlist_free(label);
1202 			return (0);
1203 		}
1204 
1205 		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
1206 		    &state) != 0) {
1207 			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1208 			    VDEV_AUX_CORRUPT_DATA);
1209 			nvlist_free(label);
1210 			return (0);
1211 		}
1212 
1213 		nvlist_free(label);
1214 
1215 		if (spa->spa_load_state == SPA_LOAD_OPEN &&
1216 		    state != POOL_STATE_ACTIVE)
1217 			return (EBADF);
1218 
1219 		/*
1220 		 * If we were able to open and validate a vdev that was
1221 		 * previously marked permanently unavailable, clear that state
1222 		 * now.
1223 		 */
1224 		if (vd->vdev_not_present)
1225 			vd->vdev_not_present = 0;
1226 	}
1227 
1228 	return (0);
1229 }
1230 
1231 /*
1232  * Close a virtual device.
1233  */
1234 void
1235 vdev_close(vdev_t *vd)
1236 {
1237 	spa_t *spa = vd->vdev_spa;
1238 
1239 	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
1240 
1241 	vd->vdev_ops->vdev_op_close(vd);
1242 
1243 	vdev_cache_purge(vd);
1244 
1245 	/*
1246 	 * We record the previous state before we close it, so that if we are
1247 	 * doing a reopen(), we don't generate FMA ereports if we notice that
1248 	 * it's still faulted.
1249 	 */
1250 	vd->vdev_prevstate = vd->vdev_state;
1251 
1252 	if (vd->vdev_offline)
1253 		vd->vdev_state = VDEV_STATE_OFFLINE;
1254 	else
1255 		vd->vdev_state = VDEV_STATE_CLOSED;
1256 	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
1257 }
1258 
1259 void
1260 vdev_reopen(vdev_t *vd)
1261 {
1262 	spa_t *spa = vd->vdev_spa;
1263 
1264 	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
1265 
1266 	vdev_close(vd);
1267 	(void) vdev_open(vd);
1268 
1269 	/*
1270 	 * Call vdev_validate() here to make sure we have the same device.
1271 	 * Otherwise, a device with an invalid label could be successfully
1272 	 * opened in response to vdev_reopen().
1273 	 */
1274 	if (vd->vdev_aux) {
1275 		(void) vdev_validate_aux(vd);
1276 		if (vdev_readable(vd) && vdev_writeable(vd) &&
1277 		    vd->vdev_aux == &spa->spa_l2cache &&
1278 		    !l2arc_vdev_present(vd)) {
1279 			uint64_t size = vdev_get_rsize(vd);
1280 			l2arc_add_vdev(spa, vd,
1281 			    VDEV_LABEL_START_SIZE,
1282 			    size - VDEV_LABEL_START_SIZE);
1283 		}
1284 	} else {
1285 		(void) vdev_validate(vd);
1286 	}
1287 
1288 	/*
1289 	 * Reassess parent vdev's health.
1290 	 */
1291 	vdev_propagate_state(vd);
1292 }
1293 
1294 int
1295 vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
1296 {
1297 	int error;
1298 
1299 	/*
1300 	 * Normally, partial opens (e.g. of a mirror) are allowed.
1301 	 * For a create, however, we want to fail the request if
1302 	 * there are any components we can't open.
1303 	 */
1304 	error = vdev_open(vd);
1305 
1306 	if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
1307 		vdev_close(vd);
1308 		return (error ? error : ENXIO);
1309 	}
1310 
1311 	/*
1312 	 * Recursively initialize all labels.
1313 	 */
1314 	if ((error = vdev_label_init(vd, txg, isreplacing ?
1315 	    VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
1316 		vdev_close(vd);
1317 		return (error);
1318 	}
1319 
1320 	return (0);
1321 }
1322 
1323 /*
1324  * This is the latter half of vdev_create().  It is distinct because it
1325  * involves initiating transactions in order to do metaslab creation.
1326  * For creation, we want to try to create all vdevs at once and then undo it
1327  * if anything fails; this is much harder if we have pending transactions.
1328  */
1329 void
1330 vdev_init(vdev_t *vd, uint64_t txg)
1331 {
1332 	/*
1333 	 * Aim for roughly 200 metaslabs per vdev.
1334 	 */
1335 	vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
1336 	vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
1337 
1338 	/*
1339 	 * Initialize the vdev's metaslabs.  This can't fail because
1340 	 * there's nothing to read when creating all new metaslabs.
1341 	 */
1342 	VERIFY(vdev_metaslab_init(vd, txg) == 0);
1343 }
1344 
1345 void
1346 vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
1347 {
1348 	ASSERT(vd == vd->vdev_top);
1349 	ASSERT(ISP2(flags));
1350 
1351 	if (flags & VDD_METASLAB)
1352 		(void) txg_list_add(&vd->vdev_ms_list, arg, txg);
1353 
1354 	if (flags & VDD_DTL)
1355 		(void) txg_list_add(&vd->vdev_dtl_list, arg, txg);
1356 
1357 	(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
1358 }
1359 
1360 /*
1361  * DTLs.
1362  *
1363  * A vdev's DTL (dirty time log) is the set of transaction groups for which
1364  * the vdev has less than perfect replication.  There are four kinds of DTL:
1365  *
1366  * DTL_MISSING: txgs for which the vdev has no valid copies of the data
1367  *
1368  * DTL_PARTIAL: txgs for which data is available, but not fully replicated
1369  *
1370  * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
1371  *	scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
1372  *	txgs that was scrubbed.
1373  *
1374  * DTL_OUTAGE: txgs which cannot currently be read, whether due to
1375  *	persistent errors or just some device being offline.
1376  *	Unlike the other three, the DTL_OUTAGE map is not generally
1377  *	maintained; it's only computed when needed, typically to
1378  *	determine whether a device can be detached.
1379  *
1380  * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
1381  * either has the data or it doesn't.
1382  *
1383  * For interior vdevs such as mirror and RAID-Z the picture is more complex.
1384  * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
1385  * if any child is less than fully replicated, then so is its parent.
1386  * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
1387  * comprising only those txgs which appear in more than 'maxfaults' children;
1388  * those are the txgs we don't have enough replication to read.  For example,
1389  * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
1390  * thus, its DTL_MISSING consists of the set of txgs that appear in more than
1391  * two child DTL_MISSING maps.
1392  *
1393  * It should be clear from the above that to compute the DTLs and outage maps
1394  * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
1395  * Therefore, that is all we keep on disk.  When loading the pool, or after
1396  * a configuration change, we generate all other DTLs from first principles.
1397  */
1398 void
1399 vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
1400 {
1401 	space_map_t *sm = &vd->vdev_dtl[t];
1402 
1403 	ASSERT(t < DTL_TYPES);
1404 	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
1405 
1406 	mutex_enter(sm->sm_lock);
1407 	if (!space_map_contains(sm, txg, size))
1408 		space_map_add(sm, txg, size);
1409 	mutex_exit(sm->sm_lock);
1410 }
1411 
1412 boolean_t
1413 vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
1414 {
1415 	space_map_t *sm = &vd->vdev_dtl[t];
1416 	boolean_t dirty = B_FALSE;
1417 
1418 	ASSERT(t < DTL_TYPES);
1419 	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
1420 
1421 	mutex_enter(sm->sm_lock);
1422 	if (sm->sm_space != 0)
1423 		dirty = space_map_contains(sm, txg, size);
1424 	mutex_exit(sm->sm_lock);
1425 
1426 	return (dirty);
1427 }
1428 
1429 boolean_t
1430 vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
1431 {
1432 	space_map_t *sm = &vd->vdev_dtl[t];
1433 	boolean_t empty;
1434 
1435 	mutex_enter(sm->sm_lock);
1436 	empty = (sm->sm_space == 0);
1437 	mutex_exit(sm->sm_lock);
1438 
1439 	return (empty);
1440 }
1441 
1442 /*
1443  * Reassess DTLs after a config change or scrub completion.
1444  */
1445 void
1446 vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
1447 {
1448 	spa_t *spa = vd->vdev_spa;
1449 	avl_tree_t reftree;
1450 	int minref;
1451 
1452 	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
1453 
1454 	for (int c = 0; c < vd->vdev_children; c++)
1455 		vdev_dtl_reassess(vd->vdev_child[c], txg,
1456 		    scrub_txg, scrub_done);
1457 
1458 	if (vd == spa->spa_root_vdev)
1459 		return;
1460 
1461 	if (vd->vdev_ops->vdev_op_leaf) {
1462 		mutex_enter(&vd->vdev_dtl_lock);
1463 		if (scrub_txg != 0 &&
1464 		    (spa->spa_scrub_started || spa->spa_scrub_errors == 0)) {
1465 			/* XXX should check scrub_done? */
1466 			/*
1467 			 * We completed a scrub up to scrub_txg.  If we
1468 			 * did it without rebooting, then the scrub dtl
1469 			 * will be valid, so excise the old region and
1470 			 * fold in the scrub dtl.  Otherwise, leave the
1471 			 * dtl as-is if there was an error.
1472 			 *
1473 			 * There's a little trick here: to excise the beginning
1474 			 * of the DTL_MISSING map, we put it into a reference
1475 			 * tree and then add a segment with refcnt -1 that
1476 			 * covers the range [0, scrub_txg).  This means
1477 			 * that each txg in that range has refcnt -1 or 0.
1478 			 * We then add DTL_SCRUB with a refcnt of 2, so that
1479 			 * entries in the range [0, scrub_txg) will have a
1480 			 * positive refcnt -- either 1 or 2.  We then convert
1481 			 * the reference tree into the new DTL_MISSING map.
1482 			 */
1483 			space_map_ref_create(&reftree);
1484 			space_map_ref_add_map(&reftree,
1485 			    &vd->vdev_dtl[DTL_MISSING], 1);
1486 			space_map_ref_add_seg(&reftree, 0, scrub_txg, -1);
1487 			space_map_ref_add_map(&reftree,
1488 			    &vd->vdev_dtl[DTL_SCRUB], 2);
1489 			space_map_ref_generate_map(&reftree,
1490 			    &vd->vdev_dtl[DTL_MISSING], 1);
1491 			space_map_ref_destroy(&reftree);
1492 		}
1493 		space_map_vacate(&vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
1494 		space_map_walk(&vd->vdev_dtl[DTL_MISSING],
1495 		    space_map_add, &vd->vdev_dtl[DTL_PARTIAL]);
1496 		if (scrub_done)
1497 			space_map_vacate(&vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
1498 		space_map_vacate(&vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
1499 		if (!vdev_readable(vd))
1500 			space_map_add(&vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
1501 		else
1502 			space_map_walk(&vd->vdev_dtl[DTL_MISSING],
1503 			    space_map_add, &vd->vdev_dtl[DTL_OUTAGE]);
1504 		mutex_exit(&vd->vdev_dtl_lock);
1505 
1506 		if (txg != 0)
1507 			vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
1508 		return;
1509 	}
1510 
1511 	mutex_enter(&vd->vdev_dtl_lock);
1512 	for (int t = 0; t < DTL_TYPES; t++) {
1513 		if (t == DTL_SCRUB)
1514 			continue;			/* leaf vdevs only */
1515 		if (t == DTL_PARTIAL)
1516 			minref = 1;			/* i.e. non-zero */
1517 		else if (vd->vdev_nparity != 0)
1518 			minref = vd->vdev_nparity + 1;	/* RAID-Z */
1519 		else
1520 			minref = vd->vdev_children;	/* any kind of mirror */
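		/*
		 * The parent's DTL is the set of txgs that appear in at
		 * least 'minref' of its children's DTLs.
		 */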
1521 		space_map_ref_create(&reftree);
1522 		for (int c = 0; c < vd->vdev_children; c++) {
1523 			vdev_t *cvd = vd->vdev_child[c];
1524 			mutex_enter(&cvd->vdev_dtl_lock);
1525 			space_map_ref_add_map(&reftree, &cvd->vdev_dtl[t], 1);
1526 			mutex_exit(&cvd->vdev_dtl_lock);
1527 		}
1528 		space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref);
1529 		space_map_ref_destroy(&reftree);
1530 	}
1531 	mutex_exit(&vd->vdev_dtl_lock);
1532 }
1533 
1534 static int
1535 vdev_dtl_load(vdev_t *vd)
1536 {
1537 	spa_t *spa = vd->vdev_spa;
1538 	space_map_obj_t *smo = &vd->vdev_dtl_smo;
1539 	objset_t *mos = spa->spa_meta_objset;
1540 	dmu_buf_t *db;
1541 	int error;
1542 
1543 	ASSERT(vd->vdev_children == 0);
1544 
1545 	if (smo->smo_object == 0)
1546 		return (0);
1547 
1548 	if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)
1549 		return (error);
1550 
1551 	ASSERT3U(db->db_size, >=, sizeof (*smo));
1552 	bcopy(db->db_data, smo, sizeof (*smo));
1553 	dmu_buf_rele(db, FTAG);
1554 
1555 	mutex_enter(&vd->vdev_dtl_lock);
1556 	error = space_map_load(&vd->vdev_dtl[DTL_MISSING],
1557 	    NULL, SM_ALLOC, smo, mos);
1558 	mutex_exit(&vd->vdev_dtl_lock);
1559 
1560 	return (error);
1561 }
1562 
1563 void
1564 vdev_dtl_sync(vdev_t *vd, uint64_t txg)
1565 {
1566 	spa_t *spa = vd->vdev_spa;
1567 	space_map_obj_t *smo = &vd->vdev_dtl_smo;
1568 	space_map_t *sm = &vd->vdev_dtl[DTL_MISSING];
1569 	objset_t *mos = spa->spa_meta_objset;
1570 	space_map_t smsync;
1571 	kmutex_t smlock;
1572 	dmu_buf_t *db;
1573 	dmu_tx_t *tx;
1574 
1575 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1576 
1577 	if (vd->vdev_detached) {
1578 		if (smo->smo_object != 0) {
1579 			int err = dmu_object_free(mos, smo->smo_object, tx);
1580 			ASSERT3U(err, ==, 0);
1581 			smo->smo_object = 0;
1582 		}
1583 		dmu_tx_commit(tx);
1584 		return;
1585 	}
1586 
1587 	if (smo->smo_object == 0) {
1588 		ASSERT(smo->smo_objsize == 0);
1589 		ASSERT(smo->smo_alloc == 0);
1590 		smo->smo_object = dmu_object_alloc(mos,
1591 		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
1592 		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
1593 		ASSERT(smo->smo_object != 0);
1594 		vdev_config_dirty(vd->vdev_top);
1595 	}
1596 
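	/*
	 * Snapshot the current DTL into a private space map under its own
	 * lock so the on-disk sync below doesn't hold vdev_dtl_lock across
	 * DMU operations.
	 */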
1597 	mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);
1598 
1599 	space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
1600 	    &smlock);
1601 
1602 	mutex_enter(&smlock);
1603 
1604 	mutex_enter(&vd->vdev_dtl_lock);
1605 	space_map_walk(sm, space_map_add, &smsync);
1606 	mutex_exit(&vd->vdev_dtl_lock);
1607 
1608 	space_map_truncate(smo, mos, tx);
1609 	space_map_sync(&smsync, SM_ALLOC, smo, mos, tx);
1610 
1611 	space_map_destroy(&smsync);
1612 
1613 	mutex_exit(&smlock);
1614 	mutex_destroy(&smlock);
1615 
1616 	VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
1617 	dmu_buf_will_dirty(db, tx);
1618 	ASSERT3U(db->db_size, >=, sizeof (*smo));
1619 	bcopy(smo, db->db_data, sizeof (*smo));
1620 	dmu_buf_rele(db, FTAG);
1621 
1622 	dmu_tx_commit(tx);
1623 }
1624 
1625 /*
1626  * Determine whether the specified vdev can be offlined/detached/removed
1627  * without losing data.
1628  */
1629 boolean_t
1630 vdev_dtl_required(vdev_t *vd)
1631 {
1632 	spa_t *spa = vd->vdev_spa;
1633 	vdev_t *tvd = vd->vdev_top;
1634 	uint8_t cant_read = vd->vdev_cant_read;
1635 	boolean_t required;
1636 
1637 	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
1638 
1639 	if (vd == spa->spa_root_vdev || vd == tvd)
1640 		return (B_TRUE);
1641 
1642 	/*
1643 	 * Temporarily mark the device as unreadable, and then determine
1644 	 * whether this results in any DTL outages in the top-level vdev.
1645 	 * If not, we can safely offline/detach/remove the device.
1646 	 */
1647 	vd->vdev_cant_read = B_TRUE;
1648 	vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
1649 	required = !vdev_dtl_empty(tvd, DTL_OUTAGE);
1650 	vd->vdev_cant_read = cant_read;
1651 	vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
1652 
1653 	return (required);
1654 }
1655 
1656 /*
1657  * Determine if resilver is needed, and if so the txg range.
1658  */
1659 boolean_t
1660 vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
1661 {
1662 	boolean_t needed = B_FALSE;
1663 	uint64_t thismin = UINT64_MAX;
1664 	uint64_t thismax = 0;
1665 
1666 	if (vd->vdev_children == 0) {
1667 		mutex_enter(&vd->vdev_dtl_lock);
1668 		if (vd->vdev_dtl[DTL_MISSING].sm_space != 0 &&
1669 		    vdev_writeable(vd)) {
1670 			space_seg_t *ss;
1671 
1672 			ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root);
1673 			thismin = ss->ss_start - 1;
1674 			ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root);
1675 			thismax = ss->ss_end;
1676 			needed = B_TRUE;
1677 		}
1678 		mutex_exit(&vd->vdev_dtl_lock);
1679 	} else {
1680 		for (int c = 0; c < vd->vdev_children; c++) {
1681 			vdev_t *cvd = vd->vdev_child[c];
1682 			uint64_t cmin, cmax;
1683 
1684 			if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
1685 				thismin = MIN(thismin, cmin);
1686 				thismax = MAX(thismax, cmax);
1687 				needed = B_TRUE;
1688 			}
1689 		}
1690 	}
1691 
1692 	if (needed && minp) {
1693 		*minp = thismin;
1694 		*maxp = thismax;
1695 	}
1696 	return (needed);
1697 }
1698 
1699 void
1700 vdev_load(vdev_t *vd)
1701 {
1702 	/*
1703 	 * Recursively load all children.
1704 	 */
1705 	for (int c = 0; c < vd->vdev_children; c++)
1706 		vdev_load(vd->vdev_child[c]);
1707 
1708 	/*
1709 	 * If this is a top-level vdev, initialize its metaslabs.
1710 	 */
1711 	if (vd == vd->vdev_top &&
1712 	    (vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
1713 	    vdev_metaslab_init(vd, 0) != 0))
1714 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1715 		    VDEV_AUX_CORRUPT_DATA);
1716 
1717 	/*
1718 	 * If this is a leaf vdev, load its DTL.
1719 	 */
1720 	if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0)
1721 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1722 		    VDEV_AUX_CORRUPT_DATA);
1723 }
1724 
1725 /*
1726  * The special vdev case is used for hot spares and l2cache devices.  Its
1727  * sole purpose is to set the vdev state for the associated vdev.  To do this,
1728  * we make sure that we can open the underlying device, then try to read the
1729  * label, and make sure that the label is sane and that it hasn't been
1730  * repurposed to another pool.
1731  */
1732 int
1733 vdev_validate_aux(vdev_t *vd)
1734 {
1735 	nvlist_t *label;
1736 	uint64_t guid, version;
1737 	uint64_t state;
1738 
1739 	if (!vdev_readable(vd))
1740 		return (0);
1741 
1742 	if ((label = vdev_label_read_config(vd)) == NULL) {
1743 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1744 		    VDEV_AUX_CORRUPT_DATA);
1745 		return (-1);
1746 	}
1747 
1748 	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
1749 	    version > SPA_VERSION ||
1750 	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
1751 	    guid != vd->vdev_guid ||
1752 	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
1753 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
1754 		    VDEV_AUX_CORRUPT_DATA);
1755 		nvlist_free(label);
1756 		return (-1);
1757 	}
1758 
1759 	/*
1760 	 * We don't actually check the pool state here.  If it's in fact in
1761 	 * use by another pool, we update this fact on the fly when requested.
1762 	 */
1763 	nvlist_free(label);
1764 	return (0);
1765 }
1766 
1767 void
1768 vdev_sync_done(vdev_t *vd, uint64_t txg)
1769 {
1770 	metaslab_t *msp;
1771 
1772 	while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
1773 		metaslab_sync_done(msp, txg);
1774 }
1775 
1776 void
1777 vdev_sync(vdev_t *vd, uint64_t txg)
1778 {
1779 	spa_t *spa = vd->vdev_spa;
1780 	vdev_t *lvd;
1781 	metaslab_t *msp;
1782 	dmu_tx_t *tx;
1783 
1784 	if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) {
1785 		ASSERT(vd == vd->vdev_top);
1786 		tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1787 		vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
1788 		    DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
1789 		ASSERT(vd->vdev_ms_array != 0);
1790 		vdev_config_dirty(vd);
1791 		dmu_tx_commit(tx);
1792 	}
1793 
1794 	while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
1795 		metaslab_sync(msp, txg);
1796 		(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
1797 	}
1798 
1799 	while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
1800 		vdev_dtl_sync(lvd, txg);
1801 
1802 	(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
1803 }
1804 
1805 uint64_t
1806 vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
1807 {
1808 	return (vd->vdev_ops->vdev_op_asize(vd, psize));
1809 }
1810 
1811 /*
1812  * Mark the given vdev faulted.  A faulted vdev behaves as if the device could
1813  * not be opened, and no I/O is attempted.
1814  */
1815 int
1816 vdev_fault(spa_t *spa, uint64_t guid)
1817 {
1818 	vdev_t *vd;
1819 
1820 	spa_vdev_state_enter(spa);
1821 
1822 	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
1823 		return (spa_vdev_state_exit(spa, NULL, ENODEV));
1824 
1825 	if (!vd->vdev_ops->vdev_op_leaf)
1826 		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
1827 
1828 	/*
1829 	 * Faulted state takes precedence over degraded.
1830 	 */
1831 	vd->vdev_faulted = 1ULL;
1832 	vd->vdev_degraded = 0ULL;
1833 	vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED);
1834 
1835 	/*
1836 	 * If marking the vdev as faulted causes the top-level vdev to become
1837 	 * unavailable, then back off and simply mark the vdev as degraded
1838 	 * instead.
1839 	 */
1840 	if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) {
1841 		vd->vdev_degraded = 1ULL;
1842 		vd->vdev_faulted = 0ULL;
1843 
1844 		/*
1845 		 * If we reopen the device and it's not dead, only then do we
1846 		 * mark it degraded.
1847 		 */
1848 		vdev_reopen(vd);
1849 
1850 		if (vdev_readable(vd)) {
1851 			vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
1852 			    VDEV_AUX_ERR_EXCEEDED);
1853 		}
1854 	}
1855 
1856 	return (spa_vdev_state_exit(spa, vd, 0));
1857 }
1858 
1859 /*
1860  * Mark the given vdev degraded.  A degraded vdev is purely an indication to the
1861  * user that something is wrong.  The vdev continues to operate as normal as far
1862  * as I/O is concerned.
1863  */
1864 int
1865 vdev_degrade(spa_t *spa, uint64_t guid)
1866 {
1867 	vdev_t *vd;
1868 
1869 	spa_vdev_state_enter(spa);
1870 
1871 	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
1872 		return (spa_vdev_state_exit(spa, NULL, ENODEV));
1873 
1874 	if (!vd->vdev_ops->vdev_op_leaf)
1875 		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
1876 
1877 	/*
1878 	 * If the vdev is already faulted, then don't do anything.
1879 	 */
1880 	if (vd->vdev_faulted || vd->vdev_degraded)
1881 		return (spa_vdev_state_exit(spa, NULL, 0));
1882 
1883 	vd->vdev_degraded = 1ULL;
1884 	if (!vdev_is_dead(vd))
1885 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
1886 		    VDEV_AUX_ERR_EXCEEDED);
1887 
1888 	return (spa_vdev_state_exit(spa, vd, 0));
1889 }
1890 
1891 /*
1892  * Online the given vdev.  If 'unspare' is set, it implies two things.  First,
1893  * any attached spare device should be detached when the device finishes
1894  * resilvering.  Second, the online should be treated like a 'test' online case,
1895  * so no FMA events are generated if the device fails to open.
1896  */
1897 int
1898 vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
1899 {
1900 	vdev_t *vd;
1901 
1902 	spa_vdev_state_enter(spa);
1903 
1904 	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
1905 		return (spa_vdev_state_exit(spa, NULL, ENODEV));
1906 
1907 	if (!vd->vdev_ops->vdev_op_leaf)
1908 		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
1909 
1910 	vd->vdev_offline = B_FALSE;
1911 	vd->vdev_tmpoffline = B_FALSE;
1912 	vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
1913 	vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);
1914 	vdev_reopen(vd->vdev_top);
1915 	vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;
1916 
1917 	if (newstate)
1918 		*newstate = vd->vdev_state;
1919 	if ((flags & ZFS_ONLINE_UNSPARE) &&
1920 	    !vdev_is_dead(vd) && vd->vdev_parent &&
1921 	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
1922 	    vd->vdev_parent->vdev_child[0] == vd)
1923 		vd->vdev_unspare = B_TRUE;
1924 
1925 	return (spa_vdev_state_exit(spa, vd, 0));
1926 }
1927 
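/*
 * Take the given leaf vdev offline.  Fails with EBUSY if the device holds
 * the only valid copy of some data or if offlining it would leave its
 * (non-log) top-level vdev unusable.  Offlining a log device additionally
 * requires syncing out the current txg so that zil_sync() can remove the
 * log's blocks.
 */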
1928 int
1929 vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
1930 {
1931 	vdev_t *vd, *tvd;
1932 	int error;
1933 
1934 	spa_vdev_state_enter(spa);
1935 
1936 	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
1937 		return (spa_vdev_state_exit(spa, NULL, ENODEV));
1938 
1939 	if (!vd->vdev_ops->vdev_op_leaf)
1940 		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
1941 
1942 	tvd = vd->vdev_top;
1943 
1944 	/*
1945 	 * If the device isn't already offline, try to offline it.
1946 	 */
1947 	if (!vd->vdev_offline) {
1948 		/*
1949 		 * If this device has the only valid copy of some data,
1950 		 * don't allow it to be offlined. Log devices are always
1951 		 * expendable.
1952 		 */
1953 		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
1954 		    vdev_dtl_required(vd))
1955 			return (spa_vdev_state_exit(spa, NULL, EBUSY));
1956 
1957 		/*
1958 		 * Offline this device and reopen its top-level vdev.
1959 		 * If the top-level vdev is a log device then just offline
1960 		 * it. Otherwise, if this action results in the top-level
1961 		 * vdev becoming unusable, undo it and fail the request.
1962 		 */
1963 		vd->vdev_offline = B_TRUE;
1964 		vdev_reopen(tvd);
1965 
1966 		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
1967 		    vdev_is_dead(tvd)) {
1968 			vd->vdev_offline = B_FALSE;
1969 			vdev_reopen(tvd);
1970 			return (spa_vdev_state_exit(spa, NULL, EBUSY));
1971 		}
1972 	}
1973 
1974 	vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);
1975 
1976 	if (!tvd->vdev_islog || !vdev_is_dead(tvd))
1977 		return (spa_vdev_state_exit(spa, vd, 0));
1978 
1979 	(void) spa_vdev_state_exit(spa, vd, 0);
1980 
1981 	error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
1982 	    NULL, DS_FIND_CHILDREN);
1983 	if (error) {
1984 		(void) vdev_online(spa, guid, 0, NULL);
1985 		return (error);
1986 	}
1987 	/*
1988 	 * If we successfully offlined the log device then we need to
1989 	 * sync out the current txg so that the "stubby" block can be
1990 	 * removed by zil_sync().
1991 	 */
1992 	txg_wait_synced(spa->spa_dsl_pool, 0);
1993 	return (0);
1994 }
1995 
1996 /*
1997  * Clear the error counts associated with this vdev.  Unlike vdev_online() and
1998  * vdev_offline(), we assume the spa config is locked.  We also clear all
1999  * children.  If 'vd' is NULL, then the user wants to clear all vdevs.
2000  */
2001 void
2002 vdev_clear(spa_t *spa, vdev_t *vd)
2003 {
2004 	vdev_t *rvd = spa->spa_root_vdev;
2005 
2006 	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
2007 
2008 	if (vd == NULL)
2009 		vd = rvd;
2010 
2011 	vd->vdev_stat.vs_read_errors = 0;
2012 	vd->vdev_stat.vs_write_errors = 0;
2013 	vd->vdev_stat.vs_checksum_errors = 0;
2014 
2015 	for (int c = 0; c < vd->vdev_children; c++)
2016 		vdev_clear(spa, vd->vdev_child[c]);
2017 
2018 	/*
2019 	 * If we're in the FAULTED state or have experienced failed I/O, then
2020 	 * clear the persistent state and attempt to reopen the device.  We
2021 	 * also mark the top-level vdev's state dirty, so that the new state
2022 	 * is written out to disk.
2023 	 */
2024 	if (vd->vdev_faulted || vd->vdev_degraded ||
2025 	    !vdev_readable(vd) || !vdev_writeable(vd)) {
2026 
2027 		vd->vdev_faulted = vd->vdev_degraded = 0;
2028 		vd->vdev_cant_read = B_FALSE;
2029 		vd->vdev_cant_write = B_FALSE;
2030 
2031 		vdev_reopen(vd);
2032 
2033 		if (vd != rvd)
2034 			vdev_state_dirty(vd->vdev_top);
2035 
2036 		if (vd->vdev_aux == NULL && !vdev_is_dead(vd))
2037 			spa_async_request(spa, SPA_ASYNC_RESILVER);
2038 
2039 		spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR);
2040 	}
2041 }
2042 
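/*
 * Simple vdev state predicates.  A vdev is considered dead if its state is
 * below VDEV_STATE_DEGRADED; readability and writeability additionally
 * depend on the vdev_cant_read and vdev_cant_write flags.
 */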
2043 boolean_t
2044 vdev_is_dead(vdev_t *vd)
2045 {
2046 	return (vd->vdev_state < VDEV_STATE_DEGRADED);
2047 }
2048 
2049 boolean_t
2050 vdev_readable(vdev_t *vd)
2051 {
2052 	return (!vdev_is_dead(vd) && !vd->vdev_cant_read);
2053 }
2054 
2055 boolean_t
2056 vdev_writeable(vdev_t *vd)
2057 {
2058 	return (!vdev_is_dead(vd) && !vd->vdev_cant_write);
2059 }
2060 
2061 boolean_t
2062 vdev_allocatable(vdev_t *vd)
2063 {
2064 	uint64_t state = vd->vdev_state;
2065 
2066 	/*
2067 	 * We currently allow allocations from vdevs which may be in the
2068 	 * process of reopening (i.e. VDEV_STATE_CLOSED). If the device
2069 	 * fails to reopen then we'll catch it later when we're holding
2070 	 * the proper locks.  Note that we have to get the vdev state
2071 	 * in a local variable because although it changes atomically,
2072 	 * we're asking two separate questions about it.
2073 	 */
2074 	return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
2075 	    !vd->vdev_cant_write);
2076 }
2077 
2078 boolean_t
2079 vdev_accessible(vdev_t *vd, zio_t *zio)
2080 {
2081 	ASSERT(zio->io_vd == vd);
2082 
2083 	if (vdev_is_dead(vd) || vd->vdev_remove_wanted)
2084 		return (B_FALSE);
2085 
2086 	if (zio->io_type == ZIO_TYPE_READ)
2087 		return (!vd->vdev_cant_read);
2088 
2089 	if (zio->io_type == ZIO_TYPE_WRITE)
2090 		return (!vd->vdev_cant_write);
2091 
2092 	return (B_TRUE);
2093 }
2094 
2095 /*
2096  * Get statistics for the given vdev.
2097  */
2098 void
2099 vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
2100 {
2101 	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
2102 
2103 	mutex_enter(&vd->vdev_stat_lock);
2104 	bcopy(&vd->vdev_stat, vs, sizeof (*vs));
2105 	vs->vs_scrub_errors = vd->vdev_spa->spa_scrub_errors;
2106 	vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
2107 	vs->vs_state = vd->vdev_state;
2108 	vs->vs_rsize = vdev_get_rsize(vd);
2109 	mutex_exit(&vd->vdev_stat_lock);
2110 
2111 	/*
2112 	 * If we're getting stats on the root vdev, aggregate the I/O counts
2113 	 * over all top-level vdevs (i.e. the direct children of the root).
2114 	 */
2115 	if (vd == rvd) {
2116 		for (int c = 0; c < rvd->vdev_children; c++) {
2117 			vdev_t *cvd = rvd->vdev_child[c];
2118 			vdev_stat_t *cvs = &cvd->vdev_stat;
2119 
2120 			mutex_enter(&vd->vdev_stat_lock);
2121 			for (int t = 0; t < ZIO_TYPES; t++) {
2122 				vs->vs_ops[t] += cvs->vs_ops[t];
2123 				vs->vs_bytes[t] += cvs->vs_bytes[t];
2124 			}
2125 			vs->vs_scrub_examined += cvs->vs_scrub_examined;
2126 			mutex_exit(&vd->vdev_stat_lock);
2127 		}
2128 	}
2129 }
2130 
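/*
 * Zero out the in-core space accounting (vs_space, vs_dspace, vs_alloc)
 * for this vdev.
 */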
2131 void
2132 vdev_clear_stats(vdev_t *vd)
2133 {
2134 	mutex_enter(&vd->vdev_stat_lock);
2135 	vd->vdev_stat.vs_space = 0;
2136 	vd->vdev_stat.vs_dspace = 0;
2137 	vd->vdev_stat.vs_alloc = 0;
2138 	mutex_exit(&vd->vdev_stat_lock);
2139 }
2140 
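/*
 * Update this vdev's I/O statistics in response to a completed zio.
 * Successful I/Os are credited to the ops/bytes counters (and to the scrub
 * repair and self-heal counters where appropriate); failed I/Os bump the
 * read, write, or checksum error counters and, for writes, dirty the DTLs.
 */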
2141 void
2142 vdev_stat_update(zio_t *zio, uint64_t psize)
2143 {
2144 	spa_t *spa = zio->io_spa;
2145 	vdev_t *rvd = spa->spa_root_vdev;
2146 	vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
2147 	vdev_t *pvd;
2148 	uint64_t txg = zio->io_txg;
2149 	vdev_stat_t *vs = &vd->vdev_stat;
2150 	zio_type_t type = zio->io_type;
2151 	int flags = zio->io_flags;
2152 
2153 	/*
2154 	 * If this i/o is a gang leader, it didn't do any actual work.
2155 	 */
2156 	if (zio->io_gang_tree)
2157 		return;
2158 
2159 	if (zio->io_error == 0) {
2160 		/*
2161 		 * If this is a root i/o, don't count it -- we've already
2162 		 * counted the top-level vdevs, and vdev_get_stats() will
2163 		 * aggregate them when asked.  This reduces contention on
2164 		 * the root vdev_stat_lock and implicitly handles blocks
2165 		 * that compress away to holes, for which there is no i/o.
2166 		 * (Holes never create vdev children, so all the counters
2167 		 * remain zero, which is what we want.)
2168 		 *
2169 		 * Note: this only applies to successful i/o (io_error == 0)
2170 		 * because unlike i/o counts, errors are not additive.
2171 		 * When reading a ditto block, for example, failure of
2172 		 * one top-level vdev does not imply a root-level error.
2173 		 */
2174 		if (vd == rvd)
2175 			return;
2176 
2177 		ASSERT(vd == zio->io_vd);
2178 
2179 		if (flags & ZIO_FLAG_IO_BYPASS)
2180 			return;
2181 
2182 		mutex_enter(&vd->vdev_stat_lock);
2183 
2184 		if (flags & ZIO_FLAG_IO_REPAIR) {
2185 			if (flags & ZIO_FLAG_SCRUB_THREAD)
2186 				vs->vs_scrub_repaired += psize;
2187 			if (flags & ZIO_FLAG_SELF_HEAL)
2188 				vs->vs_self_healed += psize;
2189 		}
2190 
2191 		vs->vs_ops[type]++;
2192 		vs->vs_bytes[type] += psize;
2193 
2194 		mutex_exit(&vd->vdev_stat_lock);
2195 		return;
2196 	}
2197 
2198 	if (flags & ZIO_FLAG_SPECULATIVE)
2199 		return;
2200 
2201 	/*
2202 	 * If this is an I/O error that is going to be retried, then ignore the
2203 	 * error.  Otherwise, the user may interpret B_FAILFAST I/O errors as
2204 	 * hard errors, when in reality they can happen for any number of
2205 	 * innocuous reasons (bus resets, MPxIO link failure, etc).
2206 	 */
2207 	if (zio->io_error == EIO &&
2208 	    !(zio->io_flags & ZIO_FLAG_IO_RETRY))
2209 		return;
2210 
2211 	mutex_enter(&vd->vdev_stat_lock);
2212 	if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
2213 		if (zio->io_error == ECKSUM)
2214 			vs->vs_checksum_errors++;
2215 		else
2216 			vs->vs_read_errors++;
2217 	}
2218 	if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd))
2219 		vs->vs_write_errors++;
2220 	mutex_exit(&vd->vdev_stat_lock);
2221 
2222 	if (type == ZIO_TYPE_WRITE && txg != 0 &&
2223 	    (!(flags & ZIO_FLAG_IO_REPAIR) ||
2224 	    (flags & ZIO_FLAG_SCRUB_THREAD))) {
2225 		/*
2226 		 * This is either a normal write (not a repair), or it's a
2227 		 * repair induced by the scrub thread.  In the normal case,
2228 		 * we commit the DTL change in the same txg as the block
2229 		 * was born.  In the scrub-induced repair case, we know that
2230 		 * scrubs run in first-pass syncing context, so we commit
2231 		 * the DTL change in spa->spa_syncing_txg.
2232 		 *
2233 		 * We currently do not make DTL entries for failed spontaneous
2234 		 * self-healing writes triggered by normal (non-scrubbing)
2235 		 * reads, because we have no transactional context in which to
2236 		 * do so -- and it's not clear that it'd be desirable anyway.
2237 		 */
2238 		if (vd->vdev_ops->vdev_op_leaf) {
2239 			uint64_t commit_txg = txg;
2240 			if (flags & ZIO_FLAG_SCRUB_THREAD) {
2241 				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
2242 				ASSERT(spa_sync_pass(spa) == 1);
2243 				vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
2244 				commit_txg = spa->spa_syncing_txg;
2245 			}
2246 			ASSERT(commit_txg >= spa->spa_syncing_txg);
2247 			if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
2248 				return;
2249 			for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
2250 				vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
2251 			vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
2252 		}
2253 		if (vd != rvd)
2254 			vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
2255 	}
2256 }
2257 
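/*
 * Record the start or completion of a scrub/resilver in the per-vdev stats,
 * recursing over all children.  POOL_SCRUB_NONE marks completion; any other
 * type resets the counters and records a new start time.
 */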
2258 void
2259 vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
2260 {
2261 	int c;
2262 	vdev_stat_t *vs = &vd->vdev_stat;
2263 
2264 	for (c = 0; c < vd->vdev_children; c++)
2265 		vdev_scrub_stat_update(vd->vdev_child[c], type, complete);
2266 
2267 	mutex_enter(&vd->vdev_stat_lock);
2268 
2269 	if (type == POOL_SCRUB_NONE) {
2270 		/*
2271 		 * Update completion and end time.  Leave everything else alone
2272 		 * so we can report what happened during the previous scrub.
2273 		 */
2274 		vs->vs_scrub_complete = complete;
2275 		vs->vs_scrub_end = gethrestime_sec();
2276 	} else {
2277 		vs->vs_scrub_type = type;
2278 		vs->vs_scrub_complete = 0;
2279 		vs->vs_scrub_examined = 0;
2280 		vs->vs_scrub_repaired = 0;
2281 		vs->vs_scrub_start = gethrestime_sec();
2282 		vs->vs_scrub_end = 0;
2283 	}
2284 
2285 	mutex_exit(&vd->vdev_stat_lock);
2286 }
2287 
2288 /*
2289  * Update the in-core space usage stats for this vdev and the root vdev.
2290  */
2291 void
2292 vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta,
2293     boolean_t update_root)
2294 {
2295 	int64_t dspace_delta = space_delta;
2296 	spa_t *spa = vd->vdev_spa;
2297 	vdev_t *rvd = spa->spa_root_vdev;
2298 
2299 	ASSERT(vd == vd->vdev_top);
2300 
2301 	/*
2302 	 * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion
2303 	 * factor.  We must calculate this here and not at the root vdev
2304 	 * because the root vdev's psize-to-asize is simply the max of its
2305 	 * children's, thus not accurate enough for us.
2306 	 */
2307 	ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
2308 	ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
2309 	dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
2310 	    vd->vdev_deflate_ratio;
2311 
2312 	mutex_enter(&vd->vdev_stat_lock);
2313 	vd->vdev_stat.vs_space += space_delta;
2314 	vd->vdev_stat.vs_alloc += alloc_delta;
2315 	vd->vdev_stat.vs_dspace += dspace_delta;
2316 	mutex_exit(&vd->vdev_stat_lock);
2317 
2318 	if (update_root) {
2319 		ASSERT(rvd == vd->vdev_parent);
2320 		ASSERT(vd->vdev_ms_count != 0);
2321 
2322 		/*
2323 		 * Don't count non-normal (e.g. intent log) space as part of
2324 		 * the pool's capacity.
2325 		 */
2326 		if (vd->vdev_mg->mg_class != spa->spa_normal_class)
2327 			return;
2328 
2329 		mutex_enter(&rvd->vdev_stat_lock);
2330 		rvd->vdev_stat.vs_space += space_delta;
2331 		rvd->vdev_stat.vs_alloc += alloc_delta;
2332 		rvd->vdev_stat.vs_dspace += dspace_delta;
2333 		mutex_exit(&rvd->vdev_stat_lock);
2334 	}
2335 }
2336 
2337 /*
2338  * Mark a top-level vdev's config as dirty, placing it on the dirty list
2339  * so that it will be written out next time the vdev configuration is synced.
2340  * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
2341  */
2342 void
2343 vdev_config_dirty(vdev_t *vd)
2344 {
2345 	spa_t *spa = vd->vdev_spa;
2346 	vdev_t *rvd = spa->spa_root_vdev;
2347 	int c;
2348 
2349 	/*
2350 	 * If this is an aux vdev (as with l2cache and spare devices), then we
2351 	 * update the vdev config manually and set the sync flag.
2352 	 */
2353 	if (vd->vdev_aux != NULL) {
2354 		spa_aux_vdev_t *sav = vd->vdev_aux;
2355 		nvlist_t **aux;
2356 		uint_t naux;
2357 
2358 		for (c = 0; c < sav->sav_count; c++) {
2359 			if (sav->sav_vdevs[c] == vd)
2360 				break;
2361 		}
2362 
2363 		if (c == sav->sav_count) {
2364 			/*
2365 			 * We're being removed.  There's nothing more to do.
2366 			 */
2367 			ASSERT(sav->sav_sync == B_TRUE);
2368 			return;
2369 		}
2370 
2371 		sav->sav_sync = B_TRUE;
2372 
2373 		if (nvlist_lookup_nvlist_array(sav->sav_config,
2374 		    ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
2375 			VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
2376 			    ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
2377 		}
2378 
2379 		ASSERT(c < naux);
2380 
2381 		/*
2382 		 * Setting the nvlist in the middle of the array is a little
2383 		 * sketchy, but it will work.
2384 		 */
2385 		nvlist_free(aux[c]);
2386 		aux[c] = vdev_config_generate(spa, vd, B_TRUE, B_FALSE, B_TRUE);
2387 
2388 		return;
2389 	}
2390 
2391 	/*
2392 	 * The dirty list is protected by the SCL_CONFIG lock.  The caller
2393 	 * must either hold SCL_CONFIG as writer, or must be the sync thread
2394 	 * (which holds SCL_CONFIG as reader).  There's only one sync thread,
2395 	 * so this is sufficient to ensure mutual exclusion.
2396 	 */
2397 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
2398 	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
2399 	    spa_config_held(spa, SCL_CONFIG, RW_READER)));
2400 
2401 	if (vd == rvd) {
2402 		for (c = 0; c < rvd->vdev_children; c++)
2403 			vdev_config_dirty(rvd->vdev_child[c]);
2404 	} else {
2405 		ASSERT(vd == vd->vdev_top);
2406 
2407 		if (!list_link_active(&vd->vdev_config_dirty_node))
2408 			list_insert_head(&spa->spa_config_dirty_list, vd);
2409 	}
2410 }
2411 
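/*
 * Undo vdev_config_dirty(): remove the top-level vdev from the dirty
 * config list once its configuration has been synced out.
 */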
2412 void
2413 vdev_config_clean(vdev_t *vd)
2414 {
2415 	spa_t *spa = vd->vdev_spa;
2416 
2417 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
2418 	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
2419 	    spa_config_held(spa, SCL_CONFIG, RW_READER)));
2420 
2421 	ASSERT(list_link_active(&vd->vdev_config_dirty_node));
2422 	list_remove(&spa->spa_config_dirty_list, vd);
2423 }
2424 
2425 /*
2426  * Mark a top-level vdev's state as dirty, so that the next pass of
2427  * spa_sync() can convert this into vdev_config_dirty().  We distinguish
2428  * the state changes from larger config changes because they require
2429  * much less locking, and are often needed for administrative actions.
2430  */
2431 void
2432 vdev_state_dirty(vdev_t *vd)
2433 {
2434 	spa_t *spa = vd->vdev_spa;
2435 
2436 	ASSERT(vd == vd->vdev_top);
2437 
2438 	/*
2439 	 * The state list is protected by the SCL_STATE lock.  The caller
2440 	 * must either hold SCL_STATE as writer, or must be the sync thread
2441 	 * (which holds SCL_STATE as reader).  There's only one sync thread,
2442 	 * so this is sufficient to ensure mutual exclusion.
2443 	 */
2444 	ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
2445 	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
2446 	    spa_config_held(spa, SCL_STATE, RW_READER)));
2447 
2448 	if (!list_link_active(&vd->vdev_state_dirty_node))
2449 		list_insert_head(&spa->spa_state_dirty_list, vd);
2450 }
2451 
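/*
 * Undo vdev_state_dirty(): remove the top-level vdev from the dirty
 * state list.
 */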
2452 void
2453 vdev_state_clean(vdev_t *vd)
2454 {
2455 	spa_t *spa = vd->vdev_spa;
2456 
2457 	ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
2458 	    (dsl_pool_sync_context(spa_get_dsl(spa)) &&
2459 	    spa_config_held(spa, SCL_STATE, RW_READER)));
2460 
2461 	ASSERT(list_link_active(&vd->vdev_state_dirty_node));
2462 	list_remove(&spa->spa_state_dirty_list, vd);
2463 }
2464 
2465 /*
2466  * Propagate vdev state up from children to parent.
2467  */
2468 void
2469 vdev_propagate_state(vdev_t *vd)
2470 {
2471 	spa_t *spa = vd->vdev_spa;
2472 	vdev_t *rvd = spa->spa_root_vdev;
2473 	int degraded = 0, faulted = 0;
2474 	int corrupted = 0;
2475 	int c;
2476 	vdev_t *child;
2477 
2478 	if (vd->vdev_children > 0) {
2479 		for (c = 0; c < vd->vdev_children; c++) {
2480 			child = vd->vdev_child[c];
2481 
2482 			if (!vdev_readable(child) ||
2483 			    (!vdev_writeable(child) && spa_writeable(spa))) {
2484 				/*
2485 				 * Root special: if there is a top-level log
2486 				 * device, treat the root vdev as if it were
2487 				 * degraded.
2488 				 */
2489 				if (child->vdev_islog && vd == rvd)
2490 					degraded++;
2491 				else
2492 					faulted++;
2493 			} else if (child->vdev_state <= VDEV_STATE_DEGRADED) {
2494 				degraded++;
2495 			}
2496 
2497 			if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
2498 				corrupted++;
2499 		}
2500 
2501 		vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);
2502 
2503 		/*
2504 		 * Root special: if there is a top-level vdev that cannot be
2505 		 * opened due to corrupted metadata, then propagate the root
2506 		 * vdev's aux state as 'corrupt' rather than 'insufficient
2507 		 * replicas'.
2508 		 */
2509 		if (corrupted && vd == rvd &&
2510 		    rvd->vdev_state == VDEV_STATE_CANT_OPEN)
2511 			vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
2512 			    VDEV_AUX_CORRUPT_DATA);
2513 	}
2514 
2515 	if (vd->vdev_parent)
2516 		vdev_propagate_state(vd->vdev_parent);
2517 }
2518 
2519 /*
2520  * Set a vdev's state.  If this is during an open, we don't update the parent
2521  * state, because we're in the process of opening children depth-first.
2522  * Otherwise, we propagate the change to the parent.
2523  *
2524  * If this routine places a device in a faulted state, an appropriate ereport is
2525  * generated.
2526  */
2527 void
2528 vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
2529 {
2530 	uint64_t save_state;
2531 	spa_t *spa = vd->vdev_spa;
2532 
2533 	if (state == vd->vdev_state) {
2534 		vd->vdev_stat.vs_aux = aux;
2535 		return;
2536 	}
2537 
2538 	save_state = vd->vdev_state;
2539 
2540 	vd->vdev_state = state;
2541 	vd->vdev_stat.vs_aux = aux;
2542 
2543 	/*
2544 	 * If we are setting the vdev state to anything but an open state, then
2545 	 * always close the underlying device.  Otherwise, we keep accessible
2546 	 * but invalid devices open forever.  We don't call vdev_close() itself,
2547 	 * because that implies some extra checks (offline, etc) that we don't
2548 	 * want here.  This is limited to leaf devices, because otherwise
2549 	 * closing the device will affect other children.
2550 	 */
2551 	if (vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf)
2552 		vd->vdev_ops->vdev_op_close(vd);
2553 
2554 	if (vd->vdev_removed &&
2555 	    state == VDEV_STATE_CANT_OPEN &&
2556 	    (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
2557 		/*
2558 		 * If the previous state is set to VDEV_STATE_REMOVED, then this
2559 		 * device was previously marked removed and someone attempted to
2560 		 * reopen it.  If this failed due to a nonexistent device, then
2561 		 * keep the device in the REMOVED state.  We do the same if this
2562 		 * is one of our special test online cases, which is only
2563 		 * attempting to online the device and shouldn't generate an FMA
2564 		 * fault.
2565 		 */
2566 		vd->vdev_state = VDEV_STATE_REMOVED;
2567 		vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
2568 	} else if (state == VDEV_STATE_REMOVED) {
2569 		/*
2570 		 * Indicate to the ZFS DE that this device has been removed, and
2571 		 * any recent errors should be ignored.
2572 		 */
2573 		zfs_post_remove(spa, vd);
2574 		vd->vdev_removed = B_TRUE;
2575 	} else if (state == VDEV_STATE_CANT_OPEN) {
2576 		/*
2577 		 * If we fail to open a vdev during an import, we mark it as
2578 		 * "not available", which signifies that it was never there to
2579 		 * begin with.  Failure to open such a device is not considered
2580 		 * an error.
2581 		 */
2582 		if (spa->spa_load_state == SPA_LOAD_IMPORT &&
2583 		    vd->vdev_ops->vdev_op_leaf)
2584 			vd->vdev_not_present = 1;
2585 
2586 		/*
2587 		 * Post the appropriate ereport.  If the 'prevstate' field is
2588 		 * set to something other than VDEV_STATE_UNKNOWN, it indicates
2589 		 * that this is part of a vdev_reopen().  In this case, we don't
2590 		 * want to post the ereport if the device was already in the
2591 		 * CANT_OPEN state beforehand.
2592 		 *
2593 		 * If the 'checkremove' flag is set, then this is an attempt to
2594 		 * online the device in response to an insertion event.  If we
2595 		 * hit this case, then we have detected an insertion event for a
2596 		 * faulted or offline device that wasn't in the removed state.
2597 		 * In this scenario, we don't post an ereport because we are
2598 		 * about to replace the device, or attempt an online with
2599 		 * vdev_forcefault, which will generate the fault for us.
2600 		 */
2601 		if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
2602 		    !vd->vdev_not_present && !vd->vdev_checkremove &&
2603 		    vd != spa->spa_root_vdev) {
2604 			const char *class;
2605 
2606 			switch (aux) {
2607 			case VDEV_AUX_OPEN_FAILED:
2608 				class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
2609 				break;
2610 			case VDEV_AUX_CORRUPT_DATA:
2611 				class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
2612 				break;
2613 			case VDEV_AUX_NO_REPLICAS:
2614 				class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
2615 				break;
2616 			case VDEV_AUX_BAD_GUID_SUM:
2617 				class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
2618 				break;
2619 			case VDEV_AUX_TOO_SMALL:
2620 				class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
2621 				break;
2622 			case VDEV_AUX_BAD_LABEL:
2623 				class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
2624 				break;
2625 			case VDEV_AUX_IO_FAILURE:
2626 				class = FM_EREPORT_ZFS_IO_FAILURE;
2627 				break;
2628 			default:
2629 				class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
2630 			}
2631 
2632 			zfs_ereport_post(class, spa, vd, NULL, save_state, 0);
2633 		}
2634 
2635 		/* Erase any notion of persistent removed state */
2636 		vd->vdev_removed = B_FALSE;
2637 	} else {
2638 		vd->vdev_removed = B_FALSE;
2639 	}
2640 
2641 	if (!isopen && vd->vdev_parent)
2642 		vdev_propagate_state(vd->vdev_parent);
2643 }
2644 
2645 /*
2646  * Check the vdev configuration to ensure that it's capable of supporting
2647  * a root pool.  Currently, we do not support RAID-Z or partial configurations.
2648  * In addition, only a single top-level vdev is allowed and none of the leaves
2649  * can be wholedisks.
2650  */
2651 boolean_t
2652 vdev_is_bootable(vdev_t *vd)
2653 {
2654 	int c;
2655 
2656 	if (!vd->vdev_ops->vdev_op_leaf) {
2657 		char *vdev_type = vd->vdev_ops->vdev_op_type;
2658 
2659 		if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 &&
2660 		    vd->vdev_children > 1) {
2661 			return (B_FALSE);
2662 		} else if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 ||
2663 		    strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) {
2664 			return (B_FALSE);
2665 		}
2666 	} else if (vd->vdev_wholedisk == 1) {
2667 		return (B_FALSE);
2668 	}
2669 
2670 	for (c = 0; c < vd->vdev_children; c++) {
2671 		if (!vdev_is_bootable(vd->vdev_child[c]))
2672 			return (B_FALSE);
2673 	}
2674 	return (B_TRUE);
2675 }
2676 
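/*
 * Recursively restore the persistent 'offline' state of a vdev tree from
 * the given config nvlist.  This is used for log devices during pool load,
 * before the txg threads are running, so we can't go through vdev_offline().
 */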
2677 void
2678 vdev_load_log_state(vdev_t *vd, nvlist_t *nv)
2679 {
2680 	uint_t c, children;
2681 	nvlist_t **child;
2682 	uint64_t val;
2683 	spa_t *spa = vd->vdev_spa;
2684 
2685 	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
2686 	    &child, &children) == 0) {
2687 		for (c = 0; c < children; c++)
2688 			vdev_load_log_state(vd->vdev_child[c], child[c]);
2689 	}
2690 
2691 	if (vd->vdev_ops->vdev_op_leaf && nvlist_lookup_uint64(nv,
2692 	    ZPOOL_CONFIG_OFFLINE, &val) == 0 && val) {
2693 
2694 		/*
2695 		 * It would be nice to call vdev_offline()
2696 		 * directly but the pool isn't fully loaded and
2697 		 * the txg threads have not been started yet.
2698 		 */
2699 		spa_config_enter(spa, SCL_STATE_ALL, FTAG, RW_WRITER);
2700 		vd->vdev_offline = val;
2701 		vdev_reopen(vd->vdev_top);
2702 		spa_config_exit(spa, SCL_STATE_ALL, FTAG);
2703 	}
2704 }
2705