xref: /titanic_51/usr/src/uts/common/fs/zfs/vdev.c (revision 12ef07e9fce5f59ad851a4e70cb765f3c56c11a5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/zfs_context.h>
29 #include <sys/spa.h>
30 #include <sys/spa_impl.h>
31 #include <sys/dmu.h>
32 #include <sys/dmu_tx.h>
33 #include <sys/vdev_impl.h>
34 #include <sys/uberblock_impl.h>
35 #include <sys/metaslab.h>
36 #include <sys/metaslab_impl.h>
37 #include <sys/space_map.h>
38 #include <sys/zio.h>
39 #include <sys/zap.h>
40 #include <sys/fs/zfs.h>
41 
42 /*
43  * Virtual device management.
44  */
45 
/*
 * Table of the vdev ops vectors known to this file.  The list is
 * terminated by a NULL entry; vdev_getops() walks it to map a type
 * string to the matching ops vector.
 */
static vdev_ops_t *vdev_ops_table[] = {
	&vdev_root_ops,
	&vdev_raidz_ops,
	&vdev_mirror_ops,
	&vdev_replacing_ops,
	&vdev_disk_ops,
	&vdev_file_ops,
	&vdev_missing_ops,
	NULL
};
56 
57 /*
58  * Given a vdev type, return the appropriate ops vector.
59  */
60 static vdev_ops_t *
61 vdev_getops(const char *type)
62 {
63 	vdev_ops_t *ops, **opspp;
64 
65 	for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
66 		if (strcmp(ops->vdev_op_type, type) == 0)
67 			break;
68 
69 	return (ops);
70 }
71 
72 /*
73  * Default asize function: return the MAX of psize with the asize of
74  * all children.  This is what's used by anything other than RAID-Z.
75  */
76 uint64_t
77 vdev_default_asize(vdev_t *vd, uint64_t psize)
78 {
79 	uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_ashift);
80 	uint64_t csize;
81 	uint64_t c;
82 
83 	for (c = 0; c < vd->vdev_children; c++) {
84 		csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
85 		asize = MAX(asize, csize);
86 	}
87 
88 	return (asize);
89 }
90 
91 /*
92  * Get the replaceable or attachable device size.
93  * If the parent is a mirror or raidz, the replaceable size is the minimum
94  * psize of all its children. For the rest, just return our own psize.
95  *
96  * e.g.
97  *			psize	rsize
98  * root			-	-
99  *	mirror/raidz	-	-
100  *	    disk1	20g	20g
101  *	    disk2 	40g	20g
102  *	disk3 		80g	80g
103  */
104 uint64_t
105 vdev_get_rsize(vdev_t *vd)
106 {
107 	vdev_t *pvd, *cvd;
108 	uint64_t c, rsize;
109 
110 	pvd = vd->vdev_parent;
111 
112 	/*
113 	 * If our parent is NULL or the root, just return our own psize.
114 	 */
115 	if (pvd == NULL || pvd->vdev_parent == NULL)
116 		return (vd->vdev_psize);
117 
118 	rsize = 0;
119 
120 	for (c = 0; c < pvd->vdev_children; c++) {
121 		cvd = pvd->vdev_child[c];
122 		rsize = MIN(rsize - 1, cvd->vdev_psize - 1) + 1;
123 	}
124 
125 	return (rsize);
126 }
127 
128 vdev_t *
129 vdev_lookup_top(spa_t *spa, uint64_t vdev)
130 {
131 	vdev_t *rvd = spa->spa_root_vdev;
132 
133 	if (vdev < rvd->vdev_children)
134 		return (rvd->vdev_child[vdev]);
135 
136 	return (NULL);
137 }
138 
139 vdev_t *
140 vdev_lookup_by_path(vdev_t *vd, const char *path)
141 {
142 	int c;
143 	vdev_t *mvd;
144 
145 	if (vd->vdev_path != NULL) {
146 		if (vd->vdev_wholedisk == 1) {
147 			/*
148 			 * For whole disks, the internal path has 's0', but the
149 			 * path passed in by the user doesn't.
150 			 */
151 			if (strlen(path) == strlen(vd->vdev_path) - 2 &&
152 			    strncmp(path, vd->vdev_path, strlen(path)) == 0)
153 				return (vd);
154 		} else if (strcmp(path, vd->vdev_path) == 0) {
155 			return (vd);
156 		}
157 	}
158 
159 	for (c = 0; c < vd->vdev_children; c++)
160 		if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) !=
161 		    NULL)
162 			return (mvd);
163 
164 	return (NULL);
165 }
166 
167 vdev_t *
168 vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
169 {
170 	int c;
171 	vdev_t *mvd;
172 
173 	if (vd->vdev_children == 0 && vd->vdev_guid == guid)
174 		return (vd);
175 
176 	for (c = 0; c < vd->vdev_children; c++)
177 		if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
178 		    NULL)
179 			return (mvd);
180 
181 	return (NULL);
182 }
183 
184 void
185 vdev_add_child(vdev_t *pvd, vdev_t *cvd)
186 {
187 	size_t oldsize, newsize;
188 	uint64_t id = cvd->vdev_id;
189 	vdev_t **newchild;
190 
191 	ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));
192 	ASSERT(cvd->vdev_parent == NULL);
193 
194 	cvd->vdev_parent = pvd;
195 
196 	if (pvd == NULL)
197 		return;
198 
199 	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
200 
201 	oldsize = pvd->vdev_children * sizeof (vdev_t *);
202 	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
203 	newsize = pvd->vdev_children * sizeof (vdev_t *);
204 
205 	newchild = kmem_zalloc(newsize, KM_SLEEP);
206 	if (pvd->vdev_child != NULL) {
207 		bcopy(pvd->vdev_child, newchild, oldsize);
208 		kmem_free(pvd->vdev_child, oldsize);
209 	}
210 
211 	pvd->vdev_child = newchild;
212 	pvd->vdev_child[id] = cvd;
213 
214 	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
215 	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
216 
217 	/*
218 	 * Walk up all ancestors to update guid sum.
219 	 */
220 	for (; pvd != NULL; pvd = pvd->vdev_parent)
221 		pvd->vdev_guid_sum += cvd->vdev_guid_sum;
222 }
223 
224 void
225 vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
226 {
227 	int c;
228 	uint_t id = cvd->vdev_id;
229 
230 	ASSERT(cvd->vdev_parent == pvd);
231 
232 	if (pvd == NULL)
233 		return;
234 
235 	ASSERT(id < pvd->vdev_children);
236 	ASSERT(pvd->vdev_child[id] == cvd);
237 
238 	pvd->vdev_child[id] = NULL;
239 	cvd->vdev_parent = NULL;
240 
241 	for (c = 0; c < pvd->vdev_children; c++)
242 		if (pvd->vdev_child[c])
243 			break;
244 
245 	if (c == pvd->vdev_children) {
246 		kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
247 		pvd->vdev_child = NULL;
248 		pvd->vdev_children = 0;
249 	}
250 
251 	/*
252 	 * Walk up all ancestors to update guid sum.
253 	 */
254 	for (; pvd != NULL; pvd = pvd->vdev_parent)
255 		pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
256 }
257 
258 /*
259  * Remove any holes in the child array.
260  */
261 void
262 vdev_compact_children(vdev_t *pvd)
263 {
264 	vdev_t **newchild, *cvd;
265 	int oldc = pvd->vdev_children;
266 	int newc, c;
267 
268 	ASSERT(spa_config_held(pvd->vdev_spa, RW_WRITER));
269 
270 	for (c = newc = 0; c < oldc; c++)
271 		if (pvd->vdev_child[c])
272 			newc++;
273 
274 	newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);
275 
276 	for (c = newc = 0; c < oldc; c++) {
277 		if ((cvd = pvd->vdev_child[c]) != NULL) {
278 			newchild[newc] = cvd;
279 			cvd->vdev_id = newc++;
280 		}
281 	}
282 
283 	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
284 	pvd->vdev_child = newchild;
285 	pvd->vdev_children = newc;
286 }
287 
288 /*
289  * Allocate and minimally initialize a vdev_t.
290  */
291 static vdev_t *
292 vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
293 {
294 	vdev_t *vd;
295 
296 	while (guid == 0)
297 		guid = spa_get_random(-1ULL);
298 
299 	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
300 
301 	vd->vdev_spa = spa;
302 	vd->vdev_id = id;
303 	vd->vdev_guid = guid;
304 	vd->vdev_guid_sum = guid;
305 	vd->vdev_ops = ops;
306 	vd->vdev_state = VDEV_STATE_CLOSED;
307 
308 	mutex_init(&vd->vdev_io_lock, NULL, MUTEX_DEFAULT, NULL);
309 	cv_init(&vd->vdev_io_cv, NULL, CV_DEFAULT, NULL);
310 	list_create(&vd->vdev_io_pending, sizeof (zio_t),
311 	    offsetof(zio_t, io_pending));
312 	mutex_init(&vd->vdev_dirty_lock, NULL, MUTEX_DEFAULT, NULL);
313 	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
314 	space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock);
315 	space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock);
316 	txg_list_create(&vd->vdev_ms_list,
317 	    offsetof(struct metaslab, ms_txg_node));
318 	txg_list_create(&vd->vdev_dtl_list,
319 	    offsetof(struct vdev, vdev_dtl_node));
320 	vd->vdev_stat.vs_timestamp = gethrtime();
321 
322 	return (vd);
323 }
324 
/*
 * Free a vdev_t that has been removed from service.
 *
 * Releases the path and devid strings (if set), then tears down the
 * per-txg lists, DTL space maps, locks, pending-I/O list and condition
 * variable that vdev_alloc_common() created, and finally frees the
 * vdev_t itself.
 */
static void
vdev_free_common(vdev_t *vd)
{
	if (vd->vdev_path)
		spa_strfree(vd->vdev_path);
	if (vd->vdev_devid)
		spa_strfree(vd->vdev_devid);

	txg_list_destroy(&vd->vdev_ms_list);
	txg_list_destroy(&vd->vdev_dtl_list);
	/*
	 * Both DTL maps were created with vdev_dtl_lock as their lock, so
	 * they are emptied and destroyed with it held; the lock itself is
	 * only destroyed after it has been dropped.
	 */
	mutex_enter(&vd->vdev_dtl_lock);
	space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
	space_map_destroy(&vd->vdev_dtl_map);
	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
	space_map_destroy(&vd->vdev_dtl_scrub);
	mutex_exit(&vd->vdev_dtl_lock);
	mutex_destroy(&vd->vdev_dtl_lock);
	mutex_destroy(&vd->vdev_dirty_lock);
	list_destroy(&vd->vdev_io_pending);
	mutex_destroy(&vd->vdev_io_lock);
	cv_destroy(&vd->vdev_io_cv);

	kmem_free(vd, sizeof (vdev_t));
}
352 
353 /*
354  * Allocate a new vdev.  The 'alloctype' is used to control whether we are
355  * creating a new vdev or loading an existing one - the behavior is slightly
356  * different for each case.
357  */
358 vdev_t *
359 vdev_alloc(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype)
360 {
361 	vdev_ops_t *ops;
362 	char *type;
363 	uint64_t guid = 0, offline = 0;
364 	vdev_t *vd;
365 
366 	ASSERT(spa_config_held(spa, RW_WRITER));
367 
368 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
369 		return (NULL);
370 
371 	if ((ops = vdev_getops(type)) == NULL)
372 		return (NULL);
373 
374 	/*
375 	 * If this is a load, get the vdev guid from the nvlist.
376 	 * Otherwise, vdev_alloc_common() will generate one for us.
377 	 */
378 	if (alloctype == VDEV_ALLOC_LOAD) {
379 		uint64_t label_id;
380 
381 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
382 		    label_id != id)
383 			return (NULL);
384 
385 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
386 			return (NULL);
387 	}
388 
389 	vd = vdev_alloc_common(spa, id, guid, ops);
390 
391 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
392 		vd->vdev_path = spa_strdup(vd->vdev_path);
393 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
394 		vd->vdev_devid = spa_strdup(vd->vdev_devid);
395 
396 	/*
397 	 * Set the whole_disk property.  If it's not specified, leave the value
398 	 * as -1.
399 	 */
400 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
401 	    &vd->vdev_wholedisk) != 0)
402 		vd->vdev_wholedisk = -1ULL;
403 
404 	/*
405 	 * If we're a top-level vdev, try to load the allocation parameters.
406 	 */
407 	if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) {
408 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
409 		    &vd->vdev_ms_array);
410 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
411 		    &vd->vdev_ms_shift);
412 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT,
413 		    &vd->vdev_ashift);
414 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
415 		    &vd->vdev_asize);
416 	}
417 
418 	/*
419 	 * If we're a leaf vdev, try to load the DTL object
420 	 * and the offline state.
421 	 */
422 	vd->vdev_offline = B_FALSE;
423 	if (vd->vdev_ops->vdev_op_leaf && alloctype == VDEV_ALLOC_LOAD) {
424 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
425 		    &vd->vdev_dtl.smo_object);
426 
427 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &offline)
428 		    == 0)
429 			vd->vdev_offline = offline;
430 	}
431 
432 	/*
433 	 * Add ourselves to the parent's list of children.
434 	 */
435 	vdev_add_child(parent, vd);
436 
437 	return (vd);
438 }
439 
/*
 * Close and free 'vd' together with its entire subtree, unlinking it
 * from its parent along the way.
 */
void
vdev_free(vdev_t *vd)
{
	int c;

	/*
	 * vdev_free() implies closing the vdev first.  This is simpler than
	 * trying to ensure complicated semantics for all callers.
	 */
	vdev_close(vd);

	/*
	 * It's possible to free a vdev that's been added to the dirty
	 * list when in the middle of spa_vdev_add().  Handle that case
	 * correctly here.
	 */
	if (vd->vdev_is_dirty)
		vdev_config_clean(vd);

	/*
	 * Free all children.
	 *
	 * Each recursive call ends in vdev_remove_child(), which clears the
	 * child's slot and, once the last slot is empty, frees the child
	 * array and resets vdev_children to 0.  That is what makes the two
	 * assertions below hold after the loop.
	 */
	for (c = 0; c < vd->vdev_children; c++)
		vdev_free(vd->vdev_child[c]);

	ASSERT(vd->vdev_child == NULL);
	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);

	/*
	 * Discard allocation state.  Only a top-level vdev (one that is
	 * its own vdev_top) has metaslab state to tear down here.
	 */
	if (vd == vd->vdev_top)
		vdev_metaslab_fini(vd);

	ASSERT3U(vd->vdev_stat.vs_space, ==, 0);
	ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);

	/*
	 * Remove this vdev from its parent's child list.
	 */
	vdev_remove_child(vd->vdev_parent, vd);

	ASSERT(vd->vdev_parent == NULL);

	vdev_free_common(vd);
}
486 
/*
 * Transfer top-level vdev state from svd to tvd.
 *
 * Everything is moved rather than copied: after each group of fields is
 * handed to tvd, the corresponding fields in svd are reset so that svd
 * no longer refers to the metaslab state, space accounting or per-txg
 * dirty state.  tvd must already be its own vdev_top.
 */
static void
vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
{
	spa_t *spa = svd->vdev_spa;
	metaslab_t *msp;
	vdev_t *vd;
	int t;

	ASSERT(tvd == tvd->vdev_top);

	/* Metaslab array parameters. */
	tvd->vdev_ms_array = svd->vdev_ms_array;
	tvd->vdev_ms_shift = svd->vdev_ms_shift;
	tvd->vdev_ms_count = svd->vdev_ms_count;

	svd->vdev_ms_array = 0;
	svd->vdev_ms_shift = 0;
	svd->vdev_ms_count = 0;

	/* Metaslab group (re-pointed at tvd), metaslabs and their smo's. */
	tvd->vdev_mg = svd->vdev_mg;
	tvd->vdev_mg->mg_vd = tvd;
	tvd->vdev_ms = svd->vdev_ms;
	tvd->vdev_smo = svd->vdev_smo;

	svd->vdev_mg = NULL;
	svd->vdev_ms = NULL;
	svd->vdev_smo = NULL;

	/* Space accounting. */
	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;

	svd->vdev_stat.vs_alloc = 0;
	svd->vdev_stat.vs_space = 0;

	/*
	 * For every txg slot, drain svd's metaslab and DTL lists into
	 * tvd's, replace svd with tvd on the spa-wide vdev txg list if svd
	 * was on it, and move the dirty flags across.
	 */
	for (t = 0; t < TXG_SIZE; t++) {
		while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
		while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
		tvd->vdev_dirty[t] = svd->vdev_dirty[t];
		svd->vdev_dirty[t] = 0;
	}

	/* Carry a pending config-dirty mark over to tvd. */
	if (svd->vdev_is_dirty) {
		vdev_config_clean(svd);
		vdev_config_dirty(tvd);
	}

	/* svd is expected to have no I/O queued for retry or pending. */
	ASSERT(svd->vdev_io_retry == NULL);
	ASSERT(list_is_empty(&svd->vdev_io_pending));
}
542 
543 static void
544 vdev_top_update(vdev_t *tvd, vdev_t *vd)
545 {
546 	int c;
547 
548 	if (vd == NULL)
549 		return;
550 
551 	vd->vdev_top = tvd;
552 
553 	for (c = 0; c < vd->vdev_children; c++)
554 		vdev_top_update(tvd, vd->vdev_child[c]);
555 }
556 
/*
 * Add a mirror/replacing vdev above an existing vdev.
 *
 * A new interior vdev using 'ops' is allocated and spliced into the
 * tree in cvd's place, with cvd becoming its child.  Returns the new
 * interior vdev.
 */
vdev_t *
vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
{
	spa_t *spa = cvd->vdev_spa;
	vdev_t *pvd = cvd->vdev_parent;
	vdev_t *mvd;

	ASSERT(spa_config_held(spa, RW_WRITER));

	/*
	 * The new vdev reuses cvd's id so it lands in cvd's old slot; a
	 * guid of 0 makes vdev_alloc_common() generate a fresh one.
	 */
	mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
	vdev_remove_child(pvd, cvd);
	vdev_add_child(pvd, mvd);
	/* cvd takes the next free index under mvd. */
	cvd->vdev_id = mvd->vdev_children;
	vdev_add_child(mvd, cvd);
	/* Re-stamp vdev_top throughout the subtree below cvd's top. */
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	mvd->vdev_asize = cvd->vdev_asize;
	mvd->vdev_ashift = cvd->vdev_ashift;
	mvd->vdev_state = cvd->vdev_state;

	/* If mvd is now top-level, it inherits cvd's top-level state. */
	if (mvd == mvd->vdev_top)
		vdev_top_transfer(cvd, mvd);

	return (mvd);
}
585 
/*
 * Remove a 1-way mirror/replacing vdev from the tree.
 *
 * cvd must be the only child of its parent mvd.  cvd is moved up into
 * mvd's slot under mvd's parent, and mvd is then freed.
 */
void
vdev_remove_parent(vdev_t *cvd)
{
	vdev_t *mvd = cvd->vdev_parent;
	vdev_t *pvd = mvd->vdev_parent;

	ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));

	ASSERT(mvd->vdev_children == 1);
	ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
	    mvd->vdev_ops == &vdev_replacing_ops);

	/* Unlink both levels, then re-attach cvd using mvd's old id. */
	vdev_remove_child(mvd, cvd);
	vdev_remove_child(pvd, mvd);
	cvd->vdev_id = mvd->vdev_id;
	vdev_add_child(pvd, cvd);
	/* Re-stamp vdev_top throughout the subtree below cvd's top. */
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	/* If cvd is now top-level, it inherits mvd's top-level state. */
	if (cvd == cvd->vdev_top)
		vdev_top_transfer(mvd, cvd);

	ASSERT(mvd->vdev_children == 0);
	vdev_free(mvd);
}
613 
/*
 * (Re)build the metaslab arrays of 'vd' so that they cover
 * vdev_asize >> vdev_ms_shift metaslabs.
 *
 * New zeroed vdev_smo and vdev_ms arrays are always allocated.  Entries
 * for metaslabs that already existed are carried over, the additional
 * ones are initialized with metaslab_init(), and the old arrays are
 * freed.  The metaslab count may only grow.
 *
 * If the vdev has no metaslab group yet, one is created.  In that case,
 * when txg is 0, the space map headers are first read in from the
 * objects listed in the vdev_ms_array object.
 * NOTE(review): txg == 0 presumably means "loading an existing pool";
 * confirm against the callers.
 */
void
vdev_metaslab_init(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	metaslab_class_t *mc = spa_metaslab_class_select(spa);
	uint64_t c;
	uint64_t oldc = vd->vdev_ms_count;
	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
	/* Remember the old arrays; vd's pointers are replaced below. */
	space_map_obj_t *smo = vd->vdev_smo;
	metaslab_t **mspp = vd->vdev_ms;

	dprintf("%s oldc %llu newc %llu\n", vdev_description(vd), oldc, newc);

	ASSERT(oldc <= newc);

	vd->vdev_smo = kmem_zalloc(newc * sizeof (*smo), KM_SLEEP);
	vd->vdev_ms = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
	vd->vdev_ms_count = newc;

	if (vd->vdev_mg == NULL) {
		if (txg == 0) {
			dmu_buf_t *db;
			uint64_t *ms_array;

			ms_array = kmem_zalloc(newc * sizeof (uint64_t),
			    KM_SLEEP);

			/* One object number per metaslab; 0 means none. */
			dmu_read(spa->spa_meta_objset, vd->vdev_ms_array,
			    0, newc * sizeof (uint64_t), ms_array);

			for (c = 0; c < newc; c++) {
				if (ms_array[c] == 0)
					continue;
				/*
				 * The object's bonus buffer holds the
				 * space_map_obj_t; copy it into our array.
				 */
				db = dmu_bonus_hold(spa->spa_meta_objset,
				    ms_array[c]);
				dmu_buf_read(db);
				ASSERT3U(db->db_size, ==, sizeof (*smo));
				bcopy(db->db_data, &vd->vdev_smo[c],
				    db->db_size);
				ASSERT3U(vd->vdev_smo[c].smo_object, ==,
				    ms_array[c]);
				dmu_buf_rele(db);
			}
			kmem_free(ms_array, newc * sizeof (uint64_t));
		}
		vd->vdev_mg = metaslab_group_create(mc, vd);
	}

	/*
	 * Carry over the existing metaslabs, re-pointing each one at its
	 * space_map_obj_t in the new array.
	 */
	for (c = 0; c < oldc; c++) {
		vd->vdev_smo[c] = smo[c];
		vd->vdev_ms[c] = mspp[c];
		mspp[c]->ms_smo = &vd->vdev_smo[c];
	}

	/* Create the new metaslabs, each 1 << vdev_ms_shift bytes long. */
	for (c = oldc; c < newc; c++)
		metaslab_init(vd->vdev_mg, &vd->vdev_smo[c], &vd->vdev_ms[c],
		    c << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);

	if (oldc != 0) {
		kmem_free(smo, oldc * sizeof (*smo));
		kmem_free(mspp, oldc * sizeof (*mspp));
	}

}
678 
679 void
680 vdev_metaslab_fini(vdev_t *vd)
681 {
682 	uint64_t m;
683 	uint64_t count = vd->vdev_ms_count;
684 
685 	if (vd->vdev_ms != NULL) {
686 		for (m = 0; m < count; m++)
687 			metaslab_fini(vd->vdev_ms[m]);
688 		kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
689 		vd->vdev_ms = NULL;
690 	}
691 
692 	if (vd->vdev_smo != NULL) {
693 		kmem_free(vd->vdev_smo, count * sizeof (space_map_obj_t));
694 		vd->vdev_smo = NULL;
695 	}
696 }
697 
/*
 * Prepare a virtual device for access.
 *
 * Resets the vdev's knobs to their defaults, opens the device through
 * its ops vector, derives the vdev state from the result and from the
 * children's states, and validates/records the physical and allocatable
 * sizes and the alignment shift.
 *
 * Returns 0 on success.  On failure the vdev state is set to
 * VDEV_STATE_CANT_OPEN (or VDEV_STATE_OFFLINE) and one of the following
 * is returned:
 *	ENXIO		the vdev is marked offline
 *	EOVERFLOW	the device is smaller than the minimum size
 *	EINVAL		the ashift grew or the device shrank since the
 *			values were first recorded
 *	(other)		whatever the vdev_op_open routine returned
 */
int
vdev_open(vdev_t *vd)
{
	int error;
	vdev_knob_t *vk;
	int c;
	uint64_t osize = 0;
	uint64_t asize, psize;
	uint64_t ashift = -1ULL;

	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
	    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
	    vd->vdev_state == VDEV_STATE_OFFLINE);

	/*
	 * In VDEV_FAULT_COUNT mode the fault argument is halved on every
	 * open; any other fault mode is cleared.
	 * NOTE(review): the fault-injection semantics are defined outside
	 * this function -- confirm intent there.
	 */
	if (vd->vdev_fault_mode == VDEV_FAULT_COUNT)
		vd->vdev_fault_arg >>= 1;
	else
		vd->vdev_fault_mode = VDEV_FAULT_NONE;

	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;

	/*
	 * Reset every knob field in the vdev (located via vk_offset) to its
	 * default value, clamped to the range [vk_min, vk_max].
	 */
	for (vk = vdev_knob_next(NULL); vk != NULL; vk = vdev_knob_next(vk)) {
		uint64_t *valp = (uint64_t *)((char *)vd + vk->vk_offset);

		*valp = vk->vk_default;
		*valp = MAX(*valp, vk->vk_min);
		*valp = MIN(*valp, vk->vk_max);
	}

	/*
	 * Leaves get a cache and an I/O queue; vdev_cache_active records
	 * that so vdev_close() knows to tear them down again.
	 */
	if (vd->vdev_ops->vdev_op_leaf) {
		vdev_cache_init(vd);
		vdev_queue_init(vd);
		vd->vdev_cache_active = B_TRUE;
	}

	/* An offline vdev is never actually opened. */
	if (vd->vdev_offline) {
		ASSERT(vd->vdev_children == 0);
		dprintf("OFFLINE: %s = ENXIO\n", vdev_description(vd));
		vd->vdev_state = VDEV_STATE_OFFLINE;
		return (ENXIO);
	}

	error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);

	dprintf("%s = %d, osize %llu, state = %d\n",
	    vdev_description(vd), error, osize, vd->vdev_state);

	if (error) {
		dprintf("%s in %s failed to open, error %d, aux %d\n",
		    vdev_description(vd),
		    vdev_description(vd->vdev_parent),
		    error,
		    vd->vdev_stat.vs_aux);

		vd->vdev_state = VDEV_STATE_CANT_OPEN;
		return (error);
	}

	/* Healthy unless at least one child is in any other state. */
	vd->vdev_state = VDEV_STATE_HEALTHY;

	for (c = 0; c < vd->vdev_children; c++)
		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY)
			vd->vdev_state = VDEV_STATE_DEGRADED;

	/* Align the reported size to the size of a vdev label. */
	osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));

	/*
	 * For a childless vdev, osize is the physical size and the
	 * allocatable size excludes the label areas at the start and end.
	 * For a vdev with children, osize is taken as the allocatable size
	 * directly and no physical size is recorded.
	 */
	if (vd->vdev_children == 0) {
		if (osize < SPA_MINDEVSIZE) {
			vd->vdev_state = VDEV_STATE_CANT_OPEN;
			vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL;
			return (EOVERFLOW);
		}
		psize = osize;
		asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
	} else {
		if (osize < SPA_MINDEVSIZE -
		    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
			vd->vdev_state = VDEV_STATE_CANT_OPEN;
			vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL;
			return (EOVERFLOW);
		}
		psize = 0;
		asize = osize;
	}

	vd->vdev_psize = psize;

	if (vd->vdev_asize == 0) {
		/*
		 * This is the first-ever open, so use the computed values.
		 */
		vd->vdev_asize = asize;
		vd->vdev_ashift = ashift;
	} else {
		/*
		 * Make sure the alignment requirement hasn't increased.
		 */
		if (ashift > vd->vdev_ashift) {
			dprintf("%s: ashift grew\n", vdev_description(vd));
			vd->vdev_state = VDEV_STATE_CANT_OPEN;
			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
			return (EINVAL);
		}

		/*
		 * Make sure the device hasn't shrunk.
		 */
		if (asize < vd->vdev_asize) {
			dprintf("%s: device shrank\n", vdev_description(vd));
			vd->vdev_state = VDEV_STATE_CANT_OPEN;
			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
			return (EINVAL);
		}

		/*
		 * If all children are healthy and the asize has increased,
		 * then we've experienced dynamic LUN growth.
		 */
		if (vd->vdev_state == VDEV_STATE_HEALTHY &&
		    asize > vd->vdev_asize) {
			dprintf("%s: device grew\n", vdev_description(vd));
			vd->vdev_asize = asize;
		}
	}

	return (0);
}
828 
829 /*
830  * Close a virtual device.
831  */
832 void
833 vdev_close(vdev_t *vd)
834 {
835 	ASSERT3P(list_head(&vd->vdev_io_pending), ==, NULL);
836 
837 	vd->vdev_ops->vdev_op_close(vd);
838 
839 	if (vd->vdev_cache_active) {
840 		vdev_cache_fini(vd);
841 		vdev_queue_fini(vd);
842 		vd->vdev_cache_active = B_FALSE;
843 	}
844 
845 	if (vd->vdev_offline)
846 		vd->vdev_state = VDEV_STATE_OFFLINE;
847 	else
848 		vd->vdev_state = VDEV_STATE_CLOSED;
849 }
850 
/*
 * Close and re-open a top-level vdev, then recompute the root vdev's
 * state from its children.
 *
 * When called on the root vdev (rq must then be NULL) every child of
 * the root is reopened in turn.  For a top-level vdev, if 'rq' is
 * non-NULL the vdev's retry list is handed back through it and cleared.
 */
void
vdev_reopen(vdev_t *vd, zio_t **rq)
{
	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
	int c;

	if (vd == rvd) {
		ASSERT(rq == NULL);
		for (c = 0; c < rvd->vdev_children; c++)
			vdev_reopen(rvd->vdev_child[c], NULL);
		return;
	}

	/* only valid for top-level vdevs */
	ASSERT3P(vd, ==, vd->vdev_top);

	/*
	 * vdev_state can change when spa_config_lock is held as writer,
	 * or when it's held as reader and we're doing a vdev_reopen().
	 * To handle the latter case, we grab rvd's io_lock to serialize
	 * reopens.  This ensures that there's never more than one vdev
	 * state changer active at a time.
	 */
	mutex_enter(&rvd->vdev_io_lock);

	/*
	 * With the vdev's own io_lock held, wait for the pending list to
	 * drain before cycling the device.  The result of vdev_open() is
	 * deliberately ignored; the outcome is reflected in vdev_state.
	 */
	mutex_enter(&vd->vdev_io_lock);
	while (list_head(&vd->vdev_io_pending) != NULL)
		cv_wait(&vd->vdev_io_cv, &vd->vdev_io_lock);
	vdev_close(vd);
	(void) vdev_open(vd);
	if (rq != NULL) {
		*rq = vd->vdev_io_retry;
		vd->vdev_io_retry = NULL;
	}
	mutex_exit(&vd->vdev_io_lock);

	/*
	 * Reassess root vdev's health: it ends up with the minimum state
	 * value found among its children (or HEALTHY if none is lower).
	 */
	rvd->vdev_state = VDEV_STATE_HEALTHY;
	for (c = 0; c < rvd->vdev_children; c++) {
		uint64_t state = rvd->vdev_child[c]->vdev_state;
		rvd->vdev_state = MIN(rvd->vdev_state, state);
	}

	mutex_exit(&rvd->vdev_io_lock);
}
898 
899 int
900 vdev_create(vdev_t *vd, uint64_t txg)
901 {
902 	int error;
903 
904 	/*
905 	 * Normally, partial opens (e.g. of a mirror) are allowed.
906 	 * For a create, however, we want to fail the request if
907 	 * there are any components we can't open.
908 	 */
909 	error = vdev_open(vd);
910 
911 	if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
912 		vdev_close(vd);
913 		return (error ? error : ENXIO);
914 	}
915 
916 	/*
917 	 * Recursively initialize all labels.
918 	 */
919 	if ((error = vdev_label_init(vd, txg)) != 0) {
920 		vdev_close(vd);
921 		return (error);
922 	}
923 
924 	return (0);
925 }
926 
927 /*
928  * The is the latter half of vdev_create().  It is distinct because it
929  * involves initiating transactions in order to do metaslab creation.
930  * For creation, we want to try to create all vdevs at once and then undo it
931  * if anything fails; this is much harder if we have pending transactions.
932  */
933 void
934 vdev_init(vdev_t *vd, uint64_t txg)
935 {
936 	/*
937 	 * Aim for roughly 200 metaslabs per vdev.
938 	 */
939 	vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
940 	vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
941 
942 	/*
943 	 * Initialize the vdev's metaslabs.
944 	 */
945 	vdev_metaslab_init(vd, txg);
946 }
947 
948 void
949 vdev_dirty(vdev_t *vd, uint8_t flags, uint64_t txg)
950 {
951 	vdev_t *tvd = vd->vdev_top;
952 
953 	mutex_enter(&tvd->vdev_dirty_lock);
954 	if ((tvd->vdev_dirty[txg & TXG_MASK] & flags) != flags) {
955 		tvd->vdev_dirty[txg & TXG_MASK] |= flags;
956 		(void) txg_list_add(&tvd->vdev_spa->spa_vdev_txg_list,
957 		    tvd, txg);
958 	}
959 	mutex_exit(&tvd->vdev_dirty_lock);
960 }
961 
962 void
963 vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size)
964 {
965 	mutex_enter(sm->sm_lock);
966 	if (!space_map_contains(sm, txg, size))
967 		space_map_add(sm, txg, size);
968 	mutex_exit(sm->sm_lock);
969 }
970 
971 int
972 vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size)
973 {
974 	int dirty;
975 
976 	/*
977 	 * Quick test without the lock -- covers the common case that
978 	 * there are no dirty time segments.
979 	 */
980 	if (sm->sm_space == 0)
981 		return (0);
982 
983 	mutex_enter(sm->sm_lock);
984 	dirty = space_map_contains(sm, txg, size);
985 	mutex_exit(sm->sm_lock);
986 
987 	return (dirty);
988 }
989 
/*
 * Reassess DTLs after a config change or scrub completion.
 *
 * For a vdev with no children the DTL map is optionally trimmed and
 * merged with the scrub DTL (see below) and, if txg is non-zero, the
 * vdev is queued so its DTL gets synced in that txg.  For a vdev with
 * children, both of its maps are rebuilt from scratch as the union of
 * its children's maps after recursing into each child.
 */
void
vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
{
	int c;

	ASSERT(spa_config_held(vd->vdev_spa, RW_WRITER));

	if (vd->vdev_children == 0) {
		mutex_enter(&vd->vdev_dtl_lock);
		/*
		 * We've successfully scrubbed everything up to scrub_txg.
		 * Therefore, excise all old DTLs up to that point, then
		 * fold in the DTLs for everything we couldn't scrub.
		 */
		if (scrub_txg != 0) {
			space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg);
			space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub);
		}
		/* The scrub is over, so its DTL can be emptied. */
		if (scrub_done)
			space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
		mutex_exit(&vd->vdev_dtl_lock);
		/*
		 * Dirty the top-level vdev for DTL work in this txg and put
		 * this vdev on its per-txg DTL list.
		 */
		if (txg != 0) {
			vdev_t *tvd = vd->vdev_top;
			vdev_dirty(tvd, VDD_DTL, txg);
			(void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);
		}
		return;
	}

	/* Start from empty maps; they are rebuilt from the children. */
	mutex_enter(&vd->vdev_dtl_lock);
	space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
	mutex_exit(&vd->vdev_dtl_lock);

	/*
	 * The lock is dropped around each recursive call and re-taken only
	 * while folding that child's maps into ours.
	 */
	for (c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];
		vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done);
		mutex_enter(&vd->vdev_dtl_lock);
		space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map);
		space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub);
		mutex_exit(&vd->vdev_dtl_lock);
	}
}
1036 
1037 static int
1038 vdev_dtl_load(vdev_t *vd)
1039 {
1040 	spa_t *spa = vd->vdev_spa;
1041 	space_map_obj_t *smo = &vd->vdev_dtl;
1042 	dmu_buf_t *db;
1043 	int error;
1044 
1045 	ASSERT(vd->vdev_children == 0);
1046 
1047 	if (smo->smo_object == 0)
1048 		return (0);
1049 
1050 	db = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object);
1051 	dmu_buf_read(db);
1052 	ASSERT3U(db->db_size, ==, sizeof (*smo));
1053 	bcopy(db->db_data, smo, db->db_size);
1054 	dmu_buf_rele(db);
1055 
1056 	mutex_enter(&vd->vdev_dtl_lock);
1057 	error = space_map_load(&vd->vdev_dtl_map, smo, SM_ALLOC,
1058 	    spa->spa_meta_objset, smo->smo_objsize, smo->smo_alloc);
1059 	mutex_exit(&vd->vdev_dtl_lock);
1060 
1061 	return (error);
1062 }
1063 
/*
 * Write the in-core DTL map of 'vd' out to its DTL object in 'txg'.
 *
 * A detached vdev instead has its DTL object (if any) freed.  Otherwise
 * the object is allocated on first use, its previous contents are
 * freed, and the current contents of the DTL map are synced into it via
 * a private snapshot of the map.  Finally the updated space map header
 * is stored in the object's bonus buffer.
 */
void
vdev_dtl_sync(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	space_map_obj_t *smo = &vd->vdev_dtl;
	space_map_t *sm = &vd->vdev_dtl_map;
	space_map_t smsync;
	kmutex_t smlock;
	avl_tree_t *t = &sm->sm_root;
	space_seg_t *ss;
	dmu_buf_t *db;
	dmu_tx_t *tx;

	dprintf("%s in txg %llu pass %d\n",
	    vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

	/* Detached: release the on-disk object and we're done. */
	if (vd->vdev_detached) {
		if (smo->smo_object != 0) {
			int err = dmu_object_free(spa->spa_meta_objset,
			    smo->smo_object, tx);
			ASSERT3U(err, ==, 0);
			smo->smo_object = 0;
		}
		dmu_tx_commit(tx);
		return;
	}

	/*
	 * First sync for this vdev: allocate the DTL object.
	 * NOTE(review): the config is presumably dirtied so the new object
	 * number gets recorded in the pool config -- confirm.
	 */
	if (smo->smo_object == 0) {
		ASSERT(smo->smo_objsize == 0);
		ASSERT(smo->smo_alloc == 0);
		smo->smo_object = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
		ASSERT(smo->smo_object != 0);
		vdev_config_dirty(vd->vdev_top);
	}

	/* Discard whatever the object held before; it is rewritten below. */
	dmu_free_range(spa->spa_meta_objset, smo->smo_object,
	    0, smo->smo_objsize, tx);

	/*
	 * Build a private copy of the DTL map, protected by a local lock.
	 * vdev_dtl_lock is held only while the segments are copied, not
	 * across the call to space_map_sync().
	 */
	mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);

	space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
	    &smlock);

	mutex_enter(&smlock);

	mutex_enter(&vd->vdev_dtl_lock);
	for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss))
		space_map_add(&smsync, ss->ss_start, ss->ss_end - ss->ss_start);
	mutex_exit(&vd->vdev_dtl_lock);

	/* The object was emptied above, so the header restarts at zero. */
	smo->smo_objsize = 0;
	smo->smo_alloc = smsync.sm_space;

	space_map_sync(&smsync, NULL, smo, SM_ALLOC, spa->spa_meta_objset, tx);
	space_map_destroy(&smsync);

	mutex_exit(&smlock);
	mutex_destroy(&smlock);

	/* Store the updated header in the object's bonus buffer. */
	db = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object);
	dmu_buf_will_dirty(db, tx);
	ASSERT3U(db->db_size, ==, sizeof (*smo));
	bcopy(smo, db->db_data, db->db_size);
	dmu_buf_rele(db);

	dmu_tx_commit(tx);
}
1135 
1136 int
1137 vdev_load(vdev_t *vd, int import)
1138 {
1139 	spa_t *spa = vd->vdev_spa;
1140 	int c, error;
1141 	nvlist_t *label;
1142 	uint64_t guid, state;
1143 
1144 	dprintf("loading %s\n", vdev_description(vd));
1145 
1146 	/*
1147 	 * Recursively load all children.
1148 	 */
1149 	for (c = 0; c < vd->vdev_children; c++)
1150 		if ((error = vdev_load(vd->vdev_child[c], import)) != 0)
1151 			return (error);
1152 
1153 	/*
1154 	 * If this is a leaf vdev, make sure its agrees with its disk labels.
1155 	 */
1156 	if (vd->vdev_ops->vdev_op_leaf) {
1157 
1158 		if (vdev_is_dead(vd))
1159 			return (0);
1160 
1161 		/*
1162 		 * XXX state transitions don't propagate to parent here.
1163 		 * Also, merely setting the state isn't sufficient because
1164 		 * it's not persistent; a vdev_reopen() would make us
1165 		 * forget all about it.
1166 		 */
1167 		if ((label = vdev_label_read_config(vd)) == NULL) {
1168 			dprintf("can't load label config\n");
1169 			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
1170 			    VDEV_AUX_CORRUPT_DATA);
1171 			return (0);
1172 		}
1173 
1174 		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
1175 		    &guid) != 0 || guid != spa_guid(spa)) {
1176 			dprintf("bad or missing pool GUID (%llu)\n", guid);
1177 			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
1178 			    VDEV_AUX_CORRUPT_DATA);
1179 			nvlist_free(label);
1180 			return (0);
1181 		}
1182 
1183 		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) ||
1184 		    guid != vd->vdev_guid) {
1185 			dprintf("bad or missing vdev guid (%llu != %llu)\n",
1186 			    guid, vd->vdev_guid);
1187 			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
1188 			    VDEV_AUX_CORRUPT_DATA);
1189 			nvlist_free(label);
1190 			return (0);
1191 		}
1192 
1193 		/*
1194 		 * If we find a vdev with a matching pool guid and vdev guid,
1195 		 * but the pool state is not active, it indicates that the user
1196 		 * exported or destroyed the pool without affecting the config
1197 		 * cache (if / was mounted readonly, for example).  In this
1198 		 * case, immediately return EBADF so the caller can remove it
1199 		 * from the config.
1200 		 */
1201 		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
1202 		    &state)) {
1203 			dprintf("missing pool state\n");
1204 			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
1205 			    VDEV_AUX_CORRUPT_DATA);
1206 			nvlist_free(label);
1207 			return (0);
1208 		}
1209 
1210 		if (state != POOL_STATE_ACTIVE &&
1211 		    (!import || state != POOL_STATE_EXPORTED)) {
1212 			dprintf("pool state not active (%llu)\n", state);
1213 			nvlist_free(label);
1214 			return (EBADF);
1215 		}
1216 
1217 		nvlist_free(label);
1218 	}
1219 
1220 	/*
1221 	 * If this is a top-level vdev, make sure its allocation parameters
1222 	 * exist and initialize its metaslabs.
1223 	 */
1224 	if (vd == vd->vdev_top) {
1225 
1226 		if (vd->vdev_ms_array == 0 ||
1227 		    vd->vdev_ms_shift == 0 ||
1228 		    vd->vdev_ashift == 0 ||
1229 		    vd->vdev_asize == 0) {
1230 			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
1231 			    VDEV_AUX_CORRUPT_DATA);
1232 			return (0);
1233 		}
1234 
1235 		vdev_metaslab_init(vd, 0);
1236 	}
1237 
1238 	/*
1239 	 * If this is a leaf vdev, load its DTL.
1240 	 */
1241 	if (vd->vdev_ops->vdev_op_leaf) {
1242 		error = vdev_dtl_load(vd);
1243 		if (error) {
1244 			dprintf("can't load DTL for %s, error %d\n",
1245 			    vdev_description(vd), error);
1246 			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
1247 			    VDEV_AUX_CORRUPT_DATA);
1248 			return (0);
1249 		}
1250 	}
1251 
1252 	return (0);
1253 }
1254 
1255 void
1256 vdev_sync_done(vdev_t *vd, uint64_t txg)
1257 {
1258 	metaslab_t *msp;
1259 
1260 	dprintf("%s txg %llu\n", vdev_description(vd), txg);
1261 
1262 	while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
1263 		metaslab_sync_done(msp, txg);
1264 }
1265 
1266 void
1267 vdev_add_sync(vdev_t *vd, uint64_t txg)
1268 {
1269 	spa_t *spa = vd->vdev_spa;
1270 	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1271 
1272 	ASSERT(vd == vd->vdev_top);
1273 
1274 	if (vd->vdev_ms_array == 0)
1275 		vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
1276 		    DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
1277 
1278 	ASSERT(vd->vdev_ms_array != 0);
1279 
1280 	vdev_config_dirty(vd);
1281 
1282 	dmu_tx_commit(tx);
1283 }
1284 
1285 void
1286 vdev_sync(vdev_t *vd, uint64_t txg)
1287 {
1288 	spa_t *spa = vd->vdev_spa;
1289 	vdev_t *lvd;
1290 	metaslab_t *msp;
1291 	uint8_t *dirtyp = &vd->vdev_dirty[txg & TXG_MASK];
1292 	uint8_t dirty = *dirtyp;
1293 
1294 	mutex_enter(&vd->vdev_dirty_lock);
1295 	*dirtyp &= ~(VDD_ALLOC | VDD_FREE | VDD_ADD | VDD_DTL);
1296 	mutex_exit(&vd->vdev_dirty_lock);
1297 
1298 	dprintf("%s txg %llu pass %d\n",
1299 	    vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));
1300 
1301 	if (dirty & VDD_ADD)
1302 		vdev_add_sync(vd, txg);
1303 
1304 	while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL)
1305 		metaslab_sync(msp, txg);
1306 
1307 	while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
1308 		vdev_dtl_sync(lvd, txg);
1309 
1310 	(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
1311 }
1312 
1313 uint64_t
1314 vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
1315 {
1316 	return (vd->vdev_ops->vdev_op_asize(vd, psize));
1317 }
1318 
1319 void
1320 vdev_io_start(zio_t *zio)
1321 {
1322 	zio->io_vd->vdev_ops->vdev_op_io_start(zio);
1323 }
1324 
1325 void
1326 vdev_io_done(zio_t *zio)
1327 {
1328 	zio->io_vd->vdev_ops->vdev_op_io_done(zio);
1329 }
1330 
1331 const char *
1332 vdev_description(vdev_t *vd)
1333 {
1334 	if (vd == NULL || vd->vdev_ops == NULL)
1335 		return ("<unknown>");
1336 
1337 	if (vd->vdev_path != NULL)
1338 		return (vd->vdev_path);
1339 
1340 	if (vd->vdev_parent == NULL)
1341 		return (spa_name(vd->vdev_spa));
1342 
1343 	return (vd->vdev_ops->vdev_op_type);
1344 }
1345 
1346 int
1347 vdev_online(spa_t *spa, const char *path)
1348 {
1349 	vdev_t *rvd, *vd;
1350 	uint64_t txg;
1351 
1352 	txg = spa_vdev_enter(spa);
1353 
1354 	rvd = spa->spa_root_vdev;
1355 	if ((vd = vdev_lookup_by_path(rvd, path)) == NULL)
1356 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
1357 
1358 	dprintf("ONLINE: %s\n", vdev_description(vd));
1359 
1360 	vd->vdev_offline = B_FALSE;
1361 	vd->vdev_tmpoffline = B_FALSE;
1362 
1363 	/*
1364 	 * Clear the error counts.  The idea is that you expect to see all
1365 	 * zeroes when everything is working, so if you've just onlined a
1366 	 * device, you don't want to keep hearing about errors from before.
1367 	 */
1368 	vd->vdev_stat.vs_read_errors = 0;
1369 	vd->vdev_stat.vs_write_errors = 0;
1370 	vd->vdev_stat.vs_checksum_errors = 0;
1371 
1372 	vdev_reopen(vd->vdev_top, NULL);
1373 
1374 	spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));
1375 
1376 	vdev_config_dirty(vd->vdev_top);
1377 
1378 	(void) spa_vdev_exit(spa, NULL, txg, 0);
1379 
1380 	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
1381 
1382 	return (0);
1383 }
1384 
1385 int
1386 vdev_offline(spa_t *spa, const char *path, int istmp)
1387 {
1388 	vdev_t *rvd, *vd;
1389 	uint64_t txg;
1390 
1391 	txg = spa_vdev_enter(spa);
1392 
1393 	rvd = spa->spa_root_vdev;
1394 	if ((vd = vdev_lookup_by_path(rvd, path)) == NULL)
1395 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
1396 
1397 	dprintf("OFFLINE: %s\n", vdev_description(vd));
1398 
1399 	/* vdev is already offlined, do nothing */
1400 	if (vd->vdev_offline)
1401 		return (spa_vdev_exit(spa, NULL, txg, 0));
1402 
1403 	/*
1404 	 * If this device's top-level vdev has a non-empty DTL,
1405 	 * don't allow the device to be offlined.
1406 	 *
1407 	 * XXX -- we should make this more precise by allowing the offline
1408 	 * as long as the remaining devices don't have any DTL holes.
1409 	 */
1410 	if (vd->vdev_top->vdev_dtl_map.sm_space != 0)
1411 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
1412 
1413 	/*
1414 	 * Set this device to offline state and reopen its top-level vdev.
1415 	 * If this action results in the top-level vdev becoming unusable,
1416 	 * undo it and fail the request.
1417 	 */
1418 	vd->vdev_offline = B_TRUE;
1419 	vdev_reopen(vd->vdev_top, NULL);
1420 	if (vdev_is_dead(vd->vdev_top)) {
1421 		vd->vdev_offline = B_FALSE;
1422 		vdev_reopen(vd->vdev_top, NULL);
1423 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
1424 	}
1425 
1426 	vd->vdev_tmpoffline = istmp;
1427 	if (istmp)
1428 		return (spa_vdev_exit(spa, NULL, txg, 0));
1429 
1430 	spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));
1431 
1432 	vdev_config_dirty(vd->vdev_top);
1433 
1434 	return (spa_vdev_exit(spa, NULL, txg, 0));
1435 }
1436 
1437 int
1438 vdev_error_setup(spa_t *spa, const char *path, int mode, int mask, uint64_t arg)
1439 {
1440 	vdev_t *vd;
1441 
1442 	spa_config_enter(spa, RW_WRITER);
1443 
1444 	if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
1445 		spa_config_exit(spa);
1446 		return (ENODEV);
1447 	}
1448 
1449 	vd->vdev_fault_mode = mode;
1450 	vd->vdev_fault_mask = mask;
1451 	vd->vdev_fault_arg = arg;
1452 
1453 	spa_config_exit(spa);
1454 
1455 	return (0);
1456 }
1457 
1458 int
1459 vdev_is_dead(vdev_t *vd)
1460 {
1461 	return (vd->vdev_state <= VDEV_STATE_CANT_OPEN);
1462 }
1463 
1464 int
1465 vdev_error_inject(vdev_t *vd, zio_t *zio)
1466 {
1467 	int error = 0;
1468 
1469 	if (vd->vdev_fault_mode == VDEV_FAULT_NONE)
1470 		return (0);
1471 
1472 	if (((1ULL << zio->io_type) & vd->vdev_fault_mask) == 0)
1473 		return (0);
1474 
1475 	switch (vd->vdev_fault_mode) {
1476 	case VDEV_FAULT_RANDOM:
1477 		if (spa_get_random(vd->vdev_fault_arg) == 0)
1478 			error = EIO;
1479 		break;
1480 
1481 	case VDEV_FAULT_COUNT:
1482 		if ((int64_t)--vd->vdev_fault_arg <= 0)
1483 			vd->vdev_fault_mode = VDEV_FAULT_NONE;
1484 		error = EIO;
1485 		break;
1486 	}
1487 
1488 	if (error != 0) {
1489 		dprintf("returning %d for type %d on %s state %d offset %llx\n",
1490 		    error, zio->io_type, vdev_description(vd),
1491 		    vd->vdev_state, zio->io_offset);
1492 	}
1493 
1494 	return (error);
1495 }
1496 
1497 /*
1498  * Get statistics for the given vdev.
1499  */
1500 void
1501 vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
1502 {
1503 	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
1504 	int c, t;
1505 
1506 	mutex_enter(&vd->vdev_stat_lock);
1507 	bcopy(&vd->vdev_stat, vs, sizeof (*vs));
1508 	vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
1509 	vs->vs_state = vd->vdev_state;
1510 	vs->vs_rsize = vdev_get_rsize(vd);
1511 	mutex_exit(&vd->vdev_stat_lock);
1512 
1513 	/*
1514 	 * If we're getting stats on the root vdev, aggregate the I/O counts
1515 	 * over all top-level vdevs (i.e. the direct children of the root).
1516 	 */
1517 	if (vd == rvd) {
1518 		for (c = 0; c < rvd->vdev_children; c++) {
1519 			vdev_t *cvd = rvd->vdev_child[c];
1520 			vdev_stat_t *cvs = &cvd->vdev_stat;
1521 
1522 			mutex_enter(&vd->vdev_stat_lock);
1523 			for (t = 0; t < ZIO_TYPES; t++) {
1524 				vs->vs_ops[t] += cvs->vs_ops[t];
1525 				vs->vs_bytes[t] += cvs->vs_bytes[t];
1526 			}
1527 			vs->vs_read_errors += cvs->vs_read_errors;
1528 			vs->vs_write_errors += cvs->vs_write_errors;
1529 			vs->vs_checksum_errors += cvs->vs_checksum_errors;
1530 			vs->vs_scrub_examined += cvs->vs_scrub_examined;
1531 			vs->vs_scrub_errors += cvs->vs_scrub_errors;
1532 			mutex_exit(&vd->vdev_stat_lock);
1533 		}
1534 	}
1535 }
1536 
1537 void
1538 vdev_stat_update(zio_t *zio)
1539 {
1540 	vdev_t *vd = zio->io_vd;
1541 	vdev_t *pvd;
1542 	uint64_t txg = zio->io_txg;
1543 	vdev_stat_t *vs = &vd->vdev_stat;
1544 	zio_type_t type = zio->io_type;
1545 	int flags = zio->io_flags;
1546 
1547 	if (zio->io_error == 0) {
1548 		if (!(flags & ZIO_FLAG_IO_BYPASS)) {
1549 			mutex_enter(&vd->vdev_stat_lock);
1550 			vs->vs_ops[type]++;
1551 			vs->vs_bytes[type] += zio->io_size;
1552 			mutex_exit(&vd->vdev_stat_lock);
1553 		}
1554 		if ((flags & ZIO_FLAG_IO_REPAIR) &&
1555 		    zio->io_delegate_list == NULL) {
1556 			mutex_enter(&vd->vdev_stat_lock);
1557 			if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))
1558 				vs->vs_scrub_repaired += zio->io_size;
1559 			else
1560 				vs->vs_self_healed += zio->io_size;
1561 			mutex_exit(&vd->vdev_stat_lock);
1562 		}
1563 		return;
1564 	}
1565 
1566 	if (flags & ZIO_FLAG_SPECULATIVE)
1567 		return;
1568 
1569 	if (!vdev_is_dead(vd)) {
1570 		mutex_enter(&vd->vdev_stat_lock);
1571 		if (type == ZIO_TYPE_READ) {
1572 			if (zio->io_error == ECKSUM)
1573 				vs->vs_checksum_errors++;
1574 			else
1575 				vs->vs_read_errors++;
1576 		}
1577 		if (type == ZIO_TYPE_WRITE)
1578 			vs->vs_write_errors++;
1579 		mutex_exit(&vd->vdev_stat_lock);
1580 	}
1581 
1582 	if (type == ZIO_TYPE_WRITE) {
1583 		if (txg == 0 || vd->vdev_children != 0)
1584 			return;
1585 		if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) {
1586 			ASSERT(flags & ZIO_FLAG_IO_REPAIR);
1587 			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
1588 				vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1);
1589 		}
1590 		if (!(flags & ZIO_FLAG_IO_REPAIR)) {
1591 			vdev_t *tvd = vd->vdev_top;
1592 			if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1))
1593 				return;
1594 			vdev_dirty(tvd, VDD_DTL, txg);
1595 			(void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);
1596 			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
1597 				vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1);
1598 		}
1599 	}
1600 }
1601 
1602 void
1603 vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
1604 {
1605 	int c;
1606 	vdev_stat_t *vs = &vd->vdev_stat;
1607 
1608 	for (c = 0; c < vd->vdev_children; c++)
1609 		vdev_scrub_stat_update(vd->vdev_child[c], type, complete);
1610 
1611 	mutex_enter(&vd->vdev_stat_lock);
1612 
1613 	if (type == POOL_SCRUB_NONE) {
1614 		/*
1615 		 * Update completion and end time.  Leave everything else alone
1616 		 * so we can report what happened during the previous scrub.
1617 		 */
1618 		vs->vs_scrub_complete = complete;
1619 		vs->vs_scrub_end = gethrestime_sec();
1620 	} else {
1621 		vs->vs_scrub_type = type;
1622 		vs->vs_scrub_complete = 0;
1623 		vs->vs_scrub_examined = 0;
1624 		vs->vs_scrub_repaired = 0;
1625 		vs->vs_scrub_errors = 0;
1626 		vs->vs_scrub_start = gethrestime_sec();
1627 		vs->vs_scrub_end = 0;
1628 	}
1629 
1630 	mutex_exit(&vd->vdev_stat_lock);
1631 }
1632 
1633 /*
1634  * Report checksum errors that a vdev that didn't realize it made.
1635  * This can happen, for example, when RAID-Z combinatorial reconstruction
1636  * infers that one of its components returned bad data.
1637  */
1638 void
1639 vdev_checksum_error(zio_t *zio, vdev_t *vd)
1640 {
1641 	dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
1642 	    vdev_description(vd));
1643 
1644 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1645 		mutex_enter(&vd->vdev_stat_lock);
1646 		vd->vdev_stat.vs_checksum_errors++;
1647 		mutex_exit(&vd->vdev_stat_lock);
1648 	}
1649 }
1650 
1651 /*
1652  * Update the in-core space usage stats for this vdev and the root vdev.
1653  */
1654 void
1655 vdev_space_update(vdev_t *vd, uint64_t space_delta, uint64_t alloc_delta)
1656 {
1657 	ASSERT(vd == vd->vdev_top);
1658 
1659 	do {
1660 		mutex_enter(&vd->vdev_stat_lock);
1661 		vd->vdev_stat.vs_space += space_delta;
1662 		vd->vdev_stat.vs_alloc += alloc_delta;
1663 		mutex_exit(&vd->vdev_stat_lock);
1664 	} while ((vd = vd->vdev_parent) != NULL);
1665 }
1666 
1667 /*
1668  * Various knobs to tune a vdev.
1669  */
1670 static vdev_knob_t vdev_knob[] = {
1671 	{
1672 		"cache_size",
1673 		"size of the read-ahead cache",
1674 		0,
1675 		1ULL << 30,
1676 		10ULL << 20,
1677 		offsetof(struct vdev, vdev_cache.vc_size)
1678 	},
1679 	{
1680 		"cache_bshift",
1681 		"log2 of cache blocksize",
1682 		SPA_MINBLOCKSHIFT,
1683 		SPA_MAXBLOCKSHIFT,
1684 		16,
1685 		offsetof(struct vdev, vdev_cache.vc_bshift)
1686 	},
1687 	{
1688 		"cache_max",
1689 		"largest block size to cache",
1690 		0,
1691 		SPA_MAXBLOCKSIZE,
1692 		1ULL << 14,
1693 		offsetof(struct vdev, vdev_cache.vc_max)
1694 	},
1695 	{
1696 		"min_pending",
1697 		"minimum pending I/Os to the disk",
1698 		1,
1699 		10000,
1700 		2,
1701 		offsetof(struct vdev, vdev_queue.vq_min_pending)
1702 	},
1703 	{
1704 		"max_pending",
1705 		"maximum pending I/Os to the disk",
1706 		1,
1707 		10000,
1708 		35,
1709 		offsetof(struct vdev, vdev_queue.vq_max_pending)
1710 	},
1711 	{
1712 		"agg_limit",
1713 		"maximum size of aggregated I/Os",
1714 		0,
1715 		SPA_MAXBLOCKSIZE,
1716 		SPA_MAXBLOCKSIZE,
1717 		offsetof(struct vdev, vdev_queue.vq_agg_limit)
1718 	},
1719 	{
1720 		"time_shift",
1721 		"deadline = pri + (lbolt >> time_shift)",
1722 		0,
1723 		63,
1724 		4,
1725 		offsetof(struct vdev, vdev_queue.vq_time_shift)
1726 	},
1727 	{
1728 		"ramp_rate",
1729 		"exponential I/O issue ramp-up rate",
1730 		1,
1731 		10000,
1732 		2,
1733 		offsetof(struct vdev, vdev_queue.vq_ramp_rate)
1734 	},
1735 };
1736 
1737 vdev_knob_t *
1738 vdev_knob_next(vdev_knob_t *vk)
1739 {
1740 	if (vk == NULL)
1741 		return (vdev_knob);
1742 
1743 	if (++vk == vdev_knob + sizeof (vdev_knob) / sizeof (vdev_knob_t))
1744 		return (NULL);
1745 
1746 	return (vk);
1747 }
1748 
1749 /*
1750  * Mark a top-level vdev's config as dirty, placing it on the dirty list
1751  * so that it will be written out next time the vdev configuration is synced.
1752  * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
1753  */
1754 void
1755 vdev_config_dirty(vdev_t *vd)
1756 {
1757 	spa_t *spa = vd->vdev_spa;
1758 	vdev_t *rvd = spa->spa_root_vdev;
1759 	int c;
1760 
1761 	if (vd == rvd) {
1762 		for (c = 0; c < rvd->vdev_children; c++)
1763 			vdev_config_dirty(rvd->vdev_child[c]);
1764 	} else {
1765 		ASSERT(vd == vd->vdev_top);
1766 
1767 		if (!vd->vdev_is_dirty) {
1768 			list_insert_head(&spa->spa_dirty_list, vd);
1769 			vd->vdev_is_dirty = B_TRUE;
1770 		}
1771 	}
1772 }
1773 
1774 void
1775 vdev_config_clean(vdev_t *vd)
1776 {
1777 	ASSERT(vd->vdev_is_dirty);
1778 
1779 	list_remove(&vd->vdev_spa->spa_dirty_list, vd);
1780 	vd->vdev_is_dirty = B_FALSE;
1781 }
1782 
1783 /*
1784  * Set a vdev's state, updating any parent's state as well.
1785  */
1786 void
1787 vdev_set_state(vdev_t *vd, vdev_state_t state, vdev_aux_t aux)
1788 {
1789 	if (state == vd->vdev_state)
1790 		return;
1791 
1792 	vd->vdev_state = state;
1793 	vd->vdev_stat.vs_aux = aux;
1794 
1795 	if (vd->vdev_parent != NULL) {
1796 		int c;
1797 		int degraded = 0, faulted = 0;
1798 		vdev_t *parent, *child;
1799 
1800 		parent = vd->vdev_parent;
1801 		for (c = 0; c < parent->vdev_children; c++) {
1802 			child = parent->vdev_child[c];
1803 			if (child->vdev_state <= VDEV_STATE_CANT_OPEN)
1804 				faulted++;
1805 			else if (child->vdev_state == VDEV_STATE_DEGRADED)
1806 				degraded++;
1807 		}
1808 
1809 		vd->vdev_parent->vdev_ops->vdev_op_state_change(
1810 		    vd->vdev_parent, faulted, degraded);
1811 	    }
1812 }
1813