xref: /illumos-gate/usr/src/uts/common/fs/zfs/vdev.c (revision 7f7322febbcfe774b7270abc3b191c094bfcc517)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/zfs_context.h>
30 #include <sys/spa.h>
31 #include <sys/spa_impl.h>
32 #include <sys/dmu.h>
33 #include <sys/dmu_tx.h>
34 #include <sys/vdev_impl.h>
35 #include <sys/uberblock_impl.h>
36 #include <sys/metaslab.h>
37 #include <sys/metaslab_impl.h>
38 #include <sys/space_map.h>
39 #include <sys/zio.h>
40 #include <sys/zap.h>
41 #include <sys/fs/zfs.h>
42 
43 /*
44  * Virtual device management.
45  */
46 
47 static vdev_ops_t *vdev_ops_table[] = {
48 	&vdev_root_ops,
49 	&vdev_raidz_ops,
50 	&vdev_mirror_ops,
51 	&vdev_replacing_ops,
52 	&vdev_disk_ops,
53 	&vdev_file_ops,
54 	&vdev_missing_ops,
55 	NULL
56 };
57 
58 /*
59  * Given a vdev type, return the appropriate ops vector.
60  */
61 static vdev_ops_t *
62 vdev_getops(const char *type)
63 {
64 	vdev_ops_t *ops, **opspp;
65 
66 	for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
67 		if (strcmp(ops->vdev_op_type, type) == 0)
68 			break;
69 
70 	return (ops);
71 }
72 
73 /*
74  * Default asize function: return the MAX of psize with the asize of
75  * all children.  This is what's used by anything other than RAID-Z.
76  */
77 uint64_t
78 vdev_default_asize(vdev_t *vd, uint64_t psize)
79 {
80 	uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_ashift);
81 	uint64_t csize;
82 	uint64_t c;
83 
84 	for (c = 0; c < vd->vdev_children; c++) {
85 		csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
86 		asize = MAX(asize, csize);
87 	}
88 
89 	return (asize);
90 }
91 
92 /*
93  * Get the replaceable or attachable device size.
94  * If the parent is a mirror or raidz, the replaceable size is the minimum
95  * psize of all its children.  Otherwise, just return our own psize.
96  *
97  * e.g.
98  *			psize	rsize
99  * root			-	-
100  *	mirror/raidz	-	-
101  *	    disk1	20g	20g
102  *	    disk2 	40g	20g
103  *	disk3 		80g	80g
104  */
105 uint64_t
106 vdev_get_rsize(vdev_t *vd)
107 {
108 	vdev_t *pvd, *cvd;
109 	uint64_t c, rsize;
110 
111 	pvd = vd->vdev_parent;
112 
113 	/*
114 	 * If our parent is NULL or the root, just return our own psize.
115 	 */
116 	if (pvd == NULL || pvd->vdev_parent == NULL)
117 		return (vd->vdev_psize);
118 
119 	rsize = 0;
120 
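	/*
	 * Note: rsize starts at 0, so the unsigned wraparound of (0 - 1)
	 * makes the first MIN() below pick the child's psize; it also
	 * leaves rsize unchanged for any child whose psize is still 0.
	 */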
121 	for (c = 0; c < pvd->vdev_children; c++) {
122 		cvd = pvd->vdev_child[c];
123 		rsize = MIN(rsize - 1, cvd->vdev_psize - 1) + 1;
124 	}
125 
126 	return (rsize);
127 }
128 
129 vdev_t *
130 vdev_lookup_top(spa_t *spa, uint64_t vdev)
131 {
132 	vdev_t *rvd = spa->spa_root_vdev;
133 
134 	if (vdev < rvd->vdev_children)
135 		return (rvd->vdev_child[vdev]);
136 
137 	return (NULL);
138 }
139 
140 vdev_t *
141 vdev_lookup_by_path(vdev_t *vd, const char *path)
142 {
143 	int c;
144 	vdev_t *mvd;
145 
146 	if (vd->vdev_path != NULL) {
147 		if (vd->vdev_wholedisk == 1) {
148 			/*
149 			 * For whole disks, the internal path has 's0', but the
150 			 * path passed in by the user doesn't.
151 			 */
152 			if (strlen(path) == strlen(vd->vdev_path) - 2 &&
153 			    strncmp(path, vd->vdev_path, strlen(path)) == 0)
154 				return (vd);
155 		} else if (strcmp(path, vd->vdev_path) == 0) {
156 			return (vd);
157 		}
158 	}
159 
160 	for (c = 0; c < vd->vdev_children; c++)
161 		if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) !=
162 		    NULL)
163 			return (mvd);
164 
165 	return (NULL);
166 }
167 
168 vdev_t *
169 vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
170 {
171 	int c;
172 	vdev_t *mvd;
173 
174 	if (vd->vdev_children == 0 && vd->vdev_guid == guid)
175 		return (vd);
176 
177 	for (c = 0; c < vd->vdev_children; c++)
178 		if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
179 		    NULL)
180 			return (mvd);
181 
182 	return (NULL);
183 }
184 
185 void
186 vdev_add_child(vdev_t *pvd, vdev_t *cvd)
187 {
188 	size_t oldsize, newsize;
189 	uint64_t id = cvd->vdev_id;
190 	vdev_t **newchild;
191 
192 	ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));
193 	ASSERT(cvd->vdev_parent == NULL);
194 
195 	cvd->vdev_parent = pvd;
196 
197 	if (pvd == NULL)
198 		return;
199 
200 	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
201 
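	/*
	 * Grow the child array (if necessary) so that slot 'id' exists,
	 * preserving any existing entries.
	 */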
202 	oldsize = pvd->vdev_children * sizeof (vdev_t *);
203 	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
204 	newsize = pvd->vdev_children * sizeof (vdev_t *);
205 
206 	newchild = kmem_zalloc(newsize, KM_SLEEP);
207 	if (pvd->vdev_child != NULL) {
208 		bcopy(pvd->vdev_child, newchild, oldsize);
209 		kmem_free(pvd->vdev_child, oldsize);
210 	}
211 
212 	pvd->vdev_child = newchild;
213 	pvd->vdev_child[id] = cvd;
214 
215 	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top : cvd);
216 	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
217 
218 	/*
219 	 * Walk up all ancestors to update guid sum.
220 	 */
221 	for (; pvd != NULL; pvd = pvd->vdev_parent)
222 		pvd->vdev_guid_sum += cvd->vdev_guid_sum;
223 }
224 
225 void
226 vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
227 {
228 	int c;
229 	uint_t id = cvd->vdev_id;
230 
231 	ASSERT(cvd->vdev_parent == pvd);
232 
233 	if (pvd == NULL)
234 		return;
235 
236 	ASSERT(id < pvd->vdev_children);
237 	ASSERT(pvd->vdev_child[id] == cvd);
238 
239 	pvd->vdev_child[id] = NULL;
240 	cvd->vdev_parent = NULL;
241 
242 	for (c = 0; c < pvd->vdev_children; c++)
243 		if (pvd->vdev_child[c])
244 			break;
245 
246 	if (c == pvd->vdev_children) {
247 		kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
248 		pvd->vdev_child = NULL;
249 		pvd->vdev_children = 0;
250 	}
251 
252 	/*
253 	 * Walk up all ancestors to update guid sum.
254 	 */
255 	for (; pvd != NULL; pvd = pvd->vdev_parent)
256 		pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
257 }
258 
259 /*
260  * Remove any holes in the child array.
261  */
262 void
263 vdev_compact_children(vdev_t *pvd)
264 {
265 	vdev_t **newchild, *cvd;
266 	int oldc = pvd->vdev_children;
267 	int newc, c;
268 
269 	ASSERT(spa_config_held(pvd->vdev_spa, RW_WRITER));
270 
271 	for (c = newc = 0; c < oldc; c++)
272 		if (pvd->vdev_child[c])
273 			newc++;
274 
275 	newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);
276 
277 	for (c = newc = 0; c < oldc; c++) {
278 		if ((cvd = pvd->vdev_child[c]) != NULL) {
279 			newchild[newc] = cvd;
280 			cvd->vdev_id = newc++;
281 		}
282 	}
283 
284 	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
285 	pvd->vdev_child = newchild;
286 	pvd->vdev_children = newc;
287 }
288 
289 /*
290  * Allocate and minimally initialize a vdev_t.
291  */
292 static vdev_t *
293 vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
294 {
295 	vdev_t *vd;
296 
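	/*
	 * A guid of 0 means the caller wants us to generate one; keep
	 * picking random values until we get something non-zero.
	 */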
297 	while (guid == 0)
298 		guid = spa_get_random(-1ULL);
299 
300 	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
301 
302 	vd->vdev_spa = spa;
303 	vd->vdev_id = id;
304 	vd->vdev_guid = guid;
305 	vd->vdev_guid_sum = guid;
306 	vd->vdev_ops = ops;
307 	vd->vdev_state = VDEV_STATE_CLOSED;
308 
309 	mutex_init(&vd->vdev_io_lock, NULL, MUTEX_DEFAULT, NULL);
310 	cv_init(&vd->vdev_io_cv, NULL, CV_DEFAULT, NULL);
311 	list_create(&vd->vdev_io_pending, sizeof (zio_t),
312 	    offsetof(zio_t, io_pending));
313 	mutex_init(&vd->vdev_dirty_lock, NULL, MUTEX_DEFAULT, NULL);
314 	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
315 	space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock);
316 	space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock);
317 	txg_list_create(&vd->vdev_ms_list,
318 	    offsetof(struct metaslab, ms_txg_node));
319 	txg_list_create(&vd->vdev_dtl_list,
320 	    offsetof(struct vdev, vdev_dtl_node));
321 	vd->vdev_stat.vs_timestamp = gethrtime();
322 
323 	return (vd);
324 }
325 
326 /*
327  * Free a vdev_t that has been removed from service.
328  */
329 static void
330 vdev_free_common(vdev_t *vd)
331 {
332 	if (vd->vdev_path)
333 		spa_strfree(vd->vdev_path);
334 	if (vd->vdev_devid)
335 		spa_strfree(vd->vdev_devid);
336 
337 	txg_list_destroy(&vd->vdev_ms_list);
338 	txg_list_destroy(&vd->vdev_dtl_list);
339 	mutex_enter(&vd->vdev_dtl_lock);
340 	space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
341 	space_map_destroy(&vd->vdev_dtl_map);
342 	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
343 	space_map_destroy(&vd->vdev_dtl_scrub);
344 	mutex_exit(&vd->vdev_dtl_lock);
345 	mutex_destroy(&vd->vdev_dtl_lock);
346 	mutex_destroy(&vd->vdev_dirty_lock);
347 	list_destroy(&vd->vdev_io_pending);
348 	mutex_destroy(&vd->vdev_io_lock);
349 	cv_destroy(&vd->vdev_io_cv);
350 
351 	kmem_free(vd, sizeof (vdev_t));
352 }
353 
354 /*
355  * Allocate a new vdev.  The 'alloctype' is used to control whether we are
356  * creating a new vdev or loading an existing one - the behavior is slightly
357  * different for each case.
358  */
359 vdev_t *
360 vdev_alloc(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype)
361 {
362 	vdev_ops_t *ops;
363 	char *type;
364 	uint64_t guid = 0;
365 	vdev_t *vd;
366 
367 	ASSERT(spa_config_held(spa, RW_WRITER));
368 
369 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
370 		return (NULL);
371 
372 	if ((ops = vdev_getops(type)) == NULL)
373 		return (NULL);
374 
375 	/*
376 	 * If this is a load, get the vdev guid from the nvlist.
377 	 * Otherwise, vdev_alloc_common() will generate one for us.
378 	 */
379 	if (alloctype == VDEV_ALLOC_LOAD) {
380 		uint64_t label_id;
381 
382 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
383 		    label_id != id)
384 			return (NULL);
385 
386 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
387 			return (NULL);
388 	}
389 
390 	vd = vdev_alloc_common(spa, id, guid, ops);
391 
392 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
393 		vd->vdev_path = spa_strdup(vd->vdev_path);
394 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
395 		vd->vdev_devid = spa_strdup(vd->vdev_devid);
396 
397 	/*
398 	 * Set the whole_disk property.  If it's not specified, leave the value
399 	 * as -1.
400 	 */
401 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
402 	    &vd->vdev_wholedisk) != 0)
403 		vd->vdev_wholedisk = -1ULL;
404 
405 	/*
406 	 * If we're a top-level vdev, try to load the allocation parameters.
407 	 */
408 	if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) {
409 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
410 		    &vd->vdev_ms_array);
411 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
412 		    &vd->vdev_ms_shift);
413 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT,
414 		    &vd->vdev_ashift);
415 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
416 		    &vd->vdev_asize);
417 	}
418 
419 	/*
420 	 * If we're a leaf vdev, try to load the DTL object.
421 	 */
422 	if (vd->vdev_ops->vdev_op_leaf && alloctype == VDEV_ALLOC_LOAD) {
423 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
424 		    &vd->vdev_dtl.smo_object);
425 	}
426 
427 	/*
428 	 * Add ourselves to the parent's list of children.
429 	 */
430 	vdev_add_child(parent, vd);
431 
432 	return (vd);
433 }
434 
435 void
436 vdev_free(vdev_t *vd)
437 {
438 	int c;
439 
440 	/*
441 	 * vdev_free() implies closing the vdev first.  This is simpler than
442 	 * trying to ensure complicated semantics for all callers.
443 	 */
444 	vdev_close(vd);
445 
446 	/*
447 	 * It's possible to free a vdev that's been added to the dirty
448 	 * list while in the middle of spa_vdev_add().  Handle that case
449 	 * correctly here.
450 	 */
451 	if (vd->vdev_is_dirty)
452 		vdev_config_clean(vd);
453 
454 	/*
455 	 * Free all children.
456 	 */
457 	for (c = 0; c < vd->vdev_children; c++)
458 		vdev_free(vd->vdev_child[c]);
459 
460 	ASSERT(vd->vdev_child == NULL);
461 	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
462 
463 	/*
464 	 * Discard allocation state.
465 	 */
466 	if (vd == vd->vdev_top)
467 		vdev_metaslab_fini(vd);
468 
469 	ASSERT3U(vd->vdev_stat.vs_space, ==, 0);
470 	ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);
471 
472 	/*
473 	 * Remove this vdev from its parent's child list.
474 	 */
475 	vdev_remove_child(vd->vdev_parent, vd);
476 
477 	ASSERT(vd->vdev_parent == NULL);
478 
479 	vdev_free_common(vd);
480 }
481 
482 /*
483  * Transfer top-level vdev state from svd to tvd.
484  */
485 static void
486 vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
487 {
488 	spa_t *spa = svd->vdev_spa;
489 	metaslab_t *msp;
490 	vdev_t *vd;
491 	int t;
492 
493 	ASSERT(tvd == tvd->vdev_top);
494 
495 	tvd->vdev_ms_array = svd->vdev_ms_array;
496 	tvd->vdev_ms_shift = svd->vdev_ms_shift;
497 	tvd->vdev_ms_count = svd->vdev_ms_count;
498 
499 	svd->vdev_ms_array = 0;
500 	svd->vdev_ms_shift = 0;
501 	svd->vdev_ms_count = 0;
502 
503 	tvd->vdev_mg = svd->vdev_mg;
504 	tvd->vdev_mg->mg_vd = tvd;
505 	tvd->vdev_ms = svd->vdev_ms;
506 	tvd->vdev_smo = svd->vdev_smo;
507 
508 	svd->vdev_mg = NULL;
509 	svd->vdev_ms = NULL;
510 	svd->vdev_smo = NULL;
511 
512 	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
513 	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
514 
515 	svd->vdev_stat.vs_alloc = 0;
516 	svd->vdev_stat.vs_space = 0;
517 
518 	for (t = 0; t < TXG_SIZE; t++) {
519 		while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
520 			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
521 		while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
522 			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
523 		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
524 			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
525 		tvd->vdev_dirty[t] = svd->vdev_dirty[t];
526 		svd->vdev_dirty[t] = 0;
527 	}
528 
529 	if (svd->vdev_is_dirty) {
530 		vdev_config_clean(svd);
531 		vdev_config_dirty(tvd);
532 	}
533 
534 	ASSERT(svd->vdev_io_retry == NULL);
535 	ASSERT(list_is_empty(&svd->vdev_io_pending));
536 }
537 
538 static void
539 vdev_top_update(vdev_t *tvd, vdev_t *vd)
540 {
541 	int c;
542 
543 	if (vd == NULL)
544 		return;
545 
546 	vd->vdev_top = tvd;
547 
548 	for (c = 0; c < vd->vdev_children; c++)
549 		vdev_top_update(tvd, vd->vdev_child[c]);
550 }
551 
552 /*
553  * Add a mirror/replacing vdev above an existing vdev.
554  */
555 vdev_t *
556 vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
557 {
558 	spa_t *spa = cvd->vdev_spa;
559 	vdev_t *pvd = cvd->vdev_parent;
560 	vdev_t *mvd;
561 
562 	ASSERT(spa_config_held(spa, RW_WRITER));
563 
564 	mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
565 	vdev_remove_child(pvd, cvd);
566 	vdev_add_child(pvd, mvd);
567 	cvd->vdev_id = mvd->vdev_children;
568 	vdev_add_child(mvd, cvd);
569 	vdev_top_update(cvd->vdev_top, cvd->vdev_top);
570 
571 	mvd->vdev_asize = cvd->vdev_asize;
572 	mvd->vdev_ashift = cvd->vdev_ashift;
573 	mvd->vdev_state = cvd->vdev_state;
574 
575 	if (mvd == mvd->vdev_top)
576 		vdev_top_transfer(cvd, mvd);
577 
578 	return (mvd);
579 }
580 
581 /*
582  * Remove a 1-way mirror/replacing vdev from the tree.
583  */
584 void
585 vdev_remove_parent(vdev_t *cvd)
586 {
587 	vdev_t *mvd = cvd->vdev_parent;
588 	vdev_t *pvd = mvd->vdev_parent;
589 
590 	ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));
591 
592 	ASSERT(mvd->vdev_children == 1);
593 	ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
594 	    mvd->vdev_ops == &vdev_replacing_ops);
595 
596 	vdev_remove_child(mvd, cvd);
597 	vdev_remove_child(pvd, mvd);
598 	cvd->vdev_id = mvd->vdev_id;
599 	vdev_add_child(pvd, cvd);
600 	vdev_top_update(cvd->vdev_top, cvd->vdev_top);
601 
602 	if (cvd == cvd->vdev_top)
603 		vdev_top_transfer(mvd, cvd);
604 
605 	ASSERT(mvd->vdev_children == 0);
606 	vdev_free(mvd);
607 }
608 
609 void
610 vdev_metaslab_init(vdev_t *vd, uint64_t txg)
611 {
612 	spa_t *spa = vd->vdev_spa;
613 	metaslab_class_t *mc = spa_metaslab_class_select(spa);
614 	uint64_t c;
615 	uint64_t oldc = vd->vdev_ms_count;
616 	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
617 	space_map_obj_t *smo = vd->vdev_smo;
618 	metaslab_t **mspp = vd->vdev_ms;
619 
620 	dprintf("%s oldc %llu newc %llu\n", vdev_description(vd), oldc, newc);
621 
622 	ASSERT(oldc <= newc);
623 
624 	vd->vdev_smo = kmem_zalloc(newc * sizeof (*smo), KM_SLEEP);
625 	vd->vdev_ms = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
626 	vd->vdev_ms_count = newc;
627 
628 	if (vd->vdev_mg == NULL) {
629 		if (txg == 0) {
630 			dmu_buf_t *db;
631 			uint64_t *ms_array;
632 
633 			ms_array = kmem_zalloc(newc * sizeof (uint64_t),
634 			    KM_SLEEP);
635 
636 			dmu_read(spa->spa_meta_objset, vd->vdev_ms_array,
637 			    0, newc * sizeof (uint64_t), ms_array);
638 
639 			for (c = 0; c < newc; c++) {
640 				if (ms_array[c] == 0)
641 					continue;
642 				db = dmu_bonus_hold(spa->spa_meta_objset,
643 				    ms_array[c]);
644 				dmu_buf_read(db);
645 				ASSERT3U(db->db_size, ==, sizeof (*smo));
646 				bcopy(db->db_data, &vd->vdev_smo[c],
647 				    db->db_size);
648 				ASSERT3U(vd->vdev_smo[c].smo_object, ==,
649 				    ms_array[c]);
650 				dmu_buf_rele(db);
651 			}
652 			kmem_free(ms_array, newc * sizeof (uint64_t));
653 		}
654 		vd->vdev_mg = metaslab_group_create(mc, vd);
655 	}
656 
657 	for (c = 0; c < oldc; c++) {
658 		vd->vdev_smo[c] = smo[c];
659 		vd->vdev_ms[c] = mspp[c];
660 		mspp[c]->ms_smo = &vd->vdev_smo[c];
661 	}
662 
663 	for (c = oldc; c < newc; c++)
664 		metaslab_init(vd->vdev_mg, &vd->vdev_smo[c], &vd->vdev_ms[c],
665 		    c << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
666 
667 	if (oldc != 0) {
668 		kmem_free(smo, oldc * sizeof (*smo));
669 		kmem_free(mspp, oldc * sizeof (*mspp));
670 	}
671 
672 }
673 
674 void
675 vdev_metaslab_fini(vdev_t *vd)
676 {
677 	uint64_t m;
678 	uint64_t count = vd->vdev_ms_count;
679 
680 	if (vd->vdev_ms != NULL) {
681 		for (m = 0; m < count; m++)
682 			metaslab_fini(vd->vdev_ms[m]);
683 		kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
684 		vd->vdev_ms = NULL;
685 	}
686 
687 	if (vd->vdev_smo != NULL) {
688 		kmem_free(vd->vdev_smo, count * sizeof (space_map_obj_t));
689 		vd->vdev_smo = NULL;
690 	}
691 }
692 
693 /*
694  * Prepare a virtual device for access.
695  */
696 int
697 vdev_open(vdev_t *vd)
698 {
699 	int error;
700 	vdev_knob_t *vk;
701 	int c;
702 	uint64_t osize = 0;
703 	uint64_t asize, psize;
704 	uint64_t ashift = -1ULL;
705 
706 	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
707 	    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
708 	    vd->vdev_state == VDEV_STATE_OFFLINE);
709 
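	/*
	 * Each (re)open halves any remaining VDEV_FAULT_COUNT error budget
	 * and cancels any other injected fault mode.
	 */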
710 	if (vd->vdev_fault_mode == VDEV_FAULT_COUNT)
711 		vd->vdev_fault_arg >>= 1;
712 	else
713 		vd->vdev_fault_mode = VDEV_FAULT_NONE;
714 
715 	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
716 
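	/*
	 * Reset every tunable knob to its default value, clamped to the
	 * knob's [min, max] range.
	 */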
717 	for (vk = vdev_knob_next(NULL); vk != NULL; vk = vdev_knob_next(vk)) {
718 		uint64_t *valp = (uint64_t *)((char *)vd + vk->vk_offset);
719 
720 		*valp = vk->vk_default;
721 		*valp = MAX(*valp, vk->vk_min);
722 		*valp = MIN(*valp, vk->vk_max);
723 	}
724 
725 	if (vd->vdev_ops->vdev_op_leaf) {
726 		vdev_cache_init(vd);
727 		vdev_queue_init(vd);
728 		vd->vdev_cache_active = B_TRUE;
729 	}
730 
731 	if (vd->vdev_offline) {
732 		ASSERT(vd->vdev_children == 0);
733 		dprintf("OFFLINE: %s = ENXIO\n", vdev_description(vd));
734 		vd->vdev_state = VDEV_STATE_OFFLINE;
735 		return (ENXIO);
736 	}
737 
738 	error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);
739 
740 	dprintf("%s = %d, osize %llu, state = %d\n",
741 	    vdev_description(vd), error, osize, vd->vdev_state);
742 
743 	if (error) {
744 		dprintf("%s in %s failed to open, error %d, aux %d\n",
745 		    vdev_description(vd),
746 		    vdev_description(vd->vdev_parent),
747 		    error,
748 		    vd->vdev_stat.vs_aux);
749 
750 		vd->vdev_state = VDEV_STATE_CANT_OPEN;
751 		return (error);
752 	}
753 
754 	vd->vdev_state = VDEV_STATE_HEALTHY;
755 
756 	for (c = 0; c < vd->vdev_children; c++)
757 		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY)
758 			vd->vdev_state = VDEV_STATE_DEGRADED;
759 
760 	osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
761 
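	/*
	 * Leaf vdevs must set aside room for the vdev labels at the start
	 * and end of the device; interior vdevs report a size that their
	 * children have already adjusted for labels.
	 */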
762 	if (vd->vdev_children == 0) {
763 		if (osize < SPA_MINDEVSIZE) {
764 			vd->vdev_state = VDEV_STATE_CANT_OPEN;
765 			vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL;
766 			return (EOVERFLOW);
767 		}
768 		psize = osize;
769 		asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
770 	} else {
771 		if (osize < SPA_MINDEVSIZE -
772 		    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
773 			vd->vdev_state = VDEV_STATE_CANT_OPEN;
774 			vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL;
775 			return (EOVERFLOW);
776 		}
777 		psize = 0;
778 		asize = osize;
779 	}
780 
781 	vd->vdev_psize = psize;
782 
783 	if (vd->vdev_asize == 0) {
784 		/*
785 		 * This is the first-ever open, so use the computed values.
786 		 */
787 		vd->vdev_asize = asize;
788 		vd->vdev_ashift = ashift;
789 	} else {
790 		/*
791 		 * Make sure the alignment requirement hasn't increased.
792 		 */
793 		if (ashift > vd->vdev_ashift) {
794 			dprintf("%s: ashift grew\n", vdev_description(vd));
795 			vd->vdev_state = VDEV_STATE_CANT_OPEN;
796 			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
797 			return (EINVAL);
798 		}
799 
800 		/*
801 		 * Make sure the device hasn't shrunk.
802 		 */
803 		if (asize < vd->vdev_asize) {
804 			dprintf("%s: device shrank\n", vdev_description(vd));
805 			vd->vdev_state = VDEV_STATE_CANT_OPEN;
806 			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
807 			return (EINVAL);
808 		}
809 
810 		/*
811 		 * If all children are healthy and the asize has increased,
812 		 * then we've experienced dynamic LUN growth.
813 		 */
814 		if (vd->vdev_state == VDEV_STATE_HEALTHY &&
815 		    asize > vd->vdev_asize) {
816 			dprintf("%s: device grew\n", vdev_description(vd));
817 			vd->vdev_asize = asize;
818 		}
819 	}
820 
821 	return (0);
822 }
823 
824 /*
825  * Close a virtual device.
826  */
827 void
828 vdev_close(vdev_t *vd)
829 {
830 	ASSERT3P(list_head(&vd->vdev_io_pending), ==, NULL);
831 
832 	vd->vdev_ops->vdev_op_close(vd);
833 
834 	if (vd->vdev_cache_active) {
835 		vdev_cache_fini(vd);
836 		vdev_queue_fini(vd);
837 		vd->vdev_cache_active = B_FALSE;
838 	}
839 
840 	if (vd->vdev_offline)
841 		vd->vdev_state = VDEV_STATE_OFFLINE;
842 	else
843 		vd->vdev_state = VDEV_STATE_CLOSED;
844 }
845 
846 void
847 vdev_reopen(vdev_t *vd, zio_t **rq)
848 {
849 	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
850 	int c;
851 
852 	if (vd == rvd) {
853 		ASSERT(rq == NULL);
854 		for (c = 0; c < rvd->vdev_children; c++)
855 			vdev_reopen(rvd->vdev_child[c], NULL);
856 		return;
857 	}
858 
859 	/* only valid for top-level vdevs */
860 	ASSERT3P(vd, ==, vd->vdev_top);
861 
862 	/*
863 	 * vdev_state can change when spa_config_lock is held as writer,
864 	 * or when it's held as reader and we're doing a vdev_reopen().
865 	 * To handle the latter case, we grab rvd's io_lock to serialize
866 	 * reopens.  This ensures that there's never more than one vdev
867 	 * state changer active at a time.
868 	 */
869 	mutex_enter(&rvd->vdev_io_lock);
870 
871 	mutex_enter(&vd->vdev_io_lock);
872 	while (list_head(&vd->vdev_io_pending) != NULL)
873 		cv_wait(&vd->vdev_io_cv, &vd->vdev_io_lock);
874 	vdev_close(vd);
875 	(void) vdev_open(vd);
876 	if (rq != NULL) {
877 		*rq = vd->vdev_io_retry;
878 		vd->vdev_io_retry = NULL;
879 	}
880 	mutex_exit(&vd->vdev_io_lock);
881 
882 	/*
883 	 * Reassess root vdev's health.
884 	 */
885 	rvd->vdev_state = VDEV_STATE_HEALTHY;
886 	for (c = 0; c < rvd->vdev_children; c++) {
887 		uint64_t state = rvd->vdev_child[c]->vdev_state;
888 		rvd->vdev_state = MIN(rvd->vdev_state, state);
889 	}
890 
891 	mutex_exit(&rvd->vdev_io_lock);
892 }
893 
894 int
895 vdev_create(vdev_t *vd, uint64_t txg)
896 {
897 	int error;
898 
899 	/*
900 	 * Normally, partial opens (e.g. of a mirror) are allowed.
901 	 * For a create, however, we want to fail the request if
902 	 * there are any components we can't open.
903 	 */
904 	error = vdev_open(vd);
905 
906 	if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
907 		vdev_close(vd);
908 		return (error ? error : ENXIO);
909 	}
910 
911 	/*
912 	 * Recursively initialize all labels.
913 	 */
914 	if ((error = vdev_label_init(vd, txg)) != 0) {
915 		vdev_close(vd);
916 		return (error);
917 	}
918 
919 	return (0);
920 }
921 
922 /*
923  * This is the latter half of vdev_create().  It is distinct because it
924  * involves initiating transactions in order to do metaslab creation.
925  * For creation, we want to try to create all vdevs at once and then undo it
926  * if anything fails; this is much harder if we have pending transactions.
927  */
928 void
929 vdev_init(vdev_t *vd, uint64_t txg)
930 {
931 	/*
932 	 * Aim for roughly 200 metaslabs per vdev.
933 	 */
934 	vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
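	/* ...but make each metaslab at least one maximum-sized block. */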
935 	vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
936 
937 	/*
938 	 * Initialize the vdev's metaslabs.
939 	 */
940 	vdev_metaslab_init(vd, txg);
941 }
942 
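/*
 * Mark the top-level vdev above 'vd' dirty with the given flags in the given
 * txg and queue it on the pool's per-txg vdev list for syncing.
 */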
943 void
944 vdev_dirty(vdev_t *vd, uint8_t flags, uint64_t txg)
945 {
946 	vdev_t *tvd = vd->vdev_top;
947 
948 	mutex_enter(&tvd->vdev_dirty_lock);
949 	if ((tvd->vdev_dirty[txg & TXG_MASK] & flags) != flags) {
950 		tvd->vdev_dirty[txg & TXG_MASK] |= flags;
951 		(void) txg_list_add(&tvd->vdev_spa->spa_vdev_txg_list,
952 		    tvd, txg);
953 	}
954 	mutex_exit(&tvd->vdev_dirty_lock);
955 }
956 
957 void
958 vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size)
959 {
960 	mutex_enter(sm->sm_lock);
961 	if (!space_map_contains(sm, txg, size))
962 		space_map_add(sm, txg, size);
963 	mutex_exit(sm->sm_lock);
964 }
965 
966 int
967 vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size)
968 {
969 	int dirty;
970 
971 	/*
972 	 * Quick test without the lock -- covers the common case that
973 	 * there are no dirty time segments.
974 	 */
975 	if (sm->sm_space == 0)
976 		return (0);
977 
978 	mutex_enter(sm->sm_lock);
979 	dirty = space_map_contains(sm, txg, size);
980 	mutex_exit(sm->sm_lock);
981 
982 	return (dirty);
983 }
984 
985 /*
986  * Reassess DTLs after a config change or scrub completion.
987  */
988 void
989 vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
990 {
991 	int c;
992 
993 	ASSERT(spa_config_held(vd->vdev_spa, RW_WRITER));
994 
995 	if (vd->vdev_children == 0) {
996 		mutex_enter(&vd->vdev_dtl_lock);
997 		/*
998 		 * We've successfully scrubbed everything up to scrub_txg.
999 		 * Therefore, excise all old DTLs up to that point, then
1000 		 * fold in the DTLs for everything we couldn't scrub.
1001 		 */
1002 		if (scrub_txg != 0) {
1003 			space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg);
1004 			space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub);
1005 		}
1006 		if (scrub_done)
1007 			space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
1008 		mutex_exit(&vd->vdev_dtl_lock);
1009 		if (txg != 0) {
1010 			vdev_t *tvd = vd->vdev_top;
1011 			vdev_dirty(tvd, VDD_DTL, txg);
1012 			(void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);
1013 		}
1014 		return;
1015 	}
1016 
1017 	mutex_enter(&vd->vdev_dtl_lock);
1018 	space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
1019 	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
1020 	mutex_exit(&vd->vdev_dtl_lock);
1021 
1022 	for (c = 0; c < vd->vdev_children; c++) {
1023 		vdev_t *cvd = vd->vdev_child[c];
1024 		vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done);
1025 		mutex_enter(&vd->vdev_dtl_lock);
1026 		space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map);
1027 		space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub);
1028 		mutex_exit(&vd->vdev_dtl_lock);
1029 	}
1030 }
1031 
1032 static int
1033 vdev_dtl_load(vdev_t *vd)
1034 {
1035 	spa_t *spa = vd->vdev_spa;
1036 	space_map_obj_t *smo = &vd->vdev_dtl;
1037 	dmu_buf_t *db;
1038 	int error;
1039 
1040 	ASSERT(vd->vdev_children == 0);
1041 
1042 	if (smo->smo_object == 0)
1043 		return (0);
1044 
1045 	db = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object);
1046 	dmu_buf_read(db);
1047 	ASSERT3U(db->db_size, ==, sizeof (*smo));
1048 	bcopy(db->db_data, smo, db->db_size);
1049 	dmu_buf_rele(db);
1050 
1051 	mutex_enter(&vd->vdev_dtl_lock);
1052 	error = space_map_load(&vd->vdev_dtl_map, smo, SM_ALLOC,
1053 	    spa->spa_meta_objset, smo->smo_objsize, smo->smo_alloc);
1054 	mutex_exit(&vd->vdev_dtl_lock);
1055 
1056 	return (error);
1057 }
1058 
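/*
 * Write out the vdev's DTL (dirty time log) for this txg: rebuild the
 * on-disk space map object from the current in-core DTL contents.
 */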
1059 void
1060 vdev_dtl_sync(vdev_t *vd, uint64_t txg)
1061 {
1062 	spa_t *spa = vd->vdev_spa;
1063 	space_map_obj_t *smo = &vd->vdev_dtl;
1064 	space_map_t *sm = &vd->vdev_dtl_map;
1065 	space_map_t smsync;
1066 	kmutex_t smlock;
1067 	avl_tree_t *t = &sm->sm_root;
1068 	space_seg_t *ss;
1069 	dmu_buf_t *db;
1070 	dmu_tx_t *tx;
1071 
1072 	dprintf("%s in txg %llu pass %d\n",
1073 	    vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));
1074 
1075 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1076 
1077 	if (vd->vdev_detached) {
1078 		if (smo->smo_object != 0) {
1079 			int err = dmu_object_free(spa->spa_meta_objset,
1080 			    smo->smo_object, tx);
1081 			ASSERT3U(err, ==, 0);
1082 			smo->smo_object = 0;
1083 		}
1084 		dmu_tx_commit(tx);
1085 		return;
1086 	}
1087 
1088 	if (smo->smo_object == 0) {
1089 		ASSERT(smo->smo_objsize == 0);
1090 		ASSERT(smo->smo_alloc == 0);
1091 		smo->smo_object = dmu_object_alloc(spa->spa_meta_objset,
1092 		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
1093 		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
1094 		ASSERT(smo->smo_object != 0);
1095 		vdev_config_dirty(vd->vdev_top);
1096 	}
1097 
1098 	dmu_free_range(spa->spa_meta_objset, smo->smo_object,
1099 	    0, smo->smo_objsize, tx);
1100 
1101 	mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);
1102 
1103 	space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
1104 	    &smlock);
1105 
1106 	mutex_enter(&smlock);
1107 
1108 	mutex_enter(&vd->vdev_dtl_lock);
1109 	for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss))
1110 		space_map_add(&smsync, ss->ss_start, ss->ss_end - ss->ss_start);
1111 	mutex_exit(&vd->vdev_dtl_lock);
1112 
1113 	smo->smo_objsize = 0;
1114 	smo->smo_alloc = smsync.sm_space;
1115 
1116 	space_map_sync(&smsync, NULL, smo, SM_ALLOC, spa->spa_meta_objset, tx);
1117 	space_map_destroy(&smsync);
1118 
1119 	mutex_exit(&smlock);
1120 	mutex_destroy(&smlock);
1121 
1122 	db = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object);
1123 	dmu_buf_will_dirty(db, tx);
1124 	ASSERT3U(db->db_size, ==, sizeof (*smo));
1125 	bcopy(smo, db->db_data, db->db_size);
1126 	dmu_buf_rele(db);
1127 
1128 	dmu_tx_commit(tx);
1129 }
1130 
1131 int
1132 vdev_load(vdev_t *vd, int import)
1133 {
1134 	spa_t *spa = vd->vdev_spa;
1135 	int c, error;
1136 	nvlist_t *label;
1137 	uint64_t guid, state;
1138 
1139 	dprintf("loading %s\n", vdev_description(vd));
1140 
1141 	/*
1142 	 * Recursively load all children.
1143 	 */
1144 	for (c = 0; c < vd->vdev_children; c++)
1145 		if ((error = vdev_load(vd->vdev_child[c], import)) != 0)
1146 			return (error);
1147 
1148 	/*
1149 	 * If this is a leaf vdev, make sure it agrees with its disk labels.
1150 	 */
1151 	if (vd->vdev_ops->vdev_op_leaf) {
1152 
1153 		if (vdev_is_dead(vd))
1154 			return (0);
1155 
1156 		/*
1157 		 * XXX state transitions don't propagate to parent here.
1158 		 * Also, merely setting the state isn't sufficient because
1159 		 * it's not persistent; a vdev_reopen() would make us
1160 		 * forget all about it.
1161 		 */
1162 		if ((label = vdev_label_read_config(vd)) == NULL) {
1163 			dprintf("can't load label config\n");
1164 			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
1165 			    VDEV_AUX_CORRUPT_DATA);
1166 			return (0);
1167 		}
1168 
1169 		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
1170 		    &guid) != 0 || guid != spa_guid(spa)) {
1171 			dprintf("bad or missing pool GUID (%llu)\n", guid);
1172 			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
1173 			    VDEV_AUX_CORRUPT_DATA);
1174 			nvlist_free(label);
1175 			return (0);
1176 		}
1177 
1178 		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) ||
1179 		    guid != vd->vdev_guid) {
1180 			dprintf("bad or missing vdev guid (%llu != %llu)\n",
1181 			    guid, vd->vdev_guid);
1182 			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
1183 			    VDEV_AUX_CORRUPT_DATA);
1184 			nvlist_free(label);
1185 			return (0);
1186 		}
1187 
1188 		/*
1189 		 * If we find a vdev with a matching pool guid and vdev guid,
1190 		 * but the pool state is not active, it indicates that the user
1191 		 * exported or destroyed the pool without affecting the config
1192 		 * cache (if / was mounted readonly, for example).  In this
1193 		 * case, immediately return EBADF so the caller can remove it
1194 		 * from the config.
1195 		 */
1196 		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
1197 		    &state)) {
1198 			dprintf("missing pool state\n");
1199 			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
1200 			    VDEV_AUX_CORRUPT_DATA);
1201 			nvlist_free(label);
1202 			return (0);
1203 		}
1204 
1205 		if (state != POOL_STATE_ACTIVE &&
1206 		    (!import || state != POOL_STATE_EXPORTED)) {
1207 			dprintf("pool state not active (%llu)\n", state);
1208 			nvlist_free(label);
1209 			return (EBADF);
1210 		}
1211 
1212 		nvlist_free(label);
1213 	}
1214 
1215 	/*
1216 	 * If this is a top-level vdev, make sure its allocation parameters
1217 	 * exist and initialize its metaslabs.
1218 	 */
1219 	if (vd == vd->vdev_top) {
1220 
1221 		if (vd->vdev_ms_array == 0 ||
1222 		    vd->vdev_ms_shift == 0 ||
1223 		    vd->vdev_ashift == 0 ||
1224 		    vd->vdev_asize == 0) {
1225 			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
1226 			    VDEV_AUX_CORRUPT_DATA);
1227 			return (0);
1228 		}
1229 
1230 		vdev_metaslab_init(vd, 0);
1231 	}
1232 
1233 	/*
1234 	 * If this is a leaf vdev, load its DTL.
1235 	 */
1236 	if (vd->vdev_ops->vdev_op_leaf) {
1237 		error = vdev_dtl_load(vd);
1238 		if (error) {
1239 			dprintf("can't load DTL for %s, error %d\n",
1240 			    vdev_description(vd), error);
1241 			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
1242 			    VDEV_AUX_CORRUPT_DATA);
1243 			return (0);
1244 		}
1245 	}
1246 
1247 	return (0);
1248 }
1249 
1250 void
1251 vdev_sync_done(vdev_t *vd, uint64_t txg)
1252 {
1253 	metaslab_t *msp;
1254 
1255 	dprintf("%s txg %llu\n", vdev_description(vd), txg);
1256 
1257 	while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
1258 		metaslab_sync_done(msp, txg);
1259 }
1260 
1261 void
1262 vdev_add_sync(vdev_t *vd, uint64_t txg)
1263 {
1264 	spa_t *spa = vd->vdev_spa;
1265 	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1266 
1267 	ASSERT(vd == vd->vdev_top);
1268 
1269 	if (vd->vdev_ms_array == 0)
1270 		vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
1271 		    DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
1272 
1273 	ASSERT(vd->vdev_ms_array != 0);
1274 
1275 	vdev_config_dirty(vd);
1276 
1277 	dmu_tx_commit(tx);
1278 }
1279 
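/*
 * Sync out the vdev's dirty state for this txg: allocate the metaslab array
 * object if needed (VDD_ADD), sync all dirty metaslabs and DTLs, and requeue
 * the vdev so vdev_sync_done() runs when the txg is clean.
 */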
1280 void
1281 vdev_sync(vdev_t *vd, uint64_t txg)
1282 {
1283 	spa_t *spa = vd->vdev_spa;
1284 	vdev_t *lvd;
1285 	metaslab_t *msp;
1286 	uint8_t *dirtyp = &vd->vdev_dirty[txg & TXG_MASK];
1287 	uint8_t dirty = *dirtyp;
1288 
1289 	mutex_enter(&vd->vdev_dirty_lock);
1290 	*dirtyp &= ~(VDD_ALLOC | VDD_FREE | VDD_ADD | VDD_DTL);
1291 	mutex_exit(&vd->vdev_dirty_lock);
1292 
1293 	dprintf("%s txg %llu pass %d\n",
1294 	    vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));
1295 
1296 	if (dirty & VDD_ADD)
1297 		vdev_add_sync(vd, txg);
1298 
1299 	while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL)
1300 		metaslab_sync(msp, txg);
1301 
1302 	while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
1303 		vdev_dtl_sync(lvd, txg);
1304 
1305 	(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
1306 }
1307 
1308 uint64_t
1309 vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
1310 {
1311 	return (vd->vdev_ops->vdev_op_asize(vd, psize));
1312 }
1313 
1314 void
1315 vdev_io_start(zio_t *zio)
1316 {
1317 	zio->io_vd->vdev_ops->vdev_op_io_start(zio);
1318 }
1319 
1320 void
1321 vdev_io_done(zio_t *zio)
1322 {
1323 	zio->io_vd->vdev_ops->vdev_op_io_done(zio);
1324 }
1325 
1326 const char *
1327 vdev_description(vdev_t *vd)
1328 {
1329 	if (vd == NULL || vd->vdev_ops == NULL)
1330 		return ("<unknown>");
1331 
1332 	if (vd->vdev_path != NULL)
1333 		return (vd->vdev_path);
1334 
1335 	if (vd->vdev_parent == NULL)
1336 		return (spa_name(vd->vdev_spa));
1337 
1338 	return (vd->vdev_ops->vdev_op_type);
1339 }
1340 
1341 int
1342 vdev_online(spa_t *spa, const char *path)
1343 {
1344 	vdev_t *vd;
1345 
1346 	spa_config_enter(spa, RW_WRITER);
1347 
1348 	if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
1349 		spa_config_exit(spa);
1350 		return (ENODEV);
1351 	}
1352 
1353 	dprintf("ONLINE: %s\n", vdev_description(vd));
1354 
1355 	vd->vdev_offline = B_FALSE;
1356 
1357 	/*
1358 	 * Clear the error counts.  The idea is that you expect to see all
1359 	 * zeroes when everything is working, so if you've just onlined a
1360 	 * device, you don't want to keep hearing about errors from before.
1361 	 */
1362 	vd->vdev_stat.vs_read_errors = 0;
1363 	vd->vdev_stat.vs_write_errors = 0;
1364 	vd->vdev_stat.vs_checksum_errors = 0;
1365 
1366 	vdev_reopen(vd->vdev_top, NULL);
1367 
1368 	spa_config_exit(spa);
1369 
1370 	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
1371 
1372 	return (0);
1373 }
1374 
1375 int
1376 vdev_offline(spa_t *spa, const char *path)
1377 {
1378 	vdev_t *vd;
1379 
1380 	spa_config_enter(spa, RW_WRITER);
1381 
1382 	if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
1383 		spa_config_exit(spa);
1384 		return (ENODEV);
1385 	}
1386 
1387 	dprintf("OFFLINE: %s\n", vdev_description(vd));
1388 
1389 	/*
1390 	 * If this device's top-level vdev has a non-empty DTL,
1391 	 * don't allow the device to be offlined.
1392 	 *
1393 	 * XXX -- we should make this more precise by allowing the offline
1394 	 * as long as the remaining devices don't have any DTL holes.
1395 	 */
1396 	if (vd->vdev_top->vdev_dtl_map.sm_space != 0) {
1397 		spa_config_exit(spa);
1398 		return (EBUSY);
1399 	}
1400 
1401 	/*
1402 	 * Set this device to offline state and reopen its top-level vdev.
1403 	 * If this action results in the top-level vdev becoming unusable,
1404 	 * undo it and fail the request.
1405 	 */
1406 	vd->vdev_offline = B_TRUE;
1407 	vdev_reopen(vd->vdev_top, NULL);
1408 	if (vdev_is_dead(vd->vdev_top)) {
1409 		vd->vdev_offline = B_FALSE;
1410 		vdev_reopen(vd->vdev_top, NULL);
1411 		spa_config_exit(spa);
1412 		return (EBUSY);
1413 	}
1414 
1415 	spa_config_exit(spa);
1416 
1417 	return (0);
1418 }
1419 
1420 int
1421 vdev_error_setup(spa_t *spa, const char *path, int mode, int mask, uint64_t arg)
1422 {
1423 	vdev_t *vd;
1424 
1425 	spa_config_enter(spa, RW_WRITER);
1426 
1427 	if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
1428 		spa_config_exit(spa);
1429 		return (ENODEV);
1430 	}
1431 
1432 	vd->vdev_fault_mode = mode;
1433 	vd->vdev_fault_mask = mask;
1434 	vd->vdev_fault_arg = arg;
1435 
1436 	spa_config_exit(spa);
1437 
1438 	return (0);
1439 }
1440 
1441 int
1442 vdev_is_dead(vdev_t *vd)
1443 {
1444 	return (vd->vdev_state <= VDEV_STATE_CANT_OPEN);
1445 }
1446 
1447 int
1448 vdev_error_inject(vdev_t *vd, zio_t *zio)
1449 {
1450 	int error = 0;
1451 
1452 	if (vd->vdev_fault_mode == VDEV_FAULT_NONE)
1453 		return (0);
1454 
1455 	if (((1ULL << zio->io_type) & vd->vdev_fault_mask) == 0)
1456 		return (0);
1457 
1458 	switch (vd->vdev_fault_mode) {
1459 	case VDEV_FAULT_RANDOM:
1460 		if (spa_get_random(vd->vdev_fault_arg) == 0)
1461 			error = EIO;
1462 		break;
1463 
1464 	case VDEV_FAULT_COUNT:
1465 		if ((int64_t)--vd->vdev_fault_arg <= 0)
1466 			vd->vdev_fault_mode = VDEV_FAULT_NONE;
1467 		error = EIO;
1468 		break;
1469 	}
1470 
1471 	if (error != 0) {
1472 		dprintf("returning %d for type %d on %s state %d offset %llx\n",
1473 		    error, zio->io_type, vdev_description(vd),
1474 		    vd->vdev_state, zio->io_offset);
1475 	}
1476 
1477 	return (error);
1478 }
1479 
1480 /*
1481  * Get statistics for the given vdev.
1482  */
1483 void
1484 vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
1485 {
1486 	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
1487 	int c, t;
1488 
1489 	mutex_enter(&vd->vdev_stat_lock);
1490 	bcopy(&vd->vdev_stat, vs, sizeof (*vs));
1491 	vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
1492 	vs->vs_state = vd->vdev_state;
1493 	vs->vs_rsize = vdev_get_rsize(vd);
1494 	mutex_exit(&vd->vdev_stat_lock);
1495 
1496 	/*
1497 	 * If we're getting stats on the root vdev, aggregate the I/O counts
1498 	 * over all top-level vdevs (i.e. the direct children of the root).
1499 	 */
1500 	if (vd == rvd) {
1501 		for (c = 0; c < rvd->vdev_children; c++) {
1502 			vdev_t *cvd = rvd->vdev_child[c];
1503 			vdev_stat_t *cvs = &cvd->vdev_stat;
1504 
1505 			mutex_enter(&vd->vdev_stat_lock);
1506 			for (t = 0; t < ZIO_TYPES; t++) {
1507 				vs->vs_ops[t] += cvs->vs_ops[t];
1508 				vs->vs_bytes[t] += cvs->vs_bytes[t];
1509 			}
1510 			vs->vs_read_errors += cvs->vs_read_errors;
1511 			vs->vs_write_errors += cvs->vs_write_errors;
1512 			vs->vs_checksum_errors += cvs->vs_checksum_errors;
1513 			vs->vs_scrub_examined += cvs->vs_scrub_examined;
1514 			vs->vs_scrub_errors += cvs->vs_scrub_errors;
1515 			mutex_exit(&vd->vdev_stat_lock);
1516 		}
1517 	}
1518 }
1519 
1520 void
1521 vdev_stat_update(zio_t *zio)
1522 {
1523 	vdev_t *vd = zio->io_vd;
1524 	vdev_t *pvd;
1525 	uint64_t txg = zio->io_txg;
1526 	vdev_stat_t *vs = &vd->vdev_stat;
1527 	zio_type_t type = zio->io_type;
1528 	int flags = zio->io_flags;
1529 
1530 	if (zio->io_error == 0) {
1531 		if (!(flags & ZIO_FLAG_IO_BYPASS)) {
1532 			mutex_enter(&vd->vdev_stat_lock);
1533 			vs->vs_ops[type]++;
1534 			vs->vs_bytes[type] += zio->io_size;
1535 			mutex_exit(&vd->vdev_stat_lock);
1536 		}
1537 		if ((flags & ZIO_FLAG_IO_REPAIR) &&
1538 		    zio->io_delegate_list == NULL) {
1539 			mutex_enter(&vd->vdev_stat_lock);
1540 			if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))
1541 				vs->vs_scrub_repaired += zio->io_size;
1542 			else
1543 				vs->vs_self_healed += zio->io_size;
1544 			mutex_exit(&vd->vdev_stat_lock);
1545 		}
1546 		return;
1547 	}
1548 
1549 	if (flags & ZIO_FLAG_SPECULATIVE)
1550 		return;
1551 
1552 	if (!vdev_is_dead(vd)) {
1553 		mutex_enter(&vd->vdev_stat_lock);
1554 		if (type == ZIO_TYPE_READ) {
1555 			if (zio->io_error == ECKSUM)
1556 				vs->vs_checksum_errors++;
1557 			else
1558 				vs->vs_read_errors++;
1559 		}
1560 		if (type == ZIO_TYPE_WRITE)
1561 			vs->vs_write_errors++;
1562 		mutex_exit(&vd->vdev_stat_lock);
1563 	}
1564 
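	/*
	 * A failed write to a leaf vdev leaves a hole that must be repaired
	 * later, so record the txg in the appropriate DTLs up the tree.
	 */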
1565 	if (type == ZIO_TYPE_WRITE) {
1566 		if (txg == 0 || vd->vdev_children != 0)
1567 			return;
1568 		if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) {
1569 			ASSERT(flags & ZIO_FLAG_IO_REPAIR);
1570 			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
1571 				vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1);
1572 		}
1573 		if (!(flags & ZIO_FLAG_IO_REPAIR)) {
1574 			vdev_t *tvd = vd->vdev_top;
1575 			if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1))
1576 				return;
1577 			vdev_dirty(tvd, VDD_DTL, txg);
1578 			(void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);
1579 			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
1580 				vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1);
1581 		}
1582 	}
1583 }
1584 
1585 void
1586 vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
1587 {
1588 	int c;
1589 	vdev_stat_t *vs = &vd->vdev_stat;
1590 
1591 	for (c = 0; c < vd->vdev_children; c++)
1592 		vdev_scrub_stat_update(vd->vdev_child[c], type, complete);
1593 
1594 	mutex_enter(&vd->vdev_stat_lock);
1595 
1596 	if (type == POOL_SCRUB_NONE) {
1597 		/*
1598 		 * Update completion and end time.  Leave everything else alone
1599 		 * so we can report what happened during the previous scrub.
1600 		 */
1601 		vs->vs_scrub_complete = complete;
1602 		vs->vs_scrub_end = gethrestime_sec();
1603 	} else {
1604 		vs->vs_scrub_type = type;
1605 		vs->vs_scrub_complete = 0;
1606 		vs->vs_scrub_examined = 0;
1607 		vs->vs_scrub_repaired = 0;
1608 		vs->vs_scrub_errors = 0;
1609 		vs->vs_scrub_start = gethrestime_sec();
1610 		vs->vs_scrub_end = 0;
1611 	}
1612 
1613 	mutex_exit(&vd->vdev_stat_lock);
1614 }
1615 
1616 /*
1617  * Report checksum errors that a vdev didn't realize it made.
1618  * This can happen, for example, when RAID-Z combinatorial reconstruction
1619  * infers that one of its components returned bad data.
1620  */
1621 void
1622 vdev_checksum_error(zio_t *zio, vdev_t *vd)
1623 {
1624 	dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
1625 	    vdev_description(vd));
1626 
1627 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1628 		mutex_enter(&vd->vdev_stat_lock);
1629 		vd->vdev_stat.vs_checksum_errors++;
1630 		mutex_exit(&vd->vdev_stat_lock);
1631 	}
1632 }
1633 
1634 /*
1635  * Update the in-core space usage stats for this vdev and the root vdev.
1636  */
1637 void
1638 vdev_space_update(vdev_t *vd, uint64_t space_delta, uint64_t alloc_delta)
1639 {
1640 	ASSERT(vd == vd->vdev_top);
1641 
1642 	do {
1643 		mutex_enter(&vd->vdev_stat_lock);
1644 		vd->vdev_stat.vs_space += space_delta;
1645 		vd->vdev_stat.vs_alloc += alloc_delta;
1646 		mutex_exit(&vd->vdev_stat_lock);
1647 	} while ((vd = vd->vdev_parent) != NULL);
1648 }
1649 
1650 /*
1651  * Various knobs to tune a vdev.
1652  */
1653 static vdev_knob_t vdev_knob[] = {
1654 	{
1655 		"cache_size",
1656 		"size of the read-ahead cache",
1657 		0,
1658 		1ULL << 30,
1659 		10ULL << 20,
1660 		offsetof(struct vdev, vdev_cache.vc_size)
1661 	},
1662 	{
1663 		"cache_bshift",
1664 		"log2 of cache blocksize",
1665 		SPA_MINBLOCKSHIFT,
1666 		SPA_MAXBLOCKSHIFT,
1667 		16,
1668 		offsetof(struct vdev, vdev_cache.vc_bshift)
1669 	},
1670 	{
1671 		"cache_max",
1672 		"largest block size to cache",
1673 		0,
1674 		SPA_MAXBLOCKSIZE,
1675 		1ULL << 14,
1676 		offsetof(struct vdev, vdev_cache.vc_max)
1677 	},
1678 	{
1679 		"min_pending",
1680 		"minimum pending I/Os to the disk",
1681 		1,
1682 		10000,
1683 		2,
1684 		offsetof(struct vdev, vdev_queue.vq_min_pending)
1685 	},
1686 	{
1687 		"max_pending",
1688 		"maximum pending I/Os to the disk",
1689 		1,
1690 		10000,
1691 		35,
1692 		offsetof(struct vdev, vdev_queue.vq_max_pending)
1693 	},
1694 	{
1695 		"agg_limit",
1696 		"maximum size of aggregated I/Os",
1697 		0,
1698 		SPA_MAXBLOCKSIZE,
1699 		SPA_MAXBLOCKSIZE,
1700 		offsetof(struct vdev, vdev_queue.vq_agg_limit)
1701 	},
1702 	{
1703 		"time_shift",
1704 		"deadline = pri + (lbolt >> time_shift)",
1705 		0,
1706 		63,
1707 		4,
1708 		offsetof(struct vdev, vdev_queue.vq_time_shift)
1709 	},
1710 	{
1711 		"ramp_rate",
1712 		"exponential I/O issue ramp-up rate",
1713 		1,
1714 		10000,
1715 		2,
1716 		offsetof(struct vdev, vdev_queue.vq_ramp_rate)
1717 	},
1718 };
1719 
1720 vdev_knob_t *
1721 vdev_knob_next(vdev_knob_t *vk)
1722 {
1723 	if (vk == NULL)
1724 		return (vdev_knob);
1725 
1726 	if (++vk == vdev_knob + sizeof (vdev_knob) / sizeof (vdev_knob_t))
1727 		return (NULL);
1728 
1729 	return (vk);
1730 }
1731 
1732 /*
1733  * Mark a top-level vdev's config as dirty, placing it on the dirty list
1734  * so that it will be written out next time the vdev configuration is synced.
1735  * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
1736  */
1737 void
1738 vdev_config_dirty(vdev_t *vd)
1739 {
1740 	spa_t *spa = vd->vdev_spa;
1741 	vdev_t *rvd = spa->spa_root_vdev;
1742 	int c;
1743 
1744 	if (vd == rvd) {
1745 		for (c = 0; c < rvd->vdev_children; c++)
1746 			vdev_config_dirty(rvd->vdev_child[c]);
1747 	} else {
1748 		ASSERT(vd == vd->vdev_top);
1749 
1750 		if (!vd->vdev_is_dirty) {
1751 			list_insert_head(&spa->spa_dirty_list, vd);
1752 			vd->vdev_is_dirty = B_TRUE;
1753 		}
1754 	}
1755 }
1756 
1757 void
1758 vdev_config_clean(vdev_t *vd)
1759 {
1760 	ASSERT(vd->vdev_is_dirty);
1761 
1762 	list_remove(&vd->vdev_spa->spa_dirty_list, vd);
1763 	vd->vdev_is_dirty = B_FALSE;
1764 }
1765 
1766 /*
1767  * Set a vdev's state, updating any parent's state as well.
1768  */
1769 void
1770 vdev_set_state(vdev_t *vd, vdev_state_t state, vdev_aux_t aux)
1771 {
1772 	if (state == vd->vdev_state)
1773 		return;
1774 
1775 	vd->vdev_state = state;
1776 	vd->vdev_stat.vs_aux = aux;
1777 
1778 	if (vd->vdev_parent != NULL) {
1779 		int c;
1780 		int degraded = 0, faulted = 0;
1781 		vdev_t *parent, *child;
1782 
1783 		parent = vd->vdev_parent;
1784 		for (c = 0; c < parent->vdev_children; c++) {
1785 			child = parent->vdev_child[c];
1786 			if (child->vdev_state <= VDEV_STATE_CANT_OPEN)
1787 				faulted++;
1788 			else if (child->vdev_state == VDEV_STATE_DEGRADED)
1789 				degraded++;
1790 		}
1791 
1792 		vd->vdev_parent->vdev_ops->vdev_op_state_change(
1793 		    vd->vdev_parent, faulted, degraded);
1794 	}
1795 }
1796