xref: /titanic_51/usr/src/uts/common/fs/zfs/vdev.c (revision 2a79c5fee1dab68e30266ba4356cf60b871aabcf)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/zfs_context.h>
30 #include <sys/spa.h>
31 #include <sys/spa_impl.h>
32 #include <sys/dmu.h>
33 #include <sys/dmu_tx.h>
34 #include <sys/vdev_impl.h>
35 #include <sys/uberblock_impl.h>
36 #include <sys/metaslab.h>
37 #include <sys/metaslab_impl.h>
38 #include <sys/space_map.h>
39 #include <sys/zio.h>
40 #include <sys/zap.h>
41 #include <sys/fs/zfs.h>
42 
43 /*
44  * Virtual device management.
45  */
46 
/*
 * Every vdev ops vector that vdev_getops() can return.  The trailing
 * NULL entry terminates the scan.
 */
static vdev_ops_t *vdev_ops_table[] = {
	&vdev_root_ops,
	&vdev_raidz_ops,
	&vdev_mirror_ops,
	&vdev_replacing_ops,
	&vdev_disk_ops,
	&vdev_file_ops,
	&vdev_missing_ops,
	NULL
};
57 
58 /*
59  * Given a vdev type, return the appropriate ops vector.
60  */
61 static vdev_ops_t *
62 vdev_getops(const char *type)
63 {
64 	vdev_ops_t *ops, **opspp;
65 
66 	for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
67 		if (strcmp(ops->vdev_op_type, type) == 0)
68 			break;
69 
70 	return (ops);
71 }
72 
73 /*
74  * Default asize function: return the MAX of psize with the asize of
75  * all children.  This is what's used by anything other than RAID-Z.
76  */
77 uint64_t
78 vdev_default_asize(vdev_t *vd, uint64_t psize)
79 {
80 	uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_ashift);
81 	uint64_t csize;
82 	uint64_t c;
83 
84 	for (c = 0; c < vd->vdev_children; c++) {
85 		csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
86 		asize = MAX(asize, csize);
87 	}
88 
89 	return (asize);
90 }
91 
92 /*
93  * Get the replaceable or attachable device size.
94  * If the parent is a mirror or raidz, the replaceable size is the minimum
95  * psize of all its children. For the rest, just return our own psize.
96  *
97  * e.g.
98  *			psize	rsize
99  * root			-	-
100  *	mirror/raidz	-	-
101  *	    disk1	20g	20g
102  *	    disk2 	40g	20g
103  *	disk3 		80g	80g
104  */
105 uint64_t
106 vdev_get_rsize(vdev_t *vd)
107 {
108 	vdev_t *pvd, *cvd;
109 	uint64_t c, rsize;
110 
111 	pvd = vd->vdev_parent;
112 
113 	/*
114 	 * If our parent is NULL or the root, just return our own psize.
115 	 */
116 	if (pvd == NULL || pvd->vdev_parent == NULL)
117 		return (vd->vdev_psize);
118 
119 	rsize = 0;
120 
121 	for (c = 0; c < pvd->vdev_children; c++) {
122 		cvd = pvd->vdev_child[c];
123 		rsize = MIN(rsize - 1, cvd->vdev_psize - 1) + 1;
124 	}
125 
126 	return (rsize);
127 }
128 
129 vdev_t *
130 vdev_lookup_top(spa_t *spa, uint64_t vdev)
131 {
132 	vdev_t *rvd = spa->spa_root_vdev;
133 
134 	if (vdev < rvd->vdev_children)
135 		return (rvd->vdev_child[vdev]);
136 
137 	return (NULL);
138 }
139 
140 vdev_t *
141 vdev_lookup_by_path(vdev_t *vd, const char *path)
142 {
143 	int c;
144 	vdev_t *mvd;
145 
146 	if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0)
147 		return (vd);
148 
149 	for (c = 0; c < vd->vdev_children; c++)
150 		if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) !=
151 		    NULL)
152 			return (mvd);
153 
154 	return (NULL);
155 }
156 
157 vdev_t *
158 vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
159 {
160 	int c;
161 	vdev_t *mvd;
162 
163 	if (vd->vdev_children == 0 && vd->vdev_guid == guid)
164 		return (vd);
165 
166 	for (c = 0; c < vd->vdev_children; c++)
167 		if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
168 		    NULL)
169 			return (mvd);
170 
171 	return (NULL);
172 }
173 
174 void
175 vdev_add_child(vdev_t *pvd, vdev_t *cvd)
176 {
177 	size_t oldsize, newsize;
178 	uint64_t id = cvd->vdev_id;
179 	vdev_t **newchild;
180 
181 	ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));
182 	ASSERT(cvd->vdev_parent == NULL);
183 
184 	cvd->vdev_parent = pvd;
185 
186 	if (pvd == NULL)
187 		return;
188 
189 	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
190 
191 	oldsize = pvd->vdev_children * sizeof (vdev_t *);
192 	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
193 	newsize = pvd->vdev_children * sizeof (vdev_t *);
194 
195 	newchild = kmem_zalloc(newsize, KM_SLEEP);
196 	if (pvd->vdev_child != NULL) {
197 		bcopy(pvd->vdev_child, newchild, oldsize);
198 		kmem_free(pvd->vdev_child, oldsize);
199 	}
200 
201 	pvd->vdev_child = newchild;
202 	pvd->vdev_child[id] = cvd;
203 
204 	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
205 	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);
206 
207 	/*
208 	 * Walk up all ancestors to update guid sum.
209 	 */
210 	for (; pvd != NULL; pvd = pvd->vdev_parent)
211 		pvd->vdev_guid_sum += cvd->vdev_guid_sum;
212 }
213 
214 void
215 vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
216 {
217 	int c;
218 	uint_t id = cvd->vdev_id;
219 
220 	ASSERT(cvd->vdev_parent == pvd);
221 
222 	if (pvd == NULL)
223 		return;
224 
225 	ASSERT(id < pvd->vdev_children);
226 	ASSERT(pvd->vdev_child[id] == cvd);
227 
228 	pvd->vdev_child[id] = NULL;
229 	cvd->vdev_parent = NULL;
230 
231 	for (c = 0; c < pvd->vdev_children; c++)
232 		if (pvd->vdev_child[c])
233 			break;
234 
235 	if (c == pvd->vdev_children) {
236 		kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
237 		pvd->vdev_child = NULL;
238 		pvd->vdev_children = 0;
239 	}
240 
241 	/*
242 	 * Walk up all ancestors to update guid sum.
243 	 */
244 	for (; pvd != NULL; pvd = pvd->vdev_parent)
245 		pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
246 }
247 
248 /*
249  * Remove any holes in the child array.
250  */
251 void
252 vdev_compact_children(vdev_t *pvd)
253 {
254 	vdev_t **newchild, *cvd;
255 	int oldc = pvd->vdev_children;
256 	int newc, c;
257 
258 	ASSERT(spa_config_held(pvd->vdev_spa, RW_WRITER));
259 
260 	for (c = newc = 0; c < oldc; c++)
261 		if (pvd->vdev_child[c])
262 			newc++;
263 
264 	newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);
265 
266 	for (c = newc = 0; c < oldc; c++) {
267 		if ((cvd = pvd->vdev_child[c]) != NULL) {
268 			newchild[newc] = cvd;
269 			cvd->vdev_id = newc++;
270 		}
271 	}
272 
273 	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
274 	pvd->vdev_child = newchild;
275 	pvd->vdev_children = newc;
276 }
277 
278 /*
279  * Allocate and minimally initialize a vdev_t.
280  */
281 static vdev_t *
282 vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
283 {
284 	vdev_t *vd;
285 
286 	while (guid == 0)
287 		guid = spa_get_random(-1ULL);
288 
289 	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
290 
291 	vd->vdev_spa = spa;
292 	vd->vdev_id = id;
293 	vd->vdev_guid = guid;
294 	vd->vdev_guid_sum = guid;
295 	vd->vdev_ops = ops;
296 	vd->vdev_state = VDEV_STATE_CLOSED;
297 
298 	mutex_init(&vd->vdev_io_lock, NULL, MUTEX_DEFAULT, NULL);
299 	cv_init(&vd->vdev_io_cv, NULL, CV_DEFAULT, NULL);
300 	list_create(&vd->vdev_io_pending, sizeof (zio_t),
301 	    offsetof(zio_t, io_pending));
302 	mutex_init(&vd->vdev_dirty_lock, NULL, MUTEX_DEFAULT, NULL);
303 	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
304 	space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock);
305 	space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock);
306 	txg_list_create(&vd->vdev_ms_list,
307 	    offsetof(struct metaslab, ms_txg_node));
308 	txg_list_create(&vd->vdev_dtl_list,
309 	    offsetof(struct vdev, vdev_dtl_node));
310 	vd->vdev_stat.vs_timestamp = gethrtime();
311 
312 	return (vd);
313 }
314 
315 /*
316  * Free a vdev_t that has been removed from service.
317  */
318 static void
319 vdev_free_common(vdev_t *vd)
320 {
321 	if (vd->vdev_path)
322 		spa_strfree(vd->vdev_path);
323 	if (vd->vdev_devid)
324 		spa_strfree(vd->vdev_devid);
325 
326 	txg_list_destroy(&vd->vdev_ms_list);
327 	txg_list_destroy(&vd->vdev_dtl_list);
328 	mutex_enter(&vd->vdev_dtl_lock);
329 	space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
330 	space_map_destroy(&vd->vdev_dtl_map);
331 	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
332 	space_map_destroy(&vd->vdev_dtl_scrub);
333 	mutex_exit(&vd->vdev_dtl_lock);
334 	mutex_destroy(&vd->vdev_dtl_lock);
335 	mutex_destroy(&vd->vdev_dirty_lock);
336 	list_destroy(&vd->vdev_io_pending);
337 	mutex_destroy(&vd->vdev_io_lock);
338 	cv_destroy(&vd->vdev_io_cv);
339 
340 	kmem_free(vd, sizeof (vdev_t));
341 }
342 
343 /*
344  * Allocate a new vdev.  The 'alloctype' is used to control whether we are
345  * creating a new vdev or loading an existing one - the behavior is slightly
346  * different for each case.
347  */
348 vdev_t *
349 vdev_alloc(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype)
350 {
351 	vdev_ops_t *ops;
352 	char *type;
353 	uint64_t guid = 0;
354 	vdev_t *vd;
355 
356 	ASSERT(spa_config_held(spa, RW_WRITER));
357 
358 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
359 		return (NULL);
360 
361 	if ((ops = vdev_getops(type)) == NULL)
362 		return (NULL);
363 
364 	/*
365 	 * If this is a load, get the vdev guid from the nvlist.
366 	 * Otherwise, vdev_alloc_common() will generate one for us.
367 	 */
368 	if (alloctype == VDEV_ALLOC_LOAD) {
369 		uint64_t label_id;
370 
371 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
372 		    label_id != id)
373 			return (NULL);
374 
375 		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
376 			return (NULL);
377 	}
378 
379 	vd = vdev_alloc_common(spa, id, guid, ops);
380 
381 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
382 		vd->vdev_path = spa_strdup(vd->vdev_path);
383 	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
384 		vd->vdev_devid = spa_strdup(vd->vdev_devid);
385 
386 	/*
387 	 * Set the whole_disk property.  If it's not specified, leave the value
388 	 * as -1.
389 	 */
390 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
391 	    &vd->vdev_wholedisk) != 0)
392 		vd->vdev_wholedisk = -1ULL;
393 
394 	/*
395 	 * If we're a top-level vdev, try to load the allocation parameters.
396 	 */
397 	if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) {
398 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
399 		    &vd->vdev_ms_array);
400 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
401 		    &vd->vdev_ms_shift);
402 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT,
403 		    &vd->vdev_ashift);
404 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
405 		    &vd->vdev_asize);
406 	}
407 
408 	/*
409 	 * If we're a leaf vdev, try to load the DTL object.
410 	 */
411 	if (vd->vdev_ops->vdev_op_leaf && alloctype == VDEV_ALLOC_LOAD) {
412 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
413 		    &vd->vdev_dtl.smo_object);
414 	}
415 
416 	/*
417 	 * Add ourselves to the parent's list of children.
418 	 */
419 	vdev_add_child(parent, vd);
420 
421 	return (vd);
422 }
423 
424 void
425 vdev_free(vdev_t *vd)
426 {
427 	int c;
428 
429 	/*
430 	 * vdev_free() implies closing the vdev first.  This is simpler than
431 	 * trying to ensure complicated semantics for all callers.
432 	 */
433 	vdev_close(vd);
434 
435 	/*
436 	 * It's possible to free a vdev that's been added to the dirty
437 	 * list when in the middle of spa_vdev_add().  Handle that case
438 	 * correctly here.
439 	 */
440 	if (vd->vdev_is_dirty)
441 		vdev_config_clean(vd);
442 
443 	/*
444 	 * Free all children.
445 	 */
446 	for (c = 0; c < vd->vdev_children; c++)
447 		vdev_free(vd->vdev_child[c]);
448 
449 	ASSERT(vd->vdev_child == NULL);
450 	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
451 
452 	/*
453 	 * Discard allocation state.
454 	 */
455 	if (vd == vd->vdev_top)
456 		vdev_metaslab_fini(vd);
457 
458 	ASSERT3U(vd->vdev_stat.vs_space, ==, 0);
459 	ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);
460 
461 	/*
462 	 * Remove this vdev from its parent's child list.
463 	 */
464 	vdev_remove_child(vd->vdev_parent, vd);
465 
466 	ASSERT(vd->vdev_parent == NULL);
467 
468 	vdev_free_common(vd);
469 }
470 
471 /*
472  * Transfer top-level vdev state from svd to tvd.
473  */
474 static void
475 vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
476 {
477 	spa_t *spa = svd->vdev_spa;
478 	metaslab_t *msp;
479 	vdev_t *vd;
480 	int t;
481 
482 	ASSERT(tvd == tvd->vdev_top);
483 
484 	tvd->vdev_ms_array = svd->vdev_ms_array;
485 	tvd->vdev_ms_shift = svd->vdev_ms_shift;
486 	tvd->vdev_ms_count = svd->vdev_ms_count;
487 
488 	svd->vdev_ms_array = 0;
489 	svd->vdev_ms_shift = 0;
490 	svd->vdev_ms_count = 0;
491 
492 	tvd->vdev_mg = svd->vdev_mg;
493 	tvd->vdev_mg->mg_vd = tvd;
494 	tvd->vdev_ms = svd->vdev_ms;
495 	tvd->vdev_smo = svd->vdev_smo;
496 
497 	svd->vdev_mg = NULL;
498 	svd->vdev_ms = NULL;
499 	svd->vdev_smo = NULL;
500 
501 	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
502 	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
503 
504 	svd->vdev_stat.vs_alloc = 0;
505 	svd->vdev_stat.vs_space = 0;
506 
507 	for (t = 0; t < TXG_SIZE; t++) {
508 		while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
509 			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
510 		while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
511 			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
512 		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
513 			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
514 		tvd->vdev_dirty[t] = svd->vdev_dirty[t];
515 		svd->vdev_dirty[t] = 0;
516 	}
517 
518 	if (svd->vdev_is_dirty) {
519 		vdev_config_clean(svd);
520 		vdev_config_dirty(tvd);
521 	}
522 
523 	ASSERT(svd->vdev_io_retry == NULL);
524 	ASSERT(list_is_empty(&svd->vdev_io_pending));
525 }
526 
527 static void
528 vdev_top_update(vdev_t *tvd, vdev_t *vd)
529 {
530 	int c;
531 
532 	if (vd == NULL)
533 		return;
534 
535 	vd->vdev_top = tvd;
536 
537 	for (c = 0; c < vd->vdev_children; c++)
538 		vdev_top_update(tvd, vd->vdev_child[c]);
539 }
540 
541 /*
542  * Add a mirror/replacing vdev above an existing vdev.
543  */
544 vdev_t *
545 vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
546 {
547 	spa_t *spa = cvd->vdev_spa;
548 	vdev_t *pvd = cvd->vdev_parent;
549 	vdev_t *mvd;
550 
551 	ASSERT(spa_config_held(spa, RW_WRITER));
552 
553 	mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
554 	vdev_remove_child(pvd, cvd);
555 	vdev_add_child(pvd, mvd);
556 	cvd->vdev_id = mvd->vdev_children;
557 	vdev_add_child(mvd, cvd);
558 	vdev_top_update(cvd->vdev_top, cvd->vdev_top);
559 
560 	mvd->vdev_asize = cvd->vdev_asize;
561 	mvd->vdev_ashift = cvd->vdev_ashift;
562 	mvd->vdev_state = cvd->vdev_state;
563 
564 	if (mvd == mvd->vdev_top)
565 		vdev_top_transfer(cvd, mvd);
566 
567 	return (mvd);
568 }
569 
570 /*
571  * Remove a 1-way mirror/replacing vdev from the tree.
572  */
573 void
574 vdev_remove_parent(vdev_t *cvd)
575 {
576 	vdev_t *mvd = cvd->vdev_parent;
577 	vdev_t *pvd = mvd->vdev_parent;
578 
579 	ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));
580 
581 	ASSERT(mvd->vdev_children == 1);
582 	ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
583 	    mvd->vdev_ops == &vdev_replacing_ops);
584 
585 	vdev_remove_child(mvd, cvd);
586 	vdev_remove_child(pvd, mvd);
587 	cvd->vdev_id = mvd->vdev_id;
588 	vdev_add_child(pvd, cvd);
589 	vdev_top_update(cvd->vdev_top, cvd->vdev_top);
590 
591 	if (cvd == cvd->vdev_top)
592 		vdev_top_transfer(mvd, cvd);
593 
594 	ASSERT(mvd->vdev_children == 0);
595 	vdev_free(mvd);
596 }
597 
598 void
599 vdev_metaslab_init(vdev_t *vd, uint64_t txg)
600 {
601 	spa_t *spa = vd->vdev_spa;
602 	metaslab_class_t *mc = spa_metaslab_class_select(spa);
603 	uint64_t c;
604 	uint64_t oldc = vd->vdev_ms_count;
605 	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
606 	space_map_obj_t *smo = vd->vdev_smo;
607 	metaslab_t **mspp = vd->vdev_ms;
608 
609 	dprintf("%s oldc %llu newc %llu\n", vdev_description(vd), oldc, newc);
610 
611 	ASSERT(oldc <= newc);
612 
613 	vd->vdev_smo = kmem_zalloc(newc * sizeof (*smo), KM_SLEEP);
614 	vd->vdev_ms = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
615 	vd->vdev_ms_count = newc;
616 
617 	if (vd->vdev_mg == NULL) {
618 		if (txg == 0) {
619 			dmu_buf_t *db;
620 			uint64_t *ms_array;
621 
622 			ms_array = kmem_zalloc(newc * sizeof (uint64_t),
623 			    KM_SLEEP);
624 
625 			dmu_read(spa->spa_meta_objset, vd->vdev_ms_array,
626 			    0, newc * sizeof (uint64_t), ms_array);
627 
628 			for (c = 0; c < newc; c++) {
629 				if (ms_array[c] == 0)
630 					continue;
631 				db = dmu_bonus_hold(spa->spa_meta_objset,
632 				    ms_array[c]);
633 				dmu_buf_read(db);
634 				ASSERT3U(db->db_size, ==, sizeof (*smo));
635 				bcopy(db->db_data, &vd->vdev_smo[c],
636 				    db->db_size);
637 				ASSERT3U(vd->vdev_smo[c].smo_object, ==,
638 				    ms_array[c]);
639 				dmu_buf_rele(db);
640 			}
641 			kmem_free(ms_array, newc * sizeof (uint64_t));
642 		}
643 		vd->vdev_mg = metaslab_group_create(mc, vd);
644 	}
645 
646 	for (c = 0; c < oldc; c++) {
647 		vd->vdev_smo[c] = smo[c];
648 		vd->vdev_ms[c] = mspp[c];
649 		mspp[c]->ms_smo = &vd->vdev_smo[c];
650 	}
651 
652 	for (c = oldc; c < newc; c++)
653 		metaslab_init(vd->vdev_mg, &vd->vdev_smo[c], &vd->vdev_ms[c],
654 		    c << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
655 
656 	if (oldc != 0) {
657 		kmem_free(smo, oldc * sizeof (*smo));
658 		kmem_free(mspp, oldc * sizeof (*mspp));
659 	}
660 
661 }
662 
663 void
664 vdev_metaslab_fini(vdev_t *vd)
665 {
666 	uint64_t m;
667 	uint64_t count = vd->vdev_ms_count;
668 
669 	if (vd->vdev_ms != NULL) {
670 		for (m = 0; m < count; m++)
671 			metaslab_fini(vd->vdev_ms[m]);
672 		kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
673 		vd->vdev_ms = NULL;
674 	}
675 
676 	if (vd->vdev_smo != NULL) {
677 		kmem_free(vd->vdev_smo, count * sizeof (space_map_obj_t));
678 		vd->vdev_smo = NULL;
679 	}
680 }
681 
682 /*
683  * Prepare a virtual device for access.
684  */
685 int
686 vdev_open(vdev_t *vd)
687 {
688 	int error;
689 	vdev_knob_t *vk;
690 	int c;
691 	uint64_t osize = 0;
692 	uint64_t asize, psize;
693 	uint64_t ashift = -1ULL;
694 
695 	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
696 	    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
697 	    vd->vdev_state == VDEV_STATE_OFFLINE);
698 
699 	if (vd->vdev_fault_mode == VDEV_FAULT_COUNT)
700 		vd->vdev_fault_arg >>= 1;
701 	else
702 		vd->vdev_fault_mode = VDEV_FAULT_NONE;
703 
704 	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
705 
706 	for (vk = vdev_knob_next(NULL); vk != NULL; vk = vdev_knob_next(vk)) {
707 		uint64_t *valp = (uint64_t *)((char *)vd + vk->vk_offset);
708 
709 		*valp = vk->vk_default;
710 		*valp = MAX(*valp, vk->vk_min);
711 		*valp = MIN(*valp, vk->vk_max);
712 	}
713 
714 	if (vd->vdev_ops->vdev_op_leaf) {
715 		vdev_cache_init(vd);
716 		vdev_queue_init(vd);
717 		vd->vdev_cache_active = B_TRUE;
718 	}
719 
720 	if (vd->vdev_offline) {
721 		ASSERT(vd->vdev_children == 0);
722 		dprintf("OFFLINE: %s = ENXIO\n", vdev_description(vd));
723 		vd->vdev_state = VDEV_STATE_OFFLINE;
724 		return (ENXIO);
725 	}
726 
727 	error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);
728 
729 	dprintf("%s = %d, osize %llu, state = %d\n",
730 	    vdev_description(vd), error, osize, vd->vdev_state);
731 
732 	if (error) {
733 		dprintf("%s in %s failed to open, error %d, aux %d\n",
734 		    vdev_description(vd),
735 		    vdev_description(vd->vdev_parent),
736 		    error,
737 		    vd->vdev_stat.vs_aux);
738 
739 		vd->vdev_state = VDEV_STATE_CANT_OPEN;
740 		return (error);
741 	}
742 
743 	vd->vdev_state = VDEV_STATE_HEALTHY;
744 
745 	for (c = 0; c < vd->vdev_children; c++)
746 		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY)
747 			vd->vdev_state = VDEV_STATE_DEGRADED;
748 
749 	osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
750 
751 	if (vd->vdev_children == 0) {
752 		if (osize < SPA_MINDEVSIZE) {
753 			vd->vdev_state = VDEV_STATE_CANT_OPEN;
754 			vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL;
755 			return (EOVERFLOW);
756 		}
757 		psize = osize;
758 		asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
759 	} else {
760 		if (osize < SPA_MINDEVSIZE -
761 		    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
762 			vd->vdev_state = VDEV_STATE_CANT_OPEN;
763 			vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL;
764 			return (EOVERFLOW);
765 		}
766 		psize = 0;
767 		asize = osize;
768 	}
769 
770 	vd->vdev_psize = psize;
771 
772 	if (vd->vdev_asize == 0) {
773 		/*
774 		 * This is the first-ever open, so use the computed values.
775 		 */
776 		vd->vdev_asize = asize;
777 		vd->vdev_ashift = ashift;
778 	} else {
779 		/*
780 		 * Make sure the alignment requirement hasn't increased.
781 		 */
782 		if (ashift > vd->vdev_ashift) {
783 			dprintf("%s: ashift grew\n", vdev_description(vd));
784 			vd->vdev_state = VDEV_STATE_CANT_OPEN;
785 			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
786 			return (EINVAL);
787 		}
788 
789 		/*
790 		 * Make sure the device hasn't shrunk.
791 		 */
792 		if (asize < vd->vdev_asize) {
793 			dprintf("%s: device shrank\n", vdev_description(vd));
794 			vd->vdev_state = VDEV_STATE_CANT_OPEN;
795 			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
796 			return (EINVAL);
797 		}
798 
799 		/*
800 		 * If all children are healthy and the asize has increased,
801 		 * then we've experienced dynamic LUN growth.
802 		 */
803 		if (vd->vdev_state == VDEV_STATE_HEALTHY &&
804 		    asize > vd->vdev_asize) {
805 			dprintf("%s: device grew\n", vdev_description(vd));
806 			vd->vdev_asize = asize;
807 		}
808 	}
809 
810 	return (0);
811 }
812 
813 /*
814  * Close a virtual device.
815  */
816 void
817 vdev_close(vdev_t *vd)
818 {
819 	ASSERT3P(list_head(&vd->vdev_io_pending), ==, NULL);
820 
821 	vd->vdev_ops->vdev_op_close(vd);
822 
823 	if (vd->vdev_cache_active) {
824 		vdev_cache_fini(vd);
825 		vdev_queue_fini(vd);
826 		vd->vdev_cache_active = B_FALSE;
827 	}
828 
829 	if (vd->vdev_offline)
830 		vd->vdev_state = VDEV_STATE_OFFLINE;
831 	else
832 		vd->vdev_state = VDEV_STATE_CLOSED;
833 }
834 
835 void
836 vdev_reopen(vdev_t *vd, zio_t **rq)
837 {
838 	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
839 	int c;
840 
841 	if (vd == rvd) {
842 		ASSERT(rq == NULL);
843 		for (c = 0; c < rvd->vdev_children; c++)
844 			vdev_reopen(rvd->vdev_child[c], NULL);
845 		return;
846 	}
847 
848 	/* only valid for top-level vdevs */
849 	ASSERT3P(vd, ==, vd->vdev_top);
850 
851 	/*
852 	 * vdev_state can change when spa_config_lock is held as writer,
853 	 * or when it's held as reader and we're doing a vdev_reopen().
854 	 * To handle the latter case, we grab rvd's io_lock to serialize
855 	 * reopens.  This ensures that there's never more than one vdev
856 	 * state changer active at a time.
857 	 */
858 	mutex_enter(&rvd->vdev_io_lock);
859 
860 	mutex_enter(&vd->vdev_io_lock);
861 	while (list_head(&vd->vdev_io_pending) != NULL)
862 		cv_wait(&vd->vdev_io_cv, &vd->vdev_io_lock);
863 	vdev_close(vd);
864 	(void) vdev_open(vd);
865 	if (rq != NULL) {
866 		*rq = vd->vdev_io_retry;
867 		vd->vdev_io_retry = NULL;
868 	}
869 	mutex_exit(&vd->vdev_io_lock);
870 
871 	/*
872 	 * Reassess root vdev's health.
873 	 */
874 	rvd->vdev_state = VDEV_STATE_HEALTHY;
875 	for (c = 0; c < rvd->vdev_children; c++) {
876 		uint64_t state = rvd->vdev_child[c]->vdev_state;
877 		rvd->vdev_state = MIN(rvd->vdev_state, state);
878 	}
879 
880 	mutex_exit(&rvd->vdev_io_lock);
881 }
882 
883 int
884 vdev_create(vdev_t *vd, uint64_t txg)
885 {
886 	int error;
887 
888 	/*
889 	 * Normally, partial opens (e.g. of a mirror) are allowed.
890 	 * For a create, however, we want to fail the request if
891 	 * there are any components we can't open.
892 	 */
893 	error = vdev_open(vd);
894 
895 	if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
896 		vdev_close(vd);
897 		return (error ? error : ENXIO);
898 	}
899 
900 	/*
901 	 * Recursively initialize all labels.
902 	 */
903 	if ((error = vdev_label_init(vd, txg)) != 0) {
904 		vdev_close(vd);
905 		return (error);
906 	}
907 
908 	return (0);
909 }
910 
911 /*
912  * The is the latter half of vdev_create().  It is distinct because it
913  * involves initiating transactions in order to do metaslab creation.
914  * For creation, we want to try to create all vdevs at once and then undo it
915  * if anything fails; this is much harder if we have pending transactions.
916  */
917 void
918 vdev_init(vdev_t *vd, uint64_t txg)
919 {
920 	/*
921 	 * Aim for roughly 200 metaslabs per vdev.
922 	 */
923 	vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
924 	vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
925 
926 	/*
927 	 * Initialize the vdev's metaslabs.
928 	 */
929 	vdev_metaslab_init(vd, txg);
930 }
931 
932 void
933 vdev_dirty(vdev_t *vd, uint8_t flags, uint64_t txg)
934 {
935 	vdev_t *tvd = vd->vdev_top;
936 
937 	mutex_enter(&tvd->vdev_dirty_lock);
938 	if ((tvd->vdev_dirty[txg & TXG_MASK] & flags) != flags) {
939 		tvd->vdev_dirty[txg & TXG_MASK] |= flags;
940 		(void) txg_list_add(&tvd->vdev_spa->spa_vdev_txg_list,
941 		    tvd, txg);
942 	}
943 	mutex_exit(&tvd->vdev_dirty_lock);
944 }
945 
946 void
947 vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size)
948 {
949 	mutex_enter(sm->sm_lock);
950 	if (!space_map_contains(sm, txg, size))
951 		space_map_add(sm, txg, size);
952 	mutex_exit(sm->sm_lock);
953 }
954 
955 int
956 vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size)
957 {
958 	int dirty;
959 
960 	/*
961 	 * Quick test without the lock -- covers the common case that
962 	 * there are no dirty time segments.
963 	 */
964 	if (sm->sm_space == 0)
965 		return (0);
966 
967 	mutex_enter(sm->sm_lock);
968 	dirty = space_map_contains(sm, txg, size);
969 	mutex_exit(sm->sm_lock);
970 
971 	return (dirty);
972 }
973 
974 /*
975  * Reassess DTLs after a config change or scrub completion.
976  */
977 void
978 vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
979 {
980 	int c;
981 
982 	ASSERT(spa_config_held(vd->vdev_spa, RW_WRITER));
983 
984 	if (vd->vdev_children == 0) {
985 		mutex_enter(&vd->vdev_dtl_lock);
986 		/*
987 		 * We're successfully scrubbed everything up to scrub_txg.
988 		 * Therefore, excise all old DTLs up to that point, then
989 		 * fold in the DTLs for everything we couldn't scrub.
990 		 */
991 		if (scrub_txg != 0) {
992 			space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg);
993 			space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub);
994 		}
995 		if (scrub_done)
996 			space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
997 		mutex_exit(&vd->vdev_dtl_lock);
998 		if (txg != 0) {
999 			vdev_t *tvd = vd->vdev_top;
1000 			vdev_dirty(tvd, VDD_DTL, txg);
1001 			(void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);
1002 		}
1003 		return;
1004 	}
1005 
1006 	mutex_enter(&vd->vdev_dtl_lock);
1007 	space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
1008 	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
1009 	mutex_exit(&vd->vdev_dtl_lock);
1010 
1011 	for (c = 0; c < vd->vdev_children; c++) {
1012 		vdev_t *cvd = vd->vdev_child[c];
1013 		vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done);
1014 		mutex_enter(&vd->vdev_dtl_lock);
1015 		space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map);
1016 		space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub);
1017 		mutex_exit(&vd->vdev_dtl_lock);
1018 	}
1019 }
1020 
1021 static int
1022 vdev_dtl_load(vdev_t *vd)
1023 {
1024 	spa_t *spa = vd->vdev_spa;
1025 	space_map_obj_t *smo = &vd->vdev_dtl;
1026 	dmu_buf_t *db;
1027 	int error;
1028 
1029 	ASSERT(vd->vdev_children == 0);
1030 
1031 	if (smo->smo_object == 0)
1032 		return (0);
1033 
1034 	db = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object);
1035 	dmu_buf_read(db);
1036 	ASSERT3U(db->db_size, ==, sizeof (*smo));
1037 	bcopy(db->db_data, smo, db->db_size);
1038 	dmu_buf_rele(db);
1039 
1040 	mutex_enter(&vd->vdev_dtl_lock);
1041 	error = space_map_load(&vd->vdev_dtl_map, smo, SM_ALLOC,
1042 	    spa->spa_meta_objset, smo->smo_objsize, smo->smo_alloc);
1043 	mutex_exit(&vd->vdev_dtl_lock);
1044 
1045 	return (error);
1046 }
1047 
1048 void
1049 vdev_dtl_sync(vdev_t *vd, uint64_t txg)
1050 {
1051 	spa_t *spa = vd->vdev_spa;
1052 	space_map_obj_t *smo = &vd->vdev_dtl;
1053 	space_map_t *sm = &vd->vdev_dtl_map;
1054 	space_map_t smsync;
1055 	kmutex_t smlock;
1056 	avl_tree_t *t = &sm->sm_root;
1057 	space_seg_t *ss;
1058 	dmu_buf_t *db;
1059 	dmu_tx_t *tx;
1060 
1061 	dprintf("%s in txg %llu pass %d\n",
1062 	    vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));
1063 
1064 	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1065 
1066 	if (vd->vdev_detached) {
1067 		if (smo->smo_object != 0) {
1068 			int err = dmu_object_free(spa->spa_meta_objset,
1069 			    smo->smo_object, tx);
1070 			ASSERT3U(err, ==, 0);
1071 			smo->smo_object = 0;
1072 		}
1073 		dmu_tx_commit(tx);
1074 		return;
1075 	}
1076 
1077 	if (smo->smo_object == 0) {
1078 		ASSERT(smo->smo_objsize == 0);
1079 		ASSERT(smo->smo_alloc == 0);
1080 		smo->smo_object = dmu_object_alloc(spa->spa_meta_objset,
1081 		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
1082 		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
1083 		ASSERT(smo->smo_object != 0);
1084 		vdev_config_dirty(vd->vdev_top);
1085 	}
1086 
1087 	dmu_free_range(spa->spa_meta_objset, smo->smo_object,
1088 	    0, smo->smo_objsize, tx);
1089 
1090 	mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);
1091 
1092 	space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
1093 	    &smlock);
1094 
1095 	mutex_enter(&smlock);
1096 
1097 	mutex_enter(&vd->vdev_dtl_lock);
1098 	for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss))
1099 		space_map_add(&smsync, ss->ss_start, ss->ss_end - ss->ss_start);
1100 	mutex_exit(&vd->vdev_dtl_lock);
1101 
1102 	smo->smo_objsize = 0;
1103 	smo->smo_alloc = smsync.sm_space;
1104 
1105 	space_map_sync(&smsync, NULL, smo, SM_ALLOC, spa->spa_meta_objset, tx);
1106 	space_map_destroy(&smsync);
1107 
1108 	mutex_exit(&smlock);
1109 	mutex_destroy(&smlock);
1110 
1111 	db = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object);
1112 	dmu_buf_will_dirty(db, tx);
1113 	ASSERT3U(db->db_size, ==, sizeof (*smo));
1114 	bcopy(smo, db->db_data, db->db_size);
1115 	dmu_buf_rele(db);
1116 
1117 	dmu_tx_commit(tx);
1118 }
1119 
1120 int
1121 vdev_load(vdev_t *vd, int import)
1122 {
1123 	spa_t *spa = vd->vdev_spa;
1124 	int c, error;
1125 	nvlist_t *label;
1126 	uint64_t guid, state;
1127 
1128 	dprintf("loading %s\n", vdev_description(vd));
1129 
1130 	/*
1131 	 * Recursively load all children.
1132 	 */
1133 	for (c = 0; c < vd->vdev_children; c++)
1134 		if ((error = vdev_load(vd->vdev_child[c], import)) != 0)
1135 			return (error);
1136 
1137 	/*
1138 	 * If this is a leaf vdev, make sure its agrees with its disk labels.
1139 	 */
1140 	if (vd->vdev_ops->vdev_op_leaf) {
1141 
1142 		if (vdev_is_dead(vd))
1143 			return (0);
1144 
1145 		/*
1146 		 * XXX state transitions don't propagate to parent here.
1147 		 * Also, merely setting the state isn't sufficient because
1148 		 * it's not persistent; a vdev_reopen() would make us
1149 		 * forget all about it.
1150 		 */
1151 		if ((label = vdev_label_read_config(vd)) == NULL) {
1152 			dprintf("can't load label config\n");
1153 			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
1154 			    VDEV_AUX_CORRUPT_DATA);
1155 			return (0);
1156 		}
1157 
1158 		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
1159 		    &guid) != 0 || guid != spa_guid(spa)) {
1160 			dprintf("bad or missing pool GUID (%llu)\n", guid);
1161 			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
1162 			    VDEV_AUX_CORRUPT_DATA);
1163 			nvlist_free(label);
1164 			return (0);
1165 		}
1166 
1167 		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) ||
1168 		    guid != vd->vdev_guid) {
1169 			dprintf("bad or missing vdev guid (%llu != %llu)\n",
1170 			    guid, vd->vdev_guid);
1171 			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
1172 			    VDEV_AUX_CORRUPT_DATA);
1173 			nvlist_free(label);
1174 			return (0);
1175 		}
1176 
1177 		/*
1178 		 * If we find a vdev with a matching pool guid and vdev guid,
1179 		 * but the pool state is not active, it indicates that the user
1180 		 * exported or destroyed the pool without affecting the config
1181 		 * cache (if / was mounted readonly, for example).  In this
1182 		 * case, immediately return EBADF so the caller can remove it
1183 		 * from the config.
1184 		 */
1185 		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
1186 		    &state)) {
1187 			dprintf("missing pool state\n");
1188 			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
1189 			    VDEV_AUX_CORRUPT_DATA);
1190 			nvlist_free(label);
1191 			return (0);
1192 		}
1193 
1194 		if (state != POOL_STATE_ACTIVE &&
1195 		    (!import || state != POOL_STATE_EXPORTED)) {
1196 			dprintf("pool state not active (%llu)\n", state);
1197 			nvlist_free(label);
1198 			return (EBADF);
1199 		}
1200 
1201 		nvlist_free(label);
1202 	}
1203 
1204 	/*
1205 	 * If this is a top-level vdev, make sure its allocation parameters
1206 	 * exist and initialize its metaslabs.
1207 	 */
1208 	if (vd == vd->vdev_top) {
1209 
1210 		if (vd->vdev_ms_array == 0 ||
1211 		    vd->vdev_ms_shift == 0 ||
1212 		    vd->vdev_ashift == 0 ||
1213 		    vd->vdev_asize == 0) {
1214 			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
1215 			    VDEV_AUX_CORRUPT_DATA);
1216 			return (0);
1217 		}
1218 
1219 		vdev_metaslab_init(vd, 0);
1220 	}
1221 
1222 	/*
1223 	 * If this is a leaf vdev, load its DTL.
1224 	 */
1225 	if (vd->vdev_ops->vdev_op_leaf) {
1226 		error = vdev_dtl_load(vd);
1227 		if (error) {
1228 			dprintf("can't load DTL for %s, error %d\n",
1229 			    vdev_description(vd), error);
1230 			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
1231 			    VDEV_AUX_CORRUPT_DATA);
1232 			return (0);
1233 		}
1234 	}
1235 
1236 	return (0);
1237 }
1238 
1239 void
1240 vdev_sync_done(vdev_t *vd, uint64_t txg)
1241 {
1242 	metaslab_t *msp;
1243 
1244 	dprintf("%s txg %llu\n", vdev_description(vd), txg);
1245 
1246 	while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
1247 		metaslab_sync_done(msp, txg);
1248 }
1249 
1250 void
1251 vdev_add_sync(vdev_t *vd, uint64_t txg)
1252 {
1253 	spa_t *spa = vd->vdev_spa;
1254 	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1255 
1256 	ASSERT(vd == vd->vdev_top);
1257 
1258 	if (vd->vdev_ms_array == 0)
1259 		vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
1260 		    DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
1261 
1262 	ASSERT(vd->vdev_ms_array != 0);
1263 
1264 	vdev_config_dirty(vd);
1265 
1266 	dmu_tx_commit(tx);
1267 }
1268 
1269 void
1270 vdev_sync(vdev_t *vd, uint64_t txg)
1271 {
1272 	spa_t *spa = vd->vdev_spa;
1273 	vdev_t *lvd;
1274 	metaslab_t *msp;
1275 	uint8_t *dirtyp = &vd->vdev_dirty[txg & TXG_MASK];
1276 	uint8_t dirty = *dirtyp;
1277 
1278 	mutex_enter(&vd->vdev_dirty_lock);
1279 	*dirtyp &= ~(VDD_ALLOC | VDD_FREE | VDD_ADD | VDD_DTL);
1280 	mutex_exit(&vd->vdev_dirty_lock);
1281 
1282 	dprintf("%s txg %llu pass %d\n",
1283 	    vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));
1284 
1285 	if (dirty & VDD_ADD)
1286 		vdev_add_sync(vd, txg);
1287 
1288 	while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL)
1289 		metaslab_sync(msp, txg);
1290 
1291 	while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
1292 		vdev_dtl_sync(lvd, txg);
1293 
1294 	(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
1295 }
1296 
1297 uint64_t
1298 vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
1299 {
1300 	return (vd->vdev_ops->vdev_op_asize(vd, psize));
1301 }
1302 
1303 void
1304 vdev_io_start(zio_t *zio)
1305 {
1306 	zio->io_vd->vdev_ops->vdev_op_io_start(zio);
1307 }
1308 
1309 void
1310 vdev_io_done(zio_t *zio)
1311 {
1312 	zio->io_vd->vdev_ops->vdev_op_io_done(zio);
1313 }
1314 
1315 const char *
1316 vdev_description(vdev_t *vd)
1317 {
1318 	if (vd == NULL || vd->vdev_ops == NULL)
1319 		return ("<unknown>");
1320 
1321 	if (vd->vdev_path != NULL)
1322 		return (vd->vdev_path);
1323 
1324 	if (vd->vdev_parent == NULL)
1325 		return (spa_name(vd->vdev_spa));
1326 
1327 	return (vd->vdev_ops->vdev_op_type);
1328 }
1329 
1330 int
1331 vdev_online(spa_t *spa, const char *path)
1332 {
1333 	vdev_t *vd;
1334 
1335 	spa_config_enter(spa, RW_WRITER);
1336 
1337 	if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
1338 		spa_config_exit(spa);
1339 		return (ENODEV);
1340 	}
1341 
1342 	dprintf("ONLINE: %s\n", vdev_description(vd));
1343 
1344 	vd->vdev_offline = B_FALSE;
1345 
1346 	/*
1347 	 * Clear the error counts.  The idea is that you expect to see all
1348 	 * zeroes when everything is working, so if you've just onlined a
1349 	 * device, you don't want to keep hearing about errors from before.
1350 	 */
1351 	vd->vdev_stat.vs_read_errors = 0;
1352 	vd->vdev_stat.vs_write_errors = 0;
1353 	vd->vdev_stat.vs_checksum_errors = 0;
1354 
1355 	vdev_reopen(vd->vdev_top, NULL);
1356 
1357 	spa_config_exit(spa);
1358 
1359 	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
1360 
1361 	return (0);
1362 }
1363 
1364 int
1365 vdev_offline(spa_t *spa, const char *path)
1366 {
1367 	vdev_t *vd;
1368 
1369 	spa_config_enter(spa, RW_WRITER);
1370 
1371 	if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
1372 		spa_config_exit(spa);
1373 		return (ENODEV);
1374 	}
1375 
1376 	dprintf("OFFLINE: %s\n", vdev_description(vd));
1377 
1378 	/*
1379 	 * If this device's top-level vdev has a non-empty DTL,
1380 	 * don't allow the device to be offlined.
1381 	 *
1382 	 * XXX -- we should make this more precise by allowing the offline
1383 	 * as long as the remaining devices don't have any DTL holes.
1384 	 */
1385 	if (vd->vdev_top->vdev_dtl_map.sm_space != 0) {
1386 		spa_config_exit(spa);
1387 		return (EBUSY);
1388 	}
1389 
1390 	/*
1391 	 * Set this device to offline state and reopen its top-level vdev.
1392 	 * If this action results in the top-level vdev becoming unusable,
1393 	 * undo it and fail the request.
1394 	 */
1395 	vd->vdev_offline = B_TRUE;
1396 	vdev_reopen(vd->vdev_top, NULL);
1397 	if (vdev_is_dead(vd->vdev_top)) {
1398 		vd->vdev_offline = B_FALSE;
1399 		vdev_reopen(vd->vdev_top, NULL);
1400 		spa_config_exit(spa);
1401 		return (EBUSY);
1402 	}
1403 
1404 	spa_config_exit(spa);
1405 
1406 	return (0);
1407 }
1408 
1409 int
1410 vdev_error_setup(spa_t *spa, const char *path, int mode, int mask, uint64_t arg)
1411 {
1412 	vdev_t *vd;
1413 
1414 	spa_config_enter(spa, RW_WRITER);
1415 
1416 	if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
1417 		spa_config_exit(spa);
1418 		return (ENODEV);
1419 	}
1420 
1421 	vd->vdev_fault_mode = mode;
1422 	vd->vdev_fault_mask = mask;
1423 	vd->vdev_fault_arg = arg;
1424 
1425 	spa_config_exit(spa);
1426 
1427 	return (0);
1428 }
1429 
1430 int
1431 vdev_is_dead(vdev_t *vd)
1432 {
1433 	return (vd->vdev_state <= VDEV_STATE_CANT_OPEN);
1434 }
1435 
1436 int
1437 vdev_error_inject(vdev_t *vd, zio_t *zio)
1438 {
1439 	int error = 0;
1440 
1441 	if (vd->vdev_fault_mode == VDEV_FAULT_NONE)
1442 		return (0);
1443 
1444 	if (((1ULL << zio->io_type) & vd->vdev_fault_mask) == 0)
1445 		return (0);
1446 
1447 	switch (vd->vdev_fault_mode) {
1448 	case VDEV_FAULT_RANDOM:
1449 		if (spa_get_random(vd->vdev_fault_arg) == 0)
1450 			error = EIO;
1451 		break;
1452 
1453 	case VDEV_FAULT_COUNT:
1454 		if ((int64_t)--vd->vdev_fault_arg <= 0)
1455 			vd->vdev_fault_mode = VDEV_FAULT_NONE;
1456 		error = EIO;
1457 		break;
1458 	}
1459 
1460 	if (error != 0) {
1461 		dprintf("returning %d for type %d on %s state %d offset %llx\n",
1462 		    error, zio->io_type, vdev_description(vd),
1463 		    vd->vdev_state, zio->io_offset);
1464 	}
1465 
1466 	return (error);
1467 }
1468 
1469 /*
1470  * Get statistics for the given vdev.
1471  */
1472 void
1473 vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
1474 {
1475 	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
1476 	int c, t;
1477 
1478 	mutex_enter(&vd->vdev_stat_lock);
1479 	bcopy(&vd->vdev_stat, vs, sizeof (*vs));
1480 	vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
1481 	vs->vs_state = vd->vdev_state;
1482 	vs->vs_rsize = vdev_get_rsize(vd);
1483 	mutex_exit(&vd->vdev_stat_lock);
1484 
1485 	/*
1486 	 * If we're getting stats on the root vdev, aggregate the I/O counts
1487 	 * over all top-level vdevs (i.e. the direct children of the root).
1488 	 */
1489 	if (vd == rvd) {
1490 		for (c = 0; c < rvd->vdev_children; c++) {
1491 			vdev_t *cvd = rvd->vdev_child[c];
1492 			vdev_stat_t *cvs = &cvd->vdev_stat;
1493 
1494 			mutex_enter(&vd->vdev_stat_lock);
1495 			for (t = 0; t < ZIO_TYPES; t++) {
1496 				vs->vs_ops[t] += cvs->vs_ops[t];
1497 				vs->vs_bytes[t] += cvs->vs_bytes[t];
1498 			}
1499 			vs->vs_read_errors += cvs->vs_read_errors;
1500 			vs->vs_write_errors += cvs->vs_write_errors;
1501 			vs->vs_checksum_errors += cvs->vs_checksum_errors;
1502 			vs->vs_scrub_examined += cvs->vs_scrub_examined;
1503 			vs->vs_scrub_errors += cvs->vs_scrub_errors;
1504 			mutex_exit(&vd->vdev_stat_lock);
1505 		}
1506 	}
1507 }
1508 
1509 void
1510 vdev_stat_update(zio_t *zio)
1511 {
1512 	vdev_t *vd = zio->io_vd;
1513 	vdev_t *pvd;
1514 	uint64_t txg = zio->io_txg;
1515 	vdev_stat_t *vs = &vd->vdev_stat;
1516 	zio_type_t type = zio->io_type;
1517 	int flags = zio->io_flags;
1518 
1519 	if (zio->io_error == 0) {
1520 		if (!(flags & ZIO_FLAG_IO_BYPASS)) {
1521 			mutex_enter(&vd->vdev_stat_lock);
1522 			vs->vs_ops[type]++;
1523 			vs->vs_bytes[type] += zio->io_size;
1524 			mutex_exit(&vd->vdev_stat_lock);
1525 		}
1526 		if ((flags & ZIO_FLAG_IO_REPAIR) &&
1527 		    zio->io_delegate_list == NULL) {
1528 			mutex_enter(&vd->vdev_stat_lock);
1529 			if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))
1530 				vs->vs_scrub_repaired += zio->io_size;
1531 			else
1532 				vs->vs_self_healed += zio->io_size;
1533 			mutex_exit(&vd->vdev_stat_lock);
1534 		}
1535 		return;
1536 	}
1537 
1538 	if (flags & ZIO_FLAG_SPECULATIVE)
1539 		return;
1540 
1541 	if (!vdev_is_dead(vd)) {
1542 		mutex_enter(&vd->vdev_stat_lock);
1543 		if (type == ZIO_TYPE_READ) {
1544 			if (zio->io_error == ECKSUM)
1545 				vs->vs_checksum_errors++;
1546 			else
1547 				vs->vs_read_errors++;
1548 		}
1549 		if (type == ZIO_TYPE_WRITE)
1550 			vs->vs_write_errors++;
1551 		mutex_exit(&vd->vdev_stat_lock);
1552 	}
1553 
1554 	if (type == ZIO_TYPE_WRITE) {
1555 		if (txg == 0 || vd->vdev_children != 0)
1556 			return;
1557 		if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) {
1558 			ASSERT(flags & ZIO_FLAG_IO_REPAIR);
1559 			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
1560 				vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1);
1561 		}
1562 		if (!(flags & ZIO_FLAG_IO_REPAIR)) {
1563 			vdev_t *tvd = vd->vdev_top;
1564 			if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1))
1565 				return;
1566 			vdev_dirty(tvd, VDD_DTL, txg);
1567 			(void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);
1568 			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
1569 				vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1);
1570 		}
1571 	}
1572 }
1573 
1574 void
1575 vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
1576 {
1577 	int c;
1578 	vdev_stat_t *vs = &vd->vdev_stat;
1579 
1580 	for (c = 0; c < vd->vdev_children; c++)
1581 		vdev_scrub_stat_update(vd->vdev_child[c], type, complete);
1582 
1583 	mutex_enter(&vd->vdev_stat_lock);
1584 
1585 	if (type == POOL_SCRUB_NONE) {
1586 		/*
1587 		 * Update completion and end time.  Leave everything else alone
1588 		 * so we can report what happened during the previous scrub.
1589 		 */
1590 		vs->vs_scrub_complete = complete;
1591 		vs->vs_scrub_end = gethrestime_sec();
1592 	} else {
1593 		vs->vs_scrub_type = type;
1594 		vs->vs_scrub_complete = 0;
1595 		vs->vs_scrub_examined = 0;
1596 		vs->vs_scrub_repaired = 0;
1597 		vs->vs_scrub_errors = 0;
1598 		vs->vs_scrub_start = gethrestime_sec();
1599 		vs->vs_scrub_end = 0;
1600 	}
1601 
1602 	mutex_exit(&vd->vdev_stat_lock);
1603 }
1604 
1605 /*
1606  * Report checksum errors that a vdev that didn't realize it made.
1607  * This can happen, for example, when RAID-Z combinatorial reconstruction
1608  * infers that one of its components returned bad data.
1609  */
1610 void
1611 vdev_checksum_error(zio_t *zio, vdev_t *vd)
1612 {
1613 	dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
1614 	    vdev_description(vd));
1615 
1616 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1617 		mutex_enter(&vd->vdev_stat_lock);
1618 		vd->vdev_stat.vs_checksum_errors++;
1619 		mutex_exit(&vd->vdev_stat_lock);
1620 	}
1621 }
1622 
1623 /*
1624  * Update the in-core space usage stats for this vdev and the root vdev.
1625  */
1626 void
1627 vdev_space_update(vdev_t *vd, uint64_t space_delta, uint64_t alloc_delta)
1628 {
1629 	ASSERT(vd == vd->vdev_top);
1630 
1631 	do {
1632 		mutex_enter(&vd->vdev_stat_lock);
1633 		vd->vdev_stat.vs_space += space_delta;
1634 		vd->vdev_stat.vs_alloc += alloc_delta;
1635 		mutex_exit(&vd->vdev_stat_lock);
1636 	} while ((vd = vd->vdev_parent) != NULL);
1637 }
1638 
1639 /*
1640  * Various knobs to tune a vdev.
1641  */
1642 static vdev_knob_t vdev_knob[] = {
1643 	{
1644 		"cache_size",
1645 		"size of the read-ahead cache",
1646 		0,
1647 		1ULL << 30,
1648 		10ULL << 20,
1649 		offsetof(struct vdev, vdev_cache.vc_size)
1650 	},
1651 	{
1652 		"cache_bshift",
1653 		"log2 of cache blocksize",
1654 		SPA_MINBLOCKSHIFT,
1655 		SPA_MAXBLOCKSHIFT,
1656 		16,
1657 		offsetof(struct vdev, vdev_cache.vc_bshift)
1658 	},
1659 	{
1660 		"cache_max",
1661 		"largest block size to cache",
1662 		0,
1663 		SPA_MAXBLOCKSIZE,
1664 		1ULL << 14,
1665 		offsetof(struct vdev, vdev_cache.vc_max)
1666 	},
1667 	{
1668 		"min_pending",
1669 		"minimum pending I/Os to the disk",
1670 		1,
1671 		10000,
1672 		2,
1673 		offsetof(struct vdev, vdev_queue.vq_min_pending)
1674 	},
1675 	{
1676 		"max_pending",
1677 		"maximum pending I/Os to the disk",
1678 		1,
1679 		10000,
1680 		35,
1681 		offsetof(struct vdev, vdev_queue.vq_max_pending)
1682 	},
1683 	{
1684 		"agg_limit",
1685 		"maximum size of aggregated I/Os",
1686 		0,
1687 		SPA_MAXBLOCKSIZE,
1688 		SPA_MAXBLOCKSIZE,
1689 		offsetof(struct vdev, vdev_queue.vq_agg_limit)
1690 	},
1691 	{
1692 		"time_shift",
1693 		"deadline = pri + (lbolt >> time_shift)",
1694 		0,
1695 		63,
1696 		4,
1697 		offsetof(struct vdev, vdev_queue.vq_time_shift)
1698 	},
1699 	{
1700 		"ramp_rate",
1701 		"exponential I/O issue ramp-up rate",
1702 		1,
1703 		10000,
1704 		2,
1705 		offsetof(struct vdev, vdev_queue.vq_ramp_rate)
1706 	},
1707 };
1708 
1709 vdev_knob_t *
1710 vdev_knob_next(vdev_knob_t *vk)
1711 {
1712 	if (vk == NULL)
1713 		return (vdev_knob);
1714 
1715 	if (++vk == vdev_knob + sizeof (vdev_knob) / sizeof (vdev_knob_t))
1716 		return (NULL);
1717 
1718 	return (vk);
1719 }
1720 
1721 /*
1722  * Mark a top-level vdev's config as dirty, placing it on the dirty list
1723  * so that it will be written out next time the vdev configuration is synced.
1724  * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
1725  */
1726 void
1727 vdev_config_dirty(vdev_t *vd)
1728 {
1729 	spa_t *spa = vd->vdev_spa;
1730 	vdev_t *rvd = spa->spa_root_vdev;
1731 	int c;
1732 
1733 	if (vd == rvd) {
1734 		for (c = 0; c < rvd->vdev_children; c++)
1735 			vdev_config_dirty(rvd->vdev_child[c]);
1736 	} else {
1737 		ASSERT(vd == vd->vdev_top);
1738 
1739 		if (!vd->vdev_is_dirty) {
1740 			list_insert_head(&spa->spa_dirty_list, vd);
1741 			vd->vdev_is_dirty = B_TRUE;
1742 		}
1743 	}
1744 }
1745 
1746 void
1747 vdev_config_clean(vdev_t *vd)
1748 {
1749 	ASSERT(vd->vdev_is_dirty);
1750 
1751 	list_remove(&vd->vdev_spa->spa_dirty_list, vd);
1752 	vd->vdev_is_dirty = B_FALSE;
1753 }
1754 
1755 /*
1756  * Set a vdev's state, updating any parent's state as well.
1757  */
1758 void
1759 vdev_set_state(vdev_t *vd, vdev_state_t state, vdev_aux_t aux)
1760 {
1761 	if (state == vd->vdev_state)
1762 		return;
1763 
1764 	vd->vdev_state = state;
1765 	vd->vdev_stat.vs_aux = aux;
1766 
1767 	if (vd->vdev_parent != NULL) {
1768 		int c;
1769 		int degraded = 0, faulted = 0;
1770 		vdev_t *parent, *child;
1771 
1772 		parent = vd->vdev_parent;
1773 		for (c = 0; c < parent->vdev_children; c++) {
1774 			child = parent->vdev_child[c];
1775 			if (child->vdev_state <= VDEV_STATE_CANT_OPEN)
1776 				faulted++;
1777 			else if (child->vdev_state == VDEV_STATE_DEGRADED)
1778 				degraded++;
1779 		}
1780 
1781 		vd->vdev_parent->vdev_ops->vdev_op_state_change(
1782 		    vd->vdev_parent, faulted, degraded);
1783 	    }
1784 }
1785