xref: /freebsd/sys/geom/raid/g_raid.c (revision c243e4902be8df1e643c76b5f18b68bb77cc5268)
1 /*-
2  * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29 
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/kernel.h>
33 #include <sys/module.h>
34 #include <sys/limits.h>
35 #include <sys/lock.h>
36 #include <sys/mutex.h>
37 #include <sys/bio.h>
38 #include <sys/sbuf.h>
39 #include <sys/sysctl.h>
40 #include <sys/malloc.h>
41 #include <sys/eventhandler.h>
42 #include <vm/uma.h>
43 #include <geom/geom.h>
44 #include <sys/proc.h>
45 #include <sys/kthread.h>
46 #include <sys/sched.h>
47 #include <geom/raid/g_raid.h>
48 #include "g_raid_md_if.h"
49 #include "g_raid_tr_if.h"
50 
51 static MALLOC_DEFINE(M_RAID, "raid_data", "GEOM_RAID Data");
52 
53 SYSCTL_DECL(_kern_geom);
54 SYSCTL_NODE(_kern_geom, OID_AUTO, raid, CTLFLAG_RW, 0, "GEOM_RAID stuff");
55 u_int g_raid_aggressive_spare = 0;
56 TUNABLE_INT("kern.geom.raid.aggressive_spare", &g_raid_aggressive_spare);
57 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, aggressive_spare, CTLFLAG_RW,
58     &g_raid_aggressive_spare, 0, "Use disks without metadata as spare");
59 u_int g_raid_debug = 0;
60 TUNABLE_INT("kern.geom.raid.debug", &g_raid_debug);
61 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, debug, CTLFLAG_RW, &g_raid_debug, 0,
62     "Debug level");
63 int g_raid_read_err_thresh = 10;
64 TUNABLE_INT("kern.geom.raid.read_err_thresh", &g_raid_read_err_thresh);
65 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, read_err_thresh, CTLFLAG_RW,
66     &g_raid_read_err_thresh, 0,
67     "Number of read errors equated to disk failure");
68 u_int g_raid_start_timeout = 30;
69 TUNABLE_INT("kern.geom.raid.start_timeout", &g_raid_start_timeout);
70 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, start_timeout, CTLFLAG_RW,
71     &g_raid_start_timeout, 0,
72     "Time to wait for all array components");
73 static u_int g_raid_clean_time = 5;
74 TUNABLE_INT("kern.geom.raid.clean_time", &g_raid_clean_time);
75 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, clean_time, CTLFLAG_RW,
76     &g_raid_clean_time, 0, "Mark volume as clean when idling");
77 static u_int g_raid_disconnect_on_failure = 1;
78 TUNABLE_INT("kern.geom.raid.disconnect_on_failure",
79     &g_raid_disconnect_on_failure);
80 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, disconnect_on_failure, CTLFLAG_RW,
81     &g_raid_disconnect_on_failure, 0, "Disconnect component on I/O failure.");
82 static u_int g_raid_name_format = 0;
83 TUNABLE_INT("kern.geom.raid.name_format", &g_raid_name_format);
84 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, name_format, CTLFLAG_RW,
85     &g_raid_name_format, 0, "Providers name format.");
86 static u_int g_raid_idle_threshold = 1000000;
87 TUNABLE_INT("kern.geom.raid.idle_threshold", &g_raid_idle_threshold);
88 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, idle_threshold, CTLFLAG_RW,
89     &g_raid_idle_threshold, 1000000,
90     "Time in microseconds to consider a volume idle.");
91 
/*
 * msleep() wrapper that logs the sleep channel at debug level 4 right
 * before going to sleep and again right after waking up.  The value
 * returned by msleep() is stored in "rv".
 */
#define	MSLEEP(rv, ident, mtx, priority, wmesg, timeout)		\
do {									\
	G_RAID_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));	\
	(rv) = msleep((ident), (mtx), (priority), (wmesg), (timeout));	\
	G_RAID_DEBUG(4, "%s: Woken up %p.", __func__, (ident));	\
} while (0)
97 
98 LIST_HEAD(, g_raid_md_class) g_raid_md_classes =
99     LIST_HEAD_INITIALIZER(g_raid_md_classes);
100 
101 LIST_HEAD(, g_raid_tr_class) g_raid_tr_classes =
102     LIST_HEAD_INITIALIZER(g_raid_tr_classes);
103 
104 LIST_HEAD(, g_raid_volume) g_raid_volumes =
105     LIST_HEAD_INITIALIZER(g_raid_volumes);
106 
107 static eventhandler_tag g_raid_pre_sync = NULL;
108 static int g_raid_started = 0;
109 
110 static int g_raid_destroy_geom(struct gctl_req *req, struct g_class *mp,
111     struct g_geom *gp);
112 static g_taste_t g_raid_taste;
113 static void g_raid_init(struct g_class *mp);
114 static void g_raid_fini(struct g_class *mp);
115 
116 struct g_class g_raid_class = {
117 	.name = G_RAID_CLASS_NAME,
118 	.version = G_VERSION,
119 	.ctlreq = g_raid_ctl,
120 	.taste = g_raid_taste,
121 	.destroy_geom = g_raid_destroy_geom,
122 	.init = g_raid_init,
123 	.fini = g_raid_fini
124 };
125 
126 static void g_raid_destroy_provider(struct g_raid_volume *vol);
127 static int g_raid_update_disk(struct g_raid_disk *disk, u_int event);
128 static int g_raid_update_subdisk(struct g_raid_subdisk *subdisk, u_int event);
129 static int g_raid_update_volume(struct g_raid_volume *vol, u_int event);
130 static int g_raid_update_node(struct g_raid_softc *sc, u_int event);
131 static void g_raid_dumpconf(struct sbuf *sb, const char *indent,
132     struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
133 static void g_raid_start(struct bio *bp);
134 static void g_raid_start_request(struct bio *bp);
135 static void g_raid_disk_done(struct bio *bp);
136 static void g_raid_poll(struct g_raid_softc *sc);
137 
138 static const char *
139 g_raid_node_event2str(int event)
140 {
141 
142 	switch (event) {
143 	case G_RAID_NODE_E_WAKE:
144 		return ("WAKE");
145 	case G_RAID_NODE_E_START:
146 		return ("START");
147 	default:
148 		return ("INVALID");
149 	}
150 }
151 
152 const char *
153 g_raid_disk_state2str(int state)
154 {
155 
156 	switch (state) {
157 	case G_RAID_DISK_S_NONE:
158 		return ("NONE");
159 	case G_RAID_DISK_S_OFFLINE:
160 		return ("OFFLINE");
161 	case G_RAID_DISK_S_FAILED:
162 		return ("FAILED");
163 	case G_RAID_DISK_S_STALE_FAILED:
164 		return ("STALE_FAILED");
165 	case G_RAID_DISK_S_SPARE:
166 		return ("SPARE");
167 	case G_RAID_DISK_S_STALE:
168 		return ("STALE");
169 	case G_RAID_DISK_S_ACTIVE:
170 		return ("ACTIVE");
171 	default:
172 		return ("INVALID");
173 	}
174 }
175 
176 static const char *
177 g_raid_disk_event2str(int event)
178 {
179 
180 	switch (event) {
181 	case G_RAID_DISK_E_DISCONNECTED:
182 		return ("DISCONNECTED");
183 	default:
184 		return ("INVALID");
185 	}
186 }
187 
188 const char *
189 g_raid_subdisk_state2str(int state)
190 {
191 
192 	switch (state) {
193 	case G_RAID_SUBDISK_S_NONE:
194 		return ("NONE");
195 	case G_RAID_SUBDISK_S_FAILED:
196 		return ("FAILED");
197 	case G_RAID_SUBDISK_S_NEW:
198 		return ("NEW");
199 	case G_RAID_SUBDISK_S_REBUILD:
200 		return ("REBUILD");
201 	case G_RAID_SUBDISK_S_UNINITIALIZED:
202 		return ("UNINITIALIZED");
203 	case G_RAID_SUBDISK_S_STALE:
204 		return ("STALE");
205 	case G_RAID_SUBDISK_S_RESYNC:
206 		return ("RESYNC");
207 	case G_RAID_SUBDISK_S_ACTIVE:
208 		return ("ACTIVE");
209 	default:
210 		return ("INVALID");
211 	}
212 }
213 
214 static const char *
215 g_raid_subdisk_event2str(int event)
216 {
217 
218 	switch (event) {
219 	case G_RAID_SUBDISK_E_NEW:
220 		return ("NEW");
221 	case G_RAID_SUBDISK_E_FAILED:
222 		return ("FAILED");
223 	case G_RAID_SUBDISK_E_DISCONNECTED:
224 		return ("DISCONNECTED");
225 	default:
226 		return ("INVALID");
227 	}
228 }
229 
230 const char *
231 g_raid_volume_state2str(int state)
232 {
233 
234 	switch (state) {
235 	case G_RAID_VOLUME_S_STARTING:
236 		return ("STARTING");
237 	case G_RAID_VOLUME_S_BROKEN:
238 		return ("BROKEN");
239 	case G_RAID_VOLUME_S_DEGRADED:
240 		return ("DEGRADED");
241 	case G_RAID_VOLUME_S_SUBOPTIMAL:
242 		return ("SUBOPTIMAL");
243 	case G_RAID_VOLUME_S_OPTIMAL:
244 		return ("OPTIMAL");
245 	case G_RAID_VOLUME_S_UNSUPPORTED:
246 		return ("UNSUPPORTED");
247 	case G_RAID_VOLUME_S_STOPPED:
248 		return ("STOPPED");
249 	default:
250 		return ("INVALID");
251 	}
252 }
253 
254 static const char *
255 g_raid_volume_event2str(int event)
256 {
257 
258 	switch (event) {
259 	case G_RAID_VOLUME_E_UP:
260 		return ("UP");
261 	case G_RAID_VOLUME_E_DOWN:
262 		return ("DOWN");
263 	case G_RAID_VOLUME_E_START:
264 		return ("START");
265 	case G_RAID_VOLUME_E_STARTMD:
266 		return ("STARTMD");
267 	default:
268 		return ("INVALID");
269 	}
270 }
271 
272 const char *
273 g_raid_volume_level2str(int level, int qual)
274 {
275 
276 	switch (level) {
277 	case G_RAID_VOLUME_RL_RAID0:
278 		return ("RAID0");
279 	case G_RAID_VOLUME_RL_RAID1:
280 		return ("RAID1");
281 	case G_RAID_VOLUME_RL_RAID3:
282 		if (qual == G_RAID_VOLUME_RLQ_R3P0)
283 			return ("RAID3-P0");
284 		if (qual == G_RAID_VOLUME_RLQ_R3PN)
285 			return ("RAID3-PN");
286 		return ("RAID3");
287 	case G_RAID_VOLUME_RL_RAID4:
288 		if (qual == G_RAID_VOLUME_RLQ_R4P0)
289 			return ("RAID4-P0");
290 		if (qual == G_RAID_VOLUME_RLQ_R4PN)
291 			return ("RAID4-PN");
292 		return ("RAID4");
293 	case G_RAID_VOLUME_RL_RAID5:
294 		if (qual == G_RAID_VOLUME_RLQ_R5RA)
295 			return ("RAID5-RA");
296 		if (qual == G_RAID_VOLUME_RLQ_R5RS)
297 			return ("RAID5-RS");
298 		if (qual == G_RAID_VOLUME_RLQ_R5LA)
299 			return ("RAID5-LA");
300 		if (qual == G_RAID_VOLUME_RLQ_R5LS)
301 			return ("RAID5-LS");
302 		return ("RAID5");
303 	case G_RAID_VOLUME_RL_RAID6:
304 		if (qual == G_RAID_VOLUME_RLQ_R6RA)
305 			return ("RAID6-RA");
306 		if (qual == G_RAID_VOLUME_RLQ_R6RS)
307 			return ("RAID6-RS");
308 		if (qual == G_RAID_VOLUME_RLQ_R6LA)
309 			return ("RAID6-LA");
310 		if (qual == G_RAID_VOLUME_RLQ_R6LS)
311 			return ("RAID6-LS");
312 		return ("RAID6");
313 	case G_RAID_VOLUME_RL_RAIDMDF:
314 		if (qual == G_RAID_VOLUME_RLQ_RMDFRA)
315 			return ("RAIDMDF-RA");
316 		if (qual == G_RAID_VOLUME_RLQ_RMDFRS)
317 			return ("RAIDMDF-RS");
318 		if (qual == G_RAID_VOLUME_RLQ_RMDFLA)
319 			return ("RAIDMDF-LA");
320 		if (qual == G_RAID_VOLUME_RLQ_RMDFLS)
321 			return ("RAIDMDF-LS");
322 		return ("RAIDMDF");
323 	case G_RAID_VOLUME_RL_RAID1E:
324 		if (qual == G_RAID_VOLUME_RLQ_R1EA)
325 			return ("RAID1E-A");
326 		if (qual == G_RAID_VOLUME_RLQ_R1EO)
327 			return ("RAID1E-O");
328 		return ("RAID1E");
329 	case G_RAID_VOLUME_RL_SINGLE:
330 		return ("SINGLE");
331 	case G_RAID_VOLUME_RL_CONCAT:
332 		return ("CONCAT");
333 	case G_RAID_VOLUME_RL_RAID5E:
334 		if (qual == G_RAID_VOLUME_RLQ_R5ERA)
335 			return ("RAID5E-RA");
336 		if (qual == G_RAID_VOLUME_RLQ_R5ERS)
337 			return ("RAID5E-RS");
338 		if (qual == G_RAID_VOLUME_RLQ_R5ELA)
339 			return ("RAID5E-LA");
340 		if (qual == G_RAID_VOLUME_RLQ_R5ELS)
341 			return ("RAID5E-LS");
342 		return ("RAID5E");
343 	case G_RAID_VOLUME_RL_RAID5EE:
344 		if (qual == G_RAID_VOLUME_RLQ_R5EERA)
345 			return ("RAID5EE-RA");
346 		if (qual == G_RAID_VOLUME_RLQ_R5EERS)
347 			return ("RAID5EE-RS");
348 		if (qual == G_RAID_VOLUME_RLQ_R5EELA)
349 			return ("RAID5EE-LA");
350 		if (qual == G_RAID_VOLUME_RLQ_R5EELS)
351 			return ("RAID5EE-LS");
352 		return ("RAID5EE");
353 	case G_RAID_VOLUME_RL_RAID5R:
354 		if (qual == G_RAID_VOLUME_RLQ_R5RRA)
355 			return ("RAID5R-RA");
356 		if (qual == G_RAID_VOLUME_RLQ_R5RRS)
357 			return ("RAID5R-RS");
358 		if (qual == G_RAID_VOLUME_RLQ_R5RLA)
359 			return ("RAID5R-LA");
360 		if (qual == G_RAID_VOLUME_RLQ_R5RLS)
361 			return ("RAID5R-LS");
362 		return ("RAID5E");
363 	default:
364 		return ("UNKNOWN");
365 	}
366 }
367 
368 int
369 g_raid_volume_str2level(const char *str, int *level, int *qual)
370 {
371 
372 	*level = G_RAID_VOLUME_RL_UNKNOWN;
373 	*qual = G_RAID_VOLUME_RLQ_NONE;
374 	if (strcasecmp(str, "RAID0") == 0)
375 		*level = G_RAID_VOLUME_RL_RAID0;
376 	else if (strcasecmp(str, "RAID1") == 0)
377 		*level = G_RAID_VOLUME_RL_RAID1;
378 	else if (strcasecmp(str, "RAID3-P0") == 0) {
379 		*level = G_RAID_VOLUME_RL_RAID3;
380 		*qual = G_RAID_VOLUME_RLQ_R3P0;
381 	} else if (strcasecmp(str, "RAID3-PN") == 0 ||
382 		   strcasecmp(str, "RAID3") == 0) {
383 		*level = G_RAID_VOLUME_RL_RAID3;
384 		*qual = G_RAID_VOLUME_RLQ_R3PN;
385 	} else if (strcasecmp(str, "RAID4-P0") == 0) {
386 		*level = G_RAID_VOLUME_RL_RAID4;
387 		*qual = G_RAID_VOLUME_RLQ_R4P0;
388 	} else if (strcasecmp(str, "RAID4-PN") == 0 ||
389 		   strcasecmp(str, "RAID4") == 0) {
390 		*level = G_RAID_VOLUME_RL_RAID4;
391 		*qual = G_RAID_VOLUME_RLQ_R4PN;
392 	} else if (strcasecmp(str, "RAID5-RA") == 0) {
393 		*level = G_RAID_VOLUME_RL_RAID5;
394 		*qual = G_RAID_VOLUME_RLQ_R5RA;
395 	} else if (strcasecmp(str, "RAID5-RS") == 0) {
396 		*level = G_RAID_VOLUME_RL_RAID5;
397 		*qual = G_RAID_VOLUME_RLQ_R5RS;
398 	} else if (strcasecmp(str, "RAID5") == 0 ||
399 		   strcasecmp(str, "RAID5-LA") == 0) {
400 		*level = G_RAID_VOLUME_RL_RAID5;
401 		*qual = G_RAID_VOLUME_RLQ_R5LA;
402 	} else if (strcasecmp(str, "RAID5-LS") == 0) {
403 		*level = G_RAID_VOLUME_RL_RAID5;
404 		*qual = G_RAID_VOLUME_RLQ_R5LS;
405 	} else if (strcasecmp(str, "RAID6-RA") == 0) {
406 		*level = G_RAID_VOLUME_RL_RAID6;
407 		*qual = G_RAID_VOLUME_RLQ_R6RA;
408 	} else if (strcasecmp(str, "RAID6-RS") == 0) {
409 		*level = G_RAID_VOLUME_RL_RAID6;
410 		*qual = G_RAID_VOLUME_RLQ_R6RS;
411 	} else if (strcasecmp(str, "RAID6") == 0 ||
412 		   strcasecmp(str, "RAID6-LA") == 0) {
413 		*level = G_RAID_VOLUME_RL_RAID6;
414 		*qual = G_RAID_VOLUME_RLQ_R6LA;
415 	} else if (strcasecmp(str, "RAID6-LS") == 0) {
416 		*level = G_RAID_VOLUME_RL_RAID6;
417 		*qual = G_RAID_VOLUME_RLQ_R6LS;
418 	} else if (strcasecmp(str, "RAIDMDF-RA") == 0) {
419 		*level = G_RAID_VOLUME_RL_RAIDMDF;
420 		*qual = G_RAID_VOLUME_RLQ_RMDFRA;
421 	} else if (strcasecmp(str, "RAIDMDF-RS") == 0) {
422 		*level = G_RAID_VOLUME_RL_RAIDMDF;
423 		*qual = G_RAID_VOLUME_RLQ_RMDFRS;
424 	} else if (strcasecmp(str, "RAIDMDF") == 0 ||
425 		   strcasecmp(str, "RAIDMDF-LA") == 0) {
426 		*level = G_RAID_VOLUME_RL_RAIDMDF;
427 		*qual = G_RAID_VOLUME_RLQ_RMDFLA;
428 	} else if (strcasecmp(str, "RAIDMDF-LS") == 0) {
429 		*level = G_RAID_VOLUME_RL_RAIDMDF;
430 		*qual = G_RAID_VOLUME_RLQ_RMDFLS;
431 	} else if (strcasecmp(str, "RAID10") == 0 ||
432 		   strcasecmp(str, "RAID1E") == 0 ||
433 		   strcasecmp(str, "RAID1E-A") == 0) {
434 		*level = G_RAID_VOLUME_RL_RAID1E;
435 		*qual = G_RAID_VOLUME_RLQ_R1EA;
436 	} else if (strcasecmp(str, "RAID1E-O") == 0) {
437 		*level = G_RAID_VOLUME_RL_RAID1E;
438 		*qual = G_RAID_VOLUME_RLQ_R1EO;
439 	} else if (strcasecmp(str, "SINGLE") == 0)
440 		*level = G_RAID_VOLUME_RL_SINGLE;
441 	else if (strcasecmp(str, "CONCAT") == 0)
442 		*level = G_RAID_VOLUME_RL_CONCAT;
443 	else if (strcasecmp(str, "RAID5E-RA") == 0) {
444 		*level = G_RAID_VOLUME_RL_RAID5E;
445 		*qual = G_RAID_VOLUME_RLQ_R5ERA;
446 	} else if (strcasecmp(str, "RAID5E-RS") == 0) {
447 		*level = G_RAID_VOLUME_RL_RAID5E;
448 		*qual = G_RAID_VOLUME_RLQ_R5ERS;
449 	} else if (strcasecmp(str, "RAID5E") == 0 ||
450 		   strcasecmp(str, "RAID5E-LA") == 0) {
451 		*level = G_RAID_VOLUME_RL_RAID5E;
452 		*qual = G_RAID_VOLUME_RLQ_R5ELA;
453 	} else if (strcasecmp(str, "RAID5E-LS") == 0) {
454 		*level = G_RAID_VOLUME_RL_RAID5E;
455 		*qual = G_RAID_VOLUME_RLQ_R5ELS;
456 	} else if (strcasecmp(str, "RAID5EE-RA") == 0) {
457 		*level = G_RAID_VOLUME_RL_RAID5EE;
458 		*qual = G_RAID_VOLUME_RLQ_R5EERA;
459 	} else if (strcasecmp(str, "RAID5EE-RS") == 0) {
460 		*level = G_RAID_VOLUME_RL_RAID5EE;
461 		*qual = G_RAID_VOLUME_RLQ_R5EERS;
462 	} else if (strcasecmp(str, "RAID5EE") == 0 ||
463 		   strcasecmp(str, "RAID5EE-LA") == 0) {
464 		*level = G_RAID_VOLUME_RL_RAID5EE;
465 		*qual = G_RAID_VOLUME_RLQ_R5EELA;
466 	} else if (strcasecmp(str, "RAID5EE-LS") == 0) {
467 		*level = G_RAID_VOLUME_RL_RAID5EE;
468 		*qual = G_RAID_VOLUME_RLQ_R5EELS;
469 	} else if (strcasecmp(str, "RAID5R-RA") == 0) {
470 		*level = G_RAID_VOLUME_RL_RAID5R;
471 		*qual = G_RAID_VOLUME_RLQ_R5RRA;
472 	} else if (strcasecmp(str, "RAID5R-RS") == 0) {
473 		*level = G_RAID_VOLUME_RL_RAID5R;
474 		*qual = G_RAID_VOLUME_RLQ_R5RRS;
475 	} else if (strcasecmp(str, "RAID5R") == 0 ||
476 		   strcasecmp(str, "RAID5R-LA") == 0) {
477 		*level = G_RAID_VOLUME_RL_RAID5R;
478 		*qual = G_RAID_VOLUME_RLQ_R5RLA;
479 	} else if (strcasecmp(str, "RAID5R-LS") == 0) {
480 		*level = G_RAID_VOLUME_RL_RAID5R;
481 		*qual = G_RAID_VOLUME_RLQ_R5RLS;
482 	} else
483 		return (-1);
484 	return (0);
485 }
486 
487 const char *
488 g_raid_get_diskname(struct g_raid_disk *disk)
489 {
490 
491 	if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
492 		return ("[unknown]");
493 	return (disk->d_consumer->provider->name);
494 }
495 
496 void
497 g_raid_report_disk_state(struct g_raid_disk *disk)
498 {
499 	struct g_raid_subdisk *sd;
500 	int len, state;
501 	uint32_t s;
502 
503 	if (disk->d_consumer == NULL)
504 		return;
505 	if (disk->d_state == G_RAID_DISK_S_FAILED ||
506 	    disk->d_state == G_RAID_DISK_S_STALE_FAILED) {
507 		s = G_STATE_FAILED;
508 	} else {
509 		state = G_RAID_SUBDISK_S_ACTIVE;
510 		TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
511 			if (sd->sd_state < state)
512 				state = sd->sd_state;
513 		}
514 		if (state == G_RAID_SUBDISK_S_FAILED)
515 			s = G_STATE_FAILED;
516 		else if (state == G_RAID_SUBDISK_S_NEW ||
517 		    state == G_RAID_SUBDISK_S_REBUILD)
518 			s = G_STATE_REBUILD;
519 		else if (state == G_RAID_SUBDISK_S_STALE ||
520 		    state == G_RAID_SUBDISK_S_RESYNC)
521 			s = G_STATE_RESYNC;
522 		else
523 			s = G_STATE_ACTIVE;
524 	}
525 	len = sizeof(s);
526 	g_io_getattr("GEOM::setstate", disk->d_consumer, &len, &s);
527 	G_RAID_DEBUG1(2, disk->d_softc, "Disk %s state reported as %d.",
528 	    g_raid_get_diskname(disk), s);
529 }
530 
531 void
532 g_raid_change_disk_state(struct g_raid_disk *disk, int state)
533 {
534 
535 	G_RAID_DEBUG1(0, disk->d_softc, "Disk %s state changed from %s to %s.",
536 	    g_raid_get_diskname(disk),
537 	    g_raid_disk_state2str(disk->d_state),
538 	    g_raid_disk_state2str(state));
539 	disk->d_state = state;
540 	g_raid_report_disk_state(disk);
541 }
542 
543 void
544 g_raid_change_subdisk_state(struct g_raid_subdisk *sd, int state)
545 {
546 
547 	G_RAID_DEBUG1(0, sd->sd_softc,
548 	    "Subdisk %s:%d-%s state changed from %s to %s.",
549 	    sd->sd_volume->v_name, sd->sd_pos,
550 	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
551 	    g_raid_subdisk_state2str(sd->sd_state),
552 	    g_raid_subdisk_state2str(state));
553 	sd->sd_state = state;
554 	if (sd->sd_disk)
555 		g_raid_report_disk_state(sd->sd_disk);
556 }
557 
558 void
559 g_raid_change_volume_state(struct g_raid_volume *vol, int state)
560 {
561 
562 	G_RAID_DEBUG1(0, vol->v_softc,
563 	    "Volume %s state changed from %s to %s.",
564 	    vol->v_name,
565 	    g_raid_volume_state2str(vol->v_state),
566 	    g_raid_volume_state2str(state));
567 	vol->v_state = state;
568 }
569 
/*
 * --- Event handling functions ---
 * Events in geom_raid are used to maintain the status of subdisks and
 * volumes from a single thread, which simplifies locking.
 */
575 static void
576 g_raid_event_free(struct g_raid_event *ep)
577 {
578 
579 	free(ep, M_RAID);
580 }
581 
582 int
583 g_raid_event_send(void *arg, int event, int flags)
584 {
585 	struct g_raid_softc *sc;
586 	struct g_raid_event *ep;
587 	int error;
588 
589 	if ((flags & G_RAID_EVENT_VOLUME) != 0) {
590 		sc = ((struct g_raid_volume *)arg)->v_softc;
591 	} else if ((flags & G_RAID_EVENT_DISK) != 0) {
592 		sc = ((struct g_raid_disk *)arg)->d_softc;
593 	} else if ((flags & G_RAID_EVENT_SUBDISK) != 0) {
594 		sc = ((struct g_raid_subdisk *)arg)->sd_softc;
595 	} else {
596 		sc = arg;
597 	}
598 	ep = malloc(sizeof(*ep), M_RAID,
599 	    sx_xlocked(&sc->sc_lock) ? M_WAITOK : M_NOWAIT);
600 	if (ep == NULL)
601 		return (ENOMEM);
602 	ep->e_tgt = arg;
603 	ep->e_event = event;
604 	ep->e_flags = flags;
605 	ep->e_error = 0;
606 	G_RAID_DEBUG1(4, sc, "Sending event %p. Waking up %p.", ep, sc);
607 	mtx_lock(&sc->sc_queue_mtx);
608 	TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
609 	mtx_unlock(&sc->sc_queue_mtx);
610 	wakeup(sc);
611 
612 	if ((flags & G_RAID_EVENT_WAIT) == 0)
613 		return (0);
614 
615 	sx_assert(&sc->sc_lock, SX_XLOCKED);
616 	G_RAID_DEBUG1(4, sc, "Sleeping on %p.", ep);
617 	sx_xunlock(&sc->sc_lock);
618 	while ((ep->e_flags & G_RAID_EVENT_DONE) == 0) {
619 		mtx_lock(&sc->sc_queue_mtx);
620 		MSLEEP(error, ep, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:event",
621 		    hz * 5);
622 	}
623 	error = ep->e_error;
624 	g_raid_event_free(ep);
625 	sx_xlock(&sc->sc_lock);
626 	return (error);
627 }
628 
629 static void
630 g_raid_event_cancel(struct g_raid_softc *sc, void *tgt)
631 {
632 	struct g_raid_event *ep, *tmpep;
633 
634 	sx_assert(&sc->sc_lock, SX_XLOCKED);
635 
636 	mtx_lock(&sc->sc_queue_mtx);
637 	TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
638 		if (ep->e_tgt != tgt)
639 			continue;
640 		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
641 		if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0)
642 			g_raid_event_free(ep);
643 		else {
644 			ep->e_error = ECANCELED;
645 			wakeup(ep);
646 		}
647 	}
648 	mtx_unlock(&sc->sc_queue_mtx);
649 }
650 
651 static int
652 g_raid_event_check(struct g_raid_softc *sc, void *tgt)
653 {
654 	struct g_raid_event *ep;
655 	int	res = 0;
656 
657 	sx_assert(&sc->sc_lock, SX_XLOCKED);
658 
659 	mtx_lock(&sc->sc_queue_mtx);
660 	TAILQ_FOREACH(ep, &sc->sc_events, e_next) {
661 		if (ep->e_tgt != tgt)
662 			continue;
663 		res = 1;
664 		break;
665 	}
666 	mtx_unlock(&sc->sc_queue_mtx);
667 	return (res);
668 }
669 
670 /*
671  * Return the number of disks in given state.
672  * If state is equal to -1, count all connected disks.
673  */
674 u_int
675 g_raid_ndisks(struct g_raid_softc *sc, int state)
676 {
677 	struct g_raid_disk *disk;
678 	u_int n;
679 
680 	sx_assert(&sc->sc_lock, SX_LOCKED);
681 
682 	n = 0;
683 	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
684 		if (disk->d_state == state || state == -1)
685 			n++;
686 	}
687 	return (n);
688 }
689 
690 /*
691  * Return the number of subdisks in given state.
692  * If state is equal to -1, count all connected disks.
693  */
694 u_int
695 g_raid_nsubdisks(struct g_raid_volume *vol, int state)
696 {
697 	struct g_raid_subdisk *subdisk;
698 	struct g_raid_softc *sc;
699 	u_int i, n ;
700 
701 	sc = vol->v_softc;
702 	sx_assert(&sc->sc_lock, SX_LOCKED);
703 
704 	n = 0;
705 	for (i = 0; i < vol->v_disks_count; i++) {
706 		subdisk = &vol->v_subdisks[i];
707 		if ((state == -1 &&
708 		     subdisk->sd_state != G_RAID_SUBDISK_S_NONE) ||
709 		    subdisk->sd_state == state)
710 			n++;
711 	}
712 	return (n);
713 }
714 
715 /*
716  * Return the first subdisk in given state.
717  * If state is equal to -1, then the first connected disks.
718  */
719 struct g_raid_subdisk *
720 g_raid_get_subdisk(struct g_raid_volume *vol, int state)
721 {
722 	struct g_raid_subdisk *sd;
723 	struct g_raid_softc *sc;
724 	u_int i;
725 
726 	sc = vol->v_softc;
727 	sx_assert(&sc->sc_lock, SX_LOCKED);
728 
729 	for (i = 0; i < vol->v_disks_count; i++) {
730 		sd = &vol->v_subdisks[i];
731 		if ((state == -1 &&
732 		     sd->sd_state != G_RAID_SUBDISK_S_NONE) ||
733 		    sd->sd_state == state)
734 			return (sd);
735 	}
736 	return (NULL);
737 }
738 
739 struct g_consumer *
740 g_raid_open_consumer(struct g_raid_softc *sc, const char *name)
741 {
742 	struct g_consumer *cp;
743 	struct g_provider *pp;
744 
745 	g_topology_assert();
746 
747 	if (strncmp(name, "/dev/", 5) == 0)
748 		name += 5;
749 	pp = g_provider_by_name(name);
750 	if (pp == NULL)
751 		return (NULL);
752 	cp = g_new_consumer(sc->sc_geom);
753 	if (g_attach(cp, pp) != 0) {
754 		g_destroy_consumer(cp);
755 		return (NULL);
756 	}
757 	if (g_access(cp, 1, 1, 1) != 0) {
758 		g_detach(cp);
759 		g_destroy_consumer(cp);
760 		return (NULL);
761 	}
762 	return (cp);
763 }
764 
765 static u_int
766 g_raid_nrequests(struct g_raid_softc *sc, struct g_consumer *cp)
767 {
768 	struct bio *bp;
769 	u_int nreqs = 0;
770 
771 	mtx_lock(&sc->sc_queue_mtx);
772 	TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
773 		if (bp->bio_from == cp)
774 			nreqs++;
775 	}
776 	mtx_unlock(&sc->sc_queue_mtx);
777 	return (nreqs);
778 }
779 
780 u_int
781 g_raid_nopens(struct g_raid_softc *sc)
782 {
783 	struct g_raid_volume *vol;
784 	u_int opens;
785 
786 	opens = 0;
787 	TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
788 		if (vol->v_provider_open != 0)
789 			opens++;
790 	}
791 	return (opens);
792 }
793 
794 static int
795 g_raid_consumer_is_busy(struct g_raid_softc *sc, struct g_consumer *cp)
796 {
797 
798 	if (cp->index > 0) {
799 		G_RAID_DEBUG1(2, sc,
800 		    "I/O requests for %s exist, can't destroy it now.",
801 		    cp->provider->name);
802 		return (1);
803 	}
804 	if (g_raid_nrequests(sc, cp) > 0) {
805 		G_RAID_DEBUG1(2, sc,
806 		    "I/O requests for %s in queue, can't destroy it now.",
807 		    cp->provider->name);
808 		return (1);
809 	}
810 	return (0);
811 }
812 
813 static void
814 g_raid_destroy_consumer(void *arg, int flags __unused)
815 {
816 	struct g_consumer *cp;
817 
818 	g_topology_assert();
819 
820 	cp = arg;
821 	G_RAID_DEBUG(1, "Consumer %s destroyed.", cp->provider->name);
822 	g_detach(cp);
823 	g_destroy_consumer(cp);
824 }
825 
826 void
827 g_raid_kill_consumer(struct g_raid_softc *sc, struct g_consumer *cp)
828 {
829 	struct g_provider *pp;
830 	int retaste_wait;
831 
832 	g_topology_assert_not();
833 
834 	g_topology_lock();
835 	cp->private = NULL;
836 	if (g_raid_consumer_is_busy(sc, cp))
837 		goto out;
838 	pp = cp->provider;
839 	retaste_wait = 0;
840 	if (cp->acw == 1) {
841 		if ((pp->geom->flags & G_GEOM_WITHER) == 0)
842 			retaste_wait = 1;
843 	}
844 	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
845 		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
846 	if (retaste_wait) {
847 		/*
848 		 * After retaste event was send (inside g_access()), we can send
849 		 * event to detach and destroy consumer.
850 		 * A class, which has consumer to the given provider connected
851 		 * will not receive retaste event for the provider.
852 		 * This is the way how I ignore retaste events when I close
853 		 * consumers opened for write: I detach and destroy consumer
854 		 * after retaste event is sent.
855 		 */
856 		g_post_event(g_raid_destroy_consumer, cp, M_WAITOK, NULL);
857 		goto out;
858 	}
859 	G_RAID_DEBUG(1, "Consumer %s destroyed.", pp->name);
860 	g_detach(cp);
861 	g_destroy_consumer(cp);
862 out:
863 	g_topology_unlock();
864 }
865 
866 static void
867 g_raid_orphan(struct g_consumer *cp)
868 {
869 	struct g_raid_disk *disk;
870 
871 	g_topology_assert();
872 
873 	disk = cp->private;
874 	if (disk == NULL)
875 		return;
876 	g_raid_event_send(disk, G_RAID_DISK_E_DISCONNECTED,
877 	    G_RAID_EVENT_DISK);
878 }
879 
880 static int
881 g_raid_clean(struct g_raid_volume *vol, int acw)
882 {
883 	struct g_raid_softc *sc;
884 	int timeout;
885 
886 	sc = vol->v_softc;
887 	g_topology_assert_not();
888 	sx_assert(&sc->sc_lock, SX_XLOCKED);
889 
890 //	if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0)
891 //		return (0);
892 	if (!vol->v_dirty)
893 		return (0);
894 	if (vol->v_writes > 0)
895 		return (0);
896 	if (acw > 0 || (acw == -1 &&
897 	    vol->v_provider != NULL && vol->v_provider->acw > 0)) {
898 		timeout = g_raid_clean_time - (time_uptime - vol->v_last_write);
899 		if (timeout > 0)
900 			return (timeout);
901 	}
902 	vol->v_dirty = 0;
903 	G_RAID_DEBUG1(1, sc, "Volume %s marked as clean.",
904 	    vol->v_name);
905 	g_raid_write_metadata(sc, vol, NULL, NULL);
906 	return (0);
907 }
908 
909 static void
910 g_raid_dirty(struct g_raid_volume *vol)
911 {
912 	struct g_raid_softc *sc;
913 
914 	sc = vol->v_softc;
915 	g_topology_assert_not();
916 	sx_assert(&sc->sc_lock, SX_XLOCKED);
917 
918 //	if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0)
919 //		return;
920 	vol->v_dirty = 1;
921 	G_RAID_DEBUG1(1, sc, "Volume %s marked as dirty.",
922 	    vol->v_name);
923 	g_raid_write_metadata(sc, vol, NULL, NULL);
924 }
925 
926 void
927 g_raid_tr_flush_common(struct g_raid_tr_object *tr, struct bio *bp)
928 {
929 	struct g_raid_softc *sc;
930 	struct g_raid_volume *vol;
931 	struct g_raid_subdisk *sd;
932 	struct bio_queue_head queue;
933 	struct bio *cbp;
934 	int i;
935 
936 	vol = tr->tro_volume;
937 	sc = vol->v_softc;
938 
939 	/*
940 	 * Allocate all bios before sending any request, so we can return
941 	 * ENOMEM in nice and clean way.
942 	 */
943 	bioq_init(&queue);
944 	for (i = 0; i < vol->v_disks_count; i++) {
945 		sd = &vol->v_subdisks[i];
946 		if (sd->sd_state == G_RAID_SUBDISK_S_NONE ||
947 		    sd->sd_state == G_RAID_SUBDISK_S_FAILED)
948 			continue;
949 		cbp = g_clone_bio(bp);
950 		if (cbp == NULL)
951 			goto failure;
952 		cbp->bio_caller1 = sd;
953 		bioq_insert_tail(&queue, cbp);
954 	}
955 	for (cbp = bioq_first(&queue); cbp != NULL;
956 	    cbp = bioq_first(&queue)) {
957 		bioq_remove(&queue, cbp);
958 		sd = cbp->bio_caller1;
959 		cbp->bio_caller1 = NULL;
960 		g_raid_subdisk_iostart(sd, cbp);
961 	}
962 	return;
963 failure:
964 	for (cbp = bioq_first(&queue); cbp != NULL;
965 	    cbp = bioq_first(&queue)) {
966 		bioq_remove(&queue, cbp);
967 		g_destroy_bio(cbp);
968 	}
969 	if (bp->bio_error == 0)
970 		bp->bio_error = ENOMEM;
971 	g_raid_iodone(bp, bp->bio_error);
972 }
973 
974 static void
975 g_raid_tr_kerneldump_common_done(struct bio *bp)
976 {
977 
978 	bp->bio_flags |= BIO_DONE;
979 }
980 
981 int
982 g_raid_tr_kerneldump_common(struct g_raid_tr_object *tr,
983     void *virtual, vm_offset_t physical, off_t offset, size_t length)
984 {
985 	struct g_raid_softc *sc;
986 	struct g_raid_volume *vol;
987 	struct bio bp;
988 
989 	vol = tr->tro_volume;
990 	sc = vol->v_softc;
991 
992 	bzero(&bp, sizeof(bp));
993 	bp.bio_cmd = BIO_WRITE;
994 	bp.bio_done = g_raid_tr_kerneldump_common_done;
995 	bp.bio_attribute = NULL;
996 	bp.bio_offset = offset;
997 	bp.bio_length = length;
998 	bp.bio_data = virtual;
999 	bp.bio_to = vol->v_provider;
1000 
1001 	g_raid_start(&bp);
1002 	while (!(bp.bio_flags & BIO_DONE)) {
1003 		G_RAID_DEBUG1(4, sc, "Poll...");
1004 		g_raid_poll(sc);
1005 		DELAY(10);
1006 	}
1007 
1008 	return (bp.bio_error != 0 ? EIO : 0);
1009 }
1010 
1011 static int
1012 g_raid_dump(void *arg,
1013     void *virtual, vm_offset_t physical, off_t offset, size_t length)
1014 {
1015 	struct g_raid_volume *vol;
1016 	int error;
1017 
1018 	vol = (struct g_raid_volume *)arg;
1019 	G_RAID_DEBUG1(3, vol->v_softc, "Dumping at off %llu len %llu.",
1020 	    (long long unsigned)offset, (long long unsigned)length);
1021 
1022 	error = G_RAID_TR_KERNELDUMP(vol->v_tr,
1023 	    virtual, physical, offset, length);
1024 	return (error);
1025 }
1026 
1027 static void
1028 g_raid_kerneldump(struct g_raid_softc *sc, struct bio *bp)
1029 {
1030 	struct g_kerneldump *gkd;
1031 	struct g_provider *pp;
1032 	struct g_raid_volume *vol;
1033 
1034 	gkd = (struct g_kerneldump*)bp->bio_data;
1035 	pp = bp->bio_to;
1036 	vol = pp->private;
1037 	g_trace(G_T_TOPOLOGY, "g_raid_kerneldump(%s, %jd, %jd)",
1038 		pp->name, (intmax_t)gkd->offset, (intmax_t)gkd->length);
1039 	gkd->di.dumper = g_raid_dump;
1040 	gkd->di.priv = vol;
1041 	gkd->di.blocksize = vol->v_sectorsize;
1042 	gkd->di.maxiosize = DFLTPHYS;
1043 	gkd->di.mediaoffset = gkd->offset;
1044 	if ((gkd->offset + gkd->length) > vol->v_mediasize)
1045 		gkd->length = vol->v_mediasize - gkd->offset;
1046 	gkd->di.mediasize = gkd->length;
1047 	g_io_deliver(bp, 0);
1048 }
1049 
1050 static void
1051 g_raid_start(struct bio *bp)
1052 {
1053 	struct g_raid_softc *sc;
1054 
1055 	sc = bp->bio_to->geom->softc;
1056 	/*
1057 	 * If sc == NULL or there are no valid disks, provider's error
1058 	 * should be set and g_raid_start() should not be called at all.
1059 	 */
1060 //	KASSERT(sc != NULL && sc->sc_state == G_RAID_VOLUME_S_RUNNING,
1061 //	    ("Provider's error should be set (error=%d)(mirror=%s).",
1062 //	    bp->bio_to->error, bp->bio_to->name));
1063 	G_RAID_LOGREQ(3, bp, "Request received.");
1064 
1065 	switch (bp->bio_cmd) {
1066 	case BIO_READ:
1067 	case BIO_WRITE:
1068 	case BIO_DELETE:
1069 	case BIO_FLUSH:
1070 		break;
1071 	case BIO_GETATTR:
1072 		if (!strcmp(bp->bio_attribute, "GEOM::kerneldump"))
1073 			g_raid_kerneldump(sc, bp);
1074 		else
1075 			g_io_deliver(bp, EOPNOTSUPP);
1076 		return;
1077 	default:
1078 		g_io_deliver(bp, EOPNOTSUPP);
1079 		return;
1080 	}
1081 	mtx_lock(&sc->sc_queue_mtx);
1082 	bioq_disksort(&sc->sc_queue, bp);
1083 	mtx_unlock(&sc->sc_queue_mtx);
1084 	if (!dumping) {
1085 		G_RAID_DEBUG1(4, sc, "Waking up %p.", sc);
1086 		wakeup(sc);
1087 	}
1088 }
1089 
1090 static int
1091 g_raid_bio_overlaps(const struct bio *bp, off_t lstart, off_t len)
1092 {
1093 	/*
1094 	 * 5 cases:
1095 	 * (1) bp entirely below NO
1096 	 * (2) bp entirely above NO
1097 	 * (3) bp start below, but end in range YES
1098 	 * (4) bp entirely within YES
1099 	 * (5) bp starts within, ends above YES
1100 	 *
1101 	 * lock range 10-19 (offset 10 length 10)
1102 	 * (1) 1-5: first if kicks it out
1103 	 * (2) 30-35: second if kicks it out
1104 	 * (3) 5-15: passes both ifs
1105 	 * (4) 12-14: passes both ifs
1106 	 * (5) 19-20: passes both
1107 	 */
1108 	off_t lend = lstart + len - 1;
1109 	off_t bstart = bp->bio_offset;
1110 	off_t bend = bp->bio_offset + bp->bio_length - 1;
1111 
1112 	if (bend < lstart)
1113 		return (0);
1114 	if (lend < bstart)
1115 		return (0);
1116 	return (1);
1117 }
1118 
1119 static int
1120 g_raid_is_in_locked_range(struct g_raid_volume *vol, const struct bio *bp)
1121 {
1122 	struct g_raid_lock *lp;
1123 
1124 	sx_assert(&vol->v_softc->sc_lock, SX_LOCKED);
1125 
1126 	LIST_FOREACH(lp, &vol->v_locks, l_next) {
1127 		if (g_raid_bio_overlaps(bp, lp->l_offset, lp->l_length))
1128 			return (1);
1129 	}
1130 	return (0);
1131 }
1132 
1133 static void
1134 g_raid_start_request(struct bio *bp)
1135 {
1136 	struct g_raid_softc *sc;
1137 	struct g_raid_volume *vol;
1138 
1139 	sc = bp->bio_to->geom->softc;
1140 	sx_assert(&sc->sc_lock, SX_LOCKED);
1141 	vol = bp->bio_to->private;
1142 
1143 	/*
1144 	 * Check to see if this item is in a locked range.  If so,
1145 	 * queue it to our locked queue and return.  We'll requeue
1146 	 * it when the range is unlocked.  Internal I/O for the
1147 	 * rebuild/rescan/recovery process is excluded from this
1148 	 * check so we can actually do the recovery.
1149 	 */
1150 	if (!(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL) &&
1151 	    g_raid_is_in_locked_range(vol, bp)) {
1152 		G_RAID_LOGREQ(3, bp, "Defer request.");
1153 		bioq_insert_tail(&vol->v_locked, bp);
1154 		return;
1155 	}
1156 
1157 	/*
1158 	 * If we're actually going to do the write/delete, then
1159 	 * update the idle stats for the volume.
1160 	 */
1161 	if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) {
1162 		if (!vol->v_dirty)
1163 			g_raid_dirty(vol);
1164 		vol->v_writes++;
1165 	}
1166 
1167 	/*
1168 	 * Put request onto inflight queue, so we can check if new
1169 	 * synchronization requests don't collide with it.  Then tell
1170 	 * the transformation layer to start the I/O.
1171 	 */
1172 	bioq_insert_tail(&vol->v_inflight, bp);
1173 	G_RAID_LOGREQ(4, bp, "Request started");
1174 	G_RAID_TR_IOSTART(vol->v_tr, bp);
1175 }
1176 
1177 static void
1178 g_raid_finish_with_locked_ranges(struct g_raid_volume *vol, struct bio *bp)
1179 {
1180 	off_t off, len;
1181 	struct bio *nbp;
1182 	struct g_raid_lock *lp;
1183 
1184 	vol->v_pending_lock = 0;
1185 	LIST_FOREACH(lp, &vol->v_locks, l_next) {
1186 		if (lp->l_pending) {
1187 			off = lp->l_offset;
1188 			len = lp->l_length;
1189 			lp->l_pending = 0;
1190 			TAILQ_FOREACH(nbp, &vol->v_inflight.queue, bio_queue) {
1191 				if (g_raid_bio_overlaps(nbp, off, len))
1192 					lp->l_pending++;
1193 			}
1194 			if (lp->l_pending) {
1195 				vol->v_pending_lock = 1;
1196 				G_RAID_DEBUG1(4, vol->v_softc,
1197 				    "Deferred lock(%jd, %jd) has %d pending",
1198 				    (intmax_t)off, (intmax_t)(off + len),
1199 				    lp->l_pending);
1200 				continue;
1201 			}
1202 			G_RAID_DEBUG1(4, vol->v_softc,
1203 			    "Deferred lock of %jd to %jd completed",
1204 			    (intmax_t)off, (intmax_t)(off + len));
1205 			G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg);
1206 		}
1207 	}
1208 }
1209 
1210 void
1211 g_raid_iodone(struct bio *bp, int error)
1212 {
1213 	struct g_raid_softc *sc;
1214 	struct g_raid_volume *vol;
1215 
1216 	sc = bp->bio_to->geom->softc;
1217 	sx_assert(&sc->sc_lock, SX_LOCKED);
1218 	vol = bp->bio_to->private;
1219 	G_RAID_LOGREQ(3, bp, "Request done: %d.", error);
1220 
1221 	/* Update stats if we done write/delete. */
1222 	if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) {
1223 		vol->v_writes--;
1224 		vol->v_last_write = time_uptime;
1225 	}
1226 
1227 	bioq_remove(&vol->v_inflight, bp);
1228 	if (vol->v_pending_lock && g_raid_is_in_locked_range(vol, bp))
1229 		g_raid_finish_with_locked_ranges(vol, bp);
1230 	getmicrouptime(&vol->v_last_done);
1231 	g_io_deliver(bp, error);
1232 }
1233 
1234 int
1235 g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len,
1236     struct bio *ignore, void *argp)
1237 {
1238 	struct g_raid_softc *sc;
1239 	struct g_raid_lock *lp;
1240 	struct bio *bp;
1241 
1242 	sc = vol->v_softc;
1243 	lp = malloc(sizeof(*lp), M_RAID, M_WAITOK | M_ZERO);
1244 	LIST_INSERT_HEAD(&vol->v_locks, lp, l_next);
1245 	lp->l_offset = off;
1246 	lp->l_length = len;
1247 	lp->l_callback_arg = argp;
1248 
1249 	lp->l_pending = 0;
1250 	TAILQ_FOREACH(bp, &vol->v_inflight.queue, bio_queue) {
1251 		if (bp != ignore && g_raid_bio_overlaps(bp, off, len))
1252 			lp->l_pending++;
1253 	}
1254 
1255 	/*
1256 	 * If there are any writes that are pending, we return EBUSY.  All
1257 	 * callers will have to wait until all pending writes clear.
1258 	 */
1259 	if (lp->l_pending > 0) {
1260 		vol->v_pending_lock = 1;
1261 		G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd deferred %d pend",
1262 		    (intmax_t)off, (intmax_t)(off+len), lp->l_pending);
1263 		return (EBUSY);
1264 	}
1265 	G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd",
1266 	    (intmax_t)off, (intmax_t)(off+len));
1267 	G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg);
1268 	return (0);
1269 }
1270 
1271 int
1272 g_raid_unlock_range(struct g_raid_volume *vol, off_t off, off_t len)
1273 {
1274 	struct g_raid_lock *lp;
1275 	struct g_raid_softc *sc;
1276 	struct bio *bp;
1277 
1278 	sc = vol->v_softc;
1279 	LIST_FOREACH(lp, &vol->v_locks, l_next) {
1280 		if (lp->l_offset == off && lp->l_length == len) {
1281 			LIST_REMOVE(lp, l_next);
1282 			/* XXX
1283 			 * Right now we just put them all back on the queue
1284 			 * and hope for the best.  We hope this because any
1285 			 * locked ranges will go right back on this list
1286 			 * when the worker thread runs.
1287 			 * XXX
1288 			 */
1289 			G_RAID_DEBUG1(4, sc, "Unlocked %jd to %jd",
1290 			    (intmax_t)lp->l_offset,
1291 			    (intmax_t)(lp->l_offset+lp->l_length));
1292 			mtx_lock(&sc->sc_queue_mtx);
1293 			while ((bp = bioq_takefirst(&vol->v_locked)) != NULL)
1294 				bioq_disksort(&sc->sc_queue, bp);
1295 			mtx_unlock(&sc->sc_queue_mtx);
1296 			free(lp, M_RAID);
1297 			return (0);
1298 		}
1299 	}
1300 	return (EINVAL);
1301 }
1302 
1303 void
1304 g_raid_subdisk_iostart(struct g_raid_subdisk *sd, struct bio *bp)
1305 {
1306 	struct g_consumer *cp;
1307 	struct g_raid_disk *disk, *tdisk;
1308 
1309 	bp->bio_caller1 = sd;
1310 
1311 	/*
1312 	 * Make sure that the disk is present. Generally it is a task of
1313 	 * transformation layers to not send requests to absent disks, but
1314 	 * it is better to be safe and report situation then sorry.
1315 	 */
1316 	if (sd->sd_disk == NULL) {
1317 		G_RAID_LOGREQ(0, bp, "Warning! I/O request to an absent disk!");
1318 nodisk:
1319 		bp->bio_from = NULL;
1320 		bp->bio_to = NULL;
1321 		bp->bio_error = ENXIO;
1322 		g_raid_disk_done(bp);
1323 		return;
1324 	}
1325 	disk = sd->sd_disk;
1326 	if (disk->d_state != G_RAID_DISK_S_ACTIVE &&
1327 	    disk->d_state != G_RAID_DISK_S_FAILED) {
1328 		G_RAID_LOGREQ(0, bp, "Warning! I/O request to a disk in a "
1329 		    "wrong state (%s)!", g_raid_disk_state2str(disk->d_state));
1330 		goto nodisk;
1331 	}
1332 
1333 	cp = disk->d_consumer;
1334 	bp->bio_from = cp;
1335 	bp->bio_to = cp->provider;
1336 	cp->index++;
1337 
1338 	/* Update average disks load. */
1339 	TAILQ_FOREACH(tdisk, &sd->sd_softc->sc_disks, d_next) {
1340 		if (tdisk->d_consumer == NULL)
1341 			tdisk->d_load = 0;
1342 		else
1343 			tdisk->d_load = (tdisk->d_consumer->index *
1344 			    G_RAID_SUBDISK_LOAD_SCALE + tdisk->d_load * 7) / 8;
1345 	}
1346 
1347 	disk->d_last_offset = bp->bio_offset + bp->bio_length;
1348 	if (dumping) {
1349 		G_RAID_LOGREQ(3, bp, "Sending dumping request.");
1350 		if (bp->bio_cmd == BIO_WRITE) {
1351 			bp->bio_error = g_raid_subdisk_kerneldump(sd,
1352 			    bp->bio_data, 0, bp->bio_offset, bp->bio_length);
1353 		} else
1354 			bp->bio_error = EOPNOTSUPP;
1355 		g_raid_disk_done(bp);
1356 	} else {
1357 		bp->bio_done = g_raid_disk_done;
1358 		bp->bio_offset += sd->sd_offset;
1359 		G_RAID_LOGREQ(3, bp, "Sending request.");
1360 		g_io_request(bp, cp);
1361 	}
1362 }
1363 
1364 int
1365 g_raid_subdisk_kerneldump(struct g_raid_subdisk *sd,
1366     void *virtual, vm_offset_t physical, off_t offset, size_t length)
1367 {
1368 
1369 	if (sd->sd_disk == NULL)
1370 		return (ENXIO);
1371 	if (sd->sd_disk->d_kd.di.dumper == NULL)
1372 		return (EOPNOTSUPP);
1373 	return (dump_write(&sd->sd_disk->d_kd.di,
1374 	    virtual, physical,
1375 	    sd->sd_disk->d_kd.di.mediaoffset + sd->sd_offset + offset,
1376 	    length));
1377 }
1378 
1379 static void
1380 g_raid_disk_done(struct bio *bp)
1381 {
1382 	struct g_raid_softc *sc;
1383 	struct g_raid_subdisk *sd;
1384 
1385 	sd = bp->bio_caller1;
1386 	sc = sd->sd_softc;
1387 	mtx_lock(&sc->sc_queue_mtx);
1388 	bioq_disksort(&sc->sc_queue, bp);
1389 	mtx_unlock(&sc->sc_queue_mtx);
1390 	if (!dumping)
1391 		wakeup(sc);
1392 }
1393 
1394 static void
1395 g_raid_disk_done_request(struct bio *bp)
1396 {
1397 	struct g_raid_softc *sc;
1398 	struct g_raid_disk *disk;
1399 	struct g_raid_subdisk *sd;
1400 	struct g_raid_volume *vol;
1401 
1402 	g_topology_assert_not();
1403 
1404 	G_RAID_LOGREQ(3, bp, "Disk request done: %d.", bp->bio_error);
1405 	sd = bp->bio_caller1;
1406 	sc = sd->sd_softc;
1407 	vol = sd->sd_volume;
1408 	if (bp->bio_from != NULL) {
1409 		bp->bio_from->index--;
1410 		disk = bp->bio_from->private;
1411 		if (disk == NULL)
1412 			g_raid_kill_consumer(sc, bp->bio_from);
1413 	}
1414 	bp->bio_offset -= sd->sd_offset;
1415 
1416 	G_RAID_TR_IODONE(vol->v_tr, sd, bp);
1417 }
1418 
1419 static void
1420 g_raid_handle_event(struct g_raid_softc *sc, struct g_raid_event *ep)
1421 {
1422 
1423 	if ((ep->e_flags & G_RAID_EVENT_VOLUME) != 0)
1424 		ep->e_error = g_raid_update_volume(ep->e_tgt, ep->e_event);
1425 	else if ((ep->e_flags & G_RAID_EVENT_DISK) != 0)
1426 		ep->e_error = g_raid_update_disk(ep->e_tgt, ep->e_event);
1427 	else if ((ep->e_flags & G_RAID_EVENT_SUBDISK) != 0)
1428 		ep->e_error = g_raid_update_subdisk(ep->e_tgt, ep->e_event);
1429 	else
1430 		ep->e_error = g_raid_update_node(ep->e_tgt, ep->e_event);
1431 	if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0) {
1432 		KASSERT(ep->e_error == 0,
1433 		    ("Error cannot be handled."));
1434 		g_raid_event_free(ep);
1435 	} else {
1436 		ep->e_flags |= G_RAID_EVENT_DONE;
1437 		G_RAID_DEBUG1(4, sc, "Waking up %p.", ep);
1438 		mtx_lock(&sc->sc_queue_mtx);
1439 		wakeup(ep);
1440 		mtx_unlock(&sc->sc_queue_mtx);
1441 	}
1442 }
1443 
/*
 * Worker thread.
 *
 * One instance runs per node ('arg' is the node softc).  Each loop
 * iteration handles exactly one item, in this order of preference:
 * a queued event, a queued bio, or idle processing.  The node lock
 * (sc_lock) is held while work is being done and released only around
 * the sleep.  The loop never terminates on its own; the thread goes
 * away through g_raid_destroy_node() once the node is marked for hard
 * destruction.
 */
static void
g_raid_worker(void *arg)
{
	struct g_raid_softc *sc;
	struct g_raid_event *ep;
	struct g_raid_volume *vol;
	struct bio *bp;
	struct timeval now, t;
	int timeout, rv;

	sc = arg;
	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);

	sx_xlock(&sc->sc_lock);
	for (;;) {
		mtx_lock(&sc->sc_queue_mtx);
		/*
		 * First take a look at events.
		 * This is important to handle events before any I/O requests.
		 */
		bp = NULL;
		vol = NULL;
		rv = 0;
		ep = TAILQ_FIRST(&sc->sc_events);
		if (ep != NULL)
			TAILQ_REMOVE(&sc->sc_events, ep, e_next);
		else if ((bp = bioq_takefirst(&sc->sc_queue)) != NULL)
			;
		else {
			/*
			 * Nothing queued.  Find the oldest last-done time
			 * among volumes that have a transformation module
			 * and no requests in flight, and work out how long
			 * remains until the idle threshold expires for it.
			 */
			getmicrouptime(&now);
			t = now;
			TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
				if (bioq_first(&vol->v_inflight) == NULL &&
				    vol->v_tr &&
				    timevalcmp(&vol->v_last_done, &t, < ))
					t = vol->v_last_done;
			}
			/* t becomes (oldest last-done - now), i.e. <= 0. */
			timevalsub(&t, &now);
			timeout = g_raid_idle_threshold +
			    t.tv_sec * 1000000 + t.tv_usec;
			if (timeout > 0) {
				/*
				 * Two steps to avoid overflows at HZ=1000
				 * and idle timeouts > 2.1s.  Some rounding
				 * errors can occur, but they are < 1tick,
				 * which is deemed to be close enough for
				 * this purpose.
				 */
				int micpertic = 1000000 / hz;
				timeout = (timeout + micpertic - 1) / micpertic;
				/*
				 * Sleep without the node lock.  PDROP is
				 * passed to the sleep, so the queue mutex is
				 * presumably not held when it returns; that
				 * matches the jump past the mtx_unlock()
				 * below.
				 */
				sx_xunlock(&sc->sc_lock);
				MSLEEP(rv, sc, &sc->sc_queue_mtx,
				    PRIBIO | PDROP, "-", timeout);
				sx_xlock(&sc->sc_lock);
				goto process;
			} else
				/* Threshold already passed: go idle now. */
				rv = EWOULDBLOCK;
		}
		mtx_unlock(&sc->sc_queue_mtx);
process:
		if (ep != NULL) {
			g_raid_handle_event(sc, ep);
		} else if (bp != NULL) {
			/*
			 * A bio addressed to one of our own providers is a
			 * new request; anything else is a completion coming
			 * back from a disk.
			 */
			if (bp->bio_to != NULL &&
			    bp->bio_to->geom == sc->sc_geom)
				g_raid_start_request(bp);
			else
				g_raid_disk_done_request(bp);
		} else if (rv == EWOULDBLOCK) {
			/* Idle processing for every volume of the node. */
			TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
				/* Dirty volume with no writes outstanding. */
				if (vol->v_writes == 0 && vol->v_dirty)
					g_raid_clean(vol, -1);
				if (bioq_first(&vol->v_inflight) == NULL &&
				    vol->v_tr) {
					/* t = last-done + idle threshold. */
					t.tv_sec = g_raid_idle_threshold / 1000000;
					t.tv_usec = g_raid_idle_threshold % 1000000;
					timevaladd(&t, &vol->v_last_done);
					getmicrouptime(&now);
					if (timevalcmp(&t, &now, <= )) {
						G_RAID_TR_IDLE(vol->v_tr);
						vol->v_last_done = now;
					}
				}
			}
		}
		if (sc->sc_stopping == G_RAID_DESTROY_HARD)
			g_raid_destroy_node(sc, 1);	/* May not return. */
	}
}
1538 
1539 static void
1540 g_raid_poll(struct g_raid_softc *sc)
1541 {
1542 	struct g_raid_event *ep;
1543 	struct bio *bp;
1544 
1545 	sx_xlock(&sc->sc_lock);
1546 	mtx_lock(&sc->sc_queue_mtx);
1547 	/*
1548 	 * First take a look at events.
1549 	 * This is important to handle events before any I/O requests.
1550 	 */
1551 	ep = TAILQ_FIRST(&sc->sc_events);
1552 	if (ep != NULL) {
1553 		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
1554 		mtx_unlock(&sc->sc_queue_mtx);
1555 		g_raid_handle_event(sc, ep);
1556 		goto out;
1557 	}
1558 	bp = bioq_takefirst(&sc->sc_queue);
1559 	if (bp != NULL) {
1560 		mtx_unlock(&sc->sc_queue_mtx);
1561 		if (bp->bio_from == NULL ||
1562 		    bp->bio_from->geom != sc->sc_geom)
1563 			g_raid_start_request(bp);
1564 		else
1565 			g_raid_disk_done_request(bp);
1566 	}
1567 out:
1568 	sx_xunlock(&sc->sc_lock);
1569 }
1570 
/*
 * Create and announce the GEOM provider for a volume.
 *
 * The provider is named "raid/<volume name>" when name formatting is
 * enabled, the volume has a name and no provider with that name exists;
 * otherwise "raid/r<global id>" is used.  Media and sector size are
 * taken from the volume.  The node lock must be held; the topology lock
 * is taken and released inside.
 */
static void
g_raid_launch_provider(struct g_raid_volume *vol)
{
	struct g_raid_disk *disk;
	struct g_raid_softc *sc;
	struct g_provider *pp;
	char name[G_RAID_MAX_VOLUMENAME];
	off_t off;

	sc = vol->v_softc;
	sx_assert(&sc->sc_lock, SX_LOCKED);

	g_topology_lock();
	/* Try to name provider with volume name. */
	snprintf(name, sizeof(name), "raid/%s", vol->v_name);
	if (g_raid_name_format == 0 || vol->v_name[0] == 0 ||
	    g_provider_by_name(name) != NULL) {
		/* Otherwise use sequential volume number. */
		snprintf(name, sizeof(name), "raid/r%d", vol->v_global_id);
	}
	pp = g_new_providerf(sc->sc_geom, "%s", name);
	pp->private = vol;
	pp->mediasize = vol->v_mediasize;
	pp->sectorsize = vol->v_sectorsize;
	pp->stripesize = 0;
	pp->stripeoffset = 0;
	/*
	 * For RAID1, RAID3, SINGLE and CONCAT the stripe geometry is
	 * inherited from the provider under the first subdisk (if it is
	 * present); every other level reports the volume's strip size.
	 */
	if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
	    vol->v_raid_level == G_RAID_VOLUME_RL_RAID3 ||
	    vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE ||
	    vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT) {
		if ((disk = vol->v_subdisks[0].sd_disk) != NULL &&
		    disk->d_consumer != NULL &&
		    disk->d_consumer->provider != NULL) {
			pp->stripesize = disk->d_consumer->provider->stripesize;
			off = disk->d_consumer->provider->stripeoffset;
			pp->stripeoffset = off + vol->v_subdisks[0].sd_offset;
			/*
			 * NOTE(review): the sum is reduced modulo the
			 * underlying stripeoffset ('off'), not modulo the
			 * stripesize -- confirm this is intended.
			 */
			if (off > 0)
				pp->stripeoffset %= off;
		}
		/* RAID3 scales both values by the disk count minus one. */
		if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3) {
			pp->stripesize *= (vol->v_disks_count - 1);
			pp->stripeoffset *= (vol->v_disks_count - 1);
		}
	} else
		pp->stripesize = vol->v_strip_size;
	vol->v_provider = pp;
	/* Clear the provider's error to make it usable. */
	g_error_provider(pp, 0);
	g_topology_unlock();
	G_RAID_DEBUG1(0, sc, "Provider %s for volume %s created.",
	    pp->name, vol->v_name);
}
1622 
1623 static void
1624 g_raid_destroy_provider(struct g_raid_volume *vol)
1625 {
1626 	struct g_raid_softc *sc;
1627 	struct g_provider *pp;
1628 	struct bio *bp, *tmp;
1629 
1630 	g_topology_assert_not();
1631 	sc = vol->v_softc;
1632 	pp = vol->v_provider;
1633 	KASSERT(pp != NULL, ("NULL provider (volume=%s).", vol->v_name));
1634 
1635 	g_topology_lock();
1636 	g_error_provider(pp, ENXIO);
1637 	mtx_lock(&sc->sc_queue_mtx);
1638 	TAILQ_FOREACH_SAFE(bp, &sc->sc_queue.queue, bio_queue, tmp) {
1639 		if (bp->bio_to != pp)
1640 			continue;
1641 		bioq_remove(&sc->sc_queue, bp);
1642 		g_io_deliver(bp, ENXIO);
1643 	}
1644 	mtx_unlock(&sc->sc_queue_mtx);
1645 	G_RAID_DEBUG1(0, sc, "Provider %s for volume %s destroyed.",
1646 	    pp->name, vol->v_name);
1647 	g_wither_provider(pp, ENXIO);
1648 	g_topology_unlock();
1649 	vol->v_provider = NULL;
1650 }
1651 
1652 /*
1653  * Update device state.
1654  */
1655 static int
1656 g_raid_update_volume(struct g_raid_volume *vol, u_int event)
1657 {
1658 	struct g_raid_softc *sc;
1659 
1660 	sc = vol->v_softc;
1661 	sx_assert(&sc->sc_lock, SX_XLOCKED);
1662 
1663 	G_RAID_DEBUG1(2, sc, "Event %s for volume %s.",
1664 	    g_raid_volume_event2str(event),
1665 	    vol->v_name);
1666 	switch (event) {
1667 	case G_RAID_VOLUME_E_DOWN:
1668 		if (vol->v_provider != NULL)
1669 			g_raid_destroy_provider(vol);
1670 		break;
1671 	case G_RAID_VOLUME_E_UP:
1672 		if (vol->v_provider == NULL)
1673 			g_raid_launch_provider(vol);
1674 		break;
1675 	case G_RAID_VOLUME_E_START:
1676 		if (vol->v_tr)
1677 			G_RAID_TR_START(vol->v_tr);
1678 		return (0);
1679 	default:
1680 		if (sc->sc_md)
1681 			G_RAID_MD_VOLUME_EVENT(sc->sc_md, vol, event);
1682 		return (0);
1683 	}
1684 
1685 	/* Manage root mount release. */
1686 	if (vol->v_starting) {
1687 		vol->v_starting = 0;
1688 		G_RAID_DEBUG1(1, sc, "root_mount_rel %p", vol->v_rootmount);
1689 		root_mount_rel(vol->v_rootmount);
1690 		vol->v_rootmount = NULL;
1691 	}
1692 	if (vol->v_stopping && vol->v_provider_open == 0)
1693 		g_raid_destroy_volume(vol);
1694 	return (0);
1695 }
1696 
1697 /*
1698  * Update subdisk state.
1699  */
1700 static int
1701 g_raid_update_subdisk(struct g_raid_subdisk *sd, u_int event)
1702 {
1703 	struct g_raid_softc *sc;
1704 	struct g_raid_volume *vol;
1705 
1706 	sc = sd->sd_softc;
1707 	vol = sd->sd_volume;
1708 	sx_assert(&sc->sc_lock, SX_XLOCKED);
1709 
1710 	G_RAID_DEBUG1(2, sc, "Event %s for subdisk %s:%d-%s.",
1711 	    g_raid_subdisk_event2str(event),
1712 	    vol->v_name, sd->sd_pos,
1713 	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
1714 	if (vol->v_tr)
1715 		G_RAID_TR_EVENT(vol->v_tr, sd, event);
1716 
1717 	return (0);
1718 }
1719 
1720 /*
1721  * Update disk state.
1722  */
1723 static int
1724 g_raid_update_disk(struct g_raid_disk *disk, u_int event)
1725 {
1726 	struct g_raid_softc *sc;
1727 
1728 	sc = disk->d_softc;
1729 	sx_assert(&sc->sc_lock, SX_XLOCKED);
1730 
1731 	G_RAID_DEBUG1(2, sc, "Event %s for disk %s.",
1732 	    g_raid_disk_event2str(event),
1733 	    g_raid_get_diskname(disk));
1734 
1735 	if (sc->sc_md)
1736 		G_RAID_MD_EVENT(sc->sc_md, disk, event);
1737 	return (0);
1738 }
1739 
1740 /*
1741  * Node event.
1742  */
1743 static int
1744 g_raid_update_node(struct g_raid_softc *sc, u_int event)
1745 {
1746 	sx_assert(&sc->sc_lock, SX_XLOCKED);
1747 
1748 	G_RAID_DEBUG1(2, sc, "Event %s for the array.",
1749 	    g_raid_node_event2str(event));
1750 
1751 	if (event == G_RAID_NODE_E_WAKE)
1752 		return (0);
1753 	if (sc->sc_md)
1754 		G_RAID_MD_EVENT(sc->sc_md, NULL, event);
1755 	return (0);
1756 }
1757 
1758 static int
1759 g_raid_access(struct g_provider *pp, int acr, int acw, int ace)
1760 {
1761 	struct g_raid_volume *vol;
1762 	struct g_raid_softc *sc;
1763 	int dcw, opens, error = 0;
1764 
1765 	g_topology_assert();
1766 	sc = pp->geom->softc;
1767 	vol = pp->private;
1768 	KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name));
1769 	KASSERT(vol != NULL, ("NULL volume (provider=%s).", pp->name));
1770 
1771 	G_RAID_DEBUG1(2, sc, "Access request for %s: r%dw%de%d.", pp->name,
1772 	    acr, acw, ace);
1773 	dcw = pp->acw + acw;
1774 
1775 	g_topology_unlock();
1776 	sx_xlock(&sc->sc_lock);
1777 	/* Deny new opens while dying. */
1778 	if (sc->sc_stopping != 0 && (acr > 0 || acw > 0 || ace > 0)) {
1779 		error = ENXIO;
1780 		goto out;
1781 	}
1782 	if (dcw == 0 && vol->v_dirty)
1783 		g_raid_clean(vol, dcw);
1784 	vol->v_provider_open += acr + acw + ace;
1785 	/* Handle delayed node destruction. */
1786 	if (sc->sc_stopping == G_RAID_DESTROY_DELAYED &&
1787 	    vol->v_provider_open == 0) {
1788 		/* Count open volumes. */
1789 		opens = g_raid_nopens(sc);
1790 		if (opens == 0) {
1791 			sc->sc_stopping = G_RAID_DESTROY_HARD;
1792 			/* Wake up worker to make it selfdestruct. */
1793 			g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
1794 		}
1795 	}
1796 	/* Handle open volume destruction. */
1797 	if (vol->v_stopping && vol->v_provider_open == 0)
1798 		g_raid_destroy_volume(vol);
1799 out:
1800 	sx_xunlock(&sc->sc_lock);
1801 	g_topology_lock();
1802 	return (error);
1803 }
1804 
1805 struct g_raid_softc *
1806 g_raid_create_node(struct g_class *mp,
1807     const char *name, struct g_raid_md_object *md)
1808 {
1809 	struct g_raid_softc *sc;
1810 	struct g_geom *gp;
1811 	int error;
1812 
1813 	g_topology_assert();
1814 	G_RAID_DEBUG(1, "Creating array %s.", name);
1815 
1816 	gp = g_new_geomf(mp, "%s", name);
1817 	sc = malloc(sizeof(*sc), M_RAID, M_WAITOK | M_ZERO);
1818 	gp->start = g_raid_start;
1819 	gp->orphan = g_raid_orphan;
1820 	gp->access = g_raid_access;
1821 	gp->dumpconf = g_raid_dumpconf;
1822 
1823 	sc->sc_md = md;
1824 	sc->sc_geom = gp;
1825 	sc->sc_flags = 0;
1826 	TAILQ_INIT(&sc->sc_volumes);
1827 	TAILQ_INIT(&sc->sc_disks);
1828 	sx_init(&sc->sc_lock, "graid:lock");
1829 	mtx_init(&sc->sc_queue_mtx, "graid:queue", NULL, MTX_DEF);
1830 	TAILQ_INIT(&sc->sc_events);
1831 	bioq_init(&sc->sc_queue);
1832 	gp->softc = sc;
1833 	error = kproc_create(g_raid_worker, sc, &sc->sc_worker, 0, 0,
1834 	    "g_raid %s", name);
1835 	if (error != 0) {
1836 		G_RAID_DEBUG(0, "Cannot create kernel thread for %s.", name);
1837 		mtx_destroy(&sc->sc_queue_mtx);
1838 		sx_destroy(&sc->sc_lock);
1839 		g_destroy_geom(sc->sc_geom);
1840 		free(sc, M_RAID);
1841 		return (NULL);
1842 	}
1843 
1844 	G_RAID_DEBUG1(0, sc, "Array %s created.", name);
1845 	return (sc);
1846 }
1847 
/*
 * Allocate and initialize a new volume named 'name' on node 'sc' and
 * link it into the node's and the global volume lists.
 *
 * 'id' is the preferred global volume ID: if it is negative or already
 * used by another volume, the lowest unused non-negative ID is chosen
 * instead.  A root mount hold is taken for the new volume and the
 * volume is marked as starting.  Returns the new volume.
 */
struct g_raid_volume *
g_raid_create_volume(struct g_raid_softc *sc, const char *name, int id)
{
	struct g_raid_volume	*vol, *vol1;
	int i;

	G_RAID_DEBUG1(1, sc, "Creating volume %s.", name);
	vol = malloc(sizeof(*vol), M_RAID, M_WAITOK | M_ZERO);
	vol->v_softc = sc;
	strlcpy(vol->v_name, name, G_RAID_MAX_VOLUMENAME);
	vol->v_state = G_RAID_VOLUME_S_STARTING;
	vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN;
	vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_UNKNOWN;
	vol->v_rotate_parity = 1;
	bioq_init(&vol->v_inflight);
	bioq_init(&vol->v_locked);
	LIST_INIT(&vol->v_locks);
	/* Pre-initialize every subdisk slot with its back pointers. */
	for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) {
		vol->v_subdisks[i].sd_softc = sc;
		vol->v_subdisks[i].sd_volume = vol;
		vol->v_subdisks[i].sd_pos = i;
		vol->v_subdisks[i].sd_state = G_RAID_DISK_S_NONE;
	}

	/* Find free ID for this volume. */
	g_topology_lock();
	/*
	 * vol1 serves as a "need to search" marker: it starts non-NULL,
	 * and only becomes NULL when a preferred ID was given and the
	 * lookup below ran off the end of the list without a match.
	 */
	vol1 = vol;
	if (id >= 0) {
		LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) {
			if (vol1->v_global_id == id)
				break;
		}
	}
	if (vol1 != NULL) {
		/* Try IDs from 0 upwards until one is not in use. */
		for (id = 0; ; id++) {
			LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) {
				if (vol1->v_global_id == id)
					break;
			}
			if (vol1 == NULL)
				break;
		}
	}
	vol->v_global_id = id;
	LIST_INSERT_HEAD(&g_raid_volumes, vol, v_global_next);
	g_topology_unlock();

	/* Delay root mounting. */
	vol->v_rootmount = root_mount_hold("GRAID");
	G_RAID_DEBUG1(1, sc, "root_mount_hold %p", vol->v_rootmount);
	vol->v_starting = 1;
	TAILQ_INSERT_TAIL(&sc->sc_volumes, vol, v_next);
	return (vol);
}
1902 
1903 struct g_raid_disk *
1904 g_raid_create_disk(struct g_raid_softc *sc)
1905 {
1906 	struct g_raid_disk	*disk;
1907 
1908 	G_RAID_DEBUG1(1, sc, "Creating disk.");
1909 	disk = malloc(sizeof(*disk), M_RAID, M_WAITOK | M_ZERO);
1910 	disk->d_softc = sc;
1911 	disk->d_state = G_RAID_DISK_S_NONE;
1912 	TAILQ_INIT(&disk->d_subdisks);
1913 	TAILQ_INSERT_TAIL(&sc->sc_disks, disk, d_next);
1914 	return (disk);
1915 }
1916 
1917 int g_raid_start_volume(struct g_raid_volume *vol)
1918 {
1919 	struct g_raid_tr_class *class;
1920 	struct g_raid_tr_object *obj;
1921 	int status;
1922 
1923 	G_RAID_DEBUG1(2, vol->v_softc, "Starting volume %s.", vol->v_name);
1924 	LIST_FOREACH(class, &g_raid_tr_classes, trc_list) {
1925 		G_RAID_DEBUG1(2, vol->v_softc,
1926 		    "Tasting volume %s for %s transformation.",
1927 		    vol->v_name, class->name);
1928 		obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
1929 		    M_WAITOK);
1930 		obj->tro_class = class;
1931 		obj->tro_volume = vol;
1932 		status = G_RAID_TR_TASTE(obj, vol);
1933 		if (status != G_RAID_TR_TASTE_FAIL)
1934 			break;
1935 		kobj_delete((kobj_t)obj, M_RAID);
1936 	}
1937 	if (class == NULL) {
1938 		G_RAID_DEBUG1(0, vol->v_softc,
1939 		    "No transformation module found for %s.",
1940 		    vol->v_name);
1941 		vol->v_tr = NULL;
1942 		g_raid_change_volume_state(vol, G_RAID_VOLUME_S_UNSUPPORTED);
1943 		g_raid_event_send(vol, G_RAID_VOLUME_E_DOWN,
1944 		    G_RAID_EVENT_VOLUME);
1945 		return (-1);
1946 	}
1947 	G_RAID_DEBUG1(2, vol->v_softc,
1948 	    "Transformation module %s chosen for %s.",
1949 	    class->name, vol->v_name);
1950 	vol->v_tr = obj;
1951 	return (0);
1952 }
1953 
/*
 * Destroy a node together with its volumes, disks, metadata object and
 * geom.
 *
 * The node is marked for hard destruction first.  If any volume or disk
 * cannot be destroyed yet, EBUSY is returned and the rest of the node
 * is left in place.  With 'worker' non-zero the remaining per-node
 * resources (queue mutex, node lock, softc) are released as well and
 * the calling thread exits through kproc_exit(), so the function does
 * not return in that case.  With 'worker' zero a wake event is sent so
 * that the worker thread finishes the destruction, and 0 is returned.
 */
int
g_raid_destroy_node(struct g_raid_softc *sc, int worker)
{
	struct g_raid_volume *vol, *tmpv;
	struct g_raid_disk *disk, *tmpd;
	int error = 0;

	sc->sc_stopping = G_RAID_DESTROY_HARD;
	/* Try every volume even if an earlier one turned out busy. */
	TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tmpv) {
		if (g_raid_destroy_volume(vol))
			error = EBUSY;
	}
	if (error)
		return (error);
	TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tmpd) {
		if (g_raid_destroy_disk(disk))
			error = EBUSY;
	}
	if (error)
		return (error);
	if (sc->sc_md) {
		G_RAID_MD_FREE(sc->sc_md);
		kobj_delete((kobj_t)sc->sc_md, M_RAID);
		sc->sc_md = NULL;
	}
	if (sc->sc_geom != NULL) {
		G_RAID_DEBUG1(0, sc, "Array %s destroyed.", sc->sc_name);
		g_topology_lock();
		/* Detach the softc from the geom before withering it. */
		sc->sc_geom->softc = NULL;
		g_wither_geom(sc->sc_geom, ENXIO);
		g_topology_unlock();
		sc->sc_geom = NULL;
	} else
		G_RAID_DEBUG(1, "Array destroyed.");
	if (worker) {
		g_raid_event_cancel(sc, sc);
		mtx_destroy(&sc->sc_queue_mtx);
		sx_xunlock(&sc->sc_lock);
		sx_destroy(&sc->sc_lock);
		/* Release whoever sleeps in g_raid_destroy(). */
		wakeup(&sc->sc_stopping);
		free(sc, M_RAID);
		curthread->td_pflags &= ~TDP_GEOM;
		G_RAID_DEBUG(1, "Thread exiting.");
		kproc_exit(0);
	} else {
		/* Wake up worker to make it selfdestruct. */
		g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
	}
	return (0);
}
2004 
/*
 * Destroy a volume, or start destroying it.
 *
 * The volume is marked as stopping.  EBUSY is returned, leaving the
 * volume in place, when:
 *  - it is not STOPPED yet and has a transformation module (the module
 *    is asked to stop);
 *  - events for it are still queued;
 *  - its provider still exists;
 *  - its provider is still open.
 * Otherwise the transformation object is freed, any root mount hold is
 * released, the volume is unlinked from the global and per-node lists
 * and from its disks, and its memory is freed.  If the node itself is
 * under hard destruction the worker is woken up afterwards.  Returns 0
 * once the volume is gone.
 */
int
g_raid_destroy_volume(struct g_raid_volume *vol)
{
	struct g_raid_softc *sc;
	struct g_raid_disk *disk;
	int i;

	sc = vol->v_softc;
	G_RAID_DEBUG1(2, sc, "Destroying volume %s.", vol->v_name);
	vol->v_stopping = 1;
	if (vol->v_state != G_RAID_VOLUME_S_STOPPED) {
		if (vol->v_tr) {
			/* Ask the module to stop; the caller must retry. */
			G_RAID_TR_STOP(vol->v_tr);
			return (EBUSY);
		} else
			vol->v_state = G_RAID_VOLUME_S_STOPPED;
	}
	if (g_raid_event_check(sc, vol) != 0)
		return (EBUSY);
	if (vol->v_provider != NULL)
		return (EBUSY);
	if (vol->v_provider_open != 0)
		return (EBUSY);
	if (vol->v_tr) {
		G_RAID_TR_FREE(vol->v_tr);
		kobj_delete((kobj_t)vol->v_tr, M_RAID);
		vol->v_tr = NULL;
	}
	if (vol->v_rootmount)
		root_mount_rel(vol->v_rootmount);
	g_topology_lock();
	LIST_REMOVE(vol, v_global_next);
	g_topology_unlock();
	TAILQ_REMOVE(&sc->sc_volumes, vol, v_next);
	/* Drop pending subdisk events and detach subdisks from disks. */
	for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) {
		g_raid_event_cancel(sc, &vol->v_subdisks[i]);
		disk = vol->v_subdisks[i].sd_disk;
		if (disk == NULL)
			continue;
		TAILQ_REMOVE(&disk->d_subdisks, &vol->v_subdisks[i], sd_next);
	}
	G_RAID_DEBUG1(2, sc, "Volume %s destroyed.", vol->v_name);
	if (sc->sc_md)
		G_RAID_MD_FREE_VOLUME(sc->sc_md, vol);
	g_raid_event_cancel(sc, vol);
	free(vol, M_RAID);
	if (sc->sc_stopping == G_RAID_DESTROY_HARD) {
		/* Wake up worker to let it selfdestruct. */
		g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
	}
	return (0);
}
2057 
2058 int
2059 g_raid_destroy_disk(struct g_raid_disk *disk)
2060 {
2061 	struct g_raid_softc *sc;
2062 	struct g_raid_subdisk *sd, *tmp;
2063 
2064 	sc = disk->d_softc;
2065 	G_RAID_DEBUG1(2, sc, "Destroying disk.");
2066 	if (disk->d_consumer) {
2067 		g_raid_kill_consumer(sc, disk->d_consumer);
2068 		disk->d_consumer = NULL;
2069 	}
2070 	TAILQ_FOREACH_SAFE(sd, &disk->d_subdisks, sd_next, tmp) {
2071 		g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE);
2072 		g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED,
2073 		    G_RAID_EVENT_SUBDISK);
2074 		TAILQ_REMOVE(&disk->d_subdisks, sd, sd_next);
2075 		sd->sd_disk = NULL;
2076 	}
2077 	TAILQ_REMOVE(&sc->sc_disks, disk, d_next);
2078 	if (sc->sc_md)
2079 		G_RAID_MD_FREE_DISK(sc->sc_md, disk);
2080 	g_raid_event_cancel(sc, disk);
2081 	free(disk, M_RAID);
2082 	return (0);
2083 }
2084 
2085 int
2086 g_raid_destroy(struct g_raid_softc *sc, int how)
2087 {
2088 	int opens;
2089 
2090 	g_topology_assert_not();
2091 	if (sc == NULL)
2092 		return (ENXIO);
2093 	sx_assert(&sc->sc_lock, SX_XLOCKED);
2094 
2095 	/* Count open volumes. */
2096 	opens = g_raid_nopens(sc);
2097 
2098 	/* React on some opened volumes. */
2099 	if (opens > 0) {
2100 		switch (how) {
2101 		case G_RAID_DESTROY_SOFT:
2102 			G_RAID_DEBUG1(1, sc,
2103 			    "%d volumes are still open.",
2104 			    opens);
2105 			return (EBUSY);
2106 		case G_RAID_DESTROY_DELAYED:
2107 			G_RAID_DEBUG1(1, sc,
2108 			    "Array will be destroyed on last close.");
2109 			sc->sc_stopping = G_RAID_DESTROY_DELAYED;
2110 			return (EBUSY);
2111 		case G_RAID_DESTROY_HARD:
2112 			G_RAID_DEBUG1(1, sc,
2113 			    "%d volumes are still open.",
2114 			    opens);
2115 		}
2116 	}
2117 
2118 	/* Mark node for destruction. */
2119 	sc->sc_stopping = G_RAID_DESTROY_HARD;
2120 	/* Wake up worker to let it selfdestruct. */
2121 	g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
2122 	/* Sleep until node destroyed. */
2123 	sx_sleep(&sc->sc_stopping, &sc->sc_lock,
2124 	    PRIBIO | PDROP, "r:destroy", 0);
2125 	return (0);
2126 }
2127 
2128 static void
2129 g_raid_taste_orphan(struct g_consumer *cp)
2130 {
2131 
2132 	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
2133 	    cp->provider->name));
2134 }
2135 
2136 static struct g_geom *
2137 g_raid_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
2138 {
2139 	struct g_consumer *cp;
2140 	struct g_geom *gp, *geom;
2141 	struct g_raid_md_class *class;
2142 	struct g_raid_md_object *obj;
2143 	int status;
2144 
2145 	g_topology_assert();
2146 	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
2147 	G_RAID_DEBUG(2, "Tasting provider %s.", pp->name);
2148 
2149 	gp = g_new_geomf(mp, "raid:taste");
2150 	/*
2151 	 * This orphan function should be never called.
2152 	 */
2153 	gp->orphan = g_raid_taste_orphan;
2154 	cp = g_new_consumer(gp);
2155 	g_attach(cp, pp);
2156 
2157 	geom = NULL;
2158 	LIST_FOREACH(class, &g_raid_md_classes, mdc_list) {
2159 		G_RAID_DEBUG(2, "Tasting provider %s for %s metadata.",
2160 		    pp->name, class->name);
2161 		obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
2162 		    M_WAITOK);
2163 		obj->mdo_class = class;
2164 		status = G_RAID_MD_TASTE(obj, mp, cp, &geom);
2165 		if (status != G_RAID_MD_TASTE_NEW)
2166 			kobj_delete((kobj_t)obj, M_RAID);
2167 		if (status != G_RAID_MD_TASTE_FAIL)
2168 			break;
2169 	}
2170 
2171 	g_detach(cp);
2172 	g_destroy_consumer(cp);
2173 	g_destroy_geom(gp);
2174 	G_RAID_DEBUG(2, "Tasting provider %s done.", pp->name);
2175 	return (geom);
2176 }
2177 
2178 int
2179 g_raid_create_node_format(const char *format, struct gctl_req *req,
2180     struct g_geom **gp)
2181 {
2182 	struct g_raid_md_class *class;
2183 	struct g_raid_md_object *obj;
2184 	int status;
2185 
2186 	G_RAID_DEBUG(2, "Creating array for %s metadata.", format);
2187 	LIST_FOREACH(class, &g_raid_md_classes, mdc_list) {
2188 		if (strcasecmp(class->name, format) == 0)
2189 			break;
2190 	}
2191 	if (class == NULL) {
2192 		G_RAID_DEBUG(1, "No support for %s metadata.", format);
2193 		return (G_RAID_MD_TASTE_FAIL);
2194 	}
2195 	obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
2196 	    M_WAITOK);
2197 	obj->mdo_class = class;
2198 	status = G_RAID_MD_CREATE_REQ(obj, &g_raid_class, req, gp);
2199 	if (status != G_RAID_MD_TASTE_NEW)
2200 		kobj_delete((kobj_t)obj, M_RAID);
2201 	return (status);
2202 }
2203 
2204 static int
2205 g_raid_destroy_geom(struct gctl_req *req __unused,
2206     struct g_class *mp __unused, struct g_geom *gp)
2207 {
2208 	struct g_raid_softc *sc;
2209 	int error;
2210 
2211 	g_topology_unlock();
2212 	sc = gp->softc;
2213 	sx_xlock(&sc->sc_lock);
2214 	g_cancel_event(sc);
2215 	error = g_raid_destroy(gp->softc, G_RAID_DESTROY_SOFT);
2216 	if (error != 0)
2217 		sx_xunlock(&sc->sc_lock);
2218 	g_topology_lock();
2219 	return (error);
2220 }
2221 
2222 void g_raid_write_metadata(struct g_raid_softc *sc, struct g_raid_volume *vol,
2223     struct g_raid_subdisk *sd, struct g_raid_disk *disk)
2224 {
2225 
2226 	if (sc->sc_stopping == G_RAID_DESTROY_HARD)
2227 		return;
2228 	if (sc->sc_md)
2229 		G_RAID_MD_WRITE(sc->sc_md, vol, sd, disk);
2230 }
2231 
2232 void g_raid_fail_disk(struct g_raid_softc *sc,
2233     struct g_raid_subdisk *sd, struct g_raid_disk *disk)
2234 {
2235 
2236 	if (disk == NULL)
2237 		disk = sd->sd_disk;
2238 	if (disk == NULL) {
2239 		G_RAID_DEBUG1(0, sc, "Warning! Fail request to an absent disk!");
2240 		return;
2241 	}
2242 	if (disk->d_state != G_RAID_DISK_S_ACTIVE) {
2243 		G_RAID_DEBUG1(0, sc, "Warning! Fail request to a disk in a "
2244 		    "wrong state (%s)!", g_raid_disk_state2str(disk->d_state));
2245 		return;
2246 	}
2247 	if (sc->sc_md)
2248 		G_RAID_MD_FAIL_DISK(sc->sc_md, sd, disk);
2249 }
2250 
2251 static void
2252 g_raid_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
2253     struct g_consumer *cp, struct g_provider *pp)
2254 {
2255 	struct g_raid_softc *sc;
2256 	struct g_raid_volume *vol;
2257 	struct g_raid_subdisk *sd;
2258 	struct g_raid_disk *disk;
2259 	int i, s;
2260 
2261 	g_topology_assert();
2262 
2263 	sc = gp->softc;
2264 	if (sc == NULL)
2265 		return;
2266 	if (pp != NULL) {
2267 		vol = pp->private;
2268 		g_topology_unlock();
2269 		sx_xlock(&sc->sc_lock);
2270 		sbuf_printf(sb, "%s<Label>%s</Label>\n", indent,
2271 		    vol->v_name);
2272 		sbuf_printf(sb, "%s<RAIDLevel>%s</RAIDLevel>\n", indent,
2273 		    g_raid_volume_level2str(vol->v_raid_level,
2274 		    vol->v_raid_level_qualifier));
2275 		sbuf_printf(sb,
2276 		    "%s<Transformation>%s</Transformation>\n", indent,
2277 		    vol->v_tr ? vol->v_tr->tro_class->name : "NONE");
2278 		sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
2279 		    vol->v_disks_count);
2280 		sbuf_printf(sb, "%s<Strip>%u</Strip>\n", indent,
2281 		    vol->v_strip_size);
2282 		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
2283 		    g_raid_volume_state2str(vol->v_state));
2284 		sbuf_printf(sb, "%s<Dirty>%s</Dirty>\n", indent,
2285 		    vol->v_dirty ? "Yes" : "No");
2286 		sbuf_printf(sb, "%s<Subdisks>", indent);
2287 		for (i = 0; i < vol->v_disks_count; i++) {
2288 			sd = &vol->v_subdisks[i];
2289 			if (sd->sd_disk != NULL &&
2290 			    sd->sd_disk->d_consumer != NULL) {
2291 				sbuf_printf(sb, "%s ",
2292 				    g_raid_get_diskname(sd->sd_disk));
2293 			} else {
2294 				sbuf_printf(sb, "NONE ");
2295 			}
2296 			sbuf_printf(sb, "(%s",
2297 			    g_raid_subdisk_state2str(sd->sd_state));
2298 			if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
2299 			    sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
2300 				sbuf_printf(sb, " %d%%",
2301 				    (int)(sd->sd_rebuild_pos * 100 /
2302 				     sd->sd_size));
2303 			}
2304 			sbuf_printf(sb, ")");
2305 			if (i + 1 < vol->v_disks_count)
2306 				sbuf_printf(sb, ", ");
2307 		}
2308 		sbuf_printf(sb, "</Subdisks>\n");
2309 		sx_xunlock(&sc->sc_lock);
2310 		g_topology_lock();
2311 	} else if (cp != NULL) {
2312 		disk = cp->private;
2313 		if (disk == NULL)
2314 			return;
2315 		g_topology_unlock();
2316 		sx_xlock(&sc->sc_lock);
2317 		sbuf_printf(sb, "%s<State>%s", indent,
2318 		    g_raid_disk_state2str(disk->d_state));
2319 		if (!TAILQ_EMPTY(&disk->d_subdisks)) {
2320 			sbuf_printf(sb, " (");
2321 			TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
2322 				sbuf_printf(sb, "%s",
2323 				    g_raid_subdisk_state2str(sd->sd_state));
2324 				if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
2325 				    sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
2326 					sbuf_printf(sb, " %d%%",
2327 					    (int)(sd->sd_rebuild_pos * 100 /
2328 					     sd->sd_size));
2329 				}
2330 				if (TAILQ_NEXT(sd, sd_next))
2331 					sbuf_printf(sb, ", ");
2332 			}
2333 			sbuf_printf(sb, ")");
2334 		}
2335 		sbuf_printf(sb, "</State>\n");
2336 		sbuf_printf(sb, "%s<Subdisks>", indent);
2337 		TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
2338 			sbuf_printf(sb, "r%d(%s):%d@%ju",
2339 			    sd->sd_volume->v_global_id,
2340 			    sd->sd_volume->v_name,
2341 			    sd->sd_pos, sd->sd_offset);
2342 			if (TAILQ_NEXT(sd, sd_next))
2343 				sbuf_printf(sb, ", ");
2344 		}
2345 		sbuf_printf(sb, "</Subdisks>\n");
2346 		sbuf_printf(sb, "%s<ReadErrors>%d</ReadErrors>\n", indent,
2347 		    disk->d_read_errs);
2348 		sx_xunlock(&sc->sc_lock);
2349 		g_topology_lock();
2350 	} else {
2351 		g_topology_unlock();
2352 		sx_xlock(&sc->sc_lock);
2353 		if (sc->sc_md) {
2354 			sbuf_printf(sb, "%s<Metadata>%s</Metadata>\n", indent,
2355 			    sc->sc_md->mdo_class->name);
2356 		}
2357 		if (!TAILQ_EMPTY(&sc->sc_volumes)) {
2358 			s = 0xff;
2359 			TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
2360 				if (vol->v_state < s)
2361 					s = vol->v_state;
2362 			}
2363 			sbuf_printf(sb, "%s<State>%s</State>\n", indent,
2364 			    g_raid_volume_state2str(s));
2365 		}
2366 		sx_xunlock(&sc->sc_lock);
2367 		g_topology_lock();
2368 	}
2369 }
2370 
2371 static void
2372 g_raid_shutdown_pre_sync(void *arg, int howto)
2373 {
2374 	struct g_class *mp;
2375 	struct g_geom *gp, *gp2;
2376 	struct g_raid_softc *sc;
2377 	int error;
2378 
2379 	mp = arg;
2380 	DROP_GIANT();
2381 	g_topology_lock();
2382 	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
2383 		if ((sc = gp->softc) == NULL)
2384 			continue;
2385 		g_topology_unlock();
2386 		sx_xlock(&sc->sc_lock);
2387 		g_cancel_event(sc);
2388 		error = g_raid_destroy(sc, G_RAID_DESTROY_DELAYED);
2389 		if (error != 0)
2390 			sx_xunlock(&sc->sc_lock);
2391 		g_topology_lock();
2392 	}
2393 	g_topology_unlock();
2394 	PICKUP_GIANT();
2395 }
2396 
2397 static void
2398 g_raid_init(struct g_class *mp)
2399 {
2400 
2401 	g_raid_pre_sync = EVENTHANDLER_REGISTER(shutdown_pre_sync,
2402 	    g_raid_shutdown_pre_sync, mp, SHUTDOWN_PRI_FIRST);
2403 	if (g_raid_pre_sync == NULL)
2404 		G_RAID_DEBUG(0, "Warning! Cannot register shutdown event.");
2405 	g_raid_started = 1;
2406 }
2407 
2408 static void
2409 g_raid_fini(struct g_class *mp)
2410 {
2411 
2412 	if (g_raid_pre_sync != NULL)
2413 		EVENTHANDLER_DEREGISTER(shutdown_pre_sync, g_raid_pre_sync);
2414 	g_raid_started = 0;
2415 }
2416 
2417 int
2418 g_raid_md_modevent(module_t mod, int type, void *arg)
2419 {
2420 	struct g_raid_md_class *class, *c, *nc;
2421 	int error;
2422 
2423 	error = 0;
2424 	class = arg;
2425 	switch (type) {
2426 	case MOD_LOAD:
2427 		c = LIST_FIRST(&g_raid_md_classes);
2428 		if (c == NULL || c->mdc_priority > class->mdc_priority)
2429 			LIST_INSERT_HEAD(&g_raid_md_classes, class, mdc_list);
2430 		else {
2431 			while ((nc = LIST_NEXT(c, mdc_list)) != NULL &&
2432 			    nc->mdc_priority < class->mdc_priority)
2433 				c = nc;
2434 			LIST_INSERT_AFTER(c, class, mdc_list);
2435 		}
2436 		if (g_raid_started)
2437 			g_retaste(&g_raid_class);
2438 		break;
2439 	case MOD_UNLOAD:
2440 		LIST_REMOVE(class, mdc_list);
2441 		break;
2442 	default:
2443 		error = EOPNOTSUPP;
2444 		break;
2445 	}
2446 
2447 	return (error);
2448 }
2449 
2450 int
2451 g_raid_tr_modevent(module_t mod, int type, void *arg)
2452 {
2453 	struct g_raid_tr_class *class, *c, *nc;
2454 	int error;
2455 
2456 	error = 0;
2457 	class = arg;
2458 	switch (type) {
2459 	case MOD_LOAD:
2460 		c = LIST_FIRST(&g_raid_tr_classes);
2461 		if (c == NULL || c->trc_priority > class->trc_priority)
2462 			LIST_INSERT_HEAD(&g_raid_tr_classes, class, trc_list);
2463 		else {
2464 			while ((nc = LIST_NEXT(c, trc_list)) != NULL &&
2465 			    nc->trc_priority < class->trc_priority)
2466 				c = nc;
2467 			LIST_INSERT_AFTER(c, class, trc_list);
2468 		}
2469 		break;
2470 	case MOD_UNLOAD:
2471 		LIST_REMOVE(class, trc_list);
2472 		break;
2473 	default:
2474 		error = EOPNOTSUPP;
2475 		break;
2476 	}
2477 
2478 	return (error);
2479 }
2480 
/*
 * Use a local implementation of DECLARE_GEOM_CLASS(g_raid_class, g_raid)
 * to reduce the module priority (note the SI_ORDER_THIRD below),
 * allowing submodules to register themselves first.
 */
static moduledata_t g_raid_mod = {
	"g_raid",
	g_modevent,
	&g_raid_class
};
DECLARE_MODULE(g_raid, g_raid_mod, SI_SUB_DRIVERS, SI_ORDER_THIRD);
MODULE_VERSION(geom_raid, 0);
2492