xref: /freebsd/sys/geom/raid/g_raid.c (revision c6ec7d31830ab1c80edae95ad5e4b9dba10c47ac)
1 /*-
2  * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29 
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/kernel.h>
33 #include <sys/module.h>
34 #include <sys/limits.h>
35 #include <sys/lock.h>
36 #include <sys/mutex.h>
37 #include <sys/bio.h>
38 #include <sys/sbuf.h>
39 #include <sys/sysctl.h>
40 #include <sys/malloc.h>
41 #include <sys/eventhandler.h>
42 #include <vm/uma.h>
43 #include <geom/geom.h>
44 #include <sys/proc.h>
45 #include <sys/kthread.h>
46 #include <sys/sched.h>
47 #include <geom/raid/g_raid.h>
48 #include "g_raid_md_if.h"
49 #include "g_raid_tr_if.h"
50 
51 static MALLOC_DEFINE(M_RAID, "raid_data", "GEOM_RAID Data");
52 
53 SYSCTL_DECL(_kern_geom);
54 SYSCTL_NODE(_kern_geom, OID_AUTO, raid, CTLFLAG_RW, 0, "GEOM_RAID stuff");
55 int g_raid_enable = 1;
56 TUNABLE_INT("kern.geom.raid.enable", &g_raid_enable);
57 SYSCTL_INT(_kern_geom_raid, OID_AUTO, enable, CTLFLAG_RW,
58     &g_raid_enable, 0, "Enable on-disk metadata taste");
59 u_int g_raid_aggressive_spare = 0;
60 TUNABLE_INT("kern.geom.raid.aggressive_spare", &g_raid_aggressive_spare);
61 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, aggressive_spare, CTLFLAG_RW,
62     &g_raid_aggressive_spare, 0, "Use disks without metadata as spare");
63 u_int g_raid_debug = 0;
64 TUNABLE_INT("kern.geom.raid.debug", &g_raid_debug);
65 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, debug, CTLFLAG_RW, &g_raid_debug, 0,
66     "Debug level");
67 int g_raid_read_err_thresh = 10;
68 TUNABLE_INT("kern.geom.raid.read_err_thresh", &g_raid_read_err_thresh);
69 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, read_err_thresh, CTLFLAG_RW,
70     &g_raid_read_err_thresh, 0,
71     "Number of read errors equated to disk failure");
72 u_int g_raid_start_timeout = 30;
73 TUNABLE_INT("kern.geom.raid.start_timeout", &g_raid_start_timeout);
74 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, start_timeout, CTLFLAG_RW,
75     &g_raid_start_timeout, 0,
76     "Time to wait for all array components");
77 static u_int g_raid_clean_time = 5;
78 TUNABLE_INT("kern.geom.raid.clean_time", &g_raid_clean_time);
79 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, clean_time, CTLFLAG_RW,
80     &g_raid_clean_time, 0, "Mark volume as clean when idling");
81 static u_int g_raid_disconnect_on_failure = 1;
82 TUNABLE_INT("kern.geom.raid.disconnect_on_failure",
83     &g_raid_disconnect_on_failure);
84 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, disconnect_on_failure, CTLFLAG_RW,
85     &g_raid_disconnect_on_failure, 0, "Disconnect component on I/O failure.");
86 static u_int g_raid_name_format = 0;
87 TUNABLE_INT("kern.geom.raid.name_format", &g_raid_name_format);
88 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, name_format, CTLFLAG_RW,
89     &g_raid_name_format, 0, "Provider name format.");
90 static u_int g_raid_idle_threshold = 1000000;
91 TUNABLE_INT("kern.geom.raid.idle_threshold", &g_raid_idle_threshold);
92 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, idle_threshold, CTLFLAG_RW,
93     &g_raid_idle_threshold, 1000000,
94     "Time in microseconds to consider a volume idle.");
95 
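/* msleep() wrapper that logs going to sleep and waking up at debug level 4. */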
96 #define	MSLEEP(rv, ident, mtx, priority, wmesg, timeout)	do {	\
97 	G_RAID_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));		\
98 	rv = msleep((ident), (mtx), (priority), (wmesg), (timeout));	\
99 	G_RAID_DEBUG(4, "%s: Woken up %p.", __func__, (ident));		\
100 } while (0)
101 
102 LIST_HEAD(, g_raid_md_class) g_raid_md_classes =
103     LIST_HEAD_INITIALIZER(g_raid_md_classes);
104 
105 LIST_HEAD(, g_raid_tr_class) g_raid_tr_classes =
106     LIST_HEAD_INITIALIZER(g_raid_tr_classes);
107 
108 LIST_HEAD(, g_raid_volume) g_raid_volumes =
109     LIST_HEAD_INITIALIZER(g_raid_volumes);
110 
111 static eventhandler_tag g_raid_post_sync = NULL;
112 static int g_raid_started = 0;
113 static int g_raid_shutdown = 0;
114 
115 static int g_raid_destroy_geom(struct gctl_req *req, struct g_class *mp,
116     struct g_geom *gp);
117 static g_taste_t g_raid_taste;
118 static void g_raid_init(struct g_class *mp);
119 static void g_raid_fini(struct g_class *mp);
120 
121 struct g_class g_raid_class = {
122 	.name = G_RAID_CLASS_NAME,
123 	.version = G_VERSION,
124 	.ctlreq = g_raid_ctl,
125 	.taste = g_raid_taste,
126 	.destroy_geom = g_raid_destroy_geom,
127 	.init = g_raid_init,
128 	.fini = g_raid_fini
129 };
130 
131 static void g_raid_destroy_provider(struct g_raid_volume *vol);
132 static int g_raid_update_disk(struct g_raid_disk *disk, u_int event);
133 static int g_raid_update_subdisk(struct g_raid_subdisk *subdisk, u_int event);
134 static int g_raid_update_volume(struct g_raid_volume *vol, u_int event);
135 static int g_raid_update_node(struct g_raid_softc *sc, u_int event);
136 static void g_raid_dumpconf(struct sbuf *sb, const char *indent,
137     struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
138 static void g_raid_start(struct bio *bp);
139 static void g_raid_start_request(struct bio *bp);
140 static void g_raid_disk_done(struct bio *bp);
141 static void g_raid_poll(struct g_raid_softc *sc);
142 
143 static const char *
144 g_raid_node_event2str(int event)
145 {
146 
147 	switch (event) {
148 	case G_RAID_NODE_E_WAKE:
149 		return ("WAKE");
150 	case G_RAID_NODE_E_START:
151 		return ("START");
152 	default:
153 		return ("INVALID");
154 	}
155 }
156 
157 const char *
158 g_raid_disk_state2str(int state)
159 {
160 
161 	switch (state) {
162 	case G_RAID_DISK_S_NONE:
163 		return ("NONE");
164 	case G_RAID_DISK_S_OFFLINE:
165 		return ("OFFLINE");
166 	case G_RAID_DISK_S_FAILED:
167 		return ("FAILED");
168 	case G_RAID_DISK_S_STALE_FAILED:
169 		return ("STALE_FAILED");
170 	case G_RAID_DISK_S_SPARE:
171 		return ("SPARE");
172 	case G_RAID_DISK_S_STALE:
173 		return ("STALE");
174 	case G_RAID_DISK_S_ACTIVE:
175 		return ("ACTIVE");
176 	default:
177 		return ("INVALID");
178 	}
179 }
180 
181 static const char *
182 g_raid_disk_event2str(int event)
183 {
184 
185 	switch (event) {
186 	case G_RAID_DISK_E_DISCONNECTED:
187 		return ("DISCONNECTED");
188 	default:
189 		return ("INVALID");
190 	}
191 }
192 
193 const char *
194 g_raid_subdisk_state2str(int state)
195 {
196 
197 	switch (state) {
198 	case G_RAID_SUBDISK_S_NONE:
199 		return ("NONE");
200 	case G_RAID_SUBDISK_S_FAILED:
201 		return ("FAILED");
202 	case G_RAID_SUBDISK_S_NEW:
203 		return ("NEW");
204 	case G_RAID_SUBDISK_S_REBUILD:
205 		return ("REBUILD");
206 	case G_RAID_SUBDISK_S_UNINITIALIZED:
207 		return ("UNINITIALIZED");
208 	case G_RAID_SUBDISK_S_STALE:
209 		return ("STALE");
210 	case G_RAID_SUBDISK_S_RESYNC:
211 		return ("RESYNC");
212 	case G_RAID_SUBDISK_S_ACTIVE:
213 		return ("ACTIVE");
214 	default:
215 		return ("INVALID");
216 	}
217 }
218 
219 static const char *
220 g_raid_subdisk_event2str(int event)
221 {
222 
223 	switch (event) {
224 	case G_RAID_SUBDISK_E_NEW:
225 		return ("NEW");
226 	case G_RAID_SUBDISK_E_FAILED:
227 		return ("FAILED");
228 	case G_RAID_SUBDISK_E_DISCONNECTED:
229 		return ("DISCONNECTED");
230 	default:
231 		return ("INVALID");
232 	}
233 }
234 
235 const char *
236 g_raid_volume_state2str(int state)
237 {
238 
239 	switch (state) {
240 	case G_RAID_VOLUME_S_STARTING:
241 		return ("STARTING");
242 	case G_RAID_VOLUME_S_BROKEN:
243 		return ("BROKEN");
244 	case G_RAID_VOLUME_S_DEGRADED:
245 		return ("DEGRADED");
246 	case G_RAID_VOLUME_S_SUBOPTIMAL:
247 		return ("SUBOPTIMAL");
248 	case G_RAID_VOLUME_S_OPTIMAL:
249 		return ("OPTIMAL");
250 	case G_RAID_VOLUME_S_UNSUPPORTED:
251 		return ("UNSUPPORTED");
252 	case G_RAID_VOLUME_S_STOPPED:
253 		return ("STOPPED");
254 	default:
255 		return ("INVALID");
256 	}
257 }
258 
259 static const char *
260 g_raid_volume_event2str(int event)
261 {
262 
263 	switch (event) {
264 	case G_RAID_VOLUME_E_UP:
265 		return ("UP");
266 	case G_RAID_VOLUME_E_DOWN:
267 		return ("DOWN");
268 	case G_RAID_VOLUME_E_START:
269 		return ("START");
270 	case G_RAID_VOLUME_E_STARTMD:
271 		return ("STARTMD");
272 	default:
273 		return ("INVALID");
274 	}
275 }
276 
277 const char *
278 g_raid_volume_level2str(int level, int qual)
279 {
280 
281 	switch (level) {
282 	case G_RAID_VOLUME_RL_RAID0:
283 		return ("RAID0");
284 	case G_RAID_VOLUME_RL_RAID1:
285 		return ("RAID1");
286 	case G_RAID_VOLUME_RL_RAID3:
287 		if (qual == G_RAID_VOLUME_RLQ_R3P0)
288 			return ("RAID3-P0");
289 		if (qual == G_RAID_VOLUME_RLQ_R3PN)
290 			return ("RAID3-PN");
291 		return ("RAID3");
292 	case G_RAID_VOLUME_RL_RAID4:
293 		if (qual == G_RAID_VOLUME_RLQ_R4P0)
294 			return ("RAID4-P0");
295 		if (qual == G_RAID_VOLUME_RLQ_R4PN)
296 			return ("RAID4-PN");
297 		return ("RAID4");
298 	case G_RAID_VOLUME_RL_RAID5:
299 		if (qual == G_RAID_VOLUME_RLQ_R5RA)
300 			return ("RAID5-RA");
301 		if (qual == G_RAID_VOLUME_RLQ_R5RS)
302 			return ("RAID5-RS");
303 		if (qual == G_RAID_VOLUME_RLQ_R5LA)
304 			return ("RAID5-LA");
305 		if (qual == G_RAID_VOLUME_RLQ_R5LS)
306 			return ("RAID5-LS");
307 		return ("RAID5");
308 	case G_RAID_VOLUME_RL_RAID6:
309 		if (qual == G_RAID_VOLUME_RLQ_R6RA)
310 			return ("RAID6-RA");
311 		if (qual == G_RAID_VOLUME_RLQ_R6RS)
312 			return ("RAID6-RS");
313 		if (qual == G_RAID_VOLUME_RLQ_R6LA)
314 			return ("RAID6-LA");
315 		if (qual == G_RAID_VOLUME_RLQ_R6LS)
316 			return ("RAID6-LS");
317 		return ("RAID6");
318 	case G_RAID_VOLUME_RL_RAIDMDF:
319 		if (qual == G_RAID_VOLUME_RLQ_RMDFRA)
320 			return ("RAIDMDF-RA");
321 		if (qual == G_RAID_VOLUME_RLQ_RMDFRS)
322 			return ("RAIDMDF-RS");
323 		if (qual == G_RAID_VOLUME_RLQ_RMDFLA)
324 			return ("RAIDMDF-LA");
325 		if (qual == G_RAID_VOLUME_RLQ_RMDFLS)
326 			return ("RAIDMDF-LS");
327 		return ("RAIDMDF");
328 	case G_RAID_VOLUME_RL_RAID1E:
329 		if (qual == G_RAID_VOLUME_RLQ_R1EA)
330 			return ("RAID1E-A");
331 		if (qual == G_RAID_VOLUME_RLQ_R1EO)
332 			return ("RAID1E-O");
333 		return ("RAID1E");
334 	case G_RAID_VOLUME_RL_SINGLE:
335 		return ("SINGLE");
336 	case G_RAID_VOLUME_RL_CONCAT:
337 		return ("CONCAT");
338 	case G_RAID_VOLUME_RL_RAID5E:
339 		if (qual == G_RAID_VOLUME_RLQ_R5ERA)
340 			return ("RAID5E-RA");
341 		if (qual == G_RAID_VOLUME_RLQ_R5ERS)
342 			return ("RAID5E-RS");
343 		if (qual == G_RAID_VOLUME_RLQ_R5ELA)
344 			return ("RAID5E-LA");
345 		if (qual == G_RAID_VOLUME_RLQ_R5ELS)
346 			return ("RAID5E-LS");
347 		return ("RAID5E");
348 	case G_RAID_VOLUME_RL_RAID5EE:
349 		if (qual == G_RAID_VOLUME_RLQ_R5EERA)
350 			return ("RAID5EE-RA");
351 		if (qual == G_RAID_VOLUME_RLQ_R5EERS)
352 			return ("RAID5EE-RS");
353 		if (qual == G_RAID_VOLUME_RLQ_R5EELA)
354 			return ("RAID5EE-LA");
355 		if (qual == G_RAID_VOLUME_RLQ_R5EELS)
356 			return ("RAID5EE-LS");
357 		return ("RAID5EE");
358 	case G_RAID_VOLUME_RL_RAID5R:
359 		if (qual == G_RAID_VOLUME_RLQ_R5RRA)
360 			return ("RAID5R-RA");
361 		if (qual == G_RAID_VOLUME_RLQ_R5RRS)
362 			return ("RAID5R-RS");
363 		if (qual == G_RAID_VOLUME_RLQ_R5RLA)
364 			return ("RAID5R-LA");
365 		if (qual == G_RAID_VOLUME_RLQ_R5RLS)
366 			return ("RAID5R-LS");
367 		return ("RAID5R");
368 	default:
369 		return ("UNKNOWN");
370 	}
371 }
372 
373 int
374 g_raid_volume_str2level(const char *str, int *level, int *qual)
375 {
376 
377 	*level = G_RAID_VOLUME_RL_UNKNOWN;
378 	*qual = G_RAID_VOLUME_RLQ_NONE;
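	/*
	 * Level names without an explicit qualifier map to a default
	 * layout (e.g. plain "RAID5" is treated as RAID5-LA and plain
	 * "RAID3" as RAID3-PN).
	 */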
379 	if (strcasecmp(str, "RAID0") == 0)
380 		*level = G_RAID_VOLUME_RL_RAID0;
381 	else if (strcasecmp(str, "RAID1") == 0)
382 		*level = G_RAID_VOLUME_RL_RAID1;
383 	else if (strcasecmp(str, "RAID3-P0") == 0) {
384 		*level = G_RAID_VOLUME_RL_RAID3;
385 		*qual = G_RAID_VOLUME_RLQ_R3P0;
386 	} else if (strcasecmp(str, "RAID3-PN") == 0 ||
387 		   strcasecmp(str, "RAID3") == 0) {
388 		*level = G_RAID_VOLUME_RL_RAID3;
389 		*qual = G_RAID_VOLUME_RLQ_R3PN;
390 	} else if (strcasecmp(str, "RAID4-P0") == 0) {
391 		*level = G_RAID_VOLUME_RL_RAID4;
392 		*qual = G_RAID_VOLUME_RLQ_R4P0;
393 	} else if (strcasecmp(str, "RAID4-PN") == 0 ||
394 		   strcasecmp(str, "RAID4") == 0) {
395 		*level = G_RAID_VOLUME_RL_RAID4;
396 		*qual = G_RAID_VOLUME_RLQ_R4PN;
397 	} else if (strcasecmp(str, "RAID5-RA") == 0) {
398 		*level = G_RAID_VOLUME_RL_RAID5;
399 		*qual = G_RAID_VOLUME_RLQ_R5RA;
400 	} else if (strcasecmp(str, "RAID5-RS") == 0) {
401 		*level = G_RAID_VOLUME_RL_RAID5;
402 		*qual = G_RAID_VOLUME_RLQ_R5RS;
403 	} else if (strcasecmp(str, "RAID5") == 0 ||
404 		   strcasecmp(str, "RAID5-LA") == 0) {
405 		*level = G_RAID_VOLUME_RL_RAID5;
406 		*qual = G_RAID_VOLUME_RLQ_R5LA;
407 	} else if (strcasecmp(str, "RAID5-LS") == 0) {
408 		*level = G_RAID_VOLUME_RL_RAID5;
409 		*qual = G_RAID_VOLUME_RLQ_R5LS;
410 	} else if (strcasecmp(str, "RAID6-RA") == 0) {
411 		*level = G_RAID_VOLUME_RL_RAID6;
412 		*qual = G_RAID_VOLUME_RLQ_R6RA;
413 	} else if (strcasecmp(str, "RAID6-RS") == 0) {
414 		*level = G_RAID_VOLUME_RL_RAID6;
415 		*qual = G_RAID_VOLUME_RLQ_R6RS;
416 	} else if (strcasecmp(str, "RAID6") == 0 ||
417 		   strcasecmp(str, "RAID6-LA") == 0) {
418 		*level = G_RAID_VOLUME_RL_RAID6;
419 		*qual = G_RAID_VOLUME_RLQ_R6LA;
420 	} else if (strcasecmp(str, "RAID6-LS") == 0) {
421 		*level = G_RAID_VOLUME_RL_RAID6;
422 		*qual = G_RAID_VOLUME_RLQ_R6LS;
423 	} else if (strcasecmp(str, "RAIDMDF-RA") == 0) {
424 		*level = G_RAID_VOLUME_RL_RAIDMDF;
425 		*qual = G_RAID_VOLUME_RLQ_RMDFRA;
426 	} else if (strcasecmp(str, "RAIDMDF-RS") == 0) {
427 		*level = G_RAID_VOLUME_RL_RAIDMDF;
428 		*qual = G_RAID_VOLUME_RLQ_RMDFRS;
429 	} else if (strcasecmp(str, "RAIDMDF") == 0 ||
430 		   strcasecmp(str, "RAIDMDF-LA") == 0) {
431 		*level = G_RAID_VOLUME_RL_RAIDMDF;
432 		*qual = G_RAID_VOLUME_RLQ_RMDFLA;
433 	} else if (strcasecmp(str, "RAIDMDF-LS") == 0) {
434 		*level = G_RAID_VOLUME_RL_RAIDMDF;
435 		*qual = G_RAID_VOLUME_RLQ_RMDFLS;
436 	} else if (strcasecmp(str, "RAID10") == 0 ||
437 		   strcasecmp(str, "RAID1E") == 0 ||
438 		   strcasecmp(str, "RAID1E-A") == 0) {
439 		*level = G_RAID_VOLUME_RL_RAID1E;
440 		*qual = G_RAID_VOLUME_RLQ_R1EA;
441 	} else if (strcasecmp(str, "RAID1E-O") == 0) {
442 		*level = G_RAID_VOLUME_RL_RAID1E;
443 		*qual = G_RAID_VOLUME_RLQ_R1EO;
444 	} else if (strcasecmp(str, "SINGLE") == 0)
445 		*level = G_RAID_VOLUME_RL_SINGLE;
446 	else if (strcasecmp(str, "CONCAT") == 0)
447 		*level = G_RAID_VOLUME_RL_CONCAT;
448 	else if (strcasecmp(str, "RAID5E-RA") == 0) {
449 		*level = G_RAID_VOLUME_RL_RAID5E;
450 		*qual = G_RAID_VOLUME_RLQ_R5ERA;
451 	} else if (strcasecmp(str, "RAID5E-RS") == 0) {
452 		*level = G_RAID_VOLUME_RL_RAID5E;
453 		*qual = G_RAID_VOLUME_RLQ_R5ERS;
454 	} else if (strcasecmp(str, "RAID5E") == 0 ||
455 		   strcasecmp(str, "RAID5E-LA") == 0) {
456 		*level = G_RAID_VOLUME_RL_RAID5E;
457 		*qual = G_RAID_VOLUME_RLQ_R5ELA;
458 	} else if (strcasecmp(str, "RAID5E-LS") == 0) {
459 		*level = G_RAID_VOLUME_RL_RAID5E;
460 		*qual = G_RAID_VOLUME_RLQ_R5ELS;
461 	} else if (strcasecmp(str, "RAID5EE-RA") == 0) {
462 		*level = G_RAID_VOLUME_RL_RAID5EE;
463 		*qual = G_RAID_VOLUME_RLQ_R5EERA;
464 	} else if (strcasecmp(str, "RAID5EE-RS") == 0) {
465 		*level = G_RAID_VOLUME_RL_RAID5EE;
466 		*qual = G_RAID_VOLUME_RLQ_R5EERS;
467 	} else if (strcasecmp(str, "RAID5EE") == 0 ||
468 		   strcasecmp(str, "RAID5EE-LA") == 0) {
469 		*level = G_RAID_VOLUME_RL_RAID5EE;
470 		*qual = G_RAID_VOLUME_RLQ_R5EELA;
471 	} else if (strcasecmp(str, "RAID5EE-LS") == 0) {
472 		*level = G_RAID_VOLUME_RL_RAID5EE;
473 		*qual = G_RAID_VOLUME_RLQ_R5EELS;
474 	} else if (strcasecmp(str, "RAID5R-RA") == 0) {
475 		*level = G_RAID_VOLUME_RL_RAID5R;
476 		*qual = G_RAID_VOLUME_RLQ_R5RRA;
477 	} else if (strcasecmp(str, "RAID5R-RS") == 0) {
478 		*level = G_RAID_VOLUME_RL_RAID5R;
479 		*qual = G_RAID_VOLUME_RLQ_R5RRS;
480 	} else if (strcasecmp(str, "RAID5R") == 0 ||
481 		   strcasecmp(str, "RAID5R-LA") == 0) {
482 		*level = G_RAID_VOLUME_RL_RAID5R;
483 		*qual = G_RAID_VOLUME_RLQ_R5RLA;
484 	} else if (strcasecmp(str, "RAID5R-LS") == 0) {
485 		*level = G_RAID_VOLUME_RL_RAID5R;
486 		*qual = G_RAID_VOLUME_RLQ_R5RLS;
487 	} else
488 		return (-1);
489 	return (0);
490 }
491 
492 const char *
493 g_raid_get_diskname(struct g_raid_disk *disk)
494 {
495 
496 	if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
497 		return ("[unknown]");
498 	return (disk->d_consumer->provider->name);
499 }
500 
501 void
502 g_raid_get_disk_info(struct g_raid_disk *disk)
503 {
504 	struct g_consumer *cp = disk->d_consumer;
505 	int error, len;
506 
507 	/* Read kernel dumping information. */
508 	disk->d_kd.offset = 0;
509 	disk->d_kd.length = OFF_MAX;
510 	len = sizeof(disk->d_kd);
511 	error = g_io_getattr("GEOM::kerneldump", cp, &len, &disk->d_kd);
512 	if (error)
513 		disk->d_kd.di.dumper = NULL;
514 	if (disk->d_kd.di.dumper == NULL)
515 		G_RAID_DEBUG1(2, disk->d_softc,
516 		    "Dumping not supported by %s: %d.",
517 		    cp->provider->name, error);
518 
519 	/* Read BIO_DELETE support. */
520 	error = g_getattr("GEOM::candelete", cp, &disk->d_candelete);
521 	if (error)
522 		disk->d_candelete = 0;
523 	if (!disk->d_candelete)
524 		G_RAID_DEBUG1(2, disk->d_softc,
525 		    "BIO_DELETE not supported by %s: %d.",
526 		    cp->provider->name, error);
527 }
528 
529 void
530 g_raid_report_disk_state(struct g_raid_disk *disk)
531 {
532 	struct g_raid_subdisk *sd;
533 	int len, state;
534 	uint32_t s;
535 
536 	if (disk->d_consumer == NULL)
537 		return;
538 	if (disk->d_state == G_RAID_DISK_S_FAILED ||
539 	    disk->d_state == G_RAID_DISK_S_STALE_FAILED) {
540 		s = G_STATE_FAILED;
541 	} else {
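		/*
		 * Report the state of the worst (lowest-valued) subdisk on
		 * this disk, mapped to the generic G_STATE_* values that are
		 * delivered below via the GEOM::setstate attribute.
		 */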
542 		state = G_RAID_SUBDISK_S_ACTIVE;
543 		TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
544 			if (sd->sd_state < state)
545 				state = sd->sd_state;
546 		}
547 		if (state == G_RAID_SUBDISK_S_FAILED)
548 			s = G_STATE_FAILED;
549 		else if (state == G_RAID_SUBDISK_S_NEW ||
550 		    state == G_RAID_SUBDISK_S_REBUILD)
551 			s = G_STATE_REBUILD;
552 		else if (state == G_RAID_SUBDISK_S_STALE ||
553 		    state == G_RAID_SUBDISK_S_RESYNC)
554 			s = G_STATE_RESYNC;
555 		else
556 			s = G_STATE_ACTIVE;
557 	}
558 	len = sizeof(s);
559 	g_io_getattr("GEOM::setstate", disk->d_consumer, &len, &s);
560 	G_RAID_DEBUG1(2, disk->d_softc, "Disk %s state reported as %d.",
561 	    g_raid_get_diskname(disk), s);
562 }
563 
564 void
565 g_raid_change_disk_state(struct g_raid_disk *disk, int state)
566 {
567 
568 	G_RAID_DEBUG1(0, disk->d_softc, "Disk %s state changed from %s to %s.",
569 	    g_raid_get_diskname(disk),
570 	    g_raid_disk_state2str(disk->d_state),
571 	    g_raid_disk_state2str(state));
572 	disk->d_state = state;
573 	g_raid_report_disk_state(disk);
574 }
575 
576 void
577 g_raid_change_subdisk_state(struct g_raid_subdisk *sd, int state)
578 {
579 
580 	G_RAID_DEBUG1(0, sd->sd_softc,
581 	    "Subdisk %s:%d-%s state changed from %s to %s.",
582 	    sd->sd_volume->v_name, sd->sd_pos,
583 	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
584 	    g_raid_subdisk_state2str(sd->sd_state),
585 	    g_raid_subdisk_state2str(state));
586 	sd->sd_state = state;
587 	if (sd->sd_disk)
588 		g_raid_report_disk_state(sd->sd_disk);
589 }
590 
591 void
592 g_raid_change_volume_state(struct g_raid_volume *vol, int state)
593 {
594 
595 	G_RAID_DEBUG1(0, vol->v_softc,
596 	    "Volume %s state changed from %s to %s.",
597 	    vol->v_name,
598 	    g_raid_volume_state2str(vol->v_state),
599 	    g_raid_volume_state2str(state));
600 	vol->v_state = state;
601 }
602 
603 /*
604  * --- Events handling functions ---
605  * Events in geom_raid are used to maintain subdisk and volume status
606  * from a single thread in order to simplify locking.
607  */
608 static void
609 g_raid_event_free(struct g_raid_event *ep)
610 {
611 
612 	free(ep, M_RAID);
613 }
614 
615 int
616 g_raid_event_send(void *arg, int event, int flags)
617 {
618 	struct g_raid_softc *sc;
619 	struct g_raid_event *ep;
620 	int error;
621 
622 	if ((flags & G_RAID_EVENT_VOLUME) != 0) {
623 		sc = ((struct g_raid_volume *)arg)->v_softc;
624 	} else if ((flags & G_RAID_EVENT_DISK) != 0) {
625 		sc = ((struct g_raid_disk *)arg)->d_softc;
626 	} else if ((flags & G_RAID_EVENT_SUBDISK) != 0) {
627 		sc = ((struct g_raid_subdisk *)arg)->sd_softc;
628 	} else {
629 		sc = arg;
630 	}
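	/*
	 * Sleep for the event allocation only when the caller already holds
	 * the node lock; other callers (e.g. g_raid_orphan() running under
	 * the topology lock) presumably must not block here.
	 */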
631 	ep = malloc(sizeof(*ep), M_RAID,
632 	    sx_xlocked(&sc->sc_lock) ? M_WAITOK : M_NOWAIT);
633 	if (ep == NULL)
634 		return (ENOMEM);
635 	ep->e_tgt = arg;
636 	ep->e_event = event;
637 	ep->e_flags = flags;
638 	ep->e_error = 0;
639 	G_RAID_DEBUG1(4, sc, "Sending event %p. Waking up %p.", ep, sc);
640 	mtx_lock(&sc->sc_queue_mtx);
641 	TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
642 	mtx_unlock(&sc->sc_queue_mtx);
643 	wakeup(sc);
644 
645 	if ((flags & G_RAID_EVENT_WAIT) == 0)
646 		return (0);
647 
648 	sx_assert(&sc->sc_lock, SX_XLOCKED);
649 	G_RAID_DEBUG1(4, sc, "Sleeping on %p.", ep);
650 	sx_xunlock(&sc->sc_lock);
651 	while ((ep->e_flags & G_RAID_EVENT_DONE) == 0) {
652 		mtx_lock(&sc->sc_queue_mtx);
653 		MSLEEP(error, ep, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:event",
654 		    hz * 5);
655 	}
656 	error = ep->e_error;
657 	g_raid_event_free(ep);
658 	sx_xlock(&sc->sc_lock);
659 	return (error);
660 }
661 
662 static void
663 g_raid_event_cancel(struct g_raid_softc *sc, void *tgt)
664 {
665 	struct g_raid_event *ep, *tmpep;
666 
667 	sx_assert(&sc->sc_lock, SX_XLOCKED);
668 
669 	mtx_lock(&sc->sc_queue_mtx);
670 	TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
671 		if (ep->e_tgt != tgt)
672 			continue;
673 		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
674 		if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0)
675 			g_raid_event_free(ep);
676 		else {
677 			ep->e_error = ECANCELED;
678 			wakeup(ep);
679 		}
680 	}
681 	mtx_unlock(&sc->sc_queue_mtx);
682 }
683 
684 static int
685 g_raid_event_check(struct g_raid_softc *sc, void *tgt)
686 {
687 	struct g_raid_event *ep;
688 	int	res = 0;
689 
690 	sx_assert(&sc->sc_lock, SX_XLOCKED);
691 
692 	mtx_lock(&sc->sc_queue_mtx);
693 	TAILQ_FOREACH(ep, &sc->sc_events, e_next) {
694 		if (ep->e_tgt != tgt)
695 			continue;
696 		res = 1;
697 		break;
698 	}
699 	mtx_unlock(&sc->sc_queue_mtx);
700 	return (res);
701 }
702 
703 /*
704  * Return the number of disks in the given state.
705  * If state is equal to -1, count all connected disks.
706  */
707 u_int
708 g_raid_ndisks(struct g_raid_softc *sc, int state)
709 {
710 	struct g_raid_disk *disk;
711 	u_int n;
712 
713 	sx_assert(&sc->sc_lock, SX_LOCKED);
714 
715 	n = 0;
716 	TAILQ_FOREACH(disk, &sc->sc_disks, d_next) {
717 		if (disk->d_state == state || state == -1)
718 			n++;
719 	}
720 	return (n);
721 }
722 
723 /*
724  * Return the number of subdisks in the given state.
725  * If state is equal to -1, count all connected subdisks.
726  */
727 u_int
728 g_raid_nsubdisks(struct g_raid_volume *vol, int state)
729 {
730 	struct g_raid_subdisk *subdisk;
731 	struct g_raid_softc *sc;
732 	u_int i, n;
733 
734 	sc = vol->v_softc;
735 	sx_assert(&sc->sc_lock, SX_LOCKED);
736 
737 	n = 0;
738 	for (i = 0; i < vol->v_disks_count; i++) {
739 		subdisk = &vol->v_subdisks[i];
740 		if ((state == -1 &&
741 		     subdisk->sd_state != G_RAID_SUBDISK_S_NONE) ||
742 		    subdisk->sd_state == state)
743 			n++;
744 	}
745 	return (n);
746 }
747 
748 /*
749  * Return the first subdisk in the given state.
750  * If state is equal to -1, return the first connected subdisk.
751  */
752 struct g_raid_subdisk *
753 g_raid_get_subdisk(struct g_raid_volume *vol, int state)
754 {
755 	struct g_raid_subdisk *sd;
756 	struct g_raid_softc *sc;
757 	u_int i;
758 
759 	sc = vol->v_softc;
760 	sx_assert(&sc->sc_lock, SX_LOCKED);
761 
762 	for (i = 0; i < vol->v_disks_count; i++) {
763 		sd = &vol->v_subdisks[i];
764 		if ((state == -1 &&
765 		     sd->sd_state != G_RAID_SUBDISK_S_NONE) ||
766 		    sd->sd_state == state)
767 			return (sd);
768 	}
769 	return (NULL);
770 }
771 
772 struct g_consumer *
773 g_raid_open_consumer(struct g_raid_softc *sc, const char *name)
774 {
775 	struct g_consumer *cp;
776 	struct g_provider *pp;
777 
778 	g_topology_assert();
779 
780 	if (strncmp(name, "/dev/", 5) == 0)
781 		name += 5;
782 	pp = g_provider_by_name(name);
783 	if (pp == NULL)
784 		return (NULL);
785 	cp = g_new_consumer(sc->sc_geom);
786 	if (g_attach(cp, pp) != 0) {
787 		g_destroy_consumer(cp);
788 		return (NULL);
789 	}
790 	if (g_access(cp, 1, 1, 1) != 0) {
791 		g_detach(cp);
792 		g_destroy_consumer(cp);
793 		return (NULL);
794 	}
795 	return (cp);
796 }
797 
798 static u_int
799 g_raid_nrequests(struct g_raid_softc *sc, struct g_consumer *cp)
800 {
801 	struct bio *bp;
802 	u_int nreqs = 0;
803 
804 	mtx_lock(&sc->sc_queue_mtx);
805 	TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
806 		if (bp->bio_from == cp)
807 			nreqs++;
808 	}
809 	mtx_unlock(&sc->sc_queue_mtx);
810 	return (nreqs);
811 }
812 
813 u_int
814 g_raid_nopens(struct g_raid_softc *sc)
815 {
816 	struct g_raid_volume *vol;
817 	u_int opens;
818 
819 	opens = 0;
820 	TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
821 		if (vol->v_provider_open != 0)
822 			opens++;
823 	}
824 	return (opens);
825 }
826 
827 static int
828 g_raid_consumer_is_busy(struct g_raid_softc *sc, struct g_consumer *cp)
829 {
830 
831 	if (cp->index > 0) {
832 		G_RAID_DEBUG1(2, sc,
833 		    "I/O requests for %s exist, can't destroy it now.",
834 		    cp->provider->name);
835 		return (1);
836 	}
837 	if (g_raid_nrequests(sc, cp) > 0) {
838 		G_RAID_DEBUG1(2, sc,
839 		    "I/O requests for %s in queue, can't destroy it now.",
840 		    cp->provider->name);
841 		return (1);
842 	}
843 	return (0);
844 }
845 
846 static void
847 g_raid_destroy_consumer(void *arg, int flags __unused)
848 {
849 	struct g_consumer *cp;
850 
851 	g_topology_assert();
852 
853 	cp = arg;
854 	G_RAID_DEBUG(1, "Consumer %s destroyed.", cp->provider->name);
855 	g_detach(cp);
856 	g_destroy_consumer(cp);
857 }
858 
859 void
860 g_raid_kill_consumer(struct g_raid_softc *sc, struct g_consumer *cp)
861 {
862 	struct g_provider *pp;
863 	int retaste_wait;
864 
865 	g_topology_assert_not();
866 
867 	g_topology_lock();
868 	cp->private = NULL;
869 	if (g_raid_consumer_is_busy(sc, cp))
870 		goto out;
871 	pp = cp->provider;
872 	retaste_wait = 0;
873 	if (cp->acw == 1) {
874 		if ((pp->geom->flags & G_GEOM_WITHER) == 0)
875 			retaste_wait = 1;
876 	}
877 	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
878 		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
879 	if (retaste_wait) {
880 		/*
881 		 * After the retaste event has been sent (inside g_access()), we
882 		 * can post an event to detach and destroy the consumer.
883 		 * A class that already has a consumer attached to the given
884 		 * provider will not receive the retaste event for that provider.
885 		 * This is how retaste events are ignored when consumers opened
886 		 * for write are closed: the consumer is detached and destroyed
887 		 * only after the retaste event has been sent.
888 		 */
889 		g_post_event(g_raid_destroy_consumer, cp, M_WAITOK, NULL);
890 		goto out;
891 	}
892 	G_RAID_DEBUG(1, "Consumer %s destroyed.", pp->name);
893 	g_detach(cp);
894 	g_destroy_consumer(cp);
895 out:
896 	g_topology_unlock();
897 }
898 
899 static void
900 g_raid_orphan(struct g_consumer *cp)
901 {
902 	struct g_raid_disk *disk;
903 
904 	g_topology_assert();
905 
906 	disk = cp->private;
907 	if (disk == NULL)
908 		return;
909 	g_raid_event_send(disk, G_RAID_DISK_E_DISCONNECTED,
910 	    G_RAID_EVENT_DISK);
911 }
912 
913 static void
914 g_raid_clean(struct g_raid_volume *vol, int acw)
915 {
916 	struct g_raid_softc *sc;
917 	int timeout;
918 
919 	sc = vol->v_softc;
920 	g_topology_assert_not();
921 	sx_assert(&sc->sc_lock, SX_XLOCKED);
922 
923 //	if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0)
924 //		return;
925 	if (!vol->v_dirty)
926 		return;
927 	if (vol->v_writes > 0)
928 		return;
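	/*
	 * While the provider stays open for writing, wait until
	 * kern.geom.raid.clean_time seconds have passed since the last
	 * write; on shutdown the volume is marked clean immediately.
	 */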
929 	if (acw > 0 || (acw == -1 &&
930 	    vol->v_provider != NULL && vol->v_provider->acw > 0)) {
931 		timeout = g_raid_clean_time - (time_uptime - vol->v_last_write);
932 		if (!g_raid_shutdown && timeout > 0)
933 			return;
934 	}
935 	vol->v_dirty = 0;
936 	G_RAID_DEBUG1(1, sc, "Volume %s marked as clean.",
937 	    vol->v_name);
938 	g_raid_write_metadata(sc, vol, NULL, NULL);
939 }
940 
941 static void
942 g_raid_dirty(struct g_raid_volume *vol)
943 {
944 	struct g_raid_softc *sc;
945 
946 	sc = vol->v_softc;
947 	g_topology_assert_not();
948 	sx_assert(&sc->sc_lock, SX_XLOCKED);
949 
950 //	if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0)
951 //		return;
952 	vol->v_dirty = 1;
953 	G_RAID_DEBUG1(1, sc, "Volume %s marked as dirty.",
954 	    vol->v_name);
955 	g_raid_write_metadata(sc, vol, NULL, NULL);
956 }
957 
958 void
959 g_raid_tr_flush_common(struct g_raid_tr_object *tr, struct bio *bp)
960 {
961 	struct g_raid_softc *sc;
962 	struct g_raid_volume *vol;
963 	struct g_raid_subdisk *sd;
964 	struct bio_queue_head queue;
965 	struct bio *cbp;
966 	int i;
967 
968 	vol = tr->tro_volume;
969 	sc = vol->v_softc;
970 
971 	/*
972 	 * Allocate all bios before sending any request, so we can return
973 	 * ENOMEM in a nice and clean way.
974 	 */
975 	bioq_init(&queue);
976 	for (i = 0; i < vol->v_disks_count; i++) {
977 		sd = &vol->v_subdisks[i];
978 		if (sd->sd_state == G_RAID_SUBDISK_S_NONE ||
979 		    sd->sd_state == G_RAID_SUBDISK_S_FAILED)
980 			continue;
981 		cbp = g_clone_bio(bp);
982 		if (cbp == NULL)
983 			goto failure;
984 		cbp->bio_caller1 = sd;
985 		bioq_insert_tail(&queue, cbp);
986 	}
987 	for (cbp = bioq_first(&queue); cbp != NULL;
988 	    cbp = bioq_first(&queue)) {
989 		bioq_remove(&queue, cbp);
990 		sd = cbp->bio_caller1;
991 		cbp->bio_caller1 = NULL;
992 		g_raid_subdisk_iostart(sd, cbp);
993 	}
994 	return;
995 failure:
996 	for (cbp = bioq_first(&queue); cbp != NULL;
997 	    cbp = bioq_first(&queue)) {
998 		bioq_remove(&queue, cbp);
999 		g_destroy_bio(cbp);
1000 	}
1001 	if (bp->bio_error == 0)
1002 		bp->bio_error = ENOMEM;
1003 	g_raid_iodone(bp, bp->bio_error);
1004 }
1005 
1006 static void
1007 g_raid_tr_kerneldump_common_done(struct bio *bp)
1008 {
1009 
1010 	bp->bio_flags |= BIO_DONE;
1011 }
1012 
1013 int
1014 g_raid_tr_kerneldump_common(struct g_raid_tr_object *tr,
1015     void *virtual, vm_offset_t physical, off_t offset, size_t length)
1016 {
1017 	struct g_raid_softc *sc;
1018 	struct g_raid_volume *vol;
1019 	struct bio bp;
1020 
1021 	vol = tr->tro_volume;
1022 	sc = vol->v_softc;
1023 
1024 	bzero(&bp, sizeof(bp));
1025 	bp.bio_cmd = BIO_WRITE;
1026 	bp.bio_done = g_raid_tr_kerneldump_common_done;
1027 	bp.bio_attribute = NULL;
1028 	bp.bio_offset = offset;
1029 	bp.bio_length = length;
1030 	bp.bio_data = virtual;
1031 	bp.bio_to = vol->v_provider;
1032 
1033 	g_raid_start(&bp);
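	/*
	 * During a kernel dump no other threads run, so poll the node's
	 * queues by hand until this request completes.
	 */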
1034 	while (!(bp.bio_flags & BIO_DONE)) {
1035 		G_RAID_DEBUG1(4, sc, "Poll...");
1036 		g_raid_poll(sc);
1037 		DELAY(10);
1038 	}
1039 
1040 	return (bp.bio_error != 0 ? EIO : 0);
1041 }
1042 
1043 static int
1044 g_raid_dump(void *arg,
1045     void *virtual, vm_offset_t physical, off_t offset, size_t length)
1046 {
1047 	struct g_raid_volume *vol;
1048 	int error;
1049 
1050 	vol = (struct g_raid_volume *)arg;
1051 	G_RAID_DEBUG1(3, vol->v_softc, "Dumping at off %llu len %llu.",
1052 	    (long long unsigned)offset, (long long unsigned)length);
1053 
1054 	error = G_RAID_TR_KERNELDUMP(vol->v_tr,
1055 	    virtual, physical, offset, length);
1056 	return (error);
1057 }
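/*
 * Handle the GEOM::kerneldump attribute: register g_raid_dump() as the dump
 * routine so kernel dumps are routed through the volume's transformation
 * layer, and clip the dump extent to the volume size.
 */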
1058 
1059 static void
1060 g_raid_kerneldump(struct g_raid_softc *sc, struct bio *bp)
1061 {
1062 	struct g_kerneldump *gkd;
1063 	struct g_provider *pp;
1064 	struct g_raid_volume *vol;
1065 
1066 	gkd = (struct g_kerneldump*)bp->bio_data;
1067 	pp = bp->bio_to;
1068 	vol = pp->private;
1069 	g_trace(G_T_TOPOLOGY, "g_raid_kerneldump(%s, %jd, %jd)",
1070 		pp->name, (intmax_t)gkd->offset, (intmax_t)gkd->length);
1071 	gkd->di.dumper = g_raid_dump;
1072 	gkd->di.priv = vol;
1073 	gkd->di.blocksize = vol->v_sectorsize;
1074 	gkd->di.maxiosize = DFLTPHYS;
1075 	gkd->di.mediaoffset = gkd->offset;
1076 	if ((gkd->offset + gkd->length) > vol->v_mediasize)
1077 		gkd->length = vol->v_mediasize - gkd->offset;
1078 	gkd->di.mediasize = gkd->length;
1079 	g_io_deliver(bp, 0);
1080 }
1081 
1082 static void
1083 g_raid_candelete(struct g_raid_softc *sc, struct bio *bp)
1084 {
1085 	struct g_provider *pp;
1086 	struct g_raid_volume *vol;
1087 	struct g_raid_subdisk *sd;
1088 	int *val;
1089 	int i;
1090 
1091 	val = (int *)bp->bio_data;
1092 	pp = bp->bio_to;
1093 	vol = pp->private;
1094 	*val = 0;
1095 	for (i = 0; i < vol->v_disks_count; i++) {
1096 		sd = &vol->v_subdisks[i];
1097 		if (sd->sd_state == G_RAID_SUBDISK_S_NONE)
1098 			continue;
1099 		if (sd->sd_disk->d_candelete) {
1100 			*val = 1;
1101 			break;
1102 		}
1103 	}
1104 	g_io_deliver(bp, 0);
1105 }
1106 
1107 static void
1108 g_raid_start(struct bio *bp)
1109 {
1110 	struct g_raid_softc *sc;
1111 
1112 	sc = bp->bio_to->geom->softc;
1113 	/*
1114 	 * If sc == NULL or there are no valid disks, provider's error
1115 	 * should be set and g_raid_start() should not be called at all.
1116 	 */
1117 //	KASSERT(sc != NULL && sc->sc_state == G_RAID_VOLUME_S_RUNNING,
1118 //	    ("Provider's error should be set (error=%d)(mirror=%s).",
1119 //	    bp->bio_to->error, bp->bio_to->name));
1120 	G_RAID_LOGREQ(3, bp, "Request received.");
1121 
1122 	switch (bp->bio_cmd) {
1123 	case BIO_READ:
1124 	case BIO_WRITE:
1125 	case BIO_DELETE:
1126 	case BIO_FLUSH:
1127 		break;
1128 	case BIO_GETATTR:
1129 		if (!strcmp(bp->bio_attribute, "GEOM::candelete"))
1130 			g_raid_candelete(sc, bp);
1131 		else if (!strcmp(bp->bio_attribute, "GEOM::kerneldump"))
1132 			g_raid_kerneldump(sc, bp);
1133 		else
1134 			g_io_deliver(bp, EOPNOTSUPP);
1135 		return;
1136 	default:
1137 		g_io_deliver(bp, EOPNOTSUPP);
1138 		return;
1139 	}
1140 	mtx_lock(&sc->sc_queue_mtx);
1141 	bioq_disksort(&sc->sc_queue, bp);
1142 	mtx_unlock(&sc->sc_queue_mtx);
1143 	if (!dumping) {
1144 		G_RAID_DEBUG1(4, sc, "Waking up %p.", sc);
1145 		wakeup(sc);
1146 	}
1147 }
1148 
1149 static int
1150 g_raid_bio_overlaps(const struct bio *bp, off_t lstart, off_t len)
1151 {
1152 	/*
1153 	 * 5 cases:
1154 	 * (1) bp entirely below NO
1155 	 * (2) bp entirely above NO
1156 	 * (3) bp start below, but end in range YES
1157 	 * (4) bp entirely within YES
1158 	 * (5) bp starts within, ends above YES
1159 	 *
1160 	 * lock range 10-19 (offset 10 length 10)
1161 	 * (1) 1-5: first if kicks it out
1162 	 * (2) 30-35: second if kicks it out
1163 	 * (3) 5-15: passes both ifs
1164 	 * (4) 12-14: passes both ifs
1165 	 * (5) 19-20: passes both
1166 	 */
1167 	off_t lend = lstart + len - 1;
1168 	off_t bstart = bp->bio_offset;
1169 	off_t bend = bp->bio_offset + bp->bio_length - 1;
1170 
1171 	if (bend < lstart)
1172 		return (0);
1173 	if (lend < bstart)
1174 		return (0);
1175 	return (1);
1176 }
1177 
1178 static int
1179 g_raid_is_in_locked_range(struct g_raid_volume *vol, const struct bio *bp)
1180 {
1181 	struct g_raid_lock *lp;
1182 
1183 	sx_assert(&vol->v_softc->sc_lock, SX_LOCKED);
1184 
1185 	LIST_FOREACH(lp, &vol->v_locks, l_next) {
1186 		if (g_raid_bio_overlaps(bp, lp->l_offset, lp->l_length))
1187 			return (1);
1188 	}
1189 	return (0);
1190 }
1191 
1192 static void
1193 g_raid_start_request(struct bio *bp)
1194 {
1195 	struct g_raid_softc *sc;
1196 	struct g_raid_volume *vol;
1197 
1198 	sc = bp->bio_to->geom->softc;
1199 	sx_assert(&sc->sc_lock, SX_LOCKED);
1200 	vol = bp->bio_to->private;
1201 
1202 	/*
1203 	 * Check to see if this item is in a locked range.  If so,
1204 	 * queue it to our locked queue and return.  We'll requeue
1205 	 * it when the range is unlocked.  Internal I/O for the
1206 	 * rebuild/rescan/recovery process is excluded from this
1207 	 * check so we can actually do the recovery.
1208 	 */
1209 	if (!(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL) &&
1210 	    g_raid_is_in_locked_range(vol, bp)) {
1211 		G_RAID_LOGREQ(3, bp, "Defer request.");
1212 		bioq_insert_tail(&vol->v_locked, bp);
1213 		return;
1214 	}
1215 
1216 	/*
1217 	 * If we're actually going to do the write/delete, then
1218 	 * update the idle stats for the volume.
1219 	 */
1220 	if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) {
1221 		if (!vol->v_dirty)
1222 			g_raid_dirty(vol);
1223 		vol->v_writes++;
1224 	}
1225 
1226 	/*
1227 	 * Put the request onto the inflight queue, so that new
1228 	 * synchronization requests can be checked against it.  Then tell
1229 	 * the transformation layer to start the I/O.
1230 	 */
1231 	bioq_insert_tail(&vol->v_inflight, bp);
1232 	G_RAID_LOGREQ(4, bp, "Request started");
1233 	G_RAID_TR_IOSTART(vol->v_tr, bp);
1234 }
1235 
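/*
 * Recount the in-flight requests that overlap each deferred lock request;
 * once a lock request has no more overlapping I/O, notify the transformation
 * layer that the range is now locked.
 */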
1236 static void
1237 g_raid_finish_with_locked_ranges(struct g_raid_volume *vol, struct bio *bp)
1238 {
1239 	off_t off, len;
1240 	struct bio *nbp;
1241 	struct g_raid_lock *lp;
1242 
1243 	vol->v_pending_lock = 0;
1244 	LIST_FOREACH(lp, &vol->v_locks, l_next) {
1245 		if (lp->l_pending) {
1246 			off = lp->l_offset;
1247 			len = lp->l_length;
1248 			lp->l_pending = 0;
1249 			TAILQ_FOREACH(nbp, &vol->v_inflight.queue, bio_queue) {
1250 				if (g_raid_bio_overlaps(nbp, off, len))
1251 					lp->l_pending++;
1252 			}
1253 			if (lp->l_pending) {
1254 				vol->v_pending_lock = 1;
1255 				G_RAID_DEBUG1(4, vol->v_softc,
1256 				    "Deferred lock(%jd, %jd) has %d pending",
1257 				    (intmax_t)off, (intmax_t)(off + len),
1258 				    lp->l_pending);
1259 				continue;
1260 			}
1261 			G_RAID_DEBUG1(4, vol->v_softc,
1262 			    "Deferred lock of %jd to %jd completed",
1263 			    (intmax_t)off, (intmax_t)(off + len));
1264 			G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg);
1265 		}
1266 	}
1267 }
1268 
1269 void
1270 g_raid_iodone(struct bio *bp, int error)
1271 {
1272 	struct g_raid_softc *sc;
1273 	struct g_raid_volume *vol;
1274 
1275 	sc = bp->bio_to->geom->softc;
1276 	sx_assert(&sc->sc_lock, SX_LOCKED);
1277 	vol = bp->bio_to->private;
1278 	G_RAID_LOGREQ(3, bp, "Request done: %d.", error);
1279 
1280 	/* Update stats if we did a write/delete. */
1281 	if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) {
1282 		vol->v_writes--;
1283 		vol->v_last_write = time_uptime;
1284 	}
1285 
1286 	bioq_remove(&vol->v_inflight, bp);
1287 	if (vol->v_pending_lock && g_raid_is_in_locked_range(vol, bp))
1288 		g_raid_finish_with_locked_ranges(vol, bp);
1289 	getmicrouptime(&vol->v_last_done);
1290 	g_io_deliver(bp, error);
1291 }
1292 
1293 int
1294 g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len,
1295     struct bio *ignore, void *argp)
1296 {
1297 	struct g_raid_softc *sc;
1298 	struct g_raid_lock *lp;
1299 	struct bio *bp;
1300 
1301 	sc = vol->v_softc;
1302 	lp = malloc(sizeof(*lp), M_RAID, M_WAITOK | M_ZERO);
1303 	LIST_INSERT_HEAD(&vol->v_locks, lp, l_next);
1304 	lp->l_offset = off;
1305 	lp->l_length = len;
1306 	lp->l_callback_arg = argp;
1307 
1308 	lp->l_pending = 0;
1309 	TAILQ_FOREACH(bp, &vol->v_inflight.queue, bio_queue) {
1310 		if (bp != ignore && g_raid_bio_overlaps(bp, off, len))
1311 			lp->l_pending++;
1312 	}
1313 
1314 	/*
1315 	 * If there are any writes that are pending, we return EBUSY.  All
1316 	 * callers will have to wait until all pending writes clear.
1317 	 */
1318 	if (lp->l_pending > 0) {
1319 		vol->v_pending_lock = 1;
1320 		G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd deferred %d pend",
1321 		    (intmax_t)off, (intmax_t)(off+len), lp->l_pending);
1322 		return (EBUSY);
1323 	}
1324 	G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd",
1325 	    (intmax_t)off, (intmax_t)(off+len));
1326 	G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg);
1327 	return (0);
1328 }
1329 
1330 int
1331 g_raid_unlock_range(struct g_raid_volume *vol, off_t off, off_t len)
1332 {
1333 	struct g_raid_lock *lp;
1334 	struct g_raid_softc *sc;
1335 	struct bio *bp;
1336 
1337 	sc = vol->v_softc;
1338 	LIST_FOREACH(lp, &vol->v_locks, l_next) {
1339 		if (lp->l_offset == off && lp->l_length == len) {
1340 			LIST_REMOVE(lp, l_next);
1341 			/* XXX
1342 			 * Right now we just put them all back on the queue
1343 			 * and hope for the best.  This works because any
1344 			 * still-locked ranges will go right back on this list
1345 			 * when the worker thread runs.
1346 			 * XXX
1347 			 */
1348 			G_RAID_DEBUG1(4, sc, "Unlocked %jd to %jd",
1349 			    (intmax_t)lp->l_offset,
1350 			    (intmax_t)(lp->l_offset+lp->l_length));
1351 			mtx_lock(&sc->sc_queue_mtx);
1352 			while ((bp = bioq_takefirst(&vol->v_locked)) != NULL)
1353 				bioq_disksort(&sc->sc_queue, bp);
1354 			mtx_unlock(&sc->sc_queue_mtx);
1355 			free(lp, M_RAID);
1356 			return (0);
1357 		}
1358 	}
1359 	return (EINVAL);
1360 }
1361 
1362 void
1363 g_raid_subdisk_iostart(struct g_raid_subdisk *sd, struct bio *bp)
1364 {
1365 	struct g_consumer *cp;
1366 	struct g_raid_disk *disk, *tdisk;
1367 
1368 	bp->bio_caller1 = sd;
1369 
1370 	/*
1371 	 * Make sure that the disk is present.  Generally it is the task of
1372 	 * the transformation layers not to send requests to absent disks,
1373 	 * but it is better to be safe than sorry and report the situation.
1374 	 */
1375 	if (sd->sd_disk == NULL) {
1376 		G_RAID_LOGREQ(0, bp, "Warning! I/O request to an absent disk!");
1377 nodisk:
1378 		bp->bio_from = NULL;
1379 		bp->bio_to = NULL;
1380 		bp->bio_error = ENXIO;
1381 		g_raid_disk_done(bp);
1382 		return;
1383 	}
1384 	disk = sd->sd_disk;
1385 	if (disk->d_state != G_RAID_DISK_S_ACTIVE &&
1386 	    disk->d_state != G_RAID_DISK_S_FAILED) {
1387 		G_RAID_LOGREQ(0, bp, "Warning! I/O request to a disk in a "
1388 		    "wrong state (%s)!", g_raid_disk_state2str(disk->d_state));
1389 		goto nodisk;
1390 	}
1391 
1392 	cp = disk->d_consumer;
1393 	bp->bio_from = cp;
1394 	bp->bio_to = cp->provider;
1395 	cp->index++;
1396 
1397 	/* Update average disk load. */
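	/*
	 * d_load is an exponential moving average of the per-consumer
	 * outstanding request count: 1/8 of the new (scaled) value plus
	 * 7/8 of the previous one.
	 */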
1398 	TAILQ_FOREACH(tdisk, &sd->sd_softc->sc_disks, d_next) {
1399 		if (tdisk->d_consumer == NULL)
1400 			tdisk->d_load = 0;
1401 		else
1402 			tdisk->d_load = (tdisk->d_consumer->index *
1403 			    G_RAID_SUBDISK_LOAD_SCALE + tdisk->d_load * 7) / 8;
1404 	}
1405 
1406 	disk->d_last_offset = bp->bio_offset + bp->bio_length;
1407 	if (dumping) {
1408 		G_RAID_LOGREQ(3, bp, "Sending dumping request.");
1409 		if (bp->bio_cmd == BIO_WRITE) {
1410 			bp->bio_error = g_raid_subdisk_kerneldump(sd,
1411 			    bp->bio_data, 0, bp->bio_offset, bp->bio_length);
1412 		} else
1413 			bp->bio_error = EOPNOTSUPP;
1414 		g_raid_disk_done(bp);
1415 	} else {
1416 		bp->bio_done = g_raid_disk_done;
1417 		bp->bio_offset += sd->sd_offset;
1418 		G_RAID_LOGREQ(3, bp, "Sending request.");
1419 		g_io_request(bp, cp);
1420 	}
1421 }
1422 
1423 int
1424 g_raid_subdisk_kerneldump(struct g_raid_subdisk *sd,
1425     void *virtual, vm_offset_t physical, off_t offset, size_t length)
1426 {
1427 
1428 	if (sd->sd_disk == NULL)
1429 		return (ENXIO);
1430 	if (sd->sd_disk->d_kd.di.dumper == NULL)
1431 		return (EOPNOTSUPP);
1432 	return (dump_write(&sd->sd_disk->d_kd.di,
1433 	    virtual, physical,
1434 	    sd->sd_disk->d_kd.di.mediaoffset + sd->sd_offset + offset,
1435 	    length));
1436 }
1437 
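/*
 * Completion handler for I/O sent to component disks: queue the finished bio
 * back to the node so that the worker thread (or the dump-time poller) can
 * complete it via g_raid_disk_done_request().
 */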
1438 static void
1439 g_raid_disk_done(struct bio *bp)
1440 {
1441 	struct g_raid_softc *sc;
1442 	struct g_raid_subdisk *sd;
1443 
1444 	sd = bp->bio_caller1;
1445 	sc = sd->sd_softc;
1446 	mtx_lock(&sc->sc_queue_mtx);
1447 	bioq_disksort(&sc->sc_queue, bp);
1448 	mtx_unlock(&sc->sc_queue_mtx);
1449 	if (!dumping)
1450 		wakeup(sc);
1451 }
1452 
1453 static void
1454 g_raid_disk_done_request(struct bio *bp)
1455 {
1456 	struct g_raid_softc *sc;
1457 	struct g_raid_disk *disk;
1458 	struct g_raid_subdisk *sd;
1459 	struct g_raid_volume *vol;
1460 
1461 	g_topology_assert_not();
1462 
1463 	G_RAID_LOGREQ(3, bp, "Disk request done: %d.", bp->bio_error);
1464 	sd = bp->bio_caller1;
1465 	sc = sd->sd_softc;
1466 	vol = sd->sd_volume;
1467 	if (bp->bio_from != NULL) {
1468 		bp->bio_from->index--;
1469 		disk = bp->bio_from->private;
1470 		if (disk == NULL)
1471 			g_raid_kill_consumer(sc, bp->bio_from);
1472 	}
1473 	bp->bio_offset -= sd->sd_offset;
1474 
1475 	G_RAID_TR_IODONE(vol->v_tr, sd, bp);
1476 }
1477 
1478 static void
1479 g_raid_handle_event(struct g_raid_softc *sc, struct g_raid_event *ep)
1480 {
1481 
1482 	if ((ep->e_flags & G_RAID_EVENT_VOLUME) != 0)
1483 		ep->e_error = g_raid_update_volume(ep->e_tgt, ep->e_event);
1484 	else if ((ep->e_flags & G_RAID_EVENT_DISK) != 0)
1485 		ep->e_error = g_raid_update_disk(ep->e_tgt, ep->e_event);
1486 	else if ((ep->e_flags & G_RAID_EVENT_SUBDISK) != 0)
1487 		ep->e_error = g_raid_update_subdisk(ep->e_tgt, ep->e_event);
1488 	else
1489 		ep->e_error = g_raid_update_node(ep->e_tgt, ep->e_event);
1490 	if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0) {
1491 		KASSERT(ep->e_error == 0,
1492 		    ("Error cannot be handled."));
1493 		g_raid_event_free(ep);
1494 	} else {
1495 		ep->e_flags |= G_RAID_EVENT_DONE;
1496 		G_RAID_DEBUG1(4, sc, "Waking up %p.", ep);
1497 		mtx_lock(&sc->sc_queue_mtx);
1498 		wakeup(ep);
1499 		mtx_unlock(&sc->sc_queue_mtx);
1500 	}
1501 }
1502 
1503 /*
1504  * Worker thread.
1505  */
1506 static void
1507 g_raid_worker(void *arg)
1508 {
1509 	struct g_raid_softc *sc;
1510 	struct g_raid_event *ep;
1511 	struct g_raid_volume *vol;
1512 	struct bio *bp;
1513 	struct timeval now, t;
1514 	int timeout, rv;
1515 
1516 	sc = arg;
1517 	thread_lock(curthread);
1518 	sched_prio(curthread, PRIBIO);
1519 	thread_unlock(curthread);
1520 
1521 	sx_xlock(&sc->sc_lock);
1522 	for (;;) {
1523 		mtx_lock(&sc->sc_queue_mtx);
1524 		/*
1525 		 * First take a look at events.
1526 		 * This is important to handle events before any I/O requests.
1527 		 */
1528 		bp = NULL;
1529 		vol = NULL;
1530 		rv = 0;
1531 		ep = TAILQ_FIRST(&sc->sc_events);
1532 		if (ep != NULL)
1533 			TAILQ_REMOVE(&sc->sc_events, ep, e_next);
1534 		else if ((bp = bioq_takefirst(&sc->sc_queue)) != NULL)
1535 			;
1536 		else {
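			/*
			 * Nothing to do: sleep until the volume that has been
			 * idle the longest crosses the idle threshold, then
			 * fall through to the idle/clean handling below.
			 */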
1537 			getmicrouptime(&now);
1538 			t = now;
1539 			TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
1540 				if (bioq_first(&vol->v_inflight) == NULL &&
1541 				    vol->v_tr &&
1542 				    timevalcmp(&vol->v_last_done, &t, < ))
1543 					t = vol->v_last_done;
1544 			}
1545 			timevalsub(&t, &now);
1546 			timeout = g_raid_idle_threshold +
1547 			    t.tv_sec * 1000000 + t.tv_usec;
1548 			if (timeout > 0) {
1549 				/*
1550 				 * Two steps to avoid overflows at HZ=1000
1551 				 * and idle timeouts > 2.1s.  Some rounding
1552 				 * errors can occur, but they are < 1 tick,
1553 				 * which is deemed to be close enough for
1554 				 * this purpose.
1555 				 */
1556 				int micpertic = 1000000 / hz;
1557 				timeout = (timeout + micpertic - 1) / micpertic;
1558 				sx_xunlock(&sc->sc_lock);
1559 				MSLEEP(rv, sc, &sc->sc_queue_mtx,
1560 				    PRIBIO | PDROP, "-", timeout);
1561 				sx_xlock(&sc->sc_lock);
1562 				goto process;
1563 			} else
1564 				rv = EWOULDBLOCK;
1565 		}
1566 		mtx_unlock(&sc->sc_queue_mtx);
1567 process:
1568 		if (ep != NULL) {
1569 			g_raid_handle_event(sc, ep);
1570 		} else if (bp != NULL) {
1571 			if (bp->bio_to != NULL &&
1572 			    bp->bio_to->geom == sc->sc_geom)
1573 				g_raid_start_request(bp);
1574 			else
1575 				g_raid_disk_done_request(bp);
1576 		} else if (rv == EWOULDBLOCK) {
1577 			TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
1578 				g_raid_clean(vol, -1);
1579 				if (bioq_first(&vol->v_inflight) == NULL &&
1580 				    vol->v_tr) {
1581 					t.tv_sec = g_raid_idle_threshold / 1000000;
1582 					t.tv_usec = g_raid_idle_threshold % 1000000;
1583 					timevaladd(&t, &vol->v_last_done);
1584 					getmicrouptime(&now);
1585 					if (timevalcmp(&t, &now, <= )) {
1586 						G_RAID_TR_IDLE(vol->v_tr);
1587 						vol->v_last_done = now;
1588 					}
1589 				}
1590 			}
1591 		}
1592 		if (sc->sc_stopping == G_RAID_DESTROY_HARD)
1593 			g_raid_destroy_node(sc, 1);	/* May not return. */
1594 	}
1595 }
1596 
1597 static void
1598 g_raid_poll(struct g_raid_softc *sc)
1599 {
1600 	struct g_raid_event *ep;
1601 	struct bio *bp;
1602 
1603 	sx_xlock(&sc->sc_lock);
1604 	mtx_lock(&sc->sc_queue_mtx);
1605 	/*
1606 	 * First take a look at events.
1607 	 * This is important to handle events before any I/O requests.
1608 	 */
1609 	ep = TAILQ_FIRST(&sc->sc_events);
1610 	if (ep != NULL) {
1611 		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
1612 		mtx_unlock(&sc->sc_queue_mtx);
1613 		g_raid_handle_event(sc, ep);
1614 		goto out;
1615 	}
1616 	bp = bioq_takefirst(&sc->sc_queue);
1617 	if (bp != NULL) {
1618 		mtx_unlock(&sc->sc_queue_mtx);
1619 		if (bp->bio_from == NULL ||
1620 		    bp->bio_from->geom != sc->sc_geom)
1621 			g_raid_start_request(bp);
1622 		else
1623 			g_raid_disk_done_request(bp);
1624 	}
1625 out:
1626 	sx_xunlock(&sc->sc_lock);
1627 }
1628 
1629 static void
1630 g_raid_launch_provider(struct g_raid_volume *vol)
1631 {
1632 	struct g_raid_disk *disk;
1633 	struct g_raid_softc *sc;
1634 	struct g_provider *pp;
1635 	char name[G_RAID_MAX_VOLUMENAME];
1636 	off_t off;
1637 
1638 	sc = vol->v_softc;
1639 	sx_assert(&sc->sc_lock, SX_LOCKED);
1640 
1641 	g_topology_lock();
1642 	/* Try to name provider with volume name. */
1643 	snprintf(name, sizeof(name), "raid/%s", vol->v_name);
1644 	if (g_raid_name_format == 0 || vol->v_name[0] == 0 ||
1645 	    g_provider_by_name(name) != NULL) {
1646 		/* Otherwise use sequential volume number. */
1647 		snprintf(name, sizeof(name), "raid/r%d", vol->v_global_id);
1648 	}
1649 	pp = g_new_providerf(sc->sc_geom, "%s", name);
1650 	pp->private = vol;
1651 	pp->mediasize = vol->v_mediasize;
1652 	pp->sectorsize = vol->v_sectorsize;
1653 	pp->stripesize = 0;
1654 	pp->stripeoffset = 0;
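	/*
	 * For RAID1, RAID3, SINGLE and CONCAT volumes, inherit the stripe
	 * geometry of the first component provider (scaled by the number of
	 * data disks for RAID3); other levels report the volume strip size.
	 */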
1655 	if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 ||
1656 	    vol->v_raid_level == G_RAID_VOLUME_RL_RAID3 ||
1657 	    vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE ||
1658 	    vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT) {
1659 		if ((disk = vol->v_subdisks[0].sd_disk) != NULL &&
1660 		    disk->d_consumer != NULL &&
1661 		    disk->d_consumer->provider != NULL) {
1662 			pp->stripesize = disk->d_consumer->provider->stripesize;
1663 			off = disk->d_consumer->provider->stripeoffset;
1664 			pp->stripeoffset = off + vol->v_subdisks[0].sd_offset;
1665 			if (off > 0)
1666 				pp->stripeoffset %= off;
1667 		}
1668 		if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3) {
1669 			pp->stripesize *= (vol->v_disks_count - 1);
1670 			pp->stripeoffset *= (vol->v_disks_count - 1);
1671 		}
1672 	} else
1673 		pp->stripesize = vol->v_strip_size;
1674 	vol->v_provider = pp;
1675 	g_error_provider(pp, 0);
1676 	g_topology_unlock();
1677 	G_RAID_DEBUG1(0, sc, "Provider %s for volume %s created.",
1678 	    pp->name, vol->v_name);
1679 }
1680 
1681 static void
1682 g_raid_destroy_provider(struct g_raid_volume *vol)
1683 {
1684 	struct g_raid_softc *sc;
1685 	struct g_provider *pp;
1686 	struct bio *bp, *tmp;
1687 
1688 	g_topology_assert_not();
1689 	sc = vol->v_softc;
1690 	pp = vol->v_provider;
1691 	KASSERT(pp != NULL, ("NULL provider (volume=%s).", vol->v_name));
1692 
1693 	g_topology_lock();
1694 	g_error_provider(pp, ENXIO);
1695 	mtx_lock(&sc->sc_queue_mtx);
1696 	TAILQ_FOREACH_SAFE(bp, &sc->sc_queue.queue, bio_queue, tmp) {
1697 		if (bp->bio_to != pp)
1698 			continue;
1699 		bioq_remove(&sc->sc_queue, bp);
1700 		g_io_deliver(bp, ENXIO);
1701 	}
1702 	mtx_unlock(&sc->sc_queue_mtx);
1703 	G_RAID_DEBUG1(0, sc, "Provider %s for volume %s destroyed.",
1704 	    pp->name, vol->v_name);
1705 	g_wither_provider(pp, ENXIO);
1706 	g_topology_unlock();
1707 	vol->v_provider = NULL;
1708 }
1709 
1710 /*
1711  * Update device state.
1712  */
1713 static int
1714 g_raid_update_volume(struct g_raid_volume *vol, u_int event)
1715 {
1716 	struct g_raid_softc *sc;
1717 
1718 	sc = vol->v_softc;
1719 	sx_assert(&sc->sc_lock, SX_XLOCKED);
1720 
1721 	G_RAID_DEBUG1(2, sc, "Event %s for volume %s.",
1722 	    g_raid_volume_event2str(event),
1723 	    vol->v_name);
1724 	switch (event) {
1725 	case G_RAID_VOLUME_E_DOWN:
1726 		if (vol->v_provider != NULL)
1727 			g_raid_destroy_provider(vol);
1728 		break;
1729 	case G_RAID_VOLUME_E_UP:
1730 		if (vol->v_provider == NULL)
1731 			g_raid_launch_provider(vol);
1732 		break;
1733 	case G_RAID_VOLUME_E_START:
1734 		if (vol->v_tr)
1735 			G_RAID_TR_START(vol->v_tr);
1736 		return (0);
1737 	default:
1738 		if (sc->sc_md)
1739 			G_RAID_MD_VOLUME_EVENT(sc->sc_md, vol, event);
1740 		return (0);
1741 	}
1742 
1743 	/* Manage root mount release. */
1744 	if (vol->v_starting) {
1745 		vol->v_starting = 0;
1746 		G_RAID_DEBUG1(1, sc, "root_mount_rel %p", vol->v_rootmount);
1747 		root_mount_rel(vol->v_rootmount);
1748 		vol->v_rootmount = NULL;
1749 	}
1750 	if (vol->v_stopping && vol->v_provider_open == 0)
1751 		g_raid_destroy_volume(vol);
1752 	return (0);
1753 }
1754 
1755 /*
1756  * Update subdisk state.
1757  */
1758 static int
1759 g_raid_update_subdisk(struct g_raid_subdisk *sd, u_int event)
1760 {
1761 	struct g_raid_softc *sc;
1762 	struct g_raid_volume *vol;
1763 
1764 	sc = sd->sd_softc;
1765 	vol = sd->sd_volume;
1766 	sx_assert(&sc->sc_lock, SX_XLOCKED);
1767 
1768 	G_RAID_DEBUG1(2, sc, "Event %s for subdisk %s:%d-%s.",
1769 	    g_raid_subdisk_event2str(event),
1770 	    vol->v_name, sd->sd_pos,
1771 	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
1772 	if (vol->v_tr)
1773 		G_RAID_TR_EVENT(vol->v_tr, sd, event);
1774 
1775 	return (0);
1776 }
1777 
1778 /*
1779  * Update disk state.
1780  */
1781 static int
1782 g_raid_update_disk(struct g_raid_disk *disk, u_int event)
1783 {
1784 	struct g_raid_softc *sc;
1785 
1786 	sc = disk->d_softc;
1787 	sx_assert(&sc->sc_lock, SX_XLOCKED);
1788 
1789 	G_RAID_DEBUG1(2, sc, "Event %s for disk %s.",
1790 	    g_raid_disk_event2str(event),
1791 	    g_raid_get_diskname(disk));
1792 
1793 	if (sc->sc_md)
1794 		G_RAID_MD_EVENT(sc->sc_md, disk, event);
1795 	return (0);
1796 }
1797 
1798 /*
1799  * Node event.
1800  */
1801 static int
1802 g_raid_update_node(struct g_raid_softc *sc, u_int event)
1803 {
1804 	sx_assert(&sc->sc_lock, SX_XLOCKED);
1805 
1806 	G_RAID_DEBUG1(2, sc, "Event %s for the array.",
1807 	    g_raid_node_event2str(event));
1808 
1809 	if (event == G_RAID_NODE_E_WAKE)
1810 		return (0);
1811 	if (sc->sc_md)
1812 		G_RAID_MD_EVENT(sc->sc_md, NULL, event);
1813 	return (0);
1814 }
1815 
1816 static int
1817 g_raid_access(struct g_provider *pp, int acr, int acw, int ace)
1818 {
1819 	struct g_raid_volume *vol;
1820 	struct g_raid_softc *sc;
1821 	int dcw, opens, error = 0;
1822 
1823 	g_topology_assert();
1824 	sc = pp->geom->softc;
1825 	vol = pp->private;
1826 	KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name));
1827 	KASSERT(vol != NULL, ("NULL volume (provider=%s).", pp->name));
1828 
1829 	G_RAID_DEBUG1(2, sc, "Access request for %s: r%dw%de%d.", pp->name,
1830 	    acr, acw, ace);
1831 	dcw = pp->acw + acw;
1832 
1833 	g_topology_unlock();
1834 	sx_xlock(&sc->sc_lock);
1835 	/* Deny new opens while dying. */
1836 	if (sc->sc_stopping != 0 && (acr > 0 || acw > 0 || ace > 0)) {
1837 		error = ENXIO;
1838 		goto out;
1839 	}
1840 	if (dcw == 0)
1841 		g_raid_clean(vol, dcw);
1842 	vol->v_provider_open += acr + acw + ace;
1843 	/* Handle delayed node destruction. */
1844 	if (sc->sc_stopping == G_RAID_DESTROY_DELAYED &&
1845 	    vol->v_provider_open == 0) {
1846 		/* Count open volumes. */
1847 		opens = g_raid_nopens(sc);
1848 		if (opens == 0) {
1849 			sc->sc_stopping = G_RAID_DESTROY_HARD;
1850 			/* Wake up the worker to make it self-destruct. */
1851 			g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
1852 		}
1853 	}
1854 	/* Handle open volume destruction. */
1855 	if (vol->v_stopping && vol->v_provider_open == 0)
1856 		g_raid_destroy_volume(vol);
1857 out:
1858 	sx_xunlock(&sc->sc_lock);
1859 	g_topology_lock();
1860 	return (error);
1861 }
1862 
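/*
 * Create a new array node: allocate the softc, set up the GEOM geom and
 * start the dedicated worker thread for this array.
 */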
1863 struct g_raid_softc *
1864 g_raid_create_node(struct g_class *mp,
1865     const char *name, struct g_raid_md_object *md)
1866 {
1867 	struct g_raid_softc *sc;
1868 	struct g_geom *gp;
1869 	int error;
1870 
1871 	g_topology_assert();
1872 	G_RAID_DEBUG(1, "Creating array %s.", name);
1873 
1874 	gp = g_new_geomf(mp, "%s", name);
1875 	sc = malloc(sizeof(*sc), M_RAID, M_WAITOK | M_ZERO);
1876 	gp->start = g_raid_start;
1877 	gp->orphan = g_raid_orphan;
1878 	gp->access = g_raid_access;
1879 	gp->dumpconf = g_raid_dumpconf;
1880 
1881 	sc->sc_md = md;
1882 	sc->sc_geom = gp;
1883 	sc->sc_flags = 0;
1884 	TAILQ_INIT(&sc->sc_volumes);
1885 	TAILQ_INIT(&sc->sc_disks);
1886 	sx_init(&sc->sc_lock, "graid:lock");
1887 	mtx_init(&sc->sc_queue_mtx, "graid:queue", NULL, MTX_DEF);
1888 	TAILQ_INIT(&sc->sc_events);
1889 	bioq_init(&sc->sc_queue);
1890 	gp->softc = sc;
1891 	error = kproc_create(g_raid_worker, sc, &sc->sc_worker, 0, 0,
1892 	    "g_raid %s", name);
1893 	if (error != 0) {
1894 		G_RAID_DEBUG(0, "Cannot create kernel thread for %s.", name);
1895 		mtx_destroy(&sc->sc_queue_mtx);
1896 		sx_destroy(&sc->sc_lock);
1897 		g_destroy_geom(sc->sc_geom);
1898 		free(sc, M_RAID);
1899 		return (NULL);
1900 	}
1901 
1902 	G_RAID_DEBUG1(0, sc, "Array %s created.", name);
1903 	return (sc);
1904 }
1905 
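/*
 * Create a new volume within the array, assign it a global ID and hold
 * root mount until the volume finishes starting.
 */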
1906 struct g_raid_volume *
1907 g_raid_create_volume(struct g_raid_softc *sc, const char *name, int id)
1908 {
1909 	struct g_raid_volume	*vol, *vol1;
1910 	int i;
1911 
1912 	G_RAID_DEBUG1(1, sc, "Creating volume %s.", name);
1913 	vol = malloc(sizeof(*vol), M_RAID, M_WAITOK | M_ZERO);
1914 	vol->v_softc = sc;
1915 	strlcpy(vol->v_name, name, G_RAID_MAX_VOLUMENAME);
1916 	vol->v_state = G_RAID_VOLUME_S_STARTING;
1917 	vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN;
1918 	vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_UNKNOWN;
1919 	vol->v_rotate_parity = 1;
1920 	bioq_init(&vol->v_inflight);
1921 	bioq_init(&vol->v_locked);
1922 	LIST_INIT(&vol->v_locks);
1923 	for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) {
1924 		vol->v_subdisks[i].sd_softc = sc;
1925 		vol->v_subdisks[i].sd_volume = vol;
1926 		vol->v_subdisks[i].sd_pos = i;
1927 		vol->v_subdisks[i].sd_state = G_RAID_DISK_S_NONE;
1928 	}
1929 
1930 	/* Find a free ID: use the requested one if unused, else the lowest free. */
1931 	g_topology_lock();
1932 	vol1 = vol;
1933 	if (id >= 0) {
1934 		LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) {
1935 			if (vol1->v_global_id == id)
1936 				break;
1937 		}
1938 	}
1939 	if (vol1 != NULL) {
1940 		for (id = 0; ; id++) {
1941 			LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) {
1942 				if (vol1->v_global_id == id)
1943 					break;
1944 			}
1945 			if (vol1 == NULL)
1946 				break;
1947 		}
1948 	}
1949 	vol->v_global_id = id;
1950 	LIST_INSERT_HEAD(&g_raid_volumes, vol, v_global_next);
1951 	g_topology_unlock();
1952 
1953 	/* Delay root mounting. */
1954 	vol->v_rootmount = root_mount_hold("GRAID");
1955 	G_RAID_DEBUG1(1, sc, "root_mount_hold %p", vol->v_rootmount);
1956 	vol->v_starting = 1;
1957 	TAILQ_INSERT_TAIL(&sc->sc_volumes, vol, v_next);
1958 	return (vol);
1959 }
1960 
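/*
 * Allocate a new disk structure and link it into the array node.
 */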
1961 struct g_raid_disk *
1962 g_raid_create_disk(struct g_raid_softc *sc)
1963 {
1964 	struct g_raid_disk	*disk;
1965 
1966 	G_RAID_DEBUG1(1, sc, "Creating disk.");
1967 	disk = malloc(sizeof(*disk), M_RAID, M_WAITOK | M_ZERO);
1968 	disk->d_softc = sc;
1969 	disk->d_state = G_RAID_DISK_S_NONE;
1970 	TAILQ_INIT(&disk->d_subdisks);
1971 	TAILQ_INSERT_TAIL(&sc->sc_disks, disk, d_next);
1972 	return (disk);
1973 }
1974 
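/*
 * Choose a transformation (RAID level) module for the volume by letting
 * every enabled class taste it; mark the volume unsupported if none fits.
 */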
1975 int
g_raid_start_volume(struct g_raid_volume *vol)
1976 {
1977 	struct g_raid_tr_class *class;
1978 	struct g_raid_tr_object *obj;
1979 	int status;
1980 
1981 	G_RAID_DEBUG1(2, vol->v_softc, "Starting volume %s.", vol->v_name);
1982 	LIST_FOREACH(class, &g_raid_tr_classes, trc_list) {
1983 		if (!class->trc_enable)
1984 			continue;
1985 		G_RAID_DEBUG1(2, vol->v_softc,
1986 		    "Tasting volume %s for %s transformation.",
1987 		    vol->v_name, class->name);
1988 		obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
1989 		    M_WAITOK);
1990 		obj->tro_class = class;
1991 		obj->tro_volume = vol;
1992 		status = G_RAID_TR_TASTE(obj, vol);
1993 		if (status != G_RAID_TR_TASTE_FAIL)
1994 			break;
1995 		kobj_delete((kobj_t)obj, M_RAID);
1996 	}
1997 	if (class == NULL) {
1998 		G_RAID_DEBUG1(0, vol->v_softc,
1999 		    "No transformation module found for %s.",
2000 		    vol->v_name);
2001 		vol->v_tr = NULL;
2002 		g_raid_change_volume_state(vol, G_RAID_VOLUME_S_UNSUPPORTED);
2003 		g_raid_event_send(vol, G_RAID_VOLUME_E_DOWN,
2004 		    G_RAID_EVENT_VOLUME);
2005 		return (-1);
2006 	}
2007 	G_RAID_DEBUG1(2, vol->v_softc,
2008 	    "Transformation module %s chosen for %s.",
2009 	    class->name, vol->v_name);
2010 	vol->v_tr = obj;
2011 	return (0);
2012 }
2013 
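/*
 * Destroy the whole array node.  Returns EBUSY if some volume or disk
 * cannot be destroyed yet.  When called from the worker thread itself
 * (worker != 0), the thread frees the node and exits here.
 */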
2014 int
2015 g_raid_destroy_node(struct g_raid_softc *sc, int worker)
2016 {
2017 	struct g_raid_volume *vol, *tmpv;
2018 	struct g_raid_disk *disk, *tmpd;
2019 	int error = 0;
2020 
2021 	sc->sc_stopping = G_RAID_DESTROY_HARD;
2022 	TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tmpv) {
2023 		if (g_raid_destroy_volume(vol))
2024 			error = EBUSY;
2025 	}
2026 	if (error)
2027 		return (error);
2028 	TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tmpd) {
2029 		if (g_raid_destroy_disk(disk))
2030 			error = EBUSY;
2031 	}
2032 	if (error)
2033 		return (error);
2034 	if (sc->sc_md) {
2035 		G_RAID_MD_FREE(sc->sc_md);
2036 		kobj_delete((kobj_t)sc->sc_md, M_RAID);
2037 		sc->sc_md = NULL;
2038 	}
2039 	if (sc->sc_geom != NULL) {
2040 		G_RAID_DEBUG1(0, sc, "Array %s destroyed.", sc->sc_name);
2041 		g_topology_lock();
2042 		sc->sc_geom->softc = NULL;
2043 		g_wither_geom(sc->sc_geom, ENXIO);
2044 		g_topology_unlock();
2045 		sc->sc_geom = NULL;
2046 	} else
2047 		G_RAID_DEBUG(1, "Array destroyed.");
2048 	if (worker) {
2049 		g_raid_event_cancel(sc, sc);
2050 		mtx_destroy(&sc->sc_queue_mtx);
2051 		sx_xunlock(&sc->sc_lock);
2052 		sx_destroy(&sc->sc_lock);
2053 		wakeup(&sc->sc_stopping);
2054 		free(sc, M_RAID);
2055 		curthread->td_pflags &= ~TDP_GEOM;
2056 		G_RAID_DEBUG(1, "Thread exiting.");
2057 		kproc_exit(0);
2058 	} else {
2059 		/* Wake up worker to make it self-destruct. */
2060 		g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
2061 	}
2062 	return (0);
2063 }
2064 
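/*
 * Destroy a volume.  Returns EBUSY if the volume still has a running
 * transformation, pending events, a provider or open consumers; in that
 * case destruction completes later from the event path.
 */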
2065 int
2066 g_raid_destroy_volume(struct g_raid_volume *vol)
2067 {
2068 	struct g_raid_softc *sc;
2069 	struct g_raid_disk *disk;
2070 	int i;
2071 
2072 	sc = vol->v_softc;
2073 	G_RAID_DEBUG1(2, sc, "Destroying volume %s.", vol->v_name);
2074 	vol->v_stopping = 1;
2075 	if (vol->v_state != G_RAID_VOLUME_S_STOPPED) {
2076 		if (vol->v_tr) {
2077 			G_RAID_TR_STOP(vol->v_tr);
2078 			return (EBUSY);
2079 		} else
2080 			vol->v_state = G_RAID_VOLUME_S_STOPPED;
2081 	}
2082 	if (g_raid_event_check(sc, vol) != 0)
2083 		return (EBUSY);
2084 	if (vol->v_provider != NULL)
2085 		return (EBUSY);
2086 	if (vol->v_provider_open != 0)
2087 		return (EBUSY);
2088 	if (vol->v_tr) {
2089 		G_RAID_TR_FREE(vol->v_tr);
2090 		kobj_delete((kobj_t)vol->v_tr, M_RAID);
2091 		vol->v_tr = NULL;
2092 	}
2093 	if (vol->v_rootmount)
2094 		root_mount_rel(vol->v_rootmount);
2095 	g_topology_lock();
2096 	LIST_REMOVE(vol, v_global_next);
2097 	g_topology_unlock();
2098 	TAILQ_REMOVE(&sc->sc_volumes, vol, v_next);
2099 	for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) {
2100 		g_raid_event_cancel(sc, &vol->v_subdisks[i]);
2101 		disk = vol->v_subdisks[i].sd_disk;
2102 		if (disk == NULL)
2103 			continue;
2104 		TAILQ_REMOVE(&disk->d_subdisks, &vol->v_subdisks[i], sd_next);
2105 	}
2106 	G_RAID_DEBUG1(2, sc, "Volume %s destroyed.", vol->v_name);
2107 	if (sc->sc_md)
2108 		G_RAID_MD_FREE_VOLUME(sc->sc_md, vol);
2109 	g_raid_event_cancel(sc, vol);
2110 	free(vol, M_RAID);
2111 	if (sc->sc_stopping == G_RAID_DESTROY_HARD) {
2112 		/* Wake up worker to let it self-destruct. */
2113 		g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
2114 	}
2115 	return (0);
2116 }
2117 
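/*
 * Destroy a disk: drop its consumer, disconnect all its subdisks and let
 * the metadata module release the per-disk state.
 */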
2118 int
2119 g_raid_destroy_disk(struct g_raid_disk *disk)
2120 {
2121 	struct g_raid_softc *sc;
2122 	struct g_raid_subdisk *sd, *tmp;
2123 
2124 	sc = disk->d_softc;
2125 	G_RAID_DEBUG1(2, sc, "Destroying disk.");
2126 	if (disk->d_consumer) {
2127 		g_raid_kill_consumer(sc, disk->d_consumer);
2128 		disk->d_consumer = NULL;
2129 	}
2130 	TAILQ_FOREACH_SAFE(sd, &disk->d_subdisks, sd_next, tmp) {
2131 		g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE);
2132 		g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED,
2133 		    G_RAID_EVENT_SUBDISK);
2134 		TAILQ_REMOVE(&disk->d_subdisks, sd, sd_next);
2135 		sd->sd_disk = NULL;
2136 	}
2137 	TAILQ_REMOVE(&sc->sc_disks, disk, d_next);
2138 	if (sc->sc_md)
2139 		G_RAID_MD_FREE_DISK(sc->sc_md, disk);
2140 	g_raid_event_cancel(sc, disk);
2141 	free(disk, M_RAID);
2142 	return (0);
2143 }
2144 
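/*
 * Destroy the array node according to "how".  With open volumes SOFT
 * fails, DELAYED postpones destruction until the last close, and HARD
 * proceeds regardless; otherwise sleep until the worker destroys the node.
 */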
2145 int
2146 g_raid_destroy(struct g_raid_softc *sc, int how)
2147 {
2148 	int opens;
2149 
2150 	g_topology_assert_not();
2151 	if (sc == NULL)
2152 		return (ENXIO);
2153 	sx_assert(&sc->sc_lock, SX_XLOCKED);
2154 
2155 	/* Count open volumes. */
2156 	opens = g_raid_nopens(sc);
2157 
2158 	/* React if some volumes are still open. */
2159 	if (opens > 0) {
2160 		switch (how) {
2161 		case G_RAID_DESTROY_SOFT:
2162 			G_RAID_DEBUG1(1, sc,
2163 			    "%d volumes are still open.",
2164 			    opens);
2165 			return (EBUSY);
2166 		case G_RAID_DESTROY_DELAYED:
2167 			G_RAID_DEBUG1(1, sc,
2168 			    "Array will be destroyed on last close.");
2169 			sc->sc_stopping = G_RAID_DESTROY_DELAYED;
2170 			return (EBUSY);
2171 		case G_RAID_DESTROY_HARD:
2172 			G_RAID_DEBUG1(1, sc,
2173 			    "%d volumes are still open.",
2174 			    opens);
2175 		}
2176 	}
2177 
2178 	/* Mark node for destruction. */
2179 	sc->sc_stopping = G_RAID_DESTROY_HARD;
2180 	/* Wake up worker to let it self-destruct. */
2181 	g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0);
2182 	/* Sleep until node destroyed. */
2183 	sx_sleep(&sc->sc_stopping, &sc->sc_lock,
2184 	    PRIBIO | PDROP, "r:destroy", 0);
2185 	return (0);
2186 }
2187 
2188 static void
2189 g_raid_taste_orphan(struct g_consumer *cp)
2190 {
2191 
2192 	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
2193 	    cp->provider->name));
2194 }
2195 
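/*
 * GEOM taste method: attach a temporary consumer to the provider and let
 * every enabled metadata class taste it in priority order.
 */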
2196 static struct g_geom *
2197 g_raid_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
2198 {
2199 	struct g_consumer *cp;
2200 	struct g_geom *gp, *geom;
2201 	struct g_raid_md_class *class;
2202 	struct g_raid_md_object *obj;
2203 	int status;
2204 
2205 	g_topology_assert();
2206 	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
2207 	if (!g_raid_enable)
2208 		return (NULL);
2209 	G_RAID_DEBUG(2, "Tasting provider %s.", pp->name);
2210 
2211 	gp = g_new_geomf(mp, "raid:taste");
2212 	/*
2213 	 * This orphan function should never be called.
2214 	 */
2215 	gp->orphan = g_raid_taste_orphan;
2216 	cp = g_new_consumer(gp);
2217 	g_attach(cp, pp);
2218 
2219 	geom = NULL;
2220 	LIST_FOREACH(class, &g_raid_md_classes, mdc_list) {
2221 		if (!class->mdc_enable)
2222 			continue;
2223 		G_RAID_DEBUG(2, "Tasting provider %s for %s metadata.",
2224 		    pp->name, class->name);
2225 		obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
2226 		    M_WAITOK);
2227 		obj->mdo_class = class;
2228 		status = G_RAID_MD_TASTE(obj, mp, cp, &geom);
2229 		if (status != G_RAID_MD_TASTE_NEW)
2230 			kobj_delete((kobj_t)obj, M_RAID);
2231 		if (status != G_RAID_MD_TASTE_FAIL)
2232 			break;
2233 	}
2234 
2235 	g_detach(cp);
2236 	g_destroy_consumer(cp);
2237 	g_destroy_geom(gp);
2238 	G_RAID_DEBUG(2, "Tasting provider %s done.", pp->name);
2239 	return (geom);
2240 }
2241 
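/*
 * Create a new array node for the named metadata format on behalf of a
 * userland control request.
 */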
2242 int
2243 g_raid_create_node_format(const char *format, struct gctl_req *req,
2244     struct g_geom **gp)
2245 {
2246 	struct g_raid_md_class *class;
2247 	struct g_raid_md_object *obj;
2248 	int status;
2249 
2250 	G_RAID_DEBUG(2, "Creating array for %s metadata.", format);
2251 	LIST_FOREACH(class, &g_raid_md_classes, mdc_list) {
2252 		if (strcasecmp(class->name, format) == 0)
2253 			break;
2254 	}
2255 	if (class == NULL) {
2256 		G_RAID_DEBUG(1, "No support for %s metadata.", format);
2257 		return (G_RAID_MD_TASTE_FAIL);
2258 	}
2259 	obj = (void *)kobj_create((kobj_class_t)class, M_RAID,
2260 	    M_WAITOK);
2261 	obj->mdo_class = class;
2262 	status = G_RAID_MD_CREATE_REQ(obj, &g_raid_class, req, gp);
2263 	if (status != G_RAID_MD_TASTE_NEW)
2264 		kobj_delete((kobj_t)obj, M_RAID);
2265 	return (status);
2266 }
2267 
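/*
 * GEOM destroy_geom method: soft-destroy the array; fails while any
 * volume is still open.
 */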
2268 static int
2269 g_raid_destroy_geom(struct gctl_req *req __unused,
2270     struct g_class *mp __unused, struct g_geom *gp)
2271 {
2272 	struct g_raid_softc *sc;
2273 	int error;
2274 
2275 	g_topology_unlock();
2276 	sc = gp->softc;
2277 	sx_xlock(&sc->sc_lock);
2278 	g_cancel_event(sc);
2279 	error = g_raid_destroy(gp->softc, G_RAID_DESTROY_SOFT);
2280 	if (error != 0)
2281 		sx_xunlock(&sc->sc_lock);
2282 	g_topology_lock();
2283 	return (error);
2284 }
2285 
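/*
 * Ask the metadata module to update on-disk metadata, unless the node is
 * already being hard-destroyed.
 */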
2286 void
g_raid_write_metadata(struct g_raid_softc *sc, struct g_raid_volume *vol,
2287     struct g_raid_subdisk *sd, struct g_raid_disk *disk)
2288 {
2289 
2290 	if (sc->sc_stopping == G_RAID_DESTROY_HARD)
2291 		return;
2292 	if (sc->sc_md)
2293 		G_RAID_MD_WRITE(sc->sc_md, vol, sd, disk);
2294 }
2295 
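/*
 * Report a disk failure to the metadata module after checking that the
 * disk exists and is in the ACTIVE state.
 */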
2296 void
g_raid_fail_disk(struct g_raid_softc *sc,
2297     struct g_raid_subdisk *sd, struct g_raid_disk *disk)
2298 {
2299 
2300 	if (disk == NULL)
2301 		disk = sd->sd_disk;
2302 	if (disk == NULL) {
2303 		G_RAID_DEBUG1(0, sc, "Warning! Fail request to an absent disk!");
2304 		return;
2305 	}
2306 	if (disk->d_state != G_RAID_DISK_S_ACTIVE) {
2307 		G_RAID_DEBUG1(0, sc, "Warning! Fail request to a disk in a "
2308 		    "wrong state (%s)!", g_raid_disk_state2str(disk->d_state));
2309 		return;
2310 	}
2311 	if (sc->sc_md)
2312 		G_RAID_MD_FAIL_DISK(sc->sc_md, sd, disk);
2313 }
2314 
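/*
 * GEOM dumpconf method: report volume (provider), disk (consumer) and
 * node state for the GEOM configuration XML.
 */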
2315 static void
2316 g_raid_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
2317     struct g_consumer *cp, struct g_provider *pp)
2318 {
2319 	struct g_raid_softc *sc;
2320 	struct g_raid_volume *vol;
2321 	struct g_raid_subdisk *sd;
2322 	struct g_raid_disk *disk;
2323 	int i, s;
2324 
2325 	g_topology_assert();
2326 
2327 	sc = gp->softc;
2328 	if (sc == NULL)
2329 		return;
2330 	if (pp != NULL) {
2331 		vol = pp->private;
2332 		g_topology_unlock();
2333 		sx_xlock(&sc->sc_lock);
2334 		sbuf_printf(sb, "%s<Label>%s</Label>\n", indent,
2335 		    vol->v_name);
2336 		sbuf_printf(sb, "%s<RAIDLevel>%s</RAIDLevel>\n", indent,
2337 		    g_raid_volume_level2str(vol->v_raid_level,
2338 		    vol->v_raid_level_qualifier));
2339 		sbuf_printf(sb,
2340 		    "%s<Transformation>%s</Transformation>\n", indent,
2341 		    vol->v_tr ? vol->v_tr->tro_class->name : "NONE");
2342 		sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
2343 		    vol->v_disks_count);
2344 		sbuf_printf(sb, "%s<Strip>%u</Strip>\n", indent,
2345 		    vol->v_strip_size);
2346 		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
2347 		    g_raid_volume_state2str(vol->v_state));
2348 		sbuf_printf(sb, "%s<Dirty>%s</Dirty>\n", indent,
2349 		    vol->v_dirty ? "Yes" : "No");
2350 		sbuf_printf(sb, "%s<Subdisks>", indent);
2351 		for (i = 0; i < vol->v_disks_count; i++) {
2352 			sd = &vol->v_subdisks[i];
2353 			if (sd->sd_disk != NULL &&
2354 			    sd->sd_disk->d_consumer != NULL) {
2355 				sbuf_printf(sb, "%s ",
2356 				    g_raid_get_diskname(sd->sd_disk));
2357 			} else {
2358 				sbuf_printf(sb, "NONE ");
2359 			}
2360 			sbuf_printf(sb, "(%s",
2361 			    g_raid_subdisk_state2str(sd->sd_state));
2362 			if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
2363 			    sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
2364 				sbuf_printf(sb, " %d%%",
2365 				    (int)(sd->sd_rebuild_pos * 100 /
2366 				     sd->sd_size));
2367 			}
2368 			sbuf_printf(sb, ")");
2369 			if (i + 1 < vol->v_disks_count)
2370 				sbuf_printf(sb, ", ");
2371 		}
2372 		sbuf_printf(sb, "</Subdisks>\n");
2373 		sx_xunlock(&sc->sc_lock);
2374 		g_topology_lock();
2375 	} else if (cp != NULL) {
2376 		disk = cp->private;
2377 		if (disk == NULL)
2378 			return;
2379 		g_topology_unlock();
2380 		sx_xlock(&sc->sc_lock);
2381 		sbuf_printf(sb, "%s<State>%s", indent,
2382 		    g_raid_disk_state2str(disk->d_state));
2383 		if (!TAILQ_EMPTY(&disk->d_subdisks)) {
2384 			sbuf_printf(sb, " (");
2385 			TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
2386 				sbuf_printf(sb, "%s",
2387 				    g_raid_subdisk_state2str(sd->sd_state));
2388 				if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
2389 				    sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
2390 					sbuf_printf(sb, " %d%%",
2391 					    (int)(sd->sd_rebuild_pos * 100 /
2392 					     sd->sd_size));
2393 				}
2394 				if (TAILQ_NEXT(sd, sd_next))
2395 					sbuf_printf(sb, ", ");
2396 			}
2397 			sbuf_printf(sb, ")");
2398 		}
2399 		sbuf_printf(sb, "</State>\n");
2400 		sbuf_printf(sb, "%s<Subdisks>", indent);
2401 		TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
2402 			sbuf_printf(sb, "r%d(%s):%d@%ju",
2403 			    sd->sd_volume->v_global_id,
2404 			    sd->sd_volume->v_name,
2405 			    sd->sd_pos, sd->sd_offset);
2406 			if (TAILQ_NEXT(sd, sd_next))
2407 				sbuf_printf(sb, ", ");
2408 		}
2409 		sbuf_printf(sb, "</Subdisks>\n");
2410 		sbuf_printf(sb, "%s<ReadErrors>%d</ReadErrors>\n", indent,
2411 		    disk->d_read_errs);
2412 		sx_xunlock(&sc->sc_lock);
2413 		g_topology_lock();
2414 	} else {
2415 		g_topology_unlock();
2416 		sx_xlock(&sc->sc_lock);
2417 		if (sc->sc_md) {
2418 			sbuf_printf(sb, "%s<Metadata>%s</Metadata>\n", indent,
2419 			    sc->sc_md->mdo_class->name);
2420 		}
2421 		if (!TAILQ_EMPTY(&sc->sc_volumes)) {
2422 			s = 0xff;
2423 			TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) {
2424 				if (vol->v_state < s)
2425 					s = vol->v_state;
2426 			}
2427 			sbuf_printf(sb, "%s<State>%s</State>\n", indent,
2428 			    g_raid_volume_state2str(s));
2429 		}
2430 		sx_xunlock(&sc->sc_lock);
2431 		g_topology_lock();
2432 	}
2433 }
2434 
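/*
 * Shutdown event handler: mark all volumes clean and schedule delayed
 * destruction of every array node.
 */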
2435 static void
2436 g_raid_shutdown_post_sync(void *arg, int howto)
2437 {
2438 	struct g_class *mp;
2439 	struct g_geom *gp, *gp2;
2440 	struct g_raid_softc *sc;
2441 	struct g_raid_volume *vol;
2442 	int error;
2443 
2444 	mp = arg;
2445 	DROP_GIANT();
2446 	g_topology_lock();
2447 	g_raid_shutdown = 1;
2448 	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
2449 		if ((sc = gp->softc) == NULL)
2450 			continue;
2451 		g_topology_unlock();
2452 		sx_xlock(&sc->sc_lock);
2453 		TAILQ_FOREACH(vol, &sc->sc_volumes, v_next)
2454 			g_raid_clean(vol, -1);
2455 		g_cancel_event(sc);
2456 		error = g_raid_destroy(sc, G_RAID_DESTROY_DELAYED);
2457 		if (error != 0)
2458 			sx_xunlock(&sc->sc_lock);
2459 		g_topology_lock();
2460 	}
2461 	g_topology_unlock();
2462 	PICKUP_GIANT();
2463 }
2464 
2465 static void
2466 g_raid_init(struct g_class *mp)
2467 {
2468 
2469 	g_raid_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync,
2470 	    g_raid_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST);
2471 	if (g_raid_post_sync == NULL)
2472 		G_RAID_DEBUG(0, "Warning! Cannot register shutdown event.");
2473 	g_raid_started = 1;
2474 }
2475 
2476 static void
2477 g_raid_fini(struct g_class *mp)
2478 {
2479 
2480 	if (g_raid_post_sync != NULL)
2481 		EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_raid_post_sync);
2482 	g_raid_started = 0;
2483 }
2484 
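/*
 * Metadata module load/unload handler: keep the class list sorted by
 * priority and retaste providers when a new class is loaded.
 */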
2485 int
2486 g_raid_md_modevent(module_t mod, int type, void *arg)
2487 {
2488 	struct g_raid_md_class *class, *c, *nc;
2489 	int error;
2490 
2491 	error = 0;
2492 	class = arg;
2493 	switch (type) {
2494 	case MOD_LOAD:
2495 		c = LIST_FIRST(&g_raid_md_classes);
2496 		if (c == NULL || c->mdc_priority > class->mdc_priority)
2497 			LIST_INSERT_HEAD(&g_raid_md_classes, class, mdc_list);
2498 		else {
2499 			while ((nc = LIST_NEXT(c, mdc_list)) != NULL &&
2500 			    nc->mdc_priority < class->mdc_priority)
2501 				c = nc;
2502 			LIST_INSERT_AFTER(c, class, mdc_list);
2503 		}
2504 		if (g_raid_started)
2505 			g_retaste(&g_raid_class);
2506 		break;
2507 	case MOD_UNLOAD:
2508 		LIST_REMOVE(class, mdc_list);
2509 		break;
2510 	default:
2511 		error = EOPNOTSUPP;
2512 		break;
2513 	}
2514 
2515 	return (error);
2516 }
2517 
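/*
 * Transformation module load/unload handler: keep the class list sorted
 * by priority.
 */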
2518 int
2519 g_raid_tr_modevent(module_t mod, int type, void *arg)
2520 {
2521 	struct g_raid_tr_class *class, *c, *nc;
2522 	int error;
2523 
2524 	error = 0;
2525 	class = arg;
2526 	switch (type) {
2527 	case MOD_LOAD:
2528 		c = LIST_FIRST(&g_raid_tr_classes);
2529 		if (c == NULL || c->trc_priority > class->trc_priority)
2530 			LIST_INSERT_HEAD(&g_raid_tr_classes, class, trc_list);
2531 		else {
2532 			while ((nc = LIST_NEXT(c, trc_list)) != NULL &&
2533 			    nc->trc_priority < class->trc_priority)
2534 				c = nc;
2535 			LIST_INSERT_AFTER(c, class, trc_list);
2536 		}
2537 		break;
2538 	case MOD_UNLOAD:
2539 		LIST_REMOVE(class, trc_list);
2540 		break;
2541 	default:
2542 		error = EOPNOTSUPP;
2543 		break;
2544 	}
2545 
2546 	return (error);
2547 }
2548 
2549 /*
2550  * Use local implementation of DECLARE_GEOM_CLASS(g_raid_class, g_raid)
2551  * to reduce module priority, allowing submodules to register themselves first.
2552  */
2553 static moduledata_t g_raid_mod = {
2554 	"g_raid",
2555 	g_modevent,
2556 	&g_raid_class
2557 };
2558 DECLARE_MODULE(g_raid, g_raid_mod, SI_SUB_DRIVERS, SI_ORDER_THIRD);
2559 MODULE_VERSION(geom_raid, 0);
2560