1 /*-
2  * Copyright (c) 2004-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29 
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/kernel.h>
33 #include <sys/module.h>
34 #include <sys/limits.h>
35 #include <sys/lock.h>
36 #include <sys/mutex.h>
37 #include <sys/bio.h>
38 #include <sys/sysctl.h>
39 #include <sys/malloc.h>
40 #include <sys/eventhandler.h>
41 #include <vm/uma.h>
42 #include <geom/geom.h>
43 #include <sys/proc.h>
44 #include <sys/kthread.h>
45 #include <sys/sched.h>
46 #include <geom/raid3/g_raid3.h>
47 
48 
49 static MALLOC_DEFINE(M_RAID3, "raid3_data", "GEOM_RAID3 Data");
50 
51 SYSCTL_DECL(_kern_geom);
52 SYSCTL_NODE(_kern_geom, OID_AUTO, raid3, CTLFLAG_RW, 0, "GEOM_RAID3 stuff");
53 u_int g_raid3_debug = 0;
54 TUNABLE_INT("kern.geom.raid3.debug", &g_raid3_debug);
55 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, debug, CTLFLAG_RW, &g_raid3_debug, 0,
56     "Debug level");
57 static u_int g_raid3_timeout = 4;
58 TUNABLE_INT("kern.geom.raid3.timeout", &g_raid3_timeout);
59 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, timeout, CTLFLAG_RW, &g_raid3_timeout,
60     0, "Time to wait on all raid3 components");
61 static u_int g_raid3_idletime = 5;
62 TUNABLE_INT("kern.geom.raid3.idletime", &g_raid3_idletime);
63 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, idletime, CTLFLAG_RW,
64     &g_raid3_idletime, 0, "Mark components as clean when idling");
65 static u_int g_raid3_disconnect_on_failure = 1;
66 TUNABLE_INT("kern.geom.raid3.disconnect_on_failure",
67     &g_raid3_disconnect_on_failure);
68 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, disconnect_on_failure, CTLFLAG_RW,
69     &g_raid3_disconnect_on_failure, 0, "Disconnect component on I/O failure.");
70 static u_int g_raid3_syncreqs = 2;
71 TUNABLE_INT("kern.geom.raid3.sync_requests", &g_raid3_syncreqs);
72 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, sync_requests, CTLFLAG_RDTUN,
73     &g_raid3_syncreqs, 0, "Parallel synchronization I/O requests.");
74 
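/*
 * The three knobs below cap the number of outstanding allocations in the
 * per-device UMA zones (64kB, 16kB and 4kB) from which data buffers for
 * cloned bios are taken; the zone picked for a request depends on the size
 * of its per-disk slice (see g_raid3_clone_bio() and g_raid3_uma_ctor()).
 */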
75 static u_int g_raid3_n64k = 50;
76 TUNABLE_INT("kern.geom.raid3.n64k", &g_raid3_n64k);
77 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n64k, CTLFLAG_RD, &g_raid3_n64k, 0,
78     "Maximum number of 64kB allocations");
79 static u_int g_raid3_n16k = 200;
80 TUNABLE_INT("kern.geom.raid3.n16k", &g_raid3_n16k);
81 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n16k, CTLFLAG_RD, &g_raid3_n16k, 0,
82     "Maximum number of 16kB allocations");
83 static u_int g_raid3_n4k = 1200;
84 TUNABLE_INT("kern.geom.raid3.n4k", &g_raid3_n4k);
85 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n4k, CTLFLAG_RD, &g_raid3_n4k, 0,
86     "Maximum number of 4kB allocations");
87 
88 SYSCTL_NODE(_kern_geom_raid3, OID_AUTO, stat, CTLFLAG_RW, 0,
89     "GEOM_RAID3 statistics");
90 static u_int g_raid3_parity_mismatch = 0;
91 SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, parity_mismatch, CTLFLAG_RD,
92     &g_raid3_parity_mismatch, 0, "Number of failures in VERIFY mode");
93 
94 #define	MSLEEP(ident, mtx, priority, wmesg, timeout)	do {		\
95 	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));	\
96 	msleep((ident), (mtx), (priority), (wmesg), (timeout));		\
97 	G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, (ident));	\
98 } while (0)
99 
100 static eventhandler_tag g_raid3_pre_sync = NULL, g_raid3_post_sync = NULL;
101 
102 static int g_raid3_destroy_geom(struct gctl_req *req, struct g_class *mp,
103     struct g_geom *gp);
104 static g_taste_t g_raid3_taste;
105 static void g_raid3_init(struct g_class *mp);
106 static void g_raid3_fini(struct g_class *mp);
107 
108 struct g_class g_raid3_class = {
109 	.name = G_RAID3_CLASS_NAME,
110 	.version = G_VERSION,
111 	.ctlreq = g_raid3_config,
112 	.taste = g_raid3_taste,
113 	.destroy_geom = g_raid3_destroy_geom,
114 	.init = g_raid3_init,
115 	.fini = g_raid3_fini
116 };
117 
118 
119 static void g_raid3_destroy_provider(struct g_raid3_softc *sc);
120 static int g_raid3_update_disk(struct g_raid3_disk *disk, u_int state);
121 static void g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force);
122 static void g_raid3_dumpconf(struct sbuf *sb, const char *indent,
123     struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
124 static void g_raid3_sync_stop(struct g_raid3_softc *sc, int type);
125 static int g_raid3_register_request(struct bio *pbp);
126 static void g_raid3_sync_release(struct g_raid3_softc *sc);
127 
128 
129 static const char *
130 g_raid3_disk_state2str(int state)
131 {
132 
133 	switch (state) {
134 	case G_RAID3_DISK_STATE_NODISK:
135 		return ("NODISK");
136 	case G_RAID3_DISK_STATE_NONE:
137 		return ("NONE");
138 	case G_RAID3_DISK_STATE_NEW:
139 		return ("NEW");
140 	case G_RAID3_DISK_STATE_ACTIVE:
141 		return ("ACTIVE");
142 	case G_RAID3_DISK_STATE_STALE:
143 		return ("STALE");
144 	case G_RAID3_DISK_STATE_SYNCHRONIZING:
145 		return ("SYNCHRONIZING");
146 	case G_RAID3_DISK_STATE_DISCONNECTED:
147 		return ("DISCONNECTED");
148 	default:
149 		return ("INVALID");
150 	}
151 }
152 
153 static const char *
154 g_raid3_device_state2str(int state)
155 {
156 
157 	switch (state) {
158 	case G_RAID3_DEVICE_STATE_STARTING:
159 		return ("STARTING");
160 	case G_RAID3_DEVICE_STATE_DEGRADED:
161 		return ("DEGRADED");
162 	case G_RAID3_DEVICE_STATE_COMPLETE:
163 		return ("COMPLETE");
164 	default:
165 		return ("INVALID");
166 	}
167 }
168 
169 const char *
170 g_raid3_get_diskname(struct g_raid3_disk *disk)
171 {
172 
173 	if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
174 		return ("[unknown]");
175 	return (disk->d_name);
176 }
177 
178 static int
179 g_raid3_uma_ctor(void *mem, int size, void *arg, int flags)
180 {
181 	struct g_raid3_zone *sz = arg;
182 
183 	if (sz->sz_inuse == sz->sz_max)
184 		return (ENOMEM);
185 	sz->sz_inuse++;
186 	return (0);
187 }
188 
189 static void
190 g_raid3_uma_dtor(void *mem, int size, void *arg)
191 {
192 	struct g_raid3_zone *sz = arg;
193 
194 	sz->sz_inuse--;
195 }
196 
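/*
 * XOR the contents of two buffers into a destination buffer.  The loop is
 * unrolled to process sixteen 64-bit words (128 bytes) per iteration, which
 * is why the KASSERT below requires the size to be a multiple of 128 bytes.
 */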
197 #define	g_raid3_xor(src1, src2, dst, size)				\
198 	_g_raid3_xor((uint64_t *)(src1), (uint64_t *)(src2),		\
199 	    (uint64_t *)(dst), (size_t)size)
200 static void
201 _g_raid3_xor(uint64_t *src1, uint64_t *src2, uint64_t *dst, size_t size)
202 {
203 
204 	KASSERT((size % 128) == 0, ("Invalid size: %zu.", size));
205 	for (; size > 0; size -= 128) {
206 		*dst++ = (*src1++) ^ (*src2++);
207 		*dst++ = (*src1++) ^ (*src2++);
208 		*dst++ = (*src1++) ^ (*src2++);
209 		*dst++ = (*src1++) ^ (*src2++);
210 		*dst++ = (*src1++) ^ (*src2++);
211 		*dst++ = (*src1++) ^ (*src2++);
212 		*dst++ = (*src1++) ^ (*src2++);
213 		*dst++ = (*src1++) ^ (*src2++);
214 		*dst++ = (*src1++) ^ (*src2++);
215 		*dst++ = (*src1++) ^ (*src2++);
216 		*dst++ = (*src1++) ^ (*src2++);
217 		*dst++ = (*src1++) ^ (*src2++);
218 		*dst++ = (*src1++) ^ (*src2++);
219 		*dst++ = (*src1++) ^ (*src2++);
220 		*dst++ = (*src1++) ^ (*src2++);
221 		*dst++ = (*src1++) ^ (*src2++);
222 	}
223 }
224 
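/*
 * Return non-zero if the bio's data buffer contains nothing but zeros.
 * Used in VERIFY mode, where XOR-ing all components of a consistent stripe
 * (data and parity) must produce all-zero data.
 */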
225 static int
226 g_raid3_is_zero(struct bio *bp)
227 {
228 	static const uint64_t zeros[] = {
229 	    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
230 	};
231 	u_char *addr;
232 	ssize_t size;
233 
234 	size = bp->bio_length;
235 	addr = (u_char *)bp->bio_data;
236 	for (; size > 0; size -= sizeof(zeros), addr += sizeof(zeros)) {
237 		if (bcmp(addr, zeros, sizeof(zeros)) != 0)
238 			return (0);
239 	}
240 	return (1);
241 }
242 
243 /*
244  * --- Event handling functions ---
245  * Events in geom_raid3 are used to maintain disk and device status
246  * from a single thread, which simplifies locking.
247  */
248 static void
249 g_raid3_event_free(struct g_raid3_event *ep)
250 {
251 
252 	free(ep, M_RAID3);
253 }
254 
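/*
 * Queue an event for the worker thread and wake it up.  Unless the
 * G_RAID3_EVENT_DONTWAIT flag is set, drop sc_lock and sleep until the
 * worker marks the event with G_RAID3_EVENT_DONE, then return the event's
 * error status.
 */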
255 int
256 g_raid3_event_send(void *arg, int state, int flags)
257 {
258 	struct g_raid3_softc *sc;
259 	struct g_raid3_disk *disk;
260 	struct g_raid3_event *ep;
261 	int error;
262 
263 	ep = malloc(sizeof(*ep), M_RAID3, M_WAITOK);
264 	G_RAID3_DEBUG(4, "%s: Sending event %p.", __func__, ep);
265 	if ((flags & G_RAID3_EVENT_DEVICE) != 0) {
266 		disk = NULL;
267 		sc = arg;
268 	} else {
269 		disk = arg;
270 		sc = disk->d_softc;
271 	}
272 	ep->e_disk = disk;
273 	ep->e_state = state;
274 	ep->e_flags = flags;
275 	ep->e_error = 0;
276 	mtx_lock(&sc->sc_events_mtx);
277 	TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
278 	mtx_unlock(&sc->sc_events_mtx);
279 	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
280 	mtx_lock(&sc->sc_queue_mtx);
281 	wakeup(sc);
282 	wakeup(&sc->sc_queue);
283 	mtx_unlock(&sc->sc_queue_mtx);
284 	if ((flags & G_RAID3_EVENT_DONTWAIT) != 0)
285 		return (0);
286 	sx_assert(&sc->sc_lock, SX_XLOCKED);
287 	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, ep);
288 	sx_xunlock(&sc->sc_lock);
289 	while ((ep->e_flags & G_RAID3_EVENT_DONE) == 0) {
290 		mtx_lock(&sc->sc_events_mtx);
291 		MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "r3:event",
292 		    hz * 5);
293 	}
294 	error = ep->e_error;
295 	g_raid3_event_free(ep);
296 	sx_xlock(&sc->sc_lock);
297 	return (error);
298 }
299 
300 static struct g_raid3_event *
301 g_raid3_event_get(struct g_raid3_softc *sc)
302 {
303 	struct g_raid3_event *ep;
304 
305 	mtx_lock(&sc->sc_events_mtx);
306 	ep = TAILQ_FIRST(&sc->sc_events);
307 	mtx_unlock(&sc->sc_events_mtx);
308 	return (ep);
309 }
310 
311 static void
312 g_raid3_event_remove(struct g_raid3_softc *sc, struct g_raid3_event *ep)
313 {
314 
315 	mtx_lock(&sc->sc_events_mtx);
316 	TAILQ_REMOVE(&sc->sc_events, ep, e_next);
317 	mtx_unlock(&sc->sc_events_mtx);
318 }
319 
320 static void
321 g_raid3_event_cancel(struct g_raid3_disk *disk)
322 {
323 	struct g_raid3_softc *sc;
324 	struct g_raid3_event *ep, *tmpep;
325 
326 	sc = disk->d_softc;
327 	sx_assert(&sc->sc_lock, SX_XLOCKED);
328 
329 	mtx_lock(&sc->sc_events_mtx);
330 	TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
331 		if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0)
332 			continue;
333 		if (ep->e_disk != disk)
334 			continue;
335 		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
336 		if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
337 			g_raid3_event_free(ep);
338 		else {
339 			ep->e_error = ECANCELED;
340 			wakeup(ep);
341 		}
342 	}
343 	mtx_unlock(&sc->sc_events_mtx);
344 }
345 
346 /*
347  * Return the number of disks in the given state.
348  * If state is equal to -1, count all connected disks.
349  */
350 u_int
351 g_raid3_ndisks(struct g_raid3_softc *sc, int state)
352 {
353 	struct g_raid3_disk *disk;
354 	u_int n, ndisks;
355 
356 	sx_assert(&sc->sc_lock, SX_LOCKED);
357 
358 	for (n = ndisks = 0; n < sc->sc_ndisks; n++) {
359 		disk = &sc->sc_disks[n];
360 		if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
361 			continue;
362 		if (state == -1 || disk->d_state == state)
363 			ndisks++;
364 	}
365 	return (ndisks);
366 }
367 
368 static u_int
369 g_raid3_nrequests(struct g_raid3_softc *sc, struct g_consumer *cp)
370 {
371 	struct bio *bp;
372 	u_int nreqs = 0;
373 
374 	mtx_lock(&sc->sc_queue_mtx);
375 	TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
376 		if (bp->bio_from == cp)
377 			nreqs++;
378 	}
379 	mtx_unlock(&sc->sc_queue_mtx);
380 	return (nreqs);
381 }
382 
383 static int
384 g_raid3_is_busy(struct g_raid3_softc *sc, struct g_consumer *cp)
385 {
386 
387 	if (cp->index > 0) {
388 		G_RAID3_DEBUG(2,
389 		    "I/O requests for %s exist, can't destroy it now.",
390 		    cp->provider->name);
391 		return (1);
392 	}
393 	if (g_raid3_nrequests(sc, cp) > 0) {
394 		G_RAID3_DEBUG(2,
395 		    "I/O requests for %s in queue, can't destroy it now.",
396 		    cp->provider->name);
397 		return (1);
398 	}
399 	return (0);
400 }
401 
402 static void
403 g_raid3_destroy_consumer(void *arg, int flags __unused)
404 {
405 	struct g_consumer *cp;
406 
407 	g_topology_assert();
408 
409 	cp = arg;
410 	G_RAID3_DEBUG(1, "Consumer %s destroyed.", cp->provider->name);
411 	g_detach(cp);
412 	g_destroy_consumer(cp);
413 }
414 
415 static void
416 g_raid3_kill_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
417 {
418 	struct g_provider *pp;
419 	int retaste_wait;
420 
421 	g_topology_assert();
422 
423 	cp->private = NULL;
424 	if (g_raid3_is_busy(sc, cp))
425 		return;
426 	G_RAID3_DEBUG(2, "Consumer %s destroyed.", cp->provider->name);
427 	pp = cp->provider;
428 	retaste_wait = 0;
429 	if (cp->acw == 1) {
430 		if ((pp->geom->flags & G_GEOM_WITHER) == 0)
431 			retaste_wait = 1;
432 	}
433 	G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr,
434 	    -cp->acw, -cp->ace, 0);
435 	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
436 		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
437 	if (retaste_wait) {
438 		/*
439 		 * After the retaste event has been sent (inside g_access()), we
440 		 * can post an event to detach and destroy the consumer.
441 		 * A class which already has a consumer attached to the given
442 		 * provider will not receive a retaste event for that provider.
443 		 * This is how we ignore retaste events for consumers opened for
444 		 * writing: we detach and destroy the consumer only after the
445 		 * retaste event has been sent.
446 		 */
447 		g_post_event(g_raid3_destroy_consumer, cp, M_WAITOK, NULL);
448 		return;
449 	}
450 	G_RAID3_DEBUG(1, "Consumer %s destroyed.", pp->name);
451 	g_detach(cp);
452 	g_destroy_consumer(cp);
453 }
454 
455 static int
456 g_raid3_connect_disk(struct g_raid3_disk *disk, struct g_provider *pp)
457 {
458 	struct g_consumer *cp;
459 	int error;
460 
461 	g_topology_assert_not();
462 	KASSERT(disk->d_consumer == NULL,
463 	    ("Disk already connected (device %s).", disk->d_softc->sc_name));
464 
465 	g_topology_lock();
466 	cp = g_new_consumer(disk->d_softc->sc_geom);
467 	error = g_attach(cp, pp);
468 	if (error != 0) {
469 		g_destroy_consumer(cp);
470 		g_topology_unlock();
471 		return (error);
472 	}
473 	error = g_access(cp, 1, 1, 1);
474 	g_topology_unlock();
475 	if (error != 0) {
476 		g_detach(cp);
477 		g_destroy_consumer(cp);
478 		G_RAID3_DEBUG(0, "Cannot open consumer %s (error=%d).",
479 		    pp->name, error);
480 		return (error);
481 	}
482 	disk->d_consumer = cp;
483 	disk->d_consumer->private = disk;
484 	disk->d_consumer->index = 0;
485 	G_RAID3_DEBUG(2, "Disk %s connected.", g_raid3_get_diskname(disk));
486 	return (0);
487 }
488 
489 static void
490 g_raid3_disconnect_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
491 {
492 
493 	g_topology_assert();
494 
495 	if (cp == NULL)
496 		return;
497 	if (cp->provider != NULL)
498 		g_raid3_kill_consumer(sc, cp);
499 	else
500 		g_destroy_consumer(cp);
501 }
502 
503 /*
504  * Initialize the disk. This means allocating memory, creating a consumer,
505  * attaching it to the provider and opening access (r1w1e1) to it.
506  */
507 static struct g_raid3_disk *
508 g_raid3_init_disk(struct g_raid3_softc *sc, struct g_provider *pp,
509     struct g_raid3_metadata *md, int *errorp)
510 {
511 	struct g_raid3_disk *disk;
512 	int error;
513 
514 	disk = &sc->sc_disks[md->md_no];
515 	error = g_raid3_connect_disk(disk, pp);
516 	if (error != 0) {
517 		if (errorp != NULL)
518 			*errorp = error;
519 		return (NULL);
520 	}
521 	disk->d_state = G_RAID3_DISK_STATE_NONE;
522 	disk->d_flags = md->md_dflags;
523 	if (md->md_provider[0] != '\0')
524 		disk->d_flags |= G_RAID3_DISK_FLAG_HARDCODED;
525 	disk->d_sync.ds_consumer = NULL;
526 	disk->d_sync.ds_offset = md->md_sync_offset;
527 	disk->d_sync.ds_offset_done = md->md_sync_offset;
528 	disk->d_genid = md->md_genid;
529 	disk->d_sync.ds_syncid = md->md_syncid;
530 	if (errorp != NULL)
531 		*errorp = 0;
532 	return (disk);
533 }
534 
535 static void
536 g_raid3_destroy_disk(struct g_raid3_disk *disk)
537 {
538 	struct g_raid3_softc *sc;
539 
540 	g_topology_assert_not();
541 	sc = disk->d_softc;
542 	sx_assert(&sc->sc_lock, SX_XLOCKED);
543 
544 	if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
545 		return;
546 	g_raid3_event_cancel(disk);
547 	switch (disk->d_state) {
548 	case G_RAID3_DISK_STATE_SYNCHRONIZING:
549 		if (sc->sc_syncdisk != NULL)
550 			g_raid3_sync_stop(sc, 1);
551 		/* FALLTHROUGH */
552 	case G_RAID3_DISK_STATE_NEW:
553 	case G_RAID3_DISK_STATE_STALE:
554 	case G_RAID3_DISK_STATE_ACTIVE:
555 		g_topology_lock();
556 		g_raid3_disconnect_consumer(sc, disk->d_consumer);
557 		g_topology_unlock();
558 		disk->d_consumer = NULL;
559 		break;
560 	default:
561 		KASSERT(0 == 1, ("Wrong disk state (%s, %s).",
562 		    g_raid3_get_diskname(disk),
563 		    g_raid3_disk_state2str(disk->d_state)));
564 	}
565 	disk->d_state = G_RAID3_DISK_STATE_NODISK;
566 }
567 
568 static void
569 g_raid3_destroy_device(struct g_raid3_softc *sc)
570 {
571 	struct g_raid3_event *ep;
572 	struct g_raid3_disk *disk;
573 	struct g_geom *gp;
574 	struct g_consumer *cp;
575 	u_int n;
576 
577 	g_topology_assert_not();
578 	sx_assert(&sc->sc_lock, SX_XLOCKED);
579 
580 	gp = sc->sc_geom;
581 	if (sc->sc_provider != NULL)
582 		g_raid3_destroy_provider(sc);
583 	for (n = 0; n < sc->sc_ndisks; n++) {
584 		disk = &sc->sc_disks[n];
585 		if (disk->d_state != G_RAID3_DISK_STATE_NODISK) {
586 			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
587 			g_raid3_update_metadata(disk);
588 			g_raid3_destroy_disk(disk);
589 		}
590 	}
591 	while ((ep = g_raid3_event_get(sc)) != NULL) {
592 		g_raid3_event_remove(sc, ep);
593 		if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
594 			g_raid3_event_free(ep);
595 		else {
596 			ep->e_error = ECANCELED;
597 			ep->e_flags |= G_RAID3_EVENT_DONE;
598 			G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, ep);
599 			mtx_lock(&sc->sc_events_mtx);
600 			wakeup(ep);
601 			mtx_unlock(&sc->sc_events_mtx);
602 		}
603 	}
604 	callout_drain(&sc->sc_callout);
605 	gp->softc = NULL;
606 	cp = LIST_FIRST(&sc->sc_sync.ds_geom->consumer);
607 	g_topology_lock();
608 	if (cp != NULL)
609 		g_raid3_disconnect_consumer(sc, cp);
610 	sc->sc_sync.ds_geom->softc = NULL;
611 	g_wither_geom(sc->sc_sync.ds_geom, ENXIO);
612 	G_RAID3_DEBUG(0, "Device %s destroyed.", gp->name);
613 	g_wither_geom(gp, ENXIO);
614 	g_topology_unlock();
615 	uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_64K].sz_zone);
616 	uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_16K].sz_zone);
617 	uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_4K].sz_zone);
618 	mtx_destroy(&sc->sc_queue_mtx);
619 	mtx_destroy(&sc->sc_events_mtx);
620 	sx_xunlock(&sc->sc_lock);
621 	sx_destroy(&sc->sc_lock);
622 }
623 
624 static void
625 g_raid3_orphan(struct g_consumer *cp)
626 {
627 	struct g_raid3_disk *disk;
628 
629 	g_topology_assert();
630 
631 	disk = cp->private;
632 	if (disk == NULL)
633 		return;
634 	disk->d_softc->sc_bump_id = G_RAID3_BUMP_SYNCID;
635 	g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
636 	    G_RAID3_EVENT_DONTWAIT);
637 }
638 
639 static int
640 g_raid3_write_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
641 {
642 	struct g_raid3_softc *sc;
643 	struct g_consumer *cp;
644 	off_t offset, length;
645 	u_char *sector;
646 	int error = 0;
647 
648 	g_topology_assert_not();
649 	sc = disk->d_softc;
650 	sx_assert(&sc->sc_lock, SX_LOCKED);
651 
652 	cp = disk->d_consumer;
653 	KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name));
654 	KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name));
655 	KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
656 	    ("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr,
657 	    cp->acw, cp->ace));
658 	length = cp->provider->sectorsize;
659 	offset = cp->provider->mediasize - length;
660 	sector = malloc((size_t)length, M_RAID3, M_WAITOK | M_ZERO);
661 	if (md != NULL)
662 		raid3_metadata_encode(md, sector);
663 	error = g_write_data(cp, offset, sector, length);
664 	free(sector, M_RAID3);
665 	if (error != 0) {
666 		if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) {
667 			G_RAID3_DEBUG(0, "Cannot write metadata on %s "
668 			    "(device=%s, error=%d).",
669 			    g_raid3_get_diskname(disk), sc->sc_name, error);
670 			disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN;
671 		} else {
672 			G_RAID3_DEBUG(1, "Cannot write metadata on %s "
673 			    "(device=%s, error=%d).",
674 			    g_raid3_get_diskname(disk), sc->sc_name, error);
675 		}
676 		if (g_raid3_disconnect_on_failure &&
677 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
678 			sc->sc_bump_id |= G_RAID3_BUMP_GENID;
679 			g_raid3_event_send(disk,
680 			    G_RAID3_DISK_STATE_DISCONNECTED,
681 			    G_RAID3_EVENT_DONTWAIT);
682 		}
683 	}
684 	return (error);
685 }
686 
687 int
688 g_raid3_clear_metadata(struct g_raid3_disk *disk)
689 {
690 	int error;
691 
692 	g_topology_assert_not();
693 	sx_assert(&disk->d_softc->sc_lock, SX_LOCKED);
694 
695 	error = g_raid3_write_metadata(disk, NULL);
696 	if (error == 0) {
697 		G_RAID3_DEBUG(2, "Metadata on %s cleared.",
698 		    g_raid3_get_diskname(disk));
699 	} else {
700 		G_RAID3_DEBUG(0,
701 		    "Cannot clear metadata on disk %s (error=%d).",
702 		    g_raid3_get_diskname(disk), error);
703 	}
704 	return (error);
705 }
706 
707 void
708 g_raid3_fill_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
709 {
710 	struct g_raid3_softc *sc;
711 	struct g_provider *pp;
712 
713 	sc = disk->d_softc;
714 	strlcpy(md->md_magic, G_RAID3_MAGIC, sizeof(md->md_magic));
715 	md->md_version = G_RAID3_VERSION;
716 	strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name));
717 	md->md_id = sc->sc_id;
718 	md->md_all = sc->sc_ndisks;
719 	md->md_genid = sc->sc_genid;
720 	md->md_mediasize = sc->sc_mediasize;
721 	md->md_sectorsize = sc->sc_sectorsize;
722 	md->md_mflags = (sc->sc_flags & G_RAID3_DEVICE_FLAG_MASK);
723 	md->md_no = disk->d_no;
724 	md->md_syncid = disk->d_sync.ds_syncid;
725 	md->md_dflags = (disk->d_flags & G_RAID3_DISK_FLAG_MASK);
726 	if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING)
727 		md->md_sync_offset = disk->d_sync.ds_offset_done;
728 	else
729 		md->md_sync_offset = 0;
730 	if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL)
731 		pp = disk->d_consumer->provider;
732 	else
733 		pp = NULL;
734 	if ((disk->d_flags & G_RAID3_DISK_FLAG_HARDCODED) != 0 && pp != NULL)
735 		strlcpy(md->md_provider, pp->name, sizeof(md->md_provider));
736 	else
737 		bzero(md->md_provider, sizeof(md->md_provider));
738 	if (pp != NULL)
739 		md->md_provsize = pp->mediasize;
740 	else
741 		md->md_provsize = 0;
742 }
743 
744 void
745 g_raid3_update_metadata(struct g_raid3_disk *disk)
746 {
747 	struct g_raid3_softc *sc;
748 	struct g_raid3_metadata md;
749 	int error;
750 
751 	g_topology_assert_not();
752 	sc = disk->d_softc;
753 	sx_assert(&sc->sc_lock, SX_LOCKED);
754 
755 	g_raid3_fill_metadata(disk, &md);
756 	error = g_raid3_write_metadata(disk, &md);
757 	if (error == 0) {
758 		G_RAID3_DEBUG(2, "Metadata on %s updated.",
759 		    g_raid3_get_diskname(disk));
760 	} else {
761 		G_RAID3_DEBUG(0,
762 		    "Cannot update metadata on disk %s (error=%d).",
763 		    g_raid3_get_diskname(disk), error);
764 	}
765 }
766 
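/*
 * Bump the synchronization ID and store the new value in the metadata of
 * every ACTIVE and SYNCHRONIZING component.  Components that miss the
 * update are left with an older syncid and can later be recognized as
 * stale.
 */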
767 static void
768 g_raid3_bump_syncid(struct g_raid3_softc *sc)
769 {
770 	struct g_raid3_disk *disk;
771 	u_int n;
772 
773 	g_topology_assert_not();
774 	sx_assert(&sc->sc_lock, SX_XLOCKED);
775 	KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
776 	    ("%s called with no active disks (device=%s).", __func__,
777 	    sc->sc_name));
778 
779 	sc->sc_syncid++;
780 	G_RAID3_DEBUG(1, "Device %s: syncid bumped to %u.", sc->sc_name,
781 	    sc->sc_syncid);
782 	for (n = 0; n < sc->sc_ndisks; n++) {
783 		disk = &sc->sc_disks[n];
784 		if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
785 		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
786 			disk->d_sync.ds_syncid = sc->sc_syncid;
787 			g_raid3_update_metadata(disk);
788 		}
789 	}
790 }
791 
792 static void
793 g_raid3_bump_genid(struct g_raid3_softc *sc)
794 {
795 	struct g_raid3_disk *disk;
796 	u_int n;
797 
798 	g_topology_assert_not();
799 	sx_assert(&sc->sc_lock, SX_XLOCKED);
800 	KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
801 	    ("%s called with no active disks (device=%s).", __func__,
802 	    sc->sc_name));
803 
804 	sc->sc_genid++;
805 	G_RAID3_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name,
806 	    sc->sc_genid);
807 	for (n = 0; n < sc->sc_ndisks; n++) {
808 		disk = &sc->sc_disks[n];
809 		if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
810 		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
811 			disk->d_genid = sc->sc_genid;
812 			g_raid3_update_metadata(disk);
813 		}
814 	}
815 }
816 
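/*
 * Mark all active components as clean once the device has seen no writes
 * for kern.geom.raid3.idletime seconds.  If the device is open for writing
 * but has not been idle for long enough, return the number of seconds left
 * to wait.
 */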
817 static int
818 g_raid3_idle(struct g_raid3_softc *sc, int acw)
819 {
820 	struct g_raid3_disk *disk;
821 	u_int i;
822 	int timeout;
823 
824 	g_topology_assert_not();
825 	sx_assert(&sc->sc_lock, SX_XLOCKED);
826 
827 	if (sc->sc_provider == NULL)
828 		return (0);
829 	if (sc->sc_idle)
830 		return (0);
831 	if (sc->sc_writes > 0)
832 		return (0);
833 	if (acw > 0 || (acw == -1 && sc->sc_provider->acw > 0)) {
834 		timeout = g_raid3_idletime - (time_uptime - sc->sc_last_write);
835 		if (timeout > 0)
836 			return (timeout);
837 	}
838 	sc->sc_idle = 1;
839 	for (i = 0; i < sc->sc_ndisks; i++) {
840 		disk = &sc->sc_disks[i];
841 		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
842 			continue;
843 		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
844 		    g_raid3_get_diskname(disk), sc->sc_name);
845 		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
846 		g_raid3_update_metadata(disk);
847 	}
848 	return (0);
849 }
850 
851 static void
852 g_raid3_unidle(struct g_raid3_softc *sc)
853 {
854 	struct g_raid3_disk *disk;
855 	u_int i;
856 
857 	g_topology_assert_not();
858 	sx_assert(&sc->sc_lock, SX_XLOCKED);
859 
860 	sc->sc_idle = 0;
861 	sc->sc_last_write = time_uptime;
862 	for (i = 0; i < sc->sc_ndisks; i++) {
863 		disk = &sc->sc_disks[i];
864 		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
865 			continue;
866 		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
867 		    g_raid3_get_diskname(disk), sc->sc_name);
868 		disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
869 		g_raid3_update_metadata(disk);
870 	}
871 }
872 
873 /*
874  * Treat the bio_driver1 field in the parent bio as the list head and the
875  * bio_caller1 field in each child bio as a pointer to the next list element.
876  */
877 #define	G_RAID3_HEAD_BIO(pbp)	(pbp)->bio_driver1
878 
879 #define	G_RAID3_NEXT_BIO(cbp)	(cbp)->bio_caller1
880 
881 #define	G_RAID3_FOREACH_BIO(pbp, bp)					\
882 	for ((bp) = G_RAID3_HEAD_BIO(pbp); (bp) != NULL;		\
883 	    (bp) = G_RAID3_NEXT_BIO(bp))
884 
885 #define	G_RAID3_FOREACH_SAFE_BIO(pbp, bp, tmpbp)			\
886 	for ((bp) = G_RAID3_HEAD_BIO(pbp);				\
887 	    (bp) != NULL && ((tmpbp) = G_RAID3_NEXT_BIO(bp), 1);	\
888 	    (bp) = (tmpbp))
889 
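/*
 * A parent bio with two children cloned by g_raid3_clone_bio() is linked
 * like this:
 *
 *	pbp->bio_driver1 --> cbp0 --(bio_caller1)--> cbp1 --> NULL
 *
 * so walking the children is simply (a sketch):
 *
 *	struct bio *bp;
 *
 *	G_RAID3_FOREACH_BIO(pbp, bp)
 *		G_RAID3_LOGREQ(3, bp, "Visiting child bio.");
 */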
890 static void
891 g_raid3_init_bio(struct bio *pbp)
892 {
893 
894 	G_RAID3_HEAD_BIO(pbp) = NULL;
895 }
896 
897 static void
898 g_raid3_remove_bio(struct bio *cbp)
899 {
900 	struct bio *pbp, *bp;
901 
902 	pbp = cbp->bio_parent;
903 	if (G_RAID3_HEAD_BIO(pbp) == cbp)
904 		G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
905 	else {
906 		G_RAID3_FOREACH_BIO(pbp, bp) {
907 			if (G_RAID3_NEXT_BIO(bp) == cbp) {
908 				G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
909 				break;
910 			}
911 		}
912 	}
913 	G_RAID3_NEXT_BIO(cbp) = NULL;
914 }
915 
916 static void
917 g_raid3_replace_bio(struct bio *sbp, struct bio *dbp)
918 {
919 	struct bio *pbp, *bp;
920 
921 	g_raid3_remove_bio(sbp);
922 	pbp = dbp->bio_parent;
923 	G_RAID3_NEXT_BIO(sbp) = G_RAID3_NEXT_BIO(dbp);
924 	if (G_RAID3_HEAD_BIO(pbp) == dbp)
925 		G_RAID3_HEAD_BIO(pbp) = sbp;
926 	else {
927 		G_RAID3_FOREACH_BIO(pbp, bp) {
928 			if (G_RAID3_NEXT_BIO(bp) == dbp) {
929 				G_RAID3_NEXT_BIO(bp) = sbp;
930 				break;
931 			}
932 		}
933 	}
934 	G_RAID3_NEXT_BIO(dbp) = NULL;
935 }
936 
937 static void
938 g_raid3_destroy_bio(struct g_raid3_softc *sc, struct bio *cbp)
939 {
940 	struct bio *bp, *pbp;
941 	size_t size;
942 
943 	pbp = cbp->bio_parent;
944 	pbp->bio_children--;
945 	KASSERT(cbp->bio_data != NULL, ("NULL bio_data"));
946 	size = pbp->bio_length / (sc->sc_ndisks - 1);
947 	uma_zfree_arg(sc->sc_zones[g_raid3_zone(size)].sz_zone,
948 	    cbp->bio_data,
949 	    &sc->sc_zones[g_raid3_zone(size)]);
950 	if (G_RAID3_HEAD_BIO(pbp) == cbp) {
951 		G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
952 		G_RAID3_NEXT_BIO(cbp) = NULL;
953 		g_destroy_bio(cbp);
954 	} else {
955 		G_RAID3_FOREACH_BIO(pbp, bp) {
956 			if (G_RAID3_NEXT_BIO(bp) == cbp)
957 				break;
958 		}
959 		if (bp != NULL) {
960 			KASSERT(G_RAID3_NEXT_BIO(bp) != NULL,
961 			    ("NULL bp->bio_caller1"));
962 			G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
963 			G_RAID3_NEXT_BIO(cbp) = NULL;
964 		}
965 		g_destroy_bio(cbp);
966 	}
967 }
968 
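/*
 * Clone the parent bio and allocate a data buffer for the clone from the
 * UMA zone matching the per-disk slice size.  Regular requests (those with
 * G_RAID3_BIO_CFLAG_REGULAR set) may sleep for memory; everything else
 * allocates with M_NOWAIT and the caller must handle a NULL return.
 */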
969 static struct bio *
970 g_raid3_clone_bio(struct g_raid3_softc *sc, struct bio *pbp)
971 {
972 	struct bio *bp, *cbp;
973 	size_t size;
974 	int memflag;
975 
976 	cbp = g_clone_bio(pbp);
977 	if (cbp == NULL)
978 		return (NULL);
979 	size = pbp->bio_length / (sc->sc_ndisks - 1);
980 	if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0)
981 		memflag = M_WAITOK;
982 	else
983 		memflag = M_NOWAIT;
984 	cbp->bio_data = uma_zalloc_arg(sc->sc_zones[g_raid3_zone(size)].sz_zone,
985 	   &sc->sc_zones[g_raid3_zone(size)], memflag);
986 	sc->sc_zones[g_raid3_zone(size)].sz_requested++;
987 	if (cbp->bio_data == NULL) {
988 		sc->sc_zones[g_raid3_zone(size)].sz_failed++;
989 		pbp->bio_children--;
990 		g_destroy_bio(cbp);
991 		return (NULL);
992 	}
993 	G_RAID3_NEXT_BIO(cbp) = NULL;
994 	if (G_RAID3_HEAD_BIO(pbp) == NULL)
995 		G_RAID3_HEAD_BIO(pbp) = cbp;
996 	else {
997 		G_RAID3_FOREACH_BIO(pbp, bp) {
998 			if (G_RAID3_NEXT_BIO(bp) == NULL) {
999 				G_RAID3_NEXT_BIO(bp) = cbp;
1000 				break;
1001 			}
1002 		}
1003 	}
1004 	return (cbp);
1005 }
1006 
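/*
 * Split a write request into per-component requests.  Each provider sector
 * is cut into (ndisks - 1) atoms of sectorsize / (ndisks - 1) bytes; atom n
 * of every sector goes to component n and the last component receives the
 * XOR (parity) of all data atoms.  For example, with three components and
 * 512-byte sectors, each data component gets a 256-byte atom per sector and
 * the third component gets their XOR.
 */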
1007 static void
1008 g_raid3_scatter(struct bio *pbp)
1009 {
1010 	struct g_raid3_softc *sc;
1011 	struct g_raid3_disk *disk;
1012 	struct bio *bp, *cbp;
1013 	off_t atom, cadd, padd, left;
1014 
1015 	sc = pbp->bio_to->geom->softc;
1016 	bp = NULL;
1017 	if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
1018 		/*
1019 		 * Find bio for which we should calculate data.
1020 		 */
1021 		G_RAID3_FOREACH_BIO(pbp, cbp) {
1022 			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
1023 				bp = cbp;
1024 				break;
1025 			}
1026 		}
1027 		KASSERT(bp != NULL, ("NULL parity bio."));
1028 	}
1029 	atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
1030 	cadd = padd = 0;
1031 	for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
1032 		G_RAID3_FOREACH_BIO(pbp, cbp) {
1033 			if (cbp == bp)
1034 				continue;
1035 			bcopy(pbp->bio_data + padd, cbp->bio_data + cadd, atom);
1036 			padd += atom;
1037 		}
1038 		cadd += atom;
1039 	}
1040 	if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
1041 		struct bio *tmpbp;
1042 
1043 		/*
1044 		 * Calculate parity.
1045 		 */
1046 		bzero(bp->bio_data, bp->bio_length);
1047 		G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
1048 			if (cbp == bp)
1049 				continue;
1050 			g_raid3_xor(cbp->bio_data, bp->bio_data, bp->bio_data,
1051 			    bp->bio_length);
1052 			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_NODISK) != 0)
1053 				g_raid3_destroy_bio(sc, cbp);
1054 		}
1055 	}
1056 	G_RAID3_FOREACH_BIO(pbp, cbp) {
1057 		struct g_consumer *cp;
1058 
1059 		disk = cbp->bio_caller2;
1060 		cp = disk->d_consumer;
1061 		cbp->bio_to = cp->provider;
1062 		G_RAID3_LOGREQ(3, cbp, "Sending request.");
1063 		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
1064 		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
1065 		    cp->acr, cp->acw, cp->ace));
1066 		cp->index++;
1067 		sc->sc_writes++;
1068 		g_io_request(cbp, cp);
1069 	}
1070 }
1071 
1072 static void
1073 g_raid3_gather(struct bio *pbp)
1074 {
1075 	struct g_raid3_softc *sc;
1076 	struct g_raid3_disk *disk;
1077 	struct bio *xbp, *fbp, *cbp;
1078 	off_t atom, cadd, padd, left;
1079 
1080 	sc = pbp->bio_to->geom->softc;
1081 	/*
1082 	 * Find the bio for which we have to reconstruct the data and, while
1083 	 * walking the list, check whether all requests succeeded; if not,
1084 	 * fail the whole request.
1085 	 * In COMPLETE mode we allow a single request to fail: when we find
1086 	 * one, we re-send it to the parity consumer.  Any further failed
1087 	 * request fails the whole request.
1088 	 */
1089 	xbp = fbp = NULL;
1090 	G_RAID3_FOREACH_BIO(pbp, cbp) {
1091 		if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
1092 			KASSERT(xbp == NULL, ("More than one parity bio."));
1093 			xbp = cbp;
1094 		}
1095 		if (cbp->bio_error == 0)
1096 			continue;
1097 		/*
1098 		 * Found failed request.
1099 		 */
1100 		if (fbp == NULL) {
1101 			if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_DEGRADED) != 0) {
1102 				/*
1103 				 * We are already in degraded mode, so we can't
1104 				 * accept any failures.
1105 				 */
1106 				if (pbp->bio_error == 0)
1107 					pbp->bio_error = cbp->bio_error;
1108 			} else {
1109 				fbp = cbp;
1110 			}
1111 		} else {
1112 			/*
1113 			 * Next failed request, that's too many.
1114 			 */
1115 			if (pbp->bio_error == 0)
1116 				pbp->bio_error = fbp->bio_error;
1117 		}
1118 		disk = cbp->bio_caller2;
1119 		if (disk == NULL)
1120 			continue;
1121 		if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) {
1122 			disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN;
1123 			G_RAID3_LOGREQ(0, cbp, "Request failed (error=%d).",
1124 			    cbp->bio_error);
1125 		} else {
1126 			G_RAID3_LOGREQ(1, cbp, "Request failed (error=%d).",
1127 			    cbp->bio_error);
1128 		}
1129 		if (g_raid3_disconnect_on_failure &&
1130 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
1131 			sc->sc_bump_id |= G_RAID3_BUMP_GENID;
1132 			g_raid3_event_send(disk,
1133 			    G_RAID3_DISK_STATE_DISCONNECTED,
1134 			    G_RAID3_EVENT_DONTWAIT);
1135 		}
1136 	}
1137 	if (pbp->bio_error != 0)
1138 		goto finish;
1139 	if (fbp != NULL && (pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
1140 		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_VERIFY;
1141 		if (xbp != fbp)
1142 			g_raid3_replace_bio(xbp, fbp);
1143 		g_raid3_destroy_bio(sc, fbp);
1144 	} else if (fbp != NULL) {
1145 		struct g_consumer *cp;
1146 
1147 		/*
1148 		 * One request failed, so send the same request to
1149 		 * the parity consumer.
1150 		 */
1151 		disk = pbp->bio_driver2;
1152 		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
1153 			pbp->bio_error = fbp->bio_error;
1154 			goto finish;
1155 		}
1156 		pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
1157 		pbp->bio_inbed--;
1158 		fbp->bio_flags &= ~(BIO_DONE | BIO_ERROR);
1159 		if (disk->d_no == sc->sc_ndisks - 1)
1160 			fbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
1161 		fbp->bio_error = 0;
1162 		fbp->bio_completed = 0;
1163 		fbp->bio_children = 0;
1164 		fbp->bio_inbed = 0;
1165 		cp = disk->d_consumer;
1166 		fbp->bio_caller2 = disk;
1167 		fbp->bio_to = cp->provider;
1168 		G_RAID3_LOGREQ(3, fbp, "Sending request (recover).");
1169 		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
1170 		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
1171 		    cp->acr, cp->acw, cp->ace));
1172 		cp->index++;
1173 		g_io_request(fbp, cp);
1174 		return;
1175 	}
1176 	if (xbp != NULL) {
1177 		/*
1178 		 * Calculate parity.
1179 		 */
1180 		G_RAID3_FOREACH_BIO(pbp, cbp) {
1181 			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0)
1182 				continue;
1183 			g_raid3_xor(cbp->bio_data, xbp->bio_data, xbp->bio_data,
1184 			    xbp->bio_length);
1185 		}
1186 		xbp->bio_cflags &= ~G_RAID3_BIO_CFLAG_PARITY;
1187 		if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
1188 			if (!g_raid3_is_zero(xbp)) {
1189 				g_raid3_parity_mismatch++;
1190 				pbp->bio_error = EIO;
1191 				goto finish;
1192 			}
1193 			g_raid3_destroy_bio(sc, xbp);
1194 		}
1195 	}
1196 	atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
1197 	cadd = padd = 0;
1198 	for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
1199 		G_RAID3_FOREACH_BIO(pbp, cbp) {
1200 			bcopy(cbp->bio_data + cadd, pbp->bio_data + padd, atom);
1201 			pbp->bio_completed += atom;
1202 			padd += atom;
1203 		}
1204 		cadd += atom;
1205 	}
1206 finish:
1207 	if (pbp->bio_error == 0)
1208 		G_RAID3_LOGREQ(3, pbp, "Request finished.");
1209 	else {
1210 		if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0)
1211 			G_RAID3_LOGREQ(1, pbp, "Verification error.");
1212 		else
1213 			G_RAID3_LOGREQ(0, pbp, "Request failed.");
1214 	}
1215 	pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_MASK;
1216 	while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
1217 		g_raid3_destroy_bio(sc, cbp);
1218 	g_io_deliver(pbp, pbp->bio_error);
1219 }
1220 
1221 static void
1222 g_raid3_done(struct bio *bp)
1223 {
1224 	struct g_raid3_softc *sc;
1225 
1226 	sc = bp->bio_from->geom->softc;
1227 	bp->bio_cflags |= G_RAID3_BIO_CFLAG_REGULAR;
1228 	G_RAID3_LOGREQ(3, bp, "Regular request done (error=%d).", bp->bio_error);
1229 	mtx_lock(&sc->sc_queue_mtx);
1230 	bioq_insert_head(&sc->sc_queue, bp);
1231 	wakeup(sc);
1232 	wakeup(&sc->sc_queue);
1233 	mtx_unlock(&sc->sc_queue_mtx);
1234 }
1235 
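/*
 * Complete one cloned regular request.  Nothing happens until all children
 * have reported in (bio_inbed == bio_children); then READs are reassembled
 * by g_raid3_gather() and WRITEs/DELETEs are collapsed into a single parent
 * status, disconnecting broken components along the way.
 */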
1236 static void
1237 g_raid3_regular_request(struct bio *cbp)
1238 {
1239 	struct g_raid3_softc *sc;
1240 	struct g_raid3_disk *disk;
1241 	struct bio *pbp;
1242 
1243 	g_topology_assert_not();
1244 
1245 	pbp = cbp->bio_parent;
1246 	sc = pbp->bio_to->geom->softc;
1247 	cbp->bio_from->index--;
1248 	if (cbp->bio_cmd == BIO_WRITE)
1249 		sc->sc_writes--;
1250 	disk = cbp->bio_from->private;
1251 	if (disk == NULL) {
1252 		g_topology_lock();
1253 		g_raid3_kill_consumer(sc, cbp->bio_from);
1254 		g_topology_unlock();
1255 	}
1256 
1257 	G_RAID3_LOGREQ(3, cbp, "Request finished.");
1258 	pbp->bio_inbed++;
1259 	KASSERT(pbp->bio_inbed <= pbp->bio_children,
1260 	    ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed,
1261 	    pbp->bio_children));
1262 	if (pbp->bio_inbed != pbp->bio_children)
1263 		return;
1264 	switch (pbp->bio_cmd) {
1265 	case BIO_READ:
1266 		g_raid3_gather(pbp);
1267 		break;
1268 	case BIO_WRITE:
1269 	case BIO_DELETE:
1270 	    {
1271 		int error = 0;
1272 
1273 		pbp->bio_completed = pbp->bio_length;
1274 		while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) {
1275 			if (cbp->bio_error == 0) {
1276 				g_raid3_destroy_bio(sc, cbp);
1277 				continue;
1278 			}
1279 
1280 			if (error == 0)
1281 				error = cbp->bio_error;
1282 			else if (pbp->bio_error == 0) {
1283 				/*
1284 				 * Next failed request, that's too many.
1285 				 */
1286 				pbp->bio_error = error;
1287 			}
1288 
1289 			disk = cbp->bio_caller2;
1290 			if (disk == NULL) {
1291 				g_raid3_destroy_bio(sc, cbp);
1292 				continue;
1293 			}
1294 
1295 			if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) {
1296 				disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN;
1297 				G_RAID3_LOGREQ(0, cbp,
1298 				    "Request failed (error=%d).",
1299 				    cbp->bio_error);
1300 			} else {
1301 				G_RAID3_LOGREQ(1, cbp,
1302 				    "Request failed (error=%d).",
1303 				    cbp->bio_error);
1304 			}
1305 			if (g_raid3_disconnect_on_failure &&
1306 			    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
1307 				sc->sc_bump_id |= G_RAID3_BUMP_GENID;
1308 				g_raid3_event_send(disk,
1309 				    G_RAID3_DISK_STATE_DISCONNECTED,
1310 				    G_RAID3_EVENT_DONTWAIT);
1311 			}
1312 			g_raid3_destroy_bio(sc, cbp);
1313 		}
1314 		if (pbp->bio_error == 0)
1315 			G_RAID3_LOGREQ(3, pbp, "Request finished.");
1316 		else
1317 			G_RAID3_LOGREQ(0, pbp, "Request failed.");
1318 		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_DEGRADED;
1319 		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_NOPARITY;
1320 		bioq_remove(&sc->sc_inflight, pbp);
1321 		/* Release delayed sync requests if possible. */
1322 		g_raid3_sync_release(sc);
1323 		g_io_deliver(pbp, pbp->bio_error);
1324 		break;
1325 	    }
1326 	}
1327 }
1328 
1329 static void
1330 g_raid3_sync_done(struct bio *bp)
1331 {
1332 	struct g_raid3_softc *sc;
1333 
1334 	G_RAID3_LOGREQ(3, bp, "Synchronization request delivered.");
1335 	sc = bp->bio_from->geom->softc;
1336 	bp->bio_cflags |= G_RAID3_BIO_CFLAG_SYNC;
1337 	mtx_lock(&sc->sc_queue_mtx);
1338 	bioq_insert_head(&sc->sc_queue, bp);
1339 	wakeup(sc);
1340 	wakeup(&sc->sc_queue);
1341 	mtx_unlock(&sc->sc_queue_mtx);
1342 }
1343 
1344 static void
1345 g_raid3_start(struct bio *bp)
1346 {
1347 	struct g_raid3_softc *sc;
1348 
1349 	sc = bp->bio_to->geom->softc;
1350 	/*
1351 	 * If sc == NULL or there are no valid disks, provider's error
1352 	 * should be set and g_raid3_start() should not be called at all.
1353 	 */
1354 	KASSERT(sc != NULL && (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
1355 	    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE),
1356 	    ("Provider's error should be set (error=%d)(device=%s).",
1357 	    bp->bio_to->error, bp->bio_to->name));
1358 	G_RAID3_LOGREQ(3, bp, "Request received.");
1359 
1360 	switch (bp->bio_cmd) {
1361 	case BIO_READ:
1362 	case BIO_WRITE:
1363 	case BIO_DELETE:
1364 		break;
1365 	case BIO_GETATTR:
1366 	default:
1367 		g_io_deliver(bp, EOPNOTSUPP);
1368 		return;
1369 	}
1370 	mtx_lock(&sc->sc_queue_mtx);
1371 	bioq_insert_tail(&sc->sc_queue, bp);
1372 	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
1373 	wakeup(sc);
1374 	mtx_unlock(&sc->sc_queue_mtx);
1375 }
1376 
1377 /*
1378  * Return TRUE if the given regular request collides with an in-progress
1379  * synchronization request.
1380  */
1381 static int
1382 g_raid3_sync_collision(struct g_raid3_softc *sc, struct bio *bp)
1383 {
1384 	struct g_raid3_disk *disk;
1385 	struct bio *sbp;
1386 	off_t rstart, rend, sstart, send;
1387 	int i;
1388 
1389 	disk = sc->sc_syncdisk;
1390 	if (disk == NULL)
1391 		return (0);
1392 	rstart = bp->bio_offset;
1393 	rend = bp->bio_offset + bp->bio_length;
1394 	for (i = 0; i < g_raid3_syncreqs; i++) {
1395 		sbp = disk->d_sync.ds_bios[i];
1396 		if (sbp == NULL)
1397 			continue;
1398 		sstart = sbp->bio_offset;
1399 		send = sbp->bio_length;
1400 		if (sbp->bio_cmd == BIO_WRITE) {
1401 			sstart *= sc->sc_ndisks - 1;
1402 			send *= sc->sc_ndisks - 1;
1403 		}
1404 		send += sstart;
1405 		if (rend > sstart && rstart < send)
1406 			return (1);
1407 	}
1408 	return (0);
1409 }
1410 
1411 /*
1412  * Return TRUE if the given synchronization request collides with an
1413  * in-progress regular request.
1414  */
1415 static int
1416 g_raid3_regular_collision(struct g_raid3_softc *sc, struct bio *sbp)
1417 {
1418 	off_t rstart, rend, sstart, send;
1419 	struct bio *bp;
1420 
1421 	if (sc->sc_syncdisk == NULL)
1422 		return (0);
1423 	sstart = sbp->bio_offset;
1424 	send = sstart + sbp->bio_length;
1425 	TAILQ_FOREACH(bp, &sc->sc_inflight.queue, bio_queue) {
1426 		rstart = bp->bio_offset;
1427 		rend = bp->bio_offset + bp->bio_length;
1428 		if (rend > sstart && rstart < send)
1429 			return (1);
1430 	}
1431 	return (0);
1432 }
1433 
1434 /*
1435  * Put the request onto the delayed queue.
1436  */
1437 static void
1438 g_raid3_regular_delay(struct g_raid3_softc *sc, struct bio *bp)
1439 {
1440 
1441 	G_RAID3_LOGREQ(2, bp, "Delaying request.");
1442 	bioq_insert_head(&sc->sc_regular_delayed, bp);
1443 }
1444 
1445 /*
1446  * Put the synchronization request onto the delayed queue.
1447  */
1448 static void
1449 g_raid3_sync_delay(struct g_raid3_softc *sc, struct bio *bp)
1450 {
1451 
1452 	G_RAID3_LOGREQ(2, bp, "Delaying synchronization request.");
1453 	bioq_insert_tail(&sc->sc_sync_delayed, bp);
1454 }
1455 
1456 /*
1457  * Release delayed regular requests which no longer collide with
1458  * synchronization requests.
1459  */
1460 static void
1461 g_raid3_regular_release(struct g_raid3_softc *sc)
1462 {
1463 	struct bio *bp, *bp2;
1464 
1465 	TAILQ_FOREACH_SAFE(bp, &sc->sc_regular_delayed.queue, bio_queue, bp2) {
1466 		if (g_raid3_sync_collision(sc, bp))
1467 			continue;
1468 		bioq_remove(&sc->sc_regular_delayed, bp);
1469 		G_RAID3_LOGREQ(2, bp, "Releasing delayed request (%p).", bp);
1470 		mtx_lock(&sc->sc_queue_mtx);
1471 		bioq_insert_head(&sc->sc_queue, bp);
1472 #if 0
1473 		/*
1474 		 * wakeup() is not needed, because this function is called from
1475 		 * the worker thread.
1476 		 */
1477 		wakeup(&sc->sc_queue);
1478 #endif
1479 		mtx_unlock(&sc->sc_queue_mtx);
1480 	}
1481 }
1482 
1483 /*
1484  * Release delayed synchronization requests which no longer collide with
1485  * regular requests.
1486  */
1487 static void
1488 g_raid3_sync_release(struct g_raid3_softc *sc)
1489 {
1490 	struct bio *bp, *bp2;
1491 
1492 	TAILQ_FOREACH_SAFE(bp, &sc->sc_sync_delayed.queue, bio_queue, bp2) {
1493 		if (g_raid3_regular_collision(sc, bp))
1494 			continue;
1495 		bioq_remove(&sc->sc_sync_delayed, bp);
1496 		G_RAID3_LOGREQ(2, bp,
1497 		    "Releasing delayed synchronization request.");
1498 		g_io_request(bp, bp->bio_from);
1499 	}
1500 }
1501 
1502 /*
1503  * Handle synchronization requests.
1504  * Every synchronization request is a two-step process: first, a READ
1505  * request is sent to the active provider, then a WRITE request (with the
1506  * data just read) is sent to the provider being synchronized.  When the
1507  * WRITE completes, a new synchronization request is sent.
1508  */
1509 static void
1510 g_raid3_sync_request(struct bio *bp)
1511 {
1512 	struct g_raid3_softc *sc;
1513 	struct g_raid3_disk *disk;
1514 
1515 	bp->bio_from->index--;
1516 	sc = bp->bio_from->geom->softc;
1517 	disk = bp->bio_from->private;
1518 	if (disk == NULL) {
1519 		sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */
1520 		g_topology_lock();
1521 		g_raid3_kill_consumer(sc, bp->bio_from);
1522 		g_topology_unlock();
1523 		free(bp->bio_data, M_RAID3);
1524 		g_destroy_bio(bp);
1525 		sx_xlock(&sc->sc_lock);
1526 		return;
1527 	}
1528 
1529 	/*
1530 	 * Synchronization request.
1531 	 */
1532 	switch (bp->bio_cmd) {
1533 	case BIO_READ:
1534 	    {
1535 		struct g_consumer *cp;
1536 		u_char *dst, *src;
1537 		off_t left;
1538 		u_int atom;
1539 
1540 		if (bp->bio_error != 0) {
1541 			G_RAID3_LOGREQ(0, bp,
1542 			    "Synchronization request failed (error=%d).",
1543 			    bp->bio_error);
1544 			g_destroy_bio(bp);
1545 			return;
1546 		}
1547 		G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
1548 		atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
1549 		dst = src = bp->bio_data;
1550 		if (disk->d_no == sc->sc_ndisks - 1) {
1551 			u_int n;
1552 
1553 			/* Parity component. */
1554 			for (left = bp->bio_length; left > 0;
1555 			    left -= sc->sc_sectorsize) {
1556 				bcopy(src, dst, atom);
1557 				src += atom;
1558 				for (n = 1; n < sc->sc_ndisks - 1; n++) {
1559 					g_raid3_xor(src, dst, dst, atom);
1560 					src += atom;
1561 				}
1562 				dst += atom;
1563 			}
1564 		} else {
1565 			/* Regular component. */
1566 			src += atom * disk->d_no;
1567 			for (left = bp->bio_length; left > 0;
1568 			    left -= sc->sc_sectorsize) {
1569 				bcopy(src, dst, atom);
1570 				src += sc->sc_sectorsize;
1571 				dst += atom;
1572 			}
1573 		}
1574 		bp->bio_driver1 = bp->bio_driver2 = NULL;
1575 		bp->bio_pflags = 0;
1576 		bp->bio_offset /= sc->sc_ndisks - 1;
1577 		bp->bio_length /= sc->sc_ndisks - 1;
1578 		bp->bio_cmd = BIO_WRITE;
1579 		bp->bio_cflags = 0;
1580 		bp->bio_children = bp->bio_inbed = 0;
1581 		cp = disk->d_consumer;
1582 		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
1583 		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
1584 		    cp->acr, cp->acw, cp->ace));
1585 		cp->index++;
1586 		g_io_request(bp, cp);
1587 		return;
1588 	    }
1589 	case BIO_WRITE:
1590 	    {
1591 		struct g_raid3_disk_sync *sync;
1592 		off_t boffset, moffset;
1593 		void *data;
1594 		int i;
1595 
1596 		if (bp->bio_error != 0) {
1597 			G_RAID3_LOGREQ(0, bp,
1598 			    "Synchronization request failed (error=%d).",
1599 			    bp->bio_error);
1600 			g_destroy_bio(bp);
1601 			sc->sc_bump_id |= G_RAID3_BUMP_GENID;
1602 			g_raid3_event_send(disk,
1603 			    G_RAID3_DISK_STATE_DISCONNECTED,
1604 			    G_RAID3_EVENT_DONTWAIT);
1605 			return;
1606 		}
1607 		G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
1608 		sync = &disk->d_sync;
1609 		if (sync->ds_offset == sc->sc_mediasize / (sc->sc_ndisks - 1) ||
1610 		    sync->ds_consumer == NULL ||
1611 		    (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
1612 			/* Don't send more synchronization requests. */
1613 			sync->ds_inflight--;
1614 			if (sync->ds_bios != NULL) {
1615 				i = (int)(uintptr_t)bp->bio_caller1;
1616 				sync->ds_bios[i] = NULL;
1617 			}
1618 			free(bp->bio_data, M_RAID3);
1619 			g_destroy_bio(bp);
1620 			if (sync->ds_inflight > 0)
1621 				return;
1622 			if (sync->ds_consumer == NULL ||
1623 			    (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
1624 				return;
1625 			}
1626 			/*
1627 			 * Disk up-to-date, activate it.
1628 			 */
1629 			g_raid3_event_send(disk, G_RAID3_DISK_STATE_ACTIVE,
1630 			    G_RAID3_EVENT_DONTWAIT);
1631 			return;
1632 		}
1633 
1634 		/* Send next synchronization request. */
1635 		data = bp->bio_data;
1636 		bzero(bp, sizeof(*bp));
1637 		bp->bio_cmd = BIO_READ;
1638 		bp->bio_offset = sync->ds_offset * (sc->sc_ndisks - 1);
1639 		bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset);
1640 		sync->ds_offset += bp->bio_length / (sc->sc_ndisks - 1);
1641 		bp->bio_done = g_raid3_sync_done;
1642 		bp->bio_data = data;
1643 		bp->bio_from = sync->ds_consumer;
1644 		bp->bio_to = sc->sc_provider;
1645 		G_RAID3_LOGREQ(3, bp, "Sending synchronization request.");
1646 		sync->ds_consumer->index++;
1647 		/*
1648 		 * Delay the request if it is colliding with a regular request.
1649 		 */
1650 		if (g_raid3_regular_collision(sc, bp))
1651 			g_raid3_sync_delay(sc, bp);
1652 		else
1653 			g_io_request(bp, sync->ds_consumer);
1654 
1655 		/* Release delayed requests if possible. */
1656 		g_raid3_regular_release(sc);
1657 
1658 		/* Find the smallest offset. */
1659 		moffset = sc->sc_mediasize;
1660 		for (i = 0; i < g_raid3_syncreqs; i++) {
1661 			bp = sync->ds_bios[i];
1662 			boffset = bp->bio_offset;
1663 			if (bp->bio_cmd == BIO_WRITE)
1664 				boffset *= sc->sc_ndisks - 1;
1665 			if (boffset < moffset)
1666 				moffset = boffset;
1667 		}
1668 		if (sync->ds_offset_done + (MAXPHYS * 100) < moffset) {
1669 			/* Update offset_done on every 100 blocks. */
1670 			sync->ds_offset_done = moffset;
1671 			g_raid3_update_metadata(disk);
1672 		}
1673 		return;
1674 	    }
1675 	default:
1676 		KASSERT(1 == 0, ("Invalid command here: %u (device=%s)",
1677 		    bp->bio_cmd, sc->sc_name));
1678 		break;
1679 	}
1680 }
1681 
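/*
 * Fan a regular request out to the components: READs go to the data
 * components (substituting the parity component for a missing disk, in
 * round-robin mode, or in VERIFY mode), WRITEs and DELETEs go to all
 * components.  May return ENOMEM, in which case the caller is expected to
 * retry the request later.
 */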
1682 static int
1683 g_raid3_register_request(struct bio *pbp)
1684 {
1685 	struct g_raid3_softc *sc;
1686 	struct g_raid3_disk *disk;
1687 	struct g_consumer *cp;
1688 	struct bio *cbp;
1689 	off_t offset, length;
1690 	u_int n, ndisks;
1691 	int round_robin, verify;
1692 
1693 	ndisks = 0;
1694 	sc = pbp->bio_to->geom->softc;
1695 	if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGSYNC) != 0 &&
1696 	    sc->sc_syncdisk == NULL) {
1697 		g_io_deliver(pbp, EIO);
1698 		return (0);
1699 	}
1700 	g_raid3_init_bio(pbp);
1701 	length = pbp->bio_length / (sc->sc_ndisks - 1);
1702 	offset = pbp->bio_offset / (sc->sc_ndisks - 1);
1703 	round_robin = verify = 0;
1704 	switch (pbp->bio_cmd) {
1705 	case BIO_READ:
1706 		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
1707 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
1708 			pbp->bio_pflags |= G_RAID3_BIO_PFLAG_VERIFY;
1709 			verify = 1;
1710 			ndisks = sc->sc_ndisks;
1711 		} else {
1712 			verify = 0;
1713 			ndisks = sc->sc_ndisks - 1;
1714 		}
1715 		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0 &&
1716 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
1717 			round_robin = 1;
1718 		} else {
1719 			round_robin = 0;
1720 		}
1721 		KASSERT(!round_robin || !verify,
1722 		    ("ROUND-ROBIN and VERIFY are mutually exclusive."));
1723 		pbp->bio_driver2 = &sc->sc_disks[sc->sc_ndisks - 1];
1724 		break;
1725 	case BIO_WRITE:
1726 	case BIO_DELETE:
1727 		/*
1728 		 * Delay the request if it is colliding with a synchronization
1729 		 * request.
1730 		 */
1731 		if (g_raid3_sync_collision(sc, pbp)) {
1732 			g_raid3_regular_delay(sc, pbp);
1733 			return (0);
1734 		}
1735 
1736 		if (sc->sc_idle)
1737 			g_raid3_unidle(sc);
1738 		else
1739 			sc->sc_last_write = time_uptime;
1740 
1741 		ndisks = sc->sc_ndisks;
1742 		break;
1743 	}
1744 	for (n = 0; n < ndisks; n++) {
1745 		disk = &sc->sc_disks[n];
1746 		cbp = g_raid3_clone_bio(sc, pbp);
1747 		if (cbp == NULL) {
1748 			while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
1749 				g_raid3_destroy_bio(sc, cbp);
1750 			/*
1751 			 * To prevent deadlock, requests that failed for
1752 			 * any of our consumers must be completed with
1753 			 * ENOMEM back up the stack.  Our own sync
1754 			 * requests can stick around, as they are finite.
1755 			 */
1756 			if ((pbp->bio_cflags &
1757 			    G_RAID3_BIO_CFLAG_REGULAR) != 0) {
1758 				g_io_deliver(pbp, ENOMEM);
1759 				return (0);
1760 			}
1761 			return (ENOMEM);
1762 		}
1763 		cbp->bio_offset = offset;
1764 		cbp->bio_length = length;
1765 		cbp->bio_done = g_raid3_done;
1766 		switch (pbp->bio_cmd) {
1767 		case BIO_READ:
1768 			if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
1769 				/*
1770 				 * Replace invalid component with the parity
1771 				 * component.
1772 				 */
1773 				disk = &sc->sc_disks[sc->sc_ndisks - 1];
1774 				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
1775 				pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
1776 			} else if (round_robin &&
1777 			    disk->d_no == sc->sc_round_robin) {
1778 				/*
1779 				 * In round-robin mode skip one data component
1780 				 * and use parity component when reading.
1781 				 */
1782 				pbp->bio_driver2 = disk;
1783 				disk = &sc->sc_disks[sc->sc_ndisks - 1];
1784 				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
1785 				sc->sc_round_robin++;
1786 				round_robin = 0;
1787 			} else if (verify && disk->d_no == sc->sc_ndisks - 1) {
1788 				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
1789 			}
1790 			break;
1791 		case BIO_WRITE:
1792 		case BIO_DELETE:
1793 			if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
1794 			    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
1795 				if (n == ndisks - 1) {
1796 					/*
1797 					 * Active parity component, mark it as such.
1798 					 */
1799 					cbp->bio_cflags |=
1800 					    G_RAID3_BIO_CFLAG_PARITY;
1801 				}
1802 			} else {
1803 				pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
1804 				if (n == ndisks - 1) {
1805 					/*
1806 					 * Parity component is not connected,
1807 					 * so destroy its request.
1808 					 */
1809 					pbp->bio_pflags |=
1810 					    G_RAID3_BIO_PFLAG_NOPARITY;
1811 					g_raid3_destroy_bio(sc, cbp);
1812 					cbp = NULL;
1813 				} else {
1814 					cbp->bio_cflags |=
1815 					    G_RAID3_BIO_CFLAG_NODISK;
1816 					disk = NULL;
1817 				}
1818 			}
1819 			break;
1820 		}
1821 		if (cbp != NULL)
1822 			cbp->bio_caller2 = disk;
1823 	}
1824 	switch (pbp->bio_cmd) {
1825 	case BIO_READ:
1826 		if (round_robin) {
1827 			/*
1828 			 * If we are in round-robin mode and 'round_robin' is
1829 			 * still 1, it means that we skipped the parity component
1830 			 * for this read and must reset the sc_round_robin field.
1831 			 */
1832 			sc->sc_round_robin = 0;
1833 		}
1834 		G_RAID3_FOREACH_BIO(pbp, cbp) {
1835 			disk = cbp->bio_caller2;
1836 			cp = disk->d_consumer;
1837 			cbp->bio_to = cp->provider;
1838 			G_RAID3_LOGREQ(3, cbp, "Sending request.");
1839 			KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
1840 			    ("Consumer %s not opened (r%dw%de%d).",
1841 			    cp->provider->name, cp->acr, cp->acw, cp->ace));
1842 			cp->index++;
1843 			g_io_request(cbp, cp);
1844 		}
1845 		break;
1846 	case BIO_WRITE:
1847 	case BIO_DELETE:
1848 		/*
1849 		 * Put the request onto the inflight queue, so we can check
1850 		 * whether new synchronization requests collide with it.
1851 		 */
1852 		bioq_insert_tail(&sc->sc_inflight, pbp);
1853 
1854 		/*
1855 		 * Bump syncid on first write.
1856 		 */
1857 		if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0) {
1858 			sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
1859 			g_topology_lock();
1860 			g_raid3_bump_syncid(sc);
1861 			g_topology_unlock();
1862 		}
1863 		g_raid3_scatter(pbp);
1864 		break;
1865 	}
1866 	return (0);
1867 }
1868 
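/*
 * Return 1 if no consumer of the device geom nor of the synchronization
 * geom is busy, i.e. the device can be safely destroyed.
 */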
1869 static int
1870 g_raid3_can_destroy(struct g_raid3_softc *sc)
1871 {
1872 	struct g_geom *gp;
1873 	struct g_consumer *cp;
1874 
1875 	g_topology_assert();
1876 	gp = sc->sc_geom;
1877 	LIST_FOREACH(cp, &gp->consumer, consumer) {
1878 		if (g_raid3_is_busy(sc, cp))
1879 			return (0);
1880 	}
1881 	gp = sc->sc_sync.ds_geom;
1882 	LIST_FOREACH(cp, &gp->consumer, consumer) {
1883 		if (g_raid3_is_busy(sc, cp))
1884 			return (0);
1885 	}
1886 	G_RAID3_DEBUG(2, "No I/O requests for %s, it can be destroyed.",
1887 	    sc->sc_name);
1888 	return (1);
1889 }
1890 
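/*
 * Try to tear the device down; this only succeeds (returns 1) once
 * g_raid3_can_destroy() reports all consumers idle.  If a thread is
 * waiting in g_raid3_destroy(), wake it up and let it free the softc,
 * otherwise free it here.
 */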
1891 static int
1892 g_raid3_try_destroy(struct g_raid3_softc *sc)
1893 {
1894 
1895 	g_topology_assert_not();
1896 	sx_assert(&sc->sc_lock, SX_XLOCKED);
1897 
1898 	if (sc->sc_rootmount != NULL) {
1899 		G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
1900 		    sc->sc_rootmount);
1901 		root_mount_rel(sc->sc_rootmount);
1902 		sc->sc_rootmount = NULL;
1903 	}
1904 
1905 	g_topology_lock();
1906 	if (!g_raid3_can_destroy(sc)) {
1907 		g_topology_unlock();
1908 		return (0);
1909 	}
1910 	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_WAIT) != 0) {
1911 		g_topology_unlock();
1912 		G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
1913 		    &sc->sc_worker);
1914 		/* Unlock sc_lock here, as it can be destroyed after wakeup. */
1915 		sx_xunlock(&sc->sc_lock);
1916 		wakeup(&sc->sc_worker);
1917 		sc->sc_worker = NULL;
1918 	} else {
1919 		g_topology_unlock();
1920 		g_raid3_destroy_device(sc);
1921 		free(sc->sc_disks, M_RAID3);
1922 		free(sc, M_RAID3);
1923 	}
1924 	return (1);
1925 }
1926 
1927 /*
1928  * Worker thread.
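 *
 * The worker first handles pending state events (device and disk state
 * changes), then regular and synchronization I/O requests taken from
 * sc_queue.  It is also the only thread which may tear the device down
 * once G_RAID3_DEVICE_FLAG_DESTROY is set.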
1929  */
1930 static void
1931 g_raid3_worker(void *arg)
1932 {
1933 	struct g_raid3_softc *sc;
1934 	struct g_raid3_event *ep;
1935 	struct bio *bp;
1936 	int timeout;
1937 
1938 	sc = arg;
1939 	mtx_lock_spin(&sched_lock);
1940 	sched_prio(curthread, PRIBIO);
1941 	mtx_unlock_spin(&sched_lock);
1942 
1943 	sx_xlock(&sc->sc_lock);
1944 	for (;;) {
1945 		G_RAID3_DEBUG(5, "%s: Let's see...", __func__);
1946 		/*
1947 		 * First take a look at events.
1948 		 * It is important to handle events before any I/O requests.
1949 		 */
1950 		ep = g_raid3_event_get(sc);
1951 		if (ep != NULL) {
1952 			g_raid3_event_remove(sc, ep);
1953 			if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) {
1954 				/* Update only device status. */
1955 				G_RAID3_DEBUG(3,
1956 				    "Running event for device %s.",
1957 				    sc->sc_name);
1958 				ep->e_error = 0;
1959 				g_raid3_update_device(sc, 1);
1960 			} else {
1961 				/* Update disk status. */
1962 				G_RAID3_DEBUG(3, "Running event for disk %s.",
1963 				     g_raid3_get_diskname(ep->e_disk));
1964 				ep->e_error = g_raid3_update_disk(ep->e_disk,
1965 				    ep->e_state);
1966 				if (ep->e_error == 0)
1967 					g_raid3_update_device(sc, 0);
1968 			}
1969 			if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) {
1970 				KASSERT(ep->e_error == 0,
1971 				    ("Error cannot be handled."));
1972 				g_raid3_event_free(ep);
1973 			} else {
1974 				ep->e_flags |= G_RAID3_EVENT_DONE;
1975 				G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
1976 				    ep);
1977 				mtx_lock(&sc->sc_events_mtx);
1978 				wakeup(ep);
1979 				mtx_unlock(&sc->sc_events_mtx);
1980 			}
1981 			if ((sc->sc_flags &
1982 			    G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
1983 				if (g_raid3_try_destroy(sc)) {
1984 					curthread->td_pflags &= ~TDP_GEOM;
1985 					G_RAID3_DEBUG(1, "Thread exiting.");
1986 					kthread_exit(0);
1987 				}
1988 			}
1989 			G_RAID3_DEBUG(5, "%s: I'm here 1.", __func__);
1990 			continue;
1991 		}
1992 		/*
1993 		 * Check if we can mark the array as CLEAN and, if we can't,
1994 		 * how many seconds we should wait.
1995 		 */
1996 		timeout = g_raid3_idle(sc, -1);
1997 		/*
1998 		 * Now I/O requests.
1999 		 */
2000 		/* Get first request from the queue. */
2001 		mtx_lock(&sc->sc_queue_mtx);
2002 		bp = bioq_first(&sc->sc_queue);
2003 		if (bp == NULL) {
2004 			if (ep != NULL) {
2005 				/*
2006 				 * We have a pending event, try to serve it
2007 				 * again.
2008 				 */
2009 				mtx_unlock(&sc->sc_queue_mtx);
2010 				tsleep(ep, PRIBIO, "r3:top1", hz / 5);
2011 				continue;
2012 			}
2013 			if ((sc->sc_flags &
2014 			    G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
2015 				mtx_unlock(&sc->sc_queue_mtx);
2016 				if (g_raid3_try_destroy(sc)) {
2017 					curthread->td_pflags &= ~TDP_GEOM;
2018 					G_RAID3_DEBUG(1, "Thread exiting.");
2019 					kthread_exit(0);
2020 				}
2021 				mtx_lock(&sc->sc_queue_mtx);
2022 			}
2023 			sx_xunlock(&sc->sc_lock);
2024 			MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:w1",
2025 			    timeout * hz);
2026 			sx_xlock(&sc->sc_lock);
2027 			G_RAID3_DEBUG(5, "%s: I'm here 4.", __func__);
2028 			continue;
2029 		}
2030 		bioq_remove(&sc->sc_queue, bp);
2031 		mtx_unlock(&sc->sc_queue_mtx);
2032 
2033 		if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0)
2034 			g_raid3_regular_request(bp);
2035 		else if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0)
2036 			g_raid3_sync_request(bp);
2037 		else {
2038 			if (g_raid3_register_request(bp) != 0) {
2039 				mtx_lock(&sc->sc_queue_mtx);
2040 				bioq_insert_head(&sc->sc_queue, bp);
2041 				MSLEEP(&sc->sc_queue, &sc->sc_queue_mtx,
2042 				    PRIBIO | PDROP, "r3:lowmem", hz / 10);
2043 			}
2044 		}
2045 		G_RAID3_DEBUG(5, "%s: I'm here 9.", __func__);
2046 	}
2047 }
2048 
2049 static void
2050 g_raid3_update_idle(struct g_raid3_softc *sc, struct g_raid3_disk *disk)
2051 {
2052 
2053 	sx_assert(&sc->sc_lock, SX_LOCKED);
2054 	if (!sc->sc_idle && (disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) == 0) {
2055 		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
2056 		    g_raid3_get_diskname(disk), sc->sc_name);
2057 		disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
2058 	} else if (sc->sc_idle &&
2059 	    (disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0) {
2060 		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
2061 		    g_raid3_get_diskname(disk), sc->sc_name);
2062 		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2063 	}
2064 }
2065 
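/*
 * Start the synchronization process: pick the SYNCHRONIZING disk, open
 * a dedicated consumer on the sync geom attached to the raid3 provider
 * itself, and fire off the initial batch of synchronization reads.
 */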
2066 static void
2067 g_raid3_sync_start(struct g_raid3_softc *sc)
2068 {
2069 	struct g_raid3_disk *disk;
2070 	struct g_consumer *cp;
2071 	struct bio *bp;
2072 	int error;
2073 	u_int n;
2074 
2075 	g_topology_assert_not();
2076 	sx_assert(&sc->sc_lock, SX_XLOCKED);
2077 
2078 	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
2079 	    ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
2080 	    sc->sc_state));
2081 	KASSERT(sc->sc_syncdisk == NULL, ("Syncdisk is not NULL (%s, %u).",
2082 	    sc->sc_name, sc->sc_state));
2083 	disk = NULL;
2084 	for (n = 0; n < sc->sc_ndisks; n++) {
2085 		if (sc->sc_disks[n].d_state != G_RAID3_DISK_STATE_SYNCHRONIZING)
2086 			continue;
2087 		disk = &sc->sc_disks[n];
2088 		break;
2089 	}
2090 	if (disk == NULL)
2091 		return;
2092 
2093 	sx_xunlock(&sc->sc_lock);
2094 	g_topology_lock();
2095 	cp = g_new_consumer(sc->sc_sync.ds_geom);
2096 	error = g_attach(cp, sc->sc_provider);
2097 	KASSERT(error == 0,
2098 	    ("Cannot attach to %s (error=%d).", sc->sc_name, error));
2099 	error = g_access(cp, 1, 0, 0);
2100 	KASSERT(error == 0, ("Cannot open %s (error=%d).", sc->sc_name, error));
2101 	g_topology_unlock();
2102 	sx_xlock(&sc->sc_lock);
2103 
2104 	G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name,
2105 	    g_raid3_get_diskname(disk));
2106 	disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
2107 	KASSERT(disk->d_sync.ds_consumer == NULL,
2108 	    ("Sync consumer already exists (device=%s, disk=%s).",
2109 	    sc->sc_name, g_raid3_get_diskname(disk)));
2110 
2111 	disk->d_sync.ds_consumer = cp;
2112 	disk->d_sync.ds_consumer->private = disk;
2113 	disk->d_sync.ds_consumer->index = 0;
2114 	sc->sc_syncdisk = disk;
2115 
2116 	/*
2117 	 * Allocate memory for synchronization bios and initialize them.
2118 	 */
2119 	disk->d_sync.ds_bios = malloc(sizeof(struct bio *) * g_raid3_syncreqs,
2120 	    M_RAID3, M_WAITOK);
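	/*
	 * The synchronization bios read from the raid3 provider itself, so
	 * their offsets are in provider (data) space: with ds_offset being
	 * the per-component offset, the provider offset is
	 * ds_offset * (sc_ndisks - 1).  E.g. with 5 components (4 data +
	 * 1 parity), a 128kB read advances ds_offset by 32kB.
	 */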
2121 	for (n = 0; n < g_raid3_syncreqs; n++) {
2122 		bp = g_alloc_bio();
2123 		disk->d_sync.ds_bios[n] = bp;
2124 		bp->bio_parent = NULL;
2125 		bp->bio_cmd = BIO_READ;
2126 		bp->bio_data = malloc(MAXPHYS, M_RAID3, M_WAITOK);
2127 		bp->bio_cflags = 0;
2128 		bp->bio_offset = disk->d_sync.ds_offset * (sc->sc_ndisks - 1);
2129 		bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize -
		    bp->bio_offset);
2130 		disk->d_sync.ds_offset += bp->bio_length / (sc->sc_ndisks - 1);
2131 		bp->bio_done = g_raid3_sync_done;
2132 		bp->bio_from = disk->d_sync.ds_consumer;
2133 		bp->bio_to = sc->sc_provider;
2134 		bp->bio_caller1 = (void *)(uintptr_t)n;
2135 	}
2136 
2137 	/* Set the number of in-flight synchronization requests. */
2138 	disk->d_sync.ds_inflight = g_raid3_syncreqs;
2139 
2140 	/*
2141 	 * Fire off first synchronization requests.
2142 	 */
2143 	for (n = 0; n < g_raid3_syncreqs; n++) {
2144 		bp = disk->d_sync.ds_bios[n];
2145 		G_RAID3_LOGREQ(3, bp, "Sending synchronization request.");
2146 		disk->d_sync.ds_consumer->index++;
2147 		/*
2148 		 * Delay the request if it is colliding with a regular request.
2149 		 */
2150 		if (g_raid3_regular_collision(sc, bp))
2151 			g_raid3_sync_delay(sc, bp);
2152 		else
2153 			g_io_request(bp, disk->d_sync.ds_consumer);
2154 	}
2155 }
2156 
2157 /*
2158  * Stop synchronization process.
2159  * type: 0 - synchronization finished
2160  *       1 - synchronization stopped
2161  */
2162 static void
2163 g_raid3_sync_stop(struct g_raid3_softc *sc, int type)
2164 {
2165 	struct g_raid3_disk *disk;
2166 	struct g_consumer *cp;
2167 
2168 	g_topology_assert_not();
2169 	sx_assert(&sc->sc_lock, SX_LOCKED);
2170 
2171 	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
2172 	    ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
2173 	    sc->sc_state));
2174 	disk = sc->sc_syncdisk;
2175 	sc->sc_syncdisk = NULL;
2176 	KASSERT(disk != NULL, ("No disk was synchronized (%s).", sc->sc_name));
2177 	KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
2178 	    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2179 	    g_raid3_disk_state2str(disk->d_state)));
2180 	if (disk->d_sync.ds_consumer == NULL)
2181 		return;
2182 
2183 	if (type == 0) {
2184 		G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s finished.",
2185 		    sc->sc_name, g_raid3_get_diskname(disk));
2186 	} else /* if (type == 1) */ {
2187 		G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s stopped.",
2188 		    sc->sc_name, g_raid3_get_diskname(disk));
2189 	}
2190 	free(disk->d_sync.ds_bios, M_RAID3);
2191 	disk->d_sync.ds_bios = NULL;
2192 	cp = disk->d_sync.ds_consumer;
2193 	disk->d_sync.ds_consumer = NULL;
2194 	disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2195 	sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */
2196 	g_topology_lock();
2197 	g_raid3_kill_consumer(sc, cp);
2198 	g_topology_unlock();
2199 	sx_xlock(&sc->sc_lock);
2200 }
2201 
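/*
 * Create and announce the raid3/<name> provider.  If the device comes
 * up in the DEGRADED state, kick off synchronization of the
 * SYNCHRONIZING component (if any) right away.
 */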
2202 static void
2203 g_raid3_launch_provider(struct g_raid3_softc *sc)
2204 {
2205 	struct g_provider *pp;
2206 
2207 	sx_assert(&sc->sc_lock, SX_LOCKED);
2208 
2209 	g_topology_lock();
2210 	pp = g_new_providerf(sc->sc_geom, "raid3/%s", sc->sc_name);
2211 	pp->mediasize = sc->sc_mediasize;
2212 	pp->sectorsize = sc->sc_sectorsize;
2213 	sc->sc_provider = pp;
2214 	g_error_provider(pp, 0);
2215 	g_topology_unlock();
2216 	G_RAID3_DEBUG(0, "Device %s: provider %s launched.", sc->sc_name,
2217 	    pp->name);
2218 	if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED)
2219 		g_raid3_sync_start(sc);
2220 }
2221 
2222 static void
2223 g_raid3_destroy_provider(struct g_raid3_softc *sc)
2224 {
2225 	struct bio *bp;
2226 
2227 	g_topology_assert_not();
2228 	KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).",
2229 	    sc->sc_name));
2230 
2231 	g_topology_lock();
2232 	g_error_provider(sc->sc_provider, ENXIO);
2233 	mtx_lock(&sc->sc_queue_mtx);
2234 	while ((bp = bioq_first(&sc->sc_queue)) != NULL) {
2235 		bioq_remove(&sc->sc_queue, bp);
2236 		g_io_deliver(bp, ENXIO);
2237 	}
2238 	mtx_unlock(&sc->sc_queue_mtx);
2239 	G_RAID3_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name,
2240 	    sc->sc_provider->name);
2241 	sc->sc_provider->flags |= G_PF_WITHER;
2242 	g_orphan_provider(sc->sc_provider, ENXIO);
2243 	g_topology_unlock();
2244 	sc->sc_provider = NULL;
2245 	if (sc->sc_syncdisk != NULL)
2246 		g_raid3_sync_stop(sc, 1);
2247 }
2248 
2249 static void
2250 g_raid3_go(void *arg)
2251 {
2252 	struct g_raid3_softc *sc;
2253 
2254 	sc = arg;
2255 	G_RAID3_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name);
2256 	g_raid3_event_send(sc, 0,
2257 	    G_RAID3_EVENT_DONTWAIT | G_RAID3_EVENT_DEVICE);
2258 }
2259 
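/*
 * Determine the initial state of a disk by comparing its syncid with
 * the device syncid:
 * - equal: ACTIVE, or SYNCHRONIZING/STALE when the disk is marked for
 *   synchronization;
 * - disk older: restart synchronization from offset 0 (or mark the
 *   disk STALE when autosynchronization is disabled);
 * - disk newer: the running device is stale, so refuse the disk and
 *   destroy it.
 */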
2260 static u_int
2261 g_raid3_determine_state(struct g_raid3_disk *disk)
2262 {
2263 	struct g_raid3_softc *sc;
2264 	u_int state;
2265 
2266 	sc = disk->d_softc;
2267 	if (sc->sc_syncid == disk->d_sync.ds_syncid) {
2268 		if ((disk->d_flags &
2269 		    G_RAID3_DISK_FLAG_SYNCHRONIZING) == 0) {
2270 			/* Disk does not need synchronization. */
2271 			state = G_RAID3_DISK_STATE_ACTIVE;
2272 		} else {
2273 			if ((sc->sc_flags &
2274 			     G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
2275 			    (disk->d_flags &
2276 			     G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
2277 				/*
2278 				 * We can start synchronization from
2279 				 * the stored offset.
2280 				 */
2281 				state = G_RAID3_DISK_STATE_SYNCHRONIZING;
2282 			} else {
2283 				state = G_RAID3_DISK_STATE_STALE;
2284 			}
2285 		}
2286 	} else if (disk->d_sync.ds_syncid < sc->sc_syncid) {
2287 		/*
2288 		 * Reset all synchronization data for this disk,
2289 		 * because even if it was synchronized, it was
2290 		 * synchronized to disks with a different syncid.
2291 		 */
2292 		disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
2293 		disk->d_sync.ds_offset = 0;
2294 		disk->d_sync.ds_offset_done = 0;
2295 		disk->d_sync.ds_syncid = sc->sc_syncid;
2296 		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
2297 		    (disk->d_flags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
2298 			state = G_RAID3_DISK_STATE_SYNCHRONIZING;
2299 		} else {
2300 			state = G_RAID3_DISK_STATE_STALE;
2301 		}
2302 	} else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ {
2303 		/*
2304 		 * Not good, NOT GOOD!
2305 		 * It means that the device was started on stale disks
2306 		 * and a fresher disk has just arrived.
2307 		 * If there were writes, the device's data is now inconsistent.
2308 		 * The best choice here is not to touch this disk and to
2309 		 * inform the user loudly.
2310 		 */
2311 		G_RAID3_DEBUG(0, "Device %s was started before the freshest "
2312 		    "disk (%s) arrived! It will not be connected to the "
2313 		    "running device.", sc->sc_name,
2314 		    g_raid3_get_diskname(disk));
2315 		g_raid3_destroy_disk(disk);
2316 		state = G_RAID3_DISK_STATE_NONE;
2317 		/* Return immediately, because disk was destroyed. */
2318 		return (state);
2319 	}
2320 	G_RAID3_DEBUG(3, "State for %s disk: %s.",
2321 	    g_raid3_get_diskname(disk), g_raid3_disk_state2str(state));
2322 	return (state);
2323 }
2324 
2325 /*
2326  * Update device state.
2327  */
2328 static void
2329 g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force)
2330 {
2331 	struct g_raid3_disk *disk;
2332 	u_int state;
2333 
2334 	sx_assert(&sc->sc_lock, SX_XLOCKED);
2335 
2336 	switch (sc->sc_state) {
2337 	case G_RAID3_DEVICE_STATE_STARTING:
2338 	    {
2339 		u_int n, ndirty, ndisks, genid, syncid;
2340 
2341 		KASSERT(sc->sc_provider == NULL,
2342 		    ("Non-NULL provider in STARTING state (%s).", sc->sc_name));
2343 		/*
2344 		 * Are we ready?  We are if all disks are connected, or if
2345 		 * one disk is missing and 'force' is true.
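		 * For example, a 3-component array starts as soon as all
		 * three components taste, or with only two of them once
		 * the startup timeout fires and g_raid3_go() requests a
		 * forced start.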
2346 		 */
2347 		if (g_raid3_ndisks(sc, -1) + force == sc->sc_ndisks) {
2348 			if (!force)
2349 				callout_drain(&sc->sc_callout);
2350 		} else {
2351 			if (force) {
2352 				/*
2353 				 * Timeout expired, so destroy device.
2354 				 */
2355 				sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
2356 				G_RAID3_DEBUG(1, "root_mount_rel[%u] %p",
2357 				    __LINE__, sc->sc_rootmount);
2358 				root_mount_rel(sc->sc_rootmount);
2359 				sc->sc_rootmount = NULL;
2360 			}
2361 			return;
2362 		}
2363 
2364 		/*
2365 		 * Find the biggest genid.
2366 		 */
2367 		genid = 0;
2368 		for (n = 0; n < sc->sc_ndisks; n++) {
2369 			disk = &sc->sc_disks[n];
2370 			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
2371 				continue;
2372 			if (disk->d_genid > genid)
2373 				genid = disk->d_genid;
2374 		}
2375 		sc->sc_genid = genid;
2376 		/*
2377 		 * Remove all disks without the biggest genid.
2378 		 */
2379 		for (n = 0; n < sc->sc_ndisks; n++) {
2380 			disk = &sc->sc_disks[n];
2381 			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
2382 				continue;
2383 			if (disk->d_genid < genid) {
2384 				G_RAID3_DEBUG(0,
2385 				    "Component %s (device %s) broken, skipping.",
2386 				    g_raid3_get_diskname(disk), sc->sc_name);
2387 				g_raid3_destroy_disk(disk);
2388 			}
2389 		}
2390 
2391 		/*
2392 		 * There must be at least 'sc->sc_ndisks - 1' components
2393 		 * with the same syncid and without SYNCHRONIZING flag.
2394 		 */
2395 
2396 		/*
2397 		 * Find the biggest syncid, number of valid components and
2398 		 * number of dirty components.
2399 		 */
2400 		ndirty = ndisks = syncid = 0;
2401 		for (n = 0; n < sc->sc_ndisks; n++) {
2402 			disk = &sc->sc_disks[n];
2403 			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
2404 				continue;
2405 			if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0)
2406 				ndirty++;
2407 			if (disk->d_sync.ds_syncid > syncid) {
2408 				syncid = disk->d_sync.ds_syncid;
2409 				ndisks = 0;
2410 			} else if (disk->d_sync.ds_syncid < syncid) {
2411 				continue;
2412 			}
2413 			if ((disk->d_flags &
2414 			    G_RAID3_DISK_FLAG_SYNCHRONIZING) != 0) {
2415 				continue;
2416 			}
2417 			ndisks++;
2418 		}
2419 		/*
2420 		 * Do we have enough valid components?
2421 		 */
2422 		if (ndisks + 1 < sc->sc_ndisks) {
2423 			G_RAID3_DEBUG(0,
2424 			    "Device %s is broken, too few valid components.",
2425 			    sc->sc_name);
2426 			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
2427 			return;
2428 		}
2429 		/*
2430 		 * If there is one DIRTY component and all disks are present,
2431 		 * mark it for synchronization. If there is more than one DIRTY
2432 		 * component, mark parity component for synchronization.
2433 		 */
2434 		if (ndisks == sc->sc_ndisks && ndirty == 1) {
2435 			for (n = 0; n < sc->sc_ndisks; n++) {
2436 				disk = &sc->sc_disks[n];
2437 				if ((disk->d_flags &
2438 				    G_RAID3_DISK_FLAG_DIRTY) == 0) {
2439 					continue;
2440 				}
2441 				disk->d_flags |=
2442 				    G_RAID3_DISK_FLAG_SYNCHRONIZING;
2443 			}
2444 		} else if (ndisks == sc->sc_ndisks && ndirty > 1) {
2445 			disk = &sc->sc_disks[sc->sc_ndisks - 1];
2446 			disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
2447 		}
2448 
2449 		sc->sc_syncid = syncid;
2450 		if (force) {
2451 			/* Remember to bump syncid on first write. */
2452 			sc->sc_bump_id |= G_RAID3_BUMP_SYNCID;
2453 		}
2454 		if (ndisks == sc->sc_ndisks)
2455 			state = G_RAID3_DEVICE_STATE_COMPLETE;
2456 		else /* if (ndisks == sc->sc_ndisks - 1) */
2457 			state = G_RAID3_DEVICE_STATE_DEGRADED;
2458 		G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.",
2459 		    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
2460 		    g_raid3_device_state2str(state));
2461 		sc->sc_state = state;
2462 		for (n = 0; n < sc->sc_ndisks; n++) {
2463 			disk = &sc->sc_disks[n];
2464 			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
2465 				continue;
2466 			state = g_raid3_determine_state(disk);
2467 			g_raid3_event_send(disk, state, G_RAID3_EVENT_DONTWAIT);
2468 			if (state == G_RAID3_DISK_STATE_STALE)
2469 				sc->sc_bump_id |= G_RAID3_BUMP_SYNCID;
2470 		}
2471 		break;
2472 	    }
2473 	case G_RAID3_DEVICE_STATE_DEGRADED:
2474 		/*
2475 		 * Genid needs to be bumped immediately, so do it here.
2476 		 */
2477 		if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) {
2478 			sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
2479 			g_raid3_bump_genid(sc);
2480 		}
2481 
2482 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
2483 			return;
2484 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) <
2485 		    sc->sc_ndisks - 1) {
2486 			if (sc->sc_provider != NULL)
2487 				g_raid3_destroy_provider(sc);
2488 			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
2489 			return;
2490 		}
2491 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
2492 		    sc->sc_ndisks) {
2493 			state = G_RAID3_DEVICE_STATE_COMPLETE;
2494 			G_RAID3_DEBUG(1,
2495 			    "Device %s state changed from %s to %s.",
2496 			    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
2497 			    g_raid3_device_state2str(state));
2498 			sc->sc_state = state;
2499 		}
2500 		if (sc->sc_provider == NULL)
2501 			g_raid3_launch_provider(sc);
2502 		if (sc->sc_rootmount != NULL) {
2503 			G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
2504 			    sc->sc_rootmount);
2505 			root_mount_rel(sc->sc_rootmount);
2506 			sc->sc_rootmount = NULL;
2507 		}
2508 		break;
2509 	case G_RAID3_DEVICE_STATE_COMPLETE:
2510 		/*
2511 		 * Genid needs to be bumped immediately, so do it here.
2512 		 */
2513 		if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) {
2514 			sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
2515 			g_raid3_bump_genid(sc);
2516 		}
2517 
2518 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
2519 			return;
2520 		KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) >=
2521 		    sc->sc_ndisks - 1,
2522 		    ("Too few ACTIVE components in COMPLETE state (device %s).",
2523 		    sc->sc_name));
2524 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
2525 		    sc->sc_ndisks - 1) {
2526 			state = G_RAID3_DEVICE_STATE_DEGRADED;
2527 			G_RAID3_DEBUG(1,
2528 			    "Device %s state changed from %s to %s.",
2529 			    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
2530 			    g_raid3_device_state2str(state));
2531 			sc->sc_state = state;
2532 		}
2533 		if (sc->sc_provider == NULL)
2534 			g_raid3_launch_provider(sc);
2535 		if (sc->sc_rootmount != NULL) {
2536 			G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
2537 			    sc->sc_rootmount);
2538 			root_mount_rel(sc->sc_rootmount);
2539 			sc->sc_rootmount = NULL;
2540 		}
2541 		break;
2542 	default:
2543 		KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name,
2544 		    g_raid3_device_state2str(sc->sc_state)));
2545 		break;
2546 	}
2547 }
2548 
2549 /*
2550  * Update disk state and device state if needed.
2551  */
2552 #define	DISK_STATE_CHANGED()	G_RAID3_DEBUG(1,			\
2553 	"Disk %s state changed from %s to %s (device %s).",		\
2554 	g_raid3_get_diskname(disk),					\
2555 	g_raid3_disk_state2str(disk->d_state),				\
2556 	g_raid3_disk_state2str(state), sc->sc_name)
2557 static int
2558 g_raid3_update_disk(struct g_raid3_disk *disk, u_int state)
2559 {
2560 	struct g_raid3_softc *sc;
2561 
2562 	sc = disk->d_softc;
2563 	sx_assert(&sc->sc_lock, SX_XLOCKED);
2564 
2565 again:
2566 	G_RAID3_DEBUG(3, "Changing disk %s state from %s to %s.",
2567 	    g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state),
2568 	    g_raid3_disk_state2str(state));
2569 	switch (state) {
2570 	case G_RAID3_DISK_STATE_NEW:
2571 		/*
2572 		 * Possible scenarios:
2573 		 * 1. A new disk arrives.
2574 		 */
2575 		/* Previous state should be NONE. */
2576 		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NONE,
2577 		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2578 		    g_raid3_disk_state2str(disk->d_state)));
2579 		DISK_STATE_CHANGED();
2580 
2581 		disk->d_state = state;
2582 		G_RAID3_DEBUG(0, "Device %s: provider %s detected.",
2583 		    sc->sc_name, g_raid3_get_diskname(disk));
2584 		if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING)
2585 			break;
2586 		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2587 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2588 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2589 		    g_raid3_device_state2str(sc->sc_state),
2590 		    g_raid3_get_diskname(disk),
2591 		    g_raid3_disk_state2str(disk->d_state)));
2592 		state = g_raid3_determine_state(disk);
2593 		if (state != G_RAID3_DISK_STATE_NONE)
2594 			goto again;
2595 		break;
2596 	case G_RAID3_DISK_STATE_ACTIVE:
2597 		/*
2598 		 * Possible scenarios:
2599 		 * 1. New disk does not need synchronization.
2600 		 * 2. Synchronization process finished successfully.
2601 		 */
2602 		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2603 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2604 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2605 		    g_raid3_device_state2str(sc->sc_state),
2606 		    g_raid3_get_diskname(disk),
2607 		    g_raid3_disk_state2str(disk->d_state)));
2608 		/* Previous state should be NEW or SYNCHRONIZING. */
2609 		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW ||
2610 		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
2611 		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2612 		    g_raid3_disk_state2str(disk->d_state)));
2613 		DISK_STATE_CHANGED();
2614 
2615 		if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
2616 			disk->d_flags &= ~G_RAID3_DISK_FLAG_SYNCHRONIZING;
2617 			disk->d_flags &= ~G_RAID3_DISK_FLAG_FORCE_SYNC;
2618 			g_raid3_sync_stop(sc, 0);
2619 		}
2620 		disk->d_state = state;
2621 		disk->d_sync.ds_offset = 0;
2622 		disk->d_sync.ds_offset_done = 0;
2623 		g_raid3_update_idle(sc, disk);
2624 		g_raid3_update_metadata(disk);
2625 		G_RAID3_DEBUG(0, "Device %s: provider %s activated.",
2626 		    sc->sc_name, g_raid3_get_diskname(disk));
2627 		break;
2628 	case G_RAID3_DISK_STATE_STALE:
2629 		/*
2630 		 * Possible scenarios:
2631 		 * 1. Stale disk was connected.
2632 		 */
2633 		/* Previous state should be NEW. */
2634 		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
2635 		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2636 		    g_raid3_disk_state2str(disk->d_state)));
2637 		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2638 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2639 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2640 		    g_raid3_device_state2str(sc->sc_state),
2641 		    g_raid3_get_diskname(disk),
2642 		    g_raid3_disk_state2str(disk->d_state)));
2643 		/*
2644 		 * STALE state is only possible if device is marked
2645 		 * NOAUTOSYNC.
2646 		 */
2647 		KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0,
2648 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2649 		    g_raid3_device_state2str(sc->sc_state),
2650 		    g_raid3_get_diskname(disk),
2651 		    g_raid3_disk_state2str(disk->d_state)));
2652 		DISK_STATE_CHANGED();
2653 
2654 		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2655 		disk->d_state = state;
2656 		g_raid3_update_metadata(disk);
2657 		G_RAID3_DEBUG(0, "Device %s: provider %s is stale.",
2658 		    sc->sc_name, g_raid3_get_diskname(disk));
2659 		break;
2660 	case G_RAID3_DISK_STATE_SYNCHRONIZING:
2661 		/*
2662 		 * Possible scenarios:
2663 		 * 1. Disk which needs synchronization was connected.
2664 		 */
2665 		/* Previous state should be NEW. */
2666 		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
2667 		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2668 		    g_raid3_disk_state2str(disk->d_state)));
2669 		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2670 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2671 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2672 		    g_raid3_device_state2str(sc->sc_state),
2673 		    g_raid3_get_diskname(disk),
2674 		    g_raid3_disk_state2str(disk->d_state)));
2675 		DISK_STATE_CHANGED();
2676 
2677 		if (disk->d_state == G_RAID3_DISK_STATE_NEW)
2678 			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2679 		disk->d_state = state;
2680 		if (sc->sc_provider != NULL) {
2681 			g_raid3_sync_start(sc);
2682 			g_raid3_update_metadata(disk);
2683 		}
2684 		break;
2685 	case G_RAID3_DISK_STATE_DISCONNECTED:
2686 		/*
2687 		 * Possible scenarios:
2688 		 * 1. Device wasn't running yet, but disk disappeared.
2689 		 * 2. Disk was active and disappeared.
2690 		 * 3. Disk disappeared during synchronization process.
2691 		 */
2692 		if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2693 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
2694 			/*
2695 			 * Previous state should be ACTIVE, STALE or
2696 			 * SYNCHRONIZING.
2697 			 */
2698 			KASSERT(disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
2699 			    disk->d_state == G_RAID3_DISK_STATE_STALE ||
2700 			    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
2701 			    ("Wrong disk state (%s, %s).",
2702 			    g_raid3_get_diskname(disk),
2703 			    g_raid3_disk_state2str(disk->d_state)));
2704 		} else if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) {
2705 			/* Previous state should be NEW. */
2706 			KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
2707 			    ("Wrong disk state (%s, %s).",
2708 			    g_raid3_get_diskname(disk),
2709 			    g_raid3_disk_state2str(disk->d_state)));
2710 			/*
2711 			 * Reset bumping syncid if disk disappeared in STARTING
2712 			 * state.
2713 			 */
2714 			if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0)
2715 				sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
2716 #ifdef	INVARIANTS
2717 		} else {
2718 			KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).",
2719 			    sc->sc_name,
2720 			    g_raid3_device_state2str(sc->sc_state),
2721 			    g_raid3_get_diskname(disk),
2722 			    g_raid3_disk_state2str(disk->d_state)));
2723 #endif
2724 		}
2725 		DISK_STATE_CHANGED();
2726 		G_RAID3_DEBUG(0, "Device %s: provider %s disconnected.",
2727 		    sc->sc_name, g_raid3_get_diskname(disk));
2728 
2729 		g_raid3_destroy_disk(disk);
2730 		break;
2731 	default:
2732 		KASSERT(1 == 0, ("Unknown state (%u).", state));
2733 		break;
2734 	}
2735 	return (0);
2736 }
2737 #undef	DISK_STATE_CHANGED
2738 
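/*
 * Read the last sector of the consumer's provider and decode it as
 * raid3 metadata.  Fails if the sector cannot be read, the magic does
 * not match, the on-disk version is newer than this module, or the MD5
 * checksum is wrong.
 */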
2739 int
2740 g_raid3_read_metadata(struct g_consumer *cp, struct g_raid3_metadata *md)
2741 {
2742 	struct g_provider *pp;
2743 	u_char *buf;
2744 	int error;
2745 
2746 	g_topology_assert();
2747 
2748 	error = g_access(cp, 1, 0, 0);
2749 	if (error != 0)
2750 		return (error);
2751 	pp = cp->provider;
2752 	g_topology_unlock();
2753 	/* Metadata is stored in the last sector. */
2754 	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
2755 	    &error);
2756 	g_topology_lock();
2757 	g_access(cp, -1, 0, 0);
2758 	if (buf == NULL) {
2759 		G_RAID3_DEBUG(1, "Cannot read metadata from %s (error=%d).",
2760 		    cp->provider->name, error);
2761 		return (error);
2762 	}
2763 
2764 	/* Decode metadata. */
2765 	error = raid3_metadata_decode(buf, md);
2766 	g_free(buf);
2767 	if (strcmp(md->md_magic, G_RAID3_MAGIC) != 0)
2768 		return (EINVAL);
2769 	if (md->md_version > G_RAID3_VERSION) {
2770 		G_RAID3_DEBUG(0,
2771 		    "Kernel module is too old to handle metadata from %s.",
2772 		    cp->provider->name);
2773 		return (EINVAL);
2774 	}
2775 	if (error != 0) {
2776 		G_RAID3_DEBUG(1, "MD5 metadata hash mismatch for provider %s.",
2777 		    cp->provider->name);
2778 		return (error);
2779 	}
2780 
2781 	return (0);
2782 }
2783 
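/*
 * Sanity-check metadata read from a component against the already
 * configured device: component number, media size, sector size and
 * device/disk flags all have to be consistent before the disk may be
 * connected.
 */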
2784 static int
2785 g_raid3_check_metadata(struct g_raid3_softc *sc, struct g_provider *pp,
2786     struct g_raid3_metadata *md)
2787 {
2788 
2789 	if (md->md_no >= sc->sc_ndisks) {
2790 		G_RAID3_DEBUG(1, "Invalid disk %s number (no=%u), skipping.",
2791 		    pp->name, md->md_no);
2792 		return (EINVAL);
2793 	}
2794 	if (sc->sc_disks[md->md_no].d_state != G_RAID3_DISK_STATE_NODISK) {
2795 		G_RAID3_DEBUG(1, "Disk %s (no=%u) already exists, skipping.",
2796 		    pp->name, md->md_no);
2797 		return (EEXIST);
2798 	}
2799 	if (md->md_all != sc->sc_ndisks) {
2800 		G_RAID3_DEBUG(1,
2801 		    "Invalid '%s' field on disk %s (device %s), skipping.",
2802 		    "md_all", pp->name, sc->sc_name);
2803 		return (EINVAL);
2804 	}
2805 	if (md->md_mediasize != sc->sc_mediasize) {
2806 		G_RAID3_DEBUG(1,
2807 		    "Invalid '%s' field on disk %s (device %s), skipping.",
2808 		    "md_mediasize", pp->name, sc->sc_name);
2809 		return (EINVAL);
2810 	}
2811 	if ((md->md_mediasize % (sc->sc_ndisks - 1)) != 0) {
2812 		G_RAID3_DEBUG(1,
2813 		    "Invalid '%s' field on disk %s (device %s), skipping.",
2814 		    "md_mediasize", pp->name, sc->sc_name);
2815 		return (EINVAL);
2816 	}
2817 	if ((sc->sc_mediasize / (sc->sc_ndisks - 1)) > pp->mediasize) {
2818 		G_RAID3_DEBUG(1,
2819 		    "Invalid size of disk %s (device %s), skipping.", pp->name,
2820 		    sc->sc_name);
2821 		return (EINVAL);
2822 	}
2823 	if ((md->md_sectorsize / pp->sectorsize) < sc->sc_ndisks - 1) {
2824 		G_RAID3_DEBUG(1,
2825 		    "Invalid '%s' field on disk %s (device %s), skipping.",
2826 		    "md_sectorsize", pp->name, sc->sc_name);
2827 		return (EINVAL);
2828 	}
2829 	if (md->md_sectorsize != sc->sc_sectorsize) {
2830 		G_RAID3_DEBUG(1,
2831 		    "Invalid '%s' field on disk %s (device %s), skipping.",
2832 		    "md_sectorsize", pp->name, sc->sc_name);
2833 		return (EINVAL);
2834 	}
2835 	if ((sc->sc_sectorsize % pp->sectorsize) != 0) {
2836 		G_RAID3_DEBUG(1,
2837 		    "Invalid sector size of disk %s (device %s), skipping.",
2838 		    pp->name, sc->sc_name);
2839 		return (EINVAL);
2840 	}
2841 	if ((md->md_mflags & ~G_RAID3_DEVICE_FLAG_MASK) != 0) {
2842 		G_RAID3_DEBUG(1,
2843 		    "Invalid device flags on disk %s (device %s), skipping.",
2844 		    pp->name, sc->sc_name);
2845 		return (EINVAL);
2846 	}
2847 	if ((md->md_mflags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
2848 	    (md->md_mflags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0) {
2849 		/*
2850 		 * VERIFY and ROUND-ROBIN options are mutually exclusive.
2851 		 */
2852 		G_RAID3_DEBUG(1, "Both VERIFY and ROUND-ROBIN flags exist on "
2853 		    "disk %s (device %s), skipping.", pp->name, sc->sc_name);
2854 		return (EINVAL);
2855 	}
2856 	if ((md->md_dflags & ~G_RAID3_DISK_FLAG_MASK) != 0) {
2857 		G_RAID3_DEBUG(1,
2858 		    "Invalid disk flags on disk %s (device %s), skipping.",
2859 		    pp->name, sc->sc_name);
2860 		return (EINVAL);
2861 	}
2862 	return (0);
2863 }
2864 
2865 int
2866 g_raid3_add_disk(struct g_raid3_softc *sc, struct g_provider *pp,
2867     struct g_raid3_metadata *md)
2868 {
2869 	struct g_raid3_disk *disk;
2870 	int error;
2871 
2872 	g_topology_assert_not();
2873 	G_RAID3_DEBUG(2, "Adding disk %s.", pp->name);
2874 
2875 	error = g_raid3_check_metadata(sc, pp, md);
2876 	if (error != 0)
2877 		return (error);
2878 	if (sc->sc_state != G_RAID3_DEVICE_STATE_STARTING &&
2879 	    md->md_genid < sc->sc_genid) {
2880 		G_RAID3_DEBUG(0, "Component %s (device %s) broken, skipping.",
2881 		    pp->name, sc->sc_name);
2882 		return (EINVAL);
2883 	}
2884 	disk = g_raid3_init_disk(sc, pp, md, &error);
2885 	if (disk == NULL)
2886 		return (error);
2887 	error = g_raid3_event_send(disk, G_RAID3_DISK_STATE_NEW,
2888 	    G_RAID3_EVENT_WAIT);
2889 	if (error != 0)
2890 		return (error);
2891 	if (md->md_version < G_RAID3_VERSION) {
2892 		G_RAID3_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).",
2893 		    pp->name, md->md_version, G_RAID3_VERSION);
2894 		g_raid3_update_metadata(disk);
2895 	}
2896 	return (0);
2897 }
2898 
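/*
 * Handle an access request for the raid3 provider.  The acr/acw/ace
 * arguments are deltas; dcr/dcw/dce are the resulting absolute counts.
 * Opening fails with ENXIO while the device is being destroyed or has
 * too few ACTIVE components.  When the last writer closes (dcw == 0),
 * the device is marked idle so the components can be marked clean.
 */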
2899 static int
2900 g_raid3_access(struct g_provider *pp, int acr, int acw, int ace)
2901 {
2902 	struct g_raid3_softc *sc;
2903 	int dcr, dcw, dce, error;
2904 
2905 	g_topology_assert();
2906 	G_RAID3_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr,
2907 	    acw, ace);
2908 
2909 	dcr = pp->acr + acr;
2910 	dcw = pp->acw + acw;
2911 	dce = pp->ace + ace;
2912 
2913 	error = 0;
2914 	sc = pp->geom->softc;
2915 	if (sc != NULL) {
2916 		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0)
2917 			sc = NULL;
2918 		else {
2919 			g_topology_unlock();
2920 			sx_xlock(&sc->sc_lock);
2921 		}
2922 	}
2923 	if (sc == NULL ||
2924 	    g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks - 1) {
2925 		if (acr > 0 || acw > 0 || ace > 0)
2926 			error = ENXIO;
2927 		goto end;
2928 	}
2929 	if (dcw == 0 && !sc->sc_idle)
2930 		g_raid3_idle(sc, dcw);
2931 end:
2932 	if (sc != NULL) {
2933 		sx_xunlock(&sc->sc_lock);
2934 		g_topology_lock();
2935 	}
2936 	return (error);
2937 }
2938 
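/*
 * Create the device: the "action" geom which owns the provider and
 * handles regular I/O, a second geom whose consumers are used for
 * synchronization, the UMA zones for request data, and the worker
 * thread.  A callout is armed so the device is force-started (or
 * destroyed, if too few components showed up) when the
 * kern.geom.raid3.timeout period expires.
 */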
2939 static struct g_geom *
2940 g_raid3_create(struct g_class *mp, const struct g_raid3_metadata *md)
2941 {
2942 	struct g_raid3_softc *sc;
2943 	struct g_geom *gp;
2944 	int error, timeout;
2945 	u_int n;
2946 
2947 	g_topology_assert();
2948 	G_RAID3_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id);
2949 
2950 	/* At least one disk is required. */
2951 	if (md->md_all < 1)
2952 		return (NULL);
2953 	/*
2954 	 * Action geom.
2955 	 */
2956 	gp = g_new_geomf(mp, "%s", md->md_name);
2957 	sc = malloc(sizeof(*sc), M_RAID3, M_WAITOK | M_ZERO);
2958 	sc->sc_disks = malloc(sizeof(struct g_raid3_disk) * md->md_all, M_RAID3,
2959 	    M_WAITOK | M_ZERO);
2960 	gp->start = g_raid3_start;
2961 	gp->orphan = g_raid3_orphan;
2962 	gp->access = g_raid3_access;
2963 	gp->dumpconf = g_raid3_dumpconf;
2964 
2965 	sc->sc_id = md->md_id;
2966 	sc->sc_mediasize = md->md_mediasize;
2967 	sc->sc_sectorsize = md->md_sectorsize;
2968 	sc->sc_ndisks = md->md_all;
2969 	sc->sc_round_robin = 0;
2970 	sc->sc_flags = md->md_mflags;
2971 	sc->sc_bump_id = 0;
2972 	sc->sc_idle = 1;
2973 	sc->sc_last_write = time_uptime;
2974 	sc->sc_writes = 0;
2975 	for (n = 0; n < sc->sc_ndisks; n++) {
2976 		sc->sc_disks[n].d_softc = sc;
2977 		sc->sc_disks[n].d_no = n;
2978 		sc->sc_disks[n].d_state = G_RAID3_DISK_STATE_NODISK;
2979 	}
2980 	sx_init(&sc->sc_lock, "graid3:lock");
2981 	bioq_init(&sc->sc_queue);
2982 	mtx_init(&sc->sc_queue_mtx, "graid3:queue", NULL, MTX_DEF);
2983 	bioq_init(&sc->sc_regular_delayed);
2984 	bioq_init(&sc->sc_inflight);
2985 	bioq_init(&sc->sc_sync_delayed);
2986 	TAILQ_INIT(&sc->sc_events);
2987 	mtx_init(&sc->sc_events_mtx, "graid3:events", NULL, MTX_DEF);
2988 	callout_init(&sc->sc_callout, CALLOUT_MPSAFE);
2989 	sc->sc_state = G_RAID3_DEVICE_STATE_STARTING;
2990 	gp->softc = sc;
2991 	sc->sc_geom = gp;
2992 	sc->sc_provider = NULL;
2993 	/*
2994 	 * Synchronization geom.
2995 	 */
2996 	gp = g_new_geomf(mp, "%s.sync", md->md_name);
2997 	gp->softc = sc;
2998 	gp->orphan = g_raid3_orphan;
2999 	sc->sc_sync.ds_geom = gp;
3000 
3001 	sc->sc_zones[G_RAID3_ZONE_64K].sz_zone = uma_zcreate("gr3:64k", 65536,
3002 	    g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL, UMA_ALIGN_PTR, 0);
3003 	sc->sc_zones[G_RAID3_ZONE_64K].sz_inuse = 0;
3004 	sc->sc_zones[G_RAID3_ZONE_64K].sz_max = g_raid3_n64k;
3005 	sc->sc_zones[G_RAID3_ZONE_64K].sz_requested =
3006 	    sc->sc_zones[G_RAID3_ZONE_64K].sz_failed = 0;
3007 	sc->sc_zones[G_RAID3_ZONE_16K].sz_zone = uma_zcreate("gr3:16k", 16384,
3008 	    g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL, UMA_ALIGN_PTR, 0);
3009 	sc->sc_zones[G_RAID3_ZONE_16K].sz_inuse = 0;
3010 	sc->sc_zones[G_RAID3_ZONE_16K].sz_max = g_raid3_n16k;
3011 	sc->sc_zones[G_RAID3_ZONE_16K].sz_requested =
3012 	    sc->sc_zones[G_RAID3_ZONE_16K].sz_failed = 0;
3013 	sc->sc_zones[G_RAID3_ZONE_4K].sz_zone = uma_zcreate("gr3:4k", 4096,
3014 	    g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL, UMA_ALIGN_PTR, 0);
3015 	sc->sc_zones[G_RAID3_ZONE_4K].sz_inuse = 0;
3016 	sc->sc_zones[G_RAID3_ZONE_4K].sz_max = g_raid3_n4k;
3017 	sc->sc_zones[G_RAID3_ZONE_4K].sz_requested =
3018 	    sc->sc_zones[G_RAID3_ZONE_4K].sz_failed = 0;
3019 
3020 	error = kthread_create(g_raid3_worker, sc, &sc->sc_worker, 0, 0,
3021 	    "g_raid3 %s", md->md_name);
3022 	if (error != 0) {
3023 		G_RAID3_DEBUG(1, "Cannot create kernel thread for %s.",
3024 		    sc->sc_name);
3025 		uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_64K].sz_zone);
3026 		uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_16K].sz_zone);
3027 		uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_4K].sz_zone);
3028 		g_destroy_geom(sc->sc_sync.ds_geom);
3029 		mtx_destroy(&sc->sc_events_mtx);
3030 		mtx_destroy(&sc->sc_queue_mtx);
3031 		sx_destroy(&sc->sc_lock);
3032 		g_destroy_geom(sc->sc_geom);
3033 		free(sc->sc_disks, M_RAID3);
3034 		free(sc, M_RAID3);
3035 		return (NULL);
3036 	}
3037 
3038 	G_RAID3_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id);
3039 
3040 	sc->sc_rootmount = root_mount_hold("GRAID3");
3041 	G_RAID3_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount);
3042 
3043 	/*
3044 	 * Schedule the startup timeout.
3045 	 */
3046 	timeout = atomic_load_acq_int(&g_raid3_timeout);
3047 	callout_reset(&sc->sc_callout, timeout * hz, g_raid3_go, sc);
3048 	return (sc->sc_geom);
3049 }
3050 
3051 int
3052 g_raid3_destroy(struct g_raid3_softc *sc, boolean_t force)
3053 {
3054 	struct g_provider *pp;
3055 
3056 	g_topology_assert_not();
3057 	if (sc == NULL)
3058 		return (ENXIO);
3059 	sx_assert(&sc->sc_lock, SX_XLOCKED);
3060 
3061 	pp = sc->sc_provider;
3062 	if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
3063 		if (force) {
3064 			G_RAID3_DEBUG(1, "Device %s is still open, so it "
3065 			    "can't be definitely removed.", pp->name);
3066 		} else {
3067 			G_RAID3_DEBUG(1,
3068 			    "Device %s is still open (r%dw%de%d).", pp->name,
3069 			    pp->acr, pp->acw, pp->ace);
3070 			return (EBUSY);
3071 		}
3072 	}
3073 
3074 	sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
3075 	sc->sc_flags |= G_RAID3_DEVICE_FLAG_WAIT;
3076 	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
3077 	sx_xunlock(&sc->sc_lock);
3078 	mtx_lock(&sc->sc_queue_mtx);
3079 	wakeup(sc);
3080 	wakeup(&sc->sc_queue);
3081 	mtx_unlock(&sc->sc_queue_mtx);
3082 	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker);
3083 	while (sc->sc_worker != NULL)
3084 		tsleep(&sc->sc_worker, PRIBIO, "r3:destroy", hz / 5);
3085 	G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker);
3086 	sx_xlock(&sc->sc_lock);
3087 	g_raid3_destroy_device(sc);
3088 	free(sc->sc_disks, M_RAID3);
3089 	free(sc, M_RAID3);
3090 	return (0);
3091 }
3092 
3093 static void
3094 g_raid3_taste_orphan(struct g_consumer *cp)
3095 {
3096 
3097 	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
3098 	    cp->provider->name));
3099 }
3100 
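/*
 * Taste a provider: read and verify its raid3 metadata using a
 * throw-away geom/consumer pair and, if the metadata is valid, add the
 * provider as a component of an existing device or create a new one.
 */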
3101 static struct g_geom *
3102 g_raid3_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
3103 {
3104 	struct g_raid3_metadata md;
3105 	struct g_raid3_softc *sc;
3106 	struct g_consumer *cp;
3107 	struct g_geom *gp;
3108 	int error;
3109 
3110 	g_topology_assert();
3111 	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
3112 	G_RAID3_DEBUG(2, "Tasting %s.", pp->name);
3113 
3114 	gp = g_new_geomf(mp, "raid3:taste");
3115 	/* This orphan function should never be called. */
3116 	gp->orphan = g_raid3_taste_orphan;
3117 	cp = g_new_consumer(gp);
3118 	g_attach(cp, pp);
3119 	error = g_raid3_read_metadata(cp, &md);
3120 	g_detach(cp);
3121 	g_destroy_consumer(cp);
3122 	g_destroy_geom(gp);
3123 	if (error != 0)
3124 		return (NULL);
3125 	gp = NULL;
3126 
3127 	if (md.md_provider[0] != '\0' && strcmp(md.md_provider, pp->name) != 0)
3128 		return (NULL);
3129 	if (md.md_provsize != 0 && md.md_provsize != pp->mediasize)
3130 		return (NULL);
3131 	if (g_raid3_debug >= 2)
3132 		raid3_metadata_dump(&md);
3133 
3134 	/*
3135 	 * Let's check if device already exists.
3136 	 */
3137 	sc = NULL;
3138 	LIST_FOREACH(gp, &mp->geom, geom) {
3139 		sc = gp->softc;
3140 		if (sc == NULL)
3141 			continue;
3142 		if (sc->sc_sync.ds_geom == gp)
3143 			continue;
3144 		if (strcmp(md.md_name, sc->sc_name) != 0)
3145 			continue;
3146 		if (md.md_id != sc->sc_id) {
3147 			G_RAID3_DEBUG(0, "Device %s already configured.",
3148 			    sc->sc_name);
3149 			return (NULL);
3150 		}
3151 		break;
3152 	}
3153 	if (gp == NULL) {
3154 		gp = g_raid3_create(mp, &md);
3155 		if (gp == NULL) {
3156 			G_RAID3_DEBUG(0, "Cannot create device %s.",
3157 			    md.md_name);
3158 			return (NULL);
3159 		}
3160 		sc = gp->softc;
3161 	}
3162 	G_RAID3_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
3163 	g_topology_unlock();
3164 	sx_xlock(&sc->sc_lock);
3165 	error = g_raid3_add_disk(sc, pp, &md);
3166 	if (error != 0) {
3167 		G_RAID3_DEBUG(0, "Cannot add disk %s to %s (error=%d).",
3168 		    pp->name, gp->name, error);
3169 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NODISK) ==
3170 		    sc->sc_ndisks) {
3171 			g_raid3_destroy(sc, 1);
3172 			g_topology_lock();
3173 			return (NULL);
3174 		}
3175 		gp = NULL;
3176 	}
3177 	sx_xunlock(&sc->sc_lock);
3178 	g_topology_lock();
3179 	return (gp);
3180 }
3181 
3182 static int
3183 g_raid3_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused,
3184     struct g_geom *gp)
3185 {
3186 	struct g_raid3_softc *sc;
3187 	int error;
3188 
3189 	g_topology_unlock();
3190 	sc = gp->softc;
3191 	sx_xlock(&sc->sc_lock);
3192 	error = g_raid3_destroy(gp->softc, 0);
3193 	if (error != 0)
3194 		sx_xunlock(&sc->sc_lock);
3195 	g_topology_lock();
3196 	return (error);
3197 }
3198 
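/*
 * Dump the device, component (consumer) or provider configuration as
 * XML; this is what appears under the raid3 class in the
 * kern.geom.confxml sysctl output.
 */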
3199 static void
3200 g_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
3201     struct g_consumer *cp, struct g_provider *pp)
3202 {
3203 	struct g_raid3_softc *sc;
3204 
3205 	g_topology_assert();
3206 
3207 	sc = gp->softc;
3208 	if (sc == NULL)
3209 		return;
3210 	/* Skip synchronization geom. */
3211 	if (gp == sc->sc_sync.ds_geom)
3212 		return;
3213 	if (pp != NULL) {
3214 		/* Nothing here. */
3215 	} else if (cp != NULL) {
3216 		struct g_raid3_disk *disk;
3217 
3218 		disk = cp->private;
3219 		if (disk == NULL)
3220 			return;
3221 		g_topology_unlock();
3222 		sx_xlock(&sc->sc_lock);
3223 		sbuf_printf(sb, "%s<Type>", indent);
3224 		if (disk->d_no == sc->sc_ndisks - 1)
3225 			sbuf_printf(sb, "PARITY");
3226 		else
3227 			sbuf_printf(sb, "DATA");
3228 		sbuf_printf(sb, "</Type>\n");
3229 		sbuf_printf(sb, "%s<Number>%u</Number>\n", indent,
3230 		    (u_int)disk->d_no);
3231 		if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
3232 			sbuf_printf(sb, "%s<Synchronized>", indent);
3233 			if (disk->d_sync.ds_offset == 0)
3234 				sbuf_printf(sb, "0%%");
3235 			else {
3236 				sbuf_printf(sb, "%u%%",
3237 				    (u_int)((disk->d_sync.ds_offset * 100) /
3238 				    (sc->sc_mediasize / (sc->sc_ndisks - 1))));
3239 			}
3240 			sbuf_printf(sb, "</Synchronized>\n");
3241 		}
3242 		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent,
3243 		    disk->d_sync.ds_syncid);
3244 		sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, disk->d_genid);
3245 		sbuf_printf(sb, "%s<Flags>", indent);
3246 		if (disk->d_flags == 0)
3247 			sbuf_printf(sb, "NONE");
3248 		else {
3249 			int first = 1;
3250 
3251 #define	ADD_FLAG(flag, name)	do {					\
3252 	if ((disk->d_flags & (flag)) != 0) {				\
3253 		if (!first)						\
3254 			sbuf_printf(sb, ", ");				\
3255 		else							\
3256 			first = 0;					\
3257 		sbuf_printf(sb, name);					\
3258 	}								\
3259 } while (0)
3260 			ADD_FLAG(G_RAID3_DISK_FLAG_DIRTY, "DIRTY");
3261 			ADD_FLAG(G_RAID3_DISK_FLAG_HARDCODED, "HARDCODED");
3262 			ADD_FLAG(G_RAID3_DISK_FLAG_SYNCHRONIZING,
3263 			    "SYNCHRONIZING");
3264 			ADD_FLAG(G_RAID3_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC");
3265 			ADD_FLAG(G_RAID3_DISK_FLAG_BROKEN, "BROKEN");
3266 #undef	ADD_FLAG
3267 		}
3268 		sbuf_printf(sb, "</Flags>\n");
3269 		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
3270 		    g_raid3_disk_state2str(disk->d_state));
3271 		sx_xunlock(&sc->sc_lock);
3272 		g_topology_lock();
3273 	} else {
3274 		g_topology_unlock();
3275 		sx_xlock(&sc->sc_lock);
3276 		sbuf_printf(sb, "%s<Zone4kRequested>%u</Zone4kRequested>\n",
3277 		    indent, sc->sc_zones[G_RAID3_ZONE_4K].sz_requested);
3278 		sbuf_printf(sb, "%s<Zone4kFailed>%u</Zone4kFailed>\n",
3279 		    indent, sc->sc_zones[G_RAID3_ZONE_4K].sz_failed);
3280 		sbuf_printf(sb, "%s<Zone16kRequested>%u</Zone16kRequested>\n",
3281 		    indent, sc->sc_zones[G_RAID3_ZONE_16K].sz_requested);
3282 		sbuf_printf(sb, "%s<Zone16kFailed>%u</Zone16kFailed>\n",
3283 		    indent, sc->sc_zones[G_RAID3_ZONE_16K].sz_failed);
3284 		sbuf_printf(sb, "%s<Zone64kRequested>%u</Zone64kRequested>\n",
3285 		    indent, sc->sc_zones[G_RAID3_ZONE_64K].sz_requested);
3286 		sbuf_printf(sb, "%s<Zone64kFailed>%u</Zone64kFailed>\n",
3287 		    indent, sc->sc_zones[G_RAID3_ZONE_64K].sz_failed);
3288 		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
3289 		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent, sc->sc_syncid);
3290 		sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, sc->sc_genid);
3291 		sbuf_printf(sb, "%s<Flags>", indent);
3292 		if (sc->sc_flags == 0)
3293 			sbuf_printf(sb, "NONE");
3294 		else {
3295 			int first = 1;
3296 
3297 #define	ADD_FLAG(flag, name)	do {					\
3298 	if ((sc->sc_flags & (flag)) != 0) {				\
3299 		if (!first)						\
3300 			sbuf_printf(sb, ", ");				\
3301 		else							\
3302 			first = 0;					\
3303 		sbuf_printf(sb, name);					\
3304 	}								\
3305 } while (0)
3306 			ADD_FLAG(G_RAID3_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC");
3307 			ADD_FLAG(G_RAID3_DEVICE_FLAG_ROUND_ROBIN,
3308 			    "ROUND-ROBIN");
3309 			ADD_FLAG(G_RAID3_DEVICE_FLAG_VERIFY, "VERIFY");
3310 #undef	ADD_FLAG
3311 		}
3312 		sbuf_printf(sb, "</Flags>\n");
3313 		sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
3314 		    sc->sc_ndisks);
3315 		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
3316 		    g_raid3_device_state2str(sc->sc_state));
3317 		sx_xunlock(&sc->sc_lock);
3318 		g_topology_lock();
3319 	}
3320 }
3321 
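/*
 * Shutdown event handlers.  Before the final sync of the buffer cache,
 * stop any synchronization in progress; after it, destroy all raid3
 * devices so the components are closed cleanly.
 */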
3322 static void
3323 g_raid3_shutdown_pre_sync(void *arg, int howto)
3324 {
3325 	struct g_class *mp;
3326 	struct g_geom *gp, *gp2;
3327 	struct g_raid3_softc *sc;
3328 
3329 	mp = arg;
3330 	DROP_GIANT();
3331 	g_topology_lock();
3332 	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
3333 		if ((sc = gp->softc) == NULL)
3334 			continue;
3335 		g_topology_unlock();
3336 		sx_xlock(&sc->sc_lock);
3337 		if (sc->sc_syncdisk != NULL)
3338 			g_raid3_sync_stop(sc, 1);
3339 		sx_xunlock(&sc->sc_lock);
3340 		g_topology_lock();
3341 	}
3342 	g_topology_unlock();
3343 	PICKUP_GIANT();
3344 }
3345 
3346 static void
3347 g_raid3_shutdown_post_sync(void *arg, int howto)
3348 {
3349 	struct g_class *mp;
3350 	struct g_geom *gp, *gp2;
3351 	struct g_raid3_softc *sc;
3352 
3353 	mp = arg;
3354 	DROP_GIANT();
3355 	g_topology_lock();
3356 	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
3357 		if ((sc = gp->softc) == NULL)
3358 			continue;
3359 		g_topology_unlock();
3360 		sx_xlock(&sc->sc_lock);
3361 		g_raid3_destroy(sc, 1);
3362 		g_topology_lock();
3363 	}
3364 	g_topology_unlock();
3365 	PICKUP_GIANT();
3366 #if 0
3367 	tsleep(&gp, PRIBIO, "r3:shutdown", hz * 20);
3368 #endif
3369 }
3370 
3371 static void
3372 g_raid3_init(struct g_class *mp)
3373 {
3374 
3375 	g_raid3_pre_sync = EVENTHANDLER_REGISTER(shutdown_pre_sync,
3376 	    g_raid3_shutdown_pre_sync, mp, SHUTDOWN_PRI_FIRST);
3377 	g_raid3_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync,
3378 	    g_raid3_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST);
3379 	if (g_raid3_pre_sync == NULL || g_raid3_post_sync == NULL)
3380 		G_RAID3_DEBUG(0, "Warning! Cannot register shutdown event.");
3381 }
3382 
3383 static void
3384 g_raid3_fini(struct g_class *mp)
3385 {
3386 
3387 	if (g_raid3_pre_sync != NULL)
3388 		EVENTHANDLER_DEREGISTER(shutdown_pre_sync, g_raid3_pre_sync);
3389 	if (g_raid3_post_sync != NULL)
3390 		EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_raid3_post_sync);
3391 }
3392 
3393 DECLARE_GEOM_CLASS(g_raid3_class, g_raid3);
3394