xref: /freebsd/sys/geom/raid3/g_raid3.c (revision 31489a9a2653e123121e8ca39b4be802013d2b50)
1 /*-
2  * Copyright (c) 2004 Pawel Jakub Dawidek <pjd@FreeBSD.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29 
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/kernel.h>
33 #include <sys/module.h>
34 #include <sys/limits.h>
35 #include <sys/lock.h>
36 #include <sys/mutex.h>
37 #include <sys/bio.h>
38 #include <sys/sysctl.h>
39 #include <sys/malloc.h>
40 #include <sys/eventhandler.h>
41 #include <vm/uma.h>
42 #include <machine/atomic.h>
43 #include <geom/geom.h>
44 #include <sys/proc.h>
45 #include <sys/kthread.h>
46 #include <geom/raid3/g_raid3.h>
47 
48 
49 static MALLOC_DEFINE(M_RAID3, "raid3 data", "GEOM_RAID3 Data");
50 
51 SYSCTL_DECL(_kern_geom);
52 SYSCTL_NODE(_kern_geom, OID_AUTO, raid3, CTLFLAG_RW, 0, "GEOM_RAID3 stuff");
53 u_int g_raid3_debug = 0;
54 TUNABLE_INT("kern.geom.raid3.debug", &g_raid3_debug);
55 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, debug, CTLFLAG_RW, &g_raid3_debug, 0,
56     "Debug level");
57 static u_int g_raid3_timeout = 4;
58 TUNABLE_INT("kern.geom.raid3.timeout", &g_raid3_timeout);
59 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, timeout, CTLFLAG_RW, &g_raid3_timeout,
60     0, "Time to wait on all raid3 components");
61 static u_int g_raid3_idletime = 5;
62 TUNABLE_INT("kern.geom.raid3.idletime", &g_raid3_idletime);
63 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, idletime, CTLFLAG_RW,
64     &g_raid3_idletime, 0, "Mark components as clean when idling");
65 static u_int g_raid3_reqs_per_sync = 5;
66 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, reqs_per_sync, CTLFLAG_RW,
67     &g_raid3_reqs_per_sync, 0,
68     "Number of regular I/O requests per synchronization request");
69 static u_int g_raid3_syncs_per_sec = 100;
70 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, syncs_per_sec, CTLFLAG_RW,
71     &g_raid3_syncs_per_sec, 0,
72     "Number of synchronizations requests per second");
73 
74 static u_int g_raid3_n64k = 50;
75 TUNABLE_INT("kern.geom.raid3.n64k", &g_raid3_n64k);
76 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n64k, CTLFLAG_RD, &g_raid3_n64k, 0,
77     "Maximum number of 64kB allocations");
78 static u_int g_raid3_n16k = 200;
79 TUNABLE_INT("kern.geom.raid3.n16k", &g_raid3_n16k);
80 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n16k, CTLFLAG_RD, &g_raid3_n16k, 0,
81     "Maximum number of 16kB allocations");
82 static u_int g_raid3_n4k = 1200;
83 TUNABLE_INT("kern.geom.raid3.n4k", &g_raid3_n4k);
84 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n4k, CTLFLAG_RD, &g_raid3_n4k, 0,
85     "Maximum number of 4kB allocations");
86 
87 SYSCTL_NODE(_kern_geom_raid3, OID_AUTO, stat, CTLFLAG_RW, 0,
88     "GEOM_RAID3 statistics");
89 static u_int g_raid3_parity_mismatch = 0;
90 SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, parity_mismatch, CTLFLAG_RD,
91     &g_raid3_parity_mismatch, 0, "Number of failures in VERIFY mode");
92 static u_int g_raid3_64k_requested = 0;
93 SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 64k_requested, CTLFLAG_RD,
94     &g_raid3_64k_requested, 0, "Number of requested 64kB allocations");
95 static u_int g_raid3_64k_failed = 0;
96 SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 64k_failed, CTLFLAG_RD,
97     &g_raid3_64k_failed, 0, "Number of failed 64kB allocations");
98 static u_int g_raid3_16k_requested = 0;
99 SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 16k_requested, CTLFLAG_RD,
100     &g_raid3_16k_requested, 0, "Number of requested 16kB allocations");
101 static u_int g_raid3_16k_failed = 0;
102 SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 16k_failed, CTLFLAG_RD,
103     &g_raid3_16k_failed, 0, "Number of failed 16kB allocations");
104 static u_int g_raid3_4k_requested = 0;
105 SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 4k_requested, CTLFLAG_RD,
106     &g_raid3_4k_requested, 0, "Number of requested 4kB allocations");
107 static u_int g_raid3_4k_failed = 0;
108 SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 4k_failed, CTLFLAG_RD,
109     &g_raid3_4k_failed, 0, "Number of failed 4kB allocations");
110 
111 #define	MSLEEP(ident, mtx, priority, wmesg, timeout)	do {		\
112 	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));	\
113 	msleep((ident), (mtx), (priority), (wmesg), (timeout));		\
114 	G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, (ident));	\
115 } while (0)
116 
117 static eventhandler_tag g_raid3_ehtag = NULL;
118 
119 static int g_raid3_destroy_geom(struct gctl_req *req, struct g_class *mp,
120     struct g_geom *gp);
121 static g_taste_t g_raid3_taste;
122 static void g_raid3_init(struct g_class *mp);
123 static void g_raid3_fini(struct g_class *mp);
124 
125 struct g_class g_raid3_class = {
126 	.name = G_RAID3_CLASS_NAME,
127 	.version = G_VERSION,
128 	.ctlreq = g_raid3_config,
129 	.taste = g_raid3_taste,
130 	.destroy_geom = g_raid3_destroy_geom,
131 	.init = g_raid3_init,
132 	.fini = g_raid3_fini
133 };
134 
135 
136 static void g_raid3_destroy_provider(struct g_raid3_softc *sc);
137 static int g_raid3_update_disk(struct g_raid3_disk *disk, u_int state);
138 static void g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force);
139 static void g_raid3_dumpconf(struct sbuf *sb, const char *indent,
140     struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
141 static void g_raid3_sync_stop(struct g_raid3_softc *sc, int type);
142 
143 
144 /*
145  * XXX: it should be placed in subr_disk.c.
146  */
147 static void
148 bioq_insert_head(struct bio_queue_head *head, struct bio *bp)
149 {
150 
151 	TAILQ_INSERT_HEAD(&head->queue, bp, bio_queue);
152 }
153 
154 static const char *
155 g_raid3_disk_state2str(int state)
156 {
157 
158 	switch (state) {
159 	case G_RAID3_DISK_STATE_NODISK:
160 		return ("NODISK");
161 	case G_RAID3_DISK_STATE_NONE:
162 		return ("NONE");
163 	case G_RAID3_DISK_STATE_NEW:
164 		return ("NEW");
165 	case G_RAID3_DISK_STATE_ACTIVE:
166 		return ("ACTIVE");
167 	case G_RAID3_DISK_STATE_STALE:
168 		return ("STALE");
169 	case G_RAID3_DISK_STATE_SYNCHRONIZING:
170 		return ("SYNCHRONIZING");
171 	case G_RAID3_DISK_STATE_DISCONNECTED:
172 		return ("DISCONNECTED");
173 	default:
174 		return ("INVALID");
175 	}
176 }
177 
178 static const char *
179 g_raid3_device_state2str(int state)
180 {
181 
182 	switch (state) {
183 	case G_RAID3_DEVICE_STATE_STARTING:
184 		return ("STARTING");
185 	case G_RAID3_DEVICE_STATE_DEGRADED:
186 		return ("DEGRADED");
187 	case G_RAID3_DEVICE_STATE_COMPLETE:
188 		return ("COMPLETE");
189 	default:
190 		return ("INVALID");
191 	}
192 }
193 
194 const char *
195 g_raid3_get_diskname(struct g_raid3_disk *disk)
196 {
197 
198 	if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
199 		return ("[unknown]");
200 	return (disk->d_name);
201 }
202 
203 #define	g_raid3_xor(src1, src2, dst, size)				\
204 	_g_raid3_xor((uint64_t *)(src1), (uint64_t *)(src2),		\
205 	    (uint64_t *)(dst), (size_t)size)
206 static void
207 _g_raid3_xor(uint64_t *src1, uint64_t *src2, uint64_t *dst, size_t size)
208 {
209 
210 	KASSERT((size % 128) == 0, ("Invalid size: %zu.", size));
211 	for (; size > 0; size -= 128) {
212 		*dst++ = (*src1++) ^ (*src2++);
213 		*dst++ = (*src1++) ^ (*src2++);
214 		*dst++ = (*src1++) ^ (*src2++);
215 		*dst++ = (*src1++) ^ (*src2++);
216 		*dst++ = (*src1++) ^ (*src2++);
217 		*dst++ = (*src1++) ^ (*src2++);
218 		*dst++ = (*src1++) ^ (*src2++);
219 		*dst++ = (*src1++) ^ (*src2++);
220 		*dst++ = (*src1++) ^ (*src2++);
221 		*dst++ = (*src1++) ^ (*src2++);
222 		*dst++ = (*src1++) ^ (*src2++);
223 		*dst++ = (*src1++) ^ (*src2++);
224 		*dst++ = (*src1++) ^ (*src2++);
225 		*dst++ = (*src1++) ^ (*src2++);
226 		*dst++ = (*src1++) ^ (*src2++);
227 		*dst++ = (*src1++) ^ (*src2++);
228 	}
229 }
230 
231 static int
232 g_raid3_is_zero(struct bio *bp)
233 {
234 	static const uint64_t zeros[] = {
235 	    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
236 	};
237 	u_char *addr;
238 	ssize_t size;
239 
240 	size = bp->bio_length;
241 	addr = (u_char *)bp->bio_data;
242 	for (; size > 0; size -= sizeof(zeros), addr += sizeof(zeros)) {
243 		if (bcmp(addr, zeros, sizeof(zeros)) != 0)
244 			return (0);
245 	}
246 	return (1);
247 }
248 
249 /*
250  * --- Events handling functions ---
251  * Events in geom_raid3 are used to maintain disks and device status
252  * from one thread to simplify locking.
253  */
/* Release the memory of a processed (or cancelled) event. */
static void
g_raid3_event_free(struct g_raid3_event *ep)
{

	free(ep, M_RAID3);
}
260 
/*
 * Queue a state-change event for the worker thread and wake it up.
 * 'arg' is the softc when G_RAID3_EVENT_DEVICE is set in 'flags',
 * otherwise it is the disk whose state should change to 'state'.
 * With G_RAID3_EVENT_DONTWAIT the call returns immediately (the worker
 * frees the event); otherwise we drop the topology lock and sleep until
 * the worker marks the event G_RAID3_EVENT_DONE, then return its error.
 */
int
g_raid3_event_send(void *arg, int state, int flags)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct g_raid3_event *ep;
	int error;

	ep = malloc(sizeof(*ep), M_RAID3, M_WAITOK);
	G_RAID3_DEBUG(4, "%s: Sending event %p.", __func__, ep);
	if ((flags & G_RAID3_EVENT_DEVICE) != 0) {
		disk = NULL;
		sc = arg;
	} else {
		disk = arg;
		sc = disk->d_softc;
	}
	ep->e_disk = disk;
	ep->e_state = state;
	ep->e_flags = flags;
	ep->e_error = 0;
	mtx_lock(&sc->sc_events_mtx);
	TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
	mtx_unlock(&sc->sc_events_mtx);
	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
	/* Wake the worker thread on either channel it may be sleeping on. */
	mtx_lock(&sc->sc_queue_mtx);
	wakeup(sc);
	wakeup(&sc->sc_queue);
	mtx_unlock(&sc->sc_queue_mtx);
	if ((flags & G_RAID3_EVENT_DONTWAIT) != 0)
		return (0);
	g_topology_assert();
	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, ep);
	g_topology_unlock();
	while ((ep->e_flags & G_RAID3_EVENT_DONE) == 0) {
		mtx_lock(&sc->sc_events_mtx);
		/*
		 * PDROP releases sc_events_mtx while we sleep; the 5 second
		 * timeout only bounds each nap — the DONE flag is what ends
		 * the loop.
		 */
		MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "r3:event",
		    hz * 5);
	}
	/* Don't even try to use 'sc' here, because it could be already dead. */
	g_topology_lock();
	error = ep->e_error;
	g_raid3_event_free(ep);
	return (error);
}
306 
307 static struct g_raid3_event *
308 g_raid3_event_get(struct g_raid3_softc *sc)
309 {
310 	struct g_raid3_event *ep;
311 
312 	mtx_lock(&sc->sc_events_mtx);
313 	ep = TAILQ_FIRST(&sc->sc_events);
314 	if (ep != NULL)
315 		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
316 	mtx_unlock(&sc->sc_events_mtx);
317 	return (ep);
318 }
319 
320 static void
321 g_raid3_event_cancel(struct g_raid3_disk *disk)
322 {
323 	struct g_raid3_softc *sc;
324 	struct g_raid3_event *ep, *tmpep;
325 
326 	g_topology_assert();
327 
328 	sc = disk->d_softc;
329 	mtx_lock(&sc->sc_events_mtx);
330 	TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
331 		if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0)
332 			continue;
333 		if (ep->e_disk != disk)
334 			continue;
335 		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
336 		if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
337 			g_raid3_event_free(ep);
338 		else {
339 			ep->e_error = ECANCELED;
340 			wakeup(ep);
341 		}
342 	}
343 	mtx_unlock(&sc->sc_events_mtx);
344 }
345 
346 /*
347  * Return the number of disks in the given state.
348  * If state is equal to -1, count all connected disks.
349  */
350 u_int
351 g_raid3_ndisks(struct g_raid3_softc *sc, int state)
352 {
353 	struct g_raid3_disk *disk;
354 	u_int n, ndisks;
355 
356 	for (n = ndisks = 0; n < sc->sc_ndisks; n++) {
357 		disk = &sc->sc_disks[n];
358 		if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
359 			continue;
360 		if (state == -1 || disk->d_state == state)
361 			ndisks++;
362 	}
363 	return (ndisks);
364 }
365 
366 static u_int
367 g_raid3_nrequests(struct g_raid3_softc *sc, struct g_consumer *cp)
368 {
369 	struct bio *bp;
370 	u_int nreqs = 0;
371 
372 	mtx_lock(&sc->sc_queue_mtx);
373 	TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
374 		if (bp->bio_from == cp)
375 			nreqs++;
376 	}
377 	mtx_unlock(&sc->sc_queue_mtx);
378 	return (nreqs);
379 }
380 
381 static int
382 g_raid3_is_busy(struct g_raid3_softc *sc, struct g_consumer *cp)
383 {
384 
385 	if (cp->index > 0) {
386 		G_RAID3_DEBUG(2,
387 		    "I/O requests for %s exist, can't destroy it now.",
388 		    cp->provider->name);
389 		return (1);
390 	}
391 	if (g_raid3_nrequests(sc, cp) > 0) {
392 		G_RAID3_DEBUG(2,
393 		    "I/O requests for %s in queue, can't destroy it now.",
394 		    cp->provider->name);
395 		return (1);
396 	}
397 	return (0);
398 }
399 
/*
 * Detach and destroy the consumer, unless it still has I/O outstanding,
 * in which case it is left for later destruction (its private pointer is
 * cleared so completion code no longer finds the disk through it).
 */
static void
g_raid3_kill_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
{

	g_topology_assert();

	cp->private = NULL;
	if (g_raid3_is_busy(sc, cp))
		return;
	G_RAID3_DEBUG(2, "Consumer %s destroyed.", cp->provider->name);
	g_detach(cp);
	g_destroy_consumer(cp);
}
413 
414 static int
415 g_raid3_connect_disk(struct g_raid3_disk *disk, struct g_provider *pp)
416 {
417 	int error;
418 
419 	g_topology_assert();
420 	KASSERT(disk->d_consumer == NULL,
421 	    ("Disk already connected (device %s).", disk->d_softc->sc_name));
422 
423 	disk->d_consumer = g_new_consumer(disk->d_softc->sc_geom);
424 	disk->d_consumer->private = disk;
425 	disk->d_consumer->index = 0;
426 	error = g_attach(disk->d_consumer, pp);
427 	if (error != 0)
428 		return (error);
429 	G_RAID3_DEBUG(2, "Disk %s connected.", g_raid3_get_diskname(disk));
430 	return (0);
431 }
432 
/*
 * Close and dispose of a consumer.  An attached consumer first has any
 * open access counts dropped, then goes through g_raid3_kill_consumer()
 * (which may defer destruction while I/O is in flight).  A consumer that
 * never attached (provider == NULL) is destroyed directly.
 */
static void
g_raid3_disconnect_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
{

	g_topology_assert();

	if (cp == NULL)
		return;
	if (cp->provider != NULL) {
		G_RAID3_DEBUG(2, "Disk %s disconnected.", cp->provider->name);
		if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) {
			G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d",
			    cp->provider->name, -cp->acr, -cp->acw, -cp->ace,
			    0);
			/* Drop all remaining access so detach can succeed. */
			g_access(cp, -cp->acr, -cp->acw, -cp->ace);
		}
		g_raid3_kill_consumer(sc, cp);
	} else {
		g_destroy_consumer(cp);
	}
}
454 
455 /*
456  * Initialize disk. This means allocate memory, create consumer, attach it
457  * to the provider and open access (r1w1e1) to it.
458  */
/*
 * Initialize disk. This means allocate memory, create consumer, attach it
 * to the provider and open access (r1w1e1) to it.
 *
 * The disk slot is selected by md->md_no from the on-disk metadata.  On
 * failure *errorp (if supplied) receives the error, the partially set up
 * consumer is disposed of, and NULL is returned.
 */
static struct g_raid3_disk *
g_raid3_init_disk(struct g_raid3_softc *sc, struct g_provider *pp,
    struct g_raid3_metadata *md, int *errorp)
{
	struct g_raid3_disk *disk;
	int error;

	disk = &sc->sc_disks[md->md_no];
	disk->d_softc = sc;
	error = g_raid3_connect_disk(disk, pp);
	if (error != 0)
		goto fail;
	disk->d_no = md->md_no;
	/* State transitions are driven later by the event machinery. */
	disk->d_state = G_RAID3_DISK_STATE_NONE;
	disk->d_flags = md->md_dflags;
	if (md->md_provider[0] != '\0')
		disk->d_flags |= G_RAID3_DISK_FLAG_HARDCODED;
	disk->d_sync.ds_consumer = NULL;
	/* Resume synchronization from where the metadata says we left off. */
	disk->d_sync.ds_offset = md->md_sync_offset;
	disk->d_sync.ds_offset_done = md->md_sync_offset;
	disk->d_sync.ds_resync = -1;
	disk->d_sync.ds_syncid = md->md_syncid;
	if (errorp != NULL)
		*errorp = 0;
	return (disk);
fail:
	if (errorp != NULL)
		*errorp = error;
	if (disk != NULL)
		g_raid3_disconnect_consumer(sc, disk->d_consumer);
	return (NULL);
}
491 
/*
 * Tear down a disk: cancel its pending events, stop synchronization if
 * this disk was being synchronized, drop its consumer and mark the slot
 * as NODISK.  A slot already in NODISK state is left untouched.
 */
static void
g_raid3_destroy_disk(struct g_raid3_disk *disk)
{
	struct g_raid3_softc *sc;

	g_topology_assert();

	if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
		return;
	g_raid3_event_cancel(disk);
	sc = disk->d_softc;
	switch (disk->d_state) {
	case G_RAID3_DISK_STATE_SYNCHRONIZING:
		if (sc->sc_syncdisk != NULL)
			g_raid3_sync_stop(sc, 1);
		/* FALLTHROUGH */
	case G_RAID3_DISK_STATE_NEW:
	case G_RAID3_DISK_STATE_STALE:
	case G_RAID3_DISK_STATE_ACTIVE:
		g_raid3_disconnect_consumer(sc, disk->d_consumer);
		disk->d_consumer = NULL;
		break;
	default:
		/* Any other state here indicates a state-machine bug. */
		KASSERT(0 == 1, ("Wrong disk state (%s, %s).",
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
	}
	disk->d_state = G_RAID3_DISK_STATE_NODISK;
}
521 
/*
 * Full device teardown: destroy the provider, mark every disk clean and
 * destroy it, complete or free all pending events, then dismantle the
 * synchronization geom, UMA zones, mutexes and finally the device geom
 * itself.  The order matters — events must be drained after the disks
 * (destroying disks can queue more) and the mutexes destroyed last.
 */
static void
g_raid3_destroy_device(struct g_raid3_softc *sc)
{
	struct g_raid3_event *ep;
	struct g_raid3_disk *disk;
	struct g_geom *gp;
	struct g_consumer *cp;
	u_int n;

	g_topology_assert();

	gp = sc->sc_geom;
	if (sc->sc_provider != NULL)
		g_raid3_destroy_provider(sc);
	for (n = 0; n < sc->sc_ndisks; n++) {
		disk = &sc->sc_disks[n];
		/* Device is going away cleanly, so clear the dirty flag. */
		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
		g_raid3_update_metadata(disk);
		g_raid3_destroy_disk(disk);
	}
	while ((ep = g_raid3_event_get(sc)) != NULL) {
		if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
			g_raid3_event_free(ep);
		else {
			/* Wake the sender with ECANCELED; it frees the event. */
			ep->e_error = ECANCELED;
			ep->e_flags |= G_RAID3_EVENT_DONE;
			G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, ep);
			mtx_lock(&sc->sc_events_mtx);
			wakeup(ep);
			mtx_unlock(&sc->sc_events_mtx);
		}
	}
	callout_drain(&sc->sc_callout);
	gp->softc = NULL;
	cp = LIST_FIRST(&sc->sc_sync.ds_geom->consumer);
	if (cp != NULL)
		g_raid3_disconnect_consumer(sc, cp);
	sc->sc_sync.ds_geom->softc = NULL;
	g_wither_geom(sc->sc_sync.ds_geom, ENXIO);
	uma_zdestroy(sc->sc_zone_64k);
	uma_zdestroy(sc->sc_zone_16k);
	uma_zdestroy(sc->sc_zone_4k);
	mtx_destroy(&sc->sc_queue_mtx);
	mtx_destroy(&sc->sc_events_mtx);
	G_RAID3_DEBUG(0, "Device %s destroyed.", gp->name);
	g_wither_geom(gp, ENXIO);
}
569 
570 static void
571 g_raid3_orphan(struct g_consumer *cp)
572 {
573 	struct g_raid3_disk *disk;
574 
575 	g_topology_assert();
576 
577 	disk = cp->private;
578 	if (disk == NULL)
579 		return;
580 	disk->d_softc->sc_bump_syncid = G_RAID3_BUMP_ON_FIRST_WRITE;
581 	g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
582 	    G_RAID3_EVENT_DONTWAIT);
583 }
584 
585 static void
586 g_raid3_spoiled(struct g_consumer *cp)
587 {
588 	struct g_raid3_disk *disk;
589 
590 	g_topology_assert();
591 
592 	disk = cp->private;
593 	if (disk == NULL)
594 		return;
595 	disk->d_softc->sc_bump_syncid = G_RAID3_BUMP_IMMEDIATELY;
596 	g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
597 	    G_RAID3_EVENT_DONTWAIT);
598 }
599 
/*
 * Write the encoded metadata ('md') to the last sector of the disk's
 * provider; a NULL 'md' writes a zeroed sector, clearing the metadata.
 * If the consumer is not already open for writing (disk not dirty), it
 * is opened temporarily.  On write failure the disk is scheduled for
 * disconnection and the syncid bump is requested.
 */
static int
g_raid3_write_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
{
	struct g_raid3_softc *sc;
	struct g_consumer *cp;
	off_t offset, length;
	int close = 0, error = 0;
	u_char *sector;

	g_topology_assert();

	sc = disk->d_softc;
	cp = disk->d_consumer;
	KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name));
	KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name));
	/* Metadata occupies the provider's very last sector. */
	length = cp->provider->sectorsize;
	offset = cp->provider->mediasize - length;
	sector = malloc((size_t)length, M_RAID3, M_WAITOK | M_ZERO);
	/*
	 * Open consumer if it wasn't opened and remember to close it.
	 */
	if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) == 0) {
		error = g_access(cp, 0, 1, 1);
		G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", cp->provider->name,
		    0, 1, 1, error);
		if (error == 0)
			close = 1;
#ifdef	INVARIANTS
	} else {
		/* A dirty disk must already hold write/exclusive access. */
		KASSERT(cp->acw > 0 && cp->ace > 0,
		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
		    cp->acr, cp->acw, cp->ace));
#endif
	}
	if (error == 0) {
		if (md != NULL)
			raid3_metadata_encode(md, sector);
		/* g_write_data() may sleep; drop the topology lock around it. */
		g_topology_unlock();
		error = g_write_data(cp, offset, sector, length);
		g_topology_lock();
	}
	free(sector, M_RAID3);
	if (close) {
		g_access(cp, 0, -1, -1);
		G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d",
		    cp->provider->name, 0, -1, -1, 0);
	}
	if (error != 0) {
		disk->d_softc->sc_bump_syncid = G_RAID3_BUMP_IMMEDIATELY;
		g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
		    G_RAID3_EVENT_DONTWAIT);
	}
	return (error);
}
654 
655 int
656 g_raid3_clear_metadata(struct g_raid3_disk *disk)
657 {
658 	int error;
659 
660 	g_topology_assert();
661 	error = g_raid3_write_metadata(disk, NULL);
662 	if (error == 0) {
663 		G_RAID3_DEBUG(2, "Metadata on %s cleared.",
664 		    g_raid3_get_diskname(disk));
665 	} else {
666 		G_RAID3_DEBUG(0,
667 		    "Cannot clear metadata on disk %s (error=%d).",
668 		    g_raid3_get_diskname(disk), error);
669 	}
670 	return (error);
671 }
672 
673 void
674 g_raid3_fill_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
675 {
676 	struct g_raid3_softc *sc;
677 
678 	sc = disk->d_softc;
679 	strlcpy(md->md_magic, G_RAID3_MAGIC, sizeof(md->md_magic));
680 	md->md_version = G_RAID3_VERSION;
681 	strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name));
682 	md->md_id = sc->sc_id;
683 	md->md_all = sc->sc_ndisks;
684 	md->md_mediasize = sc->sc_mediasize;
685 	md->md_sectorsize = sc->sc_sectorsize;
686 	md->md_mflags = (sc->sc_flags & G_RAID3_DEVICE_FLAG_MASK);
687 	md->md_no = disk->d_no;
688 	md->md_syncid = disk->d_sync.ds_syncid;
689 	md->md_dflags = (disk->d_flags & G_RAID3_DISK_FLAG_MASK);
690 	if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING)
691 		md->md_sync_offset = disk->d_sync.ds_offset_done;
692 	else
693 		md->md_sync_offset = 0;
694 	if ((disk->d_flags & G_RAID3_DISK_FLAG_HARDCODED) != 0 &&
695 	    disk->d_consumer != NULL && disk->d_consumer->provider != NULL) {
696 		strlcpy(md->md_provider, disk->d_consumer->provider->name,
697 		    sizeof(md->md_provider));
698 	} else {
699 		bzero(md->md_provider, sizeof(md->md_provider));
700 	}
701 }
702 
703 void
704 g_raid3_update_metadata(struct g_raid3_disk *disk)
705 {
706 	struct g_raid3_metadata md;
707 	int error;
708 
709 	g_topology_assert();
710 	g_raid3_fill_metadata(disk, &md);
711 	error = g_raid3_write_metadata(disk, &md);
712 	if (error == 0) {
713 		G_RAID3_DEBUG(2, "Metadata on %s updated.",
714 		    g_raid3_get_diskname(disk));
715 	} else {
716 		G_RAID3_DEBUG(0,
717 		    "Cannot update metadata on disk %s (error=%d).",
718 		    g_raid3_get_diskname(disk), error);
719 	}
720 }
721 
722 static void
723 g_raid3_bump_syncid(struct g_raid3_softc *sc)
724 {
725 	struct g_raid3_disk *disk;
726 	u_int n;
727 
728 	g_topology_assert();
729 	KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
730 	    ("%s called with no active disks (device=%s).", __func__,
731 	    sc->sc_name));
732 
733 	sc->sc_syncid++;
734 	for (n = 0; n < sc->sc_ndisks; n++) {
735 		disk = &sc->sc_disks[n];
736 		if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
737 		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
738 			disk->d_sync.ds_syncid = sc->sc_syncid;
739 			g_raid3_update_metadata(disk);
740 		}
741 	}
742 }
743 
744 static void
745 g_raid3_idle(struct g_raid3_softc *sc)
746 {
747 	struct g_raid3_disk *disk;
748 	u_int i;
749 
750 	if (sc->sc_provider == NULL || sc->sc_provider->acw == 0)
751 		return;
752 	sc->sc_idle = 1;
753 	g_topology_lock();
754 	for (i = 0; i < sc->sc_ndisks; i++) {
755 		disk = &sc->sc_disks[i];
756 		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
757 			continue;
758 		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
759 		    g_raid3_get_diskname(disk), sc->sc_name);
760 		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
761 		g_raid3_update_metadata(disk);
762 	}
763 	g_topology_unlock();
764 }
765 
766 static void
767 g_raid3_unidle(struct g_raid3_softc *sc)
768 {
769 	struct g_raid3_disk *disk;
770 	u_int i;
771 
772 	sc->sc_idle = 0;
773 	g_topology_lock();
774 	for (i = 0; i < sc->sc_ndisks; i++) {
775 		disk = &sc->sc_disks[i];
776 		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
777 			continue;
778 		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
779 		    g_raid3_get_diskname(disk), sc->sc_name);
780 		disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
781 		g_raid3_update_metadata(disk);
782 	}
783 	g_topology_unlock();
784 }
785 
786 /*
787  * Return 1 if we should check if RAID3 device is idling.
788  */
789 static int
790 g_raid3_check_idle(struct g_raid3_softc *sc)
791 {
792 	struct g_raid3_disk *disk;
793 	u_int i;
794 
795 	if (sc->sc_idle)
796 		return (0);
797 	if (sc->sc_provider != NULL && sc->sc_provider->acw == 0)
798 		return (0);
799 	/*
800 	 * Check if there are no in-flight requests.
801 	 */
802 	for (i = 0; i < sc->sc_ndisks; i++) {
803 		disk = &sc->sc_disks[i];
804 		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
805 			continue;
806 		if (disk->d_consumer->index > 0)
807 			return (0);
808 	}
809 	return (1);
810 }
811 
812 /*
813  * Treat bio_driver1 field in parent bio as list head and field bio_caller1
814  * in child bio as pointer to the next element on the list.
815  */
816 #define	G_RAID3_HEAD_BIO(pbp)	(pbp)->bio_driver1
817 
818 #define	G_RAID3_NEXT_BIO(cbp)	(cbp)->bio_caller1
819 
820 #define	G_RAID3_FOREACH_BIO(pbp, bp)					\
821 	for ((bp) = G_RAID3_HEAD_BIO(pbp); (bp) != NULL;		\
822 	    (bp) = G_RAID3_NEXT_BIO(bp))
823 
824 #define	G_RAID3_FOREACH_SAFE_BIO(pbp, bp, tmpbp)			\
825 	for ((bp) = G_RAID3_HEAD_BIO(pbp);				\
826 	    (bp) != NULL && ((tmpbp) = G_RAID3_NEXT_BIO(bp), 1);	\
827 	    (bp) = (tmpbp))
828 
/* Start a parent bio with an empty list of child bios. */
static void
g_raid3_init_bio(struct bio *pbp)
{

	G_RAID3_HEAD_BIO(pbp) = NULL;
}
835 
836 static void
837 g_raid3_remove_bio(struct bio *cbp)
838 {
839 	struct bio *pbp, *bp;
840 
841 	pbp = cbp->bio_parent;
842 	if (G_RAID3_HEAD_BIO(pbp) == cbp)
843 		G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
844 	else {
845 		G_RAID3_FOREACH_BIO(pbp, bp) {
846 			if (G_RAID3_NEXT_BIO(bp) == cbp) {
847 				G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
848 				break;
849 			}
850 		}
851 	}
852 	G_RAID3_NEXT_BIO(cbp) = NULL;
853 }
854 
/*
 * Replace child bio 'dbp' with child bio 'sbp' in dbp's parent's list.
 * sbp is first removed from its own position (both bios share the same
 * parent), then spliced into dbp's slot; dbp is left unlinked.
 */
static void
g_raid3_replace_bio(struct bio *sbp, struct bio *dbp)
{
	struct bio *pbp, *bp;

	/* Take sbp out first so the list walk below cannot find it. */
	g_raid3_remove_bio(sbp);
	pbp = dbp->bio_parent;
	G_RAID3_NEXT_BIO(sbp) = G_RAID3_NEXT_BIO(dbp);
	if (G_RAID3_HEAD_BIO(pbp) == dbp)
		G_RAID3_HEAD_BIO(pbp) = sbp;
	else {
		G_RAID3_FOREACH_BIO(pbp, bp) {
			if (G_RAID3_NEXT_BIO(bp) == dbp) {
				G_RAID3_NEXT_BIO(bp) = sbp;
				break;
			}
		}
	}
	G_RAID3_NEXT_BIO(dbp) = NULL;
}
875 
/*
 * Unlink a child bio from its parent, return its data buffer to the UMA
 * zone it was allocated from (the zone is re-derived from the parent's
 * length, mirroring g_raid3_clone_bio()), and destroy the bio.
 */
static void
g_raid3_destroy_bio(struct g_raid3_softc *sc, struct bio *cbp)
{
	struct bio *bp, *pbp;
	size_t size;

	pbp = cbp->bio_parent;
	pbp->bio_children--;
	KASSERT(cbp->bio_data != NULL, ("NULL bio_data"));
	/* Per-component share of the parent request (data disks only). */
	size = pbp->bio_length / (sc->sc_ndisks - 1);
	if (size > 16384)
		uma_zfree(sc->sc_zone_64k, cbp->bio_data);
	else if (size > 4096)
		uma_zfree(sc->sc_zone_16k, cbp->bio_data);
	else
		uma_zfree(sc->sc_zone_4k, cbp->bio_data);
	if (G_RAID3_HEAD_BIO(pbp) == cbp) {
		G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
		G_RAID3_NEXT_BIO(cbp) = NULL;
		g_destroy_bio(cbp);
	} else {
		G_RAID3_FOREACH_BIO(pbp, bp) {
			if (G_RAID3_NEXT_BIO(bp) == cbp)
				break;
		}
		if (bp != NULL) {
			KASSERT(G_RAID3_NEXT_BIO(bp) != NULL,
			    ("NULL bp->bio_driver1"));
			G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
			G_RAID3_NEXT_BIO(cbp) = NULL;
		}
		g_destroy_bio(cbp);
	}
}
910 
911 static struct bio *
912 g_raid3_clone_bio(struct g_raid3_softc *sc, struct bio *pbp)
913 {
914 	struct bio *bp, *cbp;
915 	size_t size;
916 
917 	cbp = g_clone_bio(pbp);
918 	if (cbp == NULL)
919 		return (NULL);
920 	size = pbp->bio_length / (sc->sc_ndisks - 1);
921 	if (size > 16384) {
922 		cbp->bio_data = uma_zalloc(sc->sc_zone_64k, M_NOWAIT);
923 		g_raid3_64k_requested++;
924 	} else if (size > 4096) {
925 		cbp->bio_data = uma_zalloc(sc->sc_zone_16k, M_NOWAIT);
926 		g_raid3_16k_requested++;
927 	} else {
928 		cbp->bio_data = uma_zalloc(sc->sc_zone_4k, M_NOWAIT);
929 		g_raid3_4k_requested++;
930 	}
931 	if (cbp->bio_data == NULL) {
932 		if (size > 16384)
933 			g_raid3_64k_failed++;
934 		if (size > 4096)
935 			g_raid3_16k_failed++;
936 		else
937 			g_raid3_4k_failed++;
938 		pbp->bio_children--;
939 		g_destroy_bio(cbp);
940 		return (NULL);
941 	}
942 	G_RAID3_NEXT_BIO(cbp) = NULL;
943 	if (G_RAID3_HEAD_BIO(pbp) == NULL)
944 		G_RAID3_HEAD_BIO(pbp) = cbp;
945 	else {
946 		G_RAID3_FOREACH_BIO(pbp, bp) {
947 			if (G_RAID3_NEXT_BIO(bp) == NULL) {
948 				G_RAID3_NEXT_BIO(bp) = cbp;
949 				break;
950 			}
951 		}
952 	}
953 	return (cbp);
954 }
955 
/*
 * Split a parent WRITE request across the component disks: copy the data
 * in per-sector stripes into each child bio, compute the parity child by
 * XORing all data children, then dispatch every child to its consumer.
 * With G_RAID3_BIO_PFLAG_NOPARITY set (parity disk absent) the parity
 * child does not exist and no parity is computed.
 */
static void
g_raid3_scatter(struct bio *pbp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct bio *bp, *cbp;
	off_t atom, cadd, padd, left;

	sc = pbp->bio_to->geom->softc;
	bp = NULL;
	if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
		/*
		 * Find bio for which we should calculate data.
		 */
		G_RAID3_FOREACH_BIO(pbp, cbp) {
			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
				bp = cbp;
				break;
			}
		}
		KASSERT(bp != NULL, ("NULL parity bio."));
	}
	/* 'atom' is each data disk's share of one full sector. */
	atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
	cadd = padd = 0;
	/*
	 * De-interleave: walk the parent buffer sector by sector ('padd'),
	 * giving each data child its 'atom'-sized slice at offset 'cadd'.
	 */
	for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
		G_RAID3_FOREACH_BIO(pbp, cbp) {
			if (cbp == bp)
				continue;
			bcopy(pbp->bio_data + padd, cbp->bio_data + cadd, atom);
			padd += atom;
		}
		cadd += atom;
	}
	if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
		struct bio *tmpbp;

		/*
		 * Calculate parity.
		 */
		bzero(bp->bio_data, bp->bio_length);
		G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
			if (cbp == bp)
				continue;
			g_raid3_xor(cbp->bio_data, bp->bio_data, bp->bio_data,
			    bp->bio_length);
			/*
			 * A NODISK child only contributed data to the parity
			 * computation; it has no disk to be sent to.
			 */
			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_NODISK) != 0)
				g_raid3_destroy_bio(sc, cbp);
		}
	}
	/* Dispatch the remaining children to their component disks. */
	G_RAID3_FOREACH_BIO(pbp, cbp) {
		struct g_consumer *cp;

		disk = cbp->bio_caller2;
		cp = disk->d_consumer;
		cbp->bio_to = cp->provider;
		G_RAID3_LOGREQ(3, cbp, "Sending request.");
		KASSERT(cp->acr > 0 && cp->ace > 0,
		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
		    cp->acr, cp->acw, cp->ace));
		cp->index++;
		g_io_request(cbp, cp);
	}
}
1019 
1020 static void
1021 g_raid3_gather(struct bio *pbp)
1022 {
1023 	struct g_raid3_softc *sc;
1024 	struct g_raid3_disk *disk;
1025 	struct bio *xbp, *fbp, *cbp;
1026 	off_t atom, cadd, padd, left;
1027 
1028 	sc = pbp->bio_to->geom->softc;
1029 	/*
1030 	 * Find bio for which we have to calculate data.
1031 	 * While going through this path, check if all requests
1032 	 * succeeded, if not, deny whole request.
1033 	 * If we're in COMPLETE mode, we allow one request to fail,
1034 	 * so if we find one, we're sending it to the parity consumer.
1035 	 * If there are more failed requests, we deny whole request.
1036 	 */
1037 	xbp = fbp = NULL;
1038 	G_RAID3_FOREACH_BIO(pbp, cbp) {
1039 		if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
1040 			KASSERT(xbp == NULL, ("More than one parity bio."));
1041 			xbp = cbp;
1042 		}
1043 		if (cbp->bio_error == 0)
1044 			continue;
1045 		/*
1046 		 * Found failed request.
1047 		 */
1048 		G_RAID3_LOGREQ(0, cbp, "Request failed.");
1049 		disk = cbp->bio_caller2;
1050 		if (disk != NULL) {
1051 			/*
1052 			 * Actually this is pointless to bump syncid,
1053 			 * because whole device is fucked up.
1054 			 */
1055 			sc->sc_bump_syncid = G_RAID3_BUMP_IMMEDIATELY;
1056 			g_raid3_event_send(disk,
1057 			    G_RAID3_DISK_STATE_DISCONNECTED,
1058 			    G_RAID3_EVENT_DONTWAIT);
1059 		}
1060 		if (fbp == NULL) {
1061 			if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_DEGRADED) != 0) {
1062 				/*
1063 				 * We are already in degraded mode, so we can't
1064 				 * accept any failures.
1065 				 */
1066 				if (pbp->bio_error == 0)
1067 					pbp->bio_error = fbp->bio_error;
1068 			} else {
1069 				fbp = cbp;
1070 			}
1071 		} else {
1072 			/*
1073 			 * Next failed request, that's too many.
1074 			 */
1075 			if (pbp->bio_error == 0)
1076 				pbp->bio_error = fbp->bio_error;
1077 		}
1078 	}
1079 	if (pbp->bio_error != 0)
1080 		goto finish;
1081 	if (fbp != NULL && (pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
1082 		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_VERIFY;
1083 		if (xbp != fbp)
1084 			g_raid3_replace_bio(xbp, fbp);
1085 		g_raid3_destroy_bio(sc, fbp);
1086 	} else if (fbp != NULL) {
1087 		struct g_consumer *cp;
1088 
1089 		/*
1090 		 * One request failed, so send the same request to
1091 		 * the parity consumer.
1092 		 */
1093 		disk = pbp->bio_driver2;
1094 		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
1095 			pbp->bio_error = fbp->bio_error;
1096 			goto finish;
1097 		}
1098 		pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
1099 		pbp->bio_inbed--;
1100 		fbp->bio_flags &= ~(BIO_DONE | BIO_ERROR);
1101 		if (disk->d_no == sc->sc_ndisks - 1)
1102 			fbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
1103 		fbp->bio_error = 0;
1104 		fbp->bio_completed = 0;
1105 		fbp->bio_children = 0;
1106 		fbp->bio_inbed = 0;
1107 		cp = disk->d_consumer;
1108 		fbp->bio_caller2 = disk;
1109 		fbp->bio_to = cp->provider;
1110 		G_RAID3_LOGREQ(3, fbp, "Sending request (recover).");
1111 		KASSERT(cp->acr > 0 && cp->ace > 0,
1112 		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
1113 		    cp->acr, cp->acw, cp->ace));
1114 		cp->index++;
1115 		g_io_request(fbp, cp);
1116 		return;
1117 	}
1118 	if (xbp != NULL) {
1119 		/*
1120 		 * Calculate parity.
1121 		 */
1122 		G_RAID3_FOREACH_BIO(pbp, cbp) {
1123 			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0)
1124 				continue;
1125 			g_raid3_xor(cbp->bio_data, xbp->bio_data, xbp->bio_data,
1126 			    xbp->bio_length);
1127 		}
1128 		xbp->bio_cflags &= ~G_RAID3_BIO_CFLAG_PARITY;
1129 		if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
1130 			if (!g_raid3_is_zero(xbp)) {
1131 				g_raid3_parity_mismatch++;
1132 				pbp->bio_error = EIO;
1133 				goto finish;
1134 			}
1135 			g_raid3_destroy_bio(sc, xbp);
1136 		}
1137 	}
1138 	atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
1139 	cadd = padd = 0;
1140 	for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
1141 		G_RAID3_FOREACH_BIO(pbp, cbp) {
1142 			bcopy(cbp->bio_data + cadd, pbp->bio_data + padd, atom);
1143 			pbp->bio_completed += atom;
1144 			padd += atom;
1145 		}
1146 		cadd += atom;
1147 	}
1148 finish:
1149 	if (pbp->bio_error == 0)
1150 		G_RAID3_LOGREQ(3, pbp, "Request finished.");
1151 	else {
1152 		if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0)
1153 			G_RAID3_LOGREQ(1, pbp, "Verification error.");
1154 		else
1155 			G_RAID3_LOGREQ(0, pbp, "Request failed.");
1156 	}
1157 	pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_MASK;
1158 	g_io_deliver(pbp, pbp->bio_error);
1159 	while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
1160 		g_raid3_destroy_bio(sc, cbp);
1161 }
1162 
1163 static void
1164 g_raid3_done(struct bio *bp)
1165 {
1166 	struct g_raid3_softc *sc;
1167 
1168 	sc = bp->bio_from->geom->softc;
1169 	bp->bio_cflags |= G_RAID3_BIO_CFLAG_REGULAR;
1170 	G_RAID3_LOGREQ(3, bp, "Regular request done (error=%d).", bp->bio_error);
1171 	mtx_lock(&sc->sc_queue_mtx);
1172 	bioq_insert_head(&sc->sc_queue, bp);
1173 	wakeup(sc);
1174 	wakeup(&sc->sc_queue);
1175 	mtx_unlock(&sc->sc_queue_mtx);
1176 }
1177 
/*
 * Process one finished component bio of a regular request.  Once every
 * child has come back, READs are handed to g_raid3_gather() and
 * WRITE/DELETE parents are completed here.
 */
static void
g_raid3_regular_request(struct bio *cbp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct bio *pbp;

	g_topology_assert_not();

	cbp->bio_from->index--;
	pbp = cbp->bio_parent;
	sc = pbp->bio_to->geom->softc;
	disk = cbp->bio_from->private;
	if (disk == NULL) {
		/* The consumer lost its disk; tear it down. */
		g_topology_lock();
		g_raid3_kill_consumer(sc, cbp->bio_from);
		g_topology_unlock();
	}

	G_RAID3_LOGREQ(3, cbp, "Request finished.");
	pbp->bio_inbed++;
	KASSERT(pbp->bio_inbed <= pbp->bio_children,
	    ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed,
	    pbp->bio_children));
	/* Wait until every component request has completed. */
	if (pbp->bio_inbed != pbp->bio_children)
		return;
	switch (pbp->bio_cmd) {
	case BIO_READ:
		g_raid3_gather(pbp);
		break;
	case BIO_WRITE:
	case BIO_DELETE:
	    {
		int error = 0;

		pbp->bio_completed = pbp->bio_length;
		/* Collect per-component status and free the clones. */
		while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) {
			if (cbp->bio_error != 0) {
				disk = cbp->bio_caller2;
				if (disk != NULL) {
					/* Disconnect the failing disk. */
					sc->sc_bump_syncid =
					    G_RAID3_BUMP_IMMEDIATELY;
					g_raid3_event_send(disk,
					    G_RAID3_DISK_STATE_DISCONNECTED,
					    G_RAID3_EVENT_DONTWAIT);
				}
				if (error == 0)
					error = cbp->bio_error;
				else if (pbp->bio_error == 0) {
					/*
					 * Next failed request, that's too many.
					 */
					pbp->bio_error = error;
				}
			}
			g_raid3_destroy_bio(sc, cbp);
		}
		if (pbp->bio_error == 0)
			G_RAID3_LOGREQ(3, pbp, "Request finished.");
		else
			G_RAID3_LOGREQ(0, pbp, "Request failed.");
		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_DEGRADED;
		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_NOPARITY;
		g_io_deliver(pbp, pbp->bio_error);
		break;
	    }
	}
}
1246 
1247 static void
1248 g_raid3_sync_done(struct bio *bp)
1249 {
1250 	struct g_raid3_softc *sc;
1251 
1252 	G_RAID3_LOGREQ(3, bp, "Synchronization request delivered.");
1253 	sc = bp->bio_from->geom->softc;
1254 	bp->bio_cflags |= G_RAID3_BIO_CFLAG_SYNC;
1255 	mtx_lock(&sc->sc_queue_mtx);
1256 	bioq_insert_head(&sc->sc_queue, bp);
1257 	wakeup(sc);
1258 	wakeup(&sc->sc_queue);
1259 	mtx_unlock(&sc->sc_queue_mtx);
1260 }
1261 
1262 static void
1263 g_raid3_start(struct bio *bp)
1264 {
1265 	struct g_raid3_softc *sc;
1266 
1267 	sc = bp->bio_to->geom->softc;
1268 	/*
1269 	 * If sc == NULL or there are no valid disks, provider's error
1270 	 * should be set and g_raid3_start() should not be called at all.
1271 	 */
1272 	KASSERT(sc != NULL && (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
1273 	    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE),
1274 	    ("Provider's error should be set (error=%d)(device=%s).",
1275 	    bp->bio_to->error, bp->bio_to->name));
1276 	G_RAID3_LOGREQ(3, bp, "Request received.");
1277 
1278 	switch (bp->bio_cmd) {
1279 	case BIO_READ:
1280 	case BIO_WRITE:
1281 	case BIO_DELETE:
1282 		break;
1283 	case BIO_GETATTR:
1284 	default:
1285 		g_io_deliver(bp, EOPNOTSUPP);
1286 		return;
1287 	}
1288 	mtx_lock(&sc->sc_queue_mtx);
1289 	bioq_insert_tail(&sc->sc_queue, bp);
1290 	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
1291 	wakeup(sc);
1292 	mtx_unlock(&sc->sc_queue_mtx);
1293 }
1294 
1295 /*
1296  * Send one synchronization request.
1297  */
1298 static void
1299 g_raid3_sync_one(struct g_raid3_softc *sc)
1300 {
1301 	struct g_raid3_disk *disk;
1302 	struct bio *bp;
1303 
1304 	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
1305 	    ("Wrong device state (%s, %s).", sc->sc_name,
1306 	    g_raid3_device_state2str(sc->sc_state)));
1307 	disk = sc->sc_syncdisk;
1308 	KASSERT(disk != NULL, ("No sync disk (%s).", sc->sc_name));
1309 	KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
1310 	    ("Disk %s is not marked for synchronization.",
1311 	    g_raid3_get_diskname(disk)));
1312 
1313 	bp = g_new_bio();
1314 	if (bp == NULL)
1315 		return;
1316 	bp->bio_parent = NULL;
1317 	bp->bio_cmd = BIO_READ;
1318 	bp->bio_offset = disk->d_sync.ds_offset * (sc->sc_ndisks - 1);
1319 	bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset);
1320 	bp->bio_cflags = 0;
1321 	bp->bio_done = g_raid3_sync_done;
1322 	bp->bio_data = disk->d_sync.ds_data;
1323 	if (bp->bio_data == NULL) {
1324 		g_destroy_bio(bp);
1325 		return;
1326 	}
1327 	bp->bio_cflags = G_RAID3_BIO_CFLAG_REGSYNC;
1328 	disk->d_sync.ds_offset += bp->bio_length / (sc->sc_ndisks - 1);
1329 	bp->bio_to = sc->sc_provider;
1330 	G_RAID3_LOGREQ(3, bp, "Sending synchronization request.");
1331 	disk->d_sync.ds_consumer->index++;
1332 	g_io_request(bp, disk->d_sync.ds_consumer);
1333 }
1334 
/*
 * Handle a completed synchronization bio.  A sync READ from the array
 * is converted into the component's slice (or freshly computed parity)
 * and re-issued as a WRITE to the synchronizing disk; a completed WRITE
 * advances the synchronization offset and possibly activates the disk.
 */
static void
g_raid3_sync_request(struct bio *bp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;

	bp->bio_from->index--;
	sc = bp->bio_from->geom->softc;
	disk = bp->bio_from->private;
	if (disk == NULL) {
		/* Consumer was orphaned; drop the request. */
		g_topology_lock();
		g_raid3_kill_consumer(sc, bp->bio_from);
		g_topology_unlock();
		g_destroy_bio(bp);
		return;
	}

	/*
	 * Synchronization request.
	 */
	switch (bp->bio_cmd) {
	case BIO_READ:
	    {
		struct g_consumer *cp;
		u_char *dst, *src;
		off_t left;
		u_int atom;

		if (bp->bio_error != 0) {
			G_RAID3_LOGREQ(0, bp,
			    "Synchronization request failed (error=%d).",
			    bp->bio_error);
			g_destroy_bio(bp);
			return;
		}
		G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
		/* Bytes each component contributes per full sector. */
		atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
		dst = src = bp->bio_data;
		if (disk->d_no == sc->sc_ndisks - 1) {
			u_int n;

			/* Parity component. */
			/* XOR together the data atoms of every sector. */
			for (left = bp->bio_length; left > 0;
			    left -= sc->sc_sectorsize) {
				bcopy(src, dst, atom);
				src += atom;
				for (n = 1; n < sc->sc_ndisks - 1; n++) {
					g_raid3_xor(src, dst, dst, atom);
					src += atom;
				}
				dst += atom;
			}
		} else {
			/* Regular component. */
			/* Extract this component's atom from each sector. */
			src += atom * disk->d_no;
			for (left = bp->bio_length; left > 0;
			    left -= sc->sc_sectorsize) {
				bcopy(src, dst, atom);
				src += sc->sc_sectorsize;
				dst += atom;
			}
		}
		/* Convert array offset/length to per-component units. */
		bp->bio_offset /= sc->sc_ndisks - 1;
		bp->bio_length /= sc->sc_ndisks - 1;
		/* Reuse the same bio as a WRITE to the sync disk. */
		bp->bio_cmd = BIO_WRITE;
		bp->bio_cflags = 0;
		bp->bio_children = bp->bio_inbed = 0;
		cp = disk->d_consumer;
		KASSERT(cp->acr == 0 && cp->acw == 1 && cp->ace == 1,
		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
		    cp->acr, cp->acw, cp->ace));
		cp->index++;
		g_io_request(bp, cp);
		return;
	    }
	case BIO_WRITE:
	    {
		struct g_raid3_disk_sync *sync;

		if (bp->bio_error != 0) {
			G_RAID3_LOGREQ(0, bp,
			    "Synchronization request failed (error=%d).",
			    bp->bio_error);
			g_destroy_bio(bp);
			sc->sc_bump_syncid = G_RAID3_BUMP_IMMEDIATELY;
			g_raid3_event_send(disk,
			    G_RAID3_DISK_STATE_DISCONNECTED,
			    G_RAID3_EVENT_DONTWAIT);
			return;
		}
		G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
		sync = &disk->d_sync;
		sync->ds_offset_done = bp->bio_offset + bp->bio_length;
		g_destroy_bio(bp);
		/* A resync of an earlier region was requested; not done yet. */
		if (sync->ds_resync != -1)
			return;
		if (sync->ds_offset_done ==
		    sc->sc_mediasize / (sc->sc_ndisks - 1)) {
			/*
			 * Disk up-to-date, activate it.
			 */
			g_raid3_event_send(disk, G_RAID3_DISK_STATE_ACTIVE,
			    G_RAID3_EVENT_DONTWAIT);
			return;
		} else if (sync->ds_offset_done % (MAXPHYS * 100) == 0) {
			/*
			 * Update offset_done on every 100 blocks.
			 * XXX: This should be configurable.
			 */
			g_topology_lock();
			g_raid3_update_metadata(disk);
			g_topology_unlock();
		}
		return;
	    }
	default:
		KASSERT(1 == 0, ("Invalid command here: %u (device=%s)",
		    bp->bio_cmd, sc->sc_name));
		break;
	}
}
1456 
/*
 * Split a regular request into per-component bios and dispatch them.
 * READs are sent to the components immediately; WRITE/DELETE requests
 * are handed to g_raid3_scatter().  Returns ENOMEM when a clone cannot
 * be allocated (the caller re-queues the request) and 0 otherwise.
 */
static int
g_raid3_register_request(struct bio *pbp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct g_consumer *cp;
	struct bio *cbp;
	off_t offset, length;
	u_int n, ndisks;
	int round_robin, verify;

	ndisks = 0;
	sc = pbp->bio_to->geom->softc;
	if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGSYNC) != 0 &&
	    sc->sc_syncdisk == NULL) {
		/* Sync-related request but no disk is synchronizing. */
		g_io_deliver(pbp, EIO);
		return (0);
	}
	g_raid3_init_bio(pbp);
	/* Per-component offset/length (data is striped over ndisks-1). */
	length = pbp->bio_length / (sc->sc_ndisks - 1);
	offset = pbp->bio_offset / (sc->sc_ndisks - 1);
	round_robin = verify = 0;
	switch (pbp->bio_cmd) {
	case BIO_READ:
		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
			/* VERIFY mode reads parity too (all components). */
			pbp->bio_pflags |= G_RAID3_BIO_PFLAG_VERIFY;
			verify = 1;
			ndisks = sc->sc_ndisks;
		} else {
			verify = 0;
			ndisks = sc->sc_ndisks - 1;
		}
		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0 &&
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
			round_robin = 1;
		} else {
			round_robin = 0;
		}
		KASSERT(!round_robin || !verify,
		    ("ROUND-ROBIN and VERIFY are mutually exclusive."));
		/* Remember the parity disk for possible recovery. */
		pbp->bio_driver2 = &sc->sc_disks[sc->sc_ndisks - 1];
		break;
	case BIO_WRITE:
	case BIO_DELETE:
	    {
		struct g_raid3_disk_sync *sync;

		if (sc->sc_idle)
			g_raid3_unidle(sc);

		ndisks = sc->sc_ndisks;

		if (sc->sc_syncdisk == NULL)
			break;
		/*
		 * Writes landing inside the already-synchronized region
		 * force a resync of that area.
		 */
		sync = &sc->sc_syncdisk->d_sync;
		if (offset >= sync->ds_offset)
			break;
		if (offset + length <= sync->ds_offset_done)
			break;
		if (offset >= sync->ds_resync && sync->ds_resync != -1)
			break;
		sync->ds_resync = offset - (offset % MAXPHYS);
		break;
	    }
	}
	for (n = 0; n < ndisks; n++) {
		disk = &sc->sc_disks[n];
		cbp = g_raid3_clone_bio(sc, pbp);
		if (cbp == NULL) {
			/* Allocation failed; undo and let caller retry. */
			while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
				g_raid3_destroy_bio(sc, cbp);
			return (ENOMEM);
		}
		cbp->bio_offset = offset;
		cbp->bio_length = length;
		cbp->bio_done = g_raid3_done;
		switch (pbp->bio_cmd) {
		case BIO_READ:
			if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
				/*
				 * Replace invalid component with the parity
				 * component.
				 */
				disk = &sc->sc_disks[sc->sc_ndisks - 1];
				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
				pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
			} else if (round_robin &&
			    disk->d_no == sc->sc_round_robin) {
				/*
				 * In round-robin mode skip one data component
				 * and use parity component when reading.
				 */
				pbp->bio_driver2 = disk;
				disk = &sc->sc_disks[sc->sc_ndisks - 1];
				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
				sc->sc_round_robin++;
				round_robin = 0;
			} else if (verify && disk->d_no == sc->sc_ndisks - 1) {
				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
			}
			break;
		case BIO_WRITE:
		case BIO_DELETE:
			if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
			    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
				if (n == ndisks - 1) {
					/*
					 * Active parity component, mark it as such.
					 */
					cbp->bio_cflags |=
					    G_RAID3_BIO_CFLAG_PARITY;
				}
			} else {
				pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
				if (n == ndisks - 1) {
					/*
					 * Parity component is not connected,
					 * so destroy its request.
					 */
					pbp->bio_pflags |=
					    G_RAID3_BIO_PFLAG_NOPARITY;
					g_raid3_destroy_bio(sc, cbp);
					cbp = NULL;
				} else {
					cbp->bio_cflags |=
					    G_RAID3_BIO_CFLAG_NODISK;
					disk = NULL;
				}
			}
			break;
		}
		if (cbp != NULL)
			cbp->bio_caller2 = disk;
	}
	switch (pbp->bio_cmd) {
	case BIO_READ:
		if (round_robin) {
			/*
			 * If we are in round-robin mode and 'round_robin' is
			 * still 1, it means, that we skipped parity component
			 * for this read and must reset sc_round_robin field.
			 */
			sc->sc_round_robin = 0;
		}
		G_RAID3_FOREACH_BIO(pbp, cbp) {
			disk = cbp->bio_caller2;
			cp = disk->d_consumer;
			cbp->bio_to = cp->provider;
			G_RAID3_LOGREQ(3, cbp, "Sending request.");
			KASSERT(cp->acr > 0 && cp->ace > 0,
			    ("Consumer %s not opened (r%dw%de%d).",
			    cp->provider->name, cp->acr, cp->acw, cp->ace));
			cp->index++;
			g_io_request(cbp, cp);
		}
		break;
	case BIO_WRITE:
	case BIO_DELETE:
		/*
		 * Bump syncid on first write.
		 */
		if (sc->sc_bump_syncid == G_RAID3_BUMP_ON_FIRST_WRITE) {
			sc->sc_bump_syncid = 0;
			g_topology_lock();
			g_raid3_bump_syncid(sc);
			g_topology_unlock();
		}
		g_raid3_scatter(pbp);
		break;
	}
	return (0);
}
1630 
1631 static int
1632 g_raid3_can_destroy(struct g_raid3_softc *sc)
1633 {
1634 	struct g_geom *gp;
1635 	struct g_consumer *cp;
1636 
1637 	g_topology_assert();
1638 	gp = sc->sc_geom;
1639 	LIST_FOREACH(cp, &gp->consumer, consumer) {
1640 		if (g_raid3_is_busy(sc, cp))
1641 			return (0);
1642 	}
1643 	gp = sc->sc_sync.ds_geom;
1644 	LIST_FOREACH(cp, &gp->consumer, consumer) {
1645 		if (g_raid3_is_busy(sc, cp))
1646 			return (0);
1647 	}
1648 	G_RAID3_DEBUG(2, "No I/O requests for %s, it can be destroyed.",
1649 	    sc->sc_name);
1650 	return (1);
1651 }
1652 
1653 static int
1654 g_raid3_try_destroy(struct g_raid3_softc *sc)
1655 {
1656 
1657 	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_WAIT) != 0) {
1658 		g_topology_lock();
1659 		if (!g_raid3_can_destroy(sc)) {
1660 			g_topology_unlock();
1661 			return (0);
1662 		}
1663 		g_topology_unlock();
1664 		G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
1665 		    &sc->sc_worker);
1666 		wakeup(&sc->sc_worker);
1667 		sc->sc_worker = NULL;
1668 	} else {
1669 		g_topology_lock();
1670 		if (!g_raid3_can_destroy(sc)) {
1671 			g_topology_unlock();
1672 			return (0);
1673 		}
1674 		g_raid3_destroy_device(sc);
1675 		g_topology_unlock();
1676 		free(sc->sc_disks, M_RAID3);
1677 		free(sc, M_RAID3);
1678 	}
1679 	return (1);
1680 }
1681 
1682 /*
1683  * Worker thread.
1684  */
static void
g_raid3_worker(void *arg)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct g_raid3_disk_sync *sync;
	struct g_raid3_event *ep;
	struct bio *bp;
	u_int nreqs;	/* Regular requests handled since the last sync. */

	sc = arg;
	curthread->td_base_pri = PRIBIO;

	nreqs = 0;
	for (;;) {
		G_RAID3_DEBUG(5, "%s: Let's see...", __func__);
		/*
		 * First take a look at events.
		 * This is important to handle events before any I/O requests.
		 */
		ep = g_raid3_event_get(sc);
		if (ep != NULL) {
			g_topology_lock();
			if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) {
				/* Update only device status. */
				G_RAID3_DEBUG(3,
				    "Running event for device %s.",
				    sc->sc_name);
				ep->e_error = 0;
				g_raid3_update_device(sc, 1);
			} else {
				/* Update disk status. */
				G_RAID3_DEBUG(3, "Running event for disk %s.",
				     g_raid3_get_diskname(ep->e_disk));
				ep->e_error = g_raid3_update_disk(ep->e_disk,
				    ep->e_state);
				if (ep->e_error == 0)
					g_raid3_update_device(sc, 0);
			}
			g_topology_unlock();
			if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) {
				KASSERT(ep->e_error == 0,
				    ("Error cannot be handled."));
				g_raid3_event_free(ep);
			} else {
				/* The requester sleeps on ep; wake it up. */
				ep->e_flags |= G_RAID3_EVENT_DONE;
				G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
				    ep);
				mtx_lock(&sc->sc_events_mtx);
				wakeup(ep);
				mtx_unlock(&sc->sc_events_mtx);
			}
			if ((sc->sc_flags &
			    G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
				if (g_raid3_try_destroy(sc))
					kthread_exit(0);
			}
			G_RAID3_DEBUG(5, "%s: I'm here 1.", __func__);
			continue;
		}
		/*
		 * Now I/O requests.
		 */
		/* Get first request from the queue. */
		mtx_lock(&sc->sc_queue_mtx);
		bp = bioq_first(&sc->sc_queue);
		if (bp == NULL) {
			if ((sc->sc_flags &
			    G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
				mtx_unlock(&sc->sc_queue_mtx);
				if (g_raid3_try_destroy(sc))
					kthread_exit(0);
				mtx_lock(&sc->sc_queue_mtx);
			}
		}
		/*
		 * Interleave synchronization: when the queue is empty or
		 * enough regular requests have been processed, issue a
		 * sync request.
		 */
		if (sc->sc_syncdisk != NULL &&
		    (bp == NULL || nreqs > g_raid3_reqs_per_sync)) {
			mtx_unlock(&sc->sc_queue_mtx);
			/*
			 * It is time for synchronization...
			 */
			nreqs = 0;
			disk = sc->sc_syncdisk;
			sync = &disk->d_sync;
			if (sync->ds_offset <
			    sc->sc_mediasize / (sc->sc_ndisks - 1) &&
			    sync->ds_offset == sync->ds_offset_done) {
				/* Restart from ds_resync when requested. */
				if (sync->ds_resync != -1) {
					sync->ds_offset = sync->ds_resync;
					sync->ds_offset_done = sync->ds_resync;
					sync->ds_resync = -1;
				}
				g_raid3_sync_one(sc);
			}
			G_RAID3_DEBUG(5, "%s: I'm here 2.", __func__);
			goto sleep;
		}
		if (bp == NULL) {
			/* Nothing to do; sleep, possibly marking idle. */
			if (g_raid3_check_idle(sc)) {
				u_int idletime;

				idletime = g_raid3_idletime;
				if (idletime == 0)
					idletime = 1;
				idletime *= hz;
				if (msleep(sc, &sc->sc_queue_mtx, PRIBIO | PDROP,
				    "r3:w1", idletime) == EWOULDBLOCK) {
					G_RAID3_DEBUG(5, "%s: I'm here 3.",
					    __func__);
					/*
					 * No I/O requests in 'idletime'
					 * seconds, so mark components as clean.
					 */
					g_raid3_idle(sc);
				}
				G_RAID3_DEBUG(5, "%s: I'm here 4.", __func__);
			} else {
				MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP,
				    "r3:w2", 0);
				G_RAID3_DEBUG(5, "%s: I'm here 5.", __func__);
			}
			continue;
		}
		nreqs++;
		bioq_remove(&sc->sc_queue, bp);
		mtx_unlock(&sc->sc_queue_mtx);

		if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0) {
			g_raid3_regular_request(bp);
		} else if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0) {
			u_int timeout, sps;

			g_raid3_sync_request(bp);
sleep:
			/* Throttle synchronization to syncs_per_sec. */
			sps = atomic_load_acq_int(&g_raid3_syncs_per_sec);
			if (sps == 0) {
				G_RAID3_DEBUG(5, "%s: I'm here 5.", __func__);
				continue;
			}
			mtx_lock(&sc->sc_queue_mtx);
			if (bioq_first(&sc->sc_queue) != NULL) {
				mtx_unlock(&sc->sc_queue_mtx);
				G_RAID3_DEBUG(5, "%s: I'm here 4.", __func__);
				continue;
			}
			timeout = hz / sps;
			if (timeout == 0)
				timeout = 1;
			MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:w2",
			    timeout);
		} else {
			/* New request; ENOMEM means re-queue and back off. */
			if (g_raid3_register_request(bp) != 0) {
				mtx_lock(&sc->sc_queue_mtx);
				bioq_insert_tail(&sc->sc_queue, bp);
				MSLEEP(&sc->sc_queue, &sc->sc_queue_mtx,
				    PRIBIO | PDROP, "r3:lowmem", hz / 10);
			}
		}
		G_RAID3_DEBUG(5, "%s: I'm here 6.", __func__);
	}
}
1846 
1847 /*
1848  * Open disk's consumer if needed.
1849  */
static void
g_raid3_update_access(struct g_raid3_disk *disk)
{
	struct g_provider *pp;
	struct g_consumer *cp;
	int acr, acw, ace, cpw, error;

	g_topology_assert();

	cp = disk->d_consumer;
	pp = disk->d_softc->sc_provider;
	if (pp == NULL) {
		/* No provider: drop every access count we hold. */
		acr = -cp->acr;
		acw = -cp->acw;
		ace = -cp->ace;
	} else {
		/* Mirror the provider's access counts onto the consumer. */
		acr = pp->acr - cp->acr;
		acw = pp->acw - cp->acw;
		ace = pp->ace - cp->ace;
		/* Grab an extra "exclusive" bit. */
		if (pp->acr > 0 || pp->acw > 0 || pp->ace > 0)
			ace++;
	}
	if (acr == 0 && acw == 0 && ace == 0)
		return;
	cpw = cp->acw;	/* Remember write count to detect transitions. */
	error = g_access(cp, acr, acw, ace);
	G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", cp->provider->name, acr,
	    acw, ace, error);
	if (error != 0) {
		/* Could not open the component; disconnect it. */
		disk->d_softc->sc_bump_syncid = G_RAID3_BUMP_ON_FIRST_WRITE;
		g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
		    G_RAID3_EVENT_DONTWAIT);
		return;
	}
	/* Track dirty state on write-open/write-close transitions. */
	if (cpw == 0 && cp->acw > 0) {
		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
		    g_raid3_get_diskname(disk), disk->d_softc->sc_name);
		disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
	} else if (cpw > 0 && cp->acw == 0) {
		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
		    g_raid3_get_diskname(disk), disk->d_softc->sc_name);
		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
	}
}
1895 
/*
 * Begin rebuilding the first disk found in SYNCHRONIZING state: open
 * its consumer for writing and create a dedicated sync consumer that
 * reads back from our own provider.
 */
static void
g_raid3_sync_start(struct g_raid3_softc *sc)
{
	struct g_raid3_disk *disk;
	struct g_consumer *cp;
	int error;
	u_int n;

	g_topology_assert();

	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
	    ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
	    sc->sc_state));
	KASSERT(sc->sc_syncdisk == NULL, ("Syncdisk is not NULL (%s, %u).",
	    sc->sc_name, sc->sc_state));
	/* Find the first disk marked for synchronization. */
	disk = NULL;
	for (n = 0; n < sc->sc_ndisks; n++) {
		if (sc->sc_disks[n].d_state != G_RAID3_DISK_STATE_SYNCHRONIZING)
			continue;
		disk = &sc->sc_disks[n];
		break;
	}
	if (disk == NULL)
		return;
	cp = disk->d_consumer;
	KASSERT(cp->acr == 0 && cp->acw == 0 && cp->ace == 0,
	    ("Consumer %s already opened.", cp->provider->name));

	G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name,
	    g_raid3_get_diskname(disk));
	/* Open the component for writing the reconstructed data. */
	error = g_access(cp, 0, 1, 1);
	G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", cp->provider->name, 0, 1,
	    1, error);
	if (error != 0) {
		g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
		    G_RAID3_EVENT_DONTWAIT);
		return;
	}
	disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
	KASSERT(disk->d_sync.ds_consumer == NULL,
	    ("Sync consumer already exists (device=%s, disk=%s).",
	    sc->sc_name, g_raid3_get_diskname(disk)));
	/* Reader consumer attached to our own provider. */
	disk->d_sync.ds_consumer = g_new_consumer(sc->sc_sync.ds_geom);
	disk->d_sync.ds_consumer->private = disk;
	disk->d_sync.ds_consumer->index = 0;
	error = g_attach(disk->d_sync.ds_consumer, disk->d_softc->sc_provider);
	KASSERT(error == 0, ("Cannot attach to %s (error=%d).",
	    disk->d_softc->sc_name, error));
	error = g_access(disk->d_sync.ds_consumer, 1, 0, 0);
	KASSERT(error == 0, ("Cannot open %s (error=%d).",
	    disk->d_softc->sc_name, error));
	disk->d_sync.ds_data = malloc(MAXPHYS, M_RAID3, M_WAITOK);
	sc->sc_syncdisk = disk;
}
1950 
1951 /*
1952  * Stop synchronization process.
1953  * type: 0 - synchronization finished
1954  *       1 - synchronization stopped
1955  */
static void
g_raid3_sync_stop(struct g_raid3_softc *sc, int type)
{
	struct g_raid3_disk *disk;
	struct g_consumer *cp;

	g_topology_assert();
	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
	    ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
	    sc->sc_state));
	disk = sc->sc_syncdisk;
	sc->sc_syncdisk = NULL;
	KASSERT(disk != NULL, ("No disk was synchronized (%s).", sc->sc_name));
	KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
	    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
	    g_raid3_disk_state2str(disk->d_state)));
	/* Synchronization never actually started; nothing to tear down. */
	if (disk->d_sync.ds_consumer == NULL)
		return;

	if (type == 0) {
		G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s finished.",
		    disk->d_softc->sc_name, g_raid3_get_diskname(disk));
	} else /* if (type == 1) */ {
		G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s stopped.",
		    disk->d_softc->sc_name, g_raid3_get_diskname(disk));
	}
	/* Tear down the sync reader and its buffer. */
	cp = disk->d_sync.ds_consumer;
	g_access(cp, -1, 0, 0);
	g_raid3_kill_consumer(disk->d_softc, cp);
	free(disk->d_sync.ds_data, M_RAID3);
	disk->d_sync.ds_consumer = NULL;
	/* Close the write access taken in g_raid3_sync_start(). */
	cp = disk->d_consumer;
	KASSERT(cp->acr == 0 && cp->acw == 1 && cp->ace == 1,
	    ("Consumer %s not opened.", cp->provider->name));
	g_access(cp, 0, -1, -1);
	G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", cp->provider->name, 0, -1,
	    -1, 0);
	disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
}
1995 
1996 static void
1997 g_raid3_launch_provider(struct g_raid3_softc *sc)
1998 {
1999 	struct g_provider *pp;
2000 
2001 	g_topology_assert();
2002 
2003 	pp = g_new_providerf(sc->sc_geom, "raid3/%s", sc->sc_name);
2004 	pp->mediasize = sc->sc_mediasize;
2005 	pp->sectorsize = sc->sc_sectorsize;
2006 	sc->sc_provider = pp;
2007 	g_error_provider(pp, 0);
2008 	G_RAID3_DEBUG(0, "Device %s: provider %s launched.", sc->sc_name,
2009 	    pp->name);
2010 	if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED)
2011 		g_raid3_sync_start(sc);
2012 }
2013 
2014 static void
2015 g_raid3_destroy_provider(struct g_raid3_softc *sc)
2016 {
2017 	struct bio *bp;
2018 
2019 	g_topology_assert();
2020 	KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).",
2021 	    sc->sc_name));
2022 
2023 	g_error_provider(sc->sc_provider, ENXIO);
2024 	mtx_lock(&sc->sc_queue_mtx);
2025 	while ((bp = bioq_first(&sc->sc_queue)) != NULL) {
2026 		bioq_remove(&sc->sc_queue, bp);
2027 		g_io_deliver(bp, ENXIO);
2028 	}
2029 	mtx_unlock(&sc->sc_queue_mtx);
2030 	G_RAID3_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name,
2031 	    sc->sc_provider->name);
2032 	sc->sc_provider->flags |= G_PF_WITHER;
2033 	g_orphan_provider(sc->sc_provider, ENXIO);
2034 	sc->sc_provider = NULL;
2035 	if (sc->sc_syncdisk != NULL)
2036 		g_raid3_sync_stop(sc, 1);
2037 }
2038 
2039 static void
2040 g_raid3_go(void *arg)
2041 {
2042 	struct g_raid3_softc *sc;
2043 
2044 	sc = arg;
2045 	G_RAID3_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name);
2046 	g_raid3_event_send(sc, 0,
2047 	    G_RAID3_EVENT_DONTWAIT | G_RAID3_EVENT_DEVICE);
2048 }
2049 
/*
 * Decide the initial state for a newly arrived disk by comparing its
 * syncid with the device's syncid.  May destroy the disk (and return
 * G_RAID3_DISK_STATE_NONE) when the disk is fresher than the running
 * device.
 */
static u_int
g_raid3_determine_state(struct g_raid3_disk *disk)
{
	struct g_raid3_softc *sc;
	u_int state;

	sc = disk->d_softc;
	if (sc->sc_syncid == disk->d_sync.ds_syncid) {
		if ((disk->d_flags &
		    G_RAID3_DISK_FLAG_SYNCHRONIZING) == 0) {
			/* Disk does not need synchronization. */
			state = G_RAID3_DISK_STATE_ACTIVE;
		} else {
			if ((sc->sc_flags &
			     G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0  ||
			    (disk->d_flags &
			     G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
				/*
				 * We can start synchronization from
				 * the stored offset.
				 */
				state = G_RAID3_DISK_STATE_SYNCHRONIZING;
			} else {
				state = G_RAID3_DISK_STATE_STALE;
			}
		}
	} else if (disk->d_sync.ds_syncid < sc->sc_syncid) {
		/*
		 * Reset all synchronization data for this disk,
		 * because if it even was synchronized, it was
		 * synchronized to disks with different syncid.
		 */
		disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
		disk->d_sync.ds_offset = 0;
		disk->d_sync.ds_offset_done = 0;
		disk->d_sync.ds_syncid = sc->sc_syncid;
		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
		    (disk->d_flags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
			state = G_RAID3_DISK_STATE_SYNCHRONIZING;
		} else {
			state = G_RAID3_DISK_STATE_STALE;
		}
	} else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ {
		/*
		 * Bad situation: the device was started on stale disks
		 * and a fresher disk has just arrived.  If any writes
		 * happened in the meantime, the device contents are
		 * inconsistent.  The safest choice is not to touch
		 * this disk and to inform the user loudly.
		 */
		G_RAID3_DEBUG(0, "Device %s was started before the freshest "
		    "disk (%s) arrives!! It will not be connected to the "
		    "running device.", sc->sc_name,
		    g_raid3_get_diskname(disk));
		g_raid3_destroy_disk(disk);
		state = G_RAID3_DISK_STATE_NONE;
		/* Return immediately, because disk was destroyed. */
		return (state);
	}
	G_RAID3_DEBUG(3, "State for %s disk: %s.",
	    g_raid3_get_diskname(disk), g_raid3_disk_state2str(state));
	return (state);
}
2113 }
2114 
2115 /*
2116  * Update device state.
2117  */
static void
g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force)
{
	struct g_raid3_disk *disk;
	u_int state;

	g_topology_assert();

	switch (sc->sc_state) {
	case G_RAID3_DEVICE_STATE_STARTING:
	    {
		u_int n, ndirty, ndisks, syncid;

		KASSERT(sc->sc_provider == NULL,
		    ("Non-NULL provider in STARTING state (%s).", sc->sc_name));
		/*
		 * Are we ready? We are, if all disks are connected or
		 * one disk is missing and 'force' is true.
		 */
		if (g_raid3_ndisks(sc, -1) + force == sc->sc_ndisks) {
			if (!force)
				callout_drain(&sc->sc_callout);
		} else {
			if (force) {
				/*
				 * Timeout expired, so destroy device.
				 */
				sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
			}
			return;
		}

		/*
		 * There must be at least 'sc->sc_ndisks - 1' components
		 * with the same syncid and without SYNCHRONIZING flag.
		 */

		/*
		 * Find the biggest syncid, number of valid components and
		 * number of dirty components.
		 */
		ndirty = ndisks = syncid = 0;
		for (n = 0; n < sc->sc_ndisks; n++) {
			disk = &sc->sc_disks[n];
			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
				continue;
			if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0)
				ndirty++;
			if (disk->d_sync.ds_syncid > syncid) {
				/*
				 * Fresher syncid found - restart the count
				 * of valid components from scratch.
				 */
				syncid = disk->d_sync.ds_syncid;
				ndisks = 0;
			} else if (disk->d_sync.ds_syncid < syncid) {
				/* Older generation - does not count. */
				continue;
			}
			if ((disk->d_flags &
			    G_RAID3_DISK_FLAG_SYNCHRONIZING) != 0) {
				continue;
			}
			ndisks++;
		}
		/*
		 * Do we have enough valid components?
		 */
		if (ndisks + 1 < sc->sc_ndisks) {
			G_RAID3_DEBUG(0,
			    "Device %s is broken, too few valid components.",
			    sc->sc_name);
			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
			return;
		}
		/*
		 * If there is one DIRTY component and all disks are present,
		 * mark it for synchronization. If there is more than one DIRTY
		 * component, mark parity component for synchronization.
		 */
		if (ndisks == sc->sc_ndisks && ndirty == 1) {
			for (n = 0; n < sc->sc_ndisks; n++) {
				disk = &sc->sc_disks[n];
				if ((disk->d_flags &
				    G_RAID3_DISK_FLAG_DIRTY) == 0) {
					continue;
				}
				disk->d_flags |=
				    G_RAID3_DISK_FLAG_SYNCHRONIZING;
			}
		} else if (ndisks == sc->sc_ndisks && ndirty > 1) {
			/* Parity lives on the last component. */
			disk = &sc->sc_disks[sc->sc_ndisks - 1];
			disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
		}

		sc->sc_syncid = syncid;
		if (force) {
			/* Remember to bump syncid on first write. */
			sc->sc_bump_syncid = G_RAID3_BUMP_ON_FIRST_WRITE;
		}
		if (ndisks == sc->sc_ndisks)
			state = G_RAID3_DEVICE_STATE_COMPLETE;
		else /* if (ndisks == sc->sc_ndisks - 1) */
			state = G_RAID3_DEVICE_STATE_DEGRADED;
		G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.",
		    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
		    g_raid3_device_state2str(state));
		sc->sc_state = state;
		/*
		 * Push every connected component towards the state computed
		 * from its syncid and flags (via the worker event queue).
		 */
		for (n = 0; n < sc->sc_ndisks; n++) {
			disk = &sc->sc_disks[n];
			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
				continue;
			state = g_raid3_determine_state(disk);
			g_raid3_event_send(disk, state, G_RAID3_EVENT_DONTWAIT);
			if (state == G_RAID3_DISK_STATE_STALE) {
				sc->sc_bump_syncid =
				    G_RAID3_BUMP_ON_FIRST_WRITE;
			}
		}
		break;
	    }
	case G_RAID3_DEVICE_STATE_DEGRADED:
		/*
		 * Bump syncid here, if we need to do it immediately.
		 */
		if (sc->sc_bump_syncid == G_RAID3_BUMP_IMMEDIATELY) {
			sc->sc_bump_syncid = 0;
			g_raid3_bump_syncid(sc);
		}
		/* Wait until all NEW disks have been processed. */
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
			return;
		/*
		 * With fewer than ndisks-1 ACTIVE components the device
		 * cannot serve requests any more - tear it down.
		 */
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) <
		    sc->sc_ndisks - 1) {
			if (sc->sc_provider != NULL)
				g_raid3_destroy_provider(sc);
			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
			return;
		}
		/* All components back - promote to COMPLETE. */
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
		    sc->sc_ndisks) {
			state = G_RAID3_DEVICE_STATE_COMPLETE;
			G_RAID3_DEBUG(1,
			    "Device %s state changed from %s to %s.",
			    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
			    g_raid3_device_state2str(state));
			sc->sc_state = state;
		}
		if (sc->sc_provider == NULL)
			g_raid3_launch_provider(sc);
		break;
	case G_RAID3_DEVICE_STATE_COMPLETE:
		/*
		 * Bump syncid here, if we need to do it immediately.
		 */
		if (sc->sc_bump_syncid == G_RAID3_BUMP_IMMEDIATELY) {
			sc->sc_bump_syncid = 0;
			g_raid3_bump_syncid(sc);
		}
		/* Wait until all NEW disks have been processed. */
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
			return;
		KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) >=
		    sc->sc_ndisks - 1,
		    ("Too few ACTIVE components in COMPLETE state (device %s).",
		    sc->sc_name));
		/* One ACTIVE component short - degrade the device. */
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
		    sc->sc_ndisks - 1) {
			state = G_RAID3_DEVICE_STATE_DEGRADED;
			G_RAID3_DEBUG(1,
			    "Device %s state changed from %s to %s.",
			    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
			    g_raid3_device_state2str(state));
			sc->sc_state = state;
		}
		if (sc->sc_provider == NULL)
			g_raid3_launch_provider(sc);
		break;
	default:
		KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state)));
		break;
	}
}
2295 
2296 /*
2297  * Update disk state and device state if needed.
2298  */
#define	DISK_STATE_CHANGED()	G_RAID3_DEBUG(1,			\
	"Disk %s state changed from %s to %s (device %s).",		\
	g_raid3_get_diskname(disk),					\
	g_raid3_disk_state2str(disk->d_state),				\
	g_raid3_disk_state2str(state), sc->sc_name)
/*
 * Apply a single disk state transition.  Called with the topology lock
 * held.  May loop back (via 'again') when a NEW disk is immediately
 * promoted to its computed target state.  Always returns 0.
 */
static int
g_raid3_update_disk(struct g_raid3_disk *disk, u_int state)
{
	struct g_raid3_softc *sc;

	g_topology_assert();

	sc = disk->d_softc;
again:
	G_RAID3_DEBUG(3, "Changing disk %s state from %s to %s.",
	    g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state),
	    g_raid3_disk_state2str(state));
	switch (state) {
	case G_RAID3_DISK_STATE_NEW:
		/*
		 * Possible scenarios:
		 * 1. New disk arrive.
		 */
		/* Previous state should be NONE. */
		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NONE,
		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		DISK_STATE_CHANGED();

		disk->d_state = state;
		G_RAID3_DEBUG(0, "Device %s: provider %s detected.",
		    sc->sc_name, g_raid3_get_diskname(disk));
		/* In STARTING state the device decides what to do later. */
		if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING)
			break;
		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		/*
		 * Running device: compute the disk's target state and
		 * re-enter the switch to apply it right away.
		 */
		state = g_raid3_determine_state(disk);
		if (state != G_RAID3_DISK_STATE_NONE)
			goto again;
		break;
	case G_RAID3_DISK_STATE_ACTIVE:
		/*
		 * Possible scenarios:
		 * 1. New disk does not need synchronization.
		 * 2. Synchronization process finished successfully.
		 */
		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		/* Previous state should be NEW or SYNCHRONIZING. */
		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW ||
		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		DISK_STATE_CHANGED();

		if (disk->d_state == G_RAID3_DISK_STATE_NEW)
			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
		else if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
			/* Synchronization finished - clear sync flags. */
			disk->d_flags &= ~G_RAID3_DISK_FLAG_SYNCHRONIZING;
			disk->d_flags &= ~G_RAID3_DISK_FLAG_FORCE_SYNC;
			g_raid3_sync_stop(sc, 0);
		}
		disk->d_state = state;
		/* Synchronization progress is meaningless for ACTIVE. */
		disk->d_sync.ds_offset = 0;
		disk->d_sync.ds_offset_done = 0;
		g_raid3_update_access(disk);
		g_raid3_update_metadata(disk);
		G_RAID3_DEBUG(0, "Device %s: provider %s activated.",
		    sc->sc_name, g_raid3_get_diskname(disk));
		break;
	case G_RAID3_DISK_STATE_STALE:
		/*
		 * Possible scenarios:
		 * 1. Stale disk was connected.
		 */
		/* Previous state should be NEW. */
		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		/*
		 * STALE state is only possible if device is marked
		 * NOAUTOSYNC.
		 */
		KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		DISK_STATE_CHANGED();

		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
		disk->d_state = state;
		g_raid3_update_metadata(disk);
		G_RAID3_DEBUG(0, "Device %s: provider %s is stale.",
		    sc->sc_name, g_raid3_get_diskname(disk));
		break;
	case G_RAID3_DISK_STATE_SYNCHRONIZING:
		/*
		 * Possible scenarios:
		 * 1. Disk which needs synchronization was connected.
		 */
		/* Previous state should be NEW. */
		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		DISK_STATE_CHANGED();

		if (disk->d_state == G_RAID3_DISK_STATE_NEW)
			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
		disk->d_state = state;
		/*
		 * Synchronization can only run once the provider exists;
		 * otherwise it is started later from g_raid3_launch_provider.
		 */
		if (sc->sc_provider != NULL) {
			g_raid3_sync_start(sc);
			g_raid3_update_metadata(disk);
		}
		break;
	case G_RAID3_DISK_STATE_DISCONNECTED:
		/*
		 * Possible scenarios:
		 * 1. Device wasn't running yet, but disk disappear.
		 * 2. Disk was active and disapppear.
		 * 3. Disk disappear during synchronization process.
		 */
		if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
			/*
			 * Previous state should be ACTIVE, STALE or
			 * SYNCHRONIZING.
			 */
			KASSERT(disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
			    disk->d_state == G_RAID3_DISK_STATE_STALE ||
			    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
			    ("Wrong disk state (%s, %s).",
			    g_raid3_get_diskname(disk),
			    g_raid3_disk_state2str(disk->d_state)));
		} else if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) {
			/* Previous state should be NEW. */
			KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
			    ("Wrong disk state (%s, %s).",
			    g_raid3_get_diskname(disk),
			    g_raid3_disk_state2str(disk->d_state)));
			/*
			 * Reset bumping syncid if disk disappeared in STARTING
			 * state.
			 */
			if (sc->sc_bump_syncid == G_RAID3_BUMP_ON_FIRST_WRITE)
				sc->sc_bump_syncid = 0;
#ifdef	INVARIANTS
		} else {
			KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).",
			    sc->sc_name,
			    g_raid3_device_state2str(sc->sc_state),
			    g_raid3_get_diskname(disk),
			    g_raid3_disk_state2str(disk->d_state)));
#endif
		}
		DISK_STATE_CHANGED();
		G_RAID3_DEBUG(0, "Device %s: provider %s disconnected.",
		    sc->sc_name, g_raid3_get_diskname(disk));

		g_raid3_destroy_disk(disk);
		break;
	default:
		KASSERT(1 == 0, ("Unknown state (%u).", state));
		break;
	}
	return (0);
}
#undef	DISK_STATE_CHANGED
2487 
2488 static int
2489 g_raid3_read_metadata(struct g_consumer *cp, struct g_raid3_metadata *md)
2490 {
2491 	struct g_provider *pp;
2492 	u_char *buf;
2493 	int error;
2494 
2495 	g_topology_assert();
2496 
2497 	error = g_access(cp, 1, 0, 0);
2498 	if (error != 0)
2499 		return (error);
2500 	pp = cp->provider;
2501 	g_topology_unlock();
2502 	/* Metadata are stored on last sector. */
2503 	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
2504 	    &error);
2505 	g_topology_lock();
2506 	if (buf == NULL) {
2507 		g_access(cp, -1, 0, 0);
2508 		return (error);
2509 	}
2510 	if (error != 0) {
2511 		g_access(cp, -1, 0, 0);
2512 		g_free(buf);
2513 		return (error);
2514 	}
2515 	error = g_access(cp, -1, 0, 0);
2516 	KASSERT(error == 0, ("Cannot decrease access count for %s.", pp->name));
2517 
2518 	/* Decode metadata. */
2519 	error = raid3_metadata_decode(buf, md);
2520 	g_free(buf);
2521 	if (strcmp(md->md_magic, G_RAID3_MAGIC) != 0)
2522 		return (EINVAL);
2523 	if (error != 0) {
2524 		G_RAID3_DEBUG(1, "MD5 metadata hash mismatch for provider %s.",
2525 		    cp->provider->name);
2526 		return (error);
2527 	}
2528 
2529 	return (0);
2530 }
2531 
/*
 * Verify that the metadata read from component 'pp' is consistent with
 * the already-configured device 'sc'.  Returns 0 when the component may
 * be attached, EEXIST when its slot is already occupied, and EINVAL for
 * any other inconsistency.  Check order (and thus error precedence) is
 * fixed; the messages are matched by users/tests, so keep them as-is.
 */
static int
g_raid3_check_metadata(struct g_raid3_softc *sc, struct g_provider *pp,
    struct g_raid3_metadata *md)
{

	/* Component number must address an existing slot. */
	if (md->md_no >= sc->sc_ndisks) {
		G_RAID3_DEBUG(1, "Invalid disk %s number (no=%u), skipping.",
		    pp->name, md->md_no);
		return (EINVAL);
	}
	/* Refuse a component whose slot is already taken. */
	if (sc->sc_disks[md->md_no].d_state != G_RAID3_DISK_STATE_NODISK) {
		G_RAID3_DEBUG(1, "Disk %s (no=%u) already exists, skipping.",
		    pp->name, md->md_no);
		return (EEXIST);
	}
	/* Total component count must match the running device. */
	if (md->md_all != sc->sc_ndisks) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_all", pp->name, sc->sc_name);
		return (EINVAL);
	}
	if (md->md_mediasize != sc->sc_mediasize) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_mediasize", pp->name, sc->sc_name);
		return (EINVAL);
	}
	/* Device size must split evenly across the ndisks-1 data disks. */
	if ((md->md_mediasize % (sc->sc_ndisks - 1)) != 0) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_mediasize", pp->name, sc->sc_name);
		return (EINVAL);
	}
	/* The component must be big enough for its share of the data. */
	if ((sc->sc_mediasize / (sc->sc_ndisks - 1)) > pp->mediasize) {
		G_RAID3_DEBUG(1,
		    "Invalid size of disk %s (device %s), skipping.", pp->name,
		    sc->sc_name);
		return (EINVAL);
	}
	/*
	 * The device sector must cover at least (ndisks - 1) component
	 * sectors.
	 */
	if ((md->md_sectorsize / pp->sectorsize) < sc->sc_ndisks - 1) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_sectorsize", pp->name, sc->sc_name);
		return (EINVAL);
	}
	if (md->md_sectorsize != sc->sc_sectorsize) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_sectorsize", pp->name, sc->sc_name);
		return (EINVAL);
	}
	/* Device sector size must be a multiple of the component's. */
	if ((sc->sc_sectorsize % pp->sectorsize) != 0) {
		G_RAID3_DEBUG(1,
		    "Invalid sector size of disk %s (device %s), skipping.",
		    pp->name, sc->sc_name);
		return (EINVAL);
	}
	/* Only known device flags may be set. */
	if ((md->md_mflags & ~G_RAID3_DEVICE_FLAG_MASK) != 0) {
		G_RAID3_DEBUG(1,
		    "Invalid device flags on disk %s (device %s), skipping.",
		    pp->name, sc->sc_name);
		return (EINVAL);
	}
	if ((md->md_mflags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
	    (md->md_mflags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0) {
		/*
		 * VERIFY and ROUND-ROBIN options are mutally exclusive.
		 */
		G_RAID3_DEBUG(1, "Both VERIFY and ROUND-ROBIN flags exist on "
		    "disk %s (device %s), skipping.", pp->name, sc->sc_name);
		return (EINVAL);
	}
	/* Only known disk flags may be set. */
	if ((md->md_dflags & ~G_RAID3_DISK_FLAG_MASK) != 0) {
		G_RAID3_DEBUG(1,
		    "Invalid disk flags on disk %s (device %s), skipping.",
		    pp->name, sc->sc_name);
		return (EINVAL);
	}
	return (0);
}
2612 
2613 static int
2614 g_raid3_add_disk(struct g_raid3_softc *sc, struct g_provider *pp,
2615     struct g_raid3_metadata *md)
2616 {
2617 	struct g_raid3_disk *disk;
2618 	int error;
2619 
2620 	g_topology_assert();
2621 	G_RAID3_DEBUG(2, "Adding disk %s.", pp->name);
2622 
2623 	error = g_raid3_check_metadata(sc, pp, md);
2624 	if (error != 0)
2625 		return (error);
2626 	disk = g_raid3_init_disk(sc, pp, md, &error);
2627 	if (disk == NULL)
2628 		return (error);
2629 	error = g_raid3_event_send(disk, G_RAID3_DISK_STATE_NEW,
2630 	    G_RAID3_EVENT_WAIT);
2631 	return (error);
2632 }
2633 
2634 static int
2635 g_raid3_access(struct g_provider *pp, int acr, int acw, int ace)
2636 {
2637 	struct g_raid3_softc *sc;
2638 	struct g_raid3_disk *disk;
2639 	int dcr, dcw, dce, err, error;
2640 	u_int n;
2641 
2642 	g_topology_assert();
2643 	G_RAID3_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr,
2644 	    acw, ace);
2645 
2646 	dcr = pp->acr + acr;
2647 	dcw = pp->acw + acw;
2648 	dce = pp->ace + ace;
2649 
2650 	/* On first open, grab an extra "exclusive" bit */
2651 	if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0)
2652 		ace++;
2653 	/* ... and let go of it on last close */
2654 	if (dcr == 0 && dcw == 0 && dce == 0)
2655 		ace--;
2656 
2657 	sc = pp->geom->softc;
2658 	if (sc == NULL ||
2659 	    g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks - 1 ||
2660 	    (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
2661 		if (acr <= 0 && acw <= 0 && ace <= 0)
2662 			return (0);
2663 		else
2664 			return (ENXIO);
2665 	}
2666 	error = ENXIO;
2667 	for (n = 0; n < sc->sc_ndisks; n++) {
2668 		disk = &sc->sc_disks[n];
2669 		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
2670 			continue;
2671 		err = g_access(disk->d_consumer, acr, acw, ace);
2672 		G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d",
2673 		    g_raid3_get_diskname(disk), acr, acw, ace, err);
2674 		if (err == 0) {
2675 			/*
2676 			 * Mark disk as dirty on open and unmark on close.
2677 			 */
2678 			if (pp->acw == 0 && dcw > 0) {
2679 				G_RAID3_DEBUG(1,
2680 				    "Disk %s (device %s) marked as dirty.",
2681 				    g_raid3_get_diskname(disk), sc->sc_name);
2682 				disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
2683 				g_raid3_update_metadata(disk);
2684 			} else if (pp->acw > 0 && dcw == 0) {
2685 				G_RAID3_DEBUG(1,
2686 				    "Disk %s (device %s) marked as clean.",
2687 				    g_raid3_get_diskname(disk), sc->sc_name);
2688 				disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2689 				g_raid3_update_metadata(disk);
2690 			}
2691 			error = 0;
2692 		} else {
2693 			sc->sc_bump_syncid = G_RAID3_BUMP_ON_FIRST_WRITE;
2694 			g_raid3_event_send(disk,
2695 			    G_RAID3_DISK_STATE_DISCONNECTED,
2696 			    G_RAID3_EVENT_DONTWAIT);
2697 		}
2698 	}
2699 	return (error);
2700 }
2701 
2702 static struct g_geom *
2703 g_raid3_create(struct g_class *mp, const struct g_raid3_metadata *md)
2704 {
2705 	struct g_raid3_softc *sc;
2706 	struct g_geom *gp;
2707 	int error, timeout;
2708 	u_int n;
2709 
2710 	g_topology_assert();
2711 	G_RAID3_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id);
2712 
2713 	/* One disk is minimum. */
2714 	if (md->md_all < 1)
2715 		return (NULL);
2716 	/*
2717 	 * Action geom.
2718 	 */
2719 	gp = g_new_geomf(mp, "%s", md->md_name);
2720 	sc = malloc(sizeof(*sc), M_RAID3, M_WAITOK | M_ZERO);
2721 	sc->sc_disks = malloc(sizeof(struct g_raid3_disk) * md->md_all, M_RAID3,
2722 	    M_WAITOK | M_ZERO);
2723 	gp->start = g_raid3_start;
2724 	gp->spoiled = g_raid3_spoiled;
2725 	gp->orphan = g_raid3_orphan;
2726 	gp->access = g_raid3_access;
2727 	gp->dumpconf = g_raid3_dumpconf;
2728 
2729 	sc->sc_id = md->md_id;
2730 	sc->sc_mediasize = md->md_mediasize;
2731 	sc->sc_sectorsize = md->md_sectorsize;
2732 	sc->sc_ndisks = md->md_all;
2733 	sc->sc_round_robin = 0;
2734 	sc->sc_flags = md->md_mflags;
2735 	sc->sc_bump_syncid = 0;
2736 	sc->sc_idle = 0;
2737 	for (n = 0; n < sc->sc_ndisks; n++)
2738 		sc->sc_disks[n].d_state = G_RAID3_DISK_STATE_NODISK;
2739 	bioq_init(&sc->sc_queue);
2740 	mtx_init(&sc->sc_queue_mtx, "graid3:queue", NULL, MTX_DEF);
2741 	TAILQ_INIT(&sc->sc_events);
2742 	mtx_init(&sc->sc_events_mtx, "graid3:events", NULL, MTX_DEF);
2743 	callout_init(&sc->sc_callout, CALLOUT_MPSAFE);
2744 	sc->sc_state = G_RAID3_DEVICE_STATE_STARTING;
2745 	gp->softc = sc;
2746 	sc->sc_geom = gp;
2747 	sc->sc_provider = NULL;
2748 	/*
2749 	 * Synchronization geom.
2750 	 */
2751 	gp = g_new_geomf(mp, "%s.sync", md->md_name);
2752 	gp->softc = sc;
2753 	gp->orphan = g_raid3_orphan;
2754 	sc->sc_sync.ds_geom = gp;
2755 	sc->sc_zone_64k = uma_zcreate("gr3:64k", 65536, NULL, NULL, NULL, NULL,
2756 	    UMA_ALIGN_PTR, 0);
2757 	uma_zone_set_max(sc->sc_zone_64k, g_raid3_n64k);
2758 	sc->sc_zone_16k = uma_zcreate("gr3:16k", 16384, NULL, NULL, NULL, NULL,
2759 	    UMA_ALIGN_PTR, 0);
2760 	uma_zone_set_max(sc->sc_zone_64k, g_raid3_n16k);
2761 	sc->sc_zone_4k = uma_zcreate("gr3:4k", 4096, NULL, NULL, NULL, NULL,
2762 	    UMA_ALIGN_PTR, 0);
2763 	uma_zone_set_max(sc->sc_zone_4k, g_raid3_n4k);
2764 	error = kthread_create(g_raid3_worker, sc, &sc->sc_worker, 0, 0,
2765 	    "g_raid3 %s", md->md_name);
2766 	if (error != 0) {
2767 		G_RAID3_DEBUG(1, "Cannot create kernel thread for %s.",
2768 		    sc->sc_name);
2769 		uma_zdestroy(sc->sc_zone_64k);
2770 		uma_zdestroy(sc->sc_zone_16k);
2771 		uma_zdestroy(sc->sc_zone_4k);
2772 		g_destroy_geom(sc->sc_sync.ds_geom);
2773 		mtx_destroy(&sc->sc_events_mtx);
2774 		mtx_destroy(&sc->sc_queue_mtx);
2775 		g_destroy_geom(sc->sc_geom);
2776 		free(sc->sc_disks, M_RAID3);
2777 		free(sc, M_RAID3);
2778 		return (NULL);
2779 	}
2780 
2781 	G_RAID3_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id);
2782 
2783 	/*
2784 	 * Run timeout.
2785 	 */
2786 	timeout = atomic_load_acq_int(&g_raid3_timeout);
2787 	callout_reset(&sc->sc_callout, timeout * hz, g_raid3_go, sc);
2788 	return (sc->sc_geom);
2789 }
2790 
/*
 * Tear down the whole device.  If the provider is still open the
 * request is refused with EBUSY unless 'force' is set (then it proceeds
 * with a warning).  Marks the softc for destruction, wakes the worker
 * and waits for it to exit, then frees all device state.
 * Called and returns with the topology lock held; the lock is dropped
 * while waiting for the worker.
 */
int
g_raid3_destroy(struct g_raid3_softc *sc, boolean_t force)
{
	struct g_provider *pp;

	g_topology_assert();

	if (sc == NULL)
		return (ENXIO);
	pp = sc->sc_provider;
	if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
		if (force) {
			G_RAID3_DEBUG(0, "Device %s is still open, so it "
			    "can't be definitely removed.", pp->name);
		} else {
			G_RAID3_DEBUG(1,
			    "Device %s is still open (r%dw%de%d).", pp->name,
			    pp->acr, pp->acw, pp->ace);
			return (EBUSY);
		}
	}

	sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
	sc->sc_flags |= G_RAID3_DEVICE_FLAG_WAIT;
	g_topology_unlock();
	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
	/*
	 * Wake both channels the worker may be sleeping on (events and
	 * the request queue), under the queue mutex.
	 */
	mtx_lock(&sc->sc_queue_mtx);
	wakeup(sc);
	wakeup(&sc->sc_queue);
	mtx_unlock(&sc->sc_queue_mtx);
	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker);
	/*
	 * Poll (at hz/5) until sc_worker is cleared - presumably by the
	 * exiting worker thread; confirm against the worker code.
	 */
	while (sc->sc_worker != NULL)
		tsleep(&sc->sc_worker, PRIBIO, "r3:destroy", hz / 5);
	G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker);
	g_topology_lock();
	g_raid3_destroy_device(sc);
	free(sc->sc_disks, M_RAID3);
	free(sc, M_RAID3);
	return (0);
}
2831 
static void
g_raid3_taste_orphan(struct g_consumer *cp)
{

	/*
	 * Consumers used for tasting are attached and detached within
	 * g_raid3_taste() under the topology lock, so they can never be
	 * orphaned; reaching this function is a bug.
	 */
	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
	    cp->provider->name));
}
2839 
/*
 * Taste a provider: read and verify raid3 metadata from it and, when it
 * belongs to a known (or freshly created) device, attach it as a
 * component.  Returns the device's geom on success, NULL otherwise.
 */
static struct g_geom *
g_raid3_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
{
	struct g_raid3_metadata md;
	struct g_raid3_softc *sc;
	struct g_consumer *cp;
	struct g_geom *gp;
	int error;

	g_topology_assert();
	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
	G_RAID3_DEBUG(2, "Tasting %s.", pp->name);

	/*
	 * Use a throwaway geom/consumer pair just to read the metadata;
	 * both are destroyed again before any decision is made.
	 */
	gp = g_new_geomf(mp, "raid3:taste");
	/* This orphan function should be never called. */
	gp->orphan = g_raid3_taste_orphan;
	cp = g_new_consumer(gp);
	g_attach(cp, pp);
	error = g_raid3_read_metadata(cp, &md);
	g_detach(cp);
	g_destroy_consumer(cp);
	g_destroy_geom(gp);
	if (error != 0)
		return (NULL);
	gp = NULL;

	if (md.md_version > G_RAID3_VERSION) {
		printf("geom_raid3.ko module is too old to handle %s.\n",
		    pp->name);
		return (NULL);
	}
	/* A hardcoded provider name must match this provider exactly. */
	if (md.md_provider[0] != '\0' && strcmp(md.md_provider, pp->name) != 0)
		return (NULL);
	if (g_raid3_debug >= 2)
		raid3_metadata_dump(&md);

	/*
	 * Let's check if device already exists.
	 */
	sc = NULL;
	LIST_FOREACH(gp, &mp->geom, geom) {
		sc = gp->softc;
		if (sc == NULL)
			continue;
		/* Skip synchronization geoms - they share the softc. */
		if (sc->sc_sync.ds_geom == gp)
			continue;
		if (strcmp(md.md_name, sc->sc_name) != 0)
			continue;
		/* Same name but different id - refuse to mix devices. */
		if (md.md_id != sc->sc_id) {
			G_RAID3_DEBUG(0, "Device %s already configured.",
			    sc->sc_name);
			return (NULL);
		}
		break;
	}
	/* gp == NULL here means the loop found no matching device. */
	if (gp == NULL) {
		gp = g_raid3_create(mp, &md);
		if (gp == NULL) {
			G_RAID3_DEBUG(0, "Cannot create device %s.",
			    md.md_name);
			return (NULL);
		}
		sc = gp->softc;
	}
	G_RAID3_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
	error = g_raid3_add_disk(sc, pp, &md);
	if (error != 0) {
		G_RAID3_DEBUG(0, "Cannot add disk %s to %s (error=%d).",
		    pp->name, gp->name, error);
		/* If no component made it in, destroy the empty device. */
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NODISK) ==
		    sc->sc_ndisks) {
			g_raid3_destroy(sc, 1);
		}
		return (NULL);
	}
	return (gp);
}
2917 
2918 static int
2919 g_raid3_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused,
2920     struct g_geom *gp)
2921 {
2922 
2923 	return (g_raid3_destroy(gp->softc, 0));
2924 }
2925 
/*
 * Emit XML configuration into 'sb' describing either one component
 * (cp != NULL), the provider (pp != NULL - nothing extra emitted), or
 * the device as a whole (both NULL).  Output strings are part of the
 * external XML format - do not change them.
 */
static void
g_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp, struct g_provider *pp)
{
	struct g_raid3_softc *sc;

	g_topology_assert();

	sc = gp->softc;
	if (sc == NULL)
		return;
	/* Skip synchronization geom. */
	if (gp == sc->sc_sync.ds_geom)
		return;
	if (pp != NULL) {
		/* Nothing here. */
	} else if (cp != NULL) {
		struct g_raid3_disk *disk;

		disk = cp->private;
		if (disk == NULL)
			return;
		/* The last component holds parity, the rest hold data. */
		sbuf_printf(sb, "%s<Type>", indent);
		if (disk->d_no == sc->sc_ndisks - 1)
			sbuf_printf(sb, "PARITY");
		else
			sbuf_printf(sb, "DATA");
		sbuf_printf(sb, "</Type>\n");
		sbuf_printf(sb, "%s<Number>%u</Number>\n", indent,
		    (u_int)disk->d_no);
		if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
			sbuf_printf(sb, "%s<Synchronized>", indent);
			if (disk->d_sync.ds_offset_done == 0)
				sbuf_printf(sb, "0%%");
			else {
				/*
				 * Percentage of the per-disk data size
				 * (mediasize / (ndisks - 1)) already done.
				 */
				sbuf_printf(sb, "%u%%",
				    (u_int)((disk->d_sync.ds_offset_done * 100) /
				    (sc->sc_mediasize / (sc->sc_ndisks - 1))));
			}
			sbuf_printf(sb, "</Synchronized>\n");
		}
		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent,
		    disk->d_sync.ds_syncid);
		sbuf_printf(sb, "%s<Flags>", indent);
		if (disk->d_flags == 0)
			sbuf_printf(sb, "NONE");
		else {
			int first = 1;

/* Emit a comma-separated flag name when the disk has 'flag' set. */
#define	ADD_FLAG(flag, name)	do {					\
	if ((disk->d_flags & (flag)) != 0) {				\
		if (!first)						\
			sbuf_printf(sb, ", ");				\
		else							\
			first = 0;					\
		sbuf_printf(sb, name);					\
	}								\
} while (0)
			ADD_FLAG(G_RAID3_DISK_FLAG_DIRTY, "DIRTY");
			ADD_FLAG(G_RAID3_DISK_FLAG_HARDCODED, "HARDCODED");
			ADD_FLAG(G_RAID3_DISK_FLAG_SYNCHRONIZING,
			    "SYNCHRONIZING");
			ADD_FLAG(G_RAID3_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC");
#undef	ADD_FLAG
		}
		sbuf_printf(sb, "</Flags>\n");
		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
		    g_raid3_disk_state2str(disk->d_state));
	} else {
		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent, sc->sc_syncid);
		sbuf_printf(sb, "%s<Flags>", indent);
		if (sc->sc_flags == 0)
			sbuf_printf(sb, "NONE");
		else {
			int first = 1;

/* Same pattern as above, but for device-level flags. */
#define	ADD_FLAG(flag, name)	do {					\
	if ((sc->sc_flags & (flag)) != 0) {				\
		if (!first)						\
			sbuf_printf(sb, ", ");				\
		else							\
			first = 0;					\
		sbuf_printf(sb, name);					\
	}								\
} while (0)
			ADD_FLAG(G_RAID3_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC");
			ADD_FLAG(G_RAID3_DEVICE_FLAG_ROUND_ROBIN,
			    "ROUND-ROBIN");
			ADD_FLAG(G_RAID3_DEVICE_FLAG_VERIFY, "VERIFY");
#undef	ADD_FLAG
		}
		sbuf_printf(sb, "</Flags>\n");
		sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
		    sc->sc_ndisks);
		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
		    g_raid3_device_state2str(sc->sc_state));
	}
}
3025 
3026 static void
3027 g_raid3_shutdown(void *arg, int howto)
3028 {
3029 	struct g_class *mp;
3030 	struct g_geom *gp, *gp2;
3031 
3032 	mp = arg;
3033 	DROP_GIANT();
3034 	g_topology_lock();
3035 	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
3036 		if (gp->softc == NULL)
3037 			continue;
3038 		g_raid3_destroy(gp->softc, 1);
3039 	}
3040 	g_topology_unlock();
3041 	PICKUP_GIANT();
3042 #if 0
3043 	tsleep(&gp, PRIBIO, "r3:shutdown", hz * 20);
3044 #endif
3045 }
3046 
3047 static void
3048 g_raid3_init(struct g_class *mp)
3049 {
3050 
3051 	g_raid3_ehtag = EVENTHANDLER_REGISTER(shutdown_post_sync,
3052 	    g_raid3_shutdown, mp, SHUTDOWN_PRI_FIRST);
3053 	if (g_raid3_ehtag == NULL)
3054 		G_RAID3_DEBUG(0, "Warning! Cannot register shutdown event.");
3055 }
3056 
3057 static void
3058 g_raid3_fini(struct g_class *mp)
3059 {
3060 
3061 	if (g_raid3_ehtag == NULL)
3062 		return;
3063 	EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_raid3_ehtag);
3064 }
3065 
/* Register g_raid3_class with the GEOM framework at module load. */
DECLARE_GEOM_CLASS(g_raid3_class, g_raid3);
3067