xref: /freebsd/sys/geom/raid3/g_raid3.c (revision 6af83ee0d2941d18880b6aaa2b4facd1d30c6106)
1 /*-
2  * Copyright (c) 2004 Pawel Jakub Dawidek <pjd@FreeBSD.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29 
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/kernel.h>
33 #include <sys/module.h>
34 #include <sys/limits.h>
35 #include <sys/lock.h>
36 #include <sys/mutex.h>
37 #include <sys/bio.h>
38 #include <sys/sysctl.h>
39 #include <sys/malloc.h>
40 #include <sys/eventhandler.h>
41 #include <vm/uma.h>
42 #include <geom/geom.h>
43 #include <sys/proc.h>
44 #include <sys/kthread.h>
45 #include <sys/sched.h>
46 #include <geom/raid3/g_raid3.h>
47 
48 
static MALLOC_DEFINE(M_RAID3, "raid3 data", "GEOM_RAID3 Data");

/*
 * Tunables and sysctls under kern.geom.raid3.
 * NOTE(review): TUNABLE_INT is used on u_int variables throughout; this
 * matches the era's idiom but relies on int/u_int layout compatibility.
 */
SYSCTL_DECL(_kern_geom);
SYSCTL_NODE(_kern_geom, OID_AUTO, raid3, CTLFLAG_RW, 0, "GEOM_RAID3 stuff");
/* Debug verbosity; G_RAID3_DEBUG() messages up to this level are printed. */
u_int g_raid3_debug = 0;
TUNABLE_INT("kern.geom.raid3.debug", &g_raid3_debug);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, debug, CTLFLAG_RW, &g_raid3_debug, 0,
    "Debug level");
static u_int g_raid3_timeout = 4;
TUNABLE_INT("kern.geom.raid3.timeout", &g_raid3_timeout);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, timeout, CTLFLAG_RW, &g_raid3_timeout,
    0, "Time to wait on all raid3 components");
/* Seconds of inactivity before components are marked clean (see g_raid3_idle()). */
static u_int g_raid3_idletime = 5;
TUNABLE_INT("kern.geom.raid3.idletime", &g_raid3_idletime);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, idletime, CTLFLAG_RW,
    &g_raid3_idletime, 0, "Mark components as clean when idling");
/* Ratio used to throttle synchronization against regular I/O. */
static u_int g_raid3_reqs_per_sync = 5;
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, reqs_per_sync, CTLFLAG_RW,
    &g_raid3_reqs_per_sync, 0,
    "Number of regular I/O requests per synchronization request");
static u_int g_raid3_syncs_per_sec = 1000;
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, syncs_per_sec, CTLFLAG_RW,
    &g_raid3_syncs_per_sec, 0,
    "Number of synchronizations requests per second");

/*
 * Sizing of the per-device UMA zones used for cloned-bio data buffers
 * (see g_raid3_clone_bio()/g_raid3_destroy_bio()).  Read-only at runtime,
 * settable as loader tunables.
 */
static u_int g_raid3_n64k = 50;
TUNABLE_INT("kern.geom.raid3.n64k", &g_raid3_n64k);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n64k, CTLFLAG_RD, &g_raid3_n64k, 0,
    "Maximum number of 64kB allocations");
static u_int g_raid3_n16k = 200;
TUNABLE_INT("kern.geom.raid3.n16k", &g_raid3_n16k);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n16k, CTLFLAG_RD, &g_raid3_n16k, 0,
    "Maximum number of 16kB allocations");
static u_int g_raid3_n4k = 1200;
TUNABLE_INT("kern.geom.raid3.n4k", &g_raid3_n4k);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n4k, CTLFLAG_RD, &g_raid3_n4k, 0,
    "Maximum number of 4kB allocations");

/* Statistics exported under kern.geom.raid3.stat (not serialized; best-effort counters). */
SYSCTL_NODE(_kern_geom_raid3, OID_AUTO, stat, CTLFLAG_RW, 0,
    "GEOM_RAID3 statistics");
static u_int g_raid3_parity_mismatch = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, parity_mismatch, CTLFLAG_RD,
    &g_raid3_parity_mismatch, 0, "Number of failures in VERIFY mode");
static u_int g_raid3_64k_requested = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 64k_requested, CTLFLAG_RD,
    &g_raid3_64k_requested, 0, "Number of requested 64kB allocations");
static u_int g_raid3_64k_failed = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 64k_failed, CTLFLAG_RD,
    &g_raid3_64k_failed, 0, "Number of failed 64kB allocations");
static u_int g_raid3_16k_requested = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 16k_requested, CTLFLAG_RD,
    &g_raid3_16k_requested, 0, "Number of requested 16kB allocations");
static u_int g_raid3_16k_failed = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 16k_failed, CTLFLAG_RD,
    &g_raid3_16k_failed, 0, "Number of failed 16kB allocations");
static u_int g_raid3_4k_requested = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 4k_requested, CTLFLAG_RD,
    &g_raid3_4k_requested, 0, "Number of requested 4kB allocations");
static u_int g_raid3_4k_failed = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 4k_failed, CTLFLAG_RD,
    &g_raid3_4k_failed, 0, "Number of failed 4kB allocations");
110 
/*
 * msleep(9) wrapper that logs going to sleep and waking up at debug
 * level 4; used by the event and worker code below.
 */
#define	MSLEEP(ident, mtx, priority, wmesg, timeout)	do {		\
	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));	\
	msleep((ident), (mtx), (priority), (wmesg), (timeout));		\
	G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, (ident));	\
} while (0)
116 
/* Event handler tag; NULL while no handler is registered. */
static eventhandler_tag g_raid3_ehtag = NULL;

static int g_raid3_destroy_geom(struct gctl_req *req, struct g_class *mp,
    struct g_geom *gp);
static g_taste_t g_raid3_taste;
static void g_raid3_init(struct g_class *mp);
static void g_raid3_fini(struct g_class *mp);

/* GEOM class descriptor hooking RAID3 into the GEOM framework. */
struct g_class g_raid3_class = {
	.name = G_RAID3_CLASS_NAME,
	.version = G_VERSION,
	.ctlreq = g_raid3_config,
	.taste = g_raid3_taste,
	.destroy_geom = g_raid3_destroy_geom,
	.init = g_raid3_init,
	.fini = g_raid3_fini
};


/* Forward declarations for helpers defined later in this file. */
static void g_raid3_destroy_provider(struct g_raid3_softc *sc);
static int g_raid3_update_disk(struct g_raid3_disk *disk, u_int state);
static void g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force);
static void g_raid3_dumpconf(struct sbuf *sb, const char *indent,
    struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
static void g_raid3_sync_stop(struct g_raid3_softc *sc, int type);
142 
143 
144 static const char *
145 g_raid3_disk_state2str(int state)
146 {
147 
148 	switch (state) {
149 	case G_RAID3_DISK_STATE_NODISK:
150 		return ("NODISK");
151 	case G_RAID3_DISK_STATE_NONE:
152 		return ("NONE");
153 	case G_RAID3_DISK_STATE_NEW:
154 		return ("NEW");
155 	case G_RAID3_DISK_STATE_ACTIVE:
156 		return ("ACTIVE");
157 	case G_RAID3_DISK_STATE_STALE:
158 		return ("STALE");
159 	case G_RAID3_DISK_STATE_SYNCHRONIZING:
160 		return ("SYNCHRONIZING");
161 	case G_RAID3_DISK_STATE_DISCONNECTED:
162 		return ("DISCONNECTED");
163 	default:
164 		return ("INVALID");
165 	}
166 }
167 
168 static const char *
169 g_raid3_device_state2str(int state)
170 {
171 
172 	switch (state) {
173 	case G_RAID3_DEVICE_STATE_STARTING:
174 		return ("STARTING");
175 	case G_RAID3_DEVICE_STATE_DEGRADED:
176 		return ("DEGRADED");
177 	case G_RAID3_DEVICE_STATE_COMPLETE:
178 		return ("COMPLETE");
179 	default:
180 		return ("INVALID");
181 	}
182 }
183 
184 const char *
185 g_raid3_get_diskname(struct g_raid3_disk *disk)
186 {
187 
188 	if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
189 		return ("[unknown]");
190 	return (disk->d_name);
191 }
192 
/*
 * XOR two equally sized buffers into 'dst', 128 bytes (sixteen 64-bit
 * words) per loop iteration; used for parity generation/verification.
 * All buffers must be 64-bit aligned and 'size' a multiple of 128 bytes
 * (enforced by the KASSERT).
 *
 * Note: every macro argument is parenthesized, including 'size' in the
 * cast, so expressions with low-precedence operators expand correctly.
 */
#define	g_raid3_xor(src1, src2, dst, size)				\
	_g_raid3_xor((uint64_t *)(src1), (uint64_t *)(src2),		\
	    (uint64_t *)(dst), (size_t)(size))
static void
_g_raid3_xor(uint64_t *src1, uint64_t *src2, uint64_t *dst, size_t size)
{

	KASSERT((size % 128) == 0, ("Invalid size: %zu.", size));
	/* Manually unrolled: 16 words == 128 bytes per iteration. */
	for (; size > 0; size -= 128) {
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
	}
}
220 
221 static int
222 g_raid3_is_zero(struct bio *bp)
223 {
224 	static const uint64_t zeros[] = {
225 	    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
226 	};
227 	u_char *addr;
228 	ssize_t size;
229 
230 	size = bp->bio_length;
231 	addr = (u_char *)bp->bio_data;
232 	for (; size > 0; size -= sizeof(zeros), addr += sizeof(zeros)) {
233 		if (bcmp(addr, zeros, sizeof(zeros)) != 0)
234 			return (0);
235 	}
236 	return (1);
237 }
238 
239 /*
240  * --- Events handling functions ---
241  * Events in geom_raid3 are used to maintain disks and device status
242  * from one thread to simplify locking.
243  */
/* Release an event structure allocated by g_raid3_event_send(). */
static void
g_raid3_event_free(struct g_raid3_event *ep)
{

	free(ep, M_RAID3);
}
250 
/*
 * Post an event for the worker thread.  With G_RAID3_EVENT_DEVICE set in
 * 'flags', 'arg' is the softc; otherwise it is a disk.  Unless
 * G_RAID3_EVENT_DONTWAIT is given, drop the topology lock and sleep until
 * the worker marks the event G_RAID3_EVENT_DONE, then return its error.
 */
int
g_raid3_event_send(void *arg, int state, int flags)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct g_raid3_event *ep;
	int error;

	ep = malloc(sizeof(*ep), M_RAID3, M_WAITOK);
	G_RAID3_DEBUG(4, "%s: Sending event %p.", __func__, ep);
	if ((flags & G_RAID3_EVENT_DEVICE) != 0) {
		disk = NULL;
		sc = arg;
	} else {
		disk = arg;
		sc = disk->d_softc;
	}
	ep->e_disk = disk;
	ep->e_state = state;
	ep->e_flags = flags;
	ep->e_error = 0;
	mtx_lock(&sc->sc_events_mtx);
	TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
	mtx_unlock(&sc->sc_events_mtx);
	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
	/* Nudge the worker thread so it notices the queued event. */
	mtx_lock(&sc->sc_queue_mtx);
	wakeup(sc);
	wakeup(&sc->sc_queue);
	mtx_unlock(&sc->sc_queue_mtx);
	if ((flags & G_RAID3_EVENT_DONTWAIT) != 0)
		return (0);
	g_topology_assert();
	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, ep);
	g_topology_unlock();
	while ((ep->e_flags & G_RAID3_EVENT_DONE) == 0) {
		/* PDROP: sc_events_mtx is released again while sleeping. */
		mtx_lock(&sc->sc_events_mtx);
		MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "r3:event",
		    hz * 5);
	}
	/* Don't even try to use 'sc' here, because it could be already dead. */
	g_topology_lock();
	error = ep->e_error;
	g_raid3_event_free(ep);
	return (error);
}
296 
/* Peek at the head of the event queue; NULL when no events are pending. */
static struct g_raid3_event *
g_raid3_event_get(struct g_raid3_softc *sc)
{
	struct g_raid3_event *ep;

	mtx_lock(&sc->sc_events_mtx);
	ep = TAILQ_FIRST(&sc->sc_events);
	mtx_unlock(&sc->sc_events_mtx);
	return (ep);
}
307 
/* Unlink an event from the queue (does not free it). */
static void
g_raid3_event_remove(struct g_raid3_softc *sc, struct g_raid3_event *ep)
{

	mtx_lock(&sc->sc_events_mtx);
	TAILQ_REMOVE(&sc->sc_events, ep, e_next);
	mtx_unlock(&sc->sc_events_mtx);
}
316 
/*
 * Cancel all pending disk events for 'disk'.  DONTWAIT events are freed
 * here; waited-on events get ECANCELED and their sender is woken so it
 * frees the event itself (see g_raid3_event_send()).
 */
static void
g_raid3_event_cancel(struct g_raid3_disk *disk)
{
	struct g_raid3_softc *sc;
	struct g_raid3_event *ep, *tmpep;

	g_topology_assert();

	sc = disk->d_softc;
	mtx_lock(&sc->sc_events_mtx);
	TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
		/* Device-wide events are not tied to this disk; keep them. */
		if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0)
			continue;
		if (ep->e_disk != disk)
			continue;
		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
		if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
			g_raid3_event_free(ep);
		else {
			/*
			 * NOTE(review): unlike g_raid3_destroy_device(),
			 * G_RAID3_EVENT_DONE is not set before wakeup();
			 * the waiter loops on that flag - confirm the
			 * sleeper can actually terminate on this path.
			 */
			ep->e_error = ECANCELED;
			wakeup(ep);
		}
	}
	mtx_unlock(&sc->sc_events_mtx);
}
342 
343 /*
344  * Return the number of disks in the given state.
345  * If state is equal to -1, count all connected disks.
346  */
347 u_int
348 g_raid3_ndisks(struct g_raid3_softc *sc, int state)
349 {
350 	struct g_raid3_disk *disk;
351 	u_int n, ndisks;
352 
353 	for (n = ndisks = 0; n < sc->sc_ndisks; n++) {
354 		disk = &sc->sc_disks[n];
355 		if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
356 			continue;
357 		if (state == -1 || disk->d_state == state)
358 			ndisks++;
359 	}
360 	return (ndisks);
361 }
362 
363 static u_int
364 g_raid3_nrequests(struct g_raid3_softc *sc, struct g_consumer *cp)
365 {
366 	struct bio *bp;
367 	u_int nreqs = 0;
368 
369 	mtx_lock(&sc->sc_queue_mtx);
370 	TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
371 		if (bp->bio_from == cp)
372 			nreqs++;
373 	}
374 	mtx_unlock(&sc->sc_queue_mtx);
375 	return (nreqs);
376 }
377 
/*
 * Return non-zero when the consumer still has I/O in flight (cp->index)
 * or requests queued for it, i.e. it cannot be destroyed yet.
 */
static int
g_raid3_is_busy(struct g_raid3_softc *sc, struct g_consumer *cp)
{

	if (cp->index > 0) {
		G_RAID3_DEBUG(2,
		    "I/O requests for %s exist, can't destroy it now.",
		    cp->provider->name);
		return (1);
	}
	if (g_raid3_nrequests(sc, cp) > 0) {
		G_RAID3_DEBUG(2,
		    "I/O requests for %s in queue, can't destroy it now.",
		    cp->provider->name);
		return (1);
	}
	return (0);
}
396 
/*
 * Deferred-event callback: detach and destroy the consumer passed as
 * 'arg' (posted from g_raid3_kill_consumer()).
 */
static void
g_raid3_destroy_consumer(void *arg, int flags __unused)
{
	struct g_consumer *cp;

	cp = arg;
	G_RAID3_DEBUG(1, "Consumer %s destroyed.", cp->provider->name);
	g_detach(cp);
	g_destroy_consumer(cp);
}
407 
/*
 * Close our access to the consumer's provider and detach/destroy the
 * consumer, unless it is still busy (then we simply bail; cleanup will
 * be retried later).  When dropping the last write reference triggers a
 * retaste of the provider, destruction is deferred to an event so that
 * it happens after the retaste event was posted.
 */
static void
g_raid3_kill_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
{
	struct g_provider *pp;
	int retaste_wait;

	g_topology_assert();

	cp->private = NULL;
	if (g_raid3_is_busy(sc, cp))
		return;
	G_RAID3_DEBUG(2, "Consumer %s destroyed.", cp->provider->name);
	pp = cp->provider;
	retaste_wait = 0;
	if (cp->acw == 1) {
		/* Closing the last writer retastes, unless geom is withering. */
		if ((pp->geom->flags & G_GEOM_WITHER) == 0)
			retaste_wait = 1;
	}
	G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr,
	    -cp->acw, -cp->ace, 0);
	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
	if (retaste_wait) {
		/*
		 * After retaste event was send (inside g_access()), we can send
		 * event to detach and destroy consumer.
		 * A class, which has consumer to the given provider connected
		 * will not receive retaste event for the provider.
		 * This is the way how I ignore retaste events when I close
		 * consumers opened for write: I detach and destroy consumer
		 * after retaste event is sent.
		 */
		g_post_event(g_raid3_destroy_consumer, cp, M_WAITOK, NULL);
		return;
	}
	G_RAID3_DEBUG(1, "Consumer %s destroyed.", pp->name);
	g_detach(cp);
	g_destroy_consumer(cp);
}
447 
/*
 * Create a consumer for 'disk', attach it to provider 'pp' and open it
 * r1w1e1.  On failure the consumer is left in place for the caller to
 * clean up (g_raid3_init_disk() calls g_raid3_disconnect_consumer()).
 */
static int
g_raid3_connect_disk(struct g_raid3_disk *disk, struct g_provider *pp)
{
	int error;

	g_topology_assert();
	KASSERT(disk->d_consumer == NULL,
	    ("Disk already connected (device %s).", disk->d_softc->sc_name));

	disk->d_consumer = g_new_consumer(disk->d_softc->sc_geom);
	disk->d_consumer->private = disk;
	disk->d_consumer->index = 0;	/* No I/O in flight yet. */
	error = g_attach(disk->d_consumer, pp);
	if (error != 0)
		return (error);
	error = g_access(disk->d_consumer, 1, 1, 1);
	if (error != 0) {
		G_RAID3_DEBUG(0, "Cannot open consumer %s (error=%d).",
		    pp->name, error);
		return (error);
	}
	G_RAID3_DEBUG(2, "Disk %s connected.", g_raid3_get_diskname(disk));
	return (0);
}
472 
473 static void
474 g_raid3_disconnect_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
475 {
476 
477 	g_topology_assert();
478 
479 	if (cp == NULL)
480 		return;
481 	if (cp->provider != NULL)
482 		g_raid3_kill_consumer(sc, cp);
483 	else
484 		g_destroy_consumer(cp);
485 }
486 
487 /*
488  * Initialize disk. This means allocate memory, create consumer, attach it
489  * to the provider and open access (r1w1e1) to it.
490  */
491 static struct g_raid3_disk *
492 g_raid3_init_disk(struct g_raid3_softc *sc, struct g_provider *pp,
493     struct g_raid3_metadata *md, int *errorp)
494 {
495 	struct g_raid3_disk *disk;
496 	int error;
497 
498 	disk = &sc->sc_disks[md->md_no];
499 	error = g_raid3_connect_disk(disk, pp);
500 	if (error != 0)
501 		goto fail;
502 	disk->d_state = G_RAID3_DISK_STATE_NONE;
503 	disk->d_flags = md->md_dflags;
504 	if (md->md_provider[0] != '\0')
505 		disk->d_flags |= G_RAID3_DISK_FLAG_HARDCODED;
506 	disk->d_sync.ds_consumer = NULL;
507 	disk->d_sync.ds_offset = md->md_sync_offset;
508 	disk->d_sync.ds_offset_done = md->md_sync_offset;
509 	disk->d_sync.ds_resync = -1;
510 	disk->d_genid = md->md_genid;
511 	disk->d_sync.ds_syncid = md->md_syncid;
512 	if (errorp != NULL)
513 		*errorp = 0;
514 	return (disk);
515 fail:
516 	if (errorp != NULL)
517 		*errorp = error;
518 	if (disk != NULL)
519 		g_raid3_disconnect_consumer(sc, disk->d_consumer);
520 	return (NULL);
521 }
522 
/*
 * Detach a disk from the array: cancel its pending events, stop any
 * synchronization using it and release its consumer, finally marking the
 * slot as NODISK.  A slot already in NODISK state is left untouched.
 */
static void
g_raid3_destroy_disk(struct g_raid3_disk *disk)
{
	struct g_raid3_softc *sc;

	g_topology_assert();

	if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
		return;
	g_raid3_event_cancel(disk);
	sc = disk->d_softc;
	switch (disk->d_state) {
	case G_RAID3_DISK_STATE_SYNCHRONIZING:
		if (sc->sc_syncdisk != NULL)
			g_raid3_sync_stop(sc, 1);
		/* FALLTHROUGH */
	case G_RAID3_DISK_STATE_NEW:
	case G_RAID3_DISK_STATE_STALE:
	case G_RAID3_DISK_STATE_ACTIVE:
		g_raid3_disconnect_consumer(sc, disk->d_consumer);
		disk->d_consumer = NULL;
		break;
	default:
		KASSERT(0 == 1, ("Wrong disk state (%s, %s).",
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
	}
	disk->d_state = G_RAID3_DISK_STATE_NODISK;
}
552 
/*
 * Tear down the whole device: destroy the provider, mark every disk
 * clean and detach it, drain/cancel remaining events, dismantle the
 * synchronization geom, free the UMA zones and mutexes, and finally
 * wither the device geom itself.
 */
static void
g_raid3_destroy_device(struct g_raid3_softc *sc)
{
	struct g_raid3_event *ep;
	struct g_raid3_disk *disk;
	struct g_geom *gp;
	struct g_consumer *cp;
	u_int n;

	g_topology_assert();

	gp = sc->sc_geom;
	if (sc->sc_provider != NULL)
		g_raid3_destroy_provider(sc);
	for (n = 0; n < sc->sc_ndisks; n++) {
		disk = &sc->sc_disks[n];
		if (disk->d_state != G_RAID3_DISK_STATE_NODISK) {
			/* Shutting down cleanly: clear the dirty flag on disk. */
			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
			g_raid3_update_metadata(disk);
			g_raid3_destroy_disk(disk);
		}
	}
	/* Cancel everything still queued; wake any senders waiting on events. */
	while ((ep = g_raid3_event_get(sc)) != NULL) {
		g_raid3_event_remove(sc, ep);
		if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
			g_raid3_event_free(ep);
		else {
			ep->e_error = ECANCELED;
			ep->e_flags |= G_RAID3_EVENT_DONE;
			G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, ep);
			mtx_lock(&sc->sc_events_mtx);
			wakeup(ep);
			mtx_unlock(&sc->sc_events_mtx);
		}
	}
	callout_drain(&sc->sc_callout);
	gp->softc = NULL;
	cp = LIST_FIRST(&sc->sc_sync.ds_geom->consumer);
	if (cp != NULL)
		g_raid3_disconnect_consumer(sc, cp);
	sc->sc_sync.ds_geom->softc = NULL;
	g_wither_geom(sc->sc_sync.ds_geom, ENXIO);
	uma_zdestroy(sc->sc_zone_64k);
	uma_zdestroy(sc->sc_zone_16k);
	uma_zdestroy(sc->sc_zone_4k);
	mtx_destroy(&sc->sc_queue_mtx);
	mtx_destroy(&sc->sc_events_mtx);
	G_RAID3_DEBUG(0, "Device %s destroyed.", gp->name);
	g_wither_geom(gp, ENXIO);
}
603 
/*
 * GEOM orphan method: the underlying provider went away.  Request a
 * syncid bump and schedule disconnection of the affected disk via the
 * worker thread.
 */
static void
g_raid3_orphan(struct g_consumer *cp)
{
	struct g_raid3_disk *disk;

	g_topology_assert();

	/* cp->private is cleared in g_raid3_kill_consumer(); nothing to do then. */
	disk = cp->private;
	if (disk == NULL)
		return;
	disk->d_softc->sc_bump_id = G_RAID3_BUMP_SYNCID;
	g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
	    G_RAID3_EVENT_DONTWAIT);
}
618 
/*
 * Write metadata (or an all-zero sector when md == NULL) to the last
 * sector of the disk's provider.  On write failure, request a genid bump
 * and schedule disconnection of the disk.  Returns the g_write_data()
 * error.
 */
static int
g_raid3_write_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
{
	struct g_raid3_softc *sc;
	struct g_consumer *cp;
	off_t offset, length;
	u_char *sector;
	int error = 0;

	g_topology_assert();

	sc = disk->d_softc;
	cp = disk->d_consumer;
	KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name));
	KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name));
	KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1,
	    ("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr,
	    cp->acw, cp->ace));
	/* Metadata lives in the very last sector of the component. */
	length = cp->provider->sectorsize;
	offset = cp->provider->mediasize - length;
	sector = malloc((size_t)length, M_RAID3, M_WAITOK | M_ZERO);
	if (md != NULL)
		raid3_metadata_encode(md, sector);
	/* The topology lock may not be held across actual I/O. */
	g_topology_unlock();
	error = g_write_data(cp, offset, sector, length);
	g_topology_lock();
	free(sector, M_RAID3);
	if (error != 0) {
		disk->d_softc->sc_bump_id = G_RAID3_BUMP_GENID;
		g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
		    G_RAID3_EVENT_DONTWAIT);
	}
	return (error);
}
653 
654 int
655 g_raid3_clear_metadata(struct g_raid3_disk *disk)
656 {
657 	int error;
658 
659 	g_topology_assert();
660 	error = g_raid3_write_metadata(disk, NULL);
661 	if (error == 0) {
662 		G_RAID3_DEBUG(2, "Metadata on %s cleared.",
663 		    g_raid3_get_diskname(disk));
664 	} else {
665 		G_RAID3_DEBUG(0,
666 		    "Cannot clear metadata on disk %s (error=%d).",
667 		    g_raid3_get_diskname(disk), error);
668 	}
669 	return (error);
670 }
671 
672 void
673 g_raid3_fill_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
674 {
675 	struct g_raid3_softc *sc;
676 
677 	sc = disk->d_softc;
678 	strlcpy(md->md_magic, G_RAID3_MAGIC, sizeof(md->md_magic));
679 	md->md_version = G_RAID3_VERSION;
680 	strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name));
681 	md->md_id = sc->sc_id;
682 	md->md_all = sc->sc_ndisks;
683 	md->md_genid = sc->sc_genid;
684 	md->md_mediasize = sc->sc_mediasize;
685 	md->md_sectorsize = sc->sc_sectorsize;
686 	md->md_mflags = (sc->sc_flags & G_RAID3_DEVICE_FLAG_MASK);
687 	md->md_no = disk->d_no;
688 	md->md_syncid = disk->d_sync.ds_syncid;
689 	md->md_dflags = (disk->d_flags & G_RAID3_DISK_FLAG_MASK);
690 	if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING)
691 		md->md_sync_offset = disk->d_sync.ds_offset_done;
692 	else
693 		md->md_sync_offset = 0;
694 	if ((disk->d_flags & G_RAID3_DISK_FLAG_HARDCODED) != 0 &&
695 	    disk->d_consumer != NULL && disk->d_consumer->provider != NULL) {
696 		strlcpy(md->md_provider, disk->d_consumer->provider->name,
697 		    sizeof(md->md_provider));
698 	} else {
699 		bzero(md->md_provider, sizeof(md->md_provider));
700 	}
701 }
702 
703 void
704 g_raid3_update_metadata(struct g_raid3_disk *disk)
705 {
706 	struct g_raid3_metadata md;
707 	int error;
708 
709 	g_topology_assert();
710 	g_raid3_fill_metadata(disk, &md);
711 	error = g_raid3_write_metadata(disk, &md);
712 	if (error == 0) {
713 		G_RAID3_DEBUG(2, "Metadata on %s updated.",
714 		    g_raid3_get_diskname(disk));
715 	} else {
716 		G_RAID3_DEBUG(0,
717 		    "Cannot update metadata on disk %s (error=%d).",
718 		    g_raid3_get_diskname(disk), error);
719 	}
720 }
721 
722 static void
723 g_raid3_bump_syncid(struct g_raid3_softc *sc)
724 {
725 	struct g_raid3_disk *disk;
726 	u_int n;
727 
728 	g_topology_assert();
729 	KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
730 	    ("%s called with no active disks (device=%s).", __func__,
731 	    sc->sc_name));
732 
733 	sc->sc_syncid++;
734 	G_RAID3_DEBUG(1, "Device %s: syncid bumped to %u.", sc->sc_name,
735 	    sc->sc_syncid);
736 	for (n = 0; n < sc->sc_ndisks; n++) {
737 		disk = &sc->sc_disks[n];
738 		if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
739 		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
740 			disk->d_sync.ds_syncid = sc->sc_syncid;
741 			g_raid3_update_metadata(disk);
742 		}
743 	}
744 }
745 
746 static void
747 g_raid3_bump_genid(struct g_raid3_softc *sc)
748 {
749 	struct g_raid3_disk *disk;
750 	u_int n;
751 
752 	g_topology_assert();
753 	KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
754 	    ("%s called with no active disks (device=%s).", __func__,
755 	    sc->sc_name));
756 
757 	sc->sc_genid++;
758 	G_RAID3_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name,
759 	    sc->sc_genid);
760 	for (n = 0; n < sc->sc_ndisks; n++) {
761 		disk = &sc->sc_disks[n];
762 		if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
763 		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
764 			disk->d_genid = sc->sc_genid;
765 			g_raid3_update_metadata(disk);
766 		}
767 	}
768 }
769 
770 static void
771 g_raid3_idle(struct g_raid3_softc *sc)
772 {
773 	struct g_raid3_disk *disk;
774 	u_int i;
775 
776 	if (sc->sc_provider == NULL || sc->sc_provider->acw == 0)
777 		return;
778 	sc->sc_idle = 1;
779 	g_topology_lock();
780 	for (i = 0; i < sc->sc_ndisks; i++) {
781 		disk = &sc->sc_disks[i];
782 		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
783 			continue;
784 		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
785 		    g_raid3_get_diskname(disk), sc->sc_name);
786 		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
787 		g_raid3_update_metadata(disk);
788 	}
789 	g_topology_unlock();
790 }
791 
792 static void
793 g_raid3_unidle(struct g_raid3_softc *sc)
794 {
795 	struct g_raid3_disk *disk;
796 	u_int i;
797 
798 	sc->sc_idle = 0;
799 	g_topology_lock();
800 	for (i = 0; i < sc->sc_ndisks; i++) {
801 		disk = &sc->sc_disks[i];
802 		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
803 			continue;
804 		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
805 		    g_raid3_get_diskname(disk), sc->sc_name);
806 		disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
807 		g_raid3_update_metadata(disk);
808 	}
809 	g_topology_unlock();
810 }
811 
812 /*
813  * Return 1 if we should check if RAID3 device is idling.
814  */
815 static int
816 g_raid3_check_idle(struct g_raid3_softc *sc)
817 {
818 	struct g_raid3_disk *disk;
819 	u_int i;
820 
821 	if (sc->sc_idle)
822 		return (0);
823 	if (sc->sc_provider != NULL && sc->sc_provider->acw == 0)
824 		return (0);
825 	/*
826 	 * Check if there are no in-flight requests.
827 	 */
828 	for (i = 0; i < sc->sc_ndisks; i++) {
829 		disk = &sc->sc_disks[i];
830 		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
831 			continue;
832 		if (disk->d_consumer->index > 0)
833 			return (0);
834 	}
835 	return (1);
836 }
837 
838 /*
839  * Treat bio_driver1 field in parent bio as list head and field bio_caller1
840  * in child bio as pointer to the next element on the list.
841  */
/* First child bio on the parent's list (or NULL when the list is empty). */
#define	G_RAID3_HEAD_BIO(pbp)	(pbp)->bio_driver1

/* Successor of 'cbp' on the list (or NULL at the tail). */
#define	G_RAID3_NEXT_BIO(cbp)	(cbp)->bio_caller1

/* Iterate over all child bios of 'pbp'. */
#define	G_RAID3_FOREACH_BIO(pbp, bp)					\
	for ((bp) = G_RAID3_HEAD_BIO(pbp); (bp) != NULL;		\
	    (bp) = G_RAID3_NEXT_BIO(bp))

/* Removal-safe variant: 'tmpbp' caches the next element up front. */
#define	G_RAID3_FOREACH_SAFE_BIO(pbp, bp, tmpbp)			\
	for ((bp) = G_RAID3_HEAD_BIO(pbp);				\
	    (bp) != NULL && ((tmpbp) = G_RAID3_NEXT_BIO(bp), 1);	\
	    (bp) = (tmpbp))
854 
/* Start a parent bio with an empty child list. */
static void
g_raid3_init_bio(struct bio *pbp)
{

	G_RAID3_HEAD_BIO(pbp) = NULL;
}
861 
/* Unlink a child bio from its parent's list (does not free it). */
static void
g_raid3_remove_bio(struct bio *cbp)
{
	struct bio *pbp, *bp;

	pbp = cbp->bio_parent;
	if (G_RAID3_HEAD_BIO(pbp) == cbp)
		G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
	else {
		/* Find the predecessor and splice 'cbp' out. */
		G_RAID3_FOREACH_BIO(pbp, bp) {
			if (G_RAID3_NEXT_BIO(bp) == cbp) {
				G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
				break;
			}
		}
	}
	G_RAID3_NEXT_BIO(cbp) = NULL;
}
880 
/*
 * Replace child bio 'dbp' by 'sbp' at the same position in the parent's
 * list; 'sbp' is first removed from its own spot.
 */
static void
g_raid3_replace_bio(struct bio *sbp, struct bio *dbp)
{
	struct bio *pbp, *bp;

	g_raid3_remove_bio(sbp);
	pbp = dbp->bio_parent;
	G_RAID3_NEXT_BIO(sbp) = G_RAID3_NEXT_BIO(dbp);
	if (G_RAID3_HEAD_BIO(pbp) == dbp)
		G_RAID3_HEAD_BIO(pbp) = sbp;
	else {
		/* Point dbp's predecessor at sbp instead. */
		G_RAID3_FOREACH_BIO(pbp, bp) {
			if (G_RAID3_NEXT_BIO(bp) == dbp) {
				G_RAID3_NEXT_BIO(bp) = sbp;
				break;
			}
		}
	}
	G_RAID3_NEXT_BIO(dbp) = NULL;
}
901 
/*
 * Free a child bio: return its data buffer to the UMA zone it came from
 * (zone selection mirrors g_raid3_clone_bio()), unlink it from the
 * parent's list and destroy it.
 */
static void
g_raid3_destroy_bio(struct g_raid3_softc *sc, struct bio *cbp)
{
	struct bio *bp, *pbp;
	size_t size;

	pbp = cbp->bio_parent;
	pbp->bio_children--;
	KASSERT(cbp->bio_data != NULL, ("NULL bio_data"));
	/* Per-disk payload size determines the zone, as in g_raid3_clone_bio(). */
	size = pbp->bio_length / (sc->sc_ndisks - 1);
	if (size > 16384)
		uma_zfree(sc->sc_zone_64k, cbp->bio_data);
	else if (size > 4096)
		uma_zfree(sc->sc_zone_16k, cbp->bio_data);
	else
		uma_zfree(sc->sc_zone_4k, cbp->bio_data);
	if (G_RAID3_HEAD_BIO(pbp) == cbp) {
		G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
		G_RAID3_NEXT_BIO(cbp) = NULL;
		g_destroy_bio(cbp);
	} else {
		G_RAID3_FOREACH_BIO(pbp, bp) {
			if (G_RAID3_NEXT_BIO(bp) == cbp)
				break;
		}
		if (bp != NULL) {
			KASSERT(G_RAID3_NEXT_BIO(bp) != NULL,
			    ("NULL bp->bio_driver1"));
			G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
			G_RAID3_NEXT_BIO(cbp) = NULL;
		}
		g_destroy_bio(cbp);
	}
}
936 
937 static struct bio *
938 g_raid3_clone_bio(struct g_raid3_softc *sc, struct bio *pbp)
939 {
940 	struct bio *bp, *cbp;
941 	size_t size;
942 
943 	cbp = g_clone_bio(pbp);
944 	if (cbp == NULL)
945 		return (NULL);
946 	size = pbp->bio_length / (sc->sc_ndisks - 1);
947 	if (size > 16384) {
948 		cbp->bio_data = uma_zalloc(sc->sc_zone_64k, M_NOWAIT);
949 		g_raid3_64k_requested++;
950 	} else if (size > 4096) {
951 		cbp->bio_data = uma_zalloc(sc->sc_zone_16k, M_NOWAIT);
952 		g_raid3_16k_requested++;
953 	} else {
954 		cbp->bio_data = uma_zalloc(sc->sc_zone_4k, M_NOWAIT);
955 		g_raid3_4k_requested++;
956 	}
957 	if (cbp->bio_data == NULL) {
958 		if (size > 16384)
959 			g_raid3_64k_failed++;
960 		if (size > 4096)
961 			g_raid3_16k_failed++;
962 		else
963 			g_raid3_4k_failed++;
964 		pbp->bio_children--;
965 		g_destroy_bio(cbp);
966 		return (NULL);
967 	}
968 	G_RAID3_NEXT_BIO(cbp) = NULL;
969 	if (G_RAID3_HEAD_BIO(pbp) == NULL)
970 		G_RAID3_HEAD_BIO(pbp) = cbp;
971 	else {
972 		G_RAID3_FOREACH_BIO(pbp, bp) {
973 			if (G_RAID3_NEXT_BIO(bp) == NULL) {
974 				G_RAID3_NEXT_BIO(bp) = cbp;
975 				break;
976 			}
977 		}
978 	}
979 	return (cbp);
980 }
981 
/*
 * Split a parent write request across the component disks: copy the
 * parent's data interleaved into the child bios, compute the parity
 * component (unless NOPARITY is set) and dispatch all child requests.
 */
static void
g_raid3_scatter(struct bio *pbp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct bio *bp, *cbp;
	off_t atom, cadd, padd, left;

	sc = pbp->bio_to->geom->softc;
	bp = NULL;
	if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
		/*
		 * Find bio for which we should calculate data.
		 */
		G_RAID3_FOREACH_BIO(pbp, cbp) {
			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
				bp = cbp;
				break;
			}
		}
		KASSERT(bp != NULL, ("NULL parity bio."));
	}
	/*
	 * Interleave: each parent sector is split into 'atom'-sized pieces,
	 * one piece per data disk.  'padd' walks the parent buffer, 'cadd'
	 * the per-child buffers.
	 */
	atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
	cadd = padd = 0;
	for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
		G_RAID3_FOREACH_BIO(pbp, cbp) {
			if (cbp == bp)
				continue;
			bcopy(pbp->bio_data + padd, cbp->bio_data + cadd, atom);
			padd += atom;
		}
		cadd += atom;
	}
	if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
		struct bio *tmpbp;

		/*
		 * Calculate parity.
		 */
		bzero(bp->bio_data, bp->bio_length);
		/* SAFE variant: NODISK children are destroyed while iterating. */
		G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
			if (cbp == bp)
				continue;
			g_raid3_xor(cbp->bio_data, bp->bio_data, bp->bio_data,
			    bp->bio_length);
			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_NODISK) != 0)
				g_raid3_destroy_bio(sc, cbp);
		}
	}
	/* Dispatch the remaining child requests to their consumers. */
	G_RAID3_FOREACH_BIO(pbp, cbp) {
		struct g_consumer *cp;

		disk = cbp->bio_caller2;
		cp = disk->d_consumer;
		cbp->bio_to = cp->provider;
		G_RAID3_LOGREQ(3, cbp, "Sending request.");
		KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1,
		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
		    cp->acr, cp->acw, cp->ace));
		/* cp->index tracks in-flight requests (see g_raid3_is_busy()). */
		cp->index++;
		g_io_request(cbp, cp);
	}
}
1045 
1046 static void
1047 g_raid3_gather(struct bio *pbp)
1048 {
1049 	struct g_raid3_softc *sc;
1050 	struct g_raid3_disk *disk;
1051 	struct bio *xbp, *fbp, *cbp;
1052 	off_t atom, cadd, padd, left;
1053 
1054 	sc = pbp->bio_to->geom->softc;
1055 	/*
1056 	 * Find bio for which we have to calculate data.
1057 	 * While going through this path, check if all requests
1058 	 * succeeded, if not, deny whole request.
1059 	 * If we're in COMPLETE mode, we allow one request to fail,
1060 	 * so if we find one, we're sending it to the parity consumer.
1061 	 * If there are more failed requests, we deny whole request.
1062 	 */
1063 	xbp = fbp = NULL;
1064 	G_RAID3_FOREACH_BIO(pbp, cbp) {
1065 		if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
1066 			KASSERT(xbp == NULL, ("More than one parity bio."));
1067 			xbp = cbp;
1068 		}
1069 		if (cbp->bio_error == 0)
1070 			continue;
1071 		/*
1072 		 * Found failed request.
1073 		 */
1074 		G_RAID3_LOGREQ(0, cbp, "Request failed.");
1075 		disk = cbp->bio_caller2;
1076 		if (disk != NULL) {
1077 			/*
1078 			 * Actually this is pointless to bump genid,
1079 			 * because whole device is fucked up.
1080 			 */
1081 			sc->sc_bump_id |= G_RAID3_BUMP_GENID;
1082 			g_raid3_event_send(disk,
1083 			    G_RAID3_DISK_STATE_DISCONNECTED,
1084 			    G_RAID3_EVENT_DONTWAIT);
1085 		}
1086 		if (fbp == NULL) {
1087 			if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_DEGRADED) != 0) {
1088 				/*
1089 				 * We are already in degraded mode, so we can't
1090 				 * accept any failures.
1091 				 */
1092 				if (pbp->bio_error == 0)
1093 					pbp->bio_error = fbp->bio_error;
1094 			} else {
1095 				fbp = cbp;
1096 			}
1097 		} else {
1098 			/*
1099 			 * Next failed request, that's too many.
1100 			 */
1101 			if (pbp->bio_error == 0)
1102 				pbp->bio_error = fbp->bio_error;
1103 		}
1104 	}
1105 	if (pbp->bio_error != 0)
1106 		goto finish;
1107 	if (fbp != NULL && (pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
1108 		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_VERIFY;
1109 		if (xbp != fbp)
1110 			g_raid3_replace_bio(xbp, fbp);
1111 		g_raid3_destroy_bio(sc, fbp);
1112 	} else if (fbp != NULL) {
1113 		struct g_consumer *cp;
1114 
1115 		/*
1116 		 * One request failed, so send the same request to
1117 		 * the parity consumer.
1118 		 */
1119 		disk = pbp->bio_driver2;
1120 		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
1121 			pbp->bio_error = fbp->bio_error;
1122 			goto finish;
1123 		}
1124 		pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
1125 		pbp->bio_inbed--;
1126 		fbp->bio_flags &= ~(BIO_DONE | BIO_ERROR);
1127 		if (disk->d_no == sc->sc_ndisks - 1)
1128 			fbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
1129 		fbp->bio_error = 0;
1130 		fbp->bio_completed = 0;
1131 		fbp->bio_children = 0;
1132 		fbp->bio_inbed = 0;
1133 		cp = disk->d_consumer;
1134 		fbp->bio_caller2 = disk;
1135 		fbp->bio_to = cp->provider;
1136 		G_RAID3_LOGREQ(3, fbp, "Sending request (recover).");
1137 		KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1,
1138 		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
1139 		    cp->acr, cp->acw, cp->ace));
1140 		cp->index++;
1141 		g_io_request(fbp, cp);
1142 		return;
1143 	}
1144 	if (xbp != NULL) {
1145 		/*
1146 		 * Calculate parity.
1147 		 */
1148 		G_RAID3_FOREACH_BIO(pbp, cbp) {
1149 			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0)
1150 				continue;
1151 			g_raid3_xor(cbp->bio_data, xbp->bio_data, xbp->bio_data,
1152 			    xbp->bio_length);
1153 		}
1154 		xbp->bio_cflags &= ~G_RAID3_BIO_CFLAG_PARITY;
1155 		if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
1156 			if (!g_raid3_is_zero(xbp)) {
1157 				g_raid3_parity_mismatch++;
1158 				pbp->bio_error = EIO;
1159 				goto finish;
1160 			}
1161 			g_raid3_destroy_bio(sc, xbp);
1162 		}
1163 	}
1164 	atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
1165 	cadd = padd = 0;
1166 	for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
1167 		G_RAID3_FOREACH_BIO(pbp, cbp) {
1168 			bcopy(cbp->bio_data + cadd, pbp->bio_data + padd, atom);
1169 			pbp->bio_completed += atom;
1170 			padd += atom;
1171 		}
1172 		cadd += atom;
1173 	}
1174 finish:
1175 	if (pbp->bio_error == 0)
1176 		G_RAID3_LOGREQ(3, pbp, "Request finished.");
1177 	else {
1178 		if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0)
1179 			G_RAID3_LOGREQ(1, pbp, "Verification error.");
1180 		else
1181 			G_RAID3_LOGREQ(0, pbp, "Request failed.");
1182 	}
1183 	pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_MASK;
1184 	g_io_deliver(pbp, pbp->bio_error);
1185 	while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
1186 		g_raid3_destroy_bio(sc, cbp);
1187 }
1188 
1189 static void
1190 g_raid3_done(struct bio *bp)
1191 {
1192 	struct g_raid3_softc *sc;
1193 
1194 	sc = bp->bio_from->geom->softc;
1195 	bp->bio_cflags |= G_RAID3_BIO_CFLAG_REGULAR;
1196 	G_RAID3_LOGREQ(3, bp, "Regular request done (error=%d).", bp->bio_error);
1197 	mtx_lock(&sc->sc_queue_mtx);
1198 	bioq_insert_head(&sc->sc_queue, bp);
1199 	wakeup(sc);
1200 	wakeup(&sc->sc_queue);
1201 	mtx_unlock(&sc->sc_queue_mtx);
1202 }
1203 
/*
 * Process one finished regular child request (called from the worker).
 * When the last child of the parent bio arrives, finish the parent:
 * READs go through g_raid3_gather() for reassembly/recovery,
 * WRITE/DELETEs are delivered after disconnecting failed components.
 */
static void
g_raid3_regular_request(struct bio *cbp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct bio *pbp;

	g_topology_assert_not();

	cbp->bio_from->index--;
	pbp = cbp->bio_parent;
	sc = pbp->bio_to->geom->softc;
	disk = cbp->bio_from->private;
	if (disk == NULL) {
		/* Consumer lost its disk; tear the consumer down. */
		g_topology_lock();
		g_raid3_kill_consumer(sc, cbp->bio_from);
		g_topology_unlock();
	}

	G_RAID3_LOGREQ(3, cbp, "Request finished.");
	pbp->bio_inbed++;
	KASSERT(pbp->bio_inbed <= pbp->bio_children,
	    ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed,
	    pbp->bio_children));
	/* Wait until every child request has come back. */
	if (pbp->bio_inbed != pbp->bio_children)
		return;
	switch (pbp->bio_cmd) {
	case BIO_READ:
		g_raid3_gather(pbp);
		break;
	case BIO_WRITE:
	case BIO_DELETE:
	    {
		int error = 0;

		pbp->bio_completed = pbp->bio_length;
		while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) {
			if (cbp->bio_error != 0) {
				disk = cbp->bio_caller2;
				if (disk != NULL) {
					sc->sc_bump_id |= G_RAID3_BUMP_GENID;
					g_raid3_event_send(disk,
					    G_RAID3_DISK_STATE_DISCONNECTED,
					    G_RAID3_EVENT_DONTWAIT);
				}
				/*
				 * A single component failure is tolerated
				 * (the array just runs degraded); only a
				 * second failure fails the parent request.
				 */
				if (error == 0)
					error = cbp->bio_error;
				else if (pbp->bio_error == 0) {
					/*
					 * Next failed request, that's too many.
					 */
					pbp->bio_error = error;
				}
			}
			g_raid3_destroy_bio(sc, cbp);
		}
		if (pbp->bio_error == 0)
			G_RAID3_LOGREQ(3, pbp, "Request finished.");
		else
			G_RAID3_LOGREQ(0, pbp, "Request failed.");
		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_DEGRADED;
		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_NOPARITY;
		g_io_deliver(pbp, pbp->bio_error);
		break;
	    }
	}
}
1271 
1272 static void
1273 g_raid3_sync_done(struct bio *bp)
1274 {
1275 	struct g_raid3_softc *sc;
1276 
1277 	G_RAID3_LOGREQ(3, bp, "Synchronization request delivered.");
1278 	sc = bp->bio_from->geom->softc;
1279 	bp->bio_cflags |= G_RAID3_BIO_CFLAG_SYNC;
1280 	mtx_lock(&sc->sc_queue_mtx);
1281 	bioq_insert_head(&sc->sc_queue, bp);
1282 	wakeup(sc);
1283 	wakeup(&sc->sc_queue);
1284 	mtx_unlock(&sc->sc_queue_mtx);
1285 }
1286 
1287 static void
1288 g_raid3_start(struct bio *bp)
1289 {
1290 	struct g_raid3_softc *sc;
1291 
1292 	sc = bp->bio_to->geom->softc;
1293 	/*
1294 	 * If sc == NULL or there are no valid disks, provider's error
1295 	 * should be set and g_raid3_start() should not be called at all.
1296 	 */
1297 	KASSERT(sc != NULL && (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
1298 	    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE),
1299 	    ("Provider's error should be set (error=%d)(device=%s).",
1300 	    bp->bio_to->error, bp->bio_to->name));
1301 	G_RAID3_LOGREQ(3, bp, "Request received.");
1302 
1303 	switch (bp->bio_cmd) {
1304 	case BIO_READ:
1305 	case BIO_WRITE:
1306 	case BIO_DELETE:
1307 		break;
1308 	case BIO_GETATTR:
1309 	default:
1310 		g_io_deliver(bp, EOPNOTSUPP);
1311 		return;
1312 	}
1313 	mtx_lock(&sc->sc_queue_mtx);
1314 	bioq_insert_tail(&sc->sc_queue, bp);
1315 	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
1316 	wakeup(sc);
1317 	mtx_unlock(&sc->sc_queue_mtx);
1318 }
1319 
1320 /*
1321  * Send one synchronization request.
1322  */
1323 static void
1324 g_raid3_sync_one(struct g_raid3_softc *sc)
1325 {
1326 	struct g_raid3_disk *disk;
1327 	struct bio *bp;
1328 
1329 	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
1330 	    ("Wrong device state (%s, %s).", sc->sc_name,
1331 	    g_raid3_device_state2str(sc->sc_state)));
1332 	disk = sc->sc_syncdisk;
1333 	KASSERT(disk != NULL, ("No sync disk (%s).", sc->sc_name));
1334 	KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
1335 	    ("Disk %s is not marked for synchronization.",
1336 	    g_raid3_get_diskname(disk)));
1337 
1338 	bp = g_new_bio();
1339 	if (bp == NULL)
1340 		return;
1341 	bp->bio_parent = NULL;
1342 	bp->bio_cmd = BIO_READ;
1343 	bp->bio_offset = disk->d_sync.ds_offset * (sc->sc_ndisks - 1);
1344 	bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset);
1345 	bp->bio_cflags = 0;
1346 	bp->bio_done = g_raid3_sync_done;
1347 	bp->bio_data = disk->d_sync.ds_data;
1348 	if (bp->bio_data == NULL) {
1349 		g_destroy_bio(bp);
1350 		return;
1351 	}
1352 	bp->bio_cflags = G_RAID3_BIO_CFLAG_REGSYNC;
1353 	disk->d_sync.ds_offset += bp->bio_length / (sc->sc_ndisks - 1);
1354 	bp->bio_to = sc->sc_provider;
1355 	G_RAID3_LOGREQ(3, bp, "Sending synchronization request.");
1356 	disk->d_sync.ds_consumer->index++;
1357 	g_io_request(bp, disk->d_sync.ds_consumer);
1358 }
1359 
/*
 * Handle a completed synchronization bio (called from the worker).
 * A sync READ of the full provider is turned into a WRITE of the
 * reconstructed component data to the synchronizing disk; a completed
 * sync WRITE advances the synchronization progress and may activate
 * the disk when it becomes up-to-date.
 */
static void
g_raid3_sync_request(struct bio *bp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;

	bp->bio_from->index--;
	sc = bp->bio_from->geom->softc;
	disk = bp->bio_from->private;
	if (disk == NULL) {
		/* Consumer lost its disk; tear the consumer down. */
		g_topology_lock();
		g_raid3_kill_consumer(sc, bp->bio_from);
		g_topology_unlock();
		g_destroy_bio(bp);
		return;
	}

	/*
	 * Synchronization request.
	 */
	switch (bp->bio_cmd) {
	case BIO_READ:
	    {
		struct g_consumer *cp;
		u_char *dst, *src;
		off_t left;
		u_int atom;

		if (bp->bio_error != 0) {
			G_RAID3_LOGREQ(0, bp,
			    "Synchronization request failed (error=%d).",
			    bp->bio_error);
			g_destroy_bio(bp);
			return;
		}
		G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
		/* An "atom" is one component's share of a provider sector. */
		atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
		dst = src = bp->bio_data;
		if (disk->d_no == sc->sc_ndisks - 1) {
			u_int n;

			/* Parity component: XOR all data atoms in place. */
			for (left = bp->bio_length; left > 0;
			    left -= sc->sc_sectorsize) {
				bcopy(src, dst, atom);
				src += atom;
				for (n = 1; n < sc->sc_ndisks - 1; n++) {
					g_raid3_xor(src, dst, dst, atom);
					src += atom;
				}
				dst += atom;
			}
		} else {
			/*
			 * Regular component: compact this component's atoms
			 * (in-place, dst never overtakes src) out of the
			 * striped provider data.
			 */
			src += atom * disk->d_no;
			for (left = bp->bio_length; left > 0;
			    left -= sc->sc_sectorsize) {
				bcopy(src, dst, atom);
				src += sc->sc_sectorsize;
				dst += atom;
			}
		}
		/* Reuse the same bio as a WRITE to the component. */
		bp->bio_offset /= sc->sc_ndisks - 1;
		bp->bio_length /= sc->sc_ndisks - 1;
		bp->bio_cmd = BIO_WRITE;
		bp->bio_cflags = 0;
		bp->bio_children = bp->bio_inbed = 0;
		cp = disk->d_consumer;
		KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1,
		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
		    cp->acr, cp->acw, cp->ace));
		cp->index++;
		g_io_request(bp, cp);
		return;
	    }
	case BIO_WRITE:
	    {
		struct g_raid3_disk_sync *sync;

		if (bp->bio_error != 0) {
			G_RAID3_LOGREQ(0, bp,
			    "Synchronization request failed (error=%d).",
			    bp->bio_error);
			g_destroy_bio(bp);
			sc->sc_bump_id |= G_RAID3_BUMP_GENID;
			g_raid3_event_send(disk,
			    G_RAID3_DISK_STATE_DISCONNECTED,
			    G_RAID3_EVENT_DONTWAIT);
			return;
		}
		G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
		sync = &disk->d_sync;
		sync->ds_offset_done = bp->bio_offset + bp->bio_length;
		g_destroy_bio(bp);
		/* A resync was requested; don't update state yet. */
		if (sync->ds_resync != -1)
			return;
		if (sync->ds_offset_done ==
		    sc->sc_mediasize / (sc->sc_ndisks - 1)) {
			/*
			 * Disk up-to-date, activate it.
			 */
			g_raid3_event_send(disk, G_RAID3_DISK_STATE_ACTIVE,
			    G_RAID3_EVENT_DONTWAIT);
			return;
		} else if (sync->ds_offset_done % (MAXPHYS * 100) == 0) {
			/*
			 * Update offset_done on every 100 blocks.
			 * XXX: This should be configurable.
			 */
			g_topology_lock();
			g_raid3_update_metadata(disk);
			g_topology_unlock();
		}
		return;
	    }
	default:
		KASSERT(1 == 0, ("Invalid command here: %u (device=%s)",
		    bp->bio_cmd, sc->sc_name));
		break;
	}
}
1481 
/*
 * Build and dispatch the per-component child bios for a request coming
 * from the provider.  READs are sent immediately; WRITE/DELETEs go
 * through g_raid3_scatter().  Returns ENOMEM when cloning fails so the
 * caller can requeue the request, 0 otherwise.
 */
static int
g_raid3_register_request(struct bio *pbp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct g_consumer *cp;
	struct bio *cbp;
	off_t offset, length;
	u_int n, ndisks;
	int round_robin, verify;

	ndisks = 0;
	sc = pbp->bio_to->geom->softc;
	/* A regsync request is useless once synchronization has stopped. */
	if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGSYNC) != 0 &&
	    sc->sc_syncdisk == NULL) {
		g_io_deliver(pbp, EIO);
		return (0);
	}
	g_raid3_init_bio(pbp);
	/* Per-component length/offset: data is striped over ndisks-1 disks. */
	length = pbp->bio_length / (sc->sc_ndisks - 1);
	offset = pbp->bio_offset / (sc->sc_ndisks - 1);
	round_robin = verify = 0;
	switch (pbp->bio_cmd) {
	case BIO_READ:
		/* VERIFY mode reads the parity as well, to check it. */
		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
			pbp->bio_pflags |= G_RAID3_BIO_PFLAG_VERIFY;
			verify = 1;
			ndisks = sc->sc_ndisks;
		} else {
			verify = 0;
			ndisks = sc->sc_ndisks - 1;
		}
		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0 &&
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
			round_robin = 1;
		} else {
			round_robin = 0;
		}
		KASSERT(!round_robin || !verify,
		    ("ROUND-ROBIN and VERIFY are mutually exclusive."));
		/* driver2 remembers the disk to use for recovery reads. */
		pbp->bio_driver2 = &sc->sc_disks[sc->sc_ndisks - 1];
		break;
	case BIO_WRITE:
	case BIO_DELETE:
	    {
		struct g_raid3_disk_sync *sync;

		if (sc->sc_idle)
			g_raid3_unidle(sc);

		ndisks = sc->sc_ndisks;

		if (sc->sc_syncdisk == NULL)
			break;
		/*
		 * A write into an already-synchronized region forces a
		 * resync of that region from a MAXPHYS-aligned offset.
		 */
		sync = &sc->sc_syncdisk->d_sync;
		if (offset >= sync->ds_offset)
			break;
		if (offset + length <= sync->ds_offset_done)
			break;
		if (offset >= sync->ds_resync && sync->ds_resync != -1)
			break;
		sync->ds_resync = offset - (offset % MAXPHYS);
		break;
	    }
	}
	for (n = 0; n < ndisks; n++) {
		disk = &sc->sc_disks[n];
		cbp = g_raid3_clone_bio(sc, pbp);
		if (cbp == NULL) {
			/* Out of memory; undo all clones and let caller retry. */
			while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
				g_raid3_destroy_bio(sc, cbp);
			return (ENOMEM);
		}
		cbp->bio_offset = offset;
		cbp->bio_length = length;
		cbp->bio_done = g_raid3_done;
		switch (pbp->bio_cmd) {
		case BIO_READ:
			if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
				/*
				 * Replace invalid component with the parity
				 * component.
				 */
				disk = &sc->sc_disks[sc->sc_ndisks - 1];
				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
				pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
			} else if (round_robin &&
			    disk->d_no == sc->sc_round_robin) {
				/*
				 * In round-robin mode skip one data component
				 * and use parity component when reading.
				 */
				pbp->bio_driver2 = disk;
				disk = &sc->sc_disks[sc->sc_ndisks - 1];
				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
				sc->sc_round_robin++;
				round_robin = 0;
			} else if (verify && disk->d_no == sc->sc_ndisks - 1) {
				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
			}
			break;
		case BIO_WRITE:
		case BIO_DELETE:
			if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
			    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
				if (n == ndisks - 1) {
					/*
					 * Active parity component, mark it as such.
					 */
					cbp->bio_cflags |=
					    G_RAID3_BIO_CFLAG_PARITY;
				}
			} else {
				pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
				if (n == ndisks - 1) {
					/*
					 * Parity component is not connected,
					 * so destroy its request.
					 */
					pbp->bio_pflags |=
					    G_RAID3_BIO_PFLAG_NOPARITY;
					g_raid3_destroy_bio(sc, cbp);
					cbp = NULL;
				} else {
					cbp->bio_cflags |=
					    G_RAID3_BIO_CFLAG_NODISK;
					disk = NULL;
				}
			}
			break;
		}
		if (cbp != NULL)
			cbp->bio_caller2 = disk;
	}
	switch (pbp->bio_cmd) {
	case BIO_READ:
		if (round_robin) {
			/*
			 * If we are in round-robin mode and 'round_robin' is
			 * still 1, it means, that we skipped parity component
			 * for this read and must reset sc_round_robin field.
			 */
			sc->sc_round_robin = 0;
		}
		G_RAID3_FOREACH_BIO(pbp, cbp) {
			disk = cbp->bio_caller2;
			cp = disk->d_consumer;
			cbp->bio_to = cp->provider;
			G_RAID3_LOGREQ(3, cbp, "Sending request.");
			KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1,
			    ("Consumer %s not opened (r%dw%de%d).",
			    cp->provider->name, cp->acr, cp->acw, cp->ace));
			cp->index++;
			g_io_request(cbp, cp);
		}
		break;
	case BIO_WRITE:
	case BIO_DELETE:
		/*
		 * Bump syncid on first write.
		 */
		if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0) {
			sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
			g_topology_lock();
			g_raid3_bump_syncid(sc);
			g_topology_unlock();
		}
		g_raid3_scatter(pbp);
		break;
	}
	return (0);
}
1655 
1656 static int
1657 g_raid3_can_destroy(struct g_raid3_softc *sc)
1658 {
1659 	struct g_geom *gp;
1660 	struct g_consumer *cp;
1661 
1662 	g_topology_assert();
1663 	gp = sc->sc_geom;
1664 	LIST_FOREACH(cp, &gp->consumer, consumer) {
1665 		if (g_raid3_is_busy(sc, cp))
1666 			return (0);
1667 	}
1668 	gp = sc->sc_sync.ds_geom;
1669 	LIST_FOREACH(cp, &gp->consumer, consumer) {
1670 		if (g_raid3_is_busy(sc, cp))
1671 			return (0);
1672 	}
1673 	G_RAID3_DEBUG(2, "No I/O requests for %s, it can be destroyed.",
1674 	    sc->sc_name);
1675 	return (1);
1676 }
1677 
1678 static int
1679 g_raid3_try_destroy(struct g_raid3_softc *sc)
1680 {
1681 
1682 	g_topology_lock();
1683 	if (!g_raid3_can_destroy(sc)) {
1684 		g_topology_unlock();
1685 		return (0);
1686 	}
1687 	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_WAIT) != 0) {
1688 		g_topology_unlock();
1689 		G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
1690 		    &sc->sc_worker);
1691 		wakeup(&sc->sc_worker);
1692 		sc->sc_worker = NULL;
1693 	} else {
1694 		g_raid3_destroy_device(sc);
1695 		g_topology_unlock();
1696 		free(sc->sc_disks, M_RAID3);
1697 		free(sc, M_RAID3);
1698 	}
1699 	return (1);
1700 }
1701 
1702 /*
1703  * Worker thread.
1704  */
1705 static void
1706 g_raid3_worker(void *arg)
1707 {
1708 	struct g_raid3_softc *sc;
1709 	struct g_raid3_disk *disk;
1710 	struct g_raid3_disk_sync *sync;
1711 	struct g_raid3_event *ep;
1712 	struct bio *bp;
1713 	u_int nreqs;
1714 
1715 	sc = arg;
1716 	mtx_lock_spin(&sched_lock);
1717 	sched_prio(curthread, PRIBIO);
1718 	mtx_unlock_spin(&sched_lock);
1719 
1720 	nreqs = 0;
1721 	for (;;) {
1722 		G_RAID3_DEBUG(5, "%s: Let's see...", __func__);
1723 		/*
1724 		 * First take a look at events.
1725 		 * This is important to handle events before any I/O requests.
1726 		 */
1727 		ep = g_raid3_event_get(sc);
1728 		if (ep != NULL && g_topology_try_lock()) {
1729 			g_raid3_event_remove(sc, ep);
1730 			if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) {
1731 				/* Update only device status. */
1732 				G_RAID3_DEBUG(3,
1733 				    "Running event for device %s.",
1734 				    sc->sc_name);
1735 				ep->e_error = 0;
1736 				g_raid3_update_device(sc, 1);
1737 			} else {
1738 				/* Update disk status. */
1739 				G_RAID3_DEBUG(3, "Running event for disk %s.",
1740 				     g_raid3_get_diskname(ep->e_disk));
1741 				ep->e_error = g_raid3_update_disk(ep->e_disk,
1742 				    ep->e_state);
1743 				if (ep->e_error == 0)
1744 					g_raid3_update_device(sc, 0);
1745 			}
1746 			g_topology_unlock();
1747 			if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) {
1748 				KASSERT(ep->e_error == 0,
1749 				    ("Error cannot be handled."));
1750 				g_raid3_event_free(ep);
1751 			} else {
1752 				ep->e_flags |= G_RAID3_EVENT_DONE;
1753 				G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
1754 				    ep);
1755 				mtx_lock(&sc->sc_events_mtx);
1756 				wakeup(ep);
1757 				mtx_unlock(&sc->sc_events_mtx);
1758 			}
1759 			if ((sc->sc_flags &
1760 			    G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
1761 				if (g_raid3_try_destroy(sc))
1762 					kthread_exit(0);
1763 			}
1764 			G_RAID3_DEBUG(5, "%s: I'm here 1.", __func__);
1765 			continue;
1766 		}
1767 		/*
1768 		 * Now I/O requests.
1769 		 */
1770 		/* Get first request from the queue. */
1771 		mtx_lock(&sc->sc_queue_mtx);
1772 		bp = bioq_first(&sc->sc_queue);
1773 		if (bp == NULL) {
1774 			if (ep != NULL) {
1775 				/*
1776 				 * No I/O requests and topology lock was
1777 				 * already held? Try again.
1778 				 */
1779 				mtx_unlock(&sc->sc_queue_mtx);
1780 				tsleep(ep, PRIBIO, "r3:top1", hz / 5);
1781 				continue;
1782 			}
1783 			if ((sc->sc_flags &
1784 			    G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
1785 				mtx_unlock(&sc->sc_queue_mtx);
1786 				if (g_raid3_try_destroy(sc))
1787 					kthread_exit(0);
1788 				mtx_lock(&sc->sc_queue_mtx);
1789 			}
1790 		}
1791 		if (sc->sc_syncdisk != NULL &&
1792 		    (bp == NULL || nreqs > g_raid3_reqs_per_sync)) {
1793 			mtx_unlock(&sc->sc_queue_mtx);
1794 			/*
1795 			 * It is time for synchronization...
1796 			 */
1797 			nreqs = 0;
1798 			disk = sc->sc_syncdisk;
1799 			sync = &disk->d_sync;
1800 			if (sync->ds_offset <
1801 			    sc->sc_mediasize / (sc->sc_ndisks - 1) &&
1802 			    sync->ds_offset == sync->ds_offset_done) {
1803 				if (sync->ds_resync != -1) {
1804 					sync->ds_offset = sync->ds_resync;
1805 					sync->ds_offset_done = sync->ds_resync;
1806 					sync->ds_resync = -1;
1807 				}
1808 				g_raid3_sync_one(sc);
1809 			}
1810 			G_RAID3_DEBUG(5, "%s: I'm here 2.", __func__);
1811 			goto sleep;
1812 		}
1813 		if (bp == NULL) {
1814 			if (g_raid3_check_idle(sc)) {
1815 				u_int idletime;
1816 
1817 				idletime = g_raid3_idletime;
1818 				if (idletime == 0)
1819 					idletime = 1;
1820 				idletime *= hz;
1821 				if (msleep(sc, &sc->sc_queue_mtx, PRIBIO | PDROP,
1822 				    "r3:w1", idletime) == EWOULDBLOCK) {
1823 					G_RAID3_DEBUG(5, "%s: I'm here 3.",
1824 					    __func__);
1825 					/*
1826 					 * No I/O requests in 'idletime'
1827 					 * seconds, so mark components as clean.
1828 					 */
1829 					g_raid3_idle(sc);
1830 				}
1831 				G_RAID3_DEBUG(5, "%s: I'm here 4.", __func__);
1832 			} else {
1833 				MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP,
1834 				    "r3:w2", 0);
1835 				G_RAID3_DEBUG(5, "%s: I'm here 5.", __func__);
1836 			}
1837 			continue;
1838 		}
1839 		nreqs++;
1840 		bioq_remove(&sc->sc_queue, bp);
1841 		mtx_unlock(&sc->sc_queue_mtx);
1842 
1843 		if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0) {
1844 			g_raid3_regular_request(bp);
1845 		} else if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0) {
1846 			u_int timeout, sps;
1847 
1848 			g_raid3_sync_request(bp);
1849 sleep:
1850 			sps = atomic_load_acq_int(&g_raid3_syncs_per_sec);
1851 			if (sps == 0) {
1852 				G_RAID3_DEBUG(5, "%s: I'm here 6.", __func__);
1853 				continue;
1854 			}
1855 			if (ep != NULL) {
1856 				/*
1857 				 * We have some pending events, don't sleep now.
1858 				 */
1859 				G_RAID3_DEBUG(5, "%s: I'm here 7.", __func__);
1860 				tsleep(ep, PRIBIO, "r3:top2", hz / 5);
1861 				continue;
1862 			}
1863 			mtx_lock(&sc->sc_queue_mtx);
1864 			if (bioq_first(&sc->sc_queue) != NULL) {
1865 				mtx_unlock(&sc->sc_queue_mtx);
1866 				G_RAID3_DEBUG(5, "%s: I'm here 8.", __func__);
1867 				continue;
1868 			}
1869 			timeout = hz / sps;
1870 			if (timeout == 0)
1871 				timeout = 1;
1872 			MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:w2",
1873 			    timeout);
1874 		} else {
1875 			if (g_raid3_register_request(bp) != 0) {
1876 				mtx_lock(&sc->sc_queue_mtx);
1877 				bioq_insert_tail(&sc->sc_queue, bp);
1878 				MSLEEP(&sc->sc_queue, &sc->sc_queue_mtx,
1879 				    PRIBIO | PDROP, "r3:lowmem", hz / 10);
1880 			}
1881 		}
1882 		G_RAID3_DEBUG(5, "%s: I'm here 9.", __func__);
1883 	}
1884 }
1885 
1886 /*
1887  * Open disk's consumer if needed.
1888  */
1889 static void
1890 g_raid3_update_access(struct g_raid3_disk *disk)
1891 {
1892 	struct g_provider *pp;
1893 
1894 	g_topology_assert();
1895 
1896 	pp = disk->d_softc->sc_provider;
1897 	if (pp == NULL)
1898 		return;
1899 	if (pp->acw > 0) {
1900 		if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) == 0) {
1901 			G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
1902 			    g_raid3_get_diskname(disk), disk->d_softc->sc_name);
1903 			disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
1904 		}
1905 	} else if (pp->acw == 0) {
1906 		if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0) {
1907 			G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
1908 			    g_raid3_get_diskname(disk), disk->d_softc->sc_name);
1909 			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
1910 		}
1911 	}
1912 }
1913 
/*
 * Begin synchronization of the first disk found in SYNCHRONIZING state:
 * create and open a dedicated sync consumer attached to our own provider
 * and allocate the transfer buffer.  No-op if no disk needs syncing.
 */
static void
g_raid3_sync_start(struct g_raid3_softc *sc)
{
	struct g_raid3_disk *disk;
	int error;
	u_int n;

	g_topology_assert();

	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
	    ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
	    sc->sc_state));
	KASSERT(sc->sc_syncdisk == NULL, ("Syncdisk is not NULL (%s, %u).",
	    sc->sc_name, sc->sc_state));
	/* Pick the first disk that is marked for synchronization. */
	disk = NULL;
	for (n = 0; n < sc->sc_ndisks; n++) {
		if (sc->sc_disks[n].d_state != G_RAID3_DISK_STATE_SYNCHRONIZING)
			continue;
		disk = &sc->sc_disks[n];
		break;
	}
	if (disk == NULL)
		return;

	G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name,
	    g_raid3_get_diskname(disk));
	disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
	KASSERT(disk->d_sync.ds_consumer == NULL,
	    ("Sync consumer already exists (device=%s, disk=%s).",
	    sc->sc_name, g_raid3_get_diskname(disk)));
	/* The sync consumer reads from the RAID3 provider itself. */
	disk->d_sync.ds_consumer = g_new_consumer(sc->sc_sync.ds_geom);
	disk->d_sync.ds_consumer->private = disk;
	disk->d_sync.ds_consumer->index = 0;
	error = g_attach(disk->d_sync.ds_consumer, disk->d_softc->sc_provider);
	KASSERT(error == 0, ("Cannot attach to %s (error=%d).",
	    disk->d_softc->sc_name, error));
	error = g_access(disk->d_sync.ds_consumer, 1, 0, 0);
	KASSERT(error == 0, ("Cannot open %s (error=%d).",
	    disk->d_softc->sc_name, error));
	disk->d_sync.ds_data = malloc(MAXPHYS, M_RAID3, M_WAITOK);
	sc->sc_syncdisk = disk;
}
1956 
1957 /*
1958  * Stop synchronization process.
1959  * type: 0 - synchronization finished
1960  *       1 - synchronization stopped
1961  */
1962 static void
1963 g_raid3_sync_stop(struct g_raid3_softc *sc, int type)
1964 {
1965 	struct g_raid3_disk *disk;
1966 
1967 	g_topology_assert();
1968 	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
1969 	    ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
1970 	    sc->sc_state));
1971 	disk = sc->sc_syncdisk;
1972 	sc->sc_syncdisk = NULL;
1973 	KASSERT(disk != NULL, ("No disk was synchronized (%s).", sc->sc_name));
1974 	KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
1975 	    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
1976 	    g_raid3_disk_state2str(disk->d_state)));
1977 	if (disk->d_sync.ds_consumer == NULL)
1978 		return;
1979 
1980 	if (type == 0) {
1981 		G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s finished.",
1982 		    disk->d_softc->sc_name, g_raid3_get_diskname(disk));
1983 	} else /* if (type == 1) */ {
1984 		G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s stopped.",
1985 		    disk->d_softc->sc_name, g_raid3_get_diskname(disk));
1986 	}
1987 	g_raid3_kill_consumer(disk->d_softc, disk->d_sync.ds_consumer);
1988 	free(disk->d_sync.ds_data, M_RAID3);
1989 	disk->d_sync.ds_consumer = NULL;
1990 	disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
1991 }
1992 
1993 static void
1994 g_raid3_launch_provider(struct g_raid3_softc *sc)
1995 {
1996 	struct g_provider *pp;
1997 
1998 	g_topology_assert();
1999 
2000 	pp = g_new_providerf(sc->sc_geom, "raid3/%s", sc->sc_name);
2001 	pp->mediasize = sc->sc_mediasize;
2002 	pp->sectorsize = sc->sc_sectorsize;
2003 	sc->sc_provider = pp;
2004 	g_error_provider(pp, 0);
2005 	G_RAID3_DEBUG(0, "Device %s: provider %s launched.", sc->sc_name,
2006 	    pp->name);
2007 	if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED)
2008 		g_raid3_sync_start(sc);
2009 }
2010 
2011 static void
2012 g_raid3_destroy_provider(struct g_raid3_softc *sc)
2013 {
2014 	struct bio *bp;
2015 
2016 	g_topology_assert();
2017 	KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).",
2018 	    sc->sc_name));
2019 
2020 	g_error_provider(sc->sc_provider, ENXIO);
2021 	mtx_lock(&sc->sc_queue_mtx);
2022 	while ((bp = bioq_first(&sc->sc_queue)) != NULL) {
2023 		bioq_remove(&sc->sc_queue, bp);
2024 		g_io_deliver(bp, ENXIO);
2025 	}
2026 	mtx_unlock(&sc->sc_queue_mtx);
2027 	G_RAID3_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name,
2028 	    sc->sc_provider->name);
2029 	sc->sc_provider->flags |= G_PF_WITHER;
2030 	g_orphan_provider(sc->sc_provider, ENXIO);
2031 	sc->sc_provider = NULL;
2032 	if (sc->sc_syncdisk != NULL)
2033 		g_raid3_sync_stop(sc, 1);
2034 }
2035 
2036 static void
2037 g_raid3_go(void *arg)
2038 {
2039 	struct g_raid3_softc *sc;
2040 
2041 	sc = arg;
2042 	G_RAID3_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name);
2043 	g_raid3_event_send(sc, 0,
2044 	    G_RAID3_EVENT_DONTWAIT | G_RAID3_EVENT_DEVICE);
2045 }
2046 
/*
 * Decide the target state for a newly connected disk by comparing its
 * stored syncid against the device's current syncid.  A disk that is
 * fresher than the running device is destroyed on the spot and
 * G_RAID3_DISK_STATE_NONE is returned.
 */
static u_int
g_raid3_determine_state(struct g_raid3_disk *disk)
{
	struct g_raid3_softc *sc;
	u_int state;

	sc = disk->d_softc;
	if (sc->sc_syncid == disk->d_sync.ds_syncid) {
		if ((disk->d_flags &
		    G_RAID3_DISK_FLAG_SYNCHRONIZING) == 0) {
			/* Disk does not need synchronization. */
			state = G_RAID3_DISK_STATE_ACTIVE;
		} else {
			if ((sc->sc_flags &
			     G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0  ||
			    (disk->d_flags &
			     G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
				/*
				 * We can start synchronization from
				 * the stored offset.
				 */
				state = G_RAID3_DISK_STATE_SYNCHRONIZING;
			} else {
				state = G_RAID3_DISK_STATE_STALE;
			}
		}
	} else if (disk->d_sync.ds_syncid < sc->sc_syncid) {
		/*
		 * Reset all synchronization data for this disk,
		 * because even if it was synchronized, it was
		 * synchronized to disks with a different syncid.
		 */
		disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
		disk->d_sync.ds_offset = 0;
		disk->d_sync.ds_offset_done = 0;
		disk->d_sync.ds_syncid = sc->sc_syncid;
		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
		    (disk->d_flags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
			state = G_RAID3_DISK_STATE_SYNCHRONIZING;
		} else {
			state = G_RAID3_DISK_STATE_STALE;
		}
	} else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ {
		/*
		 * Not good, NOT GOOD!
		 * It means that the device was started on stale disks
		 * and a fresher disk has just arrived.
		 * If there were writes, the device contents are
		 * inconsistent beyond repair.  The safest choice is to
		 * not touch this disk and to inform the user loudly.
		 */
		G_RAID3_DEBUG(0, "Device %s was started before the freshest "
		    "disk (%s) arrives!! It will not be connected to the "
		    "running device.", sc->sc_name,
		    g_raid3_get_diskname(disk));
		g_raid3_destroy_disk(disk);
		state = G_RAID3_DISK_STATE_NONE;
		/* Return immediately, because disk was destroyed. */
		return (state);
	}
	G_RAID3_DEBUG(3, "State for %s disk: %s.",
	    g_raid3_get_diskname(disk), g_raid3_disk_state2str(state));
	return (state);
}
2111 
2112 /*
2113  * Update device state.
2114  */
/*
 * Update device state.
 *
 * Called with the topology lock held whenever component membership or
 * state changes.  In STARTING state it elects the authoritative genid
 * and syncid, throws away broken/stale components and transitions to
 * DEGRADED or COMPLETE.  In DEGRADED/COMPLETE it bumps genid if
 * requested, re-evaluates the device state from the number of ACTIVE
 * components, and (re)launches the provider when possible.
 *
 * 'force' is true when the startup timeout expired, i.e. we should
 * start even with one component missing.
 */
static void
g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force)
{
	struct g_raid3_disk *disk;
	u_int state;

	g_topology_assert();

	switch (sc->sc_state) {
	case G_RAID3_DEVICE_STATE_STARTING:
	    {
		u_int n, ndirty, ndisks, genid, syncid;

		KASSERT(sc->sc_provider == NULL,
		    ("Non-NULL provider in STARTING state (%s).", sc->sc_name));
		/*
		 * Are we ready? We are, if all disks are connected or
		 * one disk is missing and 'force' is true.
		 */
		if (g_raid3_ndisks(sc, -1) + force == sc->sc_ndisks) {
			if (!force)
				callout_drain(&sc->sc_callout);
		} else {
			if (force) {
				/*
				 * Timeout expired, so destroy device.
				 */
				sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
			}
			return;
		}

		/*
		 * Find the biggest genid.
		 */
		genid = 0;
		for (n = 0; n < sc->sc_ndisks; n++) {
			disk = &sc->sc_disks[n];
			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
				continue;
			if (disk->d_genid > genid)
				genid = disk->d_genid;
		}
		sc->sc_genid = genid;
		/*
		 * Remove all disks without the biggest genid.
		 */
		for (n = 0; n < sc->sc_ndisks; n++) {
			disk = &sc->sc_disks[n];
			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
				continue;
			if (disk->d_genid < genid) {
				G_RAID3_DEBUG(0,
				    "Component %s (device %s) broken, skipping.",
				    g_raid3_get_diskname(disk), sc->sc_name);
				g_raid3_destroy_disk(disk);
			}
		}

		/*
		 * There must be at least 'sc->sc_ndisks - 1' components
		 * with the same syncid and without SYNCHRONIZING flag.
		 */

		/*
		 * Find the biggest syncid, number of valid components and
		 * number of dirty components.
		 */
		ndirty = ndisks = syncid = 0;
		for (n = 0; n < sc->sc_ndisks; n++) {
			disk = &sc->sc_disks[n];
			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
				continue;
			if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0)
				ndirty++;
			if (disk->d_sync.ds_syncid > syncid) {
				/* Newer syncid found - restart counting. */
				syncid = disk->d_sync.ds_syncid;
				ndisks = 0;
			} else if (disk->d_sync.ds_syncid < syncid) {
				continue;
			}
			if ((disk->d_flags &
			    G_RAID3_DISK_FLAG_SYNCHRONIZING) != 0) {
				continue;
			}
			ndisks++;
		}
		/*
		 * Do we have enough valid components?
		 */
		if (ndisks + 1 < sc->sc_ndisks) {
			G_RAID3_DEBUG(0,
			    "Device %s is broken, too few valid components.",
			    sc->sc_name);
			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
			return;
		}
		/*
		 * If there is one DIRTY component and all disks are present,
		 * mark it for synchronization. If there is more than one DIRTY
		 * component, mark parity component for synchronization.
		 */
		if (ndisks == sc->sc_ndisks && ndirty == 1) {
			for (n = 0; n < sc->sc_ndisks; n++) {
				disk = &sc->sc_disks[n];
				if ((disk->d_flags &
				    G_RAID3_DISK_FLAG_DIRTY) == 0) {
					continue;
				}
				disk->d_flags |=
				    G_RAID3_DISK_FLAG_SYNCHRONIZING;
			}
		} else if (ndisks == sc->sc_ndisks && ndirty > 1) {
			/* Parity component is the last one. */
			disk = &sc->sc_disks[sc->sc_ndisks - 1];
			disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
		}

		sc->sc_syncid = syncid;
		if (force) {
			/* Remember to bump syncid on first write. */
			sc->sc_bump_id |= G_RAID3_BUMP_SYNCID;
		}
		if (ndisks == sc->sc_ndisks)
			state = G_RAID3_DEVICE_STATE_COMPLETE;
		else /* if (ndisks == sc->sc_ndisks - 1) */
			state = G_RAID3_DEVICE_STATE_DEGRADED;
		G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.",
		    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
		    g_raid3_device_state2str(state));
		sc->sc_state = state;
		/* Route every present disk to its per-disk target state. */
		for (n = 0; n < sc->sc_ndisks; n++) {
			disk = &sc->sc_disks[n];
			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
				continue;
			state = g_raid3_determine_state(disk);
			g_raid3_event_send(disk, state, G_RAID3_EVENT_DONTWAIT);
			if (state == G_RAID3_DISK_STATE_STALE)
				sc->sc_bump_id |= G_RAID3_BUMP_SYNCID;
		}
		break;
	    }
	case G_RAID3_DEVICE_STATE_DEGRADED:
		/*
		 * Genid need to be bumped immediately, so do it here.
		 */
		if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) {
			sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
			g_raid3_bump_genid(sc);
		}

		/* Wait until all NEW disks have been classified. */
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
			return;
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) <
		    sc->sc_ndisks - 1) {
			/* Too few ACTIVE disks even for degraded mode. */
			if (sc->sc_provider != NULL)
				g_raid3_destroy_provider(sc);
			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
			return;
		}
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
		    sc->sc_ndisks) {
			state = G_RAID3_DEVICE_STATE_COMPLETE;
			G_RAID3_DEBUG(1,
			    "Device %s state changed from %s to %s.",
			    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
			    g_raid3_device_state2str(state));
			sc->sc_state = state;
		}
		if (sc->sc_provider == NULL)
			g_raid3_launch_provider(sc);
		break;
	case G_RAID3_DEVICE_STATE_COMPLETE:
		/*
		 * Genid need to be bumped immediately, so do it here.
		 */
		if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) {
			sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
			g_raid3_bump_genid(sc);
		}

		/* Wait until all NEW disks have been classified. */
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
			return;
		KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) >=
		    sc->sc_ndisks - 1,
		    ("Too few ACTIVE components in COMPLETE state (device %s).",
		    sc->sc_name));
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
		    sc->sc_ndisks - 1) {
			state = G_RAID3_DEVICE_STATE_DEGRADED;
			G_RAID3_DEBUG(1,
			    "Device %s state changed from %s to %s.",
			    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
			    g_raid3_device_state2str(state));
			sc->sc_state = state;
		}
		if (sc->sc_provider == NULL)
			g_raid3_launch_provider(sc);
		break;
	default:
		KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state)));
		break;
	}
}
2319 
2320 /*
2321  * Update disk state and device state if needed.
2322  */
/*
 * Update disk state and device state if needed.
 */
#define	DISK_STATE_CHANGED()	G_RAID3_DEBUG(1,			\
	"Disk %s state changed from %s to %s (device %s).",		\
	g_raid3_get_diskname(disk),					\
	g_raid3_disk_state2str(disk->d_state),				\
	g_raid3_disk_state2str(state), sc->sc_name)
/*
 * Transition a single disk to the given state.  Each case asserts the
 * legal previous disk/device states, performs the side effects of the
 * transition (flag updates, metadata write, starting/stopping
 * synchronization) and may re-enter via 'again' when a NEW disk is
 * immediately reclassified.  Called with the topology lock held.
 */
static int
g_raid3_update_disk(struct g_raid3_disk *disk, u_int state)
{
	struct g_raid3_softc *sc;

	g_topology_assert();

	sc = disk->d_softc;
again:
	G_RAID3_DEBUG(3, "Changing disk %s state from %s to %s.",
	    g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state),
	    g_raid3_disk_state2str(state));
	switch (state) {
	case G_RAID3_DISK_STATE_NEW:
		/*
		 * Possible scenarios:
		 * 1. New disk arrives.
		 */
		/* Previous state should be NONE. */
		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NONE,
		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		DISK_STATE_CHANGED();

		disk->d_state = state;
		G_RAID3_DEBUG(0, "Device %s: provider %s detected.",
		    sc->sc_name, g_raid3_get_diskname(disk));
		/* In STARTING state classification happens later. */
		if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING)
			break;
		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		/* Classify the new disk and apply that state right away. */
		state = g_raid3_determine_state(disk);
		if (state != G_RAID3_DISK_STATE_NONE)
			goto again;
		break;
	case G_RAID3_DISK_STATE_ACTIVE:
		/*
		 * Possible scenarios:
		 * 1. New disk does not need synchronization.
		 * 2. Synchronization process finished successfully.
		 */
		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		/* Previous state should be NEW or SYNCHRONIZING. */
		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW ||
		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		DISK_STATE_CHANGED();

		if (disk->d_state == G_RAID3_DISK_STATE_NEW)
			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
		else if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
			/* Synchronization finished - clear sync flags. */
			disk->d_flags &= ~G_RAID3_DISK_FLAG_SYNCHRONIZING;
			disk->d_flags &= ~G_RAID3_DISK_FLAG_FORCE_SYNC;
			g_raid3_sync_stop(sc, 0);
		}
		disk->d_state = state;
		disk->d_sync.ds_offset = 0;
		disk->d_sync.ds_offset_done = 0;
		g_raid3_update_access(disk);
		g_raid3_update_metadata(disk);
		G_RAID3_DEBUG(0, "Device %s: provider %s activated.",
		    sc->sc_name, g_raid3_get_diskname(disk));
		break;
	case G_RAID3_DISK_STATE_STALE:
		/*
		 * Possible scenarios:
		 * 1. Stale disk was connected.
		 */
		/* Previous state should be NEW. */
		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		/*
		 * STALE state is only possible if device is marked
		 * NOAUTOSYNC.
		 */
		KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		DISK_STATE_CHANGED();

		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
		disk->d_state = state;
		g_raid3_update_metadata(disk);
		G_RAID3_DEBUG(0, "Device %s: provider %s is stale.",
		    sc->sc_name, g_raid3_get_diskname(disk));
		break;
	case G_RAID3_DISK_STATE_SYNCHRONIZING:
		/*
		 * Possible scenarios:
		 * 1. Disk which needs synchronization was connected.
		 */
		/* Previous state should be NEW. */
		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		DISK_STATE_CHANGED();

		if (disk->d_state == G_RAID3_DISK_STATE_NEW)
			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
		disk->d_state = state;
		/* Without a provider synchronization cannot start yet. */
		if (sc->sc_provider != NULL) {
			g_raid3_sync_start(sc);
			g_raid3_update_metadata(disk);
		}
		break;
	case G_RAID3_DISK_STATE_DISCONNECTED:
		/*
		 * Possible scenarios:
		 * 1. Device wasn't running yet, but disk disappeared.
		 * 2. Disk was active and disappeared.
		 * 3. Disk disappeared during synchronization process.
		 */
		if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
			/*
			 * Previous state should be ACTIVE, STALE or
			 * SYNCHRONIZING.
			 */
			KASSERT(disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
			    disk->d_state == G_RAID3_DISK_STATE_STALE ||
			    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
			    ("Wrong disk state (%s, %s).",
			    g_raid3_get_diskname(disk),
			    g_raid3_disk_state2str(disk->d_state)));
		} else if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) {
			/* Previous state should be NEW. */
			KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
			    ("Wrong disk state (%s, %s).",
			    g_raid3_get_diskname(disk),
			    g_raid3_disk_state2str(disk->d_state)));
			/*
			 * Reset bumping syncid if disk disappeared in STARTING
			 * state.
			 */
			if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0)
				sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
#ifdef	INVARIANTS
		} else {
			KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).",
			    sc->sc_name,
			    g_raid3_device_state2str(sc->sc_state),
			    g_raid3_get_diskname(disk),
			    g_raid3_disk_state2str(disk->d_state)));
#endif
		}
		DISK_STATE_CHANGED();
		G_RAID3_DEBUG(0, "Device %s: provider %s disconnected.",
		    sc->sc_name, g_raid3_get_diskname(disk));

		g_raid3_destroy_disk(disk);
		break;
	default:
		KASSERT(1 == 0, ("Unknown state (%u).", state));
		break;
	}
	return (0);
}
#undef	DISK_STATE_CHANGED
2511 
/*
 * Read and decode the RAID3 metadata stored in the last sector of the
 * given consumer's provider.  The topology lock is dropped around the
 * actual I/O.  Returns 0 on success or an errno value (EINVAL for bad
 * magic, too-new version or MD5 mismatch).
 */
int
g_raid3_read_metadata(struct g_consumer *cp, struct g_raid3_metadata *md)
{
	struct g_provider *pp;
	u_char *buf;
	int error;

	g_topology_assert();

	/* Open the consumer for reading around the metadata I/O. */
	error = g_access(cp, 1, 0, 0);
	if (error != 0)
		return (error);
	pp = cp->provider;
	g_topology_unlock();
	/* Metadata are stored on last sector. */
	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
	    &error);
	g_topology_lock();
	g_access(cp, -1, 0, 0);
	if (error != 0) {
		G_RAID3_DEBUG(1, "Cannot read metadata from %s (error=%d).",
		    cp->provider->name, error);
		if (buf != NULL)
			g_free(buf);
		return (error);
	}

	/* Decode metadata. */
	error = raid3_metadata_decode(buf, md);
	g_free(buf);
	/* Reject foreign data before looking at the decode result. */
	if (strcmp(md->md_magic, G_RAID3_MAGIC) != 0)
		return (EINVAL);
	if (md->md_version > G_RAID3_VERSION) {
		G_RAID3_DEBUG(0,
		    "Kernel module is too old to handle metadata from %s.",
		    cp->provider->name);
		return (EINVAL);
	}
	/* Non-zero decode result means the MD5 hash did not match. */
	if (error != 0) {
		G_RAID3_DEBUG(1, "MD5 metadata hash mismatch for provider %s.",
		    cp->provider->name);
		return (error);
	}

	return (0);
}
2558 
/*
 * Validate metadata read from provider 'pp' against the already
 * configured device 'sc'.  Returns 0 when the component is acceptable,
 * EEXIST when its slot is taken, EINVAL for any inconsistency.  The
 * order of checks determines which diagnostic is logged first.
 */
static int
g_raid3_check_metadata(struct g_raid3_softc *sc, struct g_provider *pp,
    struct g_raid3_metadata *md)
{

	if (md->md_no >= sc->sc_ndisks) {
		G_RAID3_DEBUG(1, "Invalid disk %s number (no=%u), skipping.",
		    pp->name, md->md_no);
		return (EINVAL);
	}
	if (sc->sc_disks[md->md_no].d_state != G_RAID3_DISK_STATE_NODISK) {
		G_RAID3_DEBUG(1, "Disk %s (no=%u) already exists, skipping.",
		    pp->name, md->md_no);
		return (EEXIST);
	}
	if (md->md_all != sc->sc_ndisks) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_all", pp->name, sc->sc_name);
		return (EINVAL);
	}
	if (md->md_mediasize != sc->sc_mediasize) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_mediasize", pp->name, sc->sc_name);
		return (EINVAL);
	}
	/* Media size must divide evenly among the data components. */
	if ((md->md_mediasize % (sc->sc_ndisks - 1)) != 0) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_mediasize", pp->name, sc->sc_name);
		return (EINVAL);
	}
	/* The component must be big enough for its share of the data. */
	if ((sc->sc_mediasize / (sc->sc_ndisks - 1)) > pp->mediasize) {
		G_RAID3_DEBUG(1,
		    "Invalid size of disk %s (device %s), skipping.", pp->name,
		    sc->sc_name);
		return (EINVAL);
	}
	if ((md->md_sectorsize / pp->sectorsize) < sc->sc_ndisks - 1) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_sectorsize", pp->name, sc->sc_name);
		return (EINVAL);
	}
	if (md->md_sectorsize != sc->sc_sectorsize) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_sectorsize", pp->name, sc->sc_name);
		return (EINVAL);
	}
	/* Device sector size must be a multiple of the component's. */
	if ((sc->sc_sectorsize % pp->sectorsize) != 0) {
		G_RAID3_DEBUG(1,
		    "Invalid sector size of disk %s (device %s), skipping.",
		    pp->name, sc->sc_name);
		return (EINVAL);
	}
	if ((md->md_mflags & ~G_RAID3_DEVICE_FLAG_MASK) != 0) {
		G_RAID3_DEBUG(1,
		    "Invalid device flags on disk %s (device %s), skipping.",
		    pp->name, sc->sc_name);
		return (EINVAL);
	}
	if ((md->md_mflags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
	    (md->md_mflags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0) {
		/*
		 * VERIFY and ROUND-ROBIN options are mutually exclusive.
		 */
		G_RAID3_DEBUG(1, "Both VERIFY and ROUND-ROBIN flags exist on "
		    "disk %s (device %s), skipping.", pp->name, sc->sc_name);
		return (EINVAL);
	}
	if ((md->md_dflags & ~G_RAID3_DISK_FLAG_MASK) != 0) {
		G_RAID3_DEBUG(1,
		    "Invalid disk flags on disk %s (device %s), skipping.",
		    pp->name, sc->sc_name);
		return (EINVAL);
	}
	return (0);
}
2639 
/*
 * Attach the component described by 'md' on provider 'pp' to device
 * 'sc': validate the metadata, initialize the disk structure, queue a
 * NEW-state event for it and upgrade on-disk metadata written by an
 * older module version.  Returns 0 or an errno value.
 */
int
g_raid3_add_disk(struct g_raid3_softc *sc, struct g_provider *pp,
    struct g_raid3_metadata *md)
{
	struct g_raid3_disk *disk;
	int error;

	g_topology_assert();
	G_RAID3_DEBUG(2, "Adding disk %s.", pp->name);

	error = g_raid3_check_metadata(sc, pp, md);
	if (error != 0)
		return (error);
	/* On a running device reject components with an older genid. */
	if (sc->sc_state != G_RAID3_DEVICE_STATE_STARTING &&
	    md->md_genid < sc->sc_genid) {
		G_RAID3_DEBUG(0, "Component %s (device %s) broken, skipping.",
		    pp->name, sc->sc_name);
		return (EINVAL);
	}
	disk = g_raid3_init_disk(sc, pp, md, &error);
	if (disk == NULL)
		return (error);
	error = g_raid3_event_send(disk, G_RAID3_DISK_STATE_NEW,
	    G_RAID3_EVENT_WAIT);
	if (error != 0)
		return (error);
	if (md->md_version < G_RAID3_VERSION) {
		/* Rewrite metadata in the current on-disk format. */
		G_RAID3_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).",
		    pp->name, md->md_version, G_RAID3_VERSION);
		g_raid3_update_metadata(disk);
	}
	return (0);
}
2673 
/*
 * GEOM access method for the raid3 provider.  Refuses new (positive)
 * access while the device is unusable or being destroyed, and toggles
 * the per-component DIRTY flag on the first-writer-open /
 * last-writer-close transitions.
 */
static int
g_raid3_access(struct g_provider *pp, int acr, int acw, int ace)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	/*
	 * NOTE(review): only dcw is used below; dcr and dce are computed
	 * but never read in this revision.
	 */
	int dcr, dcw, dce;
	u_int n;

	g_topology_assert();
	G_RAID3_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr,
	    acw, ace);

	dcr = pp->acr + acr;
	dcw = pp->acw + acw;
	dce = pp->ace + ace;

	sc = pp->geom->softc;
	if (sc == NULL ||
	    g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks - 1 ||
	    (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
		/* Releasing access is always allowed; gaining it is not. */
		if (acr <= 0 && acw <= 0 && ace <= 0)
			return (0);
		else
			return (ENXIO);
	}
	for (n = 0; n < sc->sc_ndisks; n++) {
		disk = &sc->sc_disks[n];
		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
			continue;
		/*
		 * Mark disk as dirty on open and unmark on close.
		 */
		if (pp->acw == 0 && dcw > 0) {
			G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
			    g_raid3_get_diskname(disk), sc->sc_name);
			disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
			g_raid3_update_metadata(disk);
		} else if (pp->acw > 0 && dcw == 0) {
			G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
			    g_raid3_get_diskname(disk), sc->sc_name);
			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
			g_raid3_update_metadata(disk);
		}
	}
	return (0);
}
2720 
2721 static struct g_geom *
2722 g_raid3_create(struct g_class *mp, const struct g_raid3_metadata *md)
2723 {
2724 	struct g_raid3_softc *sc;
2725 	struct g_geom *gp;
2726 	int error, timeout;
2727 	u_int n;
2728 
2729 	g_topology_assert();
2730 	G_RAID3_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id);
2731 
2732 	/* One disk is minimum. */
2733 	if (md->md_all < 1)
2734 		return (NULL);
2735 	/*
2736 	 * Action geom.
2737 	 */
2738 	gp = g_new_geomf(mp, "%s", md->md_name);
2739 	sc = malloc(sizeof(*sc), M_RAID3, M_WAITOK | M_ZERO);
2740 	sc->sc_disks = malloc(sizeof(struct g_raid3_disk) * md->md_all, M_RAID3,
2741 	    M_WAITOK | M_ZERO);
2742 	gp->start = g_raid3_start;
2743 	gp->orphan = g_raid3_orphan;
2744 	gp->access = g_raid3_access;
2745 	gp->dumpconf = g_raid3_dumpconf;
2746 
2747 	sc->sc_id = md->md_id;
2748 	sc->sc_mediasize = md->md_mediasize;
2749 	sc->sc_sectorsize = md->md_sectorsize;
2750 	sc->sc_ndisks = md->md_all;
2751 	sc->sc_round_robin = 0;
2752 	sc->sc_flags = md->md_mflags;
2753 	sc->sc_bump_id = 0;
2754 	sc->sc_idle = 0;
2755 	for (n = 0; n < sc->sc_ndisks; n++) {
2756 		sc->sc_disks[n].d_softc = sc;
2757 		sc->sc_disks[n].d_no = n;
2758 		sc->sc_disks[n].d_state = G_RAID3_DISK_STATE_NODISK;
2759 	}
2760 	bioq_init(&sc->sc_queue);
2761 	mtx_init(&sc->sc_queue_mtx, "graid3:queue", NULL, MTX_DEF);
2762 	TAILQ_INIT(&sc->sc_events);
2763 	mtx_init(&sc->sc_events_mtx, "graid3:events", NULL, MTX_DEF);
2764 	callout_init(&sc->sc_callout, CALLOUT_MPSAFE);
2765 	sc->sc_state = G_RAID3_DEVICE_STATE_STARTING;
2766 	gp->softc = sc;
2767 	sc->sc_geom = gp;
2768 	sc->sc_provider = NULL;
2769 	/*
2770 	 * Synchronization geom.
2771 	 */
2772 	gp = g_new_geomf(mp, "%s.sync", md->md_name);
2773 	gp->softc = sc;
2774 	gp->orphan = g_raid3_orphan;
2775 	sc->sc_sync.ds_geom = gp;
2776 	sc->sc_zone_64k = uma_zcreate("gr3:64k", 65536, NULL, NULL, NULL, NULL,
2777 	    UMA_ALIGN_PTR, 0);
2778 	uma_zone_set_max(sc->sc_zone_64k, g_raid3_n64k);
2779 	sc->sc_zone_16k = uma_zcreate("gr3:16k", 16384, NULL, NULL, NULL, NULL,
2780 	    UMA_ALIGN_PTR, 0);
2781 	uma_zone_set_max(sc->sc_zone_64k, g_raid3_n16k);
2782 	sc->sc_zone_4k = uma_zcreate("gr3:4k", 4096, NULL, NULL, NULL, NULL,
2783 	    UMA_ALIGN_PTR, 0);
2784 	uma_zone_set_max(sc->sc_zone_4k, g_raid3_n4k);
2785 	error = kthread_create(g_raid3_worker, sc, &sc->sc_worker, 0, 0,
2786 	    "g_raid3 %s", md->md_name);
2787 	if (error != 0) {
2788 		G_RAID3_DEBUG(1, "Cannot create kernel thread for %s.",
2789 		    sc->sc_name);
2790 		uma_zdestroy(sc->sc_zone_64k);
2791 		uma_zdestroy(sc->sc_zone_16k);
2792 		uma_zdestroy(sc->sc_zone_4k);
2793 		g_destroy_geom(sc->sc_sync.ds_geom);
2794 		mtx_destroy(&sc->sc_events_mtx);
2795 		mtx_destroy(&sc->sc_queue_mtx);
2796 		g_destroy_geom(sc->sc_geom);
2797 		free(sc->sc_disks, M_RAID3);
2798 		free(sc, M_RAID3);
2799 		return (NULL);
2800 	}
2801 
2802 	G_RAID3_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id);
2803 
2804 	/*
2805 	 * Run timeout.
2806 	 */
2807 	timeout = atomic_load_acq_int(&g_raid3_timeout);
2808 	callout_reset(&sc->sc_callout, timeout * hz, g_raid3_go, sc);
2809 	return (sc->sc_geom);
2810 }
2811 
/*
 * Destroy the given device.  Unless 'force' is true, an open provider
 * makes this fail with EBUSY.  Sets the DESTROY/WAIT flags, wakes the
 * worker thread and waits for it to exit before freeing the softc.
 * The topology lock is dropped while waiting.
 */
int
g_raid3_destroy(struct g_raid3_softc *sc, boolean_t force)
{
	struct g_provider *pp;

	g_topology_assert();

	if (sc == NULL)
		return (ENXIO);
	pp = sc->sc_provider;
	if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
		if (force) {
			G_RAID3_DEBUG(1, "Device %s is still open, so it "
			    "can't be definitely removed.", pp->name);
		} else {
			G_RAID3_DEBUG(1,
			    "Device %s is still open (r%dw%de%d).", pp->name,
			    pp->acr, pp->acw, pp->ace);
			return (EBUSY);
		}
	}

	sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
	sc->sc_flags |= G_RAID3_DEVICE_FLAG_WAIT;
	g_topology_unlock();
	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
	/* Kick the worker so it notices the DESTROY flag. */
	mtx_lock(&sc->sc_queue_mtx);
	wakeup(sc);
	wakeup(&sc->sc_queue);
	mtx_unlock(&sc->sc_queue_mtx);
	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker);
	/* The worker clears sc_worker just before it exits. */
	while (sc->sc_worker != NULL)
		tsleep(&sc->sc_worker, PRIBIO, "r3:destroy", hz / 5);
	G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker);
	g_topology_lock();
	g_raid3_destroy_device(sc);
	free(sc->sc_disks, M_RAID3);
	free(sc, M_RAID3);
	return (0);
}
2852 
/*
 * Orphan method for the short-lived taste consumer.  The consumer is
 * attached, read from and detached with the topology lock held, so
 * this must never fire; panic (under INVARIANTS) if it does.
 */
static void
g_raid3_taste_orphan(struct g_consumer *cp)
{

	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
	    cp->provider->name));
}
2860 
/*
 * GEOM taste method: probe provider 'pp' for raid3 metadata and, if
 * found, attach it as a component to the matching existing device or
 * create a new device for it.  Returns the device's action geom, or
 * NULL when the provider is not ours or the attach fails.
 */
static struct g_geom *
g_raid3_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
{
	struct g_raid3_metadata md;
	struct g_raid3_softc *sc;
	struct g_consumer *cp;
	struct g_geom *gp;
	int error;

	g_topology_assert();
	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
	G_RAID3_DEBUG(2, "Tasting %s.", pp->name);

	/* Temporary geom/consumer just to read the metadata sector. */
	gp = g_new_geomf(mp, "raid3:taste");
	/* This orphan function should be never called. */
	gp->orphan = g_raid3_taste_orphan;
	cp = g_new_consumer(gp);
	g_attach(cp, pp);
	error = g_raid3_read_metadata(cp, &md);
	g_detach(cp);
	g_destroy_consumer(cp);
	g_destroy_geom(gp);
	if (error != 0)
		return (NULL);
	gp = NULL;

	/* Hardcoded provider name must match exactly, if set. */
	if (md.md_provider[0] != '\0' && strcmp(md.md_provider, pp->name) != 0)
		return (NULL);
	if (g_raid3_debug >= 2)
		raid3_metadata_dump(&md);

	/*
	 * Let's check if device already exists.
	 */
	sc = NULL;
	LIST_FOREACH(gp, &mp->geom, geom) {
		sc = gp->softc;
		if (sc == NULL)
			continue;
		/* Skip synchronization geoms; they share the softc. */
		if (sc->sc_sync.ds_geom == gp)
			continue;
		if (strcmp(md.md_name, sc->sc_name) != 0)
			continue;
		/* Same name but different id - refuse to mix devices. */
		if (md.md_id != sc->sc_id) {
			G_RAID3_DEBUG(0, "Device %s already configured.",
			    sc->sc_name);
			return (NULL);
		}
		break;
	}
	if (gp == NULL) {
		gp = g_raid3_create(mp, &md);
		if (gp == NULL) {
			G_RAID3_DEBUG(0, "Cannot create device %s.",
			    md.md_name);
			return (NULL);
		}
		sc = gp->softc;
	}
	G_RAID3_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
	error = g_raid3_add_disk(sc, pp, &md);
	if (error != 0) {
		G_RAID3_DEBUG(0, "Cannot add disk %s to %s (error=%d).",
		    pp->name, gp->name, error);
		/* A freshly created device with no disks is useless. */
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NODISK) ==
		    sc->sc_ndisks) {
			g_raid3_destroy(sc, 1);
		}
		return (NULL);
	}
	return (gp);
}
2933 
2934 static int
2935 g_raid3_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused,
2936     struct g_geom *gp)
2937 {
2938 
2939 	return (g_raid3_destroy(gp->softc, 0));
2940 }
2941 
/*
 * GEOM dumpconf method: emit XML describing either one component
 * (when 'cp' is given) or the whole device (geom-level call) into the
 * sbuf consumed by sysctl kern.geom.confxml.
 */
static void
g_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp, struct g_provider *pp)
{
	struct g_raid3_softc *sc;

	g_topology_assert();

	sc = gp->softc;
	if (sc == NULL)
		return;
	/* Skip synchronization geom. */
	if (gp == sc->sc_sync.ds_geom)
		return;
	if (pp != NULL) {
		/* Nothing here. */
	} else if (cp != NULL) {
		/* Per-component section. */
		struct g_raid3_disk *disk;

		disk = cp->private;
		if (disk == NULL)
			return;
		sbuf_printf(sb, "%s<Type>", indent);
		/* The highest-numbered component holds parity. */
		if (disk->d_no == sc->sc_ndisks - 1)
			sbuf_printf(sb, "PARITY");
		else
			sbuf_printf(sb, "DATA");
		sbuf_printf(sb, "</Type>\n");
		sbuf_printf(sb, "%s<Number>%u</Number>\n", indent,
		    (u_int)disk->d_no);
		if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
			sbuf_printf(sb, "%s<Synchronized>", indent);
			if (disk->d_sync.ds_offset_done == 0)
				sbuf_printf(sb, "0%%");
			else {
				/* Percentage of the per-disk data size. */
				sbuf_printf(sb, "%u%%",
				    (u_int)((disk->d_sync.ds_offset_done * 100) /
				    (sc->sc_mediasize / (sc->sc_ndisks - 1))));
			}
			sbuf_printf(sb, "</Synchronized>\n");
		}
		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent,
		    disk->d_sync.ds_syncid);
		sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, disk->d_genid);
		sbuf_printf(sb, "%s<Flags>", indent);
		if (disk->d_flags == 0)
			sbuf_printf(sb, "NONE");
		else {
			int first = 1;

/* Emit comma-separated flag names for each flag bit that is set. */
#define	ADD_FLAG(flag, name)	do {					\
	if ((disk->d_flags & (flag)) != 0) {				\
		if (!first)						\
			sbuf_printf(sb, ", ");				\
		else							\
			first = 0;					\
		sbuf_printf(sb, name);					\
	}								\
} while (0)
			ADD_FLAG(G_RAID3_DISK_FLAG_DIRTY, "DIRTY");
			ADD_FLAG(G_RAID3_DISK_FLAG_HARDCODED, "HARDCODED");
			ADD_FLAG(G_RAID3_DISK_FLAG_SYNCHRONIZING,
			    "SYNCHRONIZING");
			ADD_FLAG(G_RAID3_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC");
#undef	ADD_FLAG
		}
		sbuf_printf(sb, "</Flags>\n");
		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
		    g_raid3_disk_state2str(disk->d_state));
	} else {
		/* Device-level section. */
		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent, sc->sc_syncid);
		sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, sc->sc_genid);
		sbuf_printf(sb, "%s<Flags>", indent);
		if (sc->sc_flags == 0)
			sbuf_printf(sb, "NONE");
		else {
			int first = 1;

/* Same flag-name emitter as above, but for device flags. */
#define	ADD_FLAG(flag, name)	do {					\
	if ((sc->sc_flags & (flag)) != 0) {				\
		if (!first)						\
			sbuf_printf(sb, ", ");				\
		else							\
			first = 0;					\
		sbuf_printf(sb, name);					\
	}								\
} while (0)
			ADD_FLAG(G_RAID3_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC");
			ADD_FLAG(G_RAID3_DEVICE_FLAG_ROUND_ROBIN,
			    "ROUND-ROBIN");
			ADD_FLAG(G_RAID3_DEVICE_FLAG_VERIFY, "VERIFY");
#undef	ADD_FLAG
		}
		sbuf_printf(sb, "</Flags>\n");
		sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
		    sc->sc_ndisks);
		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
		    g_raid3_device_state2str(sc->sc_state));
	}
}
3043 
3044 static void
3045 g_raid3_shutdown(void *arg, int howto)
3046 {
3047 	struct g_class *mp;
3048 	struct g_geom *gp, *gp2;
3049 
3050 	mp = arg;
3051 	DROP_GIANT();
3052 	g_topology_lock();
3053 	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
3054 		if (gp->softc == NULL)
3055 			continue;
3056 		g_raid3_destroy(gp->softc, 1);
3057 	}
3058 	g_topology_unlock();
3059 	PICKUP_GIANT();
3060 #if 0
3061 	tsleep(&gp, PRIBIO, "r3:shutdown", hz * 20);
3062 #endif
3063 }
3064 
3065 static void
3066 g_raid3_init(struct g_class *mp)
3067 {
3068 
3069 	g_raid3_ehtag = EVENTHANDLER_REGISTER(shutdown_post_sync,
3070 	    g_raid3_shutdown, mp, SHUTDOWN_PRI_FIRST);
3071 	if (g_raid3_ehtag == NULL)
3072 		G_RAID3_DEBUG(0, "Warning! Cannot register shutdown event.");
3073 }
3074 
3075 static void
3076 g_raid3_fini(struct g_class *mp)
3077 {
3078 
3079 	if (g_raid3_ehtag == NULL)
3080 		return;
3081 	EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_raid3_ehtag);
3082 }
3083 
3084 DECLARE_GEOM_CLASS(g_raid3_class, g_raid3);
3085