xref: /freebsd/sys/geom/raid3/g_raid3.c (revision 262e143bd46171a6415a5b28af260a5efa2a3db8)
1 /*-
2  * Copyright (c) 2004-2005 Pawel Jakub Dawidek <pjd@FreeBSD.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29 
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/kernel.h>
33 #include <sys/module.h>
34 #include <sys/limits.h>
35 #include <sys/lock.h>
36 #include <sys/mutex.h>
37 #include <sys/bio.h>
38 #include <sys/sysctl.h>
39 #include <sys/malloc.h>
40 #include <sys/eventhandler.h>
41 #include <vm/uma.h>
42 #include <geom/geom.h>
43 #include <sys/proc.h>
44 #include <sys/kthread.h>
45 #include <sys/sched.h>
46 #include <geom/raid3/g_raid3.h>
47 
48 
/* Malloc type used for the allocations made by this file. */
static MALLOC_DEFINE(M_RAID3, "raid3_data", "GEOM_RAID3 Data");

/*
 * kern.geom.raid3.* knobs.
 * Variables that also have a TUNABLE_INT() entry can be preset through
 * the kernel environment under the quoted name.
 */
SYSCTL_DECL(_kern_geom);
SYSCTL_NODE(_kern_geom, OID_AUTO, raid3, CTLFLAG_RW, 0, "GEOM_RAID3 stuff");
/* Debug verbosity; not static, so it is visible outside of this file. */
u_int g_raid3_debug = 0;
TUNABLE_INT("kern.geom.raid3.debug", &g_raid3_debug);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, debug, CTLFLAG_RW, &g_raid3_debug, 0,
    "Debug level");
static u_int g_raid3_timeout = 4;
TUNABLE_INT("kern.geom.raid3.timeout", &g_raid3_timeout);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, timeout, CTLFLAG_RW, &g_raid3_timeout,
    0, "Time to wait on all raid3 components");
static u_int g_raid3_idletime = 5;
TUNABLE_INT("kern.geom.raid3.idletime", &g_raid3_idletime);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, idletime, CTLFLAG_RW,
    &g_raid3_idletime, 0, "Mark components as clean when idling");
/* The two synchronization knobs below have no TUNABLE_INT() entry. */
static u_int g_raid3_reqs_per_sync = 5;
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, reqs_per_sync, CTLFLAG_RW,
    &g_raid3_reqs_per_sync, 0,
    "Number of regular I/O requests per synchronization request");
static u_int g_raid3_syncs_per_sec = 1000;
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, syncs_per_sec, CTLFLAG_RW,
    &g_raid3_syncs_per_sec, 0,
    "Number of synchronizations requests per second");

/*
 * Limits for the 64kB/16kB/4kB buffer allocations.
 * Exported read-only (CTLFLAG_RD) through sysctl, each with a tunable.
 */
static u_int g_raid3_n64k = 50;
TUNABLE_INT("kern.geom.raid3.n64k", &g_raid3_n64k);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n64k, CTLFLAG_RD, &g_raid3_n64k, 0,
    "Maximum number of 64kB allocations");
static u_int g_raid3_n16k = 200;
TUNABLE_INT("kern.geom.raid3.n16k", &g_raid3_n16k);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n16k, CTLFLAG_RD, &g_raid3_n16k, 0,
    "Maximum number of 16kB allocations");
static u_int g_raid3_n4k = 1200;
TUNABLE_INT("kern.geom.raid3.n4k", &g_raid3_n4k);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n4k, CTLFLAG_RD, &g_raid3_n4k, 0,
    "Maximum number of 4kB allocations");

/*
 * kern.geom.raid3.stat.* read-only counters.
 * The *_requested and *_failed counters are incremented in
 * g_raid3_clone_bio().
 */
SYSCTL_NODE(_kern_geom_raid3, OID_AUTO, stat, CTLFLAG_RW, 0,
    "GEOM_RAID3 statistics");
static u_int g_raid3_parity_mismatch = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, parity_mismatch, CTLFLAG_RD,
    &g_raid3_parity_mismatch, 0, "Number of failures in VERIFY mode");
static u_int g_raid3_64k_requested = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 64k_requested, CTLFLAG_RD,
    &g_raid3_64k_requested, 0, "Number of requested 64kB allocations");
static u_int g_raid3_64k_failed = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 64k_failed, CTLFLAG_RD,
    &g_raid3_64k_failed, 0, "Number of failed 64kB allocations");
static u_int g_raid3_16k_requested = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 16k_requested, CTLFLAG_RD,
    &g_raid3_16k_requested, 0, "Number of requested 16kB allocations");
static u_int g_raid3_16k_failed = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 16k_failed, CTLFLAG_RD,
    &g_raid3_16k_failed, 0, "Number of failed 16kB allocations");
static u_int g_raid3_4k_requested = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 4k_requested, CTLFLAG_RD,
    &g_raid3_4k_requested, 0, "Number of requested 4kB allocations");
static u_int g_raid3_4k_failed = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 4k_failed, CTLFLAG_RD,
    &g_raid3_4k_failed, 0, "Number of failed 4kB allocations");
110 
/*
 * msleep() wrapper that logs, at debug level 4, the sleep channel before
 * going to sleep and again after waking up.  The arguments are passed to
 * msleep() unchanged; its return value is discarded.
 * Note that 'ident' is evaluated three times.
 */
#define	MSLEEP(ident, mtx, priority, wmesg, timeout)	do {		\
	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));	\
	msleep((ident), (mtx), (priority), (wmesg), (timeout));		\
	G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, (ident));	\
} while (0)
116 
/* Event handler registration tag; NULL until something is registered. */
static eventhandler_tag g_raid3_ehtag = NULL;

/* Class methods that are defined later in this file. */
static int g_raid3_destroy_geom(struct gctl_req *req, struct g_class *mp,
    struct g_geom *gp);
static g_taste_t g_raid3_taste;
static void g_raid3_init(struct g_class *mp);
static void g_raid3_fini(struct g_class *mp);

/*
 * GEOM class descriptor for RAID3.
 * g_raid3_config is not declared in this file (presumably it comes from
 * g_raid3.h -- confirm).
 */
struct g_class g_raid3_class = {
	.name = G_RAID3_CLASS_NAME,
	.version = G_VERSION,
	.ctlreq = g_raid3_config,
	.taste = g_raid3_taste,
	.destroy_geom = g_raid3_destroy_geom,
	.init = g_raid3_init,
	.fini = g_raid3_fini
};


/* Forward declarations of helpers used before their definitions. */
static void g_raid3_destroy_provider(struct g_raid3_softc *sc);
static int g_raid3_update_disk(struct g_raid3_disk *disk, u_int state);
static void g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force);
static void g_raid3_dumpconf(struct sbuf *sb, const char *indent,
    struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
static void g_raid3_sync_stop(struct g_raid3_softc *sc, int type);
142 
143 
144 static const char *
145 g_raid3_disk_state2str(int state)
146 {
147 
148 	switch (state) {
149 	case G_RAID3_DISK_STATE_NODISK:
150 		return ("NODISK");
151 	case G_RAID3_DISK_STATE_NONE:
152 		return ("NONE");
153 	case G_RAID3_DISK_STATE_NEW:
154 		return ("NEW");
155 	case G_RAID3_DISK_STATE_ACTIVE:
156 		return ("ACTIVE");
157 	case G_RAID3_DISK_STATE_STALE:
158 		return ("STALE");
159 	case G_RAID3_DISK_STATE_SYNCHRONIZING:
160 		return ("SYNCHRONIZING");
161 	case G_RAID3_DISK_STATE_DISCONNECTED:
162 		return ("DISCONNECTED");
163 	default:
164 		return ("INVALID");
165 	}
166 }
167 
168 static const char *
169 g_raid3_device_state2str(int state)
170 {
171 
172 	switch (state) {
173 	case G_RAID3_DEVICE_STATE_STARTING:
174 		return ("STARTING");
175 	case G_RAID3_DEVICE_STATE_DEGRADED:
176 		return ("DEGRADED");
177 	case G_RAID3_DEVICE_STATE_COMPLETE:
178 		return ("COMPLETE");
179 	default:
180 		return ("INVALID");
181 	}
182 }
183 
184 const char *
185 g_raid3_get_diskname(struct g_raid3_disk *disk)
186 {
187 
188 	if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
189 		return ("[unknown]");
190 	return (disk->d_name);
191 }
192 
193 #define	g_raid3_xor(src1, src2, dst, size)				\
194 	_g_raid3_xor((uint64_t *)(src1), (uint64_t *)(src2),		\
195 	    (uint64_t *)(dst), (size_t)size)
196 static void
197 _g_raid3_xor(uint64_t *src1, uint64_t *src2, uint64_t *dst, size_t size)
198 {
199 
200 	KASSERT((size % 128) == 0, ("Invalid size: %zu.", size));
201 	for (; size > 0; size -= 128) {
202 		*dst++ = (*src1++) ^ (*src2++);
203 		*dst++ = (*src1++) ^ (*src2++);
204 		*dst++ = (*src1++) ^ (*src2++);
205 		*dst++ = (*src1++) ^ (*src2++);
206 		*dst++ = (*src1++) ^ (*src2++);
207 		*dst++ = (*src1++) ^ (*src2++);
208 		*dst++ = (*src1++) ^ (*src2++);
209 		*dst++ = (*src1++) ^ (*src2++);
210 		*dst++ = (*src1++) ^ (*src2++);
211 		*dst++ = (*src1++) ^ (*src2++);
212 		*dst++ = (*src1++) ^ (*src2++);
213 		*dst++ = (*src1++) ^ (*src2++);
214 		*dst++ = (*src1++) ^ (*src2++);
215 		*dst++ = (*src1++) ^ (*src2++);
216 		*dst++ = (*src1++) ^ (*src2++);
217 		*dst++ = (*src1++) ^ (*src2++);
218 	}
219 }
220 
221 static int
222 g_raid3_is_zero(struct bio *bp)
223 {
224 	static const uint64_t zeros[] = {
225 	    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
226 	};
227 	u_char *addr;
228 	ssize_t size;
229 
230 	size = bp->bio_length;
231 	addr = (u_char *)bp->bio_data;
232 	for (; size > 0; size -= sizeof(zeros), addr += sizeof(zeros)) {
233 		if (bcmp(addr, zeros, sizeof(zeros)) != 0)
234 			return (0);
235 	}
236 	return (1);
237 }
238 
/*
 * --- Events handling functions ---
 * Events in geom_raid3 are used to maintain disks and device status
 * from one thread to simplify locking.
 */

/*
 * Release an event structure allocated by g_raid3_event_send().
 */
static void
g_raid3_event_free(struct g_raid3_event *ep)
{

	free(ep, M_RAID3);
}
250 
/*
 * Allocate an event, append it to the device's event queue and wake up
 * the threads sleeping on 'sc' and on '&sc->sc_queue'.
 *
 * arg:   the softc when G_RAID3_EVENT_DEVICE is set in 'flags',
 *        otherwise the disk the event is about.
 * state: requested state, stored in the event.
 * flags: G_RAID3_EVENT_* flags, stored in the event.
 *
 * With G_RAID3_EVENT_DONTWAIT the function returns 0 right after queueing
 * and the event is freed by whoever takes it off the queue (see
 * g_raid3_event_cancel() and g_raid3_destroy_device()).
 * Otherwise the topology lock must be held; it is dropped while waiting
 * for G_RAID3_EVENT_DONE to be set, then re-acquired, and the event's
 * e_error is returned after the event has been freed here.
 */
int
g_raid3_event_send(void *arg, int state, int flags)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct g_raid3_event *ep;
	int error;

	ep = malloc(sizeof(*ep), M_RAID3, M_WAITOK);
	G_RAID3_DEBUG(4, "%s: Sending event %p.", __func__, ep);
	if ((flags & G_RAID3_EVENT_DEVICE) != 0) {
		disk = NULL;
		sc = arg;
	} else {
		disk = arg;
		sc = disk->d_softc;
	}
	ep->e_disk = disk;
	ep->e_state = state;
	ep->e_flags = flags;
	ep->e_error = 0;
	mtx_lock(&sc->sc_events_mtx);
	TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
	mtx_unlock(&sc->sc_events_mtx);
	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
	mtx_lock(&sc->sc_queue_mtx);
	wakeup(sc);
	wakeup(&sc->sc_queue);
	mtx_unlock(&sc->sc_queue_mtx);
	if ((flags & G_RAID3_EVENT_DONTWAIT) != 0)
		return (0);
	g_topology_assert();
	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, ep);
	g_topology_unlock();
	/*
	 * The DONE flag is tested without the mutex and the sleep uses
	 * PDROP, so the mutex is not taken again after a wakeup.  The
	 * sleep is bounded by a 5 second timeout, after which the flag
	 * is tested again.
	 */
	while ((ep->e_flags & G_RAID3_EVENT_DONE) == 0) {
		mtx_lock(&sc->sc_events_mtx);
		MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "r3:event",
		    hz * 5);
	}
	/* Don't even try to use 'sc' here, because it could be already dead. */
	g_topology_lock();
	error = ep->e_error;
	g_raid3_event_free(ep);
	return (error);
}
296 
297 static struct g_raid3_event *
298 g_raid3_event_get(struct g_raid3_softc *sc)
299 {
300 	struct g_raid3_event *ep;
301 
302 	mtx_lock(&sc->sc_events_mtx);
303 	ep = TAILQ_FIRST(&sc->sc_events);
304 	mtx_unlock(&sc->sc_events_mtx);
305 	return (ep);
306 }
307 
/*
 * Unlink the given event from the device's event queue under the queue
 * mutex.  The event itself is not freed and nobody is woken up.
 */
static void
g_raid3_event_remove(struct g_raid3_softc *sc, struct g_raid3_event *ep)
{

	mtx_lock(&sc->sc_events_mtx);
	TAILQ_REMOVE(&sc->sc_events, ep, e_next);
	mtx_unlock(&sc->sc_events_mtx);
}
316 
317 static void
318 g_raid3_event_cancel(struct g_raid3_disk *disk)
319 {
320 	struct g_raid3_softc *sc;
321 	struct g_raid3_event *ep, *tmpep;
322 
323 	g_topology_assert();
324 
325 	sc = disk->d_softc;
326 	mtx_lock(&sc->sc_events_mtx);
327 	TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
328 		if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0)
329 			continue;
330 		if (ep->e_disk != disk)
331 			continue;
332 		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
333 		if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
334 			g_raid3_event_free(ep);
335 		else {
336 			ep->e_error = ECANCELED;
337 			wakeup(ep);
338 		}
339 	}
340 	mtx_unlock(&sc->sc_events_mtx);
341 }
342 
343 /*
344  * Return the number of disks in the given state.
345  * If state is equal to -1, count all connected disks.
346  */
347 u_int
348 g_raid3_ndisks(struct g_raid3_softc *sc, int state)
349 {
350 	struct g_raid3_disk *disk;
351 	u_int n, ndisks;
352 
353 	for (n = ndisks = 0; n < sc->sc_ndisks; n++) {
354 		disk = &sc->sc_disks[n];
355 		if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
356 			continue;
357 		if (state == -1 || disk->d_state == state)
358 			ndisks++;
359 	}
360 	return (ndisks);
361 }
362 
363 static u_int
364 g_raid3_nrequests(struct g_raid3_softc *sc, struct g_consumer *cp)
365 {
366 	struct bio *bp;
367 	u_int nreqs = 0;
368 
369 	mtx_lock(&sc->sc_queue_mtx);
370 	TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
371 		if (bp->bio_from == cp)
372 			nreqs++;
373 	}
374 	mtx_unlock(&sc->sc_queue_mtx);
375 	return (nreqs);
376 }
377 
378 static int
379 g_raid3_is_busy(struct g_raid3_softc *sc, struct g_consumer *cp)
380 {
381 
382 	if (cp->index > 0) {
383 		G_RAID3_DEBUG(2,
384 		    "I/O requests for %s exist, can't destroy it now.",
385 		    cp->provider->name);
386 		return (1);
387 	}
388 	if (g_raid3_nrequests(sc, cp) > 0) {
389 		G_RAID3_DEBUG(2,
390 		    "I/O requests for %s in queue, can't destroy it now.",
391 		    cp->provider->name);
392 		return (1);
393 	}
394 	return (0);
395 }
396 
397 static void
398 g_raid3_destroy_consumer(void *arg, int flags __unused)
399 {
400 	struct g_consumer *cp;
401 
402 	cp = arg;
403 	G_RAID3_DEBUG(1, "Consumer %s destroyed.", cp->provider->name);
404 	g_detach(cp);
405 	g_destroy_consumer(cp);
406 }
407 
408 static void
409 g_raid3_kill_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
410 {
411 	struct g_provider *pp;
412 	int retaste_wait;
413 
414 	g_topology_assert();
415 
416 	cp->private = NULL;
417 	if (g_raid3_is_busy(sc, cp))
418 		return;
419 	G_RAID3_DEBUG(2, "Consumer %s destroyed.", cp->provider->name);
420 	pp = cp->provider;
421 	retaste_wait = 0;
422 	if (cp->acw == 1) {
423 		if ((pp->geom->flags & G_GEOM_WITHER) == 0)
424 			retaste_wait = 1;
425 	}
426 	G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr,
427 	    -cp->acw, -cp->ace, 0);
428 	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
429 		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
430 	if (retaste_wait) {
431 		/*
432 		 * After retaste event was send (inside g_access()), we can send
433 		 * event to detach and destroy consumer.
434 		 * A class, which has consumer to the given provider connected
435 		 * will not receive retaste event for the provider.
436 		 * This is the way how I ignore retaste events when I close
437 		 * consumers opened for write: I detach and destroy consumer
438 		 * after retaste event is sent.
439 		 */
440 		g_post_event(g_raid3_destroy_consumer, cp, M_WAITOK, NULL);
441 		return;
442 	}
443 	G_RAID3_DEBUG(1, "Consumer %s destroyed.", pp->name);
444 	g_detach(cp);
445 	g_destroy_consumer(cp);
446 }
447 
448 static int
449 g_raid3_connect_disk(struct g_raid3_disk *disk, struct g_provider *pp)
450 {
451 	struct g_consumer *cp;
452 	int error;
453 
454 	g_topology_assert();
455 	KASSERT(disk->d_consumer == NULL,
456 	    ("Disk already connected (device %s).", disk->d_softc->sc_name));
457 
458 	cp = g_new_consumer(disk->d_softc->sc_geom);
459 	error = g_attach(cp, pp);
460 	if (error != 0) {
461 		g_destroy_consumer(cp);
462 		return (error);
463 	}
464 	error = g_access(cp, 1, 1, 1);
465 	if (error != 0) {
466 		g_detach(cp);
467 		g_destroy_consumer(cp);
468 		G_RAID3_DEBUG(0, "Cannot open consumer %s (error=%d).",
469 		    pp->name, error);
470 		return (error);
471 	}
472 	disk->d_consumer = cp;
473 	disk->d_consumer->private = disk;
474 	disk->d_consumer->index = 0;
475 	G_RAID3_DEBUG(2, "Disk %s connected.", g_raid3_get_diskname(disk));
476 	return (0);
477 }
478 
479 static void
480 g_raid3_disconnect_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
481 {
482 
483 	g_topology_assert();
484 
485 	if (cp == NULL)
486 		return;
487 	if (cp->provider != NULL)
488 		g_raid3_kill_consumer(sc, cp);
489 	else
490 		g_destroy_consumer(cp);
491 }
492 
493 /*
494  * Initialize disk. This means allocate memory, create consumer, attach it
495  * to the provider and open access (r1w1e1) to it.
496  */
497 static struct g_raid3_disk *
498 g_raid3_init_disk(struct g_raid3_softc *sc, struct g_provider *pp,
499     struct g_raid3_metadata *md, int *errorp)
500 {
501 	struct g_raid3_disk *disk;
502 	int error;
503 
504 	disk = &sc->sc_disks[md->md_no];
505 	error = g_raid3_connect_disk(disk, pp);
506 	if (error != 0) {
507 		if (errorp != NULL)
508 			*errorp = error;
509 		return (NULL);
510 	}
511 	disk->d_state = G_RAID3_DISK_STATE_NONE;
512 	disk->d_flags = md->md_dflags;
513 	if (md->md_provider[0] != '\0')
514 		disk->d_flags |= G_RAID3_DISK_FLAG_HARDCODED;
515 	disk->d_sync.ds_consumer = NULL;
516 	disk->d_sync.ds_offset = md->md_sync_offset;
517 	disk->d_sync.ds_offset_done = md->md_sync_offset;
518 	disk->d_sync.ds_resync = -1;
519 	disk->d_genid = md->md_genid;
520 	disk->d_sync.ds_syncid = md->md_syncid;
521 	if (errorp != NULL)
522 		*errorp = 0;
523 	return (disk);
524 }
525 
526 static void
527 g_raid3_destroy_disk(struct g_raid3_disk *disk)
528 {
529 	struct g_raid3_softc *sc;
530 
531 	g_topology_assert();
532 
533 	if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
534 		return;
535 	g_raid3_event_cancel(disk);
536 	sc = disk->d_softc;
537 	switch (disk->d_state) {
538 	case G_RAID3_DISK_STATE_SYNCHRONIZING:
539 		if (sc->sc_syncdisk != NULL)
540 			g_raid3_sync_stop(sc, 1);
541 		/* FALLTHROUGH */
542 	case G_RAID3_DISK_STATE_NEW:
543 	case G_RAID3_DISK_STATE_STALE:
544 	case G_RAID3_DISK_STATE_ACTIVE:
545 		g_raid3_disconnect_consumer(sc, disk->d_consumer);
546 		disk->d_consumer = NULL;
547 		break;
548 	default:
549 		KASSERT(0 == 1, ("Wrong disk state (%s, %s).",
550 		    g_raid3_get_diskname(disk),
551 		    g_raid3_disk_state2str(disk->d_state)));
552 	}
553 	disk->d_state = G_RAID3_DISK_STATE_NODISK;
554 }
555 
/*
 * Tear down the whole device.  The topology lock must be held.
 *
 * In order: the provider is destroyed (if any), every present disk gets
 * its metadata written with the DIRTY flag cleared and is destroyed, all
 * queued events are completed or freed, the callout is drained, the
 * synchronization geom and the device geom are withered, and the UMA
 * zones and mutexes of the softc are destroyed.
 */
static void
g_raid3_destroy_device(struct g_raid3_softc *sc)
{
	struct g_raid3_event *ep;
	struct g_raid3_disk *disk;
	struct g_geom *gp;
	struct g_consumer *cp;
	u_int n;

	g_topology_assert();

	gp = sc->sc_geom;
	if (sc->sc_provider != NULL)
		g_raid3_destroy_provider(sc);
	for (n = 0; n < sc->sc_ndisks; n++) {
		disk = &sc->sc_disks[n];
		if (disk->d_state != G_RAID3_DISK_STATE_NODISK) {
			/* Store the disk as clean before letting it go. */
			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
			g_raid3_update_metadata(disk);
			g_raid3_destroy_disk(disk);
		}
	}
	/*
	 * Drain the event queue.  Events without a waiter (DONTWAIT) are
	 * freed here; the others are completed with ECANCELED and marked
	 * DONE so that the thread sleeping in g_raid3_event_send() returns
	 * and frees the event itself.
	 */
	while ((ep = g_raid3_event_get(sc)) != NULL) {
		g_raid3_event_remove(sc, ep);
		if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
			g_raid3_event_free(ep);
		else {
			ep->e_error = ECANCELED;
			ep->e_flags |= G_RAID3_EVENT_DONE;
			G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, ep);
			mtx_lock(&sc->sc_events_mtx);
			wakeup(ep);
			mtx_unlock(&sc->sc_events_mtx);
		}
	}
	callout_drain(&sc->sc_callout);
	gp->softc = NULL;
	/* Only the first consumer of the synchronization geom is handled. */
	cp = LIST_FIRST(&sc->sc_sync.ds_geom->consumer);
	if (cp != NULL)
		g_raid3_disconnect_consumer(sc, cp);
	sc->sc_sync.ds_geom->softc = NULL;
	g_wither_geom(sc->sc_sync.ds_geom, ENXIO);
	uma_zdestroy(sc->sc_zone_64k);
	uma_zdestroy(sc->sc_zone_16k);
	uma_zdestroy(sc->sc_zone_4k);
	/* The mutexes are gone after this point. */
	mtx_destroy(&sc->sc_queue_mtx);
	mtx_destroy(&sc->sc_events_mtx);
	G_RAID3_DEBUG(0, "Device %s destroyed.", gp->name);
	g_wither_geom(gp, ENXIO);
}
606 
607 static void
608 g_raid3_orphan(struct g_consumer *cp)
609 {
610 	struct g_raid3_disk *disk;
611 
612 	g_topology_assert();
613 
614 	disk = cp->private;
615 	if (disk == NULL)
616 		return;
617 	disk->d_softc->sc_bump_id = G_RAID3_BUMP_SYNCID;
618 	g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
619 	    G_RAID3_EVENT_DONTWAIT);
620 }
621 
622 static int
623 g_raid3_write_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
624 {
625 	struct g_raid3_softc *sc;
626 	struct g_consumer *cp;
627 	off_t offset, length;
628 	u_char *sector;
629 	int error = 0;
630 
631 	g_topology_assert();
632 
633 	sc = disk->d_softc;
634 	cp = disk->d_consumer;
635 	KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name));
636 	KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name));
637 	KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1,
638 	    ("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr,
639 	    cp->acw, cp->ace));
640 	length = cp->provider->sectorsize;
641 	offset = cp->provider->mediasize - length;
642 	sector = malloc((size_t)length, M_RAID3, M_WAITOK | M_ZERO);
643 	if (md != NULL)
644 		raid3_metadata_encode(md, sector);
645 	g_topology_unlock();
646 	error = g_write_data(cp, offset, sector, length);
647 	g_topology_lock();
648 	free(sector, M_RAID3);
649 	if (error != 0) {
650 		disk->d_softc->sc_bump_id = G_RAID3_BUMP_GENID;
651 		g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
652 		    G_RAID3_EVENT_DONTWAIT);
653 	}
654 	return (error);
655 }
656 
657 int
658 g_raid3_clear_metadata(struct g_raid3_disk *disk)
659 {
660 	int error;
661 
662 	g_topology_assert();
663 	error = g_raid3_write_metadata(disk, NULL);
664 	if (error == 0) {
665 		G_RAID3_DEBUG(2, "Metadata on %s cleared.",
666 		    g_raid3_get_diskname(disk));
667 	} else {
668 		G_RAID3_DEBUG(0,
669 		    "Cannot clear metadata on disk %s (error=%d).",
670 		    g_raid3_get_diskname(disk), error);
671 	}
672 	return (error);
673 }
674 
675 void
676 g_raid3_fill_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
677 {
678 	struct g_raid3_softc *sc;
679 	struct g_provider *pp;
680 
681 	sc = disk->d_softc;
682 	strlcpy(md->md_magic, G_RAID3_MAGIC, sizeof(md->md_magic));
683 	md->md_version = G_RAID3_VERSION;
684 	strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name));
685 	md->md_id = sc->sc_id;
686 	md->md_all = sc->sc_ndisks;
687 	md->md_genid = sc->sc_genid;
688 	md->md_mediasize = sc->sc_mediasize;
689 	md->md_sectorsize = sc->sc_sectorsize;
690 	md->md_mflags = (sc->sc_flags & G_RAID3_DEVICE_FLAG_MASK);
691 	md->md_no = disk->d_no;
692 	md->md_syncid = disk->d_sync.ds_syncid;
693 	md->md_dflags = (disk->d_flags & G_RAID3_DISK_FLAG_MASK);
694 	if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING)
695 		md->md_sync_offset = disk->d_sync.ds_offset_done;
696 	else
697 		md->md_sync_offset = 0;
698 	if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL)
699 		pp = disk->d_consumer->provider;
700 	else
701 		pp = NULL;
702 	if ((disk->d_flags & G_RAID3_DISK_FLAG_HARDCODED) != 0 && pp != NULL)
703 		strlcpy(md->md_provider, pp->name, sizeof(md->md_provider));
704 	else
705 		bzero(md->md_provider, sizeof(md->md_provider));
706 	if (pp != NULL)
707 		md->md_provsize = pp->mediasize;
708 	else
709 		md->md_provsize = 0;
710 }
711 
712 void
713 g_raid3_update_metadata(struct g_raid3_disk *disk)
714 {
715 	struct g_raid3_metadata md;
716 	int error;
717 
718 	g_topology_assert();
719 	g_raid3_fill_metadata(disk, &md);
720 	error = g_raid3_write_metadata(disk, &md);
721 	if (error == 0) {
722 		G_RAID3_DEBUG(2, "Metadata on %s updated.",
723 		    g_raid3_get_diskname(disk));
724 	} else {
725 		G_RAID3_DEBUG(0,
726 		    "Cannot update metadata on disk %s (error=%d).",
727 		    g_raid3_get_diskname(disk), error);
728 	}
729 }
730 
731 static void
732 g_raid3_bump_syncid(struct g_raid3_softc *sc)
733 {
734 	struct g_raid3_disk *disk;
735 	u_int n;
736 
737 	g_topology_assert();
738 	KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
739 	    ("%s called with no active disks (device=%s).", __func__,
740 	    sc->sc_name));
741 
742 	sc->sc_syncid++;
743 	G_RAID3_DEBUG(1, "Device %s: syncid bumped to %u.", sc->sc_name,
744 	    sc->sc_syncid);
745 	for (n = 0; n < sc->sc_ndisks; n++) {
746 		disk = &sc->sc_disks[n];
747 		if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
748 		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
749 			disk->d_sync.ds_syncid = sc->sc_syncid;
750 			g_raid3_update_metadata(disk);
751 		}
752 	}
753 }
754 
755 static void
756 g_raid3_bump_genid(struct g_raid3_softc *sc)
757 {
758 	struct g_raid3_disk *disk;
759 	u_int n;
760 
761 	g_topology_assert();
762 	KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
763 	    ("%s called with no active disks (device=%s).", __func__,
764 	    sc->sc_name));
765 
766 	sc->sc_genid++;
767 	G_RAID3_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name,
768 	    sc->sc_genid);
769 	for (n = 0; n < sc->sc_ndisks; n++) {
770 		disk = &sc->sc_disks[n];
771 		if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
772 		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
773 			disk->d_genid = sc->sc_genid;
774 			g_raid3_update_metadata(disk);
775 		}
776 	}
777 }
778 
779 static void
780 g_raid3_idle(struct g_raid3_softc *sc)
781 {
782 	struct g_raid3_disk *disk;
783 	u_int i;
784 
785 	if (sc->sc_provider == NULL || sc->sc_provider->acw == 0)
786 		return;
787 	sc->sc_idle = 1;
788 	g_topology_lock();
789 	for (i = 0; i < sc->sc_ndisks; i++) {
790 		disk = &sc->sc_disks[i];
791 		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
792 			continue;
793 		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
794 		    g_raid3_get_diskname(disk), sc->sc_name);
795 		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
796 		g_raid3_update_metadata(disk);
797 	}
798 	g_topology_unlock();
799 }
800 
801 static void
802 g_raid3_unidle(struct g_raid3_softc *sc)
803 {
804 	struct g_raid3_disk *disk;
805 	u_int i;
806 
807 	sc->sc_idle = 0;
808 	g_topology_lock();
809 	for (i = 0; i < sc->sc_ndisks; i++) {
810 		disk = &sc->sc_disks[i];
811 		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
812 			continue;
813 		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
814 		    g_raid3_get_diskname(disk), sc->sc_name);
815 		disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
816 		g_raid3_update_metadata(disk);
817 	}
818 	g_topology_unlock();
819 }
820 
821 /*
822  * Return 1 if we should check if RAID3 device is idling.
823  */
824 static int
825 g_raid3_check_idle(struct g_raid3_softc *sc)
826 {
827 	struct g_raid3_disk *disk;
828 	u_int i;
829 
830 	if (sc->sc_idle)
831 		return (0);
832 	if (sc->sc_provider != NULL && sc->sc_provider->acw == 0)
833 		return (0);
834 	/*
835 	 * Check if there are no in-flight requests.
836 	 */
837 	for (i = 0; i < sc->sc_ndisks; i++) {
838 		disk = &sc->sc_disks[i];
839 		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
840 			continue;
841 		if (disk->d_consumer->index > 0)
842 			return (0);
843 	}
844 	return (1);
845 }
846 
/*
 * Treat bio_driver1 field in parent bio as list head and field bio_caller1
 * in child bio as pointer to the next element on the list.
 * The result is a singly-linked, NULL-terminated list of child bios.
 */
/* Head of the child list kept in the parent bio; usable as an lvalue. */
#define	G_RAID3_HEAD_BIO(pbp)	(pbp)->bio_driver1

/* Link to the next child on the list; usable as an lvalue. */
#define	G_RAID3_NEXT_BIO(cbp)	(cbp)->bio_caller1

/* Walk all children of 'pbp'; 'bp' must not be unlinked inside the loop. */
#define	G_RAID3_FOREACH_BIO(pbp, bp)					\
	for ((bp) = G_RAID3_HEAD_BIO(pbp); (bp) != NULL;		\
	    (bp) = G_RAID3_NEXT_BIO(bp))

/*
 * Same as above, but the next element is saved in 'tmpbp' before the loop
 * body runs, so 'bp' may be unlinked or destroyed inside the loop.
 */
#define	G_RAID3_FOREACH_SAFE_BIO(pbp, bp, tmpbp)			\
	for ((bp) = G_RAID3_HEAD_BIO(pbp);				\
	    (bp) != NULL && ((tmpbp) = G_RAID3_NEXT_BIO(bp), 1);	\
	    (bp) = (tmpbp))
863 
/*
 * Start with an empty list of children in the given parent bio.
 */
static void
g_raid3_init_bio(struct bio *pbp)
{

	G_RAID3_HEAD_BIO(pbp) = NULL;
}
870 
871 static void
872 g_raid3_remove_bio(struct bio *cbp)
873 {
874 	struct bio *pbp, *bp;
875 
876 	pbp = cbp->bio_parent;
877 	if (G_RAID3_HEAD_BIO(pbp) == cbp)
878 		G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
879 	else {
880 		G_RAID3_FOREACH_BIO(pbp, bp) {
881 			if (G_RAID3_NEXT_BIO(bp) == cbp) {
882 				G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
883 				break;
884 			}
885 		}
886 	}
887 	G_RAID3_NEXT_BIO(cbp) = NULL;
888 }
889 
890 static void
891 g_raid3_replace_bio(struct bio *sbp, struct bio *dbp)
892 {
893 	struct bio *pbp, *bp;
894 
895 	g_raid3_remove_bio(sbp);
896 	pbp = dbp->bio_parent;
897 	G_RAID3_NEXT_BIO(sbp) = G_RAID3_NEXT_BIO(dbp);
898 	if (G_RAID3_HEAD_BIO(pbp) == dbp)
899 		G_RAID3_HEAD_BIO(pbp) = sbp;
900 	else {
901 		G_RAID3_FOREACH_BIO(pbp, bp) {
902 			if (G_RAID3_NEXT_BIO(bp) == dbp) {
903 				G_RAID3_NEXT_BIO(bp) = sbp;
904 				break;
905 			}
906 		}
907 	}
908 	G_RAID3_NEXT_BIO(dbp) = NULL;
909 }
910 
911 static void
912 g_raid3_destroy_bio(struct g_raid3_softc *sc, struct bio *cbp)
913 {
914 	struct bio *bp, *pbp;
915 	size_t size;
916 
917 	pbp = cbp->bio_parent;
918 	pbp->bio_children--;
919 	KASSERT(cbp->bio_data != NULL, ("NULL bio_data"));
920 	size = pbp->bio_length / (sc->sc_ndisks - 1);
921 	if (size > 16384)
922 		uma_zfree(sc->sc_zone_64k, cbp->bio_data);
923 	else if (size > 4096)
924 		uma_zfree(sc->sc_zone_16k, cbp->bio_data);
925 	else
926 		uma_zfree(sc->sc_zone_4k, cbp->bio_data);
927 	if (G_RAID3_HEAD_BIO(pbp) == cbp) {
928 		G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
929 		G_RAID3_NEXT_BIO(cbp) = NULL;
930 		g_destroy_bio(cbp);
931 	} else {
932 		G_RAID3_FOREACH_BIO(pbp, bp) {
933 			if (G_RAID3_NEXT_BIO(bp) == cbp)
934 				break;
935 		}
936 		if (bp != NULL) {
937 			KASSERT(G_RAID3_NEXT_BIO(bp) != NULL,
938 			    ("NULL bp->bio_driver1"));
939 			G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
940 			G_RAID3_NEXT_BIO(cbp) = NULL;
941 		}
942 		g_destroy_bio(cbp);
943 	}
944 }
945 
946 static struct bio *
947 g_raid3_clone_bio(struct g_raid3_softc *sc, struct bio *pbp)
948 {
949 	struct bio *bp, *cbp;
950 	size_t size;
951 
952 	cbp = g_clone_bio(pbp);
953 	if (cbp == NULL)
954 		return (NULL);
955 	size = pbp->bio_length / (sc->sc_ndisks - 1);
956 	if (size > 16384) {
957 		cbp->bio_data = uma_zalloc(sc->sc_zone_64k, M_NOWAIT);
958 		g_raid3_64k_requested++;
959 	} else if (size > 4096) {
960 		cbp->bio_data = uma_zalloc(sc->sc_zone_16k, M_NOWAIT);
961 		g_raid3_16k_requested++;
962 	} else {
963 		cbp->bio_data = uma_zalloc(sc->sc_zone_4k, M_NOWAIT);
964 		g_raid3_4k_requested++;
965 	}
966 	if (cbp->bio_data == NULL) {
967 		if (size > 16384)
968 			g_raid3_64k_failed++;
969 		else if (size > 4096)
970 			g_raid3_16k_failed++;
971 		else
972 			g_raid3_4k_failed++;
973 		pbp->bio_children--;
974 		g_destroy_bio(cbp);
975 		return (NULL);
976 	}
977 	G_RAID3_NEXT_BIO(cbp) = NULL;
978 	if (G_RAID3_HEAD_BIO(pbp) == NULL)
979 		G_RAID3_HEAD_BIO(pbp) = cbp;
980 	else {
981 		G_RAID3_FOREACH_BIO(pbp, bp) {
982 			if (G_RAID3_NEXT_BIO(bp) == NULL) {
983 				G_RAID3_NEXT_BIO(bp) = cbp;
984 				break;
985 			}
986 		}
987 	}
988 	return (cbp);
989 }
990 
991 static void
992 g_raid3_scatter(struct bio *pbp)
993 {
994 	struct g_raid3_softc *sc;
995 	struct g_raid3_disk *disk;
996 	struct bio *bp, *cbp;
997 	off_t atom, cadd, padd, left;
998 
999 	sc = pbp->bio_to->geom->softc;
1000 	bp = NULL;
1001 	if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
1002 		/*
1003 		 * Find bio for which we should calculate data.
1004 		 */
1005 		G_RAID3_FOREACH_BIO(pbp, cbp) {
1006 			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
1007 				bp = cbp;
1008 				break;
1009 			}
1010 		}
1011 		KASSERT(bp != NULL, ("NULL parity bio."));
1012 	}
1013 	atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
1014 	cadd = padd = 0;
1015 	for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
1016 		G_RAID3_FOREACH_BIO(pbp, cbp) {
1017 			if (cbp == bp)
1018 				continue;
1019 			bcopy(pbp->bio_data + padd, cbp->bio_data + cadd, atom);
1020 			padd += atom;
1021 		}
1022 		cadd += atom;
1023 	}
1024 	if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
1025 		struct bio *tmpbp;
1026 
1027 		/*
1028 		 * Calculate parity.
1029 		 */
1030 		bzero(bp->bio_data, bp->bio_length);
1031 		G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
1032 			if (cbp == bp)
1033 				continue;
1034 			g_raid3_xor(cbp->bio_data, bp->bio_data, bp->bio_data,
1035 			    bp->bio_length);
1036 			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_NODISK) != 0)
1037 				g_raid3_destroy_bio(sc, cbp);
1038 		}
1039 	}
1040 	G_RAID3_FOREACH_BIO(pbp, cbp) {
1041 		struct g_consumer *cp;
1042 
1043 		disk = cbp->bio_caller2;
1044 		cp = disk->d_consumer;
1045 		cbp->bio_to = cp->provider;
1046 		G_RAID3_LOGREQ(3, cbp, "Sending request.");
1047 		KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1,
1048 		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
1049 		    cp->acr, cp->acw, cp->ace));
1050 		cp->index++;
1051 		g_io_request(cbp, cp);
1052 	}
1053 }
1054 
1055 static void
1056 g_raid3_gather(struct bio *pbp)
1057 {
1058 	struct g_raid3_softc *sc;
1059 	struct g_raid3_disk *disk;
1060 	struct bio *xbp, *fbp, *cbp;
1061 	off_t atom, cadd, padd, left;
1062 
1063 	sc = pbp->bio_to->geom->softc;
1064 	/*
1065 	 * Find bio for which we have to calculate data.
1066 	 * While going through this path, check if all requests
1067 	 * succeeded, if not, deny whole request.
1068 	 * If we're in COMPLETE mode, we allow one request to fail,
1069 	 * so if we find one, we're sending it to the parity consumer.
1070 	 * If there are more failed requests, we deny whole request.
1071 	 */
1072 	xbp = fbp = NULL;
1073 	G_RAID3_FOREACH_BIO(pbp, cbp) {
1074 		if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
1075 			KASSERT(xbp == NULL, ("More than one parity bio."));
1076 			xbp = cbp;
1077 		}
1078 		if (cbp->bio_error == 0)
1079 			continue;
1080 		/*
1081 		 * Found failed request.
1082 		 */
1083 		G_RAID3_LOGREQ(0, cbp, "Request failed.");
1084 		disk = cbp->bio_caller2;
1085 		if (disk != NULL) {
1086 			/*
1087 			 * Actually this is pointless to bump genid,
1088 			 * because whole device is fucked up.
1089 			 */
1090 			sc->sc_bump_id |= G_RAID3_BUMP_GENID;
1091 			g_raid3_event_send(disk,
1092 			    G_RAID3_DISK_STATE_DISCONNECTED,
1093 			    G_RAID3_EVENT_DONTWAIT);
1094 		}
1095 		if (fbp == NULL) {
1096 			if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_DEGRADED) != 0) {
1097 				/*
1098 				 * We are already in degraded mode, so we can't
1099 				 * accept any failures.
1100 				 */
1101 				if (pbp->bio_error == 0)
1102 					pbp->bio_error = fbp->bio_error;
1103 			} else {
1104 				fbp = cbp;
1105 			}
1106 		} else {
1107 			/*
1108 			 * Next failed request, that's too many.
1109 			 */
1110 			if (pbp->bio_error == 0)
1111 				pbp->bio_error = fbp->bio_error;
1112 		}
1113 	}
1114 	if (pbp->bio_error != 0)
1115 		goto finish;
1116 	if (fbp != NULL && (pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
1117 		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_VERIFY;
1118 		if (xbp != fbp)
1119 			g_raid3_replace_bio(xbp, fbp);
1120 		g_raid3_destroy_bio(sc, fbp);
1121 	} else if (fbp != NULL) {
1122 		struct g_consumer *cp;
1123 
1124 		/*
1125 		 * One request failed, so send the same request to
1126 		 * the parity consumer.
1127 		 */
1128 		disk = pbp->bio_driver2;
1129 		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
1130 			pbp->bio_error = fbp->bio_error;
1131 			goto finish;
1132 		}
1133 		pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
1134 		pbp->bio_inbed--;
1135 		fbp->bio_flags &= ~(BIO_DONE | BIO_ERROR);
1136 		if (disk->d_no == sc->sc_ndisks - 1)
1137 			fbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
1138 		fbp->bio_error = 0;
1139 		fbp->bio_completed = 0;
1140 		fbp->bio_children = 0;
1141 		fbp->bio_inbed = 0;
1142 		cp = disk->d_consumer;
1143 		fbp->bio_caller2 = disk;
1144 		fbp->bio_to = cp->provider;
1145 		G_RAID3_LOGREQ(3, fbp, "Sending request (recover).");
1146 		KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1,
1147 		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
1148 		    cp->acr, cp->acw, cp->ace));
1149 		cp->index++;
1150 		g_io_request(fbp, cp);
1151 		return;
1152 	}
1153 	if (xbp != NULL) {
1154 		/*
1155 		 * Calculate parity.
1156 		 */
1157 		G_RAID3_FOREACH_BIO(pbp, cbp) {
1158 			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0)
1159 				continue;
1160 			g_raid3_xor(cbp->bio_data, xbp->bio_data, xbp->bio_data,
1161 			    xbp->bio_length);
1162 		}
1163 		xbp->bio_cflags &= ~G_RAID3_BIO_CFLAG_PARITY;
1164 		if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
1165 			if (!g_raid3_is_zero(xbp)) {
1166 				g_raid3_parity_mismatch++;
1167 				pbp->bio_error = EIO;
1168 				goto finish;
1169 			}
1170 			g_raid3_destroy_bio(sc, xbp);
1171 		}
1172 	}
1173 	atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
1174 	cadd = padd = 0;
1175 	for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
1176 		G_RAID3_FOREACH_BIO(pbp, cbp) {
1177 			bcopy(cbp->bio_data + cadd, pbp->bio_data + padd, atom);
1178 			pbp->bio_completed += atom;
1179 			padd += atom;
1180 		}
1181 		cadd += atom;
1182 	}
1183 finish:
1184 	if (pbp->bio_error == 0)
1185 		G_RAID3_LOGREQ(3, pbp, "Request finished.");
1186 	else {
1187 		if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0)
1188 			G_RAID3_LOGREQ(1, pbp, "Verification error.");
1189 		else
1190 			G_RAID3_LOGREQ(0, pbp, "Request failed.");
1191 	}
1192 	pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_MASK;
1193 	g_io_deliver(pbp, pbp->bio_error);
1194 	while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
1195 		g_raid3_destroy_bio(sc, cbp);
1196 }
1197 
1198 static void
1199 g_raid3_done(struct bio *bp)
1200 {
1201 	struct g_raid3_softc *sc;
1202 
1203 	sc = bp->bio_from->geom->softc;
1204 	bp->bio_cflags |= G_RAID3_BIO_CFLAG_REGULAR;
1205 	G_RAID3_LOGREQ(3, bp, "Regular request done (error=%d).", bp->bio_error);
1206 	mtx_lock(&sc->sc_queue_mtx);
1207 	bioq_insert_head(&sc->sc_queue, bp);
1208 	wakeup(sc);
1209 	wakeup(&sc->sc_queue);
1210 	mtx_unlock(&sc->sc_queue_mtx);
1211 }
1212 
1213 static void
1214 g_raid3_regular_request(struct bio *cbp)
1215 {
1216 	struct g_raid3_softc *sc;
1217 	struct g_raid3_disk *disk;
1218 	struct bio *pbp;
1219 
1220 	g_topology_assert_not();
1221 
1222 	cbp->bio_from->index--;
1223 	pbp = cbp->bio_parent;
1224 	sc = pbp->bio_to->geom->softc;
1225 	disk = cbp->bio_from->private;
1226 	if (disk == NULL) {
1227 		g_topology_lock();
1228 		g_raid3_kill_consumer(sc, cbp->bio_from);
1229 		g_topology_unlock();
1230 	}
1231 
1232 	G_RAID3_LOGREQ(3, cbp, "Request finished.");
1233 	pbp->bio_inbed++;
1234 	KASSERT(pbp->bio_inbed <= pbp->bio_children,
1235 	    ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed,
1236 	    pbp->bio_children));
1237 	if (pbp->bio_inbed != pbp->bio_children)
1238 		return;
1239 	switch (pbp->bio_cmd) {
1240 	case BIO_READ:
1241 		g_raid3_gather(pbp);
1242 		break;
1243 	case BIO_WRITE:
1244 	case BIO_DELETE:
1245 	    {
1246 		int error = 0;
1247 
1248 		pbp->bio_completed = pbp->bio_length;
1249 		while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) {
1250 			if (cbp->bio_error != 0) {
1251 				disk = cbp->bio_caller2;
1252 				if (disk != NULL) {
1253 					sc->sc_bump_id |= G_RAID3_BUMP_GENID;
1254 					g_raid3_event_send(disk,
1255 					    G_RAID3_DISK_STATE_DISCONNECTED,
1256 					    G_RAID3_EVENT_DONTWAIT);
1257 				}
1258 				if (error == 0)
1259 					error = cbp->bio_error;
1260 				else if (pbp->bio_error == 0) {
1261 					/*
1262 					 * Next failed request, that's too many.
1263 					 */
1264 					pbp->bio_error = error;
1265 				}
1266 			}
1267 			g_raid3_destroy_bio(sc, cbp);
1268 		}
1269 		if (pbp->bio_error == 0)
1270 			G_RAID3_LOGREQ(3, pbp, "Request finished.");
1271 		else
1272 			G_RAID3_LOGREQ(0, pbp, "Request failed.");
1273 		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_DEGRADED;
1274 		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_NOPARITY;
1275 		g_io_deliver(pbp, pbp->bio_error);
1276 		break;
1277 	    }
1278 	}
1279 }
1280 
1281 static void
1282 g_raid3_sync_done(struct bio *bp)
1283 {
1284 	struct g_raid3_softc *sc;
1285 
1286 	G_RAID3_LOGREQ(3, bp, "Synchronization request delivered.");
1287 	sc = bp->bio_from->geom->softc;
1288 	bp->bio_cflags |= G_RAID3_BIO_CFLAG_SYNC;
1289 	mtx_lock(&sc->sc_queue_mtx);
1290 	bioq_insert_head(&sc->sc_queue, bp);
1291 	wakeup(sc);
1292 	wakeup(&sc->sc_queue);
1293 	mtx_unlock(&sc->sc_queue_mtx);
1294 }
1295 
1296 static void
1297 g_raid3_start(struct bio *bp)
1298 {
1299 	struct g_raid3_softc *sc;
1300 
1301 	sc = bp->bio_to->geom->softc;
1302 	/*
1303 	 * If sc == NULL or there are no valid disks, provider's error
1304 	 * should be set and g_raid3_start() should not be called at all.
1305 	 */
1306 	KASSERT(sc != NULL && (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
1307 	    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE),
1308 	    ("Provider's error should be set (error=%d)(device=%s).",
1309 	    bp->bio_to->error, bp->bio_to->name));
1310 	G_RAID3_LOGREQ(3, bp, "Request received.");
1311 
1312 	switch (bp->bio_cmd) {
1313 	case BIO_READ:
1314 	case BIO_WRITE:
1315 	case BIO_DELETE:
1316 		break;
1317 	case BIO_GETATTR:
1318 	default:
1319 		g_io_deliver(bp, EOPNOTSUPP);
1320 		return;
1321 	}
1322 	mtx_lock(&sc->sc_queue_mtx);
1323 	bioq_insert_tail(&sc->sc_queue, bp);
1324 	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
1325 	wakeup(sc);
1326 	mtx_unlock(&sc->sc_queue_mtx);
1327 }
1328 
1329 /*
1330  * Send one synchronization request.
1331  */
1332 static void
1333 g_raid3_sync_one(struct g_raid3_softc *sc)
1334 {
1335 	struct g_raid3_disk *disk;
1336 	struct bio *bp;
1337 
1338 	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
1339 	    ("Wrong device state (%s, %s).", sc->sc_name,
1340 	    g_raid3_device_state2str(sc->sc_state)));
1341 	disk = sc->sc_syncdisk;
1342 	KASSERT(disk != NULL, ("No sync disk (%s).", sc->sc_name));
1343 	KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
1344 	    ("Disk %s is not marked for synchronization.",
1345 	    g_raid3_get_diskname(disk)));
1346 
1347 	bp = g_new_bio();
1348 	if (bp == NULL)
1349 		return;
1350 	bp->bio_parent = NULL;
1351 	bp->bio_cmd = BIO_READ;
1352 	bp->bio_offset = disk->d_sync.ds_offset * (sc->sc_ndisks - 1);
1353 	bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset);
1354 	bp->bio_cflags = 0;
1355 	bp->bio_done = g_raid3_sync_done;
1356 	bp->bio_data = disk->d_sync.ds_data;
1357 	if (bp->bio_data == NULL) {
1358 		g_destroy_bio(bp);
1359 		return;
1360 	}
1361 	bp->bio_cflags = G_RAID3_BIO_CFLAG_REGSYNC;
1362 	disk->d_sync.ds_offset += bp->bio_length / (sc->sc_ndisks - 1);
1363 	bp->bio_to = sc->sc_provider;
1364 	G_RAID3_LOGREQ(3, bp, "Sending synchronization request.");
1365 	disk->d_sync.ds_consumer->index++;
1366 	g_io_request(bp, disk->d_sync.ds_consumer);
1367 }
1368 
/*
 * Handle a completed synchronization request.
 *
 * A finished READ holds data read through the raid3 provider; it is
 * reduced in place to the content belonging to the disk being
 * synchronized and the same bio is re-issued as a WRITE to that disk.
 * A finished WRITE records the progress and, when the end of the
 * component is reached, asks for the disk to be activated.
 * On any error the bio is destroyed; a failed write also requests the
 * disk to be disconnected.
 */
static void
g_raid3_sync_request(struct bio *bp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;

	bp->bio_from->index--;
	sc = bp->bio_from->geom->softc;
	disk = bp->bio_from->private;
	if (disk == NULL) {
		/* No disk behind this consumer: get rid of both. */
		g_topology_lock();
		g_raid3_kill_consumer(sc, bp->bio_from);
		g_topology_unlock();
		g_destroy_bio(bp);
		return;
	}

	/*
	 * Synchronization request.
	 */
	switch (bp->bio_cmd) {
	case BIO_READ:
	    {
		struct g_consumer *cp;
		u_char *dst, *src;
		off_t left;
		u_int atom;

		if (bp->bio_error != 0) {
			G_RAID3_LOGREQ(0, bp,
			    "Synchronization request failed (error=%d).",
			    bp->bio_error);
			g_destroy_bio(bp);
			return;
		}
		G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
		atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
		/*
		 * The buffer is compacted in place: dst never runs ahead
		 * of src, and each sector read yields one atom to write.
		 */
		dst = src = bp->bio_data;
		if (disk->d_no == sc->sc_ndisks - 1) {
			u_int n;

			/* Parity component. */
			/* Each output atom is the XOR of a sector's atoms. */
			for (left = bp->bio_length; left > 0;
			    left -= sc->sc_sectorsize) {
				bcopy(src, dst, atom);
				src += atom;
				for (n = 1; n < sc->sc_ndisks - 1; n++) {
					g_raid3_xor(src, dst, dst, atom);
					src += atom;
				}
				dst += atom;
			}
		} else {
			/* Regular component. */
			/* Pick this disk's atom out of every sector. */
			src += atom * disk->d_no;
			for (left = bp->bio_length; left > 0;
			    left -= sc->sc_sectorsize) {
				bcopy(src, dst, atom);
				src += sc->sc_sectorsize;
				dst += atom;
			}
		}
		/* Convert from provider space to component space. */
		bp->bio_offset /= sc->sc_ndisks - 1;
		bp->bio_length /= sc->sc_ndisks - 1;
		/* Reuse the same bio for the write to the component. */
		bp->bio_cmd = BIO_WRITE;
		bp->bio_cflags = 0;
		bp->bio_children = bp->bio_inbed = 0;
		cp = disk->d_consumer;
		KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1,
		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
		    cp->acr, cp->acw, cp->ace));
		cp->index++;
		g_io_request(bp, cp);
		return;
	    }
	case BIO_WRITE:
	    {
		struct g_raid3_disk_sync *sync;

		if (bp->bio_error != 0) {
			G_RAID3_LOGREQ(0, bp,
			    "Synchronization request failed (error=%d).",
			    bp->bio_error);
			g_destroy_bio(bp);
			sc->sc_bump_id |= G_RAID3_BUMP_GENID;
			g_raid3_event_send(disk,
			    G_RAID3_DISK_STATE_DISCONNECTED,
			    G_RAID3_EVENT_DONTWAIT);
			return;
		}
		G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
		sync = &disk->d_sync;
		sync->ds_offset_done = bp->bio_offset + bp->bio_length;
		g_destroy_bio(bp);
		/* A resync point is pending; skip the completion checks. */
		if (sync->ds_resync != -1)
			return;
		if (sync->ds_offset_done ==
		    sc->sc_mediasize / (sc->sc_ndisks - 1)) {
			/*
			 * Disk up-to-date, activate it.
			 */
			g_raid3_event_send(disk, G_RAID3_DISK_STATE_ACTIVE,
			    G_RAID3_EVENT_DONTWAIT);
			return;
		} else if (sync->ds_offset_done % (MAXPHYS * 100) == 0) {
			/*
			 * Update offset_done on every 100 blocks.
			 * XXX: This should be configurable.
			 */
			g_topology_lock();
			g_raid3_update_metadata(disk);
			g_topology_unlock();
		}
		return;
	    }
	default:
		KASSERT(1 == 0, ("Invalid command here: %u (device=%s)",
		    bp->bio_cmd, sc->sc_name));
		break;
	}
}
1490 
/*
 * Turn a request taken from the device queue into component requests.
 *
 * One child bio is cloned per component involved; each child covers
 * 1/(ndisks - 1) of the parent's offset and length.  Reads are sent out
 * directly from here, writes and deletes go through g_raid3_scatter().
 *
 * Returns 0 when the request has been dispatched or already delivered
 * with an error.  Returns ENOMEM when a child could not be allocated and
 * the parent is not flagged REGULAR; nothing has been delivered in that
 * case and all children created so far have been destroyed.
 */
static int
g_raid3_register_request(struct bio *pbp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct g_consumer *cp;
	struct bio *cbp;
	off_t offset, length;
	u_int n, ndisks;
	int round_robin, verify;

	ndisks = 0;
	sc = pbp->bio_to->geom->softc;
	/* A synchronization read is pointless without a disk to sync. */
	if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGSYNC) != 0 &&
	    sc->sc_syncdisk == NULL) {
		g_io_deliver(pbp, EIO);
		return (0);
	}
	g_raid3_init_bio(pbp);
	/* Per-component share of the request. */
	length = pbp->bio_length / (sc->sc_ndisks - 1);
	offset = pbp->bio_offset / (sc->sc_ndisks - 1);
	round_robin = verify = 0;
	switch (pbp->bio_cmd) {
	case BIO_READ:
		/*
		 * VERIFY reads all components including parity;
		 * otherwise only as many as there are data components.
		 */
		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
			pbp->bio_pflags |= G_RAID3_BIO_PFLAG_VERIFY;
			verify = 1;
			ndisks = sc->sc_ndisks;
		} else {
			verify = 0;
			ndisks = sc->sc_ndisks - 1;
		}
		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0 &&
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
			round_robin = 1;
		} else {
			round_robin = 0;
		}
		KASSERT(!round_robin || !verify,
		    ("ROUND-ROBIN and VERIFY are mutually exclusive."));
		/*
		 * bio_driver2 names the disk a failed child is re-sent to
		 * by g_raid3_gather(); by default the parity disk.
		 */
		pbp->bio_driver2 = &sc->sc_disks[sc->sc_ndisks - 1];
		break;
	case BIO_WRITE:
	case BIO_DELETE:
	    {
		struct g_raid3_disk_sync *sync;

		if (sc->sc_idle)
			g_raid3_unidle(sc);

		ndisks = sc->sc_ndisks;

		/*
		 * If a disk is being synchronized and this write overlaps
		 * the range between ds_offset_done and ds_offset, record a
		 * resync point (rounded down to MAXPHYS) unless an earlier
		 * resync point already lies at or before this offset.
		 */
		if (sc->sc_syncdisk == NULL)
			break;
		sync = &sc->sc_syncdisk->d_sync;
		if (offset >= sync->ds_offset)
			break;
		if (offset + length <= sync->ds_offset_done)
			break;
		if (offset >= sync->ds_resync && sync->ds_resync != -1)
			break;
		sync->ds_resync = offset - (offset % MAXPHYS);
		break;
	    }
	}
	for (n = 0; n < ndisks; n++) {
		disk = &sc->sc_disks[n];
		cbp = g_raid3_clone_bio(sc, pbp);
		if (cbp == NULL) {
			/* Undo all children created so far. */
			while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
				g_raid3_destroy_bio(sc, cbp);
			/*
			 * To prevent deadlock, we must run back up
			 * with the ENOMEM for failed requests of any
			 * of our consumers.  Our own sync requests
			 * can stick around, as they are finite.
			 */
			if ((pbp->bio_cflags &
			    G_RAID3_BIO_CFLAG_REGULAR) != 0) {
				g_io_deliver(pbp, ENOMEM);
				return (0);
			}
			return (ENOMEM);
		}
		cbp->bio_offset = offset;
		cbp->bio_length = length;
		cbp->bio_done = g_raid3_done;
		switch (pbp->bio_cmd) {
		case BIO_READ:
			if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
				/*
				 * Replace invalid component with the parity
				 * component.
				 */
				disk = &sc->sc_disks[sc->sc_ndisks - 1];
				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
				pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
			} else if (round_robin &&
			    disk->d_no == sc->sc_round_robin) {
				/*
				 * In round-robin mode skip one data component
				 * and use parity component when reading.
				 * The skipped disk becomes the fallback for
				 * a failed child.
				 */
				pbp->bio_driver2 = disk;
				disk = &sc->sc_disks[sc->sc_ndisks - 1];
				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
				sc->sc_round_robin++;
				round_robin = 0;
			} else if (verify && disk->d_no == sc->sc_ndisks - 1) {
				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
			}
			break;
		case BIO_WRITE:
		case BIO_DELETE:
			if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
			    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
				if (n == ndisks - 1) {
					/*
					 * Active parity component, mark it as such.
					 */
					cbp->bio_cflags |=
					    G_RAID3_BIO_CFLAG_PARITY;
				}
			} else {
				pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
				if (n == ndisks - 1) {
					/*
					 * Parity component is not connected,
					 * so destroy its request.
					 */
					pbp->bio_pflags |=
					    G_RAID3_BIO_PFLAG_NOPARITY;
					g_raid3_destroy_bio(sc, cbp);
					cbp = NULL;
				} else {
					/*
					 * Data component is missing: keep the
					 * child so it can still contribute to
					 * the parity in g_raid3_scatter().
					 */
					cbp->bio_cflags |=
					    G_RAID3_BIO_CFLAG_NODISK;
					disk = NULL;
				}
			}
			break;
		}
		/* Remember which disk this child is destined for. */
		if (cbp != NULL)
			cbp->bio_caller2 = disk;
	}
	switch (pbp->bio_cmd) {
	case BIO_READ:
		if (round_robin) {
			/*
			 * If we are in round-robin mode and 'round_robin' is
			 * still 1, it means, that we skipped parity component
			 * for this read and must reset sc_round_robin field.
			 */
			sc->sc_round_robin = 0;
		}
		G_RAID3_FOREACH_BIO(pbp, cbp) {
			disk = cbp->bio_caller2;
			cp = disk->d_consumer;
			cbp->bio_to = cp->provider;
			G_RAID3_LOGREQ(3, cbp, "Sending request.");
			KASSERT(cp->acr == 1 && cp->acw == 1 && cp->ace == 1,
			    ("Consumer %s not opened (r%dw%de%d).",
			    cp->provider->name, cp->acr, cp->acw, cp->ace));
			cp->index++;
			g_io_request(cbp, cp);
		}
		break;
	case BIO_WRITE:
	case BIO_DELETE:
		/*
		 * Bump syncid on first write.
		 */
		if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0) {
			sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
			g_topology_lock();
			g_raid3_bump_syncid(sc);
			g_topology_unlock();
		}
		g_raid3_scatter(pbp);
		break;
	}
	return (0);
}
1675 
1676 static int
1677 g_raid3_can_destroy(struct g_raid3_softc *sc)
1678 {
1679 	struct g_geom *gp;
1680 	struct g_consumer *cp;
1681 
1682 	g_topology_assert();
1683 	gp = sc->sc_geom;
1684 	LIST_FOREACH(cp, &gp->consumer, consumer) {
1685 		if (g_raid3_is_busy(sc, cp))
1686 			return (0);
1687 	}
1688 	gp = sc->sc_sync.ds_geom;
1689 	LIST_FOREACH(cp, &gp->consumer, consumer) {
1690 		if (g_raid3_is_busy(sc, cp))
1691 			return (0);
1692 	}
1693 	G_RAID3_DEBUG(2, "No I/O requests for %s, it can be destroyed.",
1694 	    sc->sc_name);
1695 	return (1);
1696 }
1697 
1698 static int
1699 g_raid3_try_destroy(struct g_raid3_softc *sc)
1700 {
1701 
1702 	if (sc->sc_rootmount != NULL) {
1703 		G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
1704 		    sc->sc_rootmount);
1705 		root_mount_rel(sc->sc_rootmount);
1706 		sc->sc_rootmount = NULL;
1707 	}
1708 
1709 	g_topology_lock();
1710 	if (!g_raid3_can_destroy(sc)) {
1711 		g_topology_unlock();
1712 		return (0);
1713 	}
1714 	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_WAIT) != 0) {
1715 		g_topology_unlock();
1716 		G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
1717 		    &sc->sc_worker);
1718 		wakeup(&sc->sc_worker);
1719 		sc->sc_worker = NULL;
1720 	} else {
1721 		g_raid3_destroy_device(sc);
1722 		g_topology_unlock();
1723 		free(sc->sc_disks, M_RAID3);
1724 		free(sc, M_RAID3);
1725 	}
1726 	return (1);
1727 }
1728 
/*
 * Worker thread.
 *
 * Runs forever for one device ('arg' is its softc) and, on each pass:
 *  1. processes one pending event if the topology lock can be taken;
 *  2. otherwise looks at the bio queue and either issues a
 *     synchronization read, sleeps when there is nothing to do, or
 *     dispatches the first queued bio according to its bio_cflags.
 * The thread exits once the DESTROY flag is set and
 * g_raid3_try_destroy() succeeds.
 */
static void
g_raid3_worker(void *arg)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct g_raid3_disk_sync *sync;
	struct g_raid3_event *ep;
	struct bio *bp;
	u_int nreqs;

	sc = arg;
	mtx_lock_spin(&sched_lock);
	sched_prio(curthread, PRIBIO);
	mtx_unlock_spin(&sched_lock);

	/* Requests handled since the last synchronization slot. */
	nreqs = 0;
	for (;;) {
		G_RAID3_DEBUG(5, "%s: Let's see...", __func__);
		/*
		 * First take a look at events.
		 * This is important to handle events before any I/O requests.
		 */
		ep = g_raid3_event_get(sc);
		if (ep != NULL && g_topology_try_lock()) {
			g_raid3_event_remove(sc, ep);
			if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) {
				/* Update only device status. */
				G_RAID3_DEBUG(3,
				    "Running event for device %s.",
				    sc->sc_name);
				ep->e_error = 0;
				g_raid3_update_device(sc, 1);
			} else {
				/* Update disk status. */
				G_RAID3_DEBUG(3, "Running event for disk %s.",
				     g_raid3_get_diskname(ep->e_disk));
				ep->e_error = g_raid3_update_disk(ep->e_disk,
				    ep->e_state);
				if (ep->e_error == 0)
					g_raid3_update_device(sc, 0);
			}
			g_topology_unlock();
			if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) {
				/* Nobody waits for the result; free it here. */
				KASSERT(ep->e_error == 0,
				    ("Error cannot be handled."));
				g_raid3_event_free(ep);
			} else {
				/* Hand the result over to the sleeping sender. */
				ep->e_flags |= G_RAID3_EVENT_DONE;
				G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
				    ep);
				mtx_lock(&sc->sc_events_mtx);
				wakeup(ep);
				mtx_unlock(&sc->sc_events_mtx);
			}
			if ((sc->sc_flags &
			    G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
				if (g_raid3_try_destroy(sc))
					kthread_exit(0);
			}
			G_RAID3_DEBUG(5, "%s: I'm here 1.", __func__);
			continue;
		}
		/*
		 * Now I/O requests.
		 * From here on a non-NULL 'ep' means an event is pending
		 * but the topology lock could not be acquired.
		 */
		/* Get first request from the queue. */
		mtx_lock(&sc->sc_queue_mtx);
		bp = bioq_first(&sc->sc_queue);
		if (bp == NULL) {
			if (ep != NULL) {
				/*
				 * No I/O requests and topology lock was
				 * already held? Try again.
				 */
				mtx_unlock(&sc->sc_queue_mtx);
				tsleep(ep, PRIBIO, "r3:top1", hz / 5);
				continue;
			}
			if ((sc->sc_flags &
			    G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
				/* try_destroy takes the topology lock itself. */
				mtx_unlock(&sc->sc_queue_mtx);
				if (g_raid3_try_destroy(sc))
					kthread_exit(0);
				mtx_lock(&sc->sc_queue_mtx);
			}
		}
		/*
		 * Synchronization gets a slot when the queue is empty or
		 * after more than g_raid3_reqs_per_sync handled requests.
		 */
		if (sc->sc_syncdisk != NULL &&
		    (bp == NULL || nreqs > g_raid3_reqs_per_sync)) {
			mtx_unlock(&sc->sc_queue_mtx);
			/*
			 * It is time for synchronization...
			 */
			nreqs = 0;
			disk = sc->sc_syncdisk;
			sync = &disk->d_sync;
			/*
			 * Only issue a new read when the previous one has
			 * been fully written (offset == offset_done) and
			 * the end of the component is not yet reached.
			 */
			if (sync->ds_offset <
			    sc->sc_mediasize / (sc->sc_ndisks - 1) &&
			    sync->ds_offset == sync->ds_offset_done) {
				if (sync->ds_resync != -1) {
					/* Restart from the recorded resync point. */
					sync->ds_offset = sync->ds_resync;
					sync->ds_offset_done = sync->ds_resync;
					sync->ds_resync = -1;
				}
				g_raid3_sync_one(sc);
			}
			G_RAID3_DEBUG(5, "%s: I'm here 2.", __func__);
			/* Jumps into the SYNC branch below to rate-limit. */
			goto sleep;
		}
		if (bp == NULL) {
			if (g_raid3_check_idle(sc)) {
				u_int idletime;

				idletime = g_raid3_idletime;
				if (idletime == 0)
					idletime = 1;
				idletime *= hz;
				/* PDROP: the queue mutex is not re-taken. */
				if (msleep(sc, &sc->sc_queue_mtx, PRIBIO | PDROP,
				    "r3:w1", idletime) == EWOULDBLOCK) {
					G_RAID3_DEBUG(5, "%s: I'm here 3.",
					    __func__);
					/*
					 * No I/O requests in 'idletime'
					 * seconds, so mark components as clean.
					 */
					g_raid3_idle(sc);
				}
				G_RAID3_DEBUG(5, "%s: I'm here 4.", __func__);
			} else {
				MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP,
				    "r3:w2", 0);
				G_RAID3_DEBUG(5, "%s: I'm here 5.", __func__);
			}
			continue;
		}
		nreqs++;
		bioq_remove(&sc->sc_queue, bp);
		mtx_unlock(&sc->sc_queue_mtx);

		/*
		 * Dispatch: REGULAR and SYNC are set by the completion
		 * callbacks; a bio with neither flag is a new request.
		 */
		if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0) {
			g_raid3_regular_request(bp);
		} else if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0) {
			u_int timeout, sps;

			g_raid3_sync_request(bp);
sleep:
			/*
			 * Throttle synchronization to g_raid3_syncs_per_sec;
			 * zero disables the delay.
			 */
			sps = atomic_load_acq_int(&g_raid3_syncs_per_sec);
			if (sps == 0) {
				G_RAID3_DEBUG(5, "%s: I'm here 6.", __func__);
				continue;
			}
			if (ep != NULL) {
				/*
				 * We have some pending events, don't sleep now.
				 */
				G_RAID3_DEBUG(5, "%s: I'm here 7.", __func__);
				tsleep(ep, PRIBIO, "r3:top2", hz / 5);
				continue;
			}
			mtx_lock(&sc->sc_queue_mtx);
			if (bioq_first(&sc->sc_queue) != NULL) {
				/* Work is waiting; skip the delay. */
				mtx_unlock(&sc->sc_queue_mtx);
				G_RAID3_DEBUG(5, "%s: I'm here 8.", __func__);
				continue;
			}
			timeout = hz / sps;
			if (timeout == 0)
				timeout = 1;
			MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:w2",
			    timeout);
		} else {
			if (g_raid3_register_request(bp) != 0) {
				/*
				 * Could not register the request; put it
				 * back on the queue and retry after a short
				 * sleep on &sc->sc_queue.
				 */
				mtx_lock(&sc->sc_queue_mtx);
				bioq_insert_tail(&sc->sc_queue, bp);
				MSLEEP(&sc->sc_queue, &sc->sc_queue_mtx,
				    PRIBIO | PDROP, "r3:lowmem", hz / 10);
			}
		}
		G_RAID3_DEBUG(5, "%s: I'm here 9.", __func__);
	}
}
1912 
1913 /*
1914  * Open disk's consumer if needed.
1915  */
1916 static void
1917 g_raid3_update_access(struct g_raid3_disk *disk)
1918 {
1919 	struct g_provider *pp;
1920 
1921 	g_topology_assert();
1922 
1923 	pp = disk->d_softc->sc_provider;
1924 	if (pp == NULL)
1925 		return;
1926 	if (pp->acw > 0) {
1927 		if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) == 0) {
1928 			G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
1929 			    g_raid3_get_diskname(disk), disk->d_softc->sc_name);
1930 			disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
1931 		}
1932 	} else if (pp->acw == 0) {
1933 		if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0) {
1934 			G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
1935 			    g_raid3_get_diskname(disk), disk->d_softc->sc_name);
1936 			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
1937 		}
1938 	}
1939 }
1940 
1941 static void
1942 g_raid3_sync_start(struct g_raid3_softc *sc)
1943 {
1944 	struct g_raid3_disk *disk;
1945 	int error;
1946 	u_int n;
1947 
1948 	g_topology_assert();
1949 
1950 	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
1951 	    ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
1952 	    sc->sc_state));
1953 	KASSERT(sc->sc_syncdisk == NULL, ("Syncdisk is not NULL (%s, %u).",
1954 	    sc->sc_name, sc->sc_state));
1955 	disk = NULL;
1956 	for (n = 0; n < sc->sc_ndisks; n++) {
1957 		if (sc->sc_disks[n].d_state != G_RAID3_DISK_STATE_SYNCHRONIZING)
1958 			continue;
1959 		disk = &sc->sc_disks[n];
1960 		break;
1961 	}
1962 	if (disk == NULL)
1963 		return;
1964 
1965 	G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name,
1966 	    g_raid3_get_diskname(disk));
1967 	disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
1968 	KASSERT(disk->d_sync.ds_consumer == NULL,
1969 	    ("Sync consumer already exists (device=%s, disk=%s).",
1970 	    sc->sc_name, g_raid3_get_diskname(disk)));
1971 	disk->d_sync.ds_consumer = g_new_consumer(sc->sc_sync.ds_geom);
1972 	disk->d_sync.ds_consumer->private = disk;
1973 	disk->d_sync.ds_consumer->index = 0;
1974 	error = g_attach(disk->d_sync.ds_consumer, disk->d_softc->sc_provider);
1975 	KASSERT(error == 0, ("Cannot attach to %s (error=%d).",
1976 	    disk->d_softc->sc_name, error));
1977 	error = g_access(disk->d_sync.ds_consumer, 1, 0, 0);
1978 	KASSERT(error == 0, ("Cannot open %s (error=%d).",
1979 	    disk->d_softc->sc_name, error));
1980 	disk->d_sync.ds_data = malloc(MAXPHYS, M_RAID3, M_WAITOK);
1981 	sc->sc_syncdisk = disk;
1982 }
1983 
1984 /*
1985  * Stop synchronization process.
1986  * type: 0 - synchronization finished
1987  *       1 - synchronization stopped
1988  */
1989 static void
1990 g_raid3_sync_stop(struct g_raid3_softc *sc, int type)
1991 {
1992 	struct g_raid3_disk *disk;
1993 
1994 	g_topology_assert();
1995 	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
1996 	    ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
1997 	    sc->sc_state));
1998 	disk = sc->sc_syncdisk;
1999 	sc->sc_syncdisk = NULL;
2000 	KASSERT(disk != NULL, ("No disk was synchronized (%s).", sc->sc_name));
2001 	KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
2002 	    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2003 	    g_raid3_disk_state2str(disk->d_state)));
2004 	if (disk->d_sync.ds_consumer == NULL)
2005 		return;
2006 
2007 	if (type == 0) {
2008 		G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s finished.",
2009 		    disk->d_softc->sc_name, g_raid3_get_diskname(disk));
2010 	} else /* if (type == 1) */ {
2011 		G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s stopped.",
2012 		    disk->d_softc->sc_name, g_raid3_get_diskname(disk));
2013 	}
2014 	g_raid3_kill_consumer(disk->d_softc, disk->d_sync.ds_consumer);
2015 	free(disk->d_sync.ds_data, M_RAID3);
2016 	disk->d_sync.ds_consumer = NULL;
2017 	disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2018 }
2019 
2020 static void
2021 g_raid3_launch_provider(struct g_raid3_softc *sc)
2022 {
2023 	struct g_provider *pp;
2024 
2025 	g_topology_assert();
2026 
2027 	pp = g_new_providerf(sc->sc_geom, "raid3/%s", sc->sc_name);
2028 	pp->mediasize = sc->sc_mediasize;
2029 	pp->sectorsize = sc->sc_sectorsize;
2030 	sc->sc_provider = pp;
2031 	g_error_provider(pp, 0);
2032 	G_RAID3_DEBUG(0, "Device %s: provider %s launched.", sc->sc_name,
2033 	    pp->name);
2034 	if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED)
2035 		g_raid3_sync_start(sc);
2036 }
2037 
2038 static void
2039 g_raid3_destroy_provider(struct g_raid3_softc *sc)
2040 {
2041 	struct bio *bp;
2042 
2043 	g_topology_assert();
2044 	KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).",
2045 	    sc->sc_name));
2046 
2047 	g_error_provider(sc->sc_provider, ENXIO);
2048 	mtx_lock(&sc->sc_queue_mtx);
2049 	while ((bp = bioq_first(&sc->sc_queue)) != NULL) {
2050 		bioq_remove(&sc->sc_queue, bp);
2051 		g_io_deliver(bp, ENXIO);
2052 	}
2053 	mtx_unlock(&sc->sc_queue_mtx);
2054 	G_RAID3_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name,
2055 	    sc->sc_provider->name);
2056 	sc->sc_provider->flags |= G_PF_WITHER;
2057 	g_orphan_provider(sc->sc_provider, ENXIO);
2058 	sc->sc_provider = NULL;
2059 	if (sc->sc_syncdisk != NULL)
2060 		g_raid3_sync_stop(sc, 1);
2061 }
2062 
2063 static void
2064 g_raid3_go(void *arg)
2065 {
2066 	struct g_raid3_softc *sc;
2067 
2068 	sc = arg;
2069 	G_RAID3_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name);
2070 	g_raid3_event_send(sc, 0,
2071 	    G_RAID3_EVENT_DONTWAIT | G_RAID3_EVENT_DEVICE);
2072 }
2073 
2074 static u_int
2075 g_raid3_determine_state(struct g_raid3_disk *disk)
2076 {
2077 	struct g_raid3_softc *sc;
2078 	u_int state;
2079 
2080 	sc = disk->d_softc;
2081 	if (sc->sc_syncid == disk->d_sync.ds_syncid) {
2082 		if ((disk->d_flags &
2083 		    G_RAID3_DISK_FLAG_SYNCHRONIZING) == 0) {
2084 			/* Disk does not need synchronization. */
2085 			state = G_RAID3_DISK_STATE_ACTIVE;
2086 		} else {
2087 			if ((sc->sc_flags &
2088 			     G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0  ||
2089 			    (disk->d_flags &
2090 			     G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
2091 				/*
2092 				 * We can start synchronization from
2093 				 * the stored offset.
2094 				 */
2095 				state = G_RAID3_DISK_STATE_SYNCHRONIZING;
2096 			} else {
2097 				state = G_RAID3_DISK_STATE_STALE;
2098 			}
2099 		}
2100 	} else if (disk->d_sync.ds_syncid < sc->sc_syncid) {
2101 		/*
2102 		 * Reset all synchronization data for this disk,
2103 		 * because if it even was synchronized, it was
2104 		 * synchronized to disks with different syncid.
2105 		 */
2106 		disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
2107 		disk->d_sync.ds_offset = 0;
2108 		disk->d_sync.ds_offset_done = 0;
2109 		disk->d_sync.ds_syncid = sc->sc_syncid;
2110 		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
2111 		    (disk->d_flags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
2112 			state = G_RAID3_DISK_STATE_SYNCHRONIZING;
2113 		} else {
2114 			state = G_RAID3_DISK_STATE_STALE;
2115 		}
2116 	} else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ {
2117 		/*
2118 		 * Not good, NOT GOOD!
2119 		 * It means that device was started on stale disks
2120 		 * and more fresh disk just arrive.
2121 		 * If there were writes, device is fucked up, sorry.
2122 		 * I think the best choice here is don't touch
2123 		 * this disk and inform the user laudly.
2124 		 */
2125 		G_RAID3_DEBUG(0, "Device %s was started before the freshest "
2126 		    "disk (%s) arrives!! It will not be connected to the "
2127 		    "running device.", sc->sc_name,
2128 		    g_raid3_get_diskname(disk));
2129 		g_raid3_destroy_disk(disk);
2130 		state = G_RAID3_DISK_STATE_NONE;
2131 		/* Return immediately, because disk was destroyed. */
2132 		return (state);
2133 	}
2134 	G_RAID3_DEBUG(3, "State for %s disk: %s.",
2135 	    g_raid3_get_diskname(disk), g_raid3_disk_state2str(state));
2136 	return (state);
2137 }
2138 
2139 /*
2140  * Update device state.
2141  */
2142 static void
2143 g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force)
2144 {
2145 	struct g_raid3_disk *disk;
2146 	u_int state;
2147 
2148 	g_topology_assert();
2149 
2150 	switch (sc->sc_state) {
2151 	case G_RAID3_DEVICE_STATE_STARTING:
2152 	    {
2153 		u_int n, ndirty, ndisks, genid, syncid;
2154 
2155 		KASSERT(sc->sc_provider == NULL,
2156 		    ("Non-NULL provider in STARTING state (%s).", sc->sc_name));
2157 		/*
2158 		 * Are we ready? We are, if all disks are connected or
2159 		 * one disk is missing and 'force' is true.
2160 		 */
2161 		if (g_raid3_ndisks(sc, -1) + force == sc->sc_ndisks) {
2162 			if (!force)
2163 				callout_drain(&sc->sc_callout);
2164 		} else {
2165 			if (force) {
2166 				/*
2167 				 * Timeout expired, so destroy device.
2168 				 */
2169 				sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
2170 				G_RAID3_DEBUG(1, "root_mount_rel[%u] %p",
2171 				    __LINE__, sc->sc_rootmount);
2172 				root_mount_rel(sc->sc_rootmount);
2173 				sc->sc_rootmount = NULL;
2174 			}
2175 			return;
2176 		}
2177 
2178 		/*
2179 		 * Find the biggest genid.
2180 		 */
2181 		genid = 0;
2182 		for (n = 0; n < sc->sc_ndisks; n++) {
2183 			disk = &sc->sc_disks[n];
2184 			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
2185 				continue;
2186 			if (disk->d_genid > genid)
2187 				genid = disk->d_genid;
2188 		}
2189 		sc->sc_genid = genid;
2190 		/*
2191 		 * Remove all disks without the biggest genid.
2192 		 */
2193 		for (n = 0; n < sc->sc_ndisks; n++) {
2194 			disk = &sc->sc_disks[n];
2195 			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
2196 				continue;
2197 			if (disk->d_genid < genid) {
2198 				G_RAID3_DEBUG(0,
2199 				    "Component %s (device %s) broken, skipping.",
2200 				    g_raid3_get_diskname(disk), sc->sc_name);
2201 				g_raid3_destroy_disk(disk);
2202 			}
2203 		}
2204 
2205 		/*
2206 		 * There must be at least 'sc->sc_ndisks - 1' components
2207 		 * with the same syncid and without SYNCHRONIZING flag.
2208 		 */
2209 
2210 		/*
2211 		 * Find the biggest syncid, number of valid components and
2212 		 * number of dirty components.
2213 		 */
2214 		ndirty = ndisks = syncid = 0;
2215 		for (n = 0; n < sc->sc_ndisks; n++) {
2216 			disk = &sc->sc_disks[n];
2217 			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
2218 				continue;
2219 			if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0)
2220 				ndirty++;
2221 			if (disk->d_sync.ds_syncid > syncid) {
2222 				syncid = disk->d_sync.ds_syncid;
2223 				ndisks = 0;
2224 			} else if (disk->d_sync.ds_syncid < syncid) {
2225 				continue;
2226 			}
2227 			if ((disk->d_flags &
2228 			    G_RAID3_DISK_FLAG_SYNCHRONIZING) != 0) {
2229 				continue;
2230 			}
2231 			ndisks++;
2232 		}
2233 		/*
2234 		 * Do we have enough valid components?
2235 		 */
2236 		if (ndisks + 1 < sc->sc_ndisks) {
2237 			G_RAID3_DEBUG(0,
2238 			    "Device %s is broken, too few valid components.",
2239 			    sc->sc_name);
2240 			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
2241 			return;
2242 		}
2243 		/*
2244 		 * If there is one DIRTY component and all disks are present,
2245 		 * mark it for synchronization. If there is more than one DIRTY
2246 		 * component, mark parity component for synchronization.
2247 		 */
2248 		if (ndisks == sc->sc_ndisks && ndirty == 1) {
2249 			for (n = 0; n < sc->sc_ndisks; n++) {
2250 				disk = &sc->sc_disks[n];
2251 				if ((disk->d_flags &
2252 				    G_RAID3_DISK_FLAG_DIRTY) == 0) {
2253 					continue;
2254 				}
2255 				disk->d_flags |=
2256 				    G_RAID3_DISK_FLAG_SYNCHRONIZING;
2257 			}
2258 		} else if (ndisks == sc->sc_ndisks && ndirty > 1) {
2259 			disk = &sc->sc_disks[sc->sc_ndisks - 1];
2260 			disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
2261 		}
2262 
2263 		sc->sc_syncid = syncid;
2264 		if (force) {
2265 			/* Remember to bump syncid on first write. */
2266 			sc->sc_bump_id |= G_RAID3_BUMP_SYNCID;
2267 		}
2268 		if (ndisks == sc->sc_ndisks)
2269 			state = G_RAID3_DEVICE_STATE_COMPLETE;
2270 		else /* if (ndisks == sc->sc_ndisks - 1) */
2271 			state = G_RAID3_DEVICE_STATE_DEGRADED;
2272 		G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.",
2273 		    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
2274 		    g_raid3_device_state2str(state));
2275 		sc->sc_state = state;
2276 		for (n = 0; n < sc->sc_ndisks; n++) {
2277 			disk = &sc->sc_disks[n];
2278 			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
2279 				continue;
2280 			state = g_raid3_determine_state(disk);
2281 			g_raid3_event_send(disk, state, G_RAID3_EVENT_DONTWAIT);
2282 			if (state == G_RAID3_DISK_STATE_STALE)
2283 				sc->sc_bump_id |= G_RAID3_BUMP_SYNCID;
2284 		}
2285 		break;
2286 	    }
2287 	case G_RAID3_DEVICE_STATE_DEGRADED:
2288 		/*
2289 		 * Genid need to be bumped immediately, so do it here.
2290 		 */
2291 		if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) {
2292 			sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
2293 			g_raid3_bump_genid(sc);
2294 		}
2295 
2296 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
2297 			return;
2298 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) <
2299 		    sc->sc_ndisks - 1) {
2300 			if (sc->sc_provider != NULL)
2301 				g_raid3_destroy_provider(sc);
2302 			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
2303 			return;
2304 		}
2305 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
2306 		    sc->sc_ndisks) {
2307 			state = G_RAID3_DEVICE_STATE_COMPLETE;
2308 			G_RAID3_DEBUG(1,
2309 			    "Device %s state changed from %s to %s.",
2310 			    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
2311 			    g_raid3_device_state2str(state));
2312 			sc->sc_state = state;
2313 		}
2314 		if (sc->sc_provider == NULL)
2315 			g_raid3_launch_provider(sc);
2316 		if (sc->sc_rootmount != NULL) {
2317 			G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
2318 			    sc->sc_rootmount);
2319 			root_mount_rel(sc->sc_rootmount);
2320 			sc->sc_rootmount = NULL;
2321 		}
2322 		break;
2323 	case G_RAID3_DEVICE_STATE_COMPLETE:
2324 		/*
2325 		 * Genid need to be bumped immediately, so do it here.
2326 		 */
2327 		if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) {
2328 			sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
2329 			g_raid3_bump_genid(sc);
2330 		}
2331 
2332 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
2333 			return;
2334 		KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) >=
2335 		    sc->sc_ndisks - 1,
2336 		    ("Too few ACTIVE components in COMPLETE state (device %s).",
2337 		    sc->sc_name));
2338 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
2339 		    sc->sc_ndisks - 1) {
2340 			state = G_RAID3_DEVICE_STATE_DEGRADED;
2341 			G_RAID3_DEBUG(1,
2342 			    "Device %s state changed from %s to %s.",
2343 			    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
2344 			    g_raid3_device_state2str(state));
2345 			sc->sc_state = state;
2346 		}
2347 		if (sc->sc_provider == NULL)
2348 			g_raid3_launch_provider(sc);
2349 		if (sc->sc_rootmount != NULL) {
2350 			G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
2351 			    sc->sc_rootmount);
2352 			root_mount_rel(sc->sc_rootmount);
2353 			sc->sc_rootmount = NULL;
2354 		}
2355 		break;
2356 	default:
2357 		KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name,
2358 		    g_raid3_device_state2str(sc->sc_state)));
2359 		break;
2360 	}
2361 }
2362 
2363 /*
2364  * Update disk state and device state if needed.
2365  */
2366 #define	DISK_STATE_CHANGED()	G_RAID3_DEBUG(1,			\
2367 	"Disk %s state changed from %s to %s (device %s).",		\
2368 	g_raid3_get_diskname(disk),					\
2369 	g_raid3_disk_state2str(disk->d_state),				\
2370 	g_raid3_disk_state2str(state), sc->sc_name)
2371 static int
2372 g_raid3_update_disk(struct g_raid3_disk *disk, u_int state)
2373 {
2374 	struct g_raid3_softc *sc;
2375 
2376 	g_topology_assert();
2377 
2378 	sc = disk->d_softc;
2379 again:
2380 	G_RAID3_DEBUG(3, "Changing disk %s state from %s to %s.",
2381 	    g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state),
2382 	    g_raid3_disk_state2str(state));
2383 	switch (state) {
2384 	case G_RAID3_DISK_STATE_NEW:
2385 		/*
2386 		 * Possible scenarios:
2387 		 * 1. New disk arrive.
2388 		 */
2389 		/* Previous state should be NONE. */
2390 		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NONE,
2391 		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2392 		    g_raid3_disk_state2str(disk->d_state)));
2393 		DISK_STATE_CHANGED();
2394 
2395 		disk->d_state = state;
2396 		G_RAID3_DEBUG(0, "Device %s: provider %s detected.",
2397 		    sc->sc_name, g_raid3_get_diskname(disk));
2398 		if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING)
2399 			break;
2400 		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2401 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2402 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2403 		    g_raid3_device_state2str(sc->sc_state),
2404 		    g_raid3_get_diskname(disk),
2405 		    g_raid3_disk_state2str(disk->d_state)));
2406 		state = g_raid3_determine_state(disk);
2407 		if (state != G_RAID3_DISK_STATE_NONE)
2408 			goto again;
2409 		break;
2410 	case G_RAID3_DISK_STATE_ACTIVE:
2411 		/*
2412 		 * Possible scenarios:
2413 		 * 1. New disk does not need synchronization.
2414 		 * 2. Synchronization process finished successfully.
2415 		 */
2416 		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2417 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2418 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2419 		    g_raid3_device_state2str(sc->sc_state),
2420 		    g_raid3_get_diskname(disk),
2421 		    g_raid3_disk_state2str(disk->d_state)));
2422 		/* Previous state should be NEW or SYNCHRONIZING. */
2423 		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW ||
2424 		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
2425 		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2426 		    g_raid3_disk_state2str(disk->d_state)));
2427 		DISK_STATE_CHANGED();
2428 
2429 		if (disk->d_state == G_RAID3_DISK_STATE_NEW)
2430 			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2431 		else if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
2432 			disk->d_flags &= ~G_RAID3_DISK_FLAG_SYNCHRONIZING;
2433 			disk->d_flags &= ~G_RAID3_DISK_FLAG_FORCE_SYNC;
2434 			g_raid3_sync_stop(sc, 0);
2435 		}
2436 		disk->d_state = state;
2437 		disk->d_sync.ds_offset = 0;
2438 		disk->d_sync.ds_offset_done = 0;
2439 		g_raid3_update_access(disk);
2440 		g_raid3_update_metadata(disk);
2441 		G_RAID3_DEBUG(0, "Device %s: provider %s activated.",
2442 		    sc->sc_name, g_raid3_get_diskname(disk));
2443 		break;
2444 	case G_RAID3_DISK_STATE_STALE:
2445 		/*
2446 		 * Possible scenarios:
2447 		 * 1. Stale disk was connected.
2448 		 */
2449 		/* Previous state should be NEW. */
2450 		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
2451 		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2452 		    g_raid3_disk_state2str(disk->d_state)));
2453 		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2454 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2455 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2456 		    g_raid3_device_state2str(sc->sc_state),
2457 		    g_raid3_get_diskname(disk),
2458 		    g_raid3_disk_state2str(disk->d_state)));
2459 		/*
2460 		 * STALE state is only possible if device is marked
2461 		 * NOAUTOSYNC.
2462 		 */
2463 		KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0,
2464 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2465 		    g_raid3_device_state2str(sc->sc_state),
2466 		    g_raid3_get_diskname(disk),
2467 		    g_raid3_disk_state2str(disk->d_state)));
2468 		DISK_STATE_CHANGED();
2469 
2470 		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2471 		disk->d_state = state;
2472 		g_raid3_update_metadata(disk);
2473 		G_RAID3_DEBUG(0, "Device %s: provider %s is stale.",
2474 		    sc->sc_name, g_raid3_get_diskname(disk));
2475 		break;
2476 	case G_RAID3_DISK_STATE_SYNCHRONIZING:
2477 		/*
2478 		 * Possible scenarios:
2479 		 * 1. Disk which needs synchronization was connected.
2480 		 */
2481 		/* Previous state should be NEW. */
2482 		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
2483 		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
2484 		    g_raid3_disk_state2str(disk->d_state)));
2485 		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2486 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
2487 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
2488 		    g_raid3_device_state2str(sc->sc_state),
2489 		    g_raid3_get_diskname(disk),
2490 		    g_raid3_disk_state2str(disk->d_state)));
2491 		DISK_STATE_CHANGED();
2492 
2493 		if (disk->d_state == G_RAID3_DISK_STATE_NEW)
2494 			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2495 		disk->d_state = state;
2496 		if (sc->sc_provider != NULL) {
2497 			g_raid3_sync_start(sc);
2498 			g_raid3_update_metadata(disk);
2499 		}
2500 		break;
2501 	case G_RAID3_DISK_STATE_DISCONNECTED:
2502 		/*
2503 		 * Possible scenarios:
2504 		 * 1. Device wasn't running yet, but disk disappear.
2505 		 * 2. Disk was active and disapppear.
2506 		 * 3. Disk disappear during synchronization process.
2507 		 */
2508 		if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
2509 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
2510 			/*
2511 			 * Previous state should be ACTIVE, STALE or
2512 			 * SYNCHRONIZING.
2513 			 */
2514 			KASSERT(disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
2515 			    disk->d_state == G_RAID3_DISK_STATE_STALE ||
2516 			    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
2517 			    ("Wrong disk state (%s, %s).",
2518 			    g_raid3_get_diskname(disk),
2519 			    g_raid3_disk_state2str(disk->d_state)));
2520 		} else if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) {
2521 			/* Previous state should be NEW. */
2522 			KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
2523 			    ("Wrong disk state (%s, %s).",
2524 			    g_raid3_get_diskname(disk),
2525 			    g_raid3_disk_state2str(disk->d_state)));
2526 			/*
2527 			 * Reset bumping syncid if disk disappeared in STARTING
2528 			 * state.
2529 			 */
2530 			if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0)
2531 				sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
2532 #ifdef	INVARIANTS
2533 		} else {
2534 			KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).",
2535 			    sc->sc_name,
2536 			    g_raid3_device_state2str(sc->sc_state),
2537 			    g_raid3_get_diskname(disk),
2538 			    g_raid3_disk_state2str(disk->d_state)));
2539 #endif
2540 		}
2541 		DISK_STATE_CHANGED();
2542 		G_RAID3_DEBUG(0, "Device %s: provider %s disconnected.",
2543 		    sc->sc_name, g_raid3_get_diskname(disk));
2544 
2545 		g_raid3_destroy_disk(disk);
2546 		break;
2547 	default:
2548 		KASSERT(1 == 0, ("Unknown state (%u).", state));
2549 		break;
2550 	}
2551 	return (0);
2552 }
2553 #undef	DISK_STATE_CHANGED
2554 
2555 int
2556 g_raid3_read_metadata(struct g_consumer *cp, struct g_raid3_metadata *md)
2557 {
2558 	struct g_provider *pp;
2559 	u_char *buf;
2560 	int error;
2561 
2562 	g_topology_assert();
2563 
2564 	error = g_access(cp, 1, 0, 0);
2565 	if (error != 0)
2566 		return (error);
2567 	pp = cp->provider;
2568 	g_topology_unlock();
2569 	/* Metadata are stored on last sector. */
2570 	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
2571 	    &error);
2572 	g_topology_lock();
2573 	g_access(cp, -1, 0, 0);
2574 	if (buf == NULL) {
2575 		G_RAID3_DEBUG(1, "Cannot read metadata from %s (error=%d).",
2576 		    cp->provider->name, error);
2577 		if (buf != NULL)
2578 			g_free(buf);
2579 		return (error);
2580 	}
2581 
2582 	/* Decode metadata. */
2583 	error = raid3_metadata_decode(buf, md);
2584 	g_free(buf);
2585 	if (strcmp(md->md_magic, G_RAID3_MAGIC) != 0)
2586 		return (EINVAL);
2587 	if (md->md_version > G_RAID3_VERSION) {
2588 		G_RAID3_DEBUG(0,
2589 		    "Kernel module is too old to handle metadata from %s.",
2590 		    cp->provider->name);
2591 		return (EINVAL);
2592 	}
2593 	if (error != 0) {
2594 		G_RAID3_DEBUG(1, "MD5 metadata hash mismatch for provider %s.",
2595 		    cp->provider->name);
2596 		return (error);
2597 	}
2598 
2599 	return (0);
2600 }
2601 
2602 static int
2603 g_raid3_check_metadata(struct g_raid3_softc *sc, struct g_provider *pp,
2604     struct g_raid3_metadata *md)
2605 {
2606 
2607 	if (md->md_no >= sc->sc_ndisks) {
2608 		G_RAID3_DEBUG(1, "Invalid disk %s number (no=%u), skipping.",
2609 		    pp->name, md->md_no);
2610 		return (EINVAL);
2611 	}
2612 	if (sc->sc_disks[md->md_no].d_state != G_RAID3_DISK_STATE_NODISK) {
2613 		G_RAID3_DEBUG(1, "Disk %s (no=%u) already exists, skipping.",
2614 		    pp->name, md->md_no);
2615 		return (EEXIST);
2616 	}
2617 	if (md->md_all != sc->sc_ndisks) {
2618 		G_RAID3_DEBUG(1,
2619 		    "Invalid '%s' field on disk %s (device %s), skipping.",
2620 		    "md_all", pp->name, sc->sc_name);
2621 		return (EINVAL);
2622 	}
2623 	if (md->md_mediasize != sc->sc_mediasize) {
2624 		G_RAID3_DEBUG(1,
2625 		    "Invalid '%s' field on disk %s (device %s), skipping.",
2626 		    "md_mediasize", pp->name, sc->sc_name);
2627 		return (EINVAL);
2628 	}
2629 	if ((md->md_mediasize % (sc->sc_ndisks - 1)) != 0) {
2630 		G_RAID3_DEBUG(1,
2631 		    "Invalid '%s' field on disk %s (device %s), skipping.",
2632 		    "md_mediasize", pp->name, sc->sc_name);
2633 		return (EINVAL);
2634 	}
2635 	if ((sc->sc_mediasize / (sc->sc_ndisks - 1)) > pp->mediasize) {
2636 		G_RAID3_DEBUG(1,
2637 		    "Invalid size of disk %s (device %s), skipping.", pp->name,
2638 		    sc->sc_name);
2639 		return (EINVAL);
2640 	}
2641 	if ((md->md_sectorsize / pp->sectorsize) < sc->sc_ndisks - 1) {
2642 		G_RAID3_DEBUG(1,
2643 		    "Invalid '%s' field on disk %s (device %s), skipping.",
2644 		    "md_sectorsize", pp->name, sc->sc_name);
2645 		return (EINVAL);
2646 	}
2647 	if (md->md_sectorsize != sc->sc_sectorsize) {
2648 		G_RAID3_DEBUG(1,
2649 		    "Invalid '%s' field on disk %s (device %s), skipping.",
2650 		    "md_sectorsize", pp->name, sc->sc_name);
2651 		return (EINVAL);
2652 	}
2653 	if ((sc->sc_sectorsize % pp->sectorsize) != 0) {
2654 		G_RAID3_DEBUG(1,
2655 		    "Invalid sector size of disk %s (device %s), skipping.",
2656 		    pp->name, sc->sc_name);
2657 		return (EINVAL);
2658 	}
2659 	if ((md->md_mflags & ~G_RAID3_DEVICE_FLAG_MASK) != 0) {
2660 		G_RAID3_DEBUG(1,
2661 		    "Invalid device flags on disk %s (device %s), skipping.",
2662 		    pp->name, sc->sc_name);
2663 		return (EINVAL);
2664 	}
2665 	if ((md->md_mflags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
2666 	    (md->md_mflags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0) {
2667 		/*
2668 		 * VERIFY and ROUND-ROBIN options are mutally exclusive.
2669 		 */
2670 		G_RAID3_DEBUG(1, "Both VERIFY and ROUND-ROBIN flags exist on "
2671 		    "disk %s (device %s), skipping.", pp->name, sc->sc_name);
2672 		return (EINVAL);
2673 	}
2674 	if ((md->md_dflags & ~G_RAID3_DISK_FLAG_MASK) != 0) {
2675 		G_RAID3_DEBUG(1,
2676 		    "Invalid disk flags on disk %s (device %s), skipping.",
2677 		    pp->name, sc->sc_name);
2678 		return (EINVAL);
2679 	}
2680 	return (0);
2681 }
2682 
2683 int
2684 g_raid3_add_disk(struct g_raid3_softc *sc, struct g_provider *pp,
2685     struct g_raid3_metadata *md)
2686 {
2687 	struct g_raid3_disk *disk;
2688 	int error;
2689 
2690 	g_topology_assert();
2691 	G_RAID3_DEBUG(2, "Adding disk %s.", pp->name);
2692 
2693 	error = g_raid3_check_metadata(sc, pp, md);
2694 	if (error != 0)
2695 		return (error);
2696 	if (sc->sc_state != G_RAID3_DEVICE_STATE_STARTING &&
2697 	    md->md_genid < sc->sc_genid) {
2698 		G_RAID3_DEBUG(0, "Component %s (device %s) broken, skipping.",
2699 		    pp->name, sc->sc_name);
2700 		return (EINVAL);
2701 	}
2702 	disk = g_raid3_init_disk(sc, pp, md, &error);
2703 	if (disk == NULL)
2704 		return (error);
2705 	error = g_raid3_event_send(disk, G_RAID3_DISK_STATE_NEW,
2706 	    G_RAID3_EVENT_WAIT);
2707 	if (error != 0)
2708 		return (error);
2709 	if (md->md_version < G_RAID3_VERSION) {
2710 		G_RAID3_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).",
2711 		    pp->name, md->md_version, G_RAID3_VERSION);
2712 		g_raid3_update_metadata(disk);
2713 	}
2714 	return (0);
2715 }
2716 
2717 static int
2718 g_raid3_access(struct g_provider *pp, int acr, int acw, int ace)
2719 {
2720 	struct g_raid3_softc *sc;
2721 	struct g_raid3_disk *disk;
2722 	int dcr, dcw, dce;
2723 	u_int n;
2724 
2725 	g_topology_assert();
2726 	G_RAID3_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr,
2727 	    acw, ace);
2728 
2729 	dcr = pp->acr + acr;
2730 	dcw = pp->acw + acw;
2731 	dce = pp->ace + ace;
2732 
2733 	sc = pp->geom->softc;
2734 	if (sc == NULL ||
2735 	    g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks - 1 ||
2736 	    (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
2737 		if (acr <= 0 && acw <= 0 && ace <= 0)
2738 			return (0);
2739 		else
2740 			return (ENXIO);
2741 	}
2742 	for (n = 0; n < sc->sc_ndisks; n++) {
2743 		disk = &sc->sc_disks[n];
2744 		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
2745 			continue;
2746 		/*
2747 		 * Mark disk as dirty on open and unmark on close.
2748 		 */
2749 		if (pp->acw == 0 && dcw > 0) {
2750 			G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
2751 			    g_raid3_get_diskname(disk), sc->sc_name);
2752 			disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
2753 			g_raid3_update_metadata(disk);
2754 		} else if (pp->acw > 0 && dcw == 0) {
2755 			G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
2756 			    g_raid3_get_diskname(disk), sc->sc_name);
2757 			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
2758 			g_raid3_update_metadata(disk);
2759 		}
2760 	}
2761 	return (0);
2762 }
2763 
2764 static struct g_geom *
2765 g_raid3_create(struct g_class *mp, const struct g_raid3_metadata *md)
2766 {
2767 	struct g_raid3_softc *sc;
2768 	struct g_geom *gp;
2769 	int error, timeout;
2770 	u_int n;
2771 
2772 	g_topology_assert();
2773 	G_RAID3_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id);
2774 
2775 	/* One disk is minimum. */
2776 	if (md->md_all < 1)
2777 		return (NULL);
2778 	/*
2779 	 * Action geom.
2780 	 */
2781 	gp = g_new_geomf(mp, "%s", md->md_name);
2782 	sc = malloc(sizeof(*sc), M_RAID3, M_WAITOK | M_ZERO);
2783 	sc->sc_disks = malloc(sizeof(struct g_raid3_disk) * md->md_all, M_RAID3,
2784 	    M_WAITOK | M_ZERO);
2785 	gp->start = g_raid3_start;
2786 	gp->orphan = g_raid3_orphan;
2787 	gp->access = g_raid3_access;
2788 	gp->dumpconf = g_raid3_dumpconf;
2789 
2790 	sc->sc_id = md->md_id;
2791 	sc->sc_mediasize = md->md_mediasize;
2792 	sc->sc_sectorsize = md->md_sectorsize;
2793 	sc->sc_ndisks = md->md_all;
2794 	sc->sc_round_robin = 0;
2795 	sc->sc_flags = md->md_mflags;
2796 	sc->sc_bump_id = 0;
2797 	sc->sc_idle = 0;
2798 	for (n = 0; n < sc->sc_ndisks; n++) {
2799 		sc->sc_disks[n].d_softc = sc;
2800 		sc->sc_disks[n].d_no = n;
2801 		sc->sc_disks[n].d_state = G_RAID3_DISK_STATE_NODISK;
2802 	}
2803 	bioq_init(&sc->sc_queue);
2804 	mtx_init(&sc->sc_queue_mtx, "graid3:queue", NULL, MTX_DEF);
2805 	TAILQ_INIT(&sc->sc_events);
2806 	mtx_init(&sc->sc_events_mtx, "graid3:events", NULL, MTX_DEF);
2807 	callout_init(&sc->sc_callout, CALLOUT_MPSAFE);
2808 	sc->sc_state = G_RAID3_DEVICE_STATE_STARTING;
2809 	gp->softc = sc;
2810 	sc->sc_geom = gp;
2811 	sc->sc_provider = NULL;
2812 	/*
2813 	 * Synchronization geom.
2814 	 */
2815 	gp = g_new_geomf(mp, "%s.sync", md->md_name);
2816 	gp->softc = sc;
2817 	gp->orphan = g_raid3_orphan;
2818 	sc->sc_sync.ds_geom = gp;
2819 	sc->sc_zone_64k = uma_zcreate("gr3:64k", 65536, NULL, NULL, NULL, NULL,
2820 	    UMA_ALIGN_PTR, 0);
2821 	uma_zone_set_max(sc->sc_zone_64k, g_raid3_n64k);
2822 	sc->sc_zone_16k = uma_zcreate("gr3:16k", 16384, NULL, NULL, NULL, NULL,
2823 	    UMA_ALIGN_PTR, 0);
2824 	uma_zone_set_max(sc->sc_zone_64k, g_raid3_n16k);
2825 	sc->sc_zone_4k = uma_zcreate("gr3:4k", 4096, NULL, NULL, NULL, NULL,
2826 	    UMA_ALIGN_PTR, 0);
2827 	uma_zone_set_max(sc->sc_zone_4k, g_raid3_n4k);
2828 	error = kthread_create(g_raid3_worker, sc, &sc->sc_worker, 0, 0,
2829 	    "g_raid3 %s", md->md_name);
2830 	if (error != 0) {
2831 		G_RAID3_DEBUG(1, "Cannot create kernel thread for %s.",
2832 		    sc->sc_name);
2833 		uma_zdestroy(sc->sc_zone_64k);
2834 		uma_zdestroy(sc->sc_zone_16k);
2835 		uma_zdestroy(sc->sc_zone_4k);
2836 		g_destroy_geom(sc->sc_sync.ds_geom);
2837 		mtx_destroy(&sc->sc_events_mtx);
2838 		mtx_destroy(&sc->sc_queue_mtx);
2839 		g_destroy_geom(sc->sc_geom);
2840 		free(sc->sc_disks, M_RAID3);
2841 		free(sc, M_RAID3);
2842 		return (NULL);
2843 	}
2844 
2845 	G_RAID3_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id);
2846 
2847 	sc->sc_rootmount = root_mount_hold("GRAID3");
2848 	G_RAID3_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount);
2849 
2850 	/*
2851 	 * Run timeout.
2852 	 */
2853 	timeout = atomic_load_acq_int(&g_raid3_timeout);
2854 	callout_reset(&sc->sc_callout, timeout * hz, g_raid3_go, sc);
2855 	return (sc->sc_geom);
2856 }
2857 
2858 int
2859 g_raid3_destroy(struct g_raid3_softc *sc, boolean_t force)
2860 {
2861 	struct g_provider *pp;
2862 
2863 	g_topology_assert();
2864 
2865 	if (sc == NULL)
2866 		return (ENXIO);
2867 	pp = sc->sc_provider;
2868 	if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
2869 		if (force) {
2870 			G_RAID3_DEBUG(1, "Device %s is still open, so it "
2871 			    "can't be definitely removed.", pp->name);
2872 		} else {
2873 			G_RAID3_DEBUG(1,
2874 			    "Device %s is still open (r%dw%de%d).", pp->name,
2875 			    pp->acr, pp->acw, pp->ace);
2876 			return (EBUSY);
2877 		}
2878 	}
2879 
2880 	sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
2881 	sc->sc_flags |= G_RAID3_DEVICE_FLAG_WAIT;
2882 	g_topology_unlock();
2883 	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
2884 	mtx_lock(&sc->sc_queue_mtx);
2885 	wakeup(sc);
2886 	wakeup(&sc->sc_queue);
2887 	mtx_unlock(&sc->sc_queue_mtx);
2888 	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker);
2889 	while (sc->sc_worker != NULL)
2890 		tsleep(&sc->sc_worker, PRIBIO, "r3:destroy", hz / 5);
2891 	G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker);
2892 	g_topology_lock();
2893 	g_raid3_destroy_device(sc);
2894 	free(sc->sc_disks, M_RAID3);
2895 	free(sc, M_RAID3);
2896 	return (0);
2897 }
2898 
2899 static void
2900 g_raid3_taste_orphan(struct g_consumer *cp)
2901 {
2902 
2903 	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
2904 	    cp->provider->name));
2905 }
2906 
2907 static struct g_geom *
2908 g_raid3_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
2909 {
2910 	struct g_raid3_metadata md;
2911 	struct g_raid3_softc *sc;
2912 	struct g_consumer *cp;
2913 	struct g_geom *gp;
2914 	int error;
2915 
2916 	g_topology_assert();
2917 	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
2918 	G_RAID3_DEBUG(2, "Tasting %s.", pp->name);
2919 
2920 	gp = g_new_geomf(mp, "raid3:taste");
2921 	/* This orphan function should be never called. */
2922 	gp->orphan = g_raid3_taste_orphan;
2923 	cp = g_new_consumer(gp);
2924 	g_attach(cp, pp);
2925 	error = g_raid3_read_metadata(cp, &md);
2926 	g_detach(cp);
2927 	g_destroy_consumer(cp);
2928 	g_destroy_geom(gp);
2929 	if (error != 0)
2930 		return (NULL);
2931 	gp = NULL;
2932 
2933 	if (md.md_provider[0] != '\0' && strcmp(md.md_provider, pp->name) != 0)
2934 		return (NULL);
2935 	if (md.md_provsize != 0 && md.md_provsize != pp->mediasize)
2936 		return (NULL);
2937 	if (g_raid3_debug >= 2)
2938 		raid3_metadata_dump(&md);
2939 
2940 	/*
2941 	 * Let's check if device already exists.
2942 	 */
2943 	sc = NULL;
2944 	LIST_FOREACH(gp, &mp->geom, geom) {
2945 		sc = gp->softc;
2946 		if (sc == NULL)
2947 			continue;
2948 		if (sc->sc_sync.ds_geom == gp)
2949 			continue;
2950 		if (strcmp(md.md_name, sc->sc_name) != 0)
2951 			continue;
2952 		if (md.md_id != sc->sc_id) {
2953 			G_RAID3_DEBUG(0, "Device %s already configured.",
2954 			    sc->sc_name);
2955 			return (NULL);
2956 		}
2957 		break;
2958 	}
2959 	if (gp == NULL) {
2960 		gp = g_raid3_create(mp, &md);
2961 		if (gp == NULL) {
2962 			G_RAID3_DEBUG(0, "Cannot create device %s.",
2963 			    md.md_name);
2964 			return (NULL);
2965 		}
2966 		sc = gp->softc;
2967 	}
2968 	G_RAID3_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
2969 	error = g_raid3_add_disk(sc, pp, &md);
2970 	if (error != 0) {
2971 		G_RAID3_DEBUG(0, "Cannot add disk %s to %s (error=%d).",
2972 		    pp->name, gp->name, error);
2973 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NODISK) ==
2974 		    sc->sc_ndisks) {
2975 			g_raid3_destroy(sc, 1);
2976 		}
2977 		return (NULL);
2978 	}
2979 	return (gp);
2980 }
2981 
2982 static int
2983 g_raid3_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused,
2984     struct g_geom *gp)
2985 {
2986 
2987 	return (g_raid3_destroy(gp->softc, 0));
2988 }
2989 
2990 static void
2991 g_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
2992     struct g_consumer *cp, struct g_provider *pp)
2993 {
2994 	struct g_raid3_softc *sc;
2995 
2996 	g_topology_assert();
2997 
2998 	sc = gp->softc;
2999 	if (sc == NULL)
3000 		return;
3001 	/* Skip synchronization geom. */
3002 	if (gp == sc->sc_sync.ds_geom)
3003 		return;
3004 	if (pp != NULL) {
3005 		/* Nothing here. */
3006 	} else if (cp != NULL) {
3007 		struct g_raid3_disk *disk;
3008 
3009 		disk = cp->private;
3010 		if (disk == NULL)
3011 			return;
3012 		sbuf_printf(sb, "%s<Type>", indent);
3013 		if (disk->d_no == sc->sc_ndisks - 1)
3014 			sbuf_printf(sb, "PARITY");
3015 		else
3016 			sbuf_printf(sb, "DATA");
3017 		sbuf_printf(sb, "</Type>\n");
3018 		sbuf_printf(sb, "%s<Number>%u</Number>\n", indent,
3019 		    (u_int)disk->d_no);
3020 		if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
3021 			sbuf_printf(sb, "%s<Synchronized>", indent);
3022 			if (disk->d_sync.ds_offset_done == 0)
3023 				sbuf_printf(sb, "0%%");
3024 			else {
3025 				sbuf_printf(sb, "%u%%",
3026 				    (u_int)((disk->d_sync.ds_offset_done * 100) /
3027 				    (sc->sc_mediasize / (sc->sc_ndisks - 1))));
3028 			}
3029 			sbuf_printf(sb, "</Synchronized>\n");
3030 		}
3031 		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent,
3032 		    disk->d_sync.ds_syncid);
3033 		sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, disk->d_genid);
3034 		sbuf_printf(sb, "%s<Flags>", indent);
3035 		if (disk->d_flags == 0)
3036 			sbuf_printf(sb, "NONE");
3037 		else {
3038 			int first = 1;
3039 
3040 #define	ADD_FLAG(flag, name)	do {					\
3041 	if ((disk->d_flags & (flag)) != 0) {				\
3042 		if (!first)						\
3043 			sbuf_printf(sb, ", ");				\
3044 		else							\
3045 			first = 0;					\
3046 		sbuf_printf(sb, name);					\
3047 	}								\
3048 } while (0)
3049 			ADD_FLAG(G_RAID3_DISK_FLAG_DIRTY, "DIRTY");
3050 			ADD_FLAG(G_RAID3_DISK_FLAG_HARDCODED, "HARDCODED");
3051 			ADD_FLAG(G_RAID3_DISK_FLAG_SYNCHRONIZING,
3052 			    "SYNCHRONIZING");
3053 			ADD_FLAG(G_RAID3_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC");
3054 #undef	ADD_FLAG
3055 		}
3056 		sbuf_printf(sb, "</Flags>\n");
3057 		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
3058 		    g_raid3_disk_state2str(disk->d_state));
3059 	} else {
3060 		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
3061 		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent, sc->sc_syncid);
3062 		sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, sc->sc_genid);
3063 		sbuf_printf(sb, "%s<Flags>", indent);
3064 		if (sc->sc_flags == 0)
3065 			sbuf_printf(sb, "NONE");
3066 		else {
3067 			int first = 1;
3068 
3069 #define	ADD_FLAG(flag, name)	do {					\
3070 	if ((sc->sc_flags & (flag)) != 0) {				\
3071 		if (!first)						\
3072 			sbuf_printf(sb, ", ");				\
3073 		else							\
3074 			first = 0;					\
3075 		sbuf_printf(sb, name);					\
3076 	}								\
3077 } while (0)
3078 			ADD_FLAG(G_RAID3_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC");
3079 			ADD_FLAG(G_RAID3_DEVICE_FLAG_ROUND_ROBIN,
3080 			    "ROUND-ROBIN");
3081 			ADD_FLAG(G_RAID3_DEVICE_FLAG_VERIFY, "VERIFY");
3082 #undef	ADD_FLAG
3083 		}
3084 		sbuf_printf(sb, "</Flags>\n");
3085 		sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
3086 		    sc->sc_ndisks);
3087 		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
3088 		    g_raid3_device_state2str(sc->sc_state));
3089 	}
3090 }
3091 
3092 static void
3093 g_raid3_shutdown(void *arg, int howto)
3094 {
3095 	struct g_class *mp;
3096 	struct g_geom *gp, *gp2;
3097 
3098 	mp = arg;
3099 	DROP_GIANT();
3100 	g_topology_lock();
3101 	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
3102 		if (gp->softc == NULL)
3103 			continue;
3104 		g_raid3_destroy(gp->softc, 1);
3105 	}
3106 	g_topology_unlock();
3107 	PICKUP_GIANT();
3108 #if 0
3109 	tsleep(&gp, PRIBIO, "r3:shutdown", hz * 20);
3110 #endif
3111 }
3112 
3113 static void
3114 g_raid3_init(struct g_class *mp)
3115 {
3116 
3117 	g_raid3_ehtag = EVENTHANDLER_REGISTER(shutdown_post_sync,
3118 	    g_raid3_shutdown, mp, SHUTDOWN_PRI_FIRST);
3119 	if (g_raid3_ehtag == NULL)
3120 		G_RAID3_DEBUG(0, "Warning! Cannot register shutdown event.");
3121 }
3122 
3123 static void
3124 g_raid3_fini(struct g_class *mp)
3125 {
3126 
3127 	if (g_raid3_ehtag == NULL)
3128 		return;
3129 	EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_raid3_ehtag);
3130 }
3131 
/*
 * Hand g_raid3_class to GEOM's class-declaration macro under the name
 * g_raid3.
 * NOTE(review): the macro is defined outside this file; presumably it
 * emits the module glue that loads/unloads the class -- confirm in geom.h.
 */
DECLARE_GEOM_CLASS(g_raid3_class, g_raid3);
3133