xref: /freebsd/sys/geom/raid3/g_raid3.c (revision 74bf4e164ba5851606a27d4feff27717452583e5)
1 /*-
2  * Copyright (c) 2004 Pawel Jakub Dawidek <pjd@FreeBSD.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29 
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/kernel.h>
33 #include <sys/module.h>
34 #include <sys/limits.h>
35 #include <sys/lock.h>
36 #include <sys/mutex.h>
37 #include <sys/bio.h>
38 #include <sys/sysctl.h>
39 #include <sys/malloc.h>
40 #include <sys/bitstring.h>
41 #include <vm/uma.h>
42 #include <machine/atomic.h>
43 #include <geom/geom.h>
44 #include <sys/proc.h>
45 #include <sys/kthread.h>
46 #include <geom/raid3/g_raid3.h>
47 
48 
/* Memory type used for all GEOM_RAID3 allocations. */
static MALLOC_DEFINE(M_RAID3, "raid3 data", "GEOM_RAID3 Data");

/* Tunables exported under kern.geom.raid3. */
SYSCTL_DECL(_kern_geom);
SYSCTL_NODE(_kern_geom, OID_AUTO, raid3, CTLFLAG_RW, 0, "GEOM_RAID3 stuff");
u_int g_raid3_debug = 0;
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, debug, CTLFLAG_RW, &g_raid3_debug, 0,
    "Debug level");
static u_int g_raid3_timeout = 8;
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, timeout, CTLFLAG_RW, &g_raid3_timeout,
    0, "Time to wait on all raid3 components");
static u_int g_raid3_reqs_per_sync = 5;
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, reqs_per_sync, CTLFLAG_RW,
    &g_raid3_reqs_per_sync, 0,
    "Number of regular I/O requests per synchronization request");
static u_int g_raid3_syncs_per_sec = 100;
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, syncs_per_sec, CTLFLAG_RW,
    &g_raid3_syncs_per_sec, 0,
    "Number of synchronizations requests per second");

/*
 * Item counts for the pre-sized UMA zones used for child I/O buffers.
 * NOTE(review): TUNABLE_INT is applied to u_int variables here — a
 * signed/unsigned mismatch; verify against the tunable API.
 */
static u_int g_raid3_n64k = 50;
TUNABLE_INT("kern.geom.raid3.n64k", &g_raid3_n64k);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n64k, CTLFLAG_RD, &g_raid3_n64k, 0,
    "Maximum number of 64kB allocations");
static u_int g_raid3_n16k = 200;
TUNABLE_INT("kern.geom.raid3.n16k", &g_raid3_n16k);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n16k, CTLFLAG_RD, &g_raid3_n16k, 0,
    "Maximum number of 16kB allocations");
static u_int g_raid3_n4k = 1200;
TUNABLE_INT("kern.geom.raid3.n4k", &g_raid3_n4k);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n4k, CTLFLAG_RD, &g_raid3_n4k, 0,
    "Maximum number of 4kB allocations");

/* Read-only statistics exported under kern.geom.raid3.stat. */
SYSCTL_NODE(_kern_geom_raid3, OID_AUTO, stat, CTLFLAG_RW, 0,
    "GEOM_RAID3 statistics");
static u_int g_raid3_parity_mismatch = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, parity_mismatch, CTLFLAG_RD,
    &g_raid3_parity_mismatch, 0, "Number of failures in VERIFY mode");
static u_int g_raid3_64k_requested = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 64k_requested, CTLFLAG_RD,
    &g_raid3_64k_requested, 0, "Number of requested 64kB allocations");
static u_int g_raid3_64k_failed = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 64k_failed, CTLFLAG_RD,
    &g_raid3_64k_failed, 0, "Number of failed 64kB allocations");
static u_int g_raid3_16k_requested = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 16k_requested, CTLFLAG_RD,
    &g_raid3_16k_requested, 0, "Number of requested 16kB allocations");
static u_int g_raid3_16k_failed = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 16k_failed, CTLFLAG_RD,
    &g_raid3_16k_failed, 0, "Number of failed 16kB allocations");
static u_int g_raid3_4k_requested = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 4k_requested, CTLFLAG_RD,
    &g_raid3_4k_requested, 0, "Number of requested 4kB allocations");
static u_int g_raid3_4k_failed = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 4k_failed, CTLFLAG_RD,
    &g_raid3_4k_failed, 0, "Number of failed 4kB allocations");
104 
/*
 * Debug wrapper around msleep(9): log before sleeping on and after waking
 * up from the given wait channel.
 */
#define	MSLEEP(ident, mtx, priority, wmesg, timeout)	do {		\
	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));	\
	msleep((ident), (mtx), (priority), (wmesg), (timeout));		\
	G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, (ident));	\
} while (0)
110 
111 
112 static int g_raid3_destroy_geom(struct gctl_req *req, struct g_class *mp,
113     struct g_geom *gp);
114 static g_taste_t g_raid3_taste;
115 
/* GEOM class descriptor: hooks up the ctlreq, taste and destroy methods. */
struct g_class g_raid3_class = {
	.name = G_RAID3_CLASS_NAME,
	.version = G_VERSION,
	.ctlreq = g_raid3_config,
	.taste = g_raid3_taste,
	.destroy_geom = g_raid3_destroy_geom
};
123 
124 
125 static void g_raid3_destroy_provider(struct g_raid3_softc *sc);
126 static int g_raid3_update_disk(struct g_raid3_disk *disk, u_int state);
127 static void g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force);
128 static void g_raid3_dumpconf(struct sbuf *sb, const char *indent,
129     struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
130 static void g_raid3_sync_stop(struct g_raid3_softc *sc, int type);
131 
132 
133 /*
134  * XXX: it should be placed in subr_disk.c.
135  */
static void
bioq_insert_head(struct bio_queue_head *head, struct bio *bp)
{

	/* Put bp at the front of the queue so it is processed first. */
	TAILQ_INSERT_HEAD(&head->queue, bp, bio_queue);
}
142 
143 static const char *
144 g_raid3_disk_state2str(int state)
145 {
146 
147 	switch (state) {
148 	case G_RAID3_DISK_STATE_NODISK:
149 		return ("NODISK");
150 	case G_RAID3_DISK_STATE_NONE:
151 		return ("NONE");
152 	case G_RAID3_DISK_STATE_NEW:
153 		return ("NEW");
154 	case G_RAID3_DISK_STATE_ACTIVE:
155 		return ("ACTIVE");
156 	case G_RAID3_DISK_STATE_STALE:
157 		return ("STALE");
158 	case G_RAID3_DISK_STATE_SYNCHRONIZING:
159 		return ("SYNCHRONIZING");
160 	case G_RAID3_DISK_STATE_DISCONNECTED:
161 		return ("DISCONNECTED");
162 	default:
163 		return ("INVALID");
164 	}
165 }
166 
167 static const char *
168 g_raid3_device_state2str(int state)
169 {
170 
171 	switch (state) {
172 	case G_RAID3_DEVICE_STATE_STARTING:
173 		return ("STARTING");
174 	case G_RAID3_DEVICE_STATE_DEGRADED:
175 		return ("DEGRADED");
176 	case G_RAID3_DEVICE_STATE_COMPLETE:
177 		return ("COMPLETE");
178 	default:
179 		return ("INVALID");
180 	}
181 }
182 
183 const char *
184 g_raid3_get_diskname(struct g_raid3_disk *disk)
185 {
186 
187 	if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
188 		return ("[unknown]");
189 	return (disk->d_name);
190 }
191 
/*
 * XOR 'size' bytes of src1 and src2 into dst.  The wrapper macro hides the
 * uint64_t casts; size must be a multiple of 128 bytes because the loop is
 * unrolled to process 16 64-bit words per iteration.
 */
#define	g_raid3_xor(src1, src2, dst, size)				\
	_g_raid3_xor((uint64_t *)(src1), (uint64_t *)(src2),		\
	    (uint64_t *)(dst), (size_t)size)
static void
_g_raid3_xor(uint64_t *src1, uint64_t *src2, uint64_t *dst, size_t size)
{

	KASSERT((size % 128) == 0, ("Invalid size: %zu.", size));
	/* Manually unrolled: 16 * 8 bytes = 128 bytes per iteration. */
	for (; size > 0; size -= 128) {
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
	}
}
219 
220 static int
221 g_raid3_is_zero(struct bio *bp)
222 {
223 	static const uint64_t zeros[] = {
224 	    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
225 	};
226 	u_char *addr;
227 	ssize_t size;
228 
229 	size = bp->bio_length;
230 	addr = (u_char *)bp->bio_data;
231 	for (; size > 0; size -= sizeof(zeros), addr += sizeof(zeros)) {
232 		if (bcmp(addr, zeros, sizeof(zeros)) != 0)
233 			return (0);
234 	}
235 	return (1);
236 }
237 
238 /*
239  * --- Events handling functions ---
240  * Events in geom_raid3 are used to maintain disks and device status
241  * from one thread to simplify locking.
242  */
/*
 * Release an event structure allocated in g_raid3_event_send().
 */
static void
g_raid3_event_free(struct g_raid3_event *ep)
{

	free(ep, M_RAID3);
}
249 
/*
 * Queue a state-change event for a disk (or for the whole device when
 * G_RAID3_EVENT_DEVICE is set in flags) and wake the worker thread.
 * Unless G_RAID3_EVENT_DONTWAIT is set, drop the topology lock and sleep
 * until the worker marks the event done, then return its error code.
 */
int
g_raid3_event_send(void *arg, int state, int flags)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct g_raid3_event *ep;
	int error;

	ep = malloc(sizeof(*ep), M_RAID3, M_WAITOK);
	G_RAID3_DEBUG(4, "%s: Sending event %p.", __func__, ep);
	/* 'arg' is either the softc (device event) or a disk. */
	if ((flags & G_RAID3_EVENT_DEVICE) != 0) {
		disk = NULL;
		sc = arg;
	} else {
		disk = arg;
		sc = disk->d_softc;
	}
	ep->e_disk = disk;
	ep->e_state = state;
	ep->e_flags = flags;
	ep->e_error = 0;
	mtx_lock(&sc->sc_events_mtx);
	TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
	mtx_unlock(&sc->sc_events_mtx);
	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
	/* Wake the worker thread so it processes the new event. */
	mtx_lock(&sc->sc_queue_mtx);
	wakeup(sc);
	wakeup(&sc->sc_queue);
	mtx_unlock(&sc->sc_queue_mtx);
	if ((flags & G_RAID3_EVENT_DONTWAIT) != 0)
		return (0);
	g_topology_assert();
	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, ep);
	g_topology_unlock();
	/* Sleep (with a 5s re-check period) until the worker is done. */
	while ((ep->e_flags & G_RAID3_EVENT_DONE) == 0) {
		mtx_lock(&sc->sc_events_mtx);
		MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "r3:event",
		    hz * 5);
	}
	/* Don't even try to use 'sc' here, because it could be already dead. */
	g_topology_lock();
	error = ep->e_error;
	g_raid3_event_free(ep);
	return (error);
}
295 
296 static struct g_raid3_event *
297 g_raid3_event_get(struct g_raid3_softc *sc)
298 {
299 	struct g_raid3_event *ep;
300 
301 	mtx_lock(&sc->sc_events_mtx);
302 	ep = TAILQ_FIRST(&sc->sc_events);
303 	if (ep != NULL)
304 		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
305 	mtx_unlock(&sc->sc_events_mtx);
306 	return (ep);
307 }
308 
/*
 * Drop all pending events that refer to the given disk.  Fire-and-forget
 * (DONTWAIT) events are freed outright; waited-on events get ECANCELED
 * and their sleeping senders are woken (the sender frees the event).
 * NOTE(review): unlike g_raid3_destroy_device(), G_RAID3_EVENT_DONE is
 * not set here before wakeup() — confirm the sender's wait loop in
 * g_raid3_event_send() can actually terminate for cancelled events.
 */
static void
g_raid3_event_cancel(struct g_raid3_disk *disk)
{
	struct g_raid3_softc *sc;
	struct g_raid3_event *ep, *tmpep;

	g_topology_assert();

	sc = disk->d_softc;
	mtx_lock(&sc->sc_events_mtx);
	TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
		/* Device-wide events are not tied to this disk; keep them. */
		if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0)
			continue;
		if (ep->e_disk != disk)
			continue;
		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
		if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
			g_raid3_event_free(ep);
		else {
			ep->e_error = ECANCELED;
			wakeup(ep);
		}
	}
	mtx_unlock(&sc->sc_events_mtx);
}
334 
335 /*
336  * Return the number of disks in the given state.
337  * If state is equal to -1, count all connected disks.
338  */
339 u_int
340 g_raid3_ndisks(struct g_raid3_softc *sc, int state)
341 {
342 	struct g_raid3_disk *disk;
343 	u_int n, ndisks;
344 
345 	for (n = ndisks = 0; n < sc->sc_ndisks; n++) {
346 		disk = &sc->sc_disks[n];
347 		if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
348 			continue;
349 		if (state == -1 || disk->d_state == state)
350 			ndisks++;
351 	}
352 	return (ndisks);
353 }
354 
355 static u_int
356 g_raid3_nrequests(struct g_raid3_softc *sc, struct g_consumer *cp)
357 {
358 	struct bio *bp;
359 	u_int nreqs = 0;
360 
361 	mtx_lock(&sc->sc_queue_mtx);
362 	TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
363 		if (bp->bio_from == cp)
364 			nreqs++;
365 	}
366 	mtx_unlock(&sc->sc_queue_mtx);
367 	return (nreqs);
368 }
369 
370 static int
371 g_raid3_is_busy(struct g_raid3_softc *sc, struct g_consumer *cp)
372 {
373 
374 	if (cp->nstart != cp->nend) {
375 		G_RAID3_DEBUG(2,
376 		    "I/O requests for %s exist, can't destroy it now.",
377 		    cp->provider->name);
378 		return (1);
379 	}
380 	if (g_raid3_nrequests(sc, cp) > 0) {
381 		G_RAID3_DEBUG(2,
382 		    "I/O requests for %s in queue, can't destroy it now.",
383 		    cp->provider->name);
384 		return (1);
385 	}
386 	return (0);
387 }
388 
/*
 * Detach and destroy the consumer, unless it still has outstanding or
 * queued I/O; in that case only its private pointer is cleared and the
 * destruction is deferred.
 */
static void
g_raid3_kill_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
{

	g_topology_assert();

	cp->private = NULL;
	if (g_raid3_is_busy(sc, cp))
		return;
	G_RAID3_DEBUG(2, "Consumer %s destroyed.", cp->provider->name);
	g_detach(cp);
	g_destroy_consumer(cp);
}
402 
/*
 * Create a consumer for the disk on the device's geom and attach it to
 * the given provider.  On g_attach() failure the consumer is left
 * allocated; callers clean it up via g_raid3_disconnect_consumer().
 */
static int
g_raid3_connect_disk(struct g_raid3_disk *disk, struct g_provider *pp)
{
	int error;

	g_topology_assert();
	KASSERT(disk->d_consumer == NULL,
	    ("Disk already connected (device %s).", disk->d_softc->sc_name));

	disk->d_consumer = g_new_consumer(disk->d_softc->sc_geom);
	/* Back-pointer so orphan/spoiled callbacks can find the disk. */
	disk->d_consumer->private = disk;
	error = g_attach(disk->d_consumer, pp);
	if (error != 0)
		return (error);
	G_RAID3_DEBUG(2, "Disk %s connected.", g_raid3_get_diskname(disk));
	return (0);
}
420 
/*
 * Close and tear down a consumer.  A consumer that never got attached to
 * a provider is destroyed directly; otherwise any open access counts are
 * dropped first and destruction may be deferred while I/O is in flight
 * (see g_raid3_kill_consumer()).
 */
static void
g_raid3_disconnect_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
{

	g_topology_assert();

	if (cp == NULL)
		return;
	if (cp->provider != NULL) {
		G_RAID3_DEBUG(2, "Disk %s disconnected.", cp->provider->name);
		if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) {
			G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d",
			    cp->provider->name, -cp->acr, -cp->acw, -cp->ace,
			    0);
			/* Drop all access counts before destruction. */
			g_access(cp, -cp->acr, -cp->acw, -cp->ace);
		}
		g_raid3_kill_consumer(sc, cp);
	} else {
		g_destroy_consumer(cp);
	}
}
442 
/*
 * Initialize disk: create a consumer, attach it to the provider and load
 * per-disk state from the on-disk metadata.
 * NOTE(review): an older version of this comment also claimed access
 * (r1w1e1) is opened here, but no g_access() call is made in this
 * function — the caller is expected to open access.
 */
static struct g_raid3_disk *
g_raid3_init_disk(struct g_raid3_softc *sc, struct g_provider *pp,
    struct g_raid3_metadata *md, int *errorp)
{
	struct g_raid3_disk *disk;
	int error;

	/* The metadata records which component slot this disk occupies. */
	disk = &sc->sc_disks[md->md_no];
	disk->d_softc = sc;
	error = g_raid3_connect_disk(disk, pp);
	if (error != 0)
		goto fail;
	disk->d_no = md->md_no;
	disk->d_state = G_RAID3_DISK_STATE_NONE;
	disk->d_flags = md->md_dflags;
	if (md->md_provider[0] != '\0')
		disk->d_flags |= G_RAID3_DISK_FLAG_HARDCODED;
	disk->d_sync.ds_consumer = NULL;
	disk->d_sync.ds_offset = md->md_sync_offset;
	disk->d_sync.ds_offset_done = md->md_sync_offset;
	disk->d_sync.ds_syncid = md->md_syncid;
	if (errorp != NULL)
		*errorp = 0;
	return (disk);
fail:
	if (errorp != NULL)
		*errorp = error;
	/* 'disk' always points into sc_disks here, so this check is moot. */
	if (disk != NULL)
		g_raid3_disconnect_consumer(sc, disk->d_consumer);
	return (NULL);
}
478 
/*
 * Detach a disk from the device: cancel its pending events, stop
 * synchronization if this is the disk currently being synchronized,
 * drop its consumer and mark the slot empty (NODISK).
 */
static void
g_raid3_destroy_disk(struct g_raid3_disk *disk)
{
	struct g_raid3_softc *sc;

	g_topology_assert();

	if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
		return;
	g_raid3_event_cancel(disk);
	sc = disk->d_softc;
	switch (disk->d_state) {
	case G_RAID3_DISK_STATE_SYNCHRONIZING:
		if (sc->sc_syncdisk != NULL)
			g_raid3_sync_stop(sc, 1);
		/* FALLTHROUGH */
	case G_RAID3_DISK_STATE_NEW:
	case G_RAID3_DISK_STATE_STALE:
	case G_RAID3_DISK_STATE_ACTIVE:
		g_raid3_disconnect_consumer(sc, disk->d_consumer);
		disk->d_consumer = NULL;
		break;
	default:
		/* Any other state at this point is a programming error. */
		KASSERT(0 == 1, ("Wrong disk state (%s, %s).",
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
	}
	disk->d_state = G_RAID3_DISK_STATE_NODISK;
}
508 
/*
 * Tear down the whole device: destroy the provider and all disks, cancel
 * queued events (waking any waiting senders with ECANCELED), shut down
 * the synchronization geom, free the UMA zones and wither the geom.
 */
static void
g_raid3_destroy_device(struct g_raid3_softc *sc)
{
	struct g_raid3_event *ep;
	struct g_geom *gp;
	struct g_consumer *cp;
	u_int n;

	g_topology_assert();

	gp = sc->sc_geom;
	if (sc->sc_provider != NULL)
		g_raid3_destroy_provider(sc);
	for (n = 0; n < sc->sc_ndisks; n++)
		g_raid3_destroy_disk(&sc->sc_disks[n]);
	while ((ep = g_raid3_event_get(sc)) != NULL) {
		if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
			g_raid3_event_free(ep);
		else {
			/*
			 * The sender sleeps on ep; mark the event DONE so
			 * its wait loop terminates, then wake it.  The
			 * sender frees the event afterwards.
			 */
			ep->e_error = ECANCELED;
			ep->e_flags |= G_RAID3_EVENT_DONE;
			G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, ep);
			mtx_lock(&sc->sc_events_mtx);
			wakeup(ep);
			mtx_unlock(&sc->sc_events_mtx);
		}
	}
	callout_drain(&sc->sc_callout);
	gp->softc = NULL;
	/* Tear down the synchronization geom's consumer, if any. */
	cp = LIST_FIRST(&sc->sc_sync.ds_geom->consumer);
	if (cp != NULL)
		g_raid3_disconnect_consumer(sc, cp);
	sc->sc_sync.ds_geom->softc = NULL;
	g_wither_geom(sc->sc_sync.ds_geom, ENXIO);
	uma_zdestroy(sc->sc_zone_64k);
	uma_zdestroy(sc->sc_zone_16k);
	uma_zdestroy(sc->sc_zone_4k);
	mtx_destroy(&sc->sc_queue_mtx);
	mtx_destroy(&sc->sc_events_mtx);
	G_RAID3_DEBUG(0, "Device %s destroyed.", gp->name);
	g_wither_geom(gp, ENXIO);
}
551 
552 static void
553 g_raid3_orphan(struct g_consumer *cp)
554 {
555 	struct g_raid3_disk *disk;
556 
557 	g_topology_assert();
558 
559 	disk = cp->private;
560 	if (disk == NULL)
561 		return;
562 	disk->d_softc->sc_bump_syncid = G_RAID3_BUMP_ON_FIRST_WRITE;
563 	g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
564 	    G_RAID3_EVENT_DONTWAIT);
565 }
566 
567 static void
568 g_raid3_spoiled(struct g_consumer *cp)
569 {
570 	struct g_raid3_disk *disk;
571 
572 	g_topology_assert();
573 
574 	disk = cp->private;
575 	if (disk == NULL)
576 		return;
577 	disk->d_softc->sc_bump_syncid = G_RAID3_BUMP_IMMEDIATELY;
578 	g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
579 	    G_RAID3_EVENT_DONTWAIT);
580 }
581 
/*
 * Write the given metadata (or an all-zero sector when md == NULL) into
 * the last sector of the disk's provider.  If the disk is not marked
 * DIRTY, the consumer is opened w1e1 just for the duration of the write.
 * On write failure the disk is scheduled for disconnection.
 */
static int
g_raid3_write_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
{
	struct g_raid3_softc *sc;
	struct g_consumer *cp;
	off_t offset, length;
	int close = 0, error = 0;
	u_char *sector;

	g_topology_assert();

	sc = disk->d_softc;
	cp = disk->d_consumer;
	KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name));
	KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name));
	/* Metadata lives in the provider's last sector. */
	length = cp->provider->sectorsize;
	offset = cp->provider->mediasize - length;
	sector = malloc((size_t)length, M_RAID3, M_WAITOK | M_ZERO);
	/*
	 * Open consumer if it wasn't opened and remember to close it.
	 */
	if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) == 0) {
		error = g_access(cp, 0, 1, 1);
		G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", cp->provider->name,
		    0, 1, 1, error);
		if (error == 0)
			close = 1;
#ifdef	INVARIANTS
	/* A DIRTY disk must already be open for writing; assert that. */
	} else {
		KASSERT(cp->acw > 0 && cp->ace > 0,
		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
		    cp->acr, cp->acw, cp->ace));
#endif
	}
	if (error == 0) {
		/* md == NULL leaves the sector zeroed (clears metadata). */
		if (md != NULL)
			raid3_metadata_encode(md, sector);
		g_topology_unlock();
		error = g_write_data(cp, offset, sector, length);
		g_topology_lock();
	}
	free(sector, M_RAID3);
	if (close) {
		g_access(cp, 0, -1, -1);
		G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d",
		    cp->provider->name, 0, -1, -1, 0);
	}
	if (error != 0) {
		/* The disk can no longer be trusted; drop it. */
		disk->d_softc->sc_bump_syncid = G_RAID3_BUMP_IMMEDIATELY;
		g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
		    G_RAID3_EVENT_DONTWAIT);
	}
	return (error);
}
636 
637 int
638 g_raid3_clear_metadata(struct g_raid3_disk *disk)
639 {
640 	int error;
641 
642 	g_topology_assert();
643 	error = g_raid3_write_metadata(disk, NULL);
644 	if (error == 0) {
645 		G_RAID3_DEBUG(2, "Metadata on %s cleared.",
646 		    g_raid3_get_diskname(disk));
647 	} else {
648 		G_RAID3_DEBUG(0,
649 		    "Cannot clear metadata on disk %s (error=%d).",
650 		    g_raid3_get_diskname(disk), error);
651 	}
652 	return (error);
653 }
654 
/*
 * Populate *md with current metadata for the given disk: device-wide
 * fields come from the softc, per-disk fields from the disk itself.
 * The sync offset is only recorded while the disk is SYNCHRONIZING;
 * hardcoded disks store their provider name for matching at taste time.
 */
void
g_raid3_fill_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
{
	struct g_raid3_softc *sc;

	sc = disk->d_softc;
	strlcpy(md->md_magic, G_RAID3_MAGIC, sizeof(md->md_magic));
	md->md_version = G_RAID3_VERSION;
	strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name));
	md->md_id = sc->sc_id;
	md->md_all = sc->sc_ndisks;
	md->md_mediasize = sc->sc_mediasize;
	md->md_sectorsize = sc->sc_sectorsize;
	md->md_mflags = (sc->sc_flags & G_RAID3_DEVICE_FLAG_MASK);
	md->md_no = disk->d_no;
	md->md_syncid = disk->d_sync.ds_syncid;
	md->md_dflags = (disk->d_flags & G_RAID3_DISK_FLAG_MASK);
	if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING)
		md->md_sync_offset = disk->d_sync.ds_offset_done;
	else
		md->md_sync_offset = 0;
	if ((disk->d_flags & G_RAID3_DISK_FLAG_HARDCODED) != 0 &&
	    disk->d_consumer != NULL && disk->d_consumer->provider != NULL) {
		strlcpy(md->md_provider, disk->d_consumer->provider->name,
		    sizeof(md->md_provider));
	} else {
		bzero(md->md_provider, sizeof(md->md_provider));
	}
}
684 
685 void
686 g_raid3_update_metadata(struct g_raid3_disk *disk)
687 {
688 	struct g_raid3_metadata md;
689 	int error;
690 
691 	g_topology_assert();
692 	g_raid3_fill_metadata(disk, &md);
693 	error = g_raid3_write_metadata(disk, &md);
694 	if (error == 0) {
695 		G_RAID3_DEBUG(2, "Metadata on %s updated.",
696 		    g_raid3_get_diskname(disk));
697 	} else {
698 		G_RAID3_DEBUG(0,
699 		    "Cannot update metadata on disk %s (error=%d).",
700 		    g_raid3_get_diskname(disk), error);
701 	}
702 }
703 
/*
 * Increase the device's sync generation and write it into the metadata of
 * every ACTIVE or SYNCHRONIZING disk, so stale components can be detected
 * after a crash or disconnect.
 */
static void
g_raid3_bump_syncid(struct g_raid3_softc *sc)
{
	struct g_raid3_disk *disk;
	u_int n;

	g_topology_assert();
	KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
	    ("%s called with no active disks (device=%s).", __func__,
	    sc->sc_name));

	sc->sc_syncid++;
	for (n = 0; n < sc->sc_ndisks; n++) {
		disk = &sc->sc_disks[n];
		if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
			disk->d_sync.ds_syncid = sc->sc_syncid;
			g_raid3_update_metadata(disk);
		}
	}
}
725 
/*
 * Treat bio_driver1 field in parent bio as list head and field bio_caller1
 * in child bio as pointer to the next element on the list.
 */
/* Head of the singly-linked child list, stored in the parent bio. */
#define	G_RAID3_HEAD_BIO(pbp)	(pbp)->bio_driver1

/* Link to the next child, stored in each child bio. */
#define	G_RAID3_NEXT_BIO(cbp)	(cbp)->bio_caller1

/* Iterate over all children of pbp. */
#define	G_RAID3_FOREACH_BIO(pbp, bp)					\
	for ((bp) = G_RAID3_HEAD_BIO(pbp); (bp) != NULL;		\
	    (bp) = G_RAID3_NEXT_BIO(bp))

/* Iteration variant that is safe against removal of the current child. */
#define	G_RAID3_FOREACH_SAFE_BIO(pbp, bp, tmpbp)			\
	for ((bp) = G_RAID3_HEAD_BIO(pbp);				\
	    (bp) != NULL && ((tmpbp) = G_RAID3_NEXT_BIO(bp), 1);	\
	    (bp) = (tmpbp))
742 
static void
g_raid3_init_bio(struct bio *pbp)
{

	/* Start the parent with an empty child list. */
	G_RAID3_HEAD_BIO(pbp) = NULL;
}
749 
750 static void
751 g_raid3_remove_bio(struct bio *cbp)
752 {
753 	struct bio *pbp, *bp;
754 
755 	pbp = cbp->bio_parent;
756 	if (G_RAID3_HEAD_BIO(pbp) == cbp)
757 		G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
758 	else {
759 		G_RAID3_FOREACH_BIO(pbp, bp) {
760 			if (G_RAID3_NEXT_BIO(bp) == cbp) {
761 				G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
762 				break;
763 			}
764 		}
765 	}
766 	G_RAID3_NEXT_BIO(cbp) = NULL;
767 }
768 
/*
 * Substitute sbp for dbp in dbp's parent's child list: sbp is first
 * unlinked from its own position, then spliced in where dbp was; dbp
 * ends up unlinked.
 */
static void
g_raid3_replace_bio(struct bio *sbp, struct bio *dbp)
{
	struct bio *pbp, *bp;

	g_raid3_remove_bio(sbp);
	pbp = dbp->bio_parent;
	G_RAID3_NEXT_BIO(sbp) = G_RAID3_NEXT_BIO(dbp);
	if (G_RAID3_HEAD_BIO(pbp) == dbp)
		G_RAID3_HEAD_BIO(pbp) = sbp;
	else {
		/* Redirect dbp's predecessor to sbp. */
		G_RAID3_FOREACH_BIO(pbp, bp) {
			if (G_RAID3_NEXT_BIO(bp) == dbp) {
				G_RAID3_NEXT_BIO(bp) = sbp;
				break;
			}
		}
	}
	G_RAID3_NEXT_BIO(dbp) = NULL;
}
789 
/*
 * Unlink a child bio from its parent, return its data buffer to the UMA
 * zone it came from (the zone is derived from the parent's per-disk
 * request size, mirroring g_raid3_clone_bio()) and destroy the bio.
 */
static void
g_raid3_destroy_bio(struct g_raid3_softc *sc, struct bio *cbp)
{
	struct bio *bp, *pbp;
	size_t size;

	pbp = cbp->bio_parent;
	pbp->bio_children--;
	KASSERT(cbp->bio_data != NULL, ("NULL bio_data"));
	size = pbp->bio_length / (sc->sc_ndisks - 1);
	if (size > 16384)
		uma_zfree(sc->sc_zone_64k, cbp->bio_data);
	else if (size > 4096)
		uma_zfree(sc->sc_zone_16k, cbp->bio_data);
	else
		uma_zfree(sc->sc_zone_4k, cbp->bio_data);
	if (G_RAID3_HEAD_BIO(pbp) == cbp) {
		G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
		G_RAID3_NEXT_BIO(cbp) = NULL;
		g_destroy_bio(cbp);
	} else {
		/* Find cbp's predecessor and splice cbp out. */
		G_RAID3_FOREACH_BIO(pbp, bp) {
			if (G_RAID3_NEXT_BIO(bp) == cbp)
				break;
		}
		if (bp != NULL) {
			KASSERT(G_RAID3_NEXT_BIO(bp) != NULL,
			    ("NULL bp->bio_driver1"));
			G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
			G_RAID3_NEXT_BIO(cbp) = NULL;
		}
		g_destroy_bio(cbp);
	}
}
824 
825 static struct bio *
826 g_raid3_clone_bio(struct g_raid3_softc *sc, struct bio *pbp)
827 {
828 	struct bio *bp, *cbp;
829 	size_t size;
830 
831 	cbp = g_clone_bio(pbp);
832 	if (cbp == NULL)
833 		return (NULL);
834 	size = pbp->bio_length / (sc->sc_ndisks - 1);
835 	if (size > 16384) {
836 		cbp->bio_data = uma_zalloc(sc->sc_zone_64k, M_NOWAIT);
837 		g_raid3_64k_requested++;
838 	} else if (size > 4096) {
839 		cbp->bio_data = uma_zalloc(sc->sc_zone_16k, M_NOWAIT);
840 		g_raid3_16k_requested++;
841 	} else {
842 		cbp->bio_data = uma_zalloc(sc->sc_zone_4k, M_NOWAIT);
843 		g_raid3_4k_requested++;
844 	}
845 	if (cbp->bio_data == NULL) {
846 		if (size > 16384)
847 			g_raid3_64k_failed++;
848 		if (size > 4096)
849 			g_raid3_16k_failed++;
850 		else
851 			g_raid3_4k_failed++;
852 		pbp->bio_children--;
853 		g_destroy_bio(cbp);
854 		return (NULL);
855 	}
856 	G_RAID3_NEXT_BIO(cbp) = NULL;
857 	if (G_RAID3_HEAD_BIO(pbp) == NULL)
858 		G_RAID3_HEAD_BIO(pbp) = cbp;
859 	else {
860 		G_RAID3_FOREACH_BIO(pbp, bp) {
861 			if (G_RAID3_NEXT_BIO(bp) == NULL) {
862 				G_RAID3_NEXT_BIO(bp) = cbp;
863 				break;
864 			}
865 		}
866 	}
867 	return (cbp);
868 }
869 
/*
 * Split a write request among the child bios: distribute the parent's
 * data round-robin in 'atom'-sized pieces, compute the parity component
 * (unless NOPARITY is set) and send each child to its disk's consumer.
 */
static void
g_raid3_scatter(struct bio *pbp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct bio *bp, *cbp;
	off_t atom, cadd, padd, left;

	sc = pbp->bio_to->geom->softc;
	bp = NULL;
	if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
		/*
		 * Find bio for which we should calculate data.
		 */
		G_RAID3_FOREACH_BIO(pbp, cbp) {
			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
				bp = cbp;
				break;
			}
		}
		KASSERT(bp != NULL, ("NULL parity bio."));
	}
	/* 'atom' is each data disk's share of one parent sector. */
	atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
	cadd = padd = 0;
	for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
		G_RAID3_FOREACH_BIO(pbp, cbp) {
			if (cbp == bp)
				continue;
			bcopy(pbp->bio_data + padd, cbp->bio_data + cadd, atom);
			padd += atom;
		}
		cadd += atom;
	}
	if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
		struct bio *tmpbp;

		/*
		 * Calculate parity.
		 */
		bzero(bp->bio_data, bp->bio_length);
		G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
			if (cbp == bp)
				continue;
			g_raid3_xor(cbp->bio_data, bp->bio_data, bp->bio_data,
			    bp->bio_length);
			/* NODISK children only contribute to parity. */
			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_NODISK) != 0)
				g_raid3_destroy_bio(sc, cbp);
		}
	}
	/* Dispatch the remaining children to their disks. */
	G_RAID3_FOREACH_BIO(pbp, cbp) {
		struct g_consumer *cp;

		disk = cbp->bio_caller2;
		cp = disk->d_consumer;
		cbp->bio_to = cp->provider;
		G_RAID3_LOGREQ(3, cbp, "Sending request.");
		KASSERT(cp->acr > 0 && cp->ace > 0,
		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
		    cp->acr, cp->acw, cp->ace));
		g_io_request(cbp, cp);
	}
}
932 
933 static void
934 g_raid3_gather(struct bio *pbp)
935 {
936 	struct g_raid3_softc *sc;
937 	struct g_raid3_disk *disk;
938 	struct bio *xbp, *fbp, *cbp;
939 	off_t atom, cadd, padd, left;
940 
941 	sc = pbp->bio_to->geom->softc;
942 	/*
943 	 * Find bio for which we have to calculate data.
944 	 * While going through this path, check if all requests
945 	 * succeeded, if not, deny whole request.
946 	 * If we're in COMPLETE mode, we allow one request to fail,
947 	 * so if we find one, we're sending it to the parity consumer.
948 	 * If there are more failed requests, we deny whole request.
949 	 */
950 	xbp = fbp = NULL;
951 	G_RAID3_FOREACH_BIO(pbp, cbp) {
952 		if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
953 			KASSERT(xbp == NULL, ("More than one parity bio."));
954 			xbp = cbp;
955 		}
956 		if (cbp->bio_error == 0)
957 			continue;
958 		/*
959 		 * Found failed request.
960 		 */
961 		G_RAID3_LOGREQ(0, cbp, "Request failed.");
962 		disk = cbp->bio_caller2;
963 		if (disk != NULL) {
964 			/*
965 			 * Actually this is pointless to bump syncid,
966 			 * because whole device is fucked up.
967 			 */
968 			sc->sc_bump_syncid = G_RAID3_BUMP_IMMEDIATELY;
969 			g_raid3_event_send(disk,
970 			    G_RAID3_DISK_STATE_DISCONNECTED,
971 			    G_RAID3_EVENT_DONTWAIT);
972 		}
973 		if (fbp == NULL) {
974 			if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_DEGRADED) != 0) {
975 				/*
976 				 * We are already in degraded mode, so we can't
977 				 * accept any failures.
978 				 */
979 				if (pbp->bio_error == 0)
980 					pbp->bio_error = fbp->bio_error;
981 			} else {
982 				fbp = cbp;
983 			}
984 		} else {
985 			/*
986 			 * Next failed request, that's too many.
987 			 */
988 			if (pbp->bio_error == 0)
989 				pbp->bio_error = fbp->bio_error;
990 		}
991 	}
992 	if (pbp->bio_error != 0)
993 		goto finish;
994 	if (fbp != NULL && (pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
995 		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_VERIFY;
996 		if (xbp != fbp)
997 			g_raid3_replace_bio(xbp, fbp);
998 		g_raid3_destroy_bio(sc, fbp);
999 	} else if (fbp != NULL) {
1000 		struct g_consumer *cp;
1001 
1002 		/*
1003 		 * One request failed, so send the same request to
1004 		 * the parity consumer.
1005 		 */
1006 		disk = pbp->bio_driver2;
1007 		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
1008 			pbp->bio_error = fbp->bio_error;
1009 			goto finish;
1010 		}
1011 		pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
1012 		pbp->bio_inbed--;
1013 		fbp->bio_flags &= ~(BIO_DONE | BIO_ERROR);
1014 		if (disk->d_no == sc->sc_ndisks - 1)
1015 			fbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
1016 		fbp->bio_error = 0;
1017 		fbp->bio_completed = 0;
1018 		fbp->bio_children = 0;
1019 		fbp->bio_inbed = 0;
1020 		cp = disk->d_consumer;
1021 		fbp->bio_caller2 = disk;
1022 		fbp->bio_to = cp->provider;
1023 		G_RAID3_LOGREQ(3, fbp, "Sending request (recover).");
1024 		KASSERT(cp->acr > 0 && cp->ace > 0,
1025 		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
1026 		    cp->acr, cp->acw, cp->ace));
1027 		g_io_request(fbp, cp);
1028 		return;
1029 	}
1030 	if (xbp != NULL) {
1031 		/*
1032 		 * Calculate parity.
1033 		 */
1034 		G_RAID3_FOREACH_BIO(pbp, cbp) {
1035 			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0)
1036 				continue;
1037 			g_raid3_xor(cbp->bio_data, xbp->bio_data, xbp->bio_data,
1038 			    xbp->bio_length);
1039 		}
1040 		xbp->bio_cflags &= ~G_RAID3_BIO_CFLAG_PARITY;
1041 		if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
1042 			if (!g_raid3_is_zero(xbp)) {
1043 				g_raid3_parity_mismatch++;
1044 				pbp->bio_error = EIO;
1045 				goto finish;
1046 			}
1047 			g_raid3_destroy_bio(sc, xbp);
1048 		}
1049 	}
1050 	atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
1051 	cadd = padd = 0;
1052 	for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
1053 		G_RAID3_FOREACH_BIO(pbp, cbp) {
1054 			bcopy(cbp->bio_data + cadd, pbp->bio_data + padd, atom);
1055 			pbp->bio_completed += atom;
1056 			padd += atom;
1057 		}
1058 		cadd += atom;
1059 	}
1060 finish:
1061 	if (pbp->bio_error == 0)
1062 		G_RAID3_LOGREQ(3, pbp, "Request finished.");
1063 	else {
1064 		if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0)
1065 			G_RAID3_LOGREQ(1, pbp, "Verification error.");
1066 		else
1067 			G_RAID3_LOGREQ(0, pbp, "Request failed.");
1068 	}
1069 	pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_MASK;
1070 	g_io_deliver(pbp, pbp->bio_error);
1071 	while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
1072 		g_raid3_destroy_bio(sc, cbp);
1073 }
1074 
1075 static void
1076 g_raid3_done(struct bio *bp)
1077 {
1078 	struct g_raid3_softc *sc;
1079 
1080 	sc = bp->bio_from->geom->softc;
1081 	bp->bio_cflags |= G_RAID3_BIO_CFLAG_REGULAR;
1082 	G_RAID3_LOGREQ(3, bp, "Regular request done (error=%d).", bp->bio_error);
1083 	mtx_lock(&sc->sc_queue_mtx);
1084 	bioq_insert_head(&sc->sc_queue, bp);
1085 	wakeup(sc);
1086 	wakeup(&sc->sc_queue);
1087 	mtx_unlock(&sc->sc_queue_mtx);
1088 }
1089 
/*
 * Handle one completed child of a regular request (re-queued here by
 * g_raid3_done()).  When the last child arrives, finish the parent:
 * reads go through g_raid3_gather(), writes/deletes are error-checked
 * and delivered directly.
 */
static void
g_raid3_regular_request(struct bio *cbp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct bio *pbp;

	g_topology_assert_not();

	pbp = cbp->bio_parent;
	sc = pbp->bio_to->geom->softc;
	disk = cbp->bio_from->private;
	if (disk == NULL) {
		/* The disk is gone; drop its orphaned consumer. */
		g_topology_lock();
		g_raid3_kill_consumer(sc, cbp->bio_from);
		g_topology_unlock();
	}

	G_RAID3_LOGREQ(3, cbp, "Request finished.");
	pbp->bio_inbed++;
	KASSERT(pbp->bio_inbed <= pbp->bio_children,
	    ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed,
	    pbp->bio_children));
	/* Wait until every child of the parent request has completed. */
	if (pbp->bio_inbed != pbp->bio_children)
		return;
	switch (pbp->bio_cmd) {
	case BIO_READ:
		g_raid3_gather(pbp);
		break;
	case BIO_WRITE:
	case BIO_DELETE:
	    {
		int error = 0;

		pbp->bio_completed = pbp->bio_length;
		/* Collect errors and free all child bios. */
		while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) {
			if (cbp->bio_error != 0) {
				disk = cbp->bio_caller2;
				if (disk != NULL) {
					/* Disconnect the failing disk. */
					sc->sc_bump_syncid =
					    G_RAID3_BUMP_IMMEDIATELY;
					g_raid3_event_send(disk,
					    G_RAID3_DISK_STATE_DISCONNECTED,
					    G_RAID3_EVENT_DONTWAIT);
				}
				/* First failure is tolerable (degraded). */
				if (error == 0)
					error = cbp->bio_error;
				else if (pbp->bio_error == 0) {
					/*
					 * Next failed request, that's too many.
					 */
					pbp->bio_error = error;
				}
			}
			g_raid3_destroy_bio(sc, cbp);
		}
		if (pbp->bio_error == 0)
			G_RAID3_LOGREQ(3, pbp, "Request finished.");
		else
			G_RAID3_LOGREQ(0, pbp, "Request failed.");
		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_DEGRADED;
		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_NOPARITY;
		g_io_deliver(pbp, pbp->bio_error);
		break;
	    }
	}
}
1157 
1158 static void
1159 g_raid3_sync_done(struct bio *bp)
1160 {
1161 	struct g_raid3_softc *sc;
1162 
1163 	G_RAID3_LOGREQ(3, bp, "Synchronization request delivered.");
1164 	sc = bp->bio_from->geom->softc;
1165 	bp->bio_cflags |= G_RAID3_BIO_CFLAG_SYNC;
1166 	mtx_lock(&sc->sc_queue_mtx);
1167 	bioq_insert_head(&sc->sc_queue, bp);
1168 	wakeup(sc);
1169 	wakeup(&sc->sc_queue);
1170 	mtx_unlock(&sc->sc_queue_mtx);
1171 }
1172 
1173 static void
1174 g_raid3_start(struct bio *bp)
1175 {
1176 	struct g_raid3_softc *sc;
1177 
1178 	sc = bp->bio_to->geom->softc;
1179 	/*
1180 	 * If sc == NULL or there are no valid disks, provider's error
1181 	 * should be set and g_raid3_start() should not be called at all.
1182 	 */
1183 	KASSERT(sc != NULL && (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
1184 	    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE),
1185 	    ("Provider's error should be set (error=%d)(device=%s).",
1186 	    bp->bio_to->error, bp->bio_to->name));
1187 	G_RAID3_LOGREQ(3, bp, "Request received.");
1188 
1189 	switch (bp->bio_cmd) {
1190 	case BIO_READ:
1191 	case BIO_WRITE:
1192 	case BIO_DELETE:
1193 		break;
1194 	case BIO_GETATTR:
1195 	default:
1196 		g_io_deliver(bp, EOPNOTSUPP);
1197 		return;
1198 	}
1199 	mtx_lock(&sc->sc_queue_mtx);
1200 	bioq_insert_tail(&sc->sc_queue, bp);
1201 	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
1202 	wakeup(sc);
1203 	mtx_unlock(&sc->sc_queue_mtx);
1204 }
1205 
1206 /*
1207  * Send one synchronization request.
1208  */
1209 static void
1210 g_raid3_sync_one(struct g_raid3_softc *sc)
1211 {
1212 	struct g_raid3_disk *disk;
1213 	struct bio *bp;
1214 
1215 	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
1216 	    ("Wrong device state (%s, %s).", sc->sc_name,
1217 	    g_raid3_device_state2str(sc->sc_state)));
1218 	disk = sc->sc_syncdisk;
1219 	KASSERT(disk != NULL, ("No sync disk (%s).", sc->sc_name));
1220 	KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
1221 	    ("Disk %s is not marked for synchronization.",
1222 	    g_raid3_get_diskname(disk)));
1223 
1224 	bp = g_new_bio();
1225 	if (bp == NULL)
1226 		return;
1227 	bp->bio_parent = NULL;
1228 	bp->bio_cmd = BIO_READ;
1229 	bp->bio_offset = disk->d_sync.ds_offset * (sc->sc_ndisks - 1);
1230 	bp->bio_length = MIN(G_RAID3_MAX_IO_SIZE,
1231 	    sc->sc_mediasize - bp->bio_offset);
1232 	bp->bio_cflags = 0;
1233 	bp->bio_done = g_raid3_sync_done;
1234 	bp->bio_data = disk->d_sync.ds_data;
1235 	if (bp->bio_data == NULL) {
1236 		g_destroy_bio(bp);
1237 		return;
1238 	}
1239 	bp->bio_cflags = G_RAID3_BIO_CFLAG_REGSYNC;
1240 	disk->d_sync.ds_offset += bp->bio_length / (sc->sc_ndisks - 1);
1241 	bp->bio_to = sc->sc_provider;
1242 	G_RAID3_LOGREQ(3, bp, "Sending synchronization request.");
1243 	g_io_request(bp, disk->d_sync.ds_consumer);
1244 }
1245 
/*
 * Handle a completed synchronization bio re-queued by g_raid3_sync_done().
 * A finished sync READ (issued by g_raid3_sync_one()) is turned in place
 * into a WRITE of the reconstructed data to the synchronizing disk; a
 * finished WRITE advances the synchronization progress.
 */
static void
g_raid3_sync_request(struct bio *bp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;

	sc = bp->bio_from->geom->softc;
	disk = bp->bio_from->private;
	if (disk == NULL) {
		/* Disk is gone; drop the consumer and the request. */
		g_topology_lock();
		g_raid3_kill_consumer(sc, bp->bio_from);
		g_topology_unlock();
		g_destroy_bio(bp);
		return;
	}

	/*
	 * Synchronization request.
	 */
	switch (bp->bio_cmd) {
	case BIO_READ:
	    {
		struct g_consumer *cp;
		u_char *dst, *src;
		off_t left;
		u_int atom;

		if (bp->bio_error != 0) {
			G_RAID3_LOGREQ(0, bp,
			    "Synchronization request failed (error=%d).",
			    bp->bio_error);
			g_destroy_bio(bp);
			return;
		}
		G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
		/* Bytes each component contributes per device sector. */
		atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
		dst = src = bp->bio_data;
		if (disk->d_no == sc->sc_ndisks - 1) {
			u_int n;

			/* Parity component. */
			/* Parity is the XOR of all data atoms, in place. */
			for (left = bp->bio_length; left > 0;
			    left -= sc->sc_sectorsize) {
				bcopy(src, dst, atom);
				src += atom;
				for (n = 1; n < sc->sc_ndisks - 1; n++) {
					g_raid3_xor(src, dst, dst, atom);
					src += atom;
				}
				dst += atom;
			}
		} else {
			/* Regular component. */
			/* Compact this disk's atoms out of the full stripes. */
			src += atom * disk->d_no;
			for (left = bp->bio_length; left > 0;
			    left -= sc->sc_sectorsize) {
				bcopy(src, dst, atom);
				src += sc->sc_sectorsize;
				dst += atom;
			}
		}
		/* Scale device offset/length down to per-component units. */
		bp->bio_offset /= sc->sc_ndisks - 1;
		bp->bio_length /= sc->sc_ndisks - 1;
		/* Reuse the bio to write the data to the syncing disk. */
		bp->bio_cmd = BIO_WRITE;
		bp->bio_cflags = 0;
		bp->bio_children = bp->bio_inbed = 0;
		cp = disk->d_consumer;
		KASSERT(cp->acr == 0 && cp->acw == 1 && cp->ace == 1,
		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
		    cp->acr, cp->acw, cp->ace));
		g_io_request(bp, cp);
		return;
	    }
	case BIO_WRITE:
		if (bp->bio_error != 0) {
			G_RAID3_LOGREQ(0, bp,
			    "Synchronization request failed (error=%d).",
			    bp->bio_error);
			g_destroy_bio(bp);
			/* Write to the sync target failed: disconnect it. */
			sc->sc_bump_syncid = G_RAID3_BUMP_IMMEDIATELY;
			g_raid3_event_send(disk,
			    G_RAID3_DISK_STATE_DISCONNECTED,
			    G_RAID3_EVENT_DONTWAIT);
			return;
		}
		G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
		disk->d_sync.ds_offset_done = bp->bio_offset + bp->bio_length;
		g_destroy_bio(bp);
		if (disk->d_sync.ds_offset_done ==
		    sc->sc_mediasize / (sc->sc_ndisks - 1)) {
			/*
			 * Disk up-to-date, activate it.
			 */
			g_raid3_event_send(disk, G_RAID3_DISK_STATE_ACTIVE,
			    G_RAID3_EVENT_DONTWAIT);
			return;
		} else if ((disk->d_sync.ds_offset_done %
		    (G_RAID3_MAX_IO_SIZE * 100)) == 0) {
			/*
			 * Update offset_done on every 100 blocks.
			 * XXX: This should be configurable.
			 */
			g_topology_lock();
			g_raid3_update_metadata(disk);
			g_topology_unlock();
		}
		return;
	default:
		KASSERT(1 == 0, ("Invalid command here: %u (device=%s)",
		    bp->bio_cmd, sc->sc_name));
		break;
	}
}
1359 
/*
 * Split the parent request 'pbp' into per-component child requests and
 * dispatch them: reads are sent directly to the component consumers,
 * writes/deletes are handed to g_raid3_scatter().
 * Returns ENOMEM if child bios could not be allocated (caller re-queues
 * the request), 0 otherwise.
 */
static int
g_raid3_register_request(struct bio *pbp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct g_consumer *cp;
	struct bio *cbp;
	off_t offset, length;
	u_int n, ndisks;
	int round_robin, verify;

	ndisks = 0;
	sc = pbp->bio_to->geom->softc;
	/* A sync request cannot be served once the sync disk is gone. */
	if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGSYNC) != 0 &&
	    sc->sc_syncdisk == NULL) {
		g_io_deliver(pbp, EIO);
		return (0);
	}
	g_raid3_init_bio(pbp);
	/* Each component carries 1/(ndisks-1) of the data. */
	length = pbp->bio_length / (sc->sc_ndisks - 1);
	offset = pbp->bio_offset / (sc->sc_ndisks - 1);
	round_robin = verify = 0;
	switch (pbp->bio_cmd) {
	case BIO_READ:
		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
			/* VERIFY mode: also read parity to check the data. */
			pbp->bio_pflags |= G_RAID3_BIO_PFLAG_VERIFY;
			verify = 1;
			ndisks = sc->sc_ndisks;
		} else {
			verify = 0;
			ndisks = sc->sc_ndisks - 1;
		}
		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0 &&
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
			round_robin = 1;
		} else {
			round_robin = 0;
		}
		KASSERT(!round_robin || !verify,
		    ("ROUND-ROBIN and VERIFY are mutually exclusive."));
		/* Remember the parity disk for g_raid3_gather(). */
		pbp->bio_driver2 = &sc->sc_disks[sc->sc_ndisks - 1];
		break;
	case BIO_WRITE:
	case BIO_DELETE:
		/* Writes always touch every component, including parity. */
		ndisks = sc->sc_ndisks;
		break;
	}
	for (n = 0; n < ndisks; n++) {
		disk = &sc->sc_disks[n];
		cbp = g_raid3_clone_bio(sc, pbp);
		if (cbp == NULL) {
			/* Allocation failed: undo and let the caller retry. */
			while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
				g_raid3_destroy_bio(sc, cbp);
			return (ENOMEM);
		}
		cbp->bio_offset = offset;
		cbp->bio_length = length;
		cbp->bio_done = g_raid3_done;
		switch (pbp->bio_cmd) {
		case BIO_READ:
			if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
				/*
				 * Replace invalid component with the parity
				 * component.
				 */
				disk = &sc->sc_disks[sc->sc_ndisks - 1];
				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
				pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
			} else if (round_robin &&
			    disk->d_no == sc->sc_round_robin) {
				/*
				 * In round-robin mode skip one data component
				 * and use parity component when reading.
				 */
				pbp->bio_driver2 = disk;
				disk = &sc->sc_disks[sc->sc_ndisks - 1];
				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
				sc->sc_round_robin++;
				round_robin = 0;
			} else if (verify && disk->d_no == sc->sc_ndisks - 1) {
				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
			}
			break;
		case BIO_WRITE:
		case BIO_DELETE:
			if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
			    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
				if (n == ndisks - 1) {
					/*
					 * Active parity component, mark it as such.
					 */
					cbp->bio_cflags |=
					    G_RAID3_BIO_CFLAG_PARITY;
				}
			} else {
				pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
				if (n == ndisks - 1) {
					/*
					 * Parity component is not connected,
					 * so destroy its request.
					 */
					pbp->bio_pflags |=
					    G_RAID3_BIO_PFLAG_NOPARITY;
					g_raid3_destroy_bio(sc, cbp);
					cbp = NULL;
				} else {
					cbp->bio_cflags |=
					    G_RAID3_BIO_CFLAG_NODISK;
					disk = NULL;
				}
			}
			break;
		}
		if (cbp != NULL)
			cbp->bio_caller2 = disk;
	}
	switch (pbp->bio_cmd) {
	case BIO_READ:
		if (round_robin) {
			/*
			 * If we are in round-robin mode and 'round_robin' is
			 * still 1, it means, that we skipped parity component
			 * for this read and must reset sc_round_robin field.
			 */
			sc->sc_round_robin = 0;
		}
		G_RAID3_FOREACH_BIO(pbp, cbp) {
			disk = cbp->bio_caller2;
			cp = disk->d_consumer;
			cbp->bio_to = cp->provider;
			G_RAID3_LOGREQ(3, cbp, "Sending request.");
			KASSERT(cp->acr > 0 && cp->ace > 0,
			    ("Consumer %s not opened (r%dw%de%d).",
			    cp->provider->name, cp->acr, cp->acw, cp->ace));
			g_io_request(cbp, cp);
		}
		break;
	case BIO_WRITE:
	case BIO_DELETE:
		/*
		 * Bump syncid on first write.
		 */
		if (sc->sc_bump_syncid == G_RAID3_BUMP_ON_FIRST_WRITE) {
			sc->sc_bump_syncid = 0;
			g_topology_lock();
			g_raid3_bump_syncid(sc);
			g_topology_unlock();
		}
		g_raid3_scatter(pbp);
		break;
	}
	return (0);
}
1514 
1515 static int
1516 g_raid3_can_destroy(struct g_raid3_softc *sc)
1517 {
1518 	struct g_geom *gp;
1519 	struct g_consumer *cp;
1520 
1521 	g_topology_assert();
1522 	gp = sc->sc_geom;
1523 	LIST_FOREACH(cp, &gp->consumer, consumer) {
1524 		if (g_raid3_is_busy(sc, cp))
1525 			return (0);
1526 	}
1527 	gp = sc->sc_sync.ds_geom;
1528 	LIST_FOREACH(cp, &gp->consumer, consumer) {
1529 		if (g_raid3_is_busy(sc, cp))
1530 			return (0);
1531 	}
1532 	G_RAID3_DEBUG(2, "No I/O requests for %s, it can be destroyed.",
1533 	    sc->sc_name);
1534 	return (1);
1535 }
1536 
1537 static int
1538 g_raid3_try_destroy(struct g_raid3_softc *sc)
1539 {
1540 
1541 	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_WAIT) != 0) {
1542 		g_topology_lock();
1543 		if (!g_raid3_can_destroy(sc)) {
1544 			g_topology_unlock();
1545 			return (0);
1546 		}
1547 		g_topology_unlock();
1548 		G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
1549 		    &sc->sc_worker);
1550 		wakeup(&sc->sc_worker);
1551 		sc->sc_worker = NULL;
1552 	} else {
1553 		g_topology_lock();
1554 		if (!g_raid3_can_destroy(sc)) {
1555 			g_topology_unlock();
1556 			return (0);
1557 		}
1558 		g_raid3_destroy_device(sc);
1559 		g_topology_unlock();
1560 		free(sc->sc_disks, M_RAID3);
1561 		free(sc, M_RAID3);
1562 	}
1563 	return (1);
1564 }
1565 
1566 /*
1567  * Worker thread.
1568  */
1569 static void
1570 g_raid3_worker(void *arg)
1571 {
1572 	struct g_raid3_softc *sc;
1573 	struct g_raid3_disk *disk;
1574 	struct g_raid3_event *ep;
1575 	struct bio *bp;
1576 	u_int nreqs;
1577 
1578 	sc = arg;
1579 	curthread->td_base_pri = PRIBIO;
1580 
1581 	nreqs = 0;
1582 	for (;;) {
1583 		G_RAID3_DEBUG(5, "%s: Let's see...", __func__);
1584 		/*
1585 		 * First take a look at events.
1586 		 * This is important to handle events before any I/O requests.
1587 		 */
1588 		ep = g_raid3_event_get(sc);
1589 		if (ep != NULL) {
1590 			g_topology_lock();
1591 			if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) {
1592 				/* Update only device status. */
1593 				G_RAID3_DEBUG(3,
1594 				    "Running event for device %s.",
1595 				    sc->sc_name);
1596 				ep->e_error = 0;
1597 				g_raid3_update_device(sc, 1);
1598 			} else {
1599 				/* Update disk status. */
1600 				G_RAID3_DEBUG(3, "Running event for disk %s.",
1601 				     g_raid3_get_diskname(ep->e_disk));
1602 				ep->e_error = g_raid3_update_disk(ep->e_disk,
1603 				    ep->e_state);
1604 				if (ep->e_error == 0)
1605 					g_raid3_update_device(sc, 0);
1606 			}
1607 			g_topology_unlock();
1608 			if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) {
1609 				KASSERT(ep->e_error == 0,
1610 				    ("Error cannot be handled."));
1611 				g_raid3_event_free(ep);
1612 			} else {
1613 				ep->e_flags |= G_RAID3_EVENT_DONE;
1614 				G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
1615 				    ep);
1616 				mtx_lock(&sc->sc_events_mtx);
1617 				wakeup(ep);
1618 				mtx_unlock(&sc->sc_events_mtx);
1619 			}
1620 			if ((sc->sc_flags &
1621 			    G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
1622 				if (g_raid3_try_destroy(sc))
1623 					kthread_exit(0);
1624 			}
1625 			G_RAID3_DEBUG(5, "%s: I'm here 1.", __func__);
1626 			continue;
1627 		}
1628 		/*
1629 		 * Now I/O requests.
1630 		 */
1631 		/* Get first request from the queue. */
1632 		mtx_lock(&sc->sc_queue_mtx);
1633 		bp = bioq_first(&sc->sc_queue);
1634 		if (bp == NULL) {
1635 			if ((sc->sc_flags &
1636 			    G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
1637 				mtx_unlock(&sc->sc_queue_mtx);
1638 				if (g_raid3_try_destroy(sc))
1639 					kthread_exit(0);
1640 				mtx_lock(&sc->sc_queue_mtx);
1641 			}
1642 		}
1643 		if (sc->sc_syncdisk != NULL &&
1644 		    (bp == NULL || nreqs > g_raid3_reqs_per_sync)) {
1645 			mtx_unlock(&sc->sc_queue_mtx);
1646 			/*
1647 			 * It is time for synchronization...
1648 			 */
1649 			nreqs = 0;
1650 			disk = sc->sc_syncdisk;
1651 			if (disk->d_sync.ds_offset <
1652 			    sc->sc_mediasize / (sc->sc_ndisks - 1) &&
1653 			    disk->d_sync.ds_offset ==
1654 			    disk->d_sync.ds_offset_done) {
1655 				g_raid3_sync_one(sc);
1656 			}
1657 			G_RAID3_DEBUG(5, "%s: I'm here 2.", __func__);
1658 			goto sleep;
1659 		}
1660 		if (bp == NULL) {
1661 			MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:w1", 0);
1662 			G_RAID3_DEBUG(5, "%s: I'm here 3.", __func__);
1663 			continue;
1664 		}
1665 		nreqs++;
1666 		bioq_remove(&sc->sc_queue, bp);
1667 		mtx_unlock(&sc->sc_queue_mtx);
1668 
1669 		if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0) {
1670 			g_raid3_regular_request(bp);
1671 		} else if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0) {
1672 			u_int timeout, sps;
1673 
1674 			g_raid3_sync_request(bp);
1675 sleep:
1676 			sps = atomic_load_acq_int(&g_raid3_syncs_per_sec);
1677 			if (sps == 0) {
1678 				G_RAID3_DEBUG(5, "%s: I'm here 5.", __func__);
1679 				continue;
1680 			}
1681 			mtx_lock(&sc->sc_queue_mtx);
1682 			if (bioq_first(&sc->sc_queue) != NULL) {
1683 				mtx_unlock(&sc->sc_queue_mtx);
1684 				G_RAID3_DEBUG(5, "%s: I'm here 4.", __func__);
1685 				continue;
1686 			}
1687 			timeout = hz / sps;
1688 			if (timeout == 0)
1689 				timeout = 1;
1690 			MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:w2",
1691 			    timeout);
1692 		} else {
1693 			if (g_raid3_register_request(bp) != 0) {
1694 				mtx_lock(&sc->sc_queue_mtx);
1695 				bioq_insert_tail(&sc->sc_queue, bp);
1696 				MSLEEP(&sc->sc_queue, &sc->sc_queue_mtx,
1697 				    PRIBIO | PDROP, "r3:lowmem", hz / 10);
1698 			}
1699 		}
1700 		G_RAID3_DEBUG(5, "%s: I'm here 6.", __func__);
1701 	}
1702 }
1703 
1704 /*
1705  * Open disk's consumer if needed.
1706  */
1707 static void
1708 g_raid3_update_access(struct g_raid3_disk *disk)
1709 {
1710 	struct g_provider *pp;
1711 	struct g_consumer *cp;
1712 	int acr, acw, ace, cpw, error;
1713 
1714 	g_topology_assert();
1715 
1716 	cp = disk->d_consumer;
1717 	pp = disk->d_softc->sc_provider;
1718 	if (pp == NULL) {
1719 		acr = -cp->acr;
1720 		acw = -cp->acw;
1721 		ace = -cp->ace;
1722 	} else {
1723 		acr = pp->acr - cp->acr;
1724 		acw = pp->acw - cp->acw;
1725 		ace = pp->ace - cp->ace;
1726 		/* Grab an extra "exclusive" bit. */
1727 		if (pp->acr > 0 || pp->acw > 0 || pp->ace > 0)
1728 			ace++;
1729 	}
1730 	if (acr == 0 && acw == 0 && ace == 0)
1731 		return;
1732 	cpw = cp->acw;
1733 	error = g_access(cp, acr, acw, ace);
1734 	G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", cp->provider->name, acr,
1735 	    acw, ace, error);
1736 	if (error != 0) {
1737 		disk->d_softc->sc_bump_syncid = G_RAID3_BUMP_ON_FIRST_WRITE;
1738 		g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
1739 		    G_RAID3_EVENT_DONTWAIT);
1740 		return;
1741 	}
1742 	if (cpw == 0 && cp->acw > 0) {
1743 		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
1744 		    g_raid3_get_diskname(disk), disk->d_softc->sc_name);
1745 		disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
1746 	} else if (cpw > 0 && cp->acw == 0) {
1747 		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
1748 		    g_raid3_get_diskname(disk), disk->d_softc->sc_name);
1749 		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
1750 	}
1751 }
1752 
/*
 * Start synchronization: find the first disk in SYNCHRONIZING state, open
 * its consumer for writing and attach a dedicated read consumer to the
 * device's own provider (so reads reconstruct the missing data).
 */
static void
g_raid3_sync_start(struct g_raid3_softc *sc)
{
	struct g_raid3_disk *disk;
	struct g_consumer *cp;
	int error;
	u_int n;

	g_topology_assert();

	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
	    ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
	    sc->sc_state));
	KASSERT(sc->sc_syncdisk == NULL, ("Syncdisk is not NULL (%s, %u).",
	    sc->sc_name, sc->sc_state));
	disk = NULL;
	for (n = 0; n < sc->sc_ndisks; n++) {
		if (sc->sc_disks[n].d_state != G_RAID3_DISK_STATE_SYNCHRONIZING)
			continue;
		disk = &sc->sc_disks[n];
		break;
	}
	/* No disk needs synchronization. */
	if (disk == NULL)
		return;
	cp = disk->d_consumer;
	KASSERT(cp->acr == 0 && cp->acw == 0 && cp->ace == 0,
	    ("Consumer %s already opened.", cp->provider->name));

	G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name,
	    g_raid3_get_diskname(disk));
	/* Open the target disk for writing. */
	error = g_access(cp, 0, 1, 1);
	G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", cp->provider->name, 0, 1,
	    1, error);
	if (error != 0) {
		g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
		    G_RAID3_EVENT_DONTWAIT);
		return;
	}
	disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
	KASSERT(disk->d_sync.ds_consumer == NULL,
	    ("Sync consumer already exists (device=%s, disk=%s).",
	    sc->sc_name, g_raid3_get_diskname(disk)));
	/* Read reconstructed data through the device's own provider. */
	disk->d_sync.ds_consumer = g_new_consumer(sc->sc_sync.ds_geom);
	disk->d_sync.ds_consumer->private = disk;
	error = g_attach(disk->d_sync.ds_consumer, disk->d_softc->sc_provider);
	KASSERT(error == 0, ("Cannot attach to %s (error=%d).",
	    disk->d_softc->sc_name, error));
	error = g_access(disk->d_sync.ds_consumer, 1, 0, 0);
	KASSERT(error == 0, ("Cannot open %s (error=%d).",
	    disk->d_softc->sc_name, error));
	disk->d_sync.ds_data = malloc(G_RAID3_MAX_IO_SIZE, M_RAID3, M_WAITOK);
	sc->sc_syncdisk = disk;
}
1806 
1807 /*
1808  * Stop synchronization process.
1809  * type: 0 - synchronization finished
1810  *       1 - synchronization stopped
1811  */
1812 static void
1813 g_raid3_sync_stop(struct g_raid3_softc *sc, int type)
1814 {
1815 	struct g_raid3_disk *disk;
1816 	struct g_consumer *cp;
1817 
1818 	g_topology_assert();
1819 	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
1820 	    ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
1821 	    sc->sc_state));
1822 	disk = sc->sc_syncdisk;
1823 	sc->sc_syncdisk = NULL;
1824 	KASSERT(disk != NULL, ("No disk was synchronized (%s).", sc->sc_name));
1825 	KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
1826 	    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
1827 	    g_raid3_disk_state2str(disk->d_state)));
1828 	if (disk->d_sync.ds_consumer == NULL)
1829 		return;
1830 
1831 	if (type == 0) {
1832 		G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s finished.",
1833 		    disk->d_softc->sc_name, g_raid3_get_diskname(disk));
1834 	} else /* if (type == 1) */ {
1835 		G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s stopped.",
1836 		    disk->d_softc->sc_name, g_raid3_get_diskname(disk));
1837 	}
1838 	cp = disk->d_sync.ds_consumer;
1839 	g_access(cp, -1, 0, 0);
1840 	g_raid3_kill_consumer(disk->d_softc, cp);
1841 	free(disk->d_sync.ds_data, M_RAID3);
1842 	disk->d_sync.ds_consumer = NULL;
1843 	cp = disk->d_consumer;
1844 	KASSERT(cp->acr == 0 && cp->acw == 1 && cp->ace == 1,
1845 	    ("Consumer %s not opened.", cp->provider->name));
1846 	g_access(cp, 0, -1, -1);
1847 	G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", cp->provider->name, 0, -1,
1848 	    -1, 0);
1849 	disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
1850 }
1851 
1852 static void
1853 g_raid3_launch_provider(struct g_raid3_softc *sc)
1854 {
1855 	struct g_provider *pp;
1856 
1857 	g_topology_assert();
1858 
1859 	pp = g_new_providerf(sc->sc_geom, "raid3/%s", sc->sc_name);
1860 	pp->mediasize = sc->sc_mediasize;
1861 	pp->sectorsize = sc->sc_sectorsize;
1862 	sc->sc_provider = pp;
1863 	g_error_provider(pp, 0);
1864 	G_RAID3_DEBUG(0, "Device %s: provider %s launched.", sc->sc_name,
1865 	    pp->name);
1866 	if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED)
1867 		g_raid3_sync_start(sc);
1868 }
1869 
/*
 * Destroy the device's provider: fail all queued I/O with ENXIO, orphan
 * the provider and stop any synchronization in progress.
 */
static void
g_raid3_destroy_provider(struct g_raid3_softc *sc)
{
	struct bio *bp;

	g_topology_assert();
	KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).",
	    sc->sc_name));

	g_error_provider(sc->sc_provider, ENXIO);
	/* Drain the queue, failing every pending request. */
	mtx_lock(&sc->sc_queue_mtx);
	while ((bp = bioq_first(&sc->sc_queue)) != NULL) {
		bioq_remove(&sc->sc_queue, bp);
		g_io_deliver(bp, ENXIO);
	}
	mtx_unlock(&sc->sc_queue_mtx);
	G_RAID3_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name,
	    sc->sc_provider->name);
	sc->sc_provider->flags |= G_PF_WITHER;
	g_orphan_provider(sc->sc_provider, ENXIO);
	sc->sc_provider = NULL;
	if (sc->sc_syncdisk != NULL)
		g_raid3_sync_stop(sc, 1);
}
1894 
1895 static void
1896 g_raid3_go(void *arg)
1897 {
1898 	struct g_raid3_softc *sc;
1899 
1900 	sc = arg;
1901 	G_RAID3_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name);
1902 	g_raid3_event_send(sc, 0,
1903 	    G_RAID3_EVENT_DONTWAIT | G_RAID3_EVENT_DEVICE);
1904 }
1905 
/*
 * Decide which state a newly arrived disk should enter, by comparing its
 * on-disk syncid with the device's current syncid.  Returns the chosen
 * G_RAID3_DISK_STATE_* value; when the disk is fresher than the running
 * device it is destroyed and STATE_NONE is returned.
 */
static u_int
g_raid3_determine_state(struct g_raid3_disk *disk)
{
	struct g_raid3_softc *sc;
	u_int state;

	sc = disk->d_softc;
	if (sc->sc_syncid == disk->d_sync.ds_syncid) {
		if ((disk->d_flags &
		    G_RAID3_DISK_FLAG_SYNCHRONIZING) == 0) {
			/* Disk does not need synchronization. */
			state = G_RAID3_DISK_STATE_ACTIVE;
		} else {
			if ((sc->sc_flags &
			     G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0  ||
			    (disk->d_flags &
			     G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
				/*
				 * We can start synchronization from
				 * the stored offset.
				 */
				state = G_RAID3_DISK_STATE_SYNCHRONIZING;
			} else {
				state = G_RAID3_DISK_STATE_STALE;
			}
		}
	} else if (disk->d_sync.ds_syncid < sc->sc_syncid) {
		/*
		 * Reset all synchronization data for this disk,
		 * because if it even was synchronized, it was
		 * synchronized to disks with different syncid.
		 */
		disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
		disk->d_sync.ds_offset = 0;
		disk->d_sync.ds_offset_done = 0;
		disk->d_sync.ds_syncid = sc->sc_syncid;
		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
		    (disk->d_flags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
			state = G_RAID3_DISK_STATE_SYNCHRONIZING;
		} else {
			state = G_RAID3_DISK_STATE_STALE;
		}
	} else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ {
		/*
		 * Not good, NOT GOOD!
		 * The device was started from stale disks and a fresher
		 * disk has just arrived.  If any writes happened in the
		 * meantime, the data is inconsistent.  The safest choice
		 * is to leave this disk untouched and inform the user
		 * loudly.
		 */
		G_RAID3_DEBUG(0, "Device %s was started before the freshest "
		    "disk (%s) arrives!! It will not be connected to the "
		    "running device.", sc->sc_name,
		    g_raid3_get_diskname(disk));
		g_raid3_destroy_disk(disk);
		state = G_RAID3_DISK_STATE_NONE;
		/* Return immediately, because disk was destroyed. */
		return (state);
	}
	G_RAID3_DEBUG(3, "State for %s disk: %s.",
	    g_raid3_get_diskname(disk), g_raid3_disk_state2str(state));
	return (state);
}
1970 
1971 /*
1972  * Update device state.
1973  */
1974 static void
1975 g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force)
1976 {
1977 	struct g_raid3_disk *disk;
1978 	u_int state;
1979 
1980 	g_topology_assert();
1981 
1982 	switch (sc->sc_state) {
1983 	case G_RAID3_DEVICE_STATE_STARTING:
1984 	    {
1985 		u_int n, ndirty, ndisks, syncid;
1986 
1987 		KASSERT(sc->sc_provider == NULL,
1988 		    ("Non-NULL provider in STARTING state (%s).", sc->sc_name));
1989 		/*
1990 		 * Are we ready? We are, if all disks are connected or
1991 		 * one disk is missing and 'force' is true.
1992 		 */
1993 		if (g_raid3_ndisks(sc, -1) + force == sc->sc_ndisks) {
1994 			if (!force)
1995 				callout_drain(&sc->sc_callout);
1996 		} else {
1997 			if (force) {
1998 				/*
1999 				 * Timeout expired, so destroy device.
2000 				 */
2001 				sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
2002 			}
2003 			return;
2004 		}
2005 
2006 		/*
2007 		 * There must be at least 'sc->sc_ndisks - 1' components
2008 		 * with the same syncid and without SYNCHRONIZING flag.
2009 		 */
2010 
2011 		/*
2012 		 * Find the biggest syncid, number of valid components and
2013 		 * number of dirty components.
2014 		 */
2015 		ndirty = ndisks = syncid = 0;
2016 		for (n = 0; n < sc->sc_ndisks; n++) {
2017 			disk = &sc->sc_disks[n];
2018 			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
2019 				continue;
2020 			if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0)
2021 				ndirty++;
2022 			if (disk->d_sync.ds_syncid > syncid) {
2023 				syncid = disk->d_sync.ds_syncid;
2024 				ndisks = 0;
2025 			} else if (disk->d_sync.ds_syncid < syncid) {
2026 				continue;
2027 			}
2028 			if ((disk->d_flags &
2029 			    G_RAID3_DISK_FLAG_SYNCHRONIZING) != 0) {
2030 				continue;
2031 			}
2032 			ndisks++;
2033 		}
2034 		/*
2035 		 * Do we have enough valid components?
2036 		 */
2037 		if (ndisks + 1 < sc->sc_ndisks) {
2038 			G_RAID3_DEBUG(0,
2039 			    "Device %s is broken, too few valid components.",
2040 			    sc->sc_name);
2041 			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
2042 			return;
2043 		}
2044 		/*
2045 		 * If there is one DIRTY component and all disks are present,
2046 		 * mark it for synchronization. If there is more than one DIRTY
2047 		 * component, mark parity component for synchronization.
2048 		 */
2049 		if (ndisks == sc->sc_ndisks && ndirty == 1) {
2050 			for (n = 0; n < sc->sc_ndisks; n++) {
2051 				disk = &sc->sc_disks[n];
2052 				if ((disk->d_flags &
2053 				    G_RAID3_DISK_FLAG_DIRTY) == 0) {
2054 					continue;
2055 				}
2056 				disk->d_flags |=
2057 				    G_RAID3_DISK_FLAG_SYNCHRONIZING;
2058 			}
2059 		} else if (ndisks == sc->sc_ndisks && ndirty > 1) {
2060 			disk = &sc->sc_disks[sc->sc_ndisks - 1];
2061 			disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
2062 		}
2063 
2064 		sc->sc_syncid = syncid;
2065 		if (force) {
2066 			/* Remember to bump syncid on first write. */
2067 			sc->sc_bump_syncid = G_RAID3_BUMP_ON_FIRST_WRITE;
2068 		}
2069 		if (ndisks == sc->sc_ndisks)
2070 			state = G_RAID3_DEVICE_STATE_COMPLETE;
2071 		else /* if (ndisks == sc->sc_ndisks - 1) */
2072 			state = G_RAID3_DEVICE_STATE_DEGRADED;
2073 		G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.",
2074 		    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
2075 		    g_raid3_device_state2str(state));
2076 		sc->sc_state = state;
2077 		for (n = 0; n < sc->sc_ndisks; n++) {
2078 			disk = &sc->sc_disks[n];
2079 			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
2080 				continue;
2081 			state = g_raid3_determine_state(disk);
2082 			g_raid3_event_send(disk, state, G_RAID3_EVENT_DONTWAIT);
2083 			if (state == G_RAID3_DISK_STATE_STALE) {
2084 				sc->sc_bump_syncid =
2085 				    G_RAID3_BUMP_ON_FIRST_WRITE;
2086 			}
2087 		}
2088 		break;
2089 	    }
2090 	case G_RAID3_DEVICE_STATE_DEGRADED:
2091 		/*
2092 		 * Bump syncid here, if we need to do it immediately.
2093 		 */
2094 		if (sc->sc_bump_syncid == G_RAID3_BUMP_IMMEDIATELY) {
2095 			sc->sc_bump_syncid = 0;
2096 			g_raid3_bump_syncid(sc);
2097 		}
2098 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
2099 			return;
2100 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) <
2101 		    sc->sc_ndisks - 1) {
2102 			if (sc->sc_provider != NULL)
2103 				g_raid3_destroy_provider(sc);
2104 			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
2105 			return;
2106 		}
2107 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
2108 		    sc->sc_ndisks) {
2109 			state = G_RAID3_DEVICE_STATE_COMPLETE;
2110 			G_RAID3_DEBUG(1,
2111 			    "Device %s state changed from %s to %s.",
2112 			    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
2113 			    g_raid3_device_state2str(state));
2114 			sc->sc_state = state;
2115 		}
2116 		if (sc->sc_provider == NULL)
2117 			g_raid3_launch_provider(sc);
2118 		break;
2119 	case G_RAID3_DEVICE_STATE_COMPLETE:
2120 		/*
2121 		 * Bump syncid here, if we need to do it immediately.
2122 		 */
2123 		if (sc->sc_bump_syncid == G_RAID3_BUMP_IMMEDIATELY) {
2124 			sc->sc_bump_syncid = 0;
2125 			g_raid3_bump_syncid(sc);
2126 		}
2127 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
2128 			return;
2129 		KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) >=
2130 		    sc->sc_ndisks - 1,
2131 		    ("Too few ACTIVE components in COMPLETE state (device %s).",
2132 		    sc->sc_name));
2133 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
2134 		    sc->sc_ndisks - 1) {
2135 			state = G_RAID3_DEVICE_STATE_DEGRADED;
2136 			G_RAID3_DEBUG(1,
2137 			    "Device %s state changed from %s to %s.",
2138 			    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
2139 			    g_raid3_device_state2str(state));
2140 			sc->sc_state = state;
2141 		}
2142 		if (sc->sc_provider == NULL)
2143 			g_raid3_launch_provider(sc);
2144 		break;
2145 	default:
2146 		KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name,
2147 		    g_raid3_device_state2str(sc->sc_state)));
2148 		break;
2149 	}
2150 }
2151 
2152 /*
2153  * Update disk state and device state if needed.
2154  */
#define	DISK_STATE_CHANGED()	G_RAID3_DEBUG(1,			\
	"Disk %s state changed from %s to %s (device %s).",		\
	g_raid3_get_diskname(disk),					\
	g_raid3_disk_state2str(disk->d_state),				\
	g_raid3_disk_state2str(state), sc->sc_name)
/*
 * Move the given disk to the requested state, performing the side
 * effects of the transition (flag updates, metadata writes, starting or
 * stopping synchronization, disconnecting the disk).  Called with the
 * topology lock held.  One transition may immediately require another
 * (e.g. NEW -> ACTIVE); that is handled by the 'again' loop.
 * Always returns 0.
 */
static int
g_raid3_update_disk(struct g_raid3_disk *disk, u_int state)
{
	struct g_raid3_softc *sc;

	g_topology_assert();

	sc = disk->d_softc;
again:
	G_RAID3_DEBUG(3, "Changing disk %s state from %s to %s.",
	    g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state),
	    g_raid3_disk_state2str(state));
	switch (state) {
	case G_RAID3_DISK_STATE_NEW:
		/*
		 * Possible scenarios:
		 * 1. New disk arrive.
		 */
		/* Previous state should be NONE. */
		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NONE,
		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		DISK_STATE_CHANGED();

		disk->d_state = state;
		G_RAID3_DEBUG(0, "Device %s: provider %s detected.",
		    sc->sc_name, g_raid3_get_diskname(disk));
		/*
		 * While the device is still STARTING, the disk stays NEW; the
		 * device state machine will pick it up when the device starts.
		 */
		if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING)
			break;
		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		/* Running device: immediately decide the disk's final state. */
		state = g_raid3_determine_state(disk);
		if (state != G_RAID3_DISK_STATE_NONE)
			goto again;
		break;
	case G_RAID3_DISK_STATE_ACTIVE:
		/*
		 * Possible scenarios:
		 * 1. New disk does not need synchronization.
		 * 2. Synchronization process finished successfully.
		 */
		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		/* Previous state should be NEW or SYNCHRONIZING. */
		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW ||
		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		DISK_STATE_CHANGED();

		if (disk->d_state == G_RAID3_DISK_STATE_NEW)
			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
		else if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
			/* Synchronization is done; clear its bookkeeping. */
			disk->d_flags &= ~G_RAID3_DISK_FLAG_SYNCHRONIZING;
			disk->d_flags &= ~G_RAID3_DISK_FLAG_FORCE_SYNC;
			g_raid3_sync_stop(sc, 0);
		}
		disk->d_state = state;
		/* Reset synchronization progress counters. */
		disk->d_sync.ds_offset = 0;
		disk->d_sync.ds_offset_done = 0;
		g_raid3_update_access(disk);
		g_raid3_update_metadata(disk);
		G_RAID3_DEBUG(0, "Device %s: provider %s activated.",
		    sc->sc_name, g_raid3_get_diskname(disk));
		break;
	case G_RAID3_DISK_STATE_STALE:
		/*
		 * Possible scenarios:
		 * 1. Stale disk was connected.
		 */
		/* Previous state should be NEW. */
		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		/*
		 * STALE state is only possible if device is marked
		 * NOAUTOSYNC.
		 */
		KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		DISK_STATE_CHANGED();

		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
		disk->d_state = state;
		g_raid3_update_metadata(disk);
		G_RAID3_DEBUG(0, "Device %s: provider %s is stale.",
		    sc->sc_name, g_raid3_get_diskname(disk));
		break;
	case G_RAID3_DISK_STATE_SYNCHRONIZING:
		/*
		 * Possible scenarios:
		 * 1. Disk which needs synchronization was connected.
		 */
		/* Previous state should be NEW. */
		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		DISK_STATE_CHANGED();

		if (disk->d_state == G_RAID3_DISK_STATE_NEW)
			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
		disk->d_state = state;
		/* Synchronization can only run once the provider exists. */
		if (sc->sc_provider != NULL) {
			g_raid3_sync_start(sc);
			g_raid3_update_metadata(disk);
		}
		break;
	case G_RAID3_DISK_STATE_DISCONNECTED:
		/*
		 * Possible scenarios:
		 * 1. Device wasn't running yet, but disk disappear.
		 * 2. Disk was active and disapppear.
		 * 3. Disk disappear during synchronization process.
		 */
		if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
			/*
			 * Previous state should be ACTIVE, STALE or
			 * SYNCHRONIZING.
			 */
			KASSERT(disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
			    disk->d_state == G_RAID3_DISK_STATE_STALE ||
			    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
			    ("Wrong disk state (%s, %s).",
			    g_raid3_get_diskname(disk),
			    g_raid3_disk_state2str(disk->d_state)));
		} else if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) {
			/* Previous state should be NEW. */
			KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
			    ("Wrong disk state (%s, %s).",
			    g_raid3_get_diskname(disk),
			    g_raid3_disk_state2str(disk->d_state)));
			/*
			 * Reset bumping syncid if disk disappeared in STARTING
			 * state.
			 */
			if (sc->sc_bump_syncid == G_RAID3_BUMP_ON_FIRST_WRITE)
				sc->sc_bump_syncid = 0;
#ifdef	INVARIANTS
		} else {
			KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).",
			    sc->sc_name,
			    g_raid3_device_state2str(sc->sc_state),
			    g_raid3_get_diskname(disk),
			    g_raid3_disk_state2str(disk->d_state)));
#endif
		}
		DISK_STATE_CHANGED();
		G_RAID3_DEBUG(0, "Device %s: provider %s disconnected.",
		    sc->sc_name, g_raid3_get_diskname(disk));

		g_raid3_destroy_disk(disk);
		break;
	default:
		KASSERT(1 == 0, ("Unknown state (%u).", state));
		break;
	}
	return (0);
}
#undef	DISK_STATE_CHANGED
2343 
2344 static int
2345 g_raid3_read_metadata(struct g_consumer *cp, struct g_raid3_metadata *md)
2346 {
2347 	struct g_provider *pp;
2348 	u_char *buf;
2349 	int error;
2350 
2351 	g_topology_assert();
2352 
2353 	error = g_access(cp, 1, 0, 0);
2354 	if (error != 0)
2355 		return (error);
2356 	pp = cp->provider;
2357 	g_topology_unlock();
2358 	/* Metadata are stored on last sector. */
2359 	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
2360 	    &error);
2361 	g_topology_lock();
2362 	if (buf == NULL) {
2363 		g_access(cp, -1, 0, 0);
2364 		return (error);
2365 	}
2366 	if (error != 0) {
2367 		g_access(cp, -1, 0, 0);
2368 		g_free(buf);
2369 		return (error);
2370 	}
2371 	error = g_access(cp, -1, 0, 0);
2372 	KASSERT(error == 0, ("Cannot decrease access count for %s.", pp->name));
2373 
2374 	/* Decode metadata. */
2375 	error = raid3_metadata_decode(buf, md);
2376 	g_free(buf);
2377 	if (strcmp(md->md_magic, G_RAID3_MAGIC) != 0)
2378 		return (EINVAL);
2379 	if (error != 0) {
2380 		G_RAID3_DEBUG(1, "MD5 metadata hash mismatch for provider %s.",
2381 		    cp->provider->name);
2382 		return (error);
2383 	}
2384 
2385 	return (0);
2386 }
2387 
/*
 * Validate the metadata read from a candidate component against the
 * already configured device 'sc'.  Returns 0 if the component may be
 * added, EEXIST/EINVAL otherwise; a debug message states the reason.
 */
static int
g_raid3_check_metadata(struct g_raid3_softc *sc, struct g_provider *pp,
    struct g_raid3_metadata *md)
{

	/* The component number has to fit into the configured array. */
	if (md->md_no >= sc->sc_ndisks) {
		G_RAID3_DEBUG(1, "Invalid disk %s number (no=%u), skipping.",
		    pp->name, md->md_no);
		return (EINVAL);
	}
	/* The slot for this component number must still be empty. */
	if (sc->sc_disks[md->md_no].d_state != G_RAID3_DISK_STATE_NODISK) {
		G_RAID3_DEBUG(1, "Disk %s (no=%u) already exists, skipping.",
		    pp->name, md->md_no);
		return (EEXIST);
	}
	/* Component count must match the device's. */
	if (md->md_all != sc->sc_ndisks) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_all", pp->name, sc->sc_name);
		return (EINVAL);
	}
	/* Media size must match the device's ... */
	if (md->md_mediasize != sc->sc_mediasize) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_mediasize", pp->name, sc->sc_name);
		return (EINVAL);
	}
	/* ... and be evenly divisible among the data components. */
	if ((md->md_mediasize % (sc->sc_ndisks - 1)) != 0) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_mediasize", pp->name, sc->sc_name);
		return (EINVAL);
	}
	/* The provider must be large enough to hold its share of the data. */
	if ((sc->sc_mediasize / (sc->sc_ndisks - 1)) > pp->mediasize) {
		G_RAID3_DEBUG(1,
		    "Invalid size of disk %s (device %s), skipping.", pp->name,
		    sc->sc_name);
		return (EINVAL);
	}
	/*
	 * The device sector size must span at least one provider sector
	 * per data component.
	 */
	if ((md->md_sectorsize / pp->sectorsize) < sc->sc_ndisks - 1) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_sectorsize", pp->name, sc->sc_name);
		return (EINVAL);
	}
	/* Sector size must match the device's ... */
	if (md->md_sectorsize != sc->sc_sectorsize) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_sectorsize", pp->name, sc->sc_name);
		return (EINVAL);
	}
	/* ... and be a multiple of the provider's sector size. */
	if ((sc->sc_sectorsize % pp->sectorsize) != 0) {
		G_RAID3_DEBUG(1,
		    "Invalid sector size of disk %s (device %s), skipping.",
		    pp->name, sc->sc_name);
		return (EINVAL);
	}
	/* Only known device flags are accepted. */
	if ((md->md_mflags & ~G_RAID3_DEVICE_FLAG_MASK) != 0) {
		G_RAID3_DEBUG(1,
		    "Invalid device flags on disk %s (device %s), skipping.",
		    pp->name, sc->sc_name);
		return (EINVAL);
	}
	if ((md->md_mflags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
	    (md->md_mflags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0) {
		/*
		 * VERIFY and ROUND-ROBIN options are mutally exclusive.
		 */
		G_RAID3_DEBUG(1, "Both VERIFY and ROUND-ROBIN flags exist on "
		    "disk %s (device %s), skipping.", pp->name, sc->sc_name);
		return (EINVAL);
	}
	/* Only known disk flags are accepted. */
	if ((md->md_dflags & ~G_RAID3_DISK_FLAG_MASK) != 0) {
		G_RAID3_DEBUG(1,
		    "Invalid disk flags on disk %s (device %s), skipping.",
		    pp->name, sc->sc_name);
		return (EINVAL);
	}
	return (0);
}
2468 
2469 static int
2470 g_raid3_add_disk(struct g_raid3_softc *sc, struct g_provider *pp,
2471     struct g_raid3_metadata *md)
2472 {
2473 	struct g_raid3_disk *disk;
2474 	int error;
2475 
2476 	g_topology_assert();
2477 	G_RAID3_DEBUG(2, "Adding disk %s.", pp->name);
2478 
2479 	error = g_raid3_check_metadata(sc, pp, md);
2480 	if (error != 0)
2481 		return (error);
2482 	disk = g_raid3_init_disk(sc, pp, md, &error);
2483 	if (disk == NULL)
2484 		return (error);
2485 	error = g_raid3_event_send(disk, G_RAID3_DISK_STATE_NEW,
2486 	    G_RAID3_EVENT_WAIT);
2487 	return (error);
2488 }
2489 
/*
 * GEOM access method for the RAID3 provider.  Forwards the access-count
 * deltas (acr/acw/ace) to the consumer of every ACTIVE component and
 * maintains the per-component DIRTY flag across open/close of the
 * provider for writing.  Returns 0 if at least one component accepted
 * the request, ENXIO otherwise.
 */
static int
g_raid3_access(struct g_provider *pp, int acr, int acw, int ace)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	int dcr, dcw, dce, err, error;
	u_int n;

	g_topology_assert();
	G_RAID3_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr,
	    acw, ace);

	/* Access counts the provider will have after this request. */
	dcr = pp->acr + acr;
	dcw = pp->acw + acw;
	dce = pp->ace + ace;

	/* On first open, grab an extra "exclusive" bit */
	if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0)
		ace++;
	/* ... and let go of it on last close */
	if (dcr == 0 && dcw == 0 && dce == 0)
		ace--;

	sc = pp->geom->softc;
	/*
	 * Without a usable device (no softc, or too few ACTIVE components
	 * for RAID3 to operate), only pure "close" requests may succeed.
	 */
	if (sc == NULL ||
	    g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks - 1) {
		if (acr <= 0 && acw <= 0 && ace <= 0)
			return (0);
		else
			return (ENXIO);
	}
	error = ENXIO;
	for (n = 0; n < sc->sc_ndisks; n++) {
		disk = &sc->sc_disks[n];
		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
			continue;
		err = g_access(disk->d_consumer, acr, acw, ace);
		G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d",
		    g_raid3_get_diskname(disk), acr, acw, ace, err);
		if (err == 0) {
			/*
			 * Mark disk as dirty on open and unmark on close.
			 */
			if (pp->acw == 0 && dcw > 0) {
				G_RAID3_DEBUG(1,
				    "Disk %s (device %s) marked as dirty.",
				    g_raid3_get_diskname(disk), sc->sc_name);
				disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
				g_raid3_update_metadata(disk);
			} else if (pp->acw > 0 && dcw == 0) {
				G_RAID3_DEBUG(1,
				    "Disk %s (device %s) marked as clean.",
				    g_raid3_get_diskname(disk), sc->sc_name);
				disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
				g_raid3_update_metadata(disk);
			}
			error = 0;
		} else {
			/*
			 * This component refused the request: schedule a
			 * syncid bump and disconnect it from the device.
			 */
			sc->sc_bump_syncid = G_RAID3_BUMP_ON_FIRST_WRITE;
			g_raid3_event_send(disk,
			    G_RAID3_DISK_STATE_DISCONNECTED,
			    G_RAID3_EVENT_DONTWAIT);
		}
	}
	return (error);
}
2556 
2557 static struct g_geom *
2558 g_raid3_create(struct g_class *mp, const struct g_raid3_metadata *md)
2559 {
2560 	struct g_raid3_softc *sc;
2561 	struct g_geom *gp;
2562 	int error, timeout;
2563 	u_int n;
2564 
2565 	g_topology_assert();
2566 	G_RAID3_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id);
2567 
2568 	/* One disk is minimum. */
2569 	if (md->md_all < 1)
2570 		return (NULL);
2571 	/*
2572 	 * Action geom.
2573 	 */
2574 	gp = g_new_geomf(mp, "%s", md->md_name);
2575 	sc = malloc(sizeof(*sc), M_RAID3, M_WAITOK | M_ZERO);
2576 	sc->sc_disks = malloc(sizeof(struct g_raid3_disk) * md->md_all, M_RAID3,
2577 	    M_WAITOK | M_ZERO);
2578 	gp->start = g_raid3_start;
2579 	gp->spoiled = g_raid3_spoiled;
2580 	gp->orphan = g_raid3_orphan;
2581 	gp->access = g_raid3_access;
2582 	gp->dumpconf = g_raid3_dumpconf;
2583 
2584 	sc->sc_id = md->md_id;
2585 	sc->sc_mediasize = md->md_mediasize;
2586 	sc->sc_sectorsize = md->md_sectorsize;
2587 	sc->sc_ndisks = md->md_all;
2588 	sc->sc_round_robin = 0;
2589 	sc->sc_flags = md->md_mflags;
2590 	sc->sc_bump_syncid = 0;
2591 	for (n = 0; n < sc->sc_ndisks; n++)
2592 		sc->sc_disks[n].d_state = G_RAID3_DISK_STATE_NODISK;
2593 	bioq_init(&sc->sc_queue);
2594 	mtx_init(&sc->sc_queue_mtx, "graid3:queue", NULL, MTX_DEF);
2595 	TAILQ_INIT(&sc->sc_events);
2596 	mtx_init(&sc->sc_events_mtx, "graid3:events", NULL, MTX_DEF);
2597 	callout_init(&sc->sc_callout, CALLOUT_MPSAFE);
2598 	sc->sc_state = G_RAID3_DEVICE_STATE_STARTING;
2599 	gp->softc = sc;
2600 	sc->sc_geom = gp;
2601 	sc->sc_provider = NULL;
2602 	/*
2603 	 * Synchronization geom.
2604 	 */
2605 	gp = g_new_geomf(mp, "%s.sync", md->md_name);
2606 	gp->softc = sc;
2607 	gp->orphan = g_raid3_orphan;
2608 	sc->sc_sync.ds_geom = gp;
2609 	sc->sc_zone_64k = uma_zcreate("gr3:64k", 65536, NULL, NULL, NULL, NULL,
2610 	    UMA_ALIGN_PTR, 0);
2611 	uma_zone_set_max(sc->sc_zone_64k, g_raid3_n64k);
2612 	sc->sc_zone_16k = uma_zcreate("gr3:16k", 16384, NULL, NULL, NULL, NULL,
2613 	    UMA_ALIGN_PTR, 0);
2614 	uma_zone_set_max(sc->sc_zone_64k, g_raid3_n16k);
2615 	sc->sc_zone_4k = uma_zcreate("gr3:4k", 4096, NULL, NULL, NULL, NULL,
2616 	    UMA_ALIGN_PTR, 0);
2617 	uma_zone_set_max(sc->sc_zone_4k, g_raid3_n4k);
2618 	error = kthread_create(g_raid3_worker, sc, &sc->sc_worker, 0, 0,
2619 	    "g_raid3 %s", md->md_name);
2620 	if (error != 0) {
2621 		G_RAID3_DEBUG(1, "Cannot create kernel thread for %s.",
2622 		    sc->sc_name);
2623 		uma_zdestroy(sc->sc_zone_64k);
2624 		uma_zdestroy(sc->sc_zone_16k);
2625 		uma_zdestroy(sc->sc_zone_4k);
2626 		g_destroy_geom(sc->sc_sync.ds_geom);
2627 		mtx_destroy(&sc->sc_events_mtx);
2628 		mtx_destroy(&sc->sc_queue_mtx);
2629 		g_destroy_geom(sc->sc_geom);
2630 		free(sc->sc_disks, M_RAID3);
2631 		free(sc, M_RAID3);
2632 		return (NULL);
2633 	}
2634 
2635 	G_RAID3_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id);
2636 
2637 	/*
2638 	 * Run timeout.
2639 	 */
2640 	timeout = atomic_load_acq_int(&g_raid3_timeout);
2641 	callout_reset(&sc->sc_callout, timeout * hz, g_raid3_go, sc);
2642 	return (sc->sc_geom);
2643 }
2644 
/*
 * Destroy the whole device: mark it for destruction, wake the worker
 * thread and wait for it to exit, then free all resources.  Returns
 * EBUSY if the provider is still open and 'force' is false, ENXIO if
 * there is no device, 0 on success.  Called with the topology lock
 * held; the lock is dropped while waiting for the worker.
 */
int
g_raid3_destroy(struct g_raid3_softc *sc, boolean_t force)
{
	struct g_provider *pp;

	g_topology_assert();

	if (sc == NULL)
		return (ENXIO);
	pp = sc->sc_provider;
	if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
		if (force) {
			G_RAID3_DEBUG(0, "Device %s is still open, so it "
			    "can't be definitely removed.", pp->name);
		} else {
			G_RAID3_DEBUG(1,
			    "Device %s is still open (r%dw%de%d).", pp->name,
			    pp->acr, pp->acw, pp->ace);
			return (EBUSY);
		}
	}

	/* DESTROY makes the worker shut down; WAIT makes it notify us. */
	sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
	sc->sc_flags |= G_RAID3_DEVICE_FLAG_WAIT;
	/* Drop the topology lock while we wait for the worker to exit. */
	g_topology_unlock();
	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
	mtx_lock(&sc->sc_queue_mtx);
	wakeup(sc);
	wakeup(&sc->sc_queue);
	mtx_unlock(&sc->sc_queue_mtx);
	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker);
	/* Poll until the worker thread clears sc_worker on its way out. */
	while (sc->sc_worker != NULL)
		tsleep(&sc->sc_worker, PRIBIO, "r3:destroy", hz / 5);
	G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker);
	g_topology_lock();
	g_raid3_destroy_device(sc);
	free(sc->sc_disks, M_RAID3);
	free(sc, M_RAID3);
	return (0);
}
2685 
/*
 * Orphan method for the transient geom used while tasting.  The taste
 * consumer is attached and detached within g_raid3_taste() under the
 * topology lock, so this must never be called.
 */
static void
g_raid3_taste_orphan(struct g_consumer *cp)
{

	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
	    cp->provider->name));
}
2693 
/*
 * Taste method: read RAID3 metadata from the provider and, if it is
 * valid, attach the provider as a component of an existing device or
 * create a new device for it.  Returns the device geom on success,
 * NULL otherwise.  Called with the topology lock held.
 */
static struct g_geom *
g_raid3_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
{
	struct g_raid3_metadata md;
	struct g_raid3_softc *sc;
	struct g_consumer *cp;
	struct g_geom *gp;
	int error;

	g_topology_assert();
	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
	G_RAID3_DEBUG(2, "Tasting %s.", pp->name);
	/* Skip providers with 0 sectorsize. */
	if (pp->sectorsize == 0)
		return (NULL);

	/* Use a throw-away geom/consumer pair just to read the metadata. */
	gp = g_new_geomf(mp, "raid3:taste");
	/* This orphan function should be never called. */
	gp->orphan = g_raid3_taste_orphan;
	cp = g_new_consumer(gp);
	g_attach(cp, pp);
	error = g_raid3_read_metadata(cp, &md);
	g_detach(cp);
	g_destroy_consumer(cp);
	g_destroy_geom(gp);
	if (error != 0)
		return (NULL);
	gp = NULL;

	if (md.md_version > G_RAID3_VERSION) {
		printf("geom_raid3.ko module is too old to handle %s.\n",
		    pp->name);
		return (NULL);
	}
	/* If the metadata hardcodes a provider name, it has to match. */
	if (md.md_provider[0] != '\0' && strcmp(md.md_provider, pp->name) != 0)
		return (NULL);
	if (g_raid3_debug >= 2)
		raid3_metadata_dump(&md);

	/*
	 * Let's check if device already exists.
	 */
	sc = NULL;
	LIST_FOREACH(gp, &mp->geom, geom) {
		sc = gp->softc;
		if (sc == NULL)
			continue;
		/* Skip synchronization geoms. */
		if (sc->sc_sync.ds_geom == gp)
			continue;
		if (strcmp(md.md_name, sc->sc_name) != 0)
			continue;
		/* Same name but different id: a distinct, clashing device. */
		if (md.md_id != sc->sc_id) {
			G_RAID3_DEBUG(0, "Device %s already configured.",
			    sc->sc_name);
			return (NULL);
		}
		break;
	}
	if (gp == NULL) {
		gp = g_raid3_create(mp, &md);
		if (gp == NULL) {
			G_RAID3_DEBUG(0, "Cannot create device %s.",
			    md.md_name);
			return (NULL);
		}
		sc = gp->softc;
	}
	G_RAID3_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
	error = g_raid3_add_disk(sc, pp, &md);
	if (error != 0) {
		G_RAID3_DEBUG(0, "Cannot add disk %s to %s (error=%d).",
		    pp->name, gp->name, error);
		/* Tear the device down again if it has no components left. */
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NODISK) ==
		    sc->sc_ndisks) {
			g_raid3_destroy(sc, 1);
		}
		return (NULL);
	}
	return (gp);
}
2774 
/*
 * GEOM control request to destroy the geom.  Delegates to
 * g_raid3_destroy() without forcing, so an open device returns EBUSY.
 */
static int
g_raid3_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused,
    struct g_geom *gp)
{

	return (g_raid3_destroy(gp->softc, 0));
}
2782 
/*
 * GEOM dumpconf method: emit the XML fragment describing this device
 * (and, when 'cp' is given, one of its components) into 'sb' for the
 * confxml sysctl.  Called with the topology lock held.
 */
static void
g_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp, struct g_provider *pp)
{
	struct g_raid3_softc *sc;

	g_topology_assert();

	sc = gp->softc;
	if (sc == NULL)
		return;
	/* Skip synchronization geom. */
	if (gp == sc->sc_sync.ds_geom)
		return;
	if (pp != NULL) {
		/* Nothing here. */
	} else if (cp != NULL) {
		/* Per-component section. */
		struct g_raid3_disk *disk;

		disk = cp->private;
		if (disk == NULL)
			return;
		sbuf_printf(sb, "%s<Type>", indent);
		/* The highest-numbered component holds the parity. */
		if (disk->d_no == sc->sc_ndisks - 1)
			sbuf_printf(sb, "PARITY");
		else
			sbuf_printf(sb, "DATA");
		sbuf_printf(sb, "</Type>\n");
		sbuf_printf(sb, "%s<Number>%u</Number>\n", indent,
		    (u_int)disk->d_no);
		if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
			/* Report synchronization progress in percent. */
			sbuf_printf(sb, "%s<Synchronized>", indent);
			if (disk->d_sync.ds_offset_done == 0)
				sbuf_printf(sb, "0%%");
			else {
				sbuf_printf(sb, "%u%%",
				    (u_int)((disk->d_sync.ds_offset_done * 100) /
				    (sc->sc_mediasize / (sc->sc_ndisks - 1))));
			}
			sbuf_printf(sb, "</Synchronized>\n");
		}
		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent,
		    disk->d_sync.ds_syncid);
		sbuf_printf(sb, "%s<Flags>", indent);
		if (disk->d_flags == 0)
			sbuf_printf(sb, "NONE");
		else {
			int first = 1;

/* Print one disk flag name, comma-separated after the first. */
#define	ADD_FLAG(flag, name)	do {					\
	if ((disk->d_flags & (flag)) != 0) {				\
		if (!first)						\
			sbuf_printf(sb, ", ");				\
		else							\
			first = 0;					\
		sbuf_printf(sb, name);					\
	}								\
} while (0)
			ADD_FLAG(G_RAID3_DISK_FLAG_DIRTY, "DIRTY");
			ADD_FLAG(G_RAID3_DISK_FLAG_HARDCODED, "HARDCODED");
			ADD_FLAG(G_RAID3_DISK_FLAG_SYNCHRONIZING,
			    "SYNCHRONIZING");
			ADD_FLAG(G_RAID3_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC");
#undef	ADD_FLAG
		}
		sbuf_printf(sb, "</Flags>\n");
		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
		    g_raid3_disk_state2str(disk->d_state));
	} else {
		/* Device-level section. */
		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent, sc->sc_syncid);
		sbuf_printf(sb, "%s<Flags>", indent);
		if (sc->sc_flags == 0)
			sbuf_printf(sb, "NONE");
		else {
			int first = 1;

/* Print one device flag name, comma-separated after the first. */
#define	ADD_FLAG(flag, name)	do {					\
	if ((sc->sc_flags & (flag)) != 0) {				\
		if (!first)						\
			sbuf_printf(sb, ", ");				\
		else							\
			first = 0;					\
		sbuf_printf(sb, name);					\
	}								\
} while (0)
			ADD_FLAG(G_RAID3_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC");
			ADD_FLAG(G_RAID3_DEVICE_FLAG_ROUND_ROBIN,
			    "ROUND-ROBIN");
			ADD_FLAG(G_RAID3_DEVICE_FLAG_VERIFY, "VERIFY");
#undef	ADD_FLAG
		}
		sbuf_printf(sb, "</Flags>\n");
		sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
		    sc->sc_ndisks);
		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
		    g_raid3_device_state2str(sc->sc_state));
	}
}
2882 
/* Register the g_raid3 class with the GEOM framework. */
DECLARE_GEOM_CLASS(g_raid3_class, g_raid3);
2884