xref: /freebsd/sys/geom/raid3/g_raid3.c (revision cec50dea12481dc578c0805c887ab2097e1c06c5)
1 /*-
2  * Copyright (c) 2004 Pawel Jakub Dawidek <pjd@FreeBSD.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29 
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/kernel.h>
33 #include <sys/module.h>
34 #include <sys/limits.h>
35 #include <sys/lock.h>
36 #include <sys/mutex.h>
37 #include <sys/bio.h>
38 #include <sys/sysctl.h>
39 #include <sys/malloc.h>
40 #include <sys/bitstring.h>
41 #include <vm/uma.h>
42 #include <machine/atomic.h>
43 #include <geom/geom.h>
44 #include <sys/proc.h>
45 #include <sys/kthread.h>
46 #include <geom/raid3/g_raid3.h>
47 
48 
/* Malloc type for all GEOM_RAID3 allocations (events, metadata sectors). */
static MALLOC_DEFINE(M_RAID3, "raid3 data", "GEOM_RAID3 Data");

SYSCTL_DECL(_kern_geom);
SYSCTL_NODE(_kern_geom, OID_AUTO, raid3, CTLFLAG_RW, 0, "GEOM_RAID3 stuff");
/* Debug verbosity; boot-time tunable and runtime sysctl. */
u_int g_raid3_debug = 0;
TUNABLE_INT("kern.geom.raid3.debug", &g_raid3_debug);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, debug, CTLFLAG_RW, &g_raid3_debug, 0,
    "Debug level");
/* Seconds to wait for all components before giving up on the missing ones. */
static u_int g_raid3_timeout = 8;
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, timeout, CTLFLAG_RW, &g_raid3_timeout,
    0, "Time to wait on all raid3 components");
/* Ratio used to interleave synchronization I/O with regular I/O. */
static u_int g_raid3_reqs_per_sync = 5;
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, reqs_per_sync, CTLFLAG_RW,
    &g_raid3_reqs_per_sync, 0,
    "Number of regular I/O requests per synchronization request");
static u_int g_raid3_syncs_per_sec = 100;
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, syncs_per_sec, CTLFLAG_RW,
    &g_raid3_syncs_per_sec, 0,
    "Number of synchronizations requests per second");

/*
 * Upper bounds for the per-size UMA zones that back child bio data
 * buffers (see g_raid3_clone_bio()).  Boot-time tunables, read-only
 * afterwards.
 */
static u_int g_raid3_n64k = 50;
TUNABLE_INT("kern.geom.raid3.n64k", &g_raid3_n64k);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n64k, CTLFLAG_RD, &g_raid3_n64k, 0,
    "Maximum number of 64kB allocations");
static u_int g_raid3_n16k = 200;
TUNABLE_INT("kern.geom.raid3.n16k", &g_raid3_n16k);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n16k, CTLFLAG_RD, &g_raid3_n16k, 0,
    "Maximum number of 16kB allocations");
static u_int g_raid3_n4k = 1200;
TUNABLE_INT("kern.geom.raid3.n4k", &g_raid3_n4k);
SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n4k, CTLFLAG_RD, &g_raid3_n4k, 0,
    "Maximum number of 4kB allocations");

/* Allocation/verification statistics exported under kern.geom.raid3.stat. */
SYSCTL_NODE(_kern_geom_raid3, OID_AUTO, stat, CTLFLAG_RW, 0,
    "GEOM_RAID3 statistics");
static u_int g_raid3_parity_mismatch = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, parity_mismatch, CTLFLAG_RD,
    &g_raid3_parity_mismatch, 0, "Number of failures in VERIFY mode");
static u_int g_raid3_64k_requested = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 64k_requested, CTLFLAG_RD,
    &g_raid3_64k_requested, 0, "Number of requested 64kB allocations");
static u_int g_raid3_64k_failed = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 64k_failed, CTLFLAG_RD,
    &g_raid3_64k_failed, 0, "Number of failed 64kB allocations");
static u_int g_raid3_16k_requested = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 16k_requested, CTLFLAG_RD,
    &g_raid3_16k_requested, 0, "Number of requested 16kB allocations");
static u_int g_raid3_16k_failed = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 16k_failed, CTLFLAG_RD,
    &g_raid3_16k_failed, 0, "Number of failed 16kB allocations");
static u_int g_raid3_4k_requested = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 4k_requested, CTLFLAG_RD,
    &g_raid3_4k_requested, 0, "Number of requested 4kB allocations");
static u_int g_raid3_4k_failed = 0;
SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, 4k_failed, CTLFLAG_RD,
    &g_raid3_4k_failed, 0, "Number of failed 4kB allocations");

/* msleep() wrapper that logs the sleep and the wakeup at debug level 4. */
#define	MSLEEP(ident, mtx, priority, wmesg, timeout)	do {		\
	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));	\
	msleep((ident), (mtx), (priority), (wmesg), (timeout));		\
	G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, (ident));	\
} while (0)
111 
112 
/* GEOM class methods, implemented later in this file. */
static int g_raid3_destroy_geom(struct gctl_req *req, struct g_class *mp,
    struct g_geom *gp);
static g_taste_t g_raid3_taste;

/* Class descriptor registered with the GEOM framework. */
struct g_class g_raid3_class = {
	.name = G_RAID3_CLASS_NAME,
	.version = G_VERSION,
	.ctlreq = g_raid3_config,
	.taste = g_raid3_taste,
	.destroy_geom = g_raid3_destroy_geom
};


/* Forward declarations for routines used before their definitions. */
static void g_raid3_destroy_provider(struct g_raid3_softc *sc);
static int g_raid3_update_disk(struct g_raid3_disk *disk, u_int state);
static void g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force);
static void g_raid3_dumpconf(struct sbuf *sb, const char *indent,
    struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
static void g_raid3_sync_stop(struct g_raid3_softc *sc, int type);
133 
134 /*
135  * XXX: it should be placed in subr_disk.c.
136  */
137 static void
138 bioq_insert_head(struct bio_queue_head *head, struct bio *bp)
139 {
140 
141 	TAILQ_INSERT_HEAD(&head->queue, bp, bio_queue);
142 }
143 
144 static const char *
145 g_raid3_disk_state2str(int state)
146 {
147 
148 	switch (state) {
149 	case G_RAID3_DISK_STATE_NODISK:
150 		return ("NODISK");
151 	case G_RAID3_DISK_STATE_NONE:
152 		return ("NONE");
153 	case G_RAID3_DISK_STATE_NEW:
154 		return ("NEW");
155 	case G_RAID3_DISK_STATE_ACTIVE:
156 		return ("ACTIVE");
157 	case G_RAID3_DISK_STATE_STALE:
158 		return ("STALE");
159 	case G_RAID3_DISK_STATE_SYNCHRONIZING:
160 		return ("SYNCHRONIZING");
161 	case G_RAID3_DISK_STATE_DISCONNECTED:
162 		return ("DISCONNECTED");
163 	default:
164 		return ("INVALID");
165 	}
166 }
167 
168 static const char *
169 g_raid3_device_state2str(int state)
170 {
171 
172 	switch (state) {
173 	case G_RAID3_DEVICE_STATE_STARTING:
174 		return ("STARTING");
175 	case G_RAID3_DEVICE_STATE_DEGRADED:
176 		return ("DEGRADED");
177 	case G_RAID3_DEVICE_STATE_COMPLETE:
178 		return ("COMPLETE");
179 	default:
180 		return ("INVALID");
181 	}
182 }
183 
184 const char *
185 g_raid3_get_diskname(struct g_raid3_disk *disk)
186 {
187 
188 	if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
189 		return ("[unknown]");
190 	return (disk->d_name);
191 }
192 
/*
 * XOR 'size' bytes of src1 and src2 into dst, 128 bytes (16 64-bit words)
 * per loop iteration.  All macro arguments are parenthesized in the
 * expansion; in particular the cast on 'size' — previously written as
 * '(size_t)size' — would otherwise bind only to the first operand of an
 * expression argument (e.g. '(size_t)a - b' instead of '(size_t)(a - b)').
 */
#define	g_raid3_xor(src1, src2, dst, size)				\
	_g_raid3_xor((uint64_t *)(src1), (uint64_t *)(src2),		\
	    (uint64_t *)(dst), (size_t)(size))
static void
_g_raid3_xor(uint64_t *src1, uint64_t *src2, uint64_t *dst, size_t size)
{

	/* Callers always pass multiples of 128 bytes; the loop is unrolled. */
	KASSERT((size % 128) == 0, ("Invalid size: %zu.", size));
	for (; size > 0; size -= 128) {
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
		*dst++ = (*src1++) ^ (*src2++);
	}
}
220 
221 static int
222 g_raid3_is_zero(struct bio *bp)
223 {
224 	static const uint64_t zeros[] = {
225 	    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
226 	};
227 	u_char *addr;
228 	ssize_t size;
229 
230 	size = bp->bio_length;
231 	addr = (u_char *)bp->bio_data;
232 	for (; size > 0; size -= sizeof(zeros), addr += sizeof(zeros)) {
233 		if (bcmp(addr, zeros, sizeof(zeros)) != 0)
234 			return (0);
235 	}
236 	return (1);
237 }
238 
239 /*
240  * --- Events handling functions ---
241  * Events in geom_raid3 are used to maintain disks and device status
242  * from one thread to simplify locking.
243  */
/* Free an event structure allocated by g_raid3_event_send(). */
static void
g_raid3_event_free(struct g_raid3_event *ep)
{

	free(ep, M_RAID3);
}
250 
/*
 * Queue a state-change event for the worker thread.
 *
 * 'arg' is the softc when G_RAID3_EVENT_DEVICE is set in 'flags', the
 * disk otherwise.  With G_RAID3_EVENT_DONTWAIT the call returns 0
 * immediately; otherwise it drops the topology lock and sleeps until the
 * worker (or cancellation) marks the event G_RAID3_EVENT_DONE, then
 * returns the event's error and frees it.
 */
int
g_raid3_event_send(void *arg, int state, int flags)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct g_raid3_event *ep;
	int error;

	ep = malloc(sizeof(*ep), M_RAID3, M_WAITOK);
	G_RAID3_DEBUG(4, "%s: Sending event %p.", __func__, ep);
	if ((flags & G_RAID3_EVENT_DEVICE) != 0) {
		disk = NULL;
		sc = arg;
	} else {
		disk = arg;
		sc = disk->d_softc;
	}
	ep->e_disk = disk;
	ep->e_state = state;
	ep->e_flags = flags;
	ep->e_error = 0;
	mtx_lock(&sc->sc_events_mtx);
	TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
	mtx_unlock(&sc->sc_events_mtx);
	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
	/* Kick the worker thread so it notices the new event. */
	mtx_lock(&sc->sc_queue_mtx);
	wakeup(sc);
	wakeup(&sc->sc_queue);
	mtx_unlock(&sc->sc_queue_mtx);
	if ((flags & G_RAID3_EVENT_DONTWAIT) != 0)
		return (0);
	g_topology_assert();
	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, ep);
	g_topology_unlock();
	/*
	 * The 5-second timeout only bounds each individual sleep; the loop
	 * keeps waiting until the DONE flag is actually set.
	 */
	while ((ep->e_flags & G_RAID3_EVENT_DONE) == 0) {
		mtx_lock(&sc->sc_events_mtx);
		MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "r3:event",
		    hz * 5);
	}
	/* Don't even try to use 'sc' here, because it could be already dead. */
	g_topology_lock();
	error = ep->e_error;
	g_raid3_event_free(ep);
	return (error);
}
296 
297 static struct g_raid3_event *
298 g_raid3_event_get(struct g_raid3_softc *sc)
299 {
300 	struct g_raid3_event *ep;
301 
302 	mtx_lock(&sc->sc_events_mtx);
303 	ep = TAILQ_FIRST(&sc->sc_events);
304 	if (ep != NULL)
305 		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
306 	mtx_unlock(&sc->sc_events_mtx);
307 	return (ep);
308 }
309 
310 static void
311 g_raid3_event_cancel(struct g_raid3_disk *disk)
312 {
313 	struct g_raid3_softc *sc;
314 	struct g_raid3_event *ep, *tmpep;
315 
316 	g_topology_assert();
317 
318 	sc = disk->d_softc;
319 	mtx_lock(&sc->sc_events_mtx);
320 	TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
321 		if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0)
322 			continue;
323 		if (ep->e_disk != disk)
324 			continue;
325 		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
326 		if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
327 			g_raid3_event_free(ep);
328 		else {
329 			ep->e_error = ECANCELED;
330 			wakeup(ep);
331 		}
332 	}
333 	mtx_unlock(&sc->sc_events_mtx);
334 }
335 
336 /*
337  * Return the number of disks in the given state.
338  * If state is equal to -1, count all connected disks.
339  */
340 u_int
341 g_raid3_ndisks(struct g_raid3_softc *sc, int state)
342 {
343 	struct g_raid3_disk *disk;
344 	u_int n, ndisks;
345 
346 	for (n = ndisks = 0; n < sc->sc_ndisks; n++) {
347 		disk = &sc->sc_disks[n];
348 		if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
349 			continue;
350 		if (state == -1 || disk->d_state == state)
351 			ndisks++;
352 	}
353 	return (ndisks);
354 }
355 
356 static u_int
357 g_raid3_nrequests(struct g_raid3_softc *sc, struct g_consumer *cp)
358 {
359 	struct bio *bp;
360 	u_int nreqs = 0;
361 
362 	mtx_lock(&sc->sc_queue_mtx);
363 	TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
364 		if (bp->bio_from == cp)
365 			nreqs++;
366 	}
367 	mtx_unlock(&sc->sc_queue_mtx);
368 	return (nreqs);
369 }
370 
371 static int
372 g_raid3_is_busy(struct g_raid3_softc *sc, struct g_consumer *cp)
373 {
374 
375 	if (cp->nstart != cp->nend) {
376 		G_RAID3_DEBUG(2,
377 		    "I/O requests for %s exist, can't destroy it now.",
378 		    cp->provider->name);
379 		return (1);
380 	}
381 	if (g_raid3_nrequests(sc, cp) > 0) {
382 		G_RAID3_DEBUG(2,
383 		    "I/O requests for %s in queue, can't destroy it now.",
384 		    cp->provider->name);
385 		return (1);
386 	}
387 	return (0);
388 }
389 
/*
 * Detach and destroy the consumer, unless it is still busy; in the busy
 * case only clear its private pointer so no disk references it anymore.
 */
static void
g_raid3_kill_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
{

	g_topology_assert();

	cp->private = NULL;
	if (g_raid3_is_busy(sc, cp))
		return;
	G_RAID3_DEBUG(2, "Consumer %s destroyed.", cp->provider->name);
	g_detach(cp);
	g_destroy_consumer(cp);
}
403 
/*
 * Create a consumer for the disk and attach it to the given provider.
 * Returns 0 on success or the g_attach() error; on failure the created
 * consumer is left for the caller to clean up (see g_raid3_init_disk()).
 */
static int
g_raid3_connect_disk(struct g_raid3_disk *disk, struct g_provider *pp)
{
	int error;

	g_topology_assert();
	KASSERT(disk->d_consumer == NULL,
	    ("Disk already connected (device %s).", disk->d_softc->sc_name));

	disk->d_consumer = g_new_consumer(disk->d_softc->sc_geom);
	disk->d_consumer->private = disk;
	error = g_attach(disk->d_consumer, pp);
	if (error != 0)
		return (error);
	G_RAID3_DEBUG(2, "Disk %s connected.", g_raid3_get_diskname(disk));
	return (0);
}
421 
/*
 * Release all access on the consumer and destroy it.  A consumer that was
 * never attached (provider == NULL) is destroyed directly; an attached one
 * goes through g_raid3_kill_consumer(), which defers destruction while
 * I/O is still pending.
 */
static void
g_raid3_disconnect_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
{

	g_topology_assert();

	if (cp == NULL)
		return;
	if (cp->provider != NULL) {
		G_RAID3_DEBUG(2, "Disk %s disconnected.", cp->provider->name);
		if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) {
			/* Drop whatever access counts we still hold. */
			G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d",
			    cp->provider->name, -cp->acr, -cp->acw, -cp->ace,
			    0);
			g_access(cp, -cp->acr, -cp->acw, -cp->ace);
		}
		g_raid3_kill_consumer(sc, cp);
	} else {
		g_destroy_consumer(cp);
	}
}
443 
444 /*
445  * Initialize disk. This means allocate memory, create consumer, attach it
446  * to the provider and open access (r1w1e1) to it.
447  */
448 static struct g_raid3_disk *
449 g_raid3_init_disk(struct g_raid3_softc *sc, struct g_provider *pp,
450     struct g_raid3_metadata *md, int *errorp)
451 {
452 	struct g_raid3_disk *disk;
453 	int error;
454 
455 	disk = &sc->sc_disks[md->md_no];
456 	disk->d_softc = sc;
457 	error = g_raid3_connect_disk(disk, pp);
458 	if (error != 0)
459 		goto fail;
460 	disk->d_no = md->md_no;
461 	disk->d_state = G_RAID3_DISK_STATE_NONE;
462 	disk->d_flags = md->md_dflags;
463 	if (md->md_provider[0] != '\0')
464 		disk->d_flags |= G_RAID3_DISK_FLAG_HARDCODED;
465 	disk->d_sync.ds_consumer = NULL;
466 	disk->d_sync.ds_offset = md->md_sync_offset;
467 	disk->d_sync.ds_offset_done = md->md_sync_offset;
468 	disk->d_sync.ds_syncid = md->md_syncid;
469 	if (errorp != NULL)
470 		*errorp = 0;
471 	return (disk);
472 fail:
473 	if (errorp != NULL)
474 		*errorp = error;
475 	if (disk != NULL)
476 		g_raid3_disconnect_consumer(sc, disk->d_consumer);
477 	return (NULL);
478 }
479 
/*
 * Tear down a disk: cancel its pending events, stop synchronization if it
 * was the disk being synchronized, disconnect its consumer and mark the
 * slot empty (NODISK).  A NODISK slot is a no-op.
 */
static void
g_raid3_destroy_disk(struct g_raid3_disk *disk)
{
	struct g_raid3_softc *sc;

	g_topology_assert();

	if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
		return;
	g_raid3_event_cancel(disk);
	sc = disk->d_softc;
	switch (disk->d_state) {
	case G_RAID3_DISK_STATE_SYNCHRONIZING:
		if (sc->sc_syncdisk != NULL)
			g_raid3_sync_stop(sc, 1);
		/* FALLTHROUGH */
	case G_RAID3_DISK_STATE_NEW:
	case G_RAID3_DISK_STATE_STALE:
	case G_RAID3_DISK_STATE_ACTIVE:
		g_raid3_disconnect_consumer(sc, disk->d_consumer);
		disk->d_consumer = NULL;
		break;
	default:
		/* Any other state here indicates a state-machine bug. */
		KASSERT(0 == 1, ("Wrong disk state (%s, %s).",
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
	}
	disk->d_state = G_RAID3_DISK_STATE_NODISK;
}
509 
/*
 * Destroy the whole device: provider, disks, pending events, the
 * synchronization geom, the UMA zones and finally the geom itself.
 * Waiting event senders are completed with ECANCELED.
 */
static void
g_raid3_destroy_device(struct g_raid3_softc *sc)
{
	struct g_raid3_event *ep;
	struct g_geom *gp;
	struct g_consumer *cp;
	u_int n;

	g_topology_assert();

	gp = sc->sc_geom;
	if (sc->sc_provider != NULL)
		g_raid3_destroy_provider(sc);
	for (n = 0; n < sc->sc_ndisks; n++)
		g_raid3_destroy_disk(&sc->sc_disks[n]);
	/* Drain the event queue, waking up anyone blocked in event_send(). */
	while ((ep = g_raid3_event_get(sc)) != NULL) {
		if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
			g_raid3_event_free(ep);
		else {
			ep->e_error = ECANCELED;
			ep->e_flags |= G_RAID3_EVENT_DONE;
			G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, ep);
			mtx_lock(&sc->sc_events_mtx);
			wakeup(ep);
			mtx_unlock(&sc->sc_events_mtx);
		}
	}
	callout_drain(&sc->sc_callout);
	gp->softc = NULL;
	/* Tear down the synchronization geom and its consumer, if any. */
	cp = LIST_FIRST(&sc->sc_sync.ds_geom->consumer);
	if (cp != NULL)
		g_raid3_disconnect_consumer(sc, cp);
	sc->sc_sync.ds_geom->softc = NULL;
	g_wither_geom(sc->sc_sync.ds_geom, ENXIO);
	uma_zdestroy(sc->sc_zone_64k);
	uma_zdestroy(sc->sc_zone_16k);
	uma_zdestroy(sc->sc_zone_4k);
	mtx_destroy(&sc->sc_queue_mtx);
	mtx_destroy(&sc->sc_events_mtx);
	G_RAID3_DEBUG(0, "Device %s destroyed.", gp->name);
	g_wither_geom(gp, ENXIO);
}
552 
553 static void
554 g_raid3_orphan(struct g_consumer *cp)
555 {
556 	struct g_raid3_disk *disk;
557 
558 	g_topology_assert();
559 
560 	disk = cp->private;
561 	if (disk == NULL)
562 		return;
563 	disk->d_softc->sc_bump_syncid = G_RAID3_BUMP_ON_FIRST_WRITE;
564 	g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
565 	    G_RAID3_EVENT_DONTWAIT);
566 }
567 
568 static void
569 g_raid3_spoiled(struct g_consumer *cp)
570 {
571 	struct g_raid3_disk *disk;
572 
573 	g_topology_assert();
574 
575 	disk = cp->private;
576 	if (disk == NULL)
577 		return;
578 	disk->d_softc->sc_bump_syncid = G_RAID3_BUMP_IMMEDIATELY;
579 	g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
580 	    G_RAID3_EVENT_DONTWAIT);
581 }
582 
/*
 * Write the encoded metadata (or an all-zero sector when md == NULL) to
 * the last sector of the disk's provider.  Opens write access temporarily
 * if the consumer isn't open yet.  On write failure the disk is scheduled
 * for disconnection and the syncid bump is made immediate.
 */
static int
g_raid3_write_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
{
	struct g_raid3_softc *sc;
	struct g_consumer *cp;
	off_t offset, length;
	int close = 0, error = 0;
	u_char *sector;

	g_topology_assert();

	sc = disk->d_softc;
	cp = disk->d_consumer;
	KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name));
	KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name));
	/* Metadata lives in the provider's last sector. */
	length = cp->provider->sectorsize;
	offset = cp->provider->mediasize - length;
	sector = malloc((size_t)length, M_RAID3, M_WAITOK | M_ZERO);
	/*
	 * Open consumer if it wasn't opened and remember to close it.
	 */
	if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) == 0) {
		error = g_access(cp, 0, 1, 1);
		G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", cp->provider->name,
		    0, 1, 1, error);
		if (error == 0)
			close = 1;
#ifdef	INVARIANTS
	/*
	 * The '} else {' below is inside the #ifdef on purpose: without
	 * INVARIANTS the else-branch (only a KASSERT) is compiled out
	 * entirely.
	 */
	} else {
		KASSERT(cp->acw > 0 && cp->ace > 0,
		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
		    cp->acr, cp->acw, cp->ace));
#endif
	}
	if (error == 0) {
		/* md == NULL leaves the sector zeroed, clearing metadata. */
		if (md != NULL)
			raid3_metadata_encode(md, sector);
		g_topology_unlock();
		error = g_write_data(cp, offset, sector, length);
		g_topology_lock();
	}
	free(sector, M_RAID3);
	if (close) {
		g_access(cp, 0, -1, -1);
		G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d",
		    cp->provider->name, 0, -1, -1, 0);
	}
	if (error != 0) {
		/* Failed metadata write: drop the disk from the array. */
		disk->d_softc->sc_bump_syncid = G_RAID3_BUMP_IMMEDIATELY;
		g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
		    G_RAID3_EVENT_DONTWAIT);
	}
	return (error);
}
637 
638 int
639 g_raid3_clear_metadata(struct g_raid3_disk *disk)
640 {
641 	int error;
642 
643 	g_topology_assert();
644 	error = g_raid3_write_metadata(disk, NULL);
645 	if (error == 0) {
646 		G_RAID3_DEBUG(2, "Metadata on %s cleared.",
647 		    g_raid3_get_diskname(disk));
648 	} else {
649 		G_RAID3_DEBUG(0,
650 		    "Cannot clear metadata on disk %s (error=%d).",
651 		    g_raid3_get_diskname(disk), error);
652 	}
653 	return (error);
654 }
655 
/*
 * Fill the metadata structure from the current device and disk state,
 * ready for encoding to disk.
 */
void
g_raid3_fill_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
{
	struct g_raid3_softc *sc;

	sc = disk->d_softc;
	strlcpy(md->md_magic, G_RAID3_MAGIC, sizeof(md->md_magic));
	md->md_version = G_RAID3_VERSION;
	strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name));
	md->md_id = sc->sc_id;
	md->md_all = sc->sc_ndisks;
	md->md_mediasize = sc->sc_mediasize;
	md->md_sectorsize = sc->sc_sectorsize;
	md->md_mflags = (sc->sc_flags & G_RAID3_DEVICE_FLAG_MASK);
	md->md_no = disk->d_no;
	md->md_syncid = disk->d_sync.ds_syncid;
	md->md_dflags = (disk->d_flags & G_RAID3_DISK_FLAG_MASK);
	/* Record synchronization progress only while actually syncing. */
	if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING)
		md->md_sync_offset = disk->d_sync.ds_offset_done;
	else
		md->md_sync_offset = 0;
	/* Hardcoded disks remember their provider name in the metadata. */
	if ((disk->d_flags & G_RAID3_DISK_FLAG_HARDCODED) != 0 &&
	    disk->d_consumer != NULL && disk->d_consumer->provider != NULL) {
		strlcpy(md->md_provider, disk->d_consumer->provider->name,
		    sizeof(md->md_provider));
	} else {
		bzero(md->md_provider, sizeof(md->md_provider));
	}
}
685 
686 void
687 g_raid3_update_metadata(struct g_raid3_disk *disk)
688 {
689 	struct g_raid3_metadata md;
690 	int error;
691 
692 	g_topology_assert();
693 	g_raid3_fill_metadata(disk, &md);
694 	error = g_raid3_write_metadata(disk, &md);
695 	if (error == 0) {
696 		G_RAID3_DEBUG(2, "Metadata on %s updated.",
697 		    g_raid3_get_diskname(disk));
698 	} else {
699 		G_RAID3_DEBUG(0,
700 		    "Cannot update metadata on disk %s (error=%d).",
701 		    g_raid3_get_diskname(disk), error);
702 	}
703 }
704 
705 static void
706 g_raid3_bump_syncid(struct g_raid3_softc *sc)
707 {
708 	struct g_raid3_disk *disk;
709 	u_int n;
710 
711 	g_topology_assert();
712 	KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
713 	    ("%s called with no active disks (device=%s).", __func__,
714 	    sc->sc_name));
715 
716 	sc->sc_syncid++;
717 	for (n = 0; n < sc->sc_ndisks; n++) {
718 		disk = &sc->sc_disks[n];
719 		if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
720 		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
721 			disk->d_sync.ds_syncid = sc->sc_syncid;
722 			g_raid3_update_metadata(disk);
723 		}
724 	}
725 }
726 
727 /*
728  * Treat bio_driver1 field in parent bio as list head and field bio_caller1
729  * in child bio as pointer to the next element on the list.
730  */
731 #define	G_RAID3_HEAD_BIO(pbp)	(pbp)->bio_driver1
732 
733 #define	G_RAID3_NEXT_BIO(cbp)	(cbp)->bio_caller1
734 
735 #define	G_RAID3_FOREACH_BIO(pbp, bp)					\
736 	for ((bp) = G_RAID3_HEAD_BIO(pbp); (bp) != NULL;		\
737 	    (bp) = G_RAID3_NEXT_BIO(bp))
738 
739 #define	G_RAID3_FOREACH_SAFE_BIO(pbp, bp, tmpbp)			\
740 	for ((bp) = G_RAID3_HEAD_BIO(pbp);				\
741 	    (bp) != NULL && ((tmpbp) = G_RAID3_NEXT_BIO(bp), 1);	\
742 	    (bp) = (tmpbp))
743 
/* Initialize the parent bio's child list to empty. */
static void
g_raid3_init_bio(struct bio *pbp)
{

	G_RAID3_HEAD_BIO(pbp) = NULL;
}
750 
751 static void
752 g_raid3_remove_bio(struct bio *cbp)
753 {
754 	struct bio *pbp, *bp;
755 
756 	pbp = cbp->bio_parent;
757 	if (G_RAID3_HEAD_BIO(pbp) == cbp)
758 		G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
759 	else {
760 		G_RAID3_FOREACH_BIO(pbp, bp) {
761 			if (G_RAID3_NEXT_BIO(bp) == cbp) {
762 				G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
763 				break;
764 			}
765 		}
766 	}
767 	G_RAID3_NEXT_BIO(cbp) = NULL;
768 }
769 
/*
 * Replace child bio 'dbp' with 'sbp' in dbp's parent's child list:
 * sbp is first unlinked from its own position, then spliced into dbp's
 * slot (inheriting dbp's next pointer), and dbp is unlinked.
 */
static void
g_raid3_replace_bio(struct bio *sbp, struct bio *dbp)
{
	struct bio *pbp, *bp;

	g_raid3_remove_bio(sbp);
	pbp = dbp->bio_parent;
	G_RAID3_NEXT_BIO(sbp) = G_RAID3_NEXT_BIO(dbp);
	if (G_RAID3_HEAD_BIO(pbp) == dbp)
		G_RAID3_HEAD_BIO(pbp) = sbp;
	else {
		G_RAID3_FOREACH_BIO(pbp, bp) {
			if (G_RAID3_NEXT_BIO(bp) == dbp) {
				G_RAID3_NEXT_BIO(bp) = sbp;
				break;
			}
		}
	}
	G_RAID3_NEXT_BIO(dbp) = NULL;
}
790 
791 static void
792 g_raid3_destroy_bio(struct g_raid3_softc *sc, struct bio *cbp)
793 {
794 	struct bio *bp, *pbp;
795 	size_t size;
796 
797 	pbp = cbp->bio_parent;
798 	pbp->bio_children--;
799 	KASSERT(cbp->bio_data != NULL, ("NULL bio_data"));
800 	size = pbp->bio_length / (sc->sc_ndisks - 1);
801 	if (size > 16384)
802 		uma_zfree(sc->sc_zone_64k, cbp->bio_data);
803 	else if (size > 4096)
804 		uma_zfree(sc->sc_zone_16k, cbp->bio_data);
805 	else
806 		uma_zfree(sc->sc_zone_4k, cbp->bio_data);
807 	if (G_RAID3_HEAD_BIO(pbp) == cbp) {
808 		G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
809 		G_RAID3_NEXT_BIO(cbp) = NULL;
810 		g_destroy_bio(cbp);
811 	} else {
812 		G_RAID3_FOREACH_BIO(pbp, bp) {
813 			if (G_RAID3_NEXT_BIO(bp) == cbp)
814 				break;
815 		}
816 		if (bp != NULL) {
817 			KASSERT(G_RAID3_NEXT_BIO(bp) != NULL,
818 			    ("NULL bp->bio_driver1"));
819 			G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
820 			G_RAID3_NEXT_BIO(cbp) = NULL;
821 		}
822 		g_destroy_bio(cbp);
823 	}
824 }
825 
826 static struct bio *
827 g_raid3_clone_bio(struct g_raid3_softc *sc, struct bio *pbp)
828 {
829 	struct bio *bp, *cbp;
830 	size_t size;
831 
832 	cbp = g_clone_bio(pbp);
833 	if (cbp == NULL)
834 		return (NULL);
835 	size = pbp->bio_length / (sc->sc_ndisks - 1);
836 	if (size > 16384) {
837 		cbp->bio_data = uma_zalloc(sc->sc_zone_64k, M_NOWAIT);
838 		g_raid3_64k_requested++;
839 	} else if (size > 4096) {
840 		cbp->bio_data = uma_zalloc(sc->sc_zone_16k, M_NOWAIT);
841 		g_raid3_16k_requested++;
842 	} else {
843 		cbp->bio_data = uma_zalloc(sc->sc_zone_4k, M_NOWAIT);
844 		g_raid3_4k_requested++;
845 	}
846 	if (cbp->bio_data == NULL) {
847 		if (size > 16384)
848 			g_raid3_64k_failed++;
849 		if (size > 4096)
850 			g_raid3_16k_failed++;
851 		else
852 			g_raid3_4k_failed++;
853 		pbp->bio_children--;
854 		g_destroy_bio(cbp);
855 		return (NULL);
856 	}
857 	G_RAID3_NEXT_BIO(cbp) = NULL;
858 	if (G_RAID3_HEAD_BIO(pbp) == NULL)
859 		G_RAID3_HEAD_BIO(pbp) = cbp;
860 	else {
861 		G_RAID3_FOREACH_BIO(pbp, bp) {
862 			if (G_RAID3_NEXT_BIO(bp) == NULL) {
863 				G_RAID3_NEXT_BIO(bp) = cbp;
864 				break;
865 			}
866 		}
867 	}
868 	return (cbp);
869 }
870 
/*
 * Split a write request across the components: copy each child's
 * interleaved share of the parent's data, compute the parity child by
 * XOR-ing all data children (unless NOPARITY is set), and send every
 * child request down to its consumer.
 */
static void
g_raid3_scatter(struct bio *pbp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct bio *bp, *cbp;
	off_t atom, cadd, padd, left;

	sc = pbp->bio_to->geom->softc;
	bp = NULL;
	if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
		/*
		 * Find bio for which we should calculate data.
		 */
		G_RAID3_FOREACH_BIO(pbp, cbp) {
			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
				bp = cbp;
				break;
			}
		}
		KASSERT(bp != NULL, ("NULL parity bio."));
	}
	/* 'atom' is one disk's share of a sector; interleave the copy. */
	atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
	cadd = padd = 0;
	for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
		G_RAID3_FOREACH_BIO(pbp, cbp) {
			if (cbp == bp)
				continue;
			bcopy(pbp->bio_data + padd, cbp->bio_data + cadd, atom);
			padd += atom;
		}
		cadd += atom;
	}
	if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
		struct bio *tmpbp;

		/*
		 * Calculate parity.
		 */
		bzero(bp->bio_data, bp->bio_length);
		G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
			if (cbp == bp)
				continue;
			g_raid3_xor(cbp->bio_data, bp->bio_data, bp->bio_data,
			    bp->bio_length);
			/* NODISK children only feed the parity; drop them. */
			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_NODISK) != 0)
				g_raid3_destroy_bio(sc, cbp);
		}
	}
	/* Dispatch the remaining children to their consumers. */
	G_RAID3_FOREACH_BIO(pbp, cbp) {
		struct g_consumer *cp;

		disk = cbp->bio_caller2;
		cp = disk->d_consumer;
		cbp->bio_to = cp->provider;
		G_RAID3_LOGREQ(3, cbp, "Sending request.");
		/* NOTE(review): asserts acr/ace but not acw — confirm intent. */
		KASSERT(cp->acr > 0 && cp->ace > 0,
		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
		    cp->acr, cp->acw, cp->ace));
		g_io_request(cbp, cp);
	}
}
933 
934 static void
935 g_raid3_gather(struct bio *pbp)
936 {
937 	struct g_raid3_softc *sc;
938 	struct g_raid3_disk *disk;
939 	struct bio *xbp, *fbp, *cbp;
940 	off_t atom, cadd, padd, left;
941 
942 	sc = pbp->bio_to->geom->softc;
943 	/*
944 	 * Find bio for which we have to calculate data.
945 	 * While going through this path, check if all requests
946 	 * succeeded, if not, deny whole request.
947 	 * If we're in COMPLETE mode, we allow one request to fail,
948 	 * so if we find one, we're sending it to the parity consumer.
949 	 * If there are more failed requests, we deny whole request.
950 	 */
951 	xbp = fbp = NULL;
952 	G_RAID3_FOREACH_BIO(pbp, cbp) {
953 		if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
954 			KASSERT(xbp == NULL, ("More than one parity bio."));
955 			xbp = cbp;
956 		}
957 		if (cbp->bio_error == 0)
958 			continue;
959 		/*
960 		 * Found failed request.
961 		 */
962 		G_RAID3_LOGREQ(0, cbp, "Request failed.");
963 		disk = cbp->bio_caller2;
964 		if (disk != NULL) {
965 			/*
966 			 * Actually this is pointless to bump syncid,
967 			 * because whole device is fucked up.
968 			 */
969 			sc->sc_bump_syncid = G_RAID3_BUMP_IMMEDIATELY;
970 			g_raid3_event_send(disk,
971 			    G_RAID3_DISK_STATE_DISCONNECTED,
972 			    G_RAID3_EVENT_DONTWAIT);
973 		}
974 		if (fbp == NULL) {
975 			if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_DEGRADED) != 0) {
976 				/*
977 				 * We are already in degraded mode, so we can't
978 				 * accept any failures.
979 				 */
980 				if (pbp->bio_error == 0)
981 					pbp->bio_error = fbp->bio_error;
982 			} else {
983 				fbp = cbp;
984 			}
985 		} else {
986 			/*
987 			 * Next failed request, that's too many.
988 			 */
989 			if (pbp->bio_error == 0)
990 				pbp->bio_error = fbp->bio_error;
991 		}
992 	}
993 	if (pbp->bio_error != 0)
994 		goto finish;
995 	if (fbp != NULL && (pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
996 		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_VERIFY;
997 		if (xbp != fbp)
998 			g_raid3_replace_bio(xbp, fbp);
999 		g_raid3_destroy_bio(sc, fbp);
1000 	} else if (fbp != NULL) {
1001 		struct g_consumer *cp;
1002 
1003 		/*
1004 		 * One request failed, so send the same request to
1005 		 * the parity consumer.
1006 		 */
1007 		disk = pbp->bio_driver2;
1008 		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
1009 			pbp->bio_error = fbp->bio_error;
1010 			goto finish;
1011 		}
1012 		pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
1013 		pbp->bio_inbed--;
1014 		fbp->bio_flags &= ~(BIO_DONE | BIO_ERROR);
1015 		if (disk->d_no == sc->sc_ndisks - 1)
1016 			fbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
1017 		fbp->bio_error = 0;
1018 		fbp->bio_completed = 0;
1019 		fbp->bio_children = 0;
1020 		fbp->bio_inbed = 0;
1021 		cp = disk->d_consumer;
1022 		fbp->bio_caller2 = disk;
1023 		fbp->bio_to = cp->provider;
1024 		G_RAID3_LOGREQ(3, fbp, "Sending request (recover).");
1025 		KASSERT(cp->acr > 0 && cp->ace > 0,
1026 		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
1027 		    cp->acr, cp->acw, cp->ace));
1028 		g_io_request(fbp, cp);
1029 		return;
1030 	}
1031 	if (xbp != NULL) {
1032 		/*
1033 		 * Calculate parity.
1034 		 */
1035 		G_RAID3_FOREACH_BIO(pbp, cbp) {
1036 			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0)
1037 				continue;
1038 			g_raid3_xor(cbp->bio_data, xbp->bio_data, xbp->bio_data,
1039 			    xbp->bio_length);
1040 		}
1041 		xbp->bio_cflags &= ~G_RAID3_BIO_CFLAG_PARITY;
1042 		if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
1043 			if (!g_raid3_is_zero(xbp)) {
1044 				g_raid3_parity_mismatch++;
1045 				pbp->bio_error = EIO;
1046 				goto finish;
1047 			}
1048 			g_raid3_destroy_bio(sc, xbp);
1049 		}
1050 	}
1051 	atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
1052 	cadd = padd = 0;
1053 	for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
1054 		G_RAID3_FOREACH_BIO(pbp, cbp) {
1055 			bcopy(cbp->bio_data + cadd, pbp->bio_data + padd, atom);
1056 			pbp->bio_completed += atom;
1057 			padd += atom;
1058 		}
1059 		cadd += atom;
1060 	}
1061 finish:
1062 	if (pbp->bio_error == 0)
1063 		G_RAID3_LOGREQ(3, pbp, "Request finished.");
1064 	else {
1065 		if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0)
1066 			G_RAID3_LOGREQ(1, pbp, "Verification error.");
1067 		else
1068 			G_RAID3_LOGREQ(0, pbp, "Request failed.");
1069 	}
1070 	pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_MASK;
1071 	g_io_deliver(pbp, pbp->bio_error);
1072 	while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
1073 		g_raid3_destroy_bio(sc, cbp);
1074 }
1075 
1076 static void
1077 g_raid3_done(struct bio *bp)
1078 {
1079 	struct g_raid3_softc *sc;
1080 
1081 	sc = bp->bio_from->geom->softc;
1082 	bp->bio_cflags |= G_RAID3_BIO_CFLAG_REGULAR;
1083 	G_RAID3_LOGREQ(3, bp, "Regular request done (error=%d).", bp->bio_error);
1084 	mtx_lock(&sc->sc_queue_mtx);
1085 	bioq_insert_head(&sc->sc_queue, bp);
1086 	wakeup(sc);
1087 	wakeup(&sc->sc_queue);
1088 	mtx_unlock(&sc->sc_queue_mtx);
1089 }
1090 
/*
 * Handle completion of one component (child) request of a regular I/O
 * request.  Once all children have come back, finish the parent: reads
 * go through g_raid3_gather() for reassembly, writes are checked for
 * component errors and delivered.
 */
static void
g_raid3_regular_request(struct bio *cbp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct bio *pbp;

	g_topology_assert_not();

	pbp = cbp->bio_parent;
	sc = pbp->bio_to->geom->softc;
	disk = cbp->bio_from->private;
	if (disk == NULL) {
		/*
		 * The consumer is no longer attached to a disk; finish
		 * tearing it down (needs the topology lock).
		 */
		g_topology_lock();
		g_raid3_kill_consumer(sc, cbp->bio_from);
		g_topology_unlock();
	}

	G_RAID3_LOGREQ(3, cbp, "Request finished.");
	pbp->bio_inbed++;
	KASSERT(pbp->bio_inbed <= pbp->bio_children,
	    ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed,
	    pbp->bio_children));
	/* Wait until every child request has completed. */
	if (pbp->bio_inbed != pbp->bio_children)
		return;
	switch (pbp->bio_cmd) {
	case BIO_READ:
		/* Reassemble the data (recover/verify when needed). */
		g_raid3_gather(pbp);
		break;
	case BIO_WRITE:
	case BIO_DELETE:
	    {
		int error = 0;

		pbp->bio_completed = pbp->bio_length;
		while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) {
			if (cbp->bio_error != 0) {
				disk = cbp->bio_caller2;
				if (disk != NULL) {
					/*
					 * Writing this component failed -
					 * disconnect it and bump syncid so
					 * it is later seen as stale.
					 */
					sc->sc_bump_syncid =
					    G_RAID3_BUMP_IMMEDIATELY;
					g_raid3_event_send(disk,
					    G_RAID3_DISK_STATE_DISCONNECTED,
					    G_RAID3_EVENT_DONTWAIT);
				}
				/*
				 * A single failed component is tolerated
				 * (its error is only remembered in 'error');
				 * a second failure fails the whole write.
				 */
				if (error == 0)
					error = cbp->bio_error;
				else if (pbp->bio_error == 0) {
					/*
					 * Next failed request, that's too many.
					 */
					pbp->bio_error = error;
				}
			}
			g_raid3_destroy_bio(sc, cbp);
		}
		if (pbp->bio_error == 0)
			G_RAID3_LOGREQ(3, pbp, "Request finished.");
		else
			G_RAID3_LOGREQ(0, pbp, "Request failed.");
		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_DEGRADED;
		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_NOPARITY;
		g_io_deliver(pbp, pbp->bio_error);
		break;
	    }
	}
}
1158 
1159 static void
1160 g_raid3_sync_done(struct bio *bp)
1161 {
1162 	struct g_raid3_softc *sc;
1163 
1164 	G_RAID3_LOGREQ(3, bp, "Synchronization request delivered.");
1165 	sc = bp->bio_from->geom->softc;
1166 	bp->bio_cflags |= G_RAID3_BIO_CFLAG_SYNC;
1167 	mtx_lock(&sc->sc_queue_mtx);
1168 	bioq_insert_head(&sc->sc_queue, bp);
1169 	wakeup(sc);
1170 	wakeup(&sc->sc_queue);
1171 	mtx_unlock(&sc->sc_queue_mtx);
1172 }
1173 
/*
 * GEOM start routine: accept data requests (read/write/delete), reject
 * everything else, and pass accepted requests to the worker thread.
 */
static void
g_raid3_start(struct bio *bp)
{
	struct g_raid3_softc *sc;

	sc = bp->bio_to->geom->softc;
	/*
	 * If sc == NULL or there are no valid disks, provider's error
	 * should be set and g_raid3_start() should not be called at all.
	 */
	KASSERT(sc != NULL && (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
	    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE),
	    ("Provider's error should be set (error=%d)(device=%s).",
	    bp->bio_to->error, bp->bio_to->name));
	G_RAID3_LOGREQ(3, bp, "Request received.");

	switch (bp->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
	case BIO_DELETE:
		break;
	case BIO_GETATTR:
	default:
		/* Unsupported request type. */
		g_io_deliver(bp, EOPNOTSUPP);
		return;
	}
	/* Queue the request and wake up the worker thread. */
	mtx_lock(&sc->sc_queue_mtx);
	bioq_insert_tail(&sc->sc_queue, bp);
	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
	wakeup(sc);
	mtx_unlock(&sc->sc_queue_mtx);
}
1206 
/*
 * Send one synchronization request.
 */
static void
g_raid3_sync_one(struct g_raid3_softc *sc)
{
	struct g_raid3_disk *disk;
	struct bio *bp;

	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
	    ("Wrong device state (%s, %s).", sc->sc_name,
	    g_raid3_device_state2str(sc->sc_state)));
	disk = sc->sc_syncdisk;
	KASSERT(disk != NULL, ("No sync disk (%s).", sc->sc_name));
	KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
	    ("Disk %s is not marked for synchronization.",
	    g_raid3_get_diskname(disk)));

	bp = g_new_bio();
	/* No memory - skip this round; the worker will retry later. */
	if (bp == NULL)
		return;
	bp->bio_parent = NULL;
	bp->bio_cmd = BIO_READ;
	/*
	 * ds_offset counts per-component bytes; the provider offset
	 * spans all sc_ndisks - 1 data components.
	 */
	bp->bio_offset = disk->d_sync.ds_offset * (sc->sc_ndisks - 1);
	bp->bio_length = MIN(G_RAID3_MAX_IO_SIZE,
	    sc->sc_mediasize - bp->bio_offset);
	bp->bio_cflags = 0;
	bp->bio_done = g_raid3_sync_done;
	bp->bio_data = disk->d_sync.ds_data;
	if (bp->bio_data == NULL) {
		g_destroy_bio(bp);
		return;
	}
	/* Mark it as a regular-synchronization request. */
	bp->bio_cflags = G_RAID3_BIO_CFLAG_REGSYNC;
	/* Advance the per-component sync offset by what we just asked for. */
	disk->d_sync.ds_offset += bp->bio_length / (sc->sc_ndisks - 1);
	bp->bio_to = sc->sc_provider;
	G_RAID3_LOGREQ(3, bp, "Sending synchronization request.");
	g_io_request(bp, disk->d_sync.ds_consumer);
}
1246 
/*
 * Handle completed synchronization I/O.  A sync READ was issued through
 * the device's own provider; here its result is compacted into the data
 * belonging to the synchronized component and written to that disk.  A
 * completed WRITE advances (and periodically checkpoints) the sync
 * progress, activating the disk once the whole component is rebuilt.
 */
static void
g_raid3_sync_request(struct bio *bp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;

	sc = bp->bio_from->geom->softc;
	disk = bp->bio_from->private;
	if (disk == NULL) {
		/* Disk is gone; drop the consumer and the request. */
		g_topology_lock();
		g_raid3_kill_consumer(sc, bp->bio_from);
		g_topology_unlock();
		g_destroy_bio(bp);
		return;
	}

	/*
	 * Synchronization request.
	 */
	switch (bp->bio_cmd) {
	case BIO_READ:
	    {
		struct g_consumer *cp;
		u_char *dst, *src;
		off_t left;
		u_int atom;

		if (bp->bio_error != 0) {
			G_RAID3_LOGREQ(0, bp,
			    "Synchronization request failed (error=%d).",
			    bp->bio_error);
			g_destroy_bio(bp);
			return;
		}
		G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
		/* Bytes each component contributes to one provider sector. */
		atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
		dst = src = bp->bio_data;
		if (disk->d_no == sc->sc_ndisks - 1) {
			u_int n;

			/* Parity component. */
			/*
			 * XOR the data atoms of every sector together to
			 * recompute the parity, compacting it in place.
			 */
			for (left = bp->bio_length; left > 0;
			    left -= sc->sc_sectorsize) {
				bcopy(src, dst, atom);
				src += atom;
				for (n = 1; n < sc->sc_ndisks - 1; n++) {
					g_raid3_xor(src, dst, dst, atom);
					src += atom;
				}
				dst += atom;
			}
		} else {
			/* Regular component. */
			/*
			 * Extract this component's atom from every sector,
			 * compacting the result in place.
			 */
			src += atom * disk->d_no;
			for (left = bp->bio_length; left > 0;
			    left -= sc->sc_sectorsize) {
				bcopy(src, dst, atom);
				src += sc->sc_sectorsize;
				dst += atom;
			}
		}
		/* Convert provider offset/length into component units. */
		bp->bio_offset /= sc->sc_ndisks - 1;
		bp->bio_length /= sc->sc_ndisks - 1;
		/* Reuse the bio to write the result to the sync disk. */
		bp->bio_cmd = BIO_WRITE;
		bp->bio_cflags = 0;
		bp->bio_children = bp->bio_inbed = 0;
		cp = disk->d_consumer;
		KASSERT(cp->acr == 0 && cp->acw == 1 && cp->ace == 1,
		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
		    cp->acr, cp->acw, cp->ace));
		g_io_request(bp, cp);
		return;
	    }
	case BIO_WRITE:
		if (bp->bio_error != 0) {
			G_RAID3_LOGREQ(0, bp,
			    "Synchronization request failed (error=%d).",
			    bp->bio_error);
			g_destroy_bio(bp);
			/*
			 * The sync disk cannot be written - disconnect it
			 * and bump syncid so it is recognized as stale.
			 */
			sc->sc_bump_syncid = G_RAID3_BUMP_IMMEDIATELY;
			g_raid3_event_send(disk,
			    G_RAID3_DISK_STATE_DISCONNECTED,
			    G_RAID3_EVENT_DONTWAIT);
			return;
		}
		G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
		disk->d_sync.ds_offset_done = bp->bio_offset + bp->bio_length;
		g_destroy_bio(bp);
		if (disk->d_sync.ds_offset_done ==
		    sc->sc_mediasize / (sc->sc_ndisks - 1)) {
			/*
			 * Disk up-to-date, activate it.
			 */
			g_raid3_event_send(disk, G_RAID3_DISK_STATE_ACTIVE,
			    G_RAID3_EVENT_DONTWAIT);
			return;
		} else if ((disk->d_sync.ds_offset_done %
		    (G_RAID3_MAX_IO_SIZE * 100)) == 0) {
			/*
			 * Update offset_done on every 100 blocks.
			 * XXX: This should be configurable.
			 */
			g_topology_lock();
			g_raid3_update_metadata(disk);
			g_topology_unlock();
		}
		return;
	default:
		KASSERT(1 == 0, ("Invalid command here: %u (device=%s)",
		    bp->bio_cmd, sc->sc_name));
		break;
	}
}
1360 
/*
 * Split a regular request into per-component child requests and send
 * them out.  Returns ENOMEM when child bios could not be allocated
 * (the caller re-queues the request), 0 otherwise.
 */
static int
g_raid3_register_request(struct bio *pbp)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct g_consumer *cp;
	struct bio *cbp;
	off_t offset, length;
	u_int n, ndisks;
	int round_robin, verify;

	ndisks = 0;
	sc = pbp->bio_to->geom->softc;
	if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGSYNC) != 0 &&
	    sc->sc_syncdisk == NULL) {
		/*
		 * Synchronization request, but synchronization is no
		 * longer running - fail the request.
		 */
		g_io_deliver(pbp, EIO);
		return (0);
	}
	g_raid3_init_bio(pbp);
	/* Each component carries 1/(ndisks - 1) of the data. */
	length = pbp->bio_length / (sc->sc_ndisks - 1);
	offset = pbp->bio_offset / (sc->sc_ndisks - 1);
	round_robin = verify = 0;
	switch (pbp->bio_cmd) {
	case BIO_READ:
		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
			/*
			 * VERIFY mode: read the parity as well, so the
			 * data can be checked against it.
			 */
			pbp->bio_pflags |= G_RAID3_BIO_PFLAG_VERIFY;
			verify = 1;
			ndisks = sc->sc_ndisks;
		} else {
			verify = 0;
			ndisks = sc->sc_ndisks - 1;
		}
		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0 &&
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
			round_robin = 1;
		} else {
			round_robin = 0;
		}
		KASSERT(!round_robin || !verify,
		    ("ROUND-ROBIN and VERIFY are mutually exclusive."));
		/* Remember the parity disk for later recovery. */
		pbp->bio_driver2 = &sc->sc_disks[sc->sc_ndisks - 1];
		break;
	case BIO_WRITE:
	case BIO_DELETE:
		/* Writes always touch all components (data + parity). */
		ndisks = sc->sc_ndisks;
		break;
	}
	/* Clone the parent bio once for every participating component. */
	for (n = 0; n < ndisks; n++) {
		disk = &sc->sc_disks[n];
		cbp = g_raid3_clone_bio(sc, pbp);
		if (cbp == NULL) {
			/* Out of memory - undo what was done so far. */
			while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
				g_raid3_destroy_bio(sc, cbp);
			return (ENOMEM);
		}
		cbp->bio_offset = offset;
		cbp->bio_length = length;
		cbp->bio_done = g_raid3_done;
		switch (pbp->bio_cmd) {
		case BIO_READ:
			if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
				/*
				 * Replace invalid component with the parity
				 * component.
				 */
				disk = &sc->sc_disks[sc->sc_ndisks - 1];
				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
				pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
			} else if (round_robin &&
			    disk->d_no == sc->sc_round_robin) {
				/*
				 * In round-robin mode skip one data component
				 * and use parity component when reading.
				 */
				pbp->bio_driver2 = disk;
				disk = &sc->sc_disks[sc->sc_ndisks - 1];
				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
				sc->sc_round_robin++;
				round_robin = 0;
			} else if (verify && disk->d_no == sc->sc_ndisks - 1) {
				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
			}
			break;
		case BIO_WRITE:
		case BIO_DELETE:
			if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
			    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
				if (n == ndisks - 1) {
					/*
					 * Active parity component, mark it as such.
					 */
					cbp->bio_cflags |=
					    G_RAID3_BIO_CFLAG_PARITY;
				}
			} else {
				pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
				if (n == ndisks - 1) {
					/*
					 * Parity component is not connected,
					 * so destroy its request.
					 */
					pbp->bio_pflags |=
					    G_RAID3_BIO_PFLAG_NOPARITY;
					g_raid3_destroy_bio(sc, cbp);
					cbp = NULL;
				} else {
					cbp->bio_cflags |=
					    G_RAID3_BIO_CFLAG_NODISK;
					disk = NULL;
				}
			}
			break;
		}
		if (cbp != NULL)
			cbp->bio_caller2 = disk;
	}
	switch (pbp->bio_cmd) {
	case BIO_READ:
		if (round_robin) {
			/*
			 * If we are in round-robin mode and 'round_robin' is
			 * still 1, it means, that we skipped parity component
			 * for this read and must reset sc_round_robin field.
			 */
			sc->sc_round_robin = 0;
		}
		G_RAID3_FOREACH_BIO(pbp, cbp) {
			disk = cbp->bio_caller2;
			cp = disk->d_consumer;
			cbp->bio_to = cp->provider;
			G_RAID3_LOGREQ(3, cbp, "Sending request.");
			KASSERT(cp->acr > 0 && cp->ace > 0,
			    ("Consumer %s not opened (r%dw%de%d).",
			    cp->provider->name, cp->acr, cp->acw, cp->ace));
			g_io_request(cbp, cp);
		}
		break;
	case BIO_WRITE:
	case BIO_DELETE:
		/*
		 * Bump syncid on first write.
		 */
		if (sc->sc_bump_syncid == G_RAID3_BUMP_ON_FIRST_WRITE) {
			sc->sc_bump_syncid = 0;
			g_topology_lock();
			g_raid3_bump_syncid(sc);
			g_topology_unlock();
		}
		g_raid3_scatter(pbp);
		break;
	}
	return (0);
}
1515 
1516 static int
1517 g_raid3_can_destroy(struct g_raid3_softc *sc)
1518 {
1519 	struct g_geom *gp;
1520 	struct g_consumer *cp;
1521 
1522 	g_topology_assert();
1523 	gp = sc->sc_geom;
1524 	LIST_FOREACH(cp, &gp->consumer, consumer) {
1525 		if (g_raid3_is_busy(sc, cp))
1526 			return (0);
1527 	}
1528 	gp = sc->sc_sync.ds_geom;
1529 	LIST_FOREACH(cp, &gp->consumer, consumer) {
1530 		if (g_raid3_is_busy(sc, cp))
1531 			return (0);
1532 	}
1533 	G_RAID3_DEBUG(2, "No I/O requests for %s, it can be destroyed.",
1534 	    sc->sc_name);
1535 	return (1);
1536 }
1537 
/*
 * Try to destroy the device.  Returns 1 when the worker thread may
 * exit, 0 when outstanding I/O still blocks destruction.
 */
static int
g_raid3_try_destroy(struct g_raid3_softc *sc)
{

	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_WAIT) != 0) {
		g_topology_lock();
		if (!g_raid3_can_destroy(sc)) {
			g_topology_unlock();
			return (0);
		}
		g_topology_unlock();
		/*
		 * WAIT flag set: somebody sleeps on &sc->sc_worker and
		 * will finish the destruction - just wake the waiter and
		 * detach from the softc.
		 */
		G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
		    &sc->sc_worker);
		wakeup(&sc->sc_worker);
		sc->sc_worker = NULL;
	} else {
		g_topology_lock();
		if (!g_raid3_can_destroy(sc)) {
			g_topology_unlock();
			return (0);
		}
		/* Nobody waits for us - tear everything down ourselves. */
		g_raid3_destroy_device(sc);
		g_topology_unlock();
		free(sc->sc_disks, M_RAID3);
		free(sc, M_RAID3);
	}
	return (1);
}
1566 
/*
 * Worker thread.
 * Processes events first, then I/O requests from the queue, interleaving
 * synchronization requests (throttled by g_raid3_syncs_per_sec) when a
 * disk is being rebuilt.
 */
static void
g_raid3_worker(void *arg)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	struct g_raid3_event *ep;
	struct bio *bp;
	u_int nreqs;

	sc = arg;
	curthread->td_base_pri = PRIBIO;

	/* Queue requests handled since the last synchronization request. */
	nreqs = 0;
	for (;;) {
		G_RAID3_DEBUG(5, "%s: Let's see...", __func__);
		/*
		 * First take a look at events.
		 * This is important to handle events before any I/O requests.
		 */
		ep = g_raid3_event_get(sc);
		if (ep != NULL) {
			g_topology_lock();
			if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) {
				/* Update only device status. */
				G_RAID3_DEBUG(3,
				    "Running event for device %s.",
				    sc->sc_name);
				ep->e_error = 0;
				g_raid3_update_device(sc, 1);
			} else {
				/* Update disk status. */
				G_RAID3_DEBUG(3, "Running event for disk %s.",
				     g_raid3_get_diskname(ep->e_disk));
				ep->e_error = g_raid3_update_disk(ep->e_disk,
				    ep->e_state);
				if (ep->e_error == 0)
					g_raid3_update_device(sc, 0);
			}
			g_topology_unlock();
			if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) {
				KASSERT(ep->e_error == 0,
				    ("Error cannot be handled."));
				g_raid3_event_free(ep);
			} else {
				/* Somebody sleeps on this event - wake them. */
				ep->e_flags |= G_RAID3_EVENT_DONE;
				G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
				    ep);
				mtx_lock(&sc->sc_events_mtx);
				wakeup(ep);
				mtx_unlock(&sc->sc_events_mtx);
			}
			if ((sc->sc_flags &
			    G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
				if (g_raid3_try_destroy(sc))
					kthread_exit(0);
			}
			G_RAID3_DEBUG(5, "%s: I'm here 1.", __func__);
			continue;
		}
		/*
		 * Now I/O requests.
		 */
		/* Get first request from the queue. */
		mtx_lock(&sc->sc_queue_mtx);
		bp = bioq_first(&sc->sc_queue);
		if (bp == NULL) {
			if ((sc->sc_flags &
			    G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
				mtx_unlock(&sc->sc_queue_mtx);
				if (g_raid3_try_destroy(sc))
					kthread_exit(0);
				mtx_lock(&sc->sc_queue_mtx);
			}
		}
		if (sc->sc_syncdisk != NULL &&
		    (bp == NULL || nreqs > g_raid3_reqs_per_sync)) {
			mtx_unlock(&sc->sc_queue_mtx);
			/*
			 * It is time for synchronization...
			 */
			nreqs = 0;
			disk = sc->sc_syncdisk;
			/*
			 * Issue the next sync read only when the previous
			 * one has been fully written back (offset ==
			 * offset_done) and work remains.
			 */
			if (disk->d_sync.ds_offset <
			    sc->sc_mediasize / (sc->sc_ndisks - 1) &&
			    disk->d_sync.ds_offset ==
			    disk->d_sync.ds_offset_done) {
				g_raid3_sync_one(sc);
			}
			G_RAID3_DEBUG(5, "%s: I'm here 2.", __func__);
			goto sleep;
		}
		if (bp == NULL) {
			/* Nothing to do - sleep until a request arrives. */
			MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:w1", 0);
			G_RAID3_DEBUG(5, "%s: I'm here 3.", __func__);
			continue;
		}
		nreqs++;
		bioq_remove(&sc->sc_queue, bp);
		mtx_unlock(&sc->sc_queue_mtx);

		if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0) {
			/* Completion of a regular child request. */
			g_raid3_regular_request(bp);
		} else if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0) {
			u_int timeout, sps;

			/* Completion of a synchronization request. */
			g_raid3_sync_request(bp);
sleep:
			/*
			 * Pace synchronization according to the
			 * g_raid3_syncs_per_sec tunable; 0 disables
			 * the pause entirely.
			 */
			sps = atomic_load_acq_int(&g_raid3_syncs_per_sec);
			if (sps == 0) {
				G_RAID3_DEBUG(5, "%s: I'm here 5.", __func__);
				continue;
			}
			mtx_lock(&sc->sc_queue_mtx);
			if (bioq_first(&sc->sc_queue) != NULL) {
				mtx_unlock(&sc->sc_queue_mtx);
				G_RAID3_DEBUG(5, "%s: I'm here 4.", __func__);
				continue;
			}
			timeout = hz / sps;
			if (timeout == 0)
				timeout = 1;
			MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:w2",
			    timeout);
		} else {
			/* Fresh request from the provider. */
			if (g_raid3_register_request(bp) != 0) {
				/*
				 * ENOMEM - re-queue the request and back
				 * off for a while.
				 */
				mtx_lock(&sc->sc_queue_mtx);
				bioq_insert_tail(&sc->sc_queue, bp);
				MSLEEP(&sc->sc_queue, &sc->sc_queue_mtx,
				    PRIBIO | PDROP, "r3:lowmem", hz / 10);
			}
		}
		G_RAID3_DEBUG(5, "%s: I'm here 6.", __func__);
	}
}
1704 
/*
 * Open disk's consumer if needed.
 * Mirrors the provider's access counts onto the component consumer and
 * keeps the per-disk DIRTY flag in step with write opens/closes.
 */
static void
g_raid3_update_access(struct g_raid3_disk *disk)
{
	struct g_provider *pp;
	struct g_consumer *cp;
	int acr, acw, ace, cpw, error;

	g_topology_assert();

	cp = disk->d_consumer;
	pp = disk->d_softc->sc_provider;
	if (pp == NULL) {
		/* No provider - release everything the consumer holds. */
		acr = -cp->acr;
		acw = -cp->acw;
		ace = -cp->ace;
	} else {
		/* Deltas needed to match the provider's counts. */
		acr = pp->acr - cp->acr;
		acw = pp->acw - cp->acw;
		ace = pp->ace - cp->ace;
		/* Grab an extra "exclusive" bit. */
		if (pp->acr > 0 || pp->acw > 0 || pp->ace > 0)
			ace++;
	}
	/* Nothing to change. */
	if (acr == 0 && acw == 0 && ace == 0)
		return;
	cpw = cp->acw;
	error = g_access(cp, acr, acw, ace);
	G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", cp->provider->name, acr,
	    acw, ace, error);
	if (error != 0) {
		/*
		 * Cannot adjust access counts - disconnect the disk and
		 * remember to bump syncid on the next write.
		 */
		disk->d_softc->sc_bump_syncid = G_RAID3_BUMP_ON_FIRST_WRITE;
		g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
		    G_RAID3_EVENT_DONTWAIT);
		return;
	}
	/* Track write-open transitions via the DIRTY flag. */
	if (cpw == 0 && cp->acw > 0) {
		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
		    g_raid3_get_diskname(disk), disk->d_softc->sc_name);
		disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
	} else if (cpw > 0 && cp->acw == 0) {
		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
		    g_raid3_get_diskname(disk), disk->d_softc->sc_name);
		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
	}
}
1753 
/*
 * Start synchronization: find the first disk in SYNCHRONIZING state,
 * open it for writing and attach a dedicated sync consumer to our own
 * provider so rebuilt data can be read through the device itself.
 */
static void
g_raid3_sync_start(struct g_raid3_softc *sc)
{
	struct g_raid3_disk *disk;
	struct g_consumer *cp;
	int error;
	u_int n;

	g_topology_assert();

	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
	    ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
	    sc->sc_state));
	KASSERT(sc->sc_syncdisk == NULL, ("Syncdisk is not NULL (%s, %u).",
	    sc->sc_name, sc->sc_state));
	/* Find a disk marked for synchronization. */
	disk = NULL;
	for (n = 0; n < sc->sc_ndisks; n++) {
		if (sc->sc_disks[n].d_state != G_RAID3_DISK_STATE_SYNCHRONIZING)
			continue;
		disk = &sc->sc_disks[n];
		break;
	}
	if (disk == NULL)
		return;
	cp = disk->d_consumer;
	KASSERT(cp->acr == 0 && cp->acw == 0 && cp->ace == 0,
	    ("Consumer %s already opened.", cp->provider->name));

	G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name,
	    g_raid3_get_diskname(disk));
	/* Open the target component write/exclusive. */
	error = g_access(cp, 0, 1, 1);
	G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", cp->provider->name, 0, 1,
	    1, error);
	if (error != 0) {
		g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
		    G_RAID3_EVENT_DONTWAIT);
		return;
	}
	disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
	KASSERT(disk->d_sync.ds_consumer == NULL,
	    ("Sync consumer already exists (device=%s, disk=%s).",
	    sc->sc_name, g_raid3_get_diskname(disk)));
	/* Reads for synchronization go through our own provider. */
	disk->d_sync.ds_consumer = g_new_consumer(sc->sc_sync.ds_geom);
	disk->d_sync.ds_consumer->private = disk;
	error = g_attach(disk->d_sync.ds_consumer, disk->d_softc->sc_provider);
	KASSERT(error == 0, ("Cannot attach to %s (error=%d).",
	    disk->d_softc->sc_name, error));
	error = g_access(disk->d_sync.ds_consumer, 1, 0, 0);
	KASSERT(error == 0, ("Cannot open %s (error=%d).",
	    disk->d_softc->sc_name, error));
	/* Data buffer used by every synchronization request. */
	disk->d_sync.ds_data = malloc(G_RAID3_MAX_IO_SIZE, M_RAID3, M_WAITOK);
	sc->sc_syncdisk = disk;
}
1807 
/*
 * Stop synchronization process.
 * type: 0 - synchronization finished
 *       1 - synchronization stopped
 */
static void
g_raid3_sync_stop(struct g_raid3_softc *sc, int type)
{
	struct g_raid3_disk *disk;
	struct g_consumer *cp;

	g_topology_assert();
	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
	    ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
	    sc->sc_state));
	disk = sc->sc_syncdisk;
	sc->sc_syncdisk = NULL;
	KASSERT(disk != NULL, ("No disk was synchronized (%s).", sc->sc_name));
	KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
	    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
	    g_raid3_disk_state2str(disk->d_state)));
	/* Nothing to tear down if synchronization never started. */
	if (disk->d_sync.ds_consumer == NULL)
		return;

	if (type == 0) {
		G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s finished.",
		    disk->d_softc->sc_name, g_raid3_get_diskname(disk));
	} else /* if (type == 1) */ {
		G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s stopped.",
		    disk->d_softc->sc_name, g_raid3_get_diskname(disk));
	}
	/* Release and destroy the synchronization consumer and buffer. */
	cp = disk->d_sync.ds_consumer;
	g_access(cp, -1, 0, 0);
	g_raid3_kill_consumer(disk->d_softc, cp);
	free(disk->d_sync.ds_data, M_RAID3);
	disk->d_sync.ds_consumer = NULL;
	/* Close the component we were rebuilding and mark it clean. */
	cp = disk->d_consumer;
	KASSERT(cp->acr == 0 && cp->acw == 1 && cp->ace == 1,
	    ("Consumer %s not opened.", cp->provider->name));
	g_access(cp, 0, -1, -1);
	G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", cp->provider->name, 0, -1,
	    -1, 0);
	disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
}
1852 
1853 static void
1854 g_raid3_launch_provider(struct g_raid3_softc *sc)
1855 {
1856 	struct g_provider *pp;
1857 
1858 	g_topology_assert();
1859 
1860 	pp = g_new_providerf(sc->sc_geom, "raid3/%s", sc->sc_name);
1861 	pp->mediasize = sc->sc_mediasize;
1862 	pp->sectorsize = sc->sc_sectorsize;
1863 	sc->sc_provider = pp;
1864 	g_error_provider(pp, 0);
1865 	G_RAID3_DEBUG(0, "Device %s: provider %s launched.", sc->sc_name,
1866 	    pp->name);
1867 	if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED)
1868 		g_raid3_sync_start(sc);
1869 }
1870 
/*
 * Destroy the device's provider: fail all queued requests with ENXIO,
 * wither the provider and stop a synchronization in progress.
 */
static void
g_raid3_destroy_provider(struct g_raid3_softc *sc)
{
	struct bio *bp;

	g_topology_assert();
	KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).",
	    sc->sc_name));

	g_error_provider(sc->sc_provider, ENXIO);
	/* Return every pending request with ENXIO. */
	mtx_lock(&sc->sc_queue_mtx);
	while ((bp = bioq_first(&sc->sc_queue)) != NULL) {
		bioq_remove(&sc->sc_queue, bp);
		g_io_deliver(bp, ENXIO);
	}
	mtx_unlock(&sc->sc_queue_mtx);
	G_RAID3_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name,
	    sc->sc_provider->name);
	sc->sc_provider->flags |= G_PF_WITHER;
	g_orphan_provider(sc->sc_provider, ENXIO);
	sc->sc_provider = NULL;
	if (sc->sc_syncdisk != NULL)
		g_raid3_sync_stop(sc, 1);
}
1895 
1896 static void
1897 g_raid3_go(void *arg)
1898 {
1899 	struct g_raid3_softc *sc;
1900 
1901 	sc = arg;
1902 	G_RAID3_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name);
1903 	g_raid3_event_send(sc, 0,
1904 	    G_RAID3_EVENT_DONTWAIT | G_RAID3_EVENT_DEVICE);
1905 }
1906 
/*
 * Decide the initial state for a disk by comparing its syncid to the
 * device's: equal gives ACTIVE (or SYNCHRONIZING/STALE when marked),
 * smaller means a full resync is needed, and bigger means the device
 * was started without its freshest disk - that disk is refused.
 */
static u_int
g_raid3_determine_state(struct g_raid3_disk *disk)
{
	struct g_raid3_softc *sc;
	u_int state;

	sc = disk->d_softc;
	if (sc->sc_syncid == disk->d_sync.ds_syncid) {
		if ((disk->d_flags &
		    G_RAID3_DISK_FLAG_SYNCHRONIZING) == 0) {
			/* Disk does not need synchronization. */
			state = G_RAID3_DISK_STATE_ACTIVE;
		} else {
			if ((sc->sc_flags &
			     G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0  ||
			    (disk->d_flags &
			     G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
				/*
				 * We can start synchronization from
				 * the stored offset.
				 */
				state = G_RAID3_DISK_STATE_SYNCHRONIZING;
			} else {
				state = G_RAID3_DISK_STATE_STALE;
			}
		}
	} else if (disk->d_sync.ds_syncid < sc->sc_syncid) {
		/*
		 * Reset all synchronization data for this disk,
		 * because if it even was synchronized, it was
		 * synchronized to disks with different syncid.
		 */
		disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
		disk->d_sync.ds_offset = 0;
		disk->d_sync.ds_offset_done = 0;
		disk->d_sync.ds_syncid = sc->sc_syncid;
		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
		    (disk->d_flags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
			state = G_RAID3_DISK_STATE_SYNCHRONIZING;
		} else {
			state = G_RAID3_DISK_STATE_STALE;
		}
	} else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ {
		/*
		 * Bad situation: the device was started on stale disks
		 * and a fresher disk has just arrived.  If any writes
		 * happened meanwhile, the device is inconsistent.  The
		 * safest choice is to leave this disk untouched and
		 * inform the user loudly.
		 */
		G_RAID3_DEBUG(0, "Device %s was started before the freshest "
		    "disk (%s) arrives!! It will not be connected to the "
		    "running device.", sc->sc_name,
		    g_raid3_get_diskname(disk));
		g_raid3_destroy_disk(disk);
		state = G_RAID3_DISK_STATE_NONE;
		/* Return immediately, because disk was destroyed. */
		return (state);
	}
	G_RAID3_DEBUG(3, "State for %s disk: %s.",
	    g_raid3_get_diskname(disk), g_raid3_disk_state2str(state));
	return (state);
}
1971 
1972 /*
1973  * Update device state.
1974  */
1975 static void
1976 g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force)
1977 {
1978 	struct g_raid3_disk *disk;
1979 	u_int state;
1980 
1981 	g_topology_assert();
1982 
1983 	switch (sc->sc_state) {
1984 	case G_RAID3_DEVICE_STATE_STARTING:
1985 	    {
1986 		u_int n, ndirty, ndisks, syncid;
1987 
1988 		KASSERT(sc->sc_provider == NULL,
1989 		    ("Non-NULL provider in STARTING state (%s).", sc->sc_name));
1990 		/*
1991 		 * Are we ready? We are, if all disks are connected or
1992 		 * one disk is missing and 'force' is true.
1993 		 */
1994 		if (g_raid3_ndisks(sc, -1) + force == sc->sc_ndisks) {
1995 			if (!force)
1996 				callout_drain(&sc->sc_callout);
1997 		} else {
1998 			if (force) {
1999 				/*
2000 				 * Timeout expired, so destroy device.
2001 				 */
2002 				sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
2003 			}
2004 			return;
2005 		}
2006 
2007 		/*
2008 		 * There must be at least 'sc->sc_ndisks - 1' components
2009 		 * with the same syncid and without SYNCHRONIZING flag.
2010 		 */
2011 
2012 		/*
2013 		 * Find the biggest syncid, number of valid components and
2014 		 * number of dirty components.
2015 		 */
2016 		ndirty = ndisks = syncid = 0;
2017 		for (n = 0; n < sc->sc_ndisks; n++) {
2018 			disk = &sc->sc_disks[n];
2019 			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
2020 				continue;
2021 			if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0)
2022 				ndirty++;
2023 			if (disk->d_sync.ds_syncid > syncid) {
2024 				syncid = disk->d_sync.ds_syncid;
2025 				ndisks = 0;
2026 			} else if (disk->d_sync.ds_syncid < syncid) {
2027 				continue;
2028 			}
2029 			if ((disk->d_flags &
2030 			    G_RAID3_DISK_FLAG_SYNCHRONIZING) != 0) {
2031 				continue;
2032 			}
2033 			ndisks++;
2034 		}
2035 		/*
2036 		 * Do we have enough valid components?
2037 		 */
2038 		if (ndisks + 1 < sc->sc_ndisks) {
2039 			G_RAID3_DEBUG(0,
2040 			    "Device %s is broken, too few valid components.",
2041 			    sc->sc_name);
2042 			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
2043 			return;
2044 		}
2045 		/*
2046 		 * If there is one DIRTY component and all disks are present,
2047 		 * mark it for synchronization. If there is more than one DIRTY
2048 		 * component, mark parity component for synchronization.
2049 		 */
2050 		if (ndisks == sc->sc_ndisks && ndirty == 1) {
2051 			for (n = 0; n < sc->sc_ndisks; n++) {
2052 				disk = &sc->sc_disks[n];
2053 				if ((disk->d_flags &
2054 				    G_RAID3_DISK_FLAG_DIRTY) == 0) {
2055 					continue;
2056 				}
2057 				disk->d_flags |=
2058 				    G_RAID3_DISK_FLAG_SYNCHRONIZING;
2059 			}
2060 		} else if (ndisks == sc->sc_ndisks && ndirty > 1) {
2061 			disk = &sc->sc_disks[sc->sc_ndisks - 1];
2062 			disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
2063 		}
2064 
2065 		sc->sc_syncid = syncid;
2066 		if (force) {
2067 			/* Remember to bump syncid on first write. */
2068 			sc->sc_bump_syncid = G_RAID3_BUMP_ON_FIRST_WRITE;
2069 		}
2070 		if (ndisks == sc->sc_ndisks)
2071 			state = G_RAID3_DEVICE_STATE_COMPLETE;
2072 		else /* if (ndisks == sc->sc_ndisks - 1) */
2073 			state = G_RAID3_DEVICE_STATE_DEGRADED;
2074 		G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.",
2075 		    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
2076 		    g_raid3_device_state2str(state));
2077 		sc->sc_state = state;
2078 		for (n = 0; n < sc->sc_ndisks; n++) {
2079 			disk = &sc->sc_disks[n];
2080 			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
2081 				continue;
2082 			state = g_raid3_determine_state(disk);
2083 			g_raid3_event_send(disk, state, G_RAID3_EVENT_DONTWAIT);
2084 			if (state == G_RAID3_DISK_STATE_STALE) {
2085 				sc->sc_bump_syncid =
2086 				    G_RAID3_BUMP_ON_FIRST_WRITE;
2087 			}
2088 		}
2089 		break;
2090 	    }
2091 	case G_RAID3_DEVICE_STATE_DEGRADED:
2092 		/*
2093 		 * Bump syncid here, if we need to do it immediately.
2094 		 */
2095 		if (sc->sc_bump_syncid == G_RAID3_BUMP_IMMEDIATELY) {
2096 			sc->sc_bump_syncid = 0;
2097 			g_raid3_bump_syncid(sc);
2098 		}
2099 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
2100 			return;
2101 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) <
2102 		    sc->sc_ndisks - 1) {
2103 			if (sc->sc_provider != NULL)
2104 				g_raid3_destroy_provider(sc);
2105 			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
2106 			return;
2107 		}
2108 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
2109 		    sc->sc_ndisks) {
2110 			state = G_RAID3_DEVICE_STATE_COMPLETE;
2111 			G_RAID3_DEBUG(1,
2112 			    "Device %s state changed from %s to %s.",
2113 			    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
2114 			    g_raid3_device_state2str(state));
2115 			sc->sc_state = state;
2116 		}
2117 		if (sc->sc_provider == NULL)
2118 			g_raid3_launch_provider(sc);
2119 		break;
2120 	case G_RAID3_DEVICE_STATE_COMPLETE:
2121 		/*
2122 		 * Bump syncid here, if we need to do it immediately.
2123 		 */
2124 		if (sc->sc_bump_syncid == G_RAID3_BUMP_IMMEDIATELY) {
2125 			sc->sc_bump_syncid = 0;
2126 			g_raid3_bump_syncid(sc);
2127 		}
2128 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
2129 			return;
2130 		KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) >=
2131 		    sc->sc_ndisks - 1,
2132 		    ("Too few ACTIVE components in COMPLETE state (device %s).",
2133 		    sc->sc_name));
2134 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
2135 		    sc->sc_ndisks - 1) {
2136 			state = G_RAID3_DEVICE_STATE_DEGRADED;
2137 			G_RAID3_DEBUG(1,
2138 			    "Device %s state changed from %s to %s.",
2139 			    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
2140 			    g_raid3_device_state2str(state));
2141 			sc->sc_state = state;
2142 		}
2143 		if (sc->sc_provider == NULL)
2144 			g_raid3_launch_provider(sc);
2145 		break;
2146 	default:
2147 		KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name,
2148 		    g_raid3_device_state2str(sc->sc_state)));
2149 		break;
2150 	}
2151 }
2152 
/*
 * Update disk state and device state if needed.
 */
/*
 * Log a disk state transition.  Relies on the locals `disk', `sc' and
 * `state' being in scope at the expansion site (see g_raid3_update_disk).
 */
#define	DISK_STATE_CHANGED()	G_RAID3_DEBUG(1,			\
	"Disk %s state changed from %s to %s (device %s).",		\
	g_raid3_get_diskname(disk),					\
	g_raid3_disk_state2str(disk->d_state),				\
	g_raid3_disk_state2str(state), sc->sc_name)
/*
 * Apply the requested state transition to the given disk, asserting that
 * the transition is legal for the current disk and device states, and
 * perform the side effects each state requires (metadata updates, starting
 * or stopping synchronization, destroying the disk).  Always returns 0.
 */
static int
g_raid3_update_disk(struct g_raid3_disk *disk, u_int state)
{
	struct g_raid3_softc *sc;

	g_topology_assert();

	sc = disk->d_softc;
again:
	/* Re-entry point: the NEW case may recompute `state' and jump back. */
	G_RAID3_DEBUG(3, "Changing disk %s state from %s to %s.",
	    g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state),
	    g_raid3_disk_state2str(state));
	switch (state) {
	case G_RAID3_DISK_STATE_NEW:
		/*
		 * Possible scenarios:
		 * 1. New disk arrive.
		 */
		/* Previous state should be NONE. */
		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NONE,
		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		DISK_STATE_CHANGED();

		disk->d_state = state;
		G_RAID3_DEBUG(0, "Device %s: provider %s detected.",
		    sc->sc_name, g_raid3_get_diskname(disk));
		/* In STARTING state the device-level logic takes over later. */
		if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING)
			break;
		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		/*
		 * Device is already running: immediately resolve the disk's
		 * real target state and process it in the same call.
		 */
		state = g_raid3_determine_state(disk);
		if (state != G_RAID3_DISK_STATE_NONE)
			goto again;
		break;
	case G_RAID3_DISK_STATE_ACTIVE:
		/*
		 * Possible scenarios:
		 * 1. New disk does not need synchronization.
		 * 2. Synchronization process finished successfully.
		 */
		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		/* Previous state should be NEW or SYNCHRONIZING. */
		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW ||
		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		DISK_STATE_CHANGED();

		if (disk->d_state == G_RAID3_DISK_STATE_NEW)
			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
		else if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
			/* Synchronization finished: clear flags, stop sync. */
			disk->d_flags &= ~G_RAID3_DISK_FLAG_SYNCHRONIZING;
			disk->d_flags &= ~G_RAID3_DISK_FLAG_FORCE_SYNC;
			g_raid3_sync_stop(sc, 0);
		}
		disk->d_state = state;
		/* Reset synchronization progress counters. */
		disk->d_sync.ds_offset = 0;
		disk->d_sync.ds_offset_done = 0;
		g_raid3_update_access(disk);
		g_raid3_update_metadata(disk);
		G_RAID3_DEBUG(0, "Device %s: provider %s activated.",
		    sc->sc_name, g_raid3_get_diskname(disk));
		break;
	case G_RAID3_DISK_STATE_STALE:
		/*
		 * Possible scenarios:
		 * 1. Stale disk was connected.
		 */
		/* Previous state should be NEW. */
		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		/*
		 * STALE state is only possible if device is marked
		 * NOAUTOSYNC.
		 */
		KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		DISK_STATE_CHANGED();

		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
		disk->d_state = state;
		g_raid3_update_metadata(disk);
		G_RAID3_DEBUG(0, "Device %s: provider %s is stale.",
		    sc->sc_name, g_raid3_get_diskname(disk));
		break;
	case G_RAID3_DISK_STATE_SYNCHRONIZING:
		/*
		 * Possible scenarios:
		 * 1. Disk which needs synchronization was connected.
		 */
		/* Previous state should be NEW. */
		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
		    g_raid3_device_state2str(sc->sc_state),
		    g_raid3_get_diskname(disk),
		    g_raid3_disk_state2str(disk->d_state)));
		DISK_STATE_CHANGED();

		if (disk->d_state == G_RAID3_DISK_STATE_NEW)
			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
		disk->d_state = state;
		/* Start synchronization only once the provider exists. */
		if (sc->sc_provider != NULL) {
			g_raid3_sync_start(sc);
			g_raid3_update_metadata(disk);
		}
		break;
	case G_RAID3_DISK_STATE_DISCONNECTED:
		/*
		 * Possible scenarios:
		 * 1. Device wasn't running yet, but disk disappear.
		 * 2. Disk was active and disapppear.
		 * 3. Disk disappear during synchronization process.
		 */
		if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
			/*
			 * Previous state should be ACTIVE, STALE or
			 * SYNCHRONIZING.
			 */
			KASSERT(disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
			    disk->d_state == G_RAID3_DISK_STATE_STALE ||
			    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
			    ("Wrong disk state (%s, %s).",
			    g_raid3_get_diskname(disk),
			    g_raid3_disk_state2str(disk->d_state)));
		} else if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) {
			/* Previous state should be NEW. */
			KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
			    ("Wrong disk state (%s, %s).",
			    g_raid3_get_diskname(disk),
			    g_raid3_disk_state2str(disk->d_state)));
			/*
			 * Reset bumping syncid if disk disappeared in STARTING
			 * state.
			 */
			if (sc->sc_bump_syncid == G_RAID3_BUMP_ON_FIRST_WRITE)
				sc->sc_bump_syncid = 0;
#ifdef	INVARIANTS
		} else {
			/* Any other device state here is a logic error. */
			KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).",
			    sc->sc_name,
			    g_raid3_device_state2str(sc->sc_state),
			    g_raid3_get_diskname(disk),
			    g_raid3_disk_state2str(disk->d_state)));
#endif
		}
		DISK_STATE_CHANGED();
		G_RAID3_DEBUG(0, "Device %s: provider %s disconnected.",
		    sc->sc_name, g_raid3_get_diskname(disk));

		/* Tear the disk down completely; `disk' is invalid after. */
		g_raid3_destroy_disk(disk);
		break;
	default:
		KASSERT(1 == 0, ("Unknown state (%u).", state));
		break;
	}
	return (0);
}
#undef	DISK_STATE_CHANGED
2344 
/*
 * Read and decode the RAID3 metadata stored in the last sector of the
 * consumer's provider.  On success fills in *md and returns 0; otherwise
 * returns an errno (EINVAL for a missing magic string).
 */
static int
g_raid3_read_metadata(struct g_consumer *cp, struct g_raid3_metadata *md)
{
	struct g_provider *pp;
	u_char *buf;
	int error;

	g_topology_assert();

	/* Gain read access for the duration of the metadata read. */
	error = g_access(cp, 1, 0, 0);
	if (error != 0)
		return (error);
	pp = cp->provider;
	/* The read sleeps, so drop the topology lock around it. */
	g_topology_unlock();
	/* Metadata are stored on last sector. */
	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
	    &error);
	g_topology_lock();
	if (buf == NULL) {
		g_access(cp, -1, 0, 0);
		return (error);
	}
	/*
	 * NOTE(review): g_read_data() reports failure by returning NULL, so
	 * a non-NULL buffer with a non-zero error looks unreachable - confirm
	 * before relying on this branch.
	 */
	if (error != 0) {
		g_access(cp, -1, 0, 0);
		g_free(buf);
		return (error);
	}
	error = g_access(cp, -1, 0, 0);
	KASSERT(error == 0, ("Cannot decrease access count for %s.", pp->name));

	/* Decode metadata. */
	error = raid3_metadata_decode(buf, md);
	g_free(buf);
	/* No RAID3 magic at all: not ours, reject quietly. */
	if (strcmp(md->md_magic, G_RAID3_MAGIC) != 0)
		return (EINVAL);
	if (error != 0) {
		G_RAID3_DEBUG(1, "MD5 metadata hash mismatch for provider %s.",
		    cp->provider->name);
		return (error);
	}

	return (0);
}
2388 
/*
 * Validate on-disk metadata read from provider pp against the already
 * configured device sc.  Returns 0 if the component may join the device,
 * EEXIST if its slot is already occupied, EINVAL for any inconsistency.
 * Each check logs a debug message naming the offending field.
 */
static int
g_raid3_check_metadata(struct g_raid3_softc *sc, struct g_provider *pp,
    struct g_raid3_metadata *md)
{

	/* Component number must fit in the device's disk array. */
	if (md->md_no >= sc->sc_ndisks) {
		G_RAID3_DEBUG(1, "Invalid disk %s number (no=%u), skipping.",
		    pp->name, md->md_no);
		return (EINVAL);
	}
	if (sc->sc_disks[md->md_no].d_state != G_RAID3_DISK_STATE_NODISK) {
		G_RAID3_DEBUG(1, "Disk %s (no=%u) already exists, skipping.",
		    pp->name, md->md_no);
		return (EEXIST);
	}
	if (md->md_all != sc->sc_ndisks) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_all", pp->name, sc->sc_name);
		return (EINVAL);
	}
	if (md->md_mediasize != sc->sc_mediasize) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_mediasize", pp->name, sc->sc_name);
		return (EINVAL);
	}
	/* Media size must divide evenly among the data components. */
	if ((md->md_mediasize % (sc->sc_ndisks - 1)) != 0) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_mediasize", pp->name, sc->sc_name);
		return (EINVAL);
	}
	/* Each component must be large enough to hold its share. */
	if ((sc->sc_mediasize / (sc->sc_ndisks - 1)) > pp->mediasize) {
		G_RAID3_DEBUG(1,
		    "Invalid size of disk %s (device %s), skipping.", pp->name,
		    sc->sc_name);
		return (EINVAL);
	}
	/*
	 * NOTE(review): this appears to require the device sector size to be
	 * at least (ndisks - 1) component sectors; confirm the intended
	 * relation between md_sectorsize and pp->sectorsize.
	 */
	if ((md->md_sectorsize / pp->sectorsize) < sc->sc_ndisks - 1) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_sectorsize", pp->name, sc->sc_name);
		return (EINVAL);
	}
	if (md->md_sectorsize != sc->sc_sectorsize) {
		G_RAID3_DEBUG(1,
		    "Invalid '%s' field on disk %s (device %s), skipping.",
		    "md_sectorsize", pp->name, sc->sc_name);
		return (EINVAL);
	}
	if ((sc->sc_sectorsize % pp->sectorsize) != 0) {
		G_RAID3_DEBUG(1,
		    "Invalid sector size of disk %s (device %s), skipping.",
		    pp->name, sc->sc_name);
		return (EINVAL);
	}
	if ((md->md_mflags & ~G_RAID3_DEVICE_FLAG_MASK) != 0) {
		G_RAID3_DEBUG(1,
		    "Invalid device flags on disk %s (device %s), skipping.",
		    pp->name, sc->sc_name);
		return (EINVAL);
	}
	if ((md->md_mflags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
	    (md->md_mflags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0) {
		/*
		 * VERIFY and ROUND-ROBIN options are mutally exclusive.
		 */
		G_RAID3_DEBUG(1, "Both VERIFY and ROUND-ROBIN flags exist on "
		    "disk %s (device %s), skipping.", pp->name, sc->sc_name);
		return (EINVAL);
	}
	if ((md->md_dflags & ~G_RAID3_DISK_FLAG_MASK) != 0) {
		G_RAID3_DEBUG(1,
		    "Invalid disk flags on disk %s (device %s), skipping.",
		    pp->name, sc->sc_name);
		return (EINVAL);
	}
	return (0);
}
2469 
2470 static int
2471 g_raid3_add_disk(struct g_raid3_softc *sc, struct g_provider *pp,
2472     struct g_raid3_metadata *md)
2473 {
2474 	struct g_raid3_disk *disk;
2475 	int error;
2476 
2477 	g_topology_assert();
2478 	G_RAID3_DEBUG(2, "Adding disk %s.", pp->name);
2479 
2480 	error = g_raid3_check_metadata(sc, pp, md);
2481 	if (error != 0)
2482 		return (error);
2483 	disk = g_raid3_init_disk(sc, pp, md, &error);
2484 	if (disk == NULL)
2485 		return (error);
2486 	error = g_raid3_event_send(disk, G_RAID3_DISK_STATE_NEW,
2487 	    G_RAID3_EVENT_WAIT);
2488 	return (error);
2489 }
2490 
/*
 * GEOM ->access method for the RAID3 provider.  Forwards the access-count
 * deltas (acr/acw/ace) to every ACTIVE component and keeps the per-disk
 * DIRTY flag in sync with open-for-write transitions.  Returns 0 if at
 * least one component accepted the change, ENXIO otherwise.
 */
static int
g_raid3_access(struct g_provider *pp, int acr, int acw, int ace)
{
	struct g_raid3_softc *sc;
	struct g_raid3_disk *disk;
	int dcr, dcw, dce, err, error;
	u_int n;

	g_topology_assert();
	G_RAID3_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr,
	    acw, ace);

	/* Access counts on the provider after this request is applied. */
	dcr = pp->acr + acr;
	dcw = pp->acw + acw;
	dce = pp->ace + ace;

	/* On first open, grab an extra "exclusive" bit */
	if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0)
		ace++;
	/* ... and let go of it on last close */
	if (dcr == 0 && dcw == 0 && dce == 0)
		ace--;

	sc = pp->geom->softc;
	/*
	 * With no softc or too few ACTIVE components only pure release
	 * requests (all deltas <= 0) can be honored.
	 */
	if (sc == NULL ||
	    g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks - 1) {
		if (acr <= 0 && acw <= 0 && ace <= 0)
			return (0);
		else
			return (ENXIO);
	}
	error = ENXIO;
	for (n = 0; n < sc->sc_ndisks; n++) {
		disk = &sc->sc_disks[n];
		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
			continue;
		err = g_access(disk->d_consumer, acr, acw, ace);
		G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d",
		    g_raid3_get_diskname(disk), acr, acw, ace, err);
		if (err == 0) {
			/*
			 * Mark disk as dirty on open and unmark on close.
			 */
			if (pp->acw == 0 && dcw > 0) {
				G_RAID3_DEBUG(1,
				    "Disk %s (device %s) marked as dirty.",
				    g_raid3_get_diskname(disk), sc->sc_name);
				disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
				g_raid3_update_metadata(disk);
			} else if (pp->acw > 0 && dcw == 0) {
				G_RAID3_DEBUG(1,
				    "Disk %s (device %s) marked as clean.",
				    g_raid3_get_diskname(disk), sc->sc_name);
				disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
				g_raid3_update_metadata(disk);
			}
			error = 0;
		} else {
			/*
			 * Component rejected the access change: schedule a
			 * syncid bump on first write and disconnect it.
			 */
			sc->sc_bump_syncid = G_RAID3_BUMP_ON_FIRST_WRITE;
			g_raid3_event_send(disk,
			    G_RAID3_DISK_STATE_DISCONNECTED,
			    G_RAID3_EVENT_DONTWAIT);
		}
	}
	return (error);
}
2557 
2558 static struct g_geom *
2559 g_raid3_create(struct g_class *mp, const struct g_raid3_metadata *md)
2560 {
2561 	struct g_raid3_softc *sc;
2562 	struct g_geom *gp;
2563 	int error, timeout;
2564 	u_int n;
2565 
2566 	g_topology_assert();
2567 	G_RAID3_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id);
2568 
2569 	/* One disk is minimum. */
2570 	if (md->md_all < 1)
2571 		return (NULL);
2572 	/*
2573 	 * Action geom.
2574 	 */
2575 	gp = g_new_geomf(mp, "%s", md->md_name);
2576 	sc = malloc(sizeof(*sc), M_RAID3, M_WAITOK | M_ZERO);
2577 	sc->sc_disks = malloc(sizeof(struct g_raid3_disk) * md->md_all, M_RAID3,
2578 	    M_WAITOK | M_ZERO);
2579 	gp->start = g_raid3_start;
2580 	gp->spoiled = g_raid3_spoiled;
2581 	gp->orphan = g_raid3_orphan;
2582 	gp->access = g_raid3_access;
2583 	gp->dumpconf = g_raid3_dumpconf;
2584 
2585 	sc->sc_id = md->md_id;
2586 	sc->sc_mediasize = md->md_mediasize;
2587 	sc->sc_sectorsize = md->md_sectorsize;
2588 	sc->sc_ndisks = md->md_all;
2589 	sc->sc_round_robin = 0;
2590 	sc->sc_flags = md->md_mflags;
2591 	sc->sc_bump_syncid = 0;
2592 	for (n = 0; n < sc->sc_ndisks; n++)
2593 		sc->sc_disks[n].d_state = G_RAID3_DISK_STATE_NODISK;
2594 	bioq_init(&sc->sc_queue);
2595 	mtx_init(&sc->sc_queue_mtx, "graid3:queue", NULL, MTX_DEF);
2596 	TAILQ_INIT(&sc->sc_events);
2597 	mtx_init(&sc->sc_events_mtx, "graid3:events", NULL, MTX_DEF);
2598 	callout_init(&sc->sc_callout, CALLOUT_MPSAFE);
2599 	sc->sc_state = G_RAID3_DEVICE_STATE_STARTING;
2600 	gp->softc = sc;
2601 	sc->sc_geom = gp;
2602 	sc->sc_provider = NULL;
2603 	/*
2604 	 * Synchronization geom.
2605 	 */
2606 	gp = g_new_geomf(mp, "%s.sync", md->md_name);
2607 	gp->softc = sc;
2608 	gp->orphan = g_raid3_orphan;
2609 	sc->sc_sync.ds_geom = gp;
2610 	sc->sc_zone_64k = uma_zcreate("gr3:64k", 65536, NULL, NULL, NULL, NULL,
2611 	    UMA_ALIGN_PTR, 0);
2612 	uma_zone_set_max(sc->sc_zone_64k, g_raid3_n64k);
2613 	sc->sc_zone_16k = uma_zcreate("gr3:16k", 16384, NULL, NULL, NULL, NULL,
2614 	    UMA_ALIGN_PTR, 0);
2615 	uma_zone_set_max(sc->sc_zone_64k, g_raid3_n16k);
2616 	sc->sc_zone_4k = uma_zcreate("gr3:4k", 4096, NULL, NULL, NULL, NULL,
2617 	    UMA_ALIGN_PTR, 0);
2618 	uma_zone_set_max(sc->sc_zone_4k, g_raid3_n4k);
2619 	error = kthread_create(g_raid3_worker, sc, &sc->sc_worker, 0, 0,
2620 	    "g_raid3 %s", md->md_name);
2621 	if (error != 0) {
2622 		G_RAID3_DEBUG(1, "Cannot create kernel thread for %s.",
2623 		    sc->sc_name);
2624 		uma_zdestroy(sc->sc_zone_64k);
2625 		uma_zdestroy(sc->sc_zone_16k);
2626 		uma_zdestroy(sc->sc_zone_4k);
2627 		g_destroy_geom(sc->sc_sync.ds_geom);
2628 		mtx_destroy(&sc->sc_events_mtx);
2629 		mtx_destroy(&sc->sc_queue_mtx);
2630 		g_destroy_geom(sc->sc_geom);
2631 		free(sc->sc_disks, M_RAID3);
2632 		free(sc, M_RAID3);
2633 		return (NULL);
2634 	}
2635 
2636 	G_RAID3_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id);
2637 
2638 	/*
2639 	 * Run timeout.
2640 	 */
2641 	timeout = atomic_load_acq_int(&g_raid3_timeout);
2642 	callout_reset(&sc->sc_callout, timeout * hz, g_raid3_go, sc);
2643 	return (sc->sc_geom);
2644 }
2645 
/*
 * Destroy the device.  Refuses (EBUSY) if the provider is still open and
 * `force' is not set.  Sets DESTROY/WAIT flags, wakes the worker thread
 * and sleeps until it exits, then frees all device resources.  Returns 0
 * on success, ENXIO if sc is NULL, EBUSY as described above.
 */
int
g_raid3_destroy(struct g_raid3_softc *sc, boolean_t force)
{
	struct g_provider *pp;

	g_topology_assert();

	if (sc == NULL)
		return (ENXIO);
	pp = sc->sc_provider;
	if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
		if (force) {
			G_RAID3_DEBUG(0, "Device %s is still open, so it "
			    "can't be definitely removed.", pp->name);
		} else {
			G_RAID3_DEBUG(1,
			    "Device %s is still open (r%dw%de%d).", pp->name,
			    pp->acr, pp->acw, pp->ace);
			return (EBUSY);
		}
	}

	/* Tell the worker thread to shut the device down and exit. */
	sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
	sc->sc_flags |= G_RAID3_DEVICE_FLAG_WAIT;
	/* Drop the topology lock while waiting for the worker to exit. */
	g_topology_unlock();
	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
	mtx_lock(&sc->sc_queue_mtx);
	wakeup(sc);
	wakeup(&sc->sc_queue);
	mtx_unlock(&sc->sc_queue_mtx);
	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker);
	/* Poll until the worker clears sc_worker on its way out. */
	while (sc->sc_worker != NULL)
		tsleep(&sc->sc_worker, PRIBIO, "r3:destroy", hz / 5);
	G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker);
	g_topology_lock();
	g_raid3_destroy_device(sc);
	free(sc->sc_disks, M_RAID3);
	free(sc, M_RAID3);
	return (0);
}
2686 
/*
 * Orphan method for the temporary taste geom.  The taste consumer is
 * attached and detached within g_raid3_taste(), so this must never fire.
 */
static void
g_raid3_taste_orphan(struct g_consumer *cp)
{

	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
	    cp->provider->name));
}
2694 
/*
 * GEOM taste method: probe provider pp for RAID3 metadata, create the
 * device if it does not exist yet and add the provider as one of its
 * components.  Returns the device geom on success or NULL if the provider
 * is not ours or cannot be attached.
 */
static struct g_geom *
g_raid3_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
{
	struct g_raid3_metadata md;
	struct g_raid3_softc *sc;
	struct g_consumer *cp;
	struct g_geom *gp;
	int error;

	g_topology_assert();
	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
	G_RAID3_DEBUG(2, "Tasting %s.", pp->name);
	/* Skip providers with 0 sectorsize. */
	if (pp->sectorsize == 0)
		return (NULL);

	/* Temporary geom/consumer, used only to read the metadata sector. */
	gp = g_new_geomf(mp, "raid3:taste");
	/* This orphan function should be never called. */
	gp->orphan = g_raid3_taste_orphan;
	cp = g_new_consumer(gp);
	g_attach(cp, pp);
	error = g_raid3_read_metadata(cp, &md);
	g_detach(cp);
	g_destroy_consumer(cp);
	g_destroy_geom(gp);
	if (error != 0)
		return (NULL);
	gp = NULL;

	if (md.md_version > G_RAID3_VERSION) {
		printf("geom_raid3.ko module is too old to handle %s.\n",
		    pp->name);
		return (NULL);
	}
	/* Hardcoded provider name in metadata must match, if present. */
	if (md.md_provider[0] != '\0' && strcmp(md.md_provider, pp->name) != 0)
		return (NULL);
	if (g_raid3_debug >= 2)
		raid3_metadata_dump(&md);

	/*
	 * Let's check if device already exists.
	 */
	sc = NULL;
	LIST_FOREACH(gp, &mp->geom, geom) {
		sc = gp->softc;
		if (sc == NULL)
			continue;
		if (sc->sc_sync.ds_geom == gp)
			continue;
		if (strcmp(md.md_name, sc->sc_name) != 0)
			continue;
		/* Same name but different id: a stale/conflicting device. */
		if (md.md_id != sc->sc_id) {
			G_RAID3_DEBUG(0, "Device %s already configured.",
			    sc->sc_name);
			return (NULL);
		}
		break;
	}
	if (gp == NULL) {
		gp = g_raid3_create(mp, &md);
		if (gp == NULL) {
			G_RAID3_DEBUG(0, "Cannot create device %s.",
			    md.md_name);
			return (NULL);
		}
		sc = gp->softc;
	}
	G_RAID3_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
	error = g_raid3_add_disk(sc, pp, &md);
	if (error != 0) {
		G_RAID3_DEBUG(0, "Cannot add disk %s to %s (error=%d).",
		    pp->name, gp->name, error);
		/* If the device ended up with no disks at all, tear it down. */
		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NODISK) ==
		    sc->sc_ndisks) {
			g_raid3_destroy(sc, 1);
		}
		return (NULL);
	}
	return (gp);
}
2775 
2776 static int
2777 g_raid3_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused,
2778     struct g_geom *gp)
2779 {
2780 
2781 	return (g_raid3_destroy(gp->softc, 0));
2782 }
2783 
/*
 * GEOM dumpconf method: emit XML configuration for `geom list'/sysctl.
 * With cp != NULL it describes one component (type, number, sync progress,
 * flags, state); otherwise it describes the device as a whole.  Nothing is
 * emitted for providers or for the synchronization geom.
 */
static void
g_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp, struct g_provider *pp)
{
	struct g_raid3_softc *sc;

	g_topology_assert();

	sc = gp->softc;
	if (sc == NULL)
		return;
	/* Skip synchronization geom. */
	if (gp == sc->sc_sync.ds_geom)
		return;
	if (pp != NULL) {
		/* Nothing here. */
	} else if (cp != NULL) {
		struct g_raid3_disk *disk;

		disk = cp->private;
		if (disk == NULL)
			return;
		sbuf_printf(sb, "%s<Type>", indent);
		/* The last component holds parity; the rest hold data. */
		if (disk->d_no == sc->sc_ndisks - 1)
			sbuf_printf(sb, "PARITY");
		else
			sbuf_printf(sb, "DATA");
		sbuf_printf(sb, "</Type>\n");
		sbuf_printf(sb, "%s<Number>%u</Number>\n", indent,
		    (u_int)disk->d_no);
		if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
			sbuf_printf(sb, "%s<Synchronized>", indent);
			if (disk->d_sync.ds_offset_done == 0)
				sbuf_printf(sb, "0%%");
			else {
				/* Progress as percentage of component size. */
				sbuf_printf(sb, "%u%%",
				    (u_int)((disk->d_sync.ds_offset_done * 100) /
				    (sc->sc_mediasize / (sc->sc_ndisks - 1))));
			}
			sbuf_printf(sb, "</Synchronized>\n");
		}
		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent,
		    disk->d_sync.ds_syncid);
		sbuf_printf(sb, "%s<Flags>", indent);
		if (disk->d_flags == 0)
			sbuf_printf(sb, "NONE");
		else {
			int first = 1;

/* Emit a comma-separated flag name if `flag' is set in d_flags. */
#define	ADD_FLAG(flag, name)	do {					\
	if ((disk->d_flags & (flag)) != 0) {				\
		if (!first)						\
			sbuf_printf(sb, ", ");				\
		else							\
			first = 0;					\
		sbuf_printf(sb, name);					\
	}								\
} while (0)
			ADD_FLAG(G_RAID3_DISK_FLAG_DIRTY, "DIRTY");
			ADD_FLAG(G_RAID3_DISK_FLAG_HARDCODED, "HARDCODED");
			ADD_FLAG(G_RAID3_DISK_FLAG_SYNCHRONIZING,
			    "SYNCHRONIZING");
			ADD_FLAG(G_RAID3_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC");
#undef	ADD_FLAG
		}
		sbuf_printf(sb, "</Flags>\n");
		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
		    g_raid3_disk_state2str(disk->d_state));
	} else {
		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent, sc->sc_syncid);
		sbuf_printf(sb, "%s<Flags>", indent);
		if (sc->sc_flags == 0)
			sbuf_printf(sb, "NONE");
		else {
			int first = 1;

/* Emit a comma-separated flag name if `flag' is set in sc_flags. */
#define	ADD_FLAG(flag, name)	do {					\
	if ((sc->sc_flags & (flag)) != 0) {				\
		if (!first)						\
			sbuf_printf(sb, ", ");				\
		else							\
			first = 0;					\
		sbuf_printf(sb, name);					\
	}								\
} while (0)
			ADD_FLAG(G_RAID3_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC");
			ADD_FLAG(G_RAID3_DEVICE_FLAG_ROUND_ROBIN,
			    "ROUND-ROBIN");
			ADD_FLAG(G_RAID3_DEVICE_FLAG_VERIFY, "VERIFY");
#undef	ADD_FLAG
		}
		sbuf_printf(sb, "</Flags>\n");
		sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
		    sc->sc_ndisks);
		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
		    g_raid3_device_state2str(sc->sc_state));
	}
}
2883 
/* Register the g_raid3 class with the GEOM framework. */
DECLARE_GEOM_CLASS(g_raid3_class, g_raid3);
2885