xref: /freebsd/sys/geom/vinum/geom_vinum_plex.c (revision b4af4f93c682e445bf159f0d1ec90b636296c946)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2004, 2007 Lukas Ertl
5  * Copyright (c) 2007, 2009 Ulf Lilleengen
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include <sys/param.h>
34 #include <sys/bio.h>
35 #include <sys/lock.h>
36 #include <sys/malloc.h>
37 #include <sys/systm.h>
38 
39 #include <geom/geom.h>
40 #include <geom/geom_dbg.h>
41 #include <geom/vinum/geom_vinum_var.h>
42 #include <geom/vinum/geom_vinum_raid5.h>
43 #include <geom/vinum/geom_vinum.h>
44 
/* Forward declarations of the file-local helpers defined below. */
static int	gv_check_parity(struct gv_plex *, struct bio *,
		    struct gv_raid5_packet *);
static int	gv_normal_parity(struct gv_plex *, struct bio *,
		    struct gv_raid5_packet *);
static void	gv_plex_flush(struct gv_plex *);
static int	gv_plex_offset(struct gv_plex *, off_t, off_t, off_t *, off_t *,
		    int *, int);
static int 	gv_plex_normal_request(struct gv_plex *, struct bio *, off_t,
		    off_t,  caddr_t);
static void	gv_post_bio(struct gv_softc *, struct bio *);
55 
56 void
57 gv_plex_start(struct gv_plex *p, struct bio *bp)
58 {
59 	struct bio *cbp;
60 	struct gv_sd *s;
61 	struct gv_raid5_packet *wp;
62 	caddr_t addr;
63 	off_t bcount, boff, len;
64 
65 	bcount = bp->bio_length;
66 	addr = bp->bio_data;
67 	boff = bp->bio_offset;
68 
69 	/* Walk over the whole length of the request, we might split it up. */
70 	while (bcount > 0) {
71 		wp = NULL;
72 
73  		/*
74 		 * RAID5 plexes need special treatment, as a single request
75 		 * might involve several read/write sub-requests.
76  		 */
77 		if (p->org == GV_PLEX_RAID5) {
78 			wp = gv_raid5_start(p, bp, addr, boff, bcount);
79  			if (wp == NULL)
80  				return;
81 
82 			len = wp->length;
83 
84 			if (TAILQ_EMPTY(&wp->bits))
85 				g_free(wp);
86 			else if (wp->lockbase != -1)
87 				TAILQ_INSERT_TAIL(&p->packets, wp, list);
88 
89 		/*
90 		 * Requests to concatenated and striped plexes go straight
91 		 * through.
92 		 */
93 		} else {
94 			len = gv_plex_normal_request(p, bp, boff, bcount, addr);
95 		}
96 		if (len < 0)
97 			return;
98 
99 		bcount -= len;
100 		addr += len;
101 		boff += len;
102 	}
103 
104 	/*
105 	 * Fire off all sub-requests.  We get the correct consumer (== drive)
106 	 * to send each request to via the subdisk that was stored in
107 	 * cbp->bio_caller1.
108 	 */
109 	cbp = bioq_takefirst(p->bqueue);
110 	while (cbp != NULL) {
111 		/*
112 		 * RAID5 sub-requests need to come in correct order, otherwise
113 		 * we trip over the parity, as it might be overwritten by
114 		 * another sub-request.  We abuse cbp->bio_caller2 to mark
115 		 * potential overlap situations.
116 		 */
117 		if (cbp->bio_caller2 != NULL && gv_stripe_active(p, cbp)) {
118 			/* Park the bio on the waiting queue. */
119 			cbp->bio_pflags |= GV_BIO_ONHOLD;
120 			bioq_disksort(p->wqueue, cbp);
121 		} else {
122 			s = cbp->bio_caller1;
123 			g_io_request(cbp, s->drive_sc->consumer);
124 		}
125 		cbp = bioq_takefirst(p->bqueue);
126 	}
127 }
128 
129 static int
130 gv_plex_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off,
131     off_t *real_len, int *sdno, int growing)
132 {
133 	struct gv_sd *s;
134 	int i, sdcount;
135 	off_t len_left, stripeend, stripeno, stripestart;
136 
137 	switch (p->org) {
138 	case GV_PLEX_CONCAT:
139 		/*
140 		 * Find the subdisk where this request starts.  The subdisks in
141 		 * this list must be ordered by plex_offset.
142 		 */
143 		i = 0;
144 		LIST_FOREACH(s, &p->subdisks, in_plex) {
145 			if (s->plex_offset <= boff &&
146 			    s->plex_offset + s->size > boff) {
147 				*sdno = i;
148 				break;
149 			}
150 			i++;
151 		}
152 		if (s == NULL || s->drive_sc == NULL)
153 			return (GV_ERR_NOTFOUND);
154 
155 		/* Calculate corresponding offsets on disk. */
156 		*real_off = boff - s->plex_offset;
157 		len_left = s->size - (*real_off);
158 		KASSERT(len_left >= 0, ("gv_plex_offset: len_left < 0"));
159 		*real_len = (bcount > len_left) ? len_left : bcount;
160 		break;
161 
162 	case GV_PLEX_STRIPED:
163 		/* The number of the stripe where the request starts. */
164 		stripeno = boff / p->stripesize;
165 		KASSERT(stripeno >= 0, ("gv_plex_offset: stripeno < 0"));
166 
167 		/* Take growing subdisks into account when calculating. */
168 		sdcount = gv_sdcount(p, (boff >= p->synced));
169 
170 		if (!(boff + bcount <= p->synced) &&
171 		    (p->flags & GV_PLEX_GROWING) &&
172 		    !growing)
173 			return (GV_ERR_ISBUSY);
174 		*sdno = stripeno % sdcount;
175 
176 		KASSERT(sdno >= 0, ("gv_plex_offset: sdno < 0"));
177 		stripestart = (stripeno / sdcount) *
178 		    p->stripesize;
179 		KASSERT(stripestart >= 0, ("gv_plex_offset: stripestart < 0"));
180 		stripeend = stripestart + p->stripesize;
181 		*real_off = boff - (stripeno * p->stripesize) +
182 		    stripestart;
183 		len_left = stripeend - *real_off;
184 		KASSERT(len_left >= 0, ("gv_plex_offset: len_left < 0"));
185 
186 		*real_len = (bcount <= len_left) ? bcount : len_left;
187 		break;
188 
189 	default:
190 		return (GV_ERR_PLEXORG);
191 	}
192 	return (0);
193 }
194 
195 /*
196  * Prepare a normal plex request.
197  */
198 static int
199 gv_plex_normal_request(struct gv_plex *p, struct bio *bp, off_t boff,
200     off_t bcount,  caddr_t addr)
201 {
202 	struct gv_sd *s;
203 	struct bio *cbp;
204 	off_t real_len, real_off;
205 	int i, err, sdno;
206 
207 	s = NULL;
208 	sdno = -1;
209 	real_len = real_off = 0;
210 
211 	err = ENXIO;
212 
213 	if (p == NULL || LIST_EMPTY(&p->subdisks))
214 		goto bad;
215 
216 	err = gv_plex_offset(p, boff, bcount, &real_off,
217 	    &real_len, &sdno, (bp->bio_pflags & GV_BIO_GROW));
218 	/* If the request was blocked, put it into wait. */
219 	if (err == GV_ERR_ISBUSY) {
220 		bioq_disksort(p->rqueue, bp);
221 		return (-1); /* "Fail", and delay request. */
222 	}
223 	if (err) {
224 		err = ENXIO;
225 		goto bad;
226 	}
227 	err = ENXIO;
228 
229 	/* Find the right subdisk. */
230 	i = 0;
231 	LIST_FOREACH(s, &p->subdisks, in_plex) {
232 		if (i == sdno)
233 			break;
234 		i++;
235 	}
236 
237 	/* Subdisk not found. */
238 	if (s == NULL || s->drive_sc == NULL)
239 		goto bad;
240 
241 	/* Now check if we can handle the request on this subdisk. */
242 	switch (s->state) {
243 	case GV_SD_UP:
244 		/* If the subdisk is up, just continue. */
245 		break;
246 	case GV_SD_DOWN:
247 		if (bp->bio_pflags & GV_BIO_INTERNAL)
248 			G_VINUM_DEBUG(0, "subdisk must be in the stale state in"
249 			    " order to perform administrative requests");
250 		goto bad;
251 	case GV_SD_STALE:
252 		if (!(bp->bio_pflags & GV_BIO_SYNCREQ)) {
253 			G_VINUM_DEBUG(0, "subdisk stale, unable to perform "
254 			    "regular requests");
255 			goto bad;
256 		}
257 
258 		G_VINUM_DEBUG(1, "sd %s is initializing", s->name);
259 		gv_set_sd_state(s, GV_SD_INITIALIZING, GV_SETSTATE_FORCE);
260 		break;
261 	case GV_SD_INITIALIZING:
262 		if (bp->bio_cmd == BIO_READ)
263 			goto bad;
264 		break;
265 	default:
266 		/* All other subdisk states mean it's not accessible. */
267 		goto bad;
268 	}
269 
270 	/* Clone the bio and adjust the offsets and sizes. */
271 	cbp = g_clone_bio(bp);
272 	if (cbp == NULL) {
273 		err = ENOMEM;
274 		goto bad;
275 	}
276 	cbp->bio_offset = real_off + s->drive_offset;
277 	cbp->bio_length = real_len;
278 	cbp->bio_data = addr;
279 	cbp->bio_done = gv_done;
280 	cbp->bio_caller1 = s;
281 	s->drive_sc->active++;
282 
283 	/* Store the sub-requests now and let others issue them. */
284 	bioq_insert_tail(p->bqueue, cbp);
285 	return (real_len);
286 bad:
287 	G_VINUM_LOGREQ(0, bp, "plex request failed.");
288 	/* Building the sub-request failed. If internal BIO, do not deliver. */
289 	if (bp->bio_pflags & GV_BIO_INTERNAL) {
290 		if (bp->bio_pflags & GV_BIO_MALLOC)
291 			g_free(bp->bio_data);
292 		g_destroy_bio(bp);
293 		p->flags &= ~(GV_PLEX_SYNCING | GV_PLEX_REBUILDING |
294 		    GV_PLEX_GROWING);
295 		return (-1);
296 	}
297 	g_io_deliver(bp, err);
298 	return (-1);
299 }
300 
301 /*
302  * Handle a completed request to a striped or concatenated plex.
303  */
304 void
305 gv_plex_normal_done(struct gv_plex *p, struct bio *bp)
306 {
307 	struct bio *pbp;
308 
309 	pbp = bp->bio_parent;
310 	if (pbp->bio_error == 0)
311 		pbp->bio_error = bp->bio_error;
312 	g_destroy_bio(bp);
313 	pbp->bio_inbed++;
314 	if (pbp->bio_children == pbp->bio_inbed) {
315 		/* Just set it to length since multiple plexes will
316 		 * screw things up. */
317 		pbp->bio_completed = pbp->bio_length;
318 		if (pbp->bio_pflags & GV_BIO_SYNCREQ)
319 			gv_sync_complete(p, pbp);
320 		else if (pbp->bio_pflags & GV_BIO_GROW)
321 			gv_grow_complete(p, pbp);
322 		else
323 			g_io_deliver(pbp, pbp->bio_error);
324 	}
325 }
326 
/*
 * Handle a completed request to a RAID-5 plex.
 *
 * bp is one finished sub-request; it is always destroyed at the end of this
 * function (together with its data if GV_BIO_MALLOC is set in bio_cflags).
 * bp->bio_caller2, if set, is the RAID5 packet the sub-request belongs to;
 * the packet's "bits" list holds the sub-requests that are still outstanding.
 * When the last outstanding sub-request of a packet is in, the packet's
 * length is credited to the parent's bio_completed, the packet is freed and
 * every bio parked on p->wqueue is re-posted to the softc's queue.
 */
void
gv_plex_raid5_done(struct gv_plex *p, struct bio *bp)
{
	struct gv_softc *sc;
	struct bio *cbp, *pbp;
	struct gv_bioq *bq, *bq2;
	struct gv_raid5_packet *wp;
	off_t completed;
	int i;

	completed = 0;
	sc = p->vinumconf;
	wp = bp->bio_caller2;

	switch (bp->bio_parent->bio_cmd) {
	case BIO_READ:
		/* No packet attached: the sub-request counts for itself. */
		if (wp == NULL) {
			completed = bp->bio_completed;
			break;
		}

		/*
		 * Take this sub-request off the packet and XOR its data into
		 * the packet's buffer.
		 */
		TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
			if (bq->bp != bp)
				continue;
			TAILQ_REMOVE(&wp->bits, bq, queue);
			g_free(bq);
			for (i = 0; i < wp->length; i++)
				wp->data[i] ^= bp->bio_data[i];
			break;
		}
		/* Nothing outstanding any more: the packet is done. */
		if (TAILQ_EMPTY(&wp->bits)) {
			completed = wp->length;
			/*
			 * gv_plex_start() only puts packets with a lockbase
			 * other than -1 on p->packets.
			 */
			if (wp->lockbase != -1) {
				TAILQ_REMOVE(&p->packets, wp, list);
				/* Bring the waiting bios back into the game. */
				pbp = bioq_takefirst(p->wqueue);
				while (pbp != NULL) {
					gv_post_bio(sc, pbp);
					pbp = bioq_takefirst(p->wqueue);
				}
			}
			g_free(wp);
		}

		break;

	case BIO_WRITE:
		/* XXX can this ever happen? */
		if (wp == NULL) {
			completed = bp->bio_completed;
			break;
		}

		/* Check if we need to handle parity data. */
		TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
			if (bq->bp != bp)
				continue;
			TAILQ_REMOVE(&wp->bits, bq, queue);
			g_free(bq);
			/* Fold the data into the pending parity, if any. */
			cbp = wp->parity;
			if (cbp != NULL) {
				for (i = 0; i < wp->length; i++)
					cbp->bio_data[i] ^= bp->bio_data[i];
			}
			break;
		}

		/* Handle parity data. */
		if (TAILQ_EMPTY(&wp->bits)) {
			/*
			 * Both helpers return non-zero only when they did not
			 * issue a further bio for this packet.
			 */
			if (bp->bio_parent->bio_pflags & GV_BIO_CHECK)
				i = gv_check_parity(p, bp, wp);
			else
				i = gv_normal_parity(p, bp, wp);

			/* All of our sub-requests have finished. */
			if (i) {
				completed = wp->length;
				TAILQ_REMOVE(&p->packets, wp, list);
				/* Bring the waiting bios back into the game. */
				pbp = bioq_takefirst(p->wqueue);
				while (pbp != NULL) {
					gv_post_bio(sc, pbp);
					pbp = bioq_takefirst(p->wqueue);
				}
				g_free(wp);
			}
		}

		break;
	}

	/* Propagate the first error and the progress to the parent. */
	pbp = bp->bio_parent;
	if (pbp->bio_error == 0)
		pbp->bio_error = bp->bio_error;
	pbp->bio_completed += completed;

	/* When the original request is finished, we deliver it. */
	pbp->bio_inbed++;
	if (pbp->bio_inbed == pbp->bio_children) {
		/* Hand it over for checking or delivery. */
		if (pbp->bio_cmd == BIO_WRITE &&
		    (pbp->bio_pflags & GV_BIO_CHECK)) {
			gv_parity_complete(p, pbp);
		} else if (pbp->bio_cmd == BIO_WRITE &&
		    (pbp->bio_pflags & GV_BIO_REBUILD)) {
			gv_rebuild_complete(p, pbp);
		} else if (pbp->bio_pflags & GV_BIO_INIT) {
			gv_init_complete(p, pbp);
		} else if (pbp->bio_pflags & GV_BIO_SYNCREQ) {
			gv_sync_complete(p, pbp);
		} else if (pbp->bio_pflags & GV_BIO_GROW) {
			gv_grow_complete(p, pbp);
		} else {
			g_io_deliver(pbp, pbp->bio_error);
		}
	}

	/*
	 * Clean up what we allocated.  Note that this tests bio_cflags of the
	 * sub-request, not bio_pflags.
	 */
	if (bp->bio_cflags & GV_BIO_MALLOC)
		g_free(bp->bio_data);
	g_destroy_bio(bp);
}
452 
/*
 * Parity step of a completed RAID5 packet belonging to a parity check
 * (parent has GV_BIO_CHECK set).
 *
 * p  - plex; not used here.
 * bp - the sub-request that just completed.
 * wp - the packet bp belongs to.
 *
 * If the packet still has a "waiting" bio, that bio is issued.  Otherwise, if
 * it has a parity bio, bp's data is compared with the parity bio's data; on a
 * mismatch the parent's error is set to EAGAIN and, if the parent has
 * GV_BIO_PARITY set, the parity bio is issued.
 *
 * Returns 0 if a further bio was issued for this packet, 1 otherwise.
 */
static int
gv_check_parity(struct gv_plex *p, struct bio *bp, struct gv_raid5_packet *wp)
{
	struct bio *pbp;
	struct gv_sd *s;
	int err, finished, i;

	err = 0;
	finished = 1;

	if (wp->waiting != NULL) {
		/* Release the held-back bio to its subdisk's drive. */
		pbp = wp->waiting;
		wp->waiting = NULL;
		s = pbp->bio_caller1;
		g_io_request(pbp, s->drive_sc->consumer);
		finished = 0;

	} else if (wp->parity != NULL) {
		pbp = wp->parity;
		wp->parity = NULL;

		/* Check if the parity is correct. */
		for (i = 0; i < wp->length; i++) {
			if (bp->bio_data[i] != pbp->bio_data[i]) {
				err = 1;
				break;
			}
		}

		/* The parity is not correct... */
		if (err) {
			bp->bio_parent->bio_error = EAGAIN;

			/* ... but we rebuild it. */
			if (bp->bio_parent->bio_pflags & GV_BIO_PARITY) {
				s = pbp->bio_caller1;
				g_io_request(pbp, s->drive_sc->consumer);
				finished = 0;
			}
		}

		/*
		 * Clean up the BIO we would have used for rebuilding the
		 * parity.
		 * NOTE(review): the parent's bio_inbed is bumped here,
		 * presumably because the parity bio counts as one of the
		 * parent's children and will now never complete -- confirm
		 * against the code that creates wp->parity.
		 */
		if (finished) {
			bp->bio_parent->bio_inbed++;
			g_destroy_bio(pbp);
		}

	}

	return (finished);
}
507 
508 static int
509 gv_normal_parity(struct gv_plex *p, struct bio *bp, struct gv_raid5_packet *wp)
510 {
511 	struct bio *cbp, *pbp;
512 	struct gv_sd *s;
513 	int finished, i;
514 
515 	finished = 1;
516 
517 	if (wp->waiting != NULL) {
518 		pbp = wp->waiting;
519 		wp->waiting = NULL;
520 		cbp = wp->parity;
521 		for (i = 0; i < wp->length; i++)
522 			cbp->bio_data[i] ^= pbp->bio_data[i];
523 		s = pbp->bio_caller1;
524 		g_io_request(pbp, s->drive_sc->consumer);
525 		finished = 0;
526 
527 	} else if (wp->parity != NULL) {
528 		cbp = wp->parity;
529 		wp->parity = NULL;
530 		s = cbp->bio_caller1;
531 		g_io_request(cbp, s->drive_sc->consumer);
532 		finished = 0;
533 	}
534 
535 	return (finished);
536 }
537 
538 /* Flush the queue with delayed requests. */
539 static void
540 gv_plex_flush(struct gv_plex *p)
541 {
542 	struct gv_softc *sc;
543 	struct bio *bp;
544 
545 	sc = p->vinumconf;
546 	bp = bioq_takefirst(p->rqueue);
547 	while (bp != NULL) {
548 		gv_plex_start(p, bp);
549 		bp = bioq_takefirst(p->rqueue);
550 	}
551 }
552 
/*
 * Queue bp on the softc's bqueue_down queue and wake up whoever sleeps on
 * sc.  The queue is modified with sc->bqueue_mtx held.
 */
static void
gv_post_bio(struct gv_softc *sc, struct bio *bp)
{

	KASSERT(sc != NULL, ("NULL sc"));
	KASSERT(bp != NULL, ("NULL bp"));
	mtx_lock(&sc->bqueue_mtx);
	bioq_disksort(sc->bqueue_down, bp);
	/* The wakeup is issued while the queue mutex is still held. */
	wakeup(sc);
	mtx_unlock(&sc->bqueue_mtx);
}
564 
565 int
566 gv_sync_request(struct gv_plex *from, struct gv_plex *to, off_t offset,
567     off_t length, int type, caddr_t data)
568 {
569 	struct gv_softc *sc;
570 	struct bio *bp;
571 
572 	KASSERT(from != NULL, ("NULL from"));
573 	KASSERT(to != NULL, ("NULL to"));
574 	sc = from->vinumconf;
575 	KASSERT(sc != NULL, ("NULL sc"));
576 
577 	bp = g_new_bio();
578 	if (bp == NULL) {
579 		G_VINUM_DEBUG(0, "sync from '%s' failed at offset "
580 		    " %jd; out of memory", from->name, offset);
581 		return (ENOMEM);
582 	}
583 	bp->bio_length = length;
584 	bp->bio_done = NULL;
585 	bp->bio_pflags |= GV_BIO_SYNCREQ;
586 	bp->bio_offset = offset;
587 	bp->bio_caller1 = from;
588 	bp->bio_caller2 = to;
589 	bp->bio_cmd = type;
590 	if (data == NULL)
591 		data = g_malloc(length, M_WAITOK);
592 	bp->bio_pflags |= GV_BIO_MALLOC; /* Free on the next run. */
593 	bp->bio_data = data;
594 
595 	/* Send down next. */
596 	gv_post_bio(sc, bp);
597 	//gv_plex_start(from, bp);
598 	return (0);
599 }
600 
601 /*
602  * Handle a finished plex sync bio.
603  */
604 int
605 gv_sync_complete(struct gv_plex *to, struct bio *bp)
606 {
607 	struct gv_plex *from, *p;
608 	struct gv_sd *s;
609 	struct gv_volume *v;
610 	struct gv_softc *sc;
611 	off_t offset;
612 	int err;
613 
614 	g_topology_assert_not();
615 
616 	err = 0;
617 	KASSERT(to != NULL, ("NULL to"));
618 	KASSERT(bp != NULL, ("NULL bp"));
619 	from = bp->bio_caller2;
620 	KASSERT(from != NULL, ("NULL from"));
621 	v = to->vol_sc;
622 	KASSERT(v != NULL, ("NULL v"));
623 	sc = v->vinumconf;
624 	KASSERT(sc != NULL, ("NULL sc"));
625 
626 	/* If it was a read, write it. */
627 	if (bp->bio_cmd == BIO_READ) {
628 		err = gv_sync_request(from, to, bp->bio_offset, bp->bio_length,
629 	    	    BIO_WRITE, bp->bio_data);
630 	/* If it was a write, read the next one. */
631 	} else if (bp->bio_cmd == BIO_WRITE) {
632 		if (bp->bio_pflags & GV_BIO_MALLOC)
633 			g_free(bp->bio_data);
634 		to->synced += bp->bio_length;
635 		/* If we're finished, clean up. */
636 		if (bp->bio_offset + bp->bio_length >= from->size) {
637 			G_VINUM_DEBUG(1, "syncing of %s from %s completed",
638 			    to->name, from->name);
639 			/* Update our state. */
640 			LIST_FOREACH(s, &to->subdisks, in_plex)
641 				gv_set_sd_state(s, GV_SD_UP, 0);
642 			gv_update_plex_state(to);
643 			to->flags &= ~GV_PLEX_SYNCING;
644 			to->synced = 0;
645 			gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0);
646 		} else {
647 			offset = bp->bio_offset + bp->bio_length;
648 			err = gv_sync_request(from, to, offset,
649 			    MIN(bp->bio_length, from->size - offset),
650 			    BIO_READ, NULL);
651 		}
652 	}
653 	g_destroy_bio(bp);
654 	/* Clean up if there was an error. */
655 	if (err) {
656 		to->flags &= ~GV_PLEX_SYNCING;
657 		G_VINUM_DEBUG(0, "error syncing plexes: error code %d", err);
658 	}
659 
660 	/* Check if all plexes are synced, and lower refcounts. */
661 	g_topology_lock();
662 	LIST_FOREACH(p, &v->plexes, in_volume) {
663 		if (p->flags & GV_PLEX_SYNCING) {
664 			g_topology_unlock();
665 			return (-1);
666 		}
667 	}
668 	/* If we came here, all plexes are synced, and we're free. */
669 	gv_access(v->provider, -1, -1, 0);
670 	g_topology_unlock();
671 	G_VINUM_DEBUG(1, "plex sync completed");
672 	gv_volume_flush(v);
673 	return (0);
674 }
675 
676 /*
677  * Create a new bio struct for the next grow request.
678  */
679 int
680 gv_grow_request(struct gv_plex *p, off_t offset, off_t length, int type,
681     caddr_t data)
682 {
683 	struct gv_softc *sc;
684 	struct bio *bp;
685 
686 	KASSERT(p != NULL, ("gv_grow_request: NULL p"));
687 	sc = p->vinumconf;
688 	KASSERT(sc != NULL, ("gv_grow_request: NULL sc"));
689 
690 	bp = g_new_bio();
691 	if (bp == NULL) {
692 		G_VINUM_DEBUG(0, "grow of %s failed creating bio: "
693 		    "out of memory", p->name);
694 		return (ENOMEM);
695 	}
696 
697 	bp->bio_cmd = type;
698 	bp->bio_done = NULL;
699 	bp->bio_error = 0;
700 	bp->bio_caller1 = p;
701 	bp->bio_offset = offset;
702 	bp->bio_length = length;
703 	bp->bio_pflags |= GV_BIO_GROW;
704 	if (data == NULL)
705 		data = g_malloc(length, M_WAITOK);
706 	bp->bio_pflags |= GV_BIO_MALLOC;
707 	bp->bio_data = data;
708 
709 	gv_post_bio(sc, bp);
710 	//gv_plex_start(p, bp);
711 	return (0);
712 }
713 
714 /*
715  * Finish handling of a bio to a growing plex.
716  */
717 void
718 gv_grow_complete(struct gv_plex *p, struct bio *bp)
719 {
720 	struct gv_softc *sc;
721 	struct gv_sd *s;
722 	struct gv_volume *v;
723 	off_t origsize, offset;
724 	int sdcount, err;
725 
726 	v = p->vol_sc;
727 	KASSERT(v != NULL, ("gv_grow_complete: NULL v"));
728 	sc = v->vinumconf;
729 	KASSERT(sc != NULL, ("gv_grow_complete: NULL sc"));
730 	err = 0;
731 
732 	/* If it was a read, write it. */
733 	if (bp->bio_cmd == BIO_READ) {
734 		p->synced += bp->bio_length;
735 		err = gv_grow_request(p, bp->bio_offset, bp->bio_length,
736 		    BIO_WRITE, bp->bio_data);
737 	/* If it was a write, read next. */
738 	} else if (bp->bio_cmd == BIO_WRITE) {
739 		if (bp->bio_pflags & GV_BIO_MALLOC)
740 			g_free(bp->bio_data);
741 
742 		/* Find the real size of the plex. */
743 		sdcount = gv_sdcount(p, 1);
744 		s = LIST_FIRST(&p->subdisks);
745 		KASSERT(s != NULL, ("NULL s"));
746 		origsize = (s->size * (sdcount - 1));
747 		if (bp->bio_offset + bp->bio_length >= origsize) {
748 			G_VINUM_DEBUG(1, "growing of %s completed", p->name);
749 			p->flags &= ~GV_PLEX_GROWING;
750 			LIST_FOREACH(s, &p->subdisks, in_plex) {
751 				s->flags &= ~GV_SD_GROW;
752 				gv_set_sd_state(s, GV_SD_UP, 0);
753 			}
754 			p->size = gv_plex_size(p);
755 			gv_update_vol_size(v, gv_vol_size(v));
756 			gv_set_plex_state(p, GV_PLEX_UP, 0);
757 			g_topology_lock();
758 			gv_access(v->provider, -1, -1, 0);
759 			g_topology_unlock();
760 			p->synced = 0;
761 			gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0);
762 			/* Issue delayed requests. */
763 			gv_plex_flush(p);
764 		} else {
765 			offset = bp->bio_offset + bp->bio_length;
766 			err = gv_grow_request(p, offset,
767 			   MIN(bp->bio_length, origsize - offset),
768 			   BIO_READ, NULL);
769 		}
770 	}
771 	g_destroy_bio(bp);
772 
773 	if (err) {
774 		p->flags &= ~GV_PLEX_GROWING;
775 		G_VINUM_DEBUG(0, "error growing plex: error code %d", err);
776 	}
777 }
778 
779 
780 /*
781  * Create an initialization BIO and send it off to the consumer. Assume that
782  * we're given initialization data as parameter.
783  */
784 void
785 gv_init_request(struct gv_sd *s, off_t start, caddr_t data, off_t length)
786 {
787 	struct gv_drive *d;
788 	struct g_consumer *cp;
789 	struct bio *bp, *cbp;
790 
791 	KASSERT(s != NULL, ("gv_init_request: NULL s"));
792 	d = s->drive_sc;
793 	KASSERT(d != NULL, ("gv_init_request: NULL d"));
794 	cp = d->consumer;
795 	KASSERT(cp != NULL, ("gv_init_request: NULL cp"));
796 
797 	bp = g_new_bio();
798 	if (bp == NULL) {
799 		G_VINUM_DEBUG(0, "subdisk '%s' init: write failed at offset %jd"
800 		    " (drive offset %jd); out of memory", s->name,
801 		    (intmax_t)s->initialized, (intmax_t)start);
802 		return; /* XXX: Error codes. */
803 	}
804 	bp->bio_cmd = BIO_WRITE;
805 	bp->bio_data = data;
806 	bp->bio_done = NULL;
807 	bp->bio_error = 0;
808 	bp->bio_length = length;
809 	bp->bio_pflags |= GV_BIO_INIT;
810 	bp->bio_offset = start;
811 	bp->bio_caller1 = s;
812 
813 	/* Then ofcourse, we have to clone it. */
814 	cbp = g_clone_bio(bp);
815 	if (cbp == NULL) {
816 		G_VINUM_DEBUG(0, "subdisk '%s' init: write failed at offset %jd"
817 		    " (drive offset %jd); out of memory", s->name,
818 		    (intmax_t)s->initialized, (intmax_t)start);
819 		return; /* XXX: Error codes. */
820 	}
821 	cbp->bio_done = gv_done;
822 	cbp->bio_caller1 = s;
823 	d->active++;
824 	/* Send it off to the consumer. */
825 	g_io_request(cbp, cp);
826 }
827 
828 /*
829  * Handle a finished initialization BIO.
830  */
831 void
832 gv_init_complete(struct gv_plex *p, struct bio *bp)
833 {
834 	struct gv_softc *sc;
835 	struct gv_drive *d;
836 	struct g_consumer *cp;
837 	struct gv_sd *s;
838 	off_t start, length;
839 	caddr_t data;
840 	int error;
841 
842 	s = bp->bio_caller1;
843 	start = bp->bio_offset;
844 	length = bp->bio_length;
845 	error = bp->bio_error;
846 	data = bp->bio_data;
847 
848 	KASSERT(s != NULL, ("gv_init_complete: NULL s"));
849 	d = s->drive_sc;
850 	KASSERT(d != NULL, ("gv_init_complete: NULL d"));
851 	cp = d->consumer;
852 	KASSERT(cp != NULL, ("gv_init_complete: NULL cp"));
853 	sc = p->vinumconf;
854 	KASSERT(sc != NULL, ("gv_init_complete: NULL sc"));
855 
856 	g_destroy_bio(bp);
857 
858 	/*
859 	 * First we need to find out if it was okay, and abort if it's not.
860 	 * Then we need to free previous buffers, find out the correct subdisk,
861 	 * as well as getting the correct starting point and length of the BIO.
862 	 */
863 	if (start >= s->drive_offset + s->size) {
864 		/* Free the data we initialized. */
865 		if (data != NULL)
866 			g_free(data);
867 		g_topology_assert_not();
868 		g_topology_lock();
869 		g_access(cp, 0, -1, 0);
870 		g_topology_unlock();
871 		if (error) {
872 			gv_set_sd_state(s, GV_SD_STALE, GV_SETSTATE_FORCE |
873 			    GV_SETSTATE_CONFIG);
874 		} else {
875 			gv_set_sd_state(s, GV_SD_UP, GV_SETSTATE_CONFIG);
876 			s->initialized = 0;
877 			gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0);
878 			G_VINUM_DEBUG(1, "subdisk '%s' init: finished "
879 			    "successfully", s->name);
880 		}
881 		return;
882 	}
883 	s->initialized += length;
884 	start += length;
885 	gv_init_request(s, start, data, length);
886 }
887 
888 /*
889  * Create a new bio struct for the next parity rebuild. Used both by internal
890  * rebuild of degraded plexes as well as user initiated rebuilds/checks.
891  */
892 void
893 gv_parity_request(struct gv_plex *p, int flags, off_t offset)
894 {
895 	struct gv_softc *sc;
896 	struct bio *bp;
897 
898 	KASSERT(p != NULL, ("gv_parity_request: NULL p"));
899 	sc = p->vinumconf;
900 	KASSERT(sc != NULL, ("gv_parity_request: NULL sc"));
901 
902 	bp = g_new_bio();
903 	if (bp == NULL) {
904 		G_VINUM_DEBUG(0, "rebuild of %s failed creating bio: "
905 		    "out of memory", p->name);
906 		return;
907 	}
908 
909 	bp->bio_cmd = BIO_WRITE;
910 	bp->bio_done = NULL;
911 	bp->bio_error = 0;
912 	bp->bio_length = p->stripesize;
913 	bp->bio_caller1 = p;
914 
915 	/*
916 	 * Check if it's a rebuild of a degraded plex or a user request of
917 	 * parity rebuild.
918 	 */
919 	if (flags & GV_BIO_REBUILD)
920 		bp->bio_data = g_malloc(GV_DFLT_SYNCSIZE, M_WAITOK);
921 	else if (flags & GV_BIO_CHECK)
922 		bp->bio_data = g_malloc(p->stripesize, M_WAITOK | M_ZERO);
923 	else {
924 		G_VINUM_DEBUG(0, "invalid flags given in rebuild");
925 		return;
926 	}
927 
928 	bp->bio_pflags = flags;
929 	bp->bio_pflags |= GV_BIO_MALLOC;
930 
931 	/* We still have more parity to build. */
932 	bp->bio_offset = offset;
933 	gv_post_bio(sc, bp);
934 	//gv_plex_start(p, bp); /* Send it down to the plex. */
935 }
936 
937 /*
938  * Handle a finished parity write.
939  */
940 void
941 gv_parity_complete(struct gv_plex *p, struct bio *bp)
942 {
943 	struct gv_softc *sc;
944 	int error, flags;
945 
946 	error = bp->bio_error;
947 	flags = bp->bio_pflags;
948 	flags &= ~GV_BIO_MALLOC;
949 
950 	sc = p->vinumconf;
951 	KASSERT(sc != NULL, ("gv_parity_complete: NULL sc"));
952 
953 	/* Clean up what we allocated. */
954 	if (bp->bio_pflags & GV_BIO_MALLOC)
955 		g_free(bp->bio_data);
956 	g_destroy_bio(bp);
957 
958 	if (error == EAGAIN) {
959 		G_VINUM_DEBUG(0, "parity incorrect at offset 0x%jx",
960 		    (intmax_t)p->synced);
961 	}
962 
963 	/* Any error is fatal, except EAGAIN when we're rebuilding. */
964 	if (error && !(error == EAGAIN && (flags & GV_BIO_PARITY))) {
965 		/* Make sure we don't have the lock. */
966 		g_topology_assert_not();
967 		g_topology_lock();
968 		gv_access(p->vol_sc->provider, -1, -1, 0);
969 		g_topology_unlock();
970 		G_VINUM_DEBUG(0, "parity check on %s failed at 0x%jx "
971 		    "errno %d", p->name, (intmax_t)p->synced, error);
972 		return;
973 	} else {
974 		p->synced += p->stripesize;
975 	}
976 
977 	if (p->synced >= p->size) {
978 		/* Make sure we don't have the lock. */
979 		g_topology_assert_not();
980 		g_topology_lock();
981 		gv_access(p->vol_sc->provider, -1, -1, 0);
982 		g_topology_unlock();
983 		/* We're finished. */
984 		G_VINUM_DEBUG(1, "parity operation on %s finished", p->name);
985 		p->synced = 0;
986 		gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0);
987 		return;
988 	}
989 
990 	/* Send down next. It will determine if we need to itself. */
991 	gv_parity_request(p, flags, p->synced);
992 }
993 
994 /*
995  * Handle a finished plex rebuild bio.
996  */
997 void
998 gv_rebuild_complete(struct gv_plex *p, struct bio *bp)
999 {
1000 	struct gv_softc *sc;
1001 	struct gv_sd *s;
1002 	int error, flags;
1003 	off_t offset;
1004 
1005 	error = bp->bio_error;
1006 	flags = bp->bio_pflags;
1007 	offset = bp->bio_offset;
1008 	flags &= ~GV_BIO_MALLOC;
1009 	sc = p->vinumconf;
1010 	KASSERT(sc != NULL, ("gv_rebuild_complete: NULL sc"));
1011 
1012 	/* Clean up what we allocated. */
1013 	if (bp->bio_pflags & GV_BIO_MALLOC)
1014 		g_free(bp->bio_data);
1015 	g_destroy_bio(bp);
1016 
1017 	if (error) {
1018 		g_topology_assert_not();
1019 		g_topology_lock();
1020 		gv_access(p->vol_sc->provider, -1, -1, 0);
1021 		g_topology_unlock();
1022 
1023 		G_VINUM_DEBUG(0, "rebuild of %s failed at offset %jd errno: %d",
1024 		    p->name, (intmax_t)offset, error);
1025 		p->flags &= ~GV_PLEX_REBUILDING;
1026 		p->synced = 0;
1027 		gv_plex_flush(p); /* Flush out remaining rebuild BIOs. */
1028 		return;
1029 	}
1030 
1031 	offset += (p->stripesize * (gv_sdcount(p, 1) - 1));
1032 	if (offset >= p->size) {
1033 		/* We're finished. */
1034 		g_topology_assert_not();
1035 		g_topology_lock();
1036 		gv_access(p->vol_sc->provider, -1, -1, 0);
1037 		g_topology_unlock();
1038 
1039 		G_VINUM_DEBUG(1, "rebuild of %s finished", p->name);
1040 		gv_save_config(p->vinumconf);
1041 		p->flags &= ~GV_PLEX_REBUILDING;
1042 		p->synced = 0;
1043 		/* Try to up all subdisks. */
1044 		LIST_FOREACH(s, &p->subdisks, in_plex)
1045 			gv_update_sd_state(s);
1046 		gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0);
1047 		gv_plex_flush(p); /* Flush out remaining rebuild BIOs. */
1048 		return;
1049 	}
1050 
1051 	/* Send down next. It will determine if we need to itself. */
1052 	gv_parity_request(p, flags, offset);
1053 }
1054