xref: /freebsd/sys/geom/vinum/geom_vinum_plex.c (revision 193d9e768ba63fcfb187cfd17f461f7d41345048)
1 /*-
2  * Copyright (c) 2004, 2007 Lukas Ertl
3  * Copyright (c) 2007, 2009 Ulf Lilleengen
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/cdefs.h>
29 __FBSDID("$FreeBSD$");
30 
31 #include <sys/param.h>
32 #include <sys/bio.h>
33 #include <sys/lock.h>
34 #include <sys/malloc.h>
35 #include <sys/systm.h>
36 
37 #include <geom/geom.h>
38 #include <geom/vinum/geom_vinum_var.h>
39 #include <geom/vinum/geom_vinum_raid5.h>
40 #include <geom/vinum/geom_vinum.h>
41 
42 static int	gv_check_parity(struct gv_plex *, struct bio *,
43 		    struct gv_raid5_packet *);
44 static int	gv_normal_parity(struct gv_plex *, struct bio *,
45 		    struct gv_raid5_packet *);
46 static void	gv_plex_flush(struct gv_plex *);
47 static int	gv_plex_offset(struct gv_plex *, off_t, off_t, off_t *, off_t *,
48 		    int *, int);
49 static int 	gv_plex_normal_request(struct gv_plex *, struct bio *, off_t,
50 		    off_t,  caddr_t);
51 static void	gv_post_bio(struct gv_softc *, struct bio *);
52 
53 void
54 gv_plex_start(struct gv_plex *p, struct bio *bp)
55 {
56 	struct bio *cbp;
57 	struct gv_sd *s;
58 	struct gv_raid5_packet *wp;
59 	caddr_t addr;
60 	off_t bcount, boff, len;
61 
62 	bcount = bp->bio_length;
63 	addr = bp->bio_data;
64 	boff = bp->bio_offset;
65 
66 	/* Walk over the whole length of the request, we might split it up. */
67 	while (bcount > 0) {
68 		wp = NULL;
69 
70  		/*
71 		 * RAID5 plexes need special treatment, as a single request
72 		 * might involve several read/write sub-requests.
73  		 */
74 		if (p->org == GV_PLEX_RAID5) {
75 			wp = gv_raid5_start(p, bp, addr, boff, bcount);
76  			if (wp == NULL)
77  				return;
78 
79 			len = wp->length;
80 
81 			if (TAILQ_EMPTY(&wp->bits))
82 				g_free(wp);
83 			else if (wp->lockbase != -1)
84 				TAILQ_INSERT_TAIL(&p->packets, wp, list);
85 
86 		/*
87 		 * Requests to concatenated and striped plexes go straight
88 		 * through.
89 		 */
90 		} else {
91 			len = gv_plex_normal_request(p, bp, boff, bcount, addr);
92 		}
93 		if (len < 0)
94 			return;
95 
96 		bcount -= len;
97 		addr += len;
98 		boff += len;
99 	}
100 
101 	/*
102 	 * Fire off all sub-requests.  We get the correct consumer (== drive)
103 	 * to send each request to via the subdisk that was stored in
104 	 * cbp->bio_caller1.
105 	 */
106 	cbp = bioq_takefirst(p->bqueue);
107 	while (cbp != NULL) {
108 		/*
109 		 * RAID5 sub-requests need to come in correct order, otherwise
110 		 * we trip over the parity, as it might be overwritten by
111 		 * another sub-request.  We abuse cbp->bio_caller2 to mark
112 		 * potential overlap situations.
113 		 */
114 		if (cbp->bio_caller2 != NULL && gv_stripe_active(p, cbp)) {
115 			/* Park the bio on the waiting queue. */
116 			cbp->bio_pflags |= GV_BIO_ONHOLD;
117 			bioq_disksort(p->wqueue, cbp);
118 		} else {
119 			s = cbp->bio_caller1;
120 			g_io_request(cbp, s->drive_sc->consumer);
121 		}
122 		cbp = bioq_takefirst(p->bqueue);
123 	}
124 }
125 
126 static int
127 gv_plex_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off,
128     off_t *real_len, int *sdno, int growing)
129 {
130 	struct gv_sd *s;
131 	int i, sdcount;
132 	off_t len_left, stripeend, stripeno, stripestart;
133 
134 	switch (p->org) {
135 	case GV_PLEX_CONCAT:
136 		/*
137 		 * Find the subdisk where this request starts.  The subdisks in
138 		 * this list must be ordered by plex_offset.
139 		 */
140 		i = 0;
141 		LIST_FOREACH(s, &p->subdisks, in_plex) {
142 			if (s->plex_offset <= boff &&
143 			    s->plex_offset + s->size > boff) {
144 				*sdno = i;
145 				break;
146 			}
147 			i++;
148 		}
149 		if (s == NULL || s->drive_sc == NULL)
150 			return (GV_ERR_NOTFOUND);
151 
152 		/* Calculate corresponding offsets on disk. */
153 		*real_off = boff - s->plex_offset;
154 		len_left = s->size - (*real_off);
155 		KASSERT(len_left >= 0, ("gv_plex_offset: len_left < 0"));
156 		*real_len = (bcount > len_left) ? len_left : bcount;
157 		break;
158 
159 	case GV_PLEX_STRIPED:
160 		/* The number of the stripe where the request starts. */
161 		stripeno = boff / p->stripesize;
162 		KASSERT(stripeno >= 0, ("gv_plex_offset: stripeno < 0"));
163 
164 		/* Take growing subdisks into account when calculating. */
165 		sdcount = gv_sdcount(p, (boff >= p->synced));
166 
167 		if (!(boff + bcount <= p->synced) &&
168 		    (p->flags & GV_PLEX_GROWING) &&
169 		    !growing)
170 			return (GV_ERR_ISBUSY);
171 		*sdno = stripeno % sdcount;
172 
173 		KASSERT(sdno >= 0, ("gv_plex_offset: sdno < 0"));
174 		stripestart = (stripeno / sdcount) *
175 		    p->stripesize;
176 		KASSERT(stripestart >= 0, ("gv_plex_offset: stripestart < 0"));
177 		stripeend = stripestart + p->stripesize;
178 		*real_off = boff - (stripeno * p->stripesize) +
179 		    stripestart;
180 		len_left = stripeend - *real_off;
181 		KASSERT(len_left >= 0, ("gv_plex_offset: len_left < 0"));
182 
183 		*real_len = (bcount <= len_left) ? bcount : len_left;
184 		break;
185 
186 	default:
187 		return (GV_ERR_PLEXORG);
188 	}
189 	return (0);
190 }
191 
192 /*
193  * Prepare a normal plex request.
194  */
195 static int
196 gv_plex_normal_request(struct gv_plex *p, struct bio *bp, off_t boff,
197     off_t bcount,  caddr_t addr)
198 {
199 	struct gv_sd *s;
200 	struct bio *cbp;
201 	off_t real_len, real_off;
202 	int i, err, sdno;
203 
204 	s = NULL;
205 	sdno = -1;
206 	real_len = real_off = 0;
207 
208 	err = ENXIO;
209 
210 	if (p == NULL || LIST_EMPTY(&p->subdisks))
211 		goto bad;
212 
213 	err = gv_plex_offset(p, boff, bcount, &real_off,
214 	    &real_len, &sdno, (bp->bio_pflags & GV_BIO_GROW));
215 	/* If the request was blocked, put it into wait. */
216 	if (err == GV_ERR_ISBUSY) {
217 		bioq_disksort(p->rqueue, bp);
218 		return (-1); /* "Fail", and delay request. */
219 	}
220 	if (err) {
221 		err = ENXIO;
222 		goto bad;
223 	}
224 	err = ENXIO;
225 
226 	/* Find the right subdisk. */
227 	i = 0;
228 	LIST_FOREACH(s, &p->subdisks, in_plex) {
229 		if (i == sdno)
230 			break;
231 		i++;
232 	}
233 
234 	/* Subdisk not found. */
235 	if (s == NULL || s->drive_sc == NULL)
236 		goto bad;
237 
238 	/* Now check if we can handle the request on this subdisk. */
239 	switch (s->state) {
240 	case GV_SD_UP:
241 		/* If the subdisk is up, just continue. */
242 		break;
243 	case GV_SD_DOWN:
244 		if (bp->bio_pflags & GV_BIO_INTERNAL)
245 			G_VINUM_DEBUG(0, "subdisk must be in the stale state in"
246 			    " order to perform administrative requests");
247 		goto bad;
248 	case GV_SD_STALE:
249 		if (!(bp->bio_pflags & GV_BIO_SYNCREQ)) {
250 			G_VINUM_DEBUG(0, "subdisk stale, unable to perform "
251 			    "regular requests");
252 			goto bad;
253 		}
254 
255 		G_VINUM_DEBUG(1, "sd %s is initializing", s->name);
256 		gv_set_sd_state(s, GV_SD_INITIALIZING, GV_SETSTATE_FORCE);
257 		break;
258 	case GV_SD_INITIALIZING:
259 		if (bp->bio_cmd == BIO_READ)
260 			goto bad;
261 		break;
262 	default:
263 		/* All other subdisk states mean it's not accessible. */
264 		goto bad;
265 	}
266 
267 	/* Clone the bio and adjust the offsets and sizes. */
268 	cbp = g_clone_bio(bp);
269 	if (cbp == NULL) {
270 		err = ENOMEM;
271 		goto bad;
272 	}
273 	cbp->bio_offset = real_off + s->drive_offset;
274 	cbp->bio_length = real_len;
275 	cbp->bio_data = addr;
276 	cbp->bio_done = gv_done;
277 	cbp->bio_caller1 = s;
278 
279 	/* Store the sub-requests now and let others issue them. */
280 	bioq_insert_tail(p->bqueue, cbp);
281 	return (real_len);
282 bad:
283 	G_VINUM_LOGREQ(0, bp, "plex request failed.");
284 	/* Building the sub-request failed. If internal BIO, do not deliver. */
285 	if (bp->bio_pflags & GV_BIO_INTERNAL) {
286 		if (bp->bio_pflags & GV_BIO_MALLOC)
287 			g_free(bp->bio_data);
288 		g_destroy_bio(bp);
289 		p->flags &= ~(GV_PLEX_SYNCING | GV_PLEX_REBUILDING |
290 		    GV_PLEX_GROWING);
291 		return (-1);
292 	}
293 	g_io_deliver(bp, err);
294 	return (-1);
295 }
296 
297 /*
298  * Handle a completed request to a striped or concatenated plex.
299  */
300 void
301 gv_plex_normal_done(struct gv_plex *p, struct bio *bp)
302 {
303 	struct bio *pbp;
304 
305 	pbp = bp->bio_parent;
306 	if (pbp->bio_error == 0)
307 		pbp->bio_error = bp->bio_error;
308 	g_destroy_bio(bp);
309 	pbp->bio_inbed++;
310 	if (pbp->bio_children == pbp->bio_inbed) {
311 		/* Just set it to length since multiple plexes will
312 		 * screw things up. */
313 		pbp->bio_completed = pbp->bio_length;
314 		if (pbp->bio_pflags & GV_BIO_SYNCREQ)
315 			gv_sync_complete(p, pbp);
316 		else if (pbp->bio_pflags & GV_BIO_GROW)
317 			gv_grow_complete(p, pbp);
318 		else
319 			g_io_deliver(pbp, pbp->bio_error);
320 	}
321 }
322 
323 /*
324  * Handle a completed request to a RAID-5 plex.
325  */
326 void
327 gv_plex_raid5_done(struct gv_plex *p, struct bio *bp)
328 {
329 	struct gv_softc *sc;
330 	struct bio *cbp, *pbp;
331 	struct gv_bioq *bq, *bq2;
332 	struct gv_raid5_packet *wp;
333 	off_t completed;
334 	int i;
335 
336 	completed = 0;
337 	sc = p->vinumconf;
338 	wp = bp->bio_caller2;
339 
340 	switch (bp->bio_parent->bio_cmd) {
341 	case BIO_READ:
342 		if (wp == NULL) {
343 			completed = bp->bio_completed;
344 			break;
345 		}
346 
347 		TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
348 			if (bq->bp != bp)
349 				continue;
350 			TAILQ_REMOVE(&wp->bits, bq, queue);
351 			g_free(bq);
352 			for (i = 0; i < wp->length; i++)
353 				wp->data[i] ^= bp->bio_data[i];
354 			break;
355 		}
356 		if (TAILQ_EMPTY(&wp->bits)) {
357 			completed = wp->length;
358 			if (wp->lockbase != -1) {
359 				TAILQ_REMOVE(&p->packets, wp, list);
360 				/* Bring the waiting bios back into the game. */
361 				pbp = bioq_takefirst(p->wqueue);
362 				while (pbp != NULL) {
363 					gv_post_bio(sc, pbp);
364 					pbp = bioq_takefirst(p->wqueue);
365 				}
366 			}
367 			g_free(wp);
368 		}
369 
370 		break;
371 
372  	case BIO_WRITE:
373 		/* XXX can this ever happen? */
374 		if (wp == NULL) {
375 			completed = bp->bio_completed;
376 			break;
377 		}
378 
379 		/* Check if we need to handle parity data. */
380 		TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
381 			if (bq->bp != bp)
382 				continue;
383 			TAILQ_REMOVE(&wp->bits, bq, queue);
384 			g_free(bq);
385 			cbp = wp->parity;
386 			if (cbp != NULL) {
387 				for (i = 0; i < wp->length; i++)
388 					cbp->bio_data[i] ^= bp->bio_data[i];
389 			}
390 			break;
391 		}
392 
393 		/* Handle parity data. */
394 		if (TAILQ_EMPTY(&wp->bits)) {
395 			if (bp->bio_parent->bio_pflags & GV_BIO_CHECK)
396 				i = gv_check_parity(p, bp, wp);
397 			else
398 				i = gv_normal_parity(p, bp, wp);
399 
400 			/* All of our sub-requests have finished. */
401 			if (i) {
402 				completed = wp->length;
403 				TAILQ_REMOVE(&p->packets, wp, list);
404 				/* Bring the waiting bios back into the game. */
405 				pbp = bioq_takefirst(p->wqueue);
406 				while (pbp != NULL) {
407 					gv_post_bio(sc, pbp);
408 					pbp = bioq_takefirst(p->wqueue);
409 				}
410 				g_free(wp);
411 			}
412 		}
413 
414 		break;
415 	}
416 
417 	pbp = bp->bio_parent;
418 	if (pbp->bio_error == 0)
419 		pbp->bio_error = bp->bio_error;
420 	pbp->bio_completed += completed;
421 
422 	/* When the original request is finished, we deliver it. */
423 	pbp->bio_inbed++;
424 	if (pbp->bio_inbed == pbp->bio_children) {
425 		/* Hand it over for checking or delivery. */
426 		if (pbp->bio_cmd == BIO_WRITE &&
427 		    (pbp->bio_pflags & GV_BIO_CHECK)) {
428 			gv_parity_complete(p, pbp);
429 		} else if (pbp->bio_cmd == BIO_WRITE &&
430 		    (pbp->bio_pflags & GV_BIO_REBUILD)) {
431 			gv_rebuild_complete(p, pbp);
432 		} else if (pbp->bio_pflags & GV_BIO_INIT) {
433 			gv_init_complete(p, pbp);
434 		} else if (pbp->bio_pflags & GV_BIO_SYNCREQ) {
435 			gv_sync_complete(p, pbp);
436 		} else if (pbp->bio_pflags & GV_BIO_GROW) {
437 			gv_grow_complete(p, pbp);
438 		} else {
439 			g_io_deliver(pbp, pbp->bio_error);
440 		}
441 	}
442 
443 	/* Clean up what we allocated. */
444 	if (bp->bio_cflags & GV_BIO_MALLOC)
445 		g_free(bp->bio_data);
446 	g_destroy_bio(bp);
447 }
448 
449 static int
450 gv_check_parity(struct gv_plex *p, struct bio *bp, struct gv_raid5_packet *wp)
451 {
452 	struct bio *pbp;
453 	struct gv_sd *s;
454 	int err, finished, i;
455 
456 	err = 0;
457 	finished = 1;
458 
459 	if (wp->waiting != NULL) {
460 		pbp = wp->waiting;
461 		wp->waiting = NULL;
462 		s = pbp->bio_caller1;
463 		g_io_request(pbp, s->drive_sc->consumer);
464 		finished = 0;
465 
466 	} else if (wp->parity != NULL) {
467 		pbp = wp->parity;
468 		wp->parity = NULL;
469 
470 		/* Check if the parity is correct. */
471 		for (i = 0; i < wp->length; i++) {
472 			if (bp->bio_data[i] != pbp->bio_data[i]) {
473 				err = 1;
474 				break;
475 			}
476 		}
477 
478 		/* The parity is not correct... */
479 		if (err) {
480 			bp->bio_parent->bio_error = EAGAIN;
481 
482 			/* ... but we rebuild it. */
483 			if (bp->bio_parent->bio_pflags & GV_BIO_PARITY) {
484 				s = pbp->bio_caller1;
485 				g_io_request(pbp, s->drive_sc->consumer);
486 				finished = 0;
487 			}
488 		}
489 
490 		/*
491 		 * Clean up the BIO we would have used for rebuilding the
492 		 * parity.
493 		 */
494 		if (finished) {
495 			bp->bio_parent->bio_inbed++;
496 			g_destroy_bio(pbp);
497 		}
498 
499 	}
500 
501 	return (finished);
502 }
503 
504 static int
505 gv_normal_parity(struct gv_plex *p, struct bio *bp, struct gv_raid5_packet *wp)
506 {
507 	struct bio *cbp, *pbp;
508 	struct gv_sd *s;
509 	int finished, i;
510 
511 	finished = 1;
512 
513 	if (wp->waiting != NULL) {
514 		pbp = wp->waiting;
515 		wp->waiting = NULL;
516 		cbp = wp->parity;
517 		for (i = 0; i < wp->length; i++)
518 			cbp->bio_data[i] ^= pbp->bio_data[i];
519 		s = pbp->bio_caller1;
520 		g_io_request(pbp, s->drive_sc->consumer);
521 		finished = 0;
522 
523 	} else if (wp->parity != NULL) {
524 		cbp = wp->parity;
525 		wp->parity = NULL;
526 		s = cbp->bio_caller1;
527 		g_io_request(cbp, s->drive_sc->consumer);
528 		finished = 0;
529 	}
530 
531 	return (finished);
532 }
533 
534 /* Flush the queue with delayed requests. */
535 static void
536 gv_plex_flush(struct gv_plex *p)
537 {
538 	struct gv_softc *sc;
539 	struct bio *bp;
540 
541 	sc = p->vinumconf;
542 	bp = bioq_takefirst(p->rqueue);
543 	while (bp != NULL) {
544 		gv_plex_start(p, bp);
545 		bp = bioq_takefirst(p->rqueue);
546 	}
547 }
548 
549 static void
550 gv_post_bio(struct gv_softc *sc, struct bio *bp)
551 {
552 
553 	KASSERT(sc != NULL, ("NULL sc"));
554 	KASSERT(bp != NULL, ("NULL bp"));
555 	mtx_lock(&sc->bqueue_mtx);
556 	bioq_disksort(sc->bqueue_down, bp);
557 	wakeup(sc);
558 	mtx_unlock(&sc->bqueue_mtx);
559 }
560 
561 int
562 gv_sync_request(struct gv_plex *from, struct gv_plex *to, off_t offset,
563     off_t length, int type, caddr_t data)
564 {
565 	struct gv_softc *sc;
566 	struct bio *bp;
567 
568 	KASSERT(from != NULL, ("NULL from"));
569 	KASSERT(to != NULL, ("NULL to"));
570 	sc = from->vinumconf;
571 	KASSERT(sc != NULL, ("NULL sc"));
572 
573 	bp = g_new_bio();
574 	if (bp == NULL) {
575 		G_VINUM_DEBUG(0, "sync from '%s' failed at offset "
576 		    " %jd; out of memory", from->name, offset);
577 		return (ENOMEM);
578 	}
579 	bp->bio_length = length;
580 	bp->bio_done = gv_done;
581 	bp->bio_pflags |= GV_BIO_SYNCREQ;
582 	bp->bio_offset = offset;
583 	bp->bio_caller1 = from;
584 	bp->bio_caller2 = to;
585 	bp->bio_cmd = type;
586 	if (data == NULL)
587 		data = g_malloc(length, M_WAITOK);
588 	bp->bio_pflags |= GV_BIO_MALLOC; /* Free on the next run. */
589 	bp->bio_data = data;
590 
591 	/* Send down next. */
592 	gv_post_bio(sc, bp);
593 	//gv_plex_start(from, bp);
594 	return (0);
595 }
596 
597 /*
598  * Handle a finished plex sync bio.
599  */
600 int
601 gv_sync_complete(struct gv_plex *to, struct bio *bp)
602 {
603 	struct gv_plex *from, *p;
604 	struct gv_sd *s;
605 	struct gv_volume *v;
606 	struct gv_softc *sc;
607 	off_t offset;
608 	int err;
609 
610 	g_topology_assert_not();
611 
612 	err = 0;
613 	KASSERT(to != NULL, ("NULL to"));
614 	KASSERT(bp != NULL, ("NULL bp"));
615 	from = bp->bio_caller2;
616 	KASSERT(from != NULL, ("NULL from"));
617 	v = to->vol_sc;
618 	KASSERT(v != NULL, ("NULL v"));
619 	sc = v->vinumconf;
620 	KASSERT(sc != NULL, ("NULL sc"));
621 
622 	/* If it was a read, write it. */
623 	if (bp->bio_cmd == BIO_READ) {
624 		err = gv_sync_request(from, to, bp->bio_offset, bp->bio_length,
625 	    	    BIO_WRITE, bp->bio_data);
626 	/* If it was a write, read the next one. */
627 	} else if (bp->bio_cmd == BIO_WRITE) {
628 		if (bp->bio_pflags & GV_BIO_MALLOC)
629 			g_free(bp->bio_data);
630 		to->synced += bp->bio_length;
631 		/* If we're finished, clean up. */
632 		if (bp->bio_offset + bp->bio_length >= from->size) {
633 			G_VINUM_DEBUG(1, "syncing of %s from %s completed",
634 			    to->name, from->name);
635 			/* Update our state. */
636 			LIST_FOREACH(s, &to->subdisks, in_plex)
637 				gv_set_sd_state(s, GV_SD_UP, 0);
638 			gv_update_plex_state(to);
639 			to->flags &= ~GV_PLEX_SYNCING;
640 			to->synced = 0;
641 			gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0);
642 		} else {
643 			offset = bp->bio_offset + bp->bio_length;
644 			err = gv_sync_request(from, to, offset,
645 			    MIN(bp->bio_length, from->size - offset),
646 			    BIO_READ, NULL);
647 		}
648 	}
649 	g_destroy_bio(bp);
650 	/* Clean up if there was an error. */
651 	if (err) {
652 		to->flags &= ~GV_PLEX_SYNCING;
653 		G_VINUM_DEBUG(0, "error syncing plexes: error code %d", err);
654 	}
655 
656 	/* Check if all plexes are synced, and lower refcounts. */
657 	g_topology_lock();
658 	LIST_FOREACH(p, &v->plexes, in_volume) {
659 		if (p->flags & GV_PLEX_SYNCING) {
660 			g_topology_unlock();
661 			return (-1);
662 		}
663 	}
664 	/* If we came here, all plexes are synced, and we're free. */
665 	gv_access(v->provider, -1, -1, 0);
666 	g_topology_unlock();
667 	G_VINUM_DEBUG(1, "plex sync completed");
668 	gv_volume_flush(v);
669 	return (0);
670 }
671 
672 /*
673  * Create a new bio struct for the next grow request.
674  */
675 int
676 gv_grow_request(struct gv_plex *p, off_t offset, off_t length, int type,
677     caddr_t data)
678 {
679 	struct gv_softc *sc;
680 	struct bio *bp;
681 
682 	KASSERT(p != NULL, ("gv_grow_request: NULL p"));
683 	sc = p->vinumconf;
684 	KASSERT(sc != NULL, ("gv_grow_request: NULL sc"));
685 
686 	bp = g_new_bio();
687 	if (bp == NULL) {
688 		G_VINUM_DEBUG(0, "grow of %s failed creating bio: "
689 		    "out of memory", p->name);
690 		return (ENOMEM);
691 	}
692 
693 	bp->bio_cmd = type;
694 	bp->bio_done = gv_done;
695 	bp->bio_error = 0;
696 	bp->bio_caller1 = p;
697 	bp->bio_offset = offset;
698 	bp->bio_length = length;
699 	bp->bio_pflags |= GV_BIO_GROW;
700 	if (data == NULL)
701 		data = g_malloc(length, M_WAITOK);
702 	bp->bio_pflags |= GV_BIO_MALLOC;
703 	bp->bio_data = data;
704 
705 	gv_post_bio(sc, bp);
706 	//gv_plex_start(p, bp);
707 	return (0);
708 }
709 
710 /*
711  * Finish handling of a bio to a growing plex.
712  */
713 void
714 gv_grow_complete(struct gv_plex *p, struct bio *bp)
715 {
716 	struct gv_softc *sc;
717 	struct gv_sd *s;
718 	struct gv_volume *v;
719 	off_t origsize, offset;
720 	int sdcount, err;
721 
722 	v = p->vol_sc;
723 	KASSERT(v != NULL, ("gv_grow_complete: NULL v"));
724 	sc = v->vinumconf;
725 	KASSERT(sc != NULL, ("gv_grow_complete: NULL sc"));
726 	err = 0;
727 
728 	/* If it was a read, write it. */
729 	if (bp->bio_cmd == BIO_READ) {
730 		p->synced += bp->bio_length;
731 		err = gv_grow_request(p, bp->bio_offset, bp->bio_length,
732 		    BIO_WRITE, bp->bio_data);
733 	/* If it was a write, read next. */
734 	} else if (bp->bio_cmd == BIO_WRITE) {
735 		if (bp->bio_pflags & GV_BIO_MALLOC)
736 			g_free(bp->bio_data);
737 
738 		/* Find the real size of the plex. */
739 		sdcount = gv_sdcount(p, 1);
740 		s = LIST_FIRST(&p->subdisks);
741 		KASSERT(s != NULL, ("NULL s"));
742 		origsize = (s->size * (sdcount - 1));
743 		if (bp->bio_offset + bp->bio_length >= origsize) {
744 			G_VINUM_DEBUG(1, "growing of %s completed", p->name);
745 			p->flags &= ~GV_PLEX_GROWING;
746 			LIST_FOREACH(s, &p->subdisks, in_plex) {
747 				s->flags &= ~GV_SD_GROW;
748 				gv_set_sd_state(s, GV_SD_UP, 0);
749 			}
750 			p->size = gv_plex_size(p);
751 			gv_update_vol_size(v, gv_vol_size(v));
752 			gv_set_plex_state(p, GV_PLEX_UP, 0);
753 			g_topology_lock();
754 			gv_access(v->provider, -1, -1, 0);
755 			g_topology_unlock();
756 			p->synced = 0;
757 			gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0);
758 			/* Issue delayed requests. */
759 			gv_plex_flush(p);
760 		} else {
761 			offset = bp->bio_offset + bp->bio_length;
762 			err = gv_grow_request(p, offset,
763 			   MIN(bp->bio_length, origsize - offset),
764 			   BIO_READ, NULL);
765 		}
766 	}
767 	g_destroy_bio(bp);
768 
769 	if (err) {
770 		p->flags &= ~GV_PLEX_GROWING;
771 		G_VINUM_DEBUG(0, "error growing plex: error code %d", err);
772 	}
773 }
774 
775 
776 /*
777  * Create an initialization BIO and send it off to the consumer. Assume that
778  * we're given initialization data as parameter.
779  */
780 void
781 gv_init_request(struct gv_sd *s, off_t start, caddr_t data, off_t length)
782 {
783 	struct gv_drive *d;
784 	struct g_consumer *cp;
785 	struct bio *bp, *cbp;
786 
787 	KASSERT(s != NULL, ("gv_init_request: NULL s"));
788 	d = s->drive_sc;
789 	KASSERT(d != NULL, ("gv_init_request: NULL d"));
790 	cp = d->consumer;
791 	KASSERT(cp != NULL, ("gv_init_request: NULL cp"));
792 
793 	bp = g_new_bio();
794 	if (bp == NULL) {
795 		G_VINUM_DEBUG(0, "subdisk '%s' init: write failed at offset %jd"
796 		    " (drive offset %jd); out of memory", s->name,
797 		    (intmax_t)s->initialized, (intmax_t)start);
798 		return; /* XXX: Error codes. */
799 	}
800 	bp->bio_cmd = BIO_WRITE;
801 	bp->bio_data = data;
802 	bp->bio_done = gv_done;
803 	bp->bio_error = 0;
804 	bp->bio_length = length;
805 	bp->bio_pflags |= GV_BIO_INIT;
806 	bp->bio_offset = start;
807 	bp->bio_caller1 = s;
808 
809 	/* Then ofcourse, we have to clone it. */
810 	cbp = g_clone_bio(bp);
811 	if (cbp == NULL) {
812 		G_VINUM_DEBUG(0, "subdisk '%s' init: write failed at offset %jd"
813 		    " (drive offset %jd); out of memory", s->name,
814 		    (intmax_t)s->initialized, (intmax_t)start);
815 		return; /* XXX: Error codes. */
816 	}
817 	cbp->bio_done = gv_done;
818 	cbp->bio_caller1 = s;
819 	/* Send it off to the consumer. */
820 	g_io_request(cbp, cp);
821 }
822 
823 /*
824  * Handle a finished initialization BIO.
825  */
826 void
827 gv_init_complete(struct gv_plex *p, struct bio *bp)
828 {
829 	struct gv_softc *sc;
830 	struct gv_drive *d;
831 	struct g_consumer *cp;
832 	struct gv_sd *s;
833 	off_t start, length;
834 	caddr_t data;
835 	int error;
836 
837 	s = bp->bio_caller1;
838 	start = bp->bio_offset;
839 	length = bp->bio_length;
840 	error = bp->bio_error;
841 	data = bp->bio_data;
842 
843 	KASSERT(s != NULL, ("gv_init_complete: NULL s"));
844 	d = s->drive_sc;
845 	KASSERT(d != NULL, ("gv_init_complete: NULL d"));
846 	cp = d->consumer;
847 	KASSERT(cp != NULL, ("gv_init_complete: NULL cp"));
848 	sc = p->vinumconf;
849 	KASSERT(sc != NULL, ("gv_init_complete: NULL sc"));
850 
851 	g_destroy_bio(bp);
852 
853 	/*
854 	 * First we need to find out if it was okay, and abort if it's not.
855 	 * Then we need to free previous buffers, find out the correct subdisk,
856 	 * as well as getting the correct starting point and length of the BIO.
857 	 */
858 	if (start >= s->drive_offset + s->size) {
859 		/* Free the data we initialized. */
860 		if (data != NULL)
861 			g_free(data);
862 		g_topology_assert_not();
863 		g_topology_lock();
864 		g_access(cp, 0, -1, 0);
865 		g_topology_unlock();
866 		if (error) {
867 			gv_set_sd_state(s, GV_SD_STALE, GV_SETSTATE_FORCE |
868 			    GV_SETSTATE_CONFIG);
869 		} else {
870 			gv_set_sd_state(s, GV_SD_UP, GV_SETSTATE_CONFIG);
871 			s->initialized = 0;
872 			gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0);
873 			G_VINUM_DEBUG(1, "subdisk '%s' init: finished "
874 			    "successfully", s->name);
875 		}
876 		return;
877 	}
878 	s->initialized += length;
879 	start += length;
880 	gv_init_request(s, start, data, length);
881 }
882 
883 /*
884  * Create a new bio struct for the next parity rebuild. Used both by internal
885  * rebuild of degraded plexes as well as user initiated rebuilds/checks.
886  */
887 void
888 gv_parity_request(struct gv_plex *p, int flags, off_t offset)
889 {
890 	struct gv_softc *sc;
891 	struct bio *bp;
892 
893 	KASSERT(p != NULL, ("gv_parity_request: NULL p"));
894 	sc = p->vinumconf;
895 	KASSERT(sc != NULL, ("gv_parity_request: NULL sc"));
896 
897 	bp = g_new_bio();
898 	if (bp == NULL) {
899 		G_VINUM_DEBUG(0, "rebuild of %s failed creating bio: "
900 		    "out of memory", p->name);
901 		return;
902 	}
903 
904 	bp->bio_cmd = BIO_WRITE;
905 	bp->bio_done = gv_done;
906 	bp->bio_error = 0;
907 	bp->bio_length = p->stripesize;
908 	bp->bio_caller1 = p;
909 
910 	/*
911 	 * Check if it's a rebuild of a degraded plex or a user request of
912 	 * parity rebuild.
913 	 */
914 	if (flags & GV_BIO_REBUILD)
915 		bp->bio_data = g_malloc(GV_DFLT_SYNCSIZE, M_WAITOK);
916 	else if (flags & GV_BIO_CHECK)
917 		bp->bio_data = g_malloc(p->stripesize, M_WAITOK | M_ZERO);
918 	else {
919 		G_VINUM_DEBUG(0, "invalid flags given in rebuild");
920 		return;
921 	}
922 
923 	bp->bio_pflags = flags;
924 	bp->bio_pflags |= GV_BIO_MALLOC;
925 
926 	/* We still have more parity to build. */
927 	bp->bio_offset = offset;
928 	gv_post_bio(sc, bp);
929 	//gv_plex_start(p, bp); /* Send it down to the plex. */
930 }
931 
932 /*
933  * Handle a finished parity write.
934  */
935 void
936 gv_parity_complete(struct gv_plex *p, struct bio *bp)
937 {
938 	struct gv_softc *sc;
939 	int error, flags;
940 
941 	error = bp->bio_error;
942 	flags = bp->bio_pflags;
943 	flags &= ~GV_BIO_MALLOC;
944 
945 	sc = p->vinumconf;
946 	KASSERT(sc != NULL, ("gv_parity_complete: NULL sc"));
947 
948 	/* Clean up what we allocated. */
949 	if (bp->bio_pflags & GV_BIO_MALLOC)
950 		g_free(bp->bio_data);
951 	g_destroy_bio(bp);
952 
953 	if (error == EAGAIN) {
954 		G_VINUM_DEBUG(0, "parity incorrect at offset 0x%jx",
955 		    (intmax_t)p->synced);
956 	}
957 
958 	/* Any error is fatal, except EAGAIN when we're rebuilding. */
959 	if (error && !(error == EAGAIN && (flags & GV_BIO_PARITY))) {
960 		/* Make sure we don't have the lock. */
961 		g_topology_assert_not();
962 		g_topology_lock();
963 		gv_access(p->vol_sc->provider, -1, -1, 0);
964 		g_topology_unlock();
965 		G_VINUM_DEBUG(0, "parity check on %s failed at 0x%jx "
966 		    "errno %d", p->name, (intmax_t)p->synced, error);
967 		return;
968 	} else {
969 		p->synced += p->stripesize;
970 	}
971 
972 	if (p->synced >= p->size) {
973 		/* Make sure we don't have the lock. */
974 		g_topology_assert_not();
975 		g_topology_lock();
976 		gv_access(p->vol_sc->provider, -1, -1, 0);
977 		g_topology_unlock();
978 		/* We're finished. */
979 		G_VINUM_DEBUG(1, "parity operation on %s finished", p->name);
980 		p->synced = 0;
981 		gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0);
982 		return;
983 	}
984 
985 	/* Send down next. It will determine if we need to itself. */
986 	gv_parity_request(p, flags, p->synced);
987 }
988 
989 /*
990  * Handle a finished plex rebuild bio.
991  */
992 void
993 gv_rebuild_complete(struct gv_plex *p, struct bio *bp)
994 {
995 	struct gv_softc *sc;
996 	struct gv_sd *s;
997 	int error, flags;
998 	off_t offset;
999 
1000 	error = bp->bio_error;
1001 	flags = bp->bio_pflags;
1002 	offset = bp->bio_offset;
1003 	flags &= ~GV_BIO_MALLOC;
1004 	sc = p->vinumconf;
1005 	KASSERT(sc != NULL, ("gv_rebuild_complete: NULL sc"));
1006 
1007 	/* Clean up what we allocated. */
1008 	if (bp->bio_pflags & GV_BIO_MALLOC)
1009 		g_free(bp->bio_data);
1010 	g_destroy_bio(bp);
1011 
1012 	if (error) {
1013 		g_topology_assert_not();
1014 		g_topology_lock();
1015 		gv_access(p->vol_sc->provider, -1, -1, 0);
1016 		g_topology_unlock();
1017 
1018 		G_VINUM_DEBUG(0, "rebuild of %s failed at offset %jd errno: %d",
1019 		    p->name, (intmax_t)offset, error);
1020 		p->flags &= ~GV_PLEX_REBUILDING;
1021 		p->synced = 0;
1022 		gv_plex_flush(p); /* Flush out remaining rebuild BIOs. */
1023 		return;
1024 	}
1025 
1026 	offset += (p->stripesize * (gv_sdcount(p, 1) - 1));
1027 	if (offset >= p->size) {
1028 		/* We're finished. */
1029 		g_topology_assert_not();
1030 		g_topology_lock();
1031 		gv_access(p->vol_sc->provider, -1, -1, 0);
1032 		g_topology_unlock();
1033 
1034 		G_VINUM_DEBUG(1, "rebuild of %s finished", p->name);
1035 		gv_save_config(p->vinumconf);
1036 		p->flags &= ~GV_PLEX_REBUILDING;
1037 		p->synced = 0;
1038 		/* Try to up all subdisks. */
1039 		LIST_FOREACH(s, &p->subdisks, in_plex)
1040 			gv_update_sd_state(s);
1041 		gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0);
1042 		gv_plex_flush(p); /* Flush out remaining rebuild BIOs. */
1043 		return;
1044 	}
1045 
1046 	/* Send down next. It will determine if we need to itself. */
1047 	gv_parity_request(p, flags, offset);
1048 }
1049