xref: /freebsd/sys/geom/vinum/geom_vinum_plex.c (revision 6829dae12bb055451fa467da4589c43bd03b1e64)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2004, 2007 Lukas Ertl
5  * Copyright (c) 2007, 2009 Ulf Lilleengen
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include <sys/param.h>
34 #include <sys/bio.h>
35 #include <sys/lock.h>
36 #include <sys/malloc.h>
37 #include <sys/systm.h>
38 
39 #include <geom/geom.h>
40 #include <geom/vinum/geom_vinum_var.h>
41 #include <geom/vinum/geom_vinum_raid5.h>
42 #include <geom/vinum/geom_vinum.h>
43 
44 static int	gv_check_parity(struct gv_plex *, struct bio *,
45 		    struct gv_raid5_packet *);
46 static int	gv_normal_parity(struct gv_plex *, struct bio *,
47 		    struct gv_raid5_packet *);
48 static void	gv_plex_flush(struct gv_plex *);
49 static int	gv_plex_offset(struct gv_plex *, off_t, off_t, off_t *, off_t *,
50 		    int *, int);
51 static int 	gv_plex_normal_request(struct gv_plex *, struct bio *, off_t,
52 		    off_t,  caddr_t);
53 static void	gv_post_bio(struct gv_softc *, struct bio *);
54 
55 void
56 gv_plex_start(struct gv_plex *p, struct bio *bp)
57 {
58 	struct bio *cbp;
59 	struct gv_sd *s;
60 	struct gv_raid5_packet *wp;
61 	caddr_t addr;
62 	off_t bcount, boff, len;
63 
64 	bcount = bp->bio_length;
65 	addr = bp->bio_data;
66 	boff = bp->bio_offset;
67 
68 	/* Walk over the whole length of the request, we might split it up. */
69 	while (bcount > 0) {
70 		wp = NULL;
71 
72  		/*
73 		 * RAID5 plexes need special treatment, as a single request
74 		 * might involve several read/write sub-requests.
75  		 */
76 		if (p->org == GV_PLEX_RAID5) {
77 			wp = gv_raid5_start(p, bp, addr, boff, bcount);
78  			if (wp == NULL)
79  				return;
80 
81 			len = wp->length;
82 
83 			if (TAILQ_EMPTY(&wp->bits))
84 				g_free(wp);
85 			else if (wp->lockbase != -1)
86 				TAILQ_INSERT_TAIL(&p->packets, wp, list);
87 
88 		/*
89 		 * Requests to concatenated and striped plexes go straight
90 		 * through.
91 		 */
92 		} else {
93 			len = gv_plex_normal_request(p, bp, boff, bcount, addr);
94 		}
95 		if (len < 0)
96 			return;
97 
98 		bcount -= len;
99 		addr += len;
100 		boff += len;
101 	}
102 
103 	/*
104 	 * Fire off all sub-requests.  We get the correct consumer (== drive)
105 	 * to send each request to via the subdisk that was stored in
106 	 * cbp->bio_caller1.
107 	 */
108 	cbp = bioq_takefirst(p->bqueue);
109 	while (cbp != NULL) {
110 		/*
111 		 * RAID5 sub-requests need to come in correct order, otherwise
112 		 * we trip over the parity, as it might be overwritten by
113 		 * another sub-request.  We abuse cbp->bio_caller2 to mark
114 		 * potential overlap situations.
115 		 */
116 		if (cbp->bio_caller2 != NULL && gv_stripe_active(p, cbp)) {
117 			/* Park the bio on the waiting queue. */
118 			cbp->bio_pflags |= GV_BIO_ONHOLD;
119 			bioq_disksort(p->wqueue, cbp);
120 		} else {
121 			s = cbp->bio_caller1;
122 			g_io_request(cbp, s->drive_sc->consumer);
123 		}
124 		cbp = bioq_takefirst(p->bqueue);
125 	}
126 }
127 
128 static int
129 gv_plex_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off,
130     off_t *real_len, int *sdno, int growing)
131 {
132 	struct gv_sd *s;
133 	int i, sdcount;
134 	off_t len_left, stripeend, stripeno, stripestart;
135 
136 	switch (p->org) {
137 	case GV_PLEX_CONCAT:
138 		/*
139 		 * Find the subdisk where this request starts.  The subdisks in
140 		 * this list must be ordered by plex_offset.
141 		 */
142 		i = 0;
143 		LIST_FOREACH(s, &p->subdisks, in_plex) {
144 			if (s->plex_offset <= boff &&
145 			    s->plex_offset + s->size > boff) {
146 				*sdno = i;
147 				break;
148 			}
149 			i++;
150 		}
151 		if (s == NULL || s->drive_sc == NULL)
152 			return (GV_ERR_NOTFOUND);
153 
154 		/* Calculate corresponding offsets on disk. */
155 		*real_off = boff - s->plex_offset;
156 		len_left = s->size - (*real_off);
157 		KASSERT(len_left >= 0, ("gv_plex_offset: len_left < 0"));
158 		*real_len = (bcount > len_left) ? len_left : bcount;
159 		break;
160 
161 	case GV_PLEX_STRIPED:
162 		/* The number of the stripe where the request starts. */
163 		stripeno = boff / p->stripesize;
164 		KASSERT(stripeno >= 0, ("gv_plex_offset: stripeno < 0"));
165 
166 		/* Take growing subdisks into account when calculating. */
167 		sdcount = gv_sdcount(p, (boff >= p->synced));
168 
169 		if (!(boff + bcount <= p->synced) &&
170 		    (p->flags & GV_PLEX_GROWING) &&
171 		    !growing)
172 			return (GV_ERR_ISBUSY);
173 		*sdno = stripeno % sdcount;
174 
175 		KASSERT(sdno >= 0, ("gv_plex_offset: sdno < 0"));
176 		stripestart = (stripeno / sdcount) *
177 		    p->stripesize;
178 		KASSERT(stripestart >= 0, ("gv_plex_offset: stripestart < 0"));
179 		stripeend = stripestart + p->stripesize;
180 		*real_off = boff - (stripeno * p->stripesize) +
181 		    stripestart;
182 		len_left = stripeend - *real_off;
183 		KASSERT(len_left >= 0, ("gv_plex_offset: len_left < 0"));
184 
185 		*real_len = (bcount <= len_left) ? bcount : len_left;
186 		break;
187 
188 	default:
189 		return (GV_ERR_PLEXORG);
190 	}
191 	return (0);
192 }
193 
194 /*
195  * Prepare a normal plex request.
196  */
197 static int
198 gv_plex_normal_request(struct gv_plex *p, struct bio *bp, off_t boff,
199     off_t bcount,  caddr_t addr)
200 {
201 	struct gv_sd *s;
202 	struct bio *cbp;
203 	off_t real_len, real_off;
204 	int i, err, sdno;
205 
206 	s = NULL;
207 	sdno = -1;
208 	real_len = real_off = 0;
209 
210 	err = ENXIO;
211 
212 	if (p == NULL || LIST_EMPTY(&p->subdisks))
213 		goto bad;
214 
215 	err = gv_plex_offset(p, boff, bcount, &real_off,
216 	    &real_len, &sdno, (bp->bio_pflags & GV_BIO_GROW));
217 	/* If the request was blocked, put it into wait. */
218 	if (err == GV_ERR_ISBUSY) {
219 		bioq_disksort(p->rqueue, bp);
220 		return (-1); /* "Fail", and delay request. */
221 	}
222 	if (err) {
223 		err = ENXIO;
224 		goto bad;
225 	}
226 	err = ENXIO;
227 
228 	/* Find the right subdisk. */
229 	i = 0;
230 	LIST_FOREACH(s, &p->subdisks, in_plex) {
231 		if (i == sdno)
232 			break;
233 		i++;
234 	}
235 
236 	/* Subdisk not found. */
237 	if (s == NULL || s->drive_sc == NULL)
238 		goto bad;
239 
240 	/* Now check if we can handle the request on this subdisk. */
241 	switch (s->state) {
242 	case GV_SD_UP:
243 		/* If the subdisk is up, just continue. */
244 		break;
245 	case GV_SD_DOWN:
246 		if (bp->bio_pflags & GV_BIO_INTERNAL)
247 			G_VINUM_DEBUG(0, "subdisk must be in the stale state in"
248 			    " order to perform administrative requests");
249 		goto bad;
250 	case GV_SD_STALE:
251 		if (!(bp->bio_pflags & GV_BIO_SYNCREQ)) {
252 			G_VINUM_DEBUG(0, "subdisk stale, unable to perform "
253 			    "regular requests");
254 			goto bad;
255 		}
256 
257 		G_VINUM_DEBUG(1, "sd %s is initializing", s->name);
258 		gv_set_sd_state(s, GV_SD_INITIALIZING, GV_SETSTATE_FORCE);
259 		break;
260 	case GV_SD_INITIALIZING:
261 		if (bp->bio_cmd == BIO_READ)
262 			goto bad;
263 		break;
264 	default:
265 		/* All other subdisk states mean it's not accessible. */
266 		goto bad;
267 	}
268 
269 	/* Clone the bio and adjust the offsets and sizes. */
270 	cbp = g_clone_bio(bp);
271 	if (cbp == NULL) {
272 		err = ENOMEM;
273 		goto bad;
274 	}
275 	cbp->bio_offset = real_off + s->drive_offset;
276 	cbp->bio_length = real_len;
277 	cbp->bio_data = addr;
278 	cbp->bio_done = gv_done;
279 	cbp->bio_caller1 = s;
280 
281 	/* Store the sub-requests now and let others issue them. */
282 	bioq_insert_tail(p->bqueue, cbp);
283 	return (real_len);
284 bad:
285 	G_VINUM_LOGREQ(0, bp, "plex request failed.");
286 	/* Building the sub-request failed. If internal BIO, do not deliver. */
287 	if (bp->bio_pflags & GV_BIO_INTERNAL) {
288 		if (bp->bio_pflags & GV_BIO_MALLOC)
289 			g_free(bp->bio_data);
290 		g_destroy_bio(bp);
291 		p->flags &= ~(GV_PLEX_SYNCING | GV_PLEX_REBUILDING |
292 		    GV_PLEX_GROWING);
293 		return (-1);
294 	}
295 	g_io_deliver(bp, err);
296 	return (-1);
297 }
298 
299 /*
300  * Handle a completed request to a striped or concatenated plex.
301  */
302 void
303 gv_plex_normal_done(struct gv_plex *p, struct bio *bp)
304 {
305 	struct bio *pbp;
306 
307 	pbp = bp->bio_parent;
308 	if (pbp->bio_error == 0)
309 		pbp->bio_error = bp->bio_error;
310 	g_destroy_bio(bp);
311 	pbp->bio_inbed++;
312 	if (pbp->bio_children == pbp->bio_inbed) {
313 		/* Just set it to length since multiple plexes will
314 		 * screw things up. */
315 		pbp->bio_completed = pbp->bio_length;
316 		if (pbp->bio_pflags & GV_BIO_SYNCREQ)
317 			gv_sync_complete(p, pbp);
318 		else if (pbp->bio_pflags & GV_BIO_GROW)
319 			gv_grow_complete(p, pbp);
320 		else
321 			g_io_deliver(pbp, pbp->bio_error);
322 	}
323 }
324 
325 /*
326  * Handle a completed request to a RAID-5 plex.
327  */
328 void
329 gv_plex_raid5_done(struct gv_plex *p, struct bio *bp)
330 {
331 	struct gv_softc *sc;
332 	struct bio *cbp, *pbp;
333 	struct gv_bioq *bq, *bq2;
334 	struct gv_raid5_packet *wp;
335 	off_t completed;
336 	int i;
337 
338 	completed = 0;
339 	sc = p->vinumconf;
340 	wp = bp->bio_caller2;
341 
342 	switch (bp->bio_parent->bio_cmd) {
343 	case BIO_READ:
344 		if (wp == NULL) {
345 			completed = bp->bio_completed;
346 			break;
347 		}
348 
349 		TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
350 			if (bq->bp != bp)
351 				continue;
352 			TAILQ_REMOVE(&wp->bits, bq, queue);
353 			g_free(bq);
354 			for (i = 0; i < wp->length; i++)
355 				wp->data[i] ^= bp->bio_data[i];
356 			break;
357 		}
358 		if (TAILQ_EMPTY(&wp->bits)) {
359 			completed = wp->length;
360 			if (wp->lockbase != -1) {
361 				TAILQ_REMOVE(&p->packets, wp, list);
362 				/* Bring the waiting bios back into the game. */
363 				pbp = bioq_takefirst(p->wqueue);
364 				while (pbp != NULL) {
365 					gv_post_bio(sc, pbp);
366 					pbp = bioq_takefirst(p->wqueue);
367 				}
368 			}
369 			g_free(wp);
370 		}
371 
372 		break;
373 
374  	case BIO_WRITE:
375 		/* XXX can this ever happen? */
376 		if (wp == NULL) {
377 			completed = bp->bio_completed;
378 			break;
379 		}
380 
381 		/* Check if we need to handle parity data. */
382 		TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) {
383 			if (bq->bp != bp)
384 				continue;
385 			TAILQ_REMOVE(&wp->bits, bq, queue);
386 			g_free(bq);
387 			cbp = wp->parity;
388 			if (cbp != NULL) {
389 				for (i = 0; i < wp->length; i++)
390 					cbp->bio_data[i] ^= bp->bio_data[i];
391 			}
392 			break;
393 		}
394 
395 		/* Handle parity data. */
396 		if (TAILQ_EMPTY(&wp->bits)) {
397 			if (bp->bio_parent->bio_pflags & GV_BIO_CHECK)
398 				i = gv_check_parity(p, bp, wp);
399 			else
400 				i = gv_normal_parity(p, bp, wp);
401 
402 			/* All of our sub-requests have finished. */
403 			if (i) {
404 				completed = wp->length;
405 				TAILQ_REMOVE(&p->packets, wp, list);
406 				/* Bring the waiting bios back into the game. */
407 				pbp = bioq_takefirst(p->wqueue);
408 				while (pbp != NULL) {
409 					gv_post_bio(sc, pbp);
410 					pbp = bioq_takefirst(p->wqueue);
411 				}
412 				g_free(wp);
413 			}
414 		}
415 
416 		break;
417 	}
418 
419 	pbp = bp->bio_parent;
420 	if (pbp->bio_error == 0)
421 		pbp->bio_error = bp->bio_error;
422 	pbp->bio_completed += completed;
423 
424 	/* When the original request is finished, we deliver it. */
425 	pbp->bio_inbed++;
426 	if (pbp->bio_inbed == pbp->bio_children) {
427 		/* Hand it over for checking or delivery. */
428 		if (pbp->bio_cmd == BIO_WRITE &&
429 		    (pbp->bio_pflags & GV_BIO_CHECK)) {
430 			gv_parity_complete(p, pbp);
431 		} else if (pbp->bio_cmd == BIO_WRITE &&
432 		    (pbp->bio_pflags & GV_BIO_REBUILD)) {
433 			gv_rebuild_complete(p, pbp);
434 		} else if (pbp->bio_pflags & GV_BIO_INIT) {
435 			gv_init_complete(p, pbp);
436 		} else if (pbp->bio_pflags & GV_BIO_SYNCREQ) {
437 			gv_sync_complete(p, pbp);
438 		} else if (pbp->bio_pflags & GV_BIO_GROW) {
439 			gv_grow_complete(p, pbp);
440 		} else {
441 			g_io_deliver(pbp, pbp->bio_error);
442 		}
443 	}
444 
445 	/* Clean up what we allocated. */
446 	if (bp->bio_cflags & GV_BIO_MALLOC)
447 		g_free(bp->bio_data);
448 	g_destroy_bio(bp);
449 }
450 
451 static int
452 gv_check_parity(struct gv_plex *p, struct bio *bp, struct gv_raid5_packet *wp)
453 {
454 	struct bio *pbp;
455 	struct gv_sd *s;
456 	int err, finished, i;
457 
458 	err = 0;
459 	finished = 1;
460 
461 	if (wp->waiting != NULL) {
462 		pbp = wp->waiting;
463 		wp->waiting = NULL;
464 		s = pbp->bio_caller1;
465 		g_io_request(pbp, s->drive_sc->consumer);
466 		finished = 0;
467 
468 	} else if (wp->parity != NULL) {
469 		pbp = wp->parity;
470 		wp->parity = NULL;
471 
472 		/* Check if the parity is correct. */
473 		for (i = 0; i < wp->length; i++) {
474 			if (bp->bio_data[i] != pbp->bio_data[i]) {
475 				err = 1;
476 				break;
477 			}
478 		}
479 
480 		/* The parity is not correct... */
481 		if (err) {
482 			bp->bio_parent->bio_error = EAGAIN;
483 
484 			/* ... but we rebuild it. */
485 			if (bp->bio_parent->bio_pflags & GV_BIO_PARITY) {
486 				s = pbp->bio_caller1;
487 				g_io_request(pbp, s->drive_sc->consumer);
488 				finished = 0;
489 			}
490 		}
491 
492 		/*
493 		 * Clean up the BIO we would have used for rebuilding the
494 		 * parity.
495 		 */
496 		if (finished) {
497 			bp->bio_parent->bio_inbed++;
498 			g_destroy_bio(pbp);
499 		}
500 
501 	}
502 
503 	return (finished);
504 }
505 
506 static int
507 gv_normal_parity(struct gv_plex *p, struct bio *bp, struct gv_raid5_packet *wp)
508 {
509 	struct bio *cbp, *pbp;
510 	struct gv_sd *s;
511 	int finished, i;
512 
513 	finished = 1;
514 
515 	if (wp->waiting != NULL) {
516 		pbp = wp->waiting;
517 		wp->waiting = NULL;
518 		cbp = wp->parity;
519 		for (i = 0; i < wp->length; i++)
520 			cbp->bio_data[i] ^= pbp->bio_data[i];
521 		s = pbp->bio_caller1;
522 		g_io_request(pbp, s->drive_sc->consumer);
523 		finished = 0;
524 
525 	} else if (wp->parity != NULL) {
526 		cbp = wp->parity;
527 		wp->parity = NULL;
528 		s = cbp->bio_caller1;
529 		g_io_request(cbp, s->drive_sc->consumer);
530 		finished = 0;
531 	}
532 
533 	return (finished);
534 }
535 
536 /* Flush the queue with delayed requests. */
537 static void
538 gv_plex_flush(struct gv_plex *p)
539 {
540 	struct gv_softc *sc;
541 	struct bio *bp;
542 
543 	sc = p->vinumconf;
544 	bp = bioq_takefirst(p->rqueue);
545 	while (bp != NULL) {
546 		gv_plex_start(p, bp);
547 		bp = bioq_takefirst(p->rqueue);
548 	}
549 }
550 
551 static void
552 gv_post_bio(struct gv_softc *sc, struct bio *bp)
553 {
554 
555 	KASSERT(sc != NULL, ("NULL sc"));
556 	KASSERT(bp != NULL, ("NULL bp"));
557 	mtx_lock(&sc->bqueue_mtx);
558 	bioq_disksort(sc->bqueue_down, bp);
559 	wakeup(sc);
560 	mtx_unlock(&sc->bqueue_mtx);
561 }
562 
563 int
564 gv_sync_request(struct gv_plex *from, struct gv_plex *to, off_t offset,
565     off_t length, int type, caddr_t data)
566 {
567 	struct gv_softc *sc;
568 	struct bio *bp;
569 
570 	KASSERT(from != NULL, ("NULL from"));
571 	KASSERT(to != NULL, ("NULL to"));
572 	sc = from->vinumconf;
573 	KASSERT(sc != NULL, ("NULL sc"));
574 
575 	bp = g_new_bio();
576 	if (bp == NULL) {
577 		G_VINUM_DEBUG(0, "sync from '%s' failed at offset "
578 		    " %jd; out of memory", from->name, offset);
579 		return (ENOMEM);
580 	}
581 	bp->bio_length = length;
582 	bp->bio_done = gv_done;
583 	bp->bio_pflags |= GV_BIO_SYNCREQ;
584 	bp->bio_offset = offset;
585 	bp->bio_caller1 = from;
586 	bp->bio_caller2 = to;
587 	bp->bio_cmd = type;
588 	if (data == NULL)
589 		data = g_malloc(length, M_WAITOK);
590 	bp->bio_pflags |= GV_BIO_MALLOC; /* Free on the next run. */
591 	bp->bio_data = data;
592 
593 	/* Send down next. */
594 	gv_post_bio(sc, bp);
595 	//gv_plex_start(from, bp);
596 	return (0);
597 }
598 
599 /*
600  * Handle a finished plex sync bio.
601  */
602 int
603 gv_sync_complete(struct gv_plex *to, struct bio *bp)
604 {
605 	struct gv_plex *from, *p;
606 	struct gv_sd *s;
607 	struct gv_volume *v;
608 	struct gv_softc *sc;
609 	off_t offset;
610 	int err;
611 
612 	g_topology_assert_not();
613 
614 	err = 0;
615 	KASSERT(to != NULL, ("NULL to"));
616 	KASSERT(bp != NULL, ("NULL bp"));
617 	from = bp->bio_caller2;
618 	KASSERT(from != NULL, ("NULL from"));
619 	v = to->vol_sc;
620 	KASSERT(v != NULL, ("NULL v"));
621 	sc = v->vinumconf;
622 	KASSERT(sc != NULL, ("NULL sc"));
623 
624 	/* If it was a read, write it. */
625 	if (bp->bio_cmd == BIO_READ) {
626 		err = gv_sync_request(from, to, bp->bio_offset, bp->bio_length,
627 	    	    BIO_WRITE, bp->bio_data);
628 	/* If it was a write, read the next one. */
629 	} else if (bp->bio_cmd == BIO_WRITE) {
630 		if (bp->bio_pflags & GV_BIO_MALLOC)
631 			g_free(bp->bio_data);
632 		to->synced += bp->bio_length;
633 		/* If we're finished, clean up. */
634 		if (bp->bio_offset + bp->bio_length >= from->size) {
635 			G_VINUM_DEBUG(1, "syncing of %s from %s completed",
636 			    to->name, from->name);
637 			/* Update our state. */
638 			LIST_FOREACH(s, &to->subdisks, in_plex)
639 				gv_set_sd_state(s, GV_SD_UP, 0);
640 			gv_update_plex_state(to);
641 			to->flags &= ~GV_PLEX_SYNCING;
642 			to->synced = 0;
643 			gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0);
644 		} else {
645 			offset = bp->bio_offset + bp->bio_length;
646 			err = gv_sync_request(from, to, offset,
647 			    MIN(bp->bio_length, from->size - offset),
648 			    BIO_READ, NULL);
649 		}
650 	}
651 	g_destroy_bio(bp);
652 	/* Clean up if there was an error. */
653 	if (err) {
654 		to->flags &= ~GV_PLEX_SYNCING;
655 		G_VINUM_DEBUG(0, "error syncing plexes: error code %d", err);
656 	}
657 
658 	/* Check if all plexes are synced, and lower refcounts. */
659 	g_topology_lock();
660 	LIST_FOREACH(p, &v->plexes, in_volume) {
661 		if (p->flags & GV_PLEX_SYNCING) {
662 			g_topology_unlock();
663 			return (-1);
664 		}
665 	}
666 	/* If we came here, all plexes are synced, and we're free. */
667 	gv_access(v->provider, -1, -1, 0);
668 	g_topology_unlock();
669 	G_VINUM_DEBUG(1, "plex sync completed");
670 	gv_volume_flush(v);
671 	return (0);
672 }
673 
674 /*
675  * Create a new bio struct for the next grow request.
676  */
677 int
678 gv_grow_request(struct gv_plex *p, off_t offset, off_t length, int type,
679     caddr_t data)
680 {
681 	struct gv_softc *sc;
682 	struct bio *bp;
683 
684 	KASSERT(p != NULL, ("gv_grow_request: NULL p"));
685 	sc = p->vinumconf;
686 	KASSERT(sc != NULL, ("gv_grow_request: NULL sc"));
687 
688 	bp = g_new_bio();
689 	if (bp == NULL) {
690 		G_VINUM_DEBUG(0, "grow of %s failed creating bio: "
691 		    "out of memory", p->name);
692 		return (ENOMEM);
693 	}
694 
695 	bp->bio_cmd = type;
696 	bp->bio_done = gv_done;
697 	bp->bio_error = 0;
698 	bp->bio_caller1 = p;
699 	bp->bio_offset = offset;
700 	bp->bio_length = length;
701 	bp->bio_pflags |= GV_BIO_GROW;
702 	if (data == NULL)
703 		data = g_malloc(length, M_WAITOK);
704 	bp->bio_pflags |= GV_BIO_MALLOC;
705 	bp->bio_data = data;
706 
707 	gv_post_bio(sc, bp);
708 	//gv_plex_start(p, bp);
709 	return (0);
710 }
711 
712 /*
713  * Finish handling of a bio to a growing plex.
714  */
715 void
716 gv_grow_complete(struct gv_plex *p, struct bio *bp)
717 {
718 	struct gv_softc *sc;
719 	struct gv_sd *s;
720 	struct gv_volume *v;
721 	off_t origsize, offset;
722 	int sdcount, err;
723 
724 	v = p->vol_sc;
725 	KASSERT(v != NULL, ("gv_grow_complete: NULL v"));
726 	sc = v->vinumconf;
727 	KASSERT(sc != NULL, ("gv_grow_complete: NULL sc"));
728 	err = 0;
729 
730 	/* If it was a read, write it. */
731 	if (bp->bio_cmd == BIO_READ) {
732 		p->synced += bp->bio_length;
733 		err = gv_grow_request(p, bp->bio_offset, bp->bio_length,
734 		    BIO_WRITE, bp->bio_data);
735 	/* If it was a write, read next. */
736 	} else if (bp->bio_cmd == BIO_WRITE) {
737 		if (bp->bio_pflags & GV_BIO_MALLOC)
738 			g_free(bp->bio_data);
739 
740 		/* Find the real size of the plex. */
741 		sdcount = gv_sdcount(p, 1);
742 		s = LIST_FIRST(&p->subdisks);
743 		KASSERT(s != NULL, ("NULL s"));
744 		origsize = (s->size * (sdcount - 1));
745 		if (bp->bio_offset + bp->bio_length >= origsize) {
746 			G_VINUM_DEBUG(1, "growing of %s completed", p->name);
747 			p->flags &= ~GV_PLEX_GROWING;
748 			LIST_FOREACH(s, &p->subdisks, in_plex) {
749 				s->flags &= ~GV_SD_GROW;
750 				gv_set_sd_state(s, GV_SD_UP, 0);
751 			}
752 			p->size = gv_plex_size(p);
753 			gv_update_vol_size(v, gv_vol_size(v));
754 			gv_set_plex_state(p, GV_PLEX_UP, 0);
755 			g_topology_lock();
756 			gv_access(v->provider, -1, -1, 0);
757 			g_topology_unlock();
758 			p->synced = 0;
759 			gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0);
760 			/* Issue delayed requests. */
761 			gv_plex_flush(p);
762 		} else {
763 			offset = bp->bio_offset + bp->bio_length;
764 			err = gv_grow_request(p, offset,
765 			   MIN(bp->bio_length, origsize - offset),
766 			   BIO_READ, NULL);
767 		}
768 	}
769 	g_destroy_bio(bp);
770 
771 	if (err) {
772 		p->flags &= ~GV_PLEX_GROWING;
773 		G_VINUM_DEBUG(0, "error growing plex: error code %d", err);
774 	}
775 }
776 
777 
778 /*
779  * Create an initialization BIO and send it off to the consumer. Assume that
780  * we're given initialization data as parameter.
781  */
782 void
783 gv_init_request(struct gv_sd *s, off_t start, caddr_t data, off_t length)
784 {
785 	struct gv_drive *d;
786 	struct g_consumer *cp;
787 	struct bio *bp, *cbp;
788 
789 	KASSERT(s != NULL, ("gv_init_request: NULL s"));
790 	d = s->drive_sc;
791 	KASSERT(d != NULL, ("gv_init_request: NULL d"));
792 	cp = d->consumer;
793 	KASSERT(cp != NULL, ("gv_init_request: NULL cp"));
794 
795 	bp = g_new_bio();
796 	if (bp == NULL) {
797 		G_VINUM_DEBUG(0, "subdisk '%s' init: write failed at offset %jd"
798 		    " (drive offset %jd); out of memory", s->name,
799 		    (intmax_t)s->initialized, (intmax_t)start);
800 		return; /* XXX: Error codes. */
801 	}
802 	bp->bio_cmd = BIO_WRITE;
803 	bp->bio_data = data;
804 	bp->bio_done = gv_done;
805 	bp->bio_error = 0;
806 	bp->bio_length = length;
807 	bp->bio_pflags |= GV_BIO_INIT;
808 	bp->bio_offset = start;
809 	bp->bio_caller1 = s;
810 
811 	/* Then ofcourse, we have to clone it. */
812 	cbp = g_clone_bio(bp);
813 	if (cbp == NULL) {
814 		G_VINUM_DEBUG(0, "subdisk '%s' init: write failed at offset %jd"
815 		    " (drive offset %jd); out of memory", s->name,
816 		    (intmax_t)s->initialized, (intmax_t)start);
817 		return; /* XXX: Error codes. */
818 	}
819 	cbp->bio_done = gv_done;
820 	cbp->bio_caller1 = s;
821 	/* Send it off to the consumer. */
822 	g_io_request(cbp, cp);
823 }
824 
825 /*
826  * Handle a finished initialization BIO.
827  */
828 void
829 gv_init_complete(struct gv_plex *p, struct bio *bp)
830 {
831 	struct gv_softc *sc;
832 	struct gv_drive *d;
833 	struct g_consumer *cp;
834 	struct gv_sd *s;
835 	off_t start, length;
836 	caddr_t data;
837 	int error;
838 
839 	s = bp->bio_caller1;
840 	start = bp->bio_offset;
841 	length = bp->bio_length;
842 	error = bp->bio_error;
843 	data = bp->bio_data;
844 
845 	KASSERT(s != NULL, ("gv_init_complete: NULL s"));
846 	d = s->drive_sc;
847 	KASSERT(d != NULL, ("gv_init_complete: NULL d"));
848 	cp = d->consumer;
849 	KASSERT(cp != NULL, ("gv_init_complete: NULL cp"));
850 	sc = p->vinumconf;
851 	KASSERT(sc != NULL, ("gv_init_complete: NULL sc"));
852 
853 	g_destroy_bio(bp);
854 
855 	/*
856 	 * First we need to find out if it was okay, and abort if it's not.
857 	 * Then we need to free previous buffers, find out the correct subdisk,
858 	 * as well as getting the correct starting point and length of the BIO.
859 	 */
860 	if (start >= s->drive_offset + s->size) {
861 		/* Free the data we initialized. */
862 		if (data != NULL)
863 			g_free(data);
864 		g_topology_assert_not();
865 		g_topology_lock();
866 		g_access(cp, 0, -1, 0);
867 		g_topology_unlock();
868 		if (error) {
869 			gv_set_sd_state(s, GV_SD_STALE, GV_SETSTATE_FORCE |
870 			    GV_SETSTATE_CONFIG);
871 		} else {
872 			gv_set_sd_state(s, GV_SD_UP, GV_SETSTATE_CONFIG);
873 			s->initialized = 0;
874 			gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0);
875 			G_VINUM_DEBUG(1, "subdisk '%s' init: finished "
876 			    "successfully", s->name);
877 		}
878 		return;
879 	}
880 	s->initialized += length;
881 	start += length;
882 	gv_init_request(s, start, data, length);
883 }
884 
885 /*
886  * Create a new bio struct for the next parity rebuild. Used both by internal
887  * rebuild of degraded plexes as well as user initiated rebuilds/checks.
888  */
889 void
890 gv_parity_request(struct gv_plex *p, int flags, off_t offset)
891 {
892 	struct gv_softc *sc;
893 	struct bio *bp;
894 
895 	KASSERT(p != NULL, ("gv_parity_request: NULL p"));
896 	sc = p->vinumconf;
897 	KASSERT(sc != NULL, ("gv_parity_request: NULL sc"));
898 
899 	bp = g_new_bio();
900 	if (bp == NULL) {
901 		G_VINUM_DEBUG(0, "rebuild of %s failed creating bio: "
902 		    "out of memory", p->name);
903 		return;
904 	}
905 
906 	bp->bio_cmd = BIO_WRITE;
907 	bp->bio_done = gv_done;
908 	bp->bio_error = 0;
909 	bp->bio_length = p->stripesize;
910 	bp->bio_caller1 = p;
911 
912 	/*
913 	 * Check if it's a rebuild of a degraded plex or a user request of
914 	 * parity rebuild.
915 	 */
916 	if (flags & GV_BIO_REBUILD)
917 		bp->bio_data = g_malloc(GV_DFLT_SYNCSIZE, M_WAITOK);
918 	else if (flags & GV_BIO_CHECK)
919 		bp->bio_data = g_malloc(p->stripesize, M_WAITOK | M_ZERO);
920 	else {
921 		G_VINUM_DEBUG(0, "invalid flags given in rebuild");
922 		return;
923 	}
924 
925 	bp->bio_pflags = flags;
926 	bp->bio_pflags |= GV_BIO_MALLOC;
927 
928 	/* We still have more parity to build. */
929 	bp->bio_offset = offset;
930 	gv_post_bio(sc, bp);
931 	//gv_plex_start(p, bp); /* Send it down to the plex. */
932 }
933 
934 /*
935  * Handle a finished parity write.
936  */
937 void
938 gv_parity_complete(struct gv_plex *p, struct bio *bp)
939 {
940 	struct gv_softc *sc;
941 	int error, flags;
942 
943 	error = bp->bio_error;
944 	flags = bp->bio_pflags;
945 	flags &= ~GV_BIO_MALLOC;
946 
947 	sc = p->vinumconf;
948 	KASSERT(sc != NULL, ("gv_parity_complete: NULL sc"));
949 
950 	/* Clean up what we allocated. */
951 	if (bp->bio_pflags & GV_BIO_MALLOC)
952 		g_free(bp->bio_data);
953 	g_destroy_bio(bp);
954 
955 	if (error == EAGAIN) {
956 		G_VINUM_DEBUG(0, "parity incorrect at offset 0x%jx",
957 		    (intmax_t)p->synced);
958 	}
959 
960 	/* Any error is fatal, except EAGAIN when we're rebuilding. */
961 	if (error && !(error == EAGAIN && (flags & GV_BIO_PARITY))) {
962 		/* Make sure we don't have the lock. */
963 		g_topology_assert_not();
964 		g_topology_lock();
965 		gv_access(p->vol_sc->provider, -1, -1, 0);
966 		g_topology_unlock();
967 		G_VINUM_DEBUG(0, "parity check on %s failed at 0x%jx "
968 		    "errno %d", p->name, (intmax_t)p->synced, error);
969 		return;
970 	} else {
971 		p->synced += p->stripesize;
972 	}
973 
974 	if (p->synced >= p->size) {
975 		/* Make sure we don't have the lock. */
976 		g_topology_assert_not();
977 		g_topology_lock();
978 		gv_access(p->vol_sc->provider, -1, -1, 0);
979 		g_topology_unlock();
980 		/* We're finished. */
981 		G_VINUM_DEBUG(1, "parity operation on %s finished", p->name);
982 		p->synced = 0;
983 		gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0);
984 		return;
985 	}
986 
987 	/* Send down next. It will determine if we need to itself. */
988 	gv_parity_request(p, flags, p->synced);
989 }
990 
991 /*
992  * Handle a finished plex rebuild bio.
993  */
994 void
995 gv_rebuild_complete(struct gv_plex *p, struct bio *bp)
996 {
997 	struct gv_softc *sc;
998 	struct gv_sd *s;
999 	int error, flags;
1000 	off_t offset;
1001 
1002 	error = bp->bio_error;
1003 	flags = bp->bio_pflags;
1004 	offset = bp->bio_offset;
1005 	flags &= ~GV_BIO_MALLOC;
1006 	sc = p->vinumconf;
1007 	KASSERT(sc != NULL, ("gv_rebuild_complete: NULL sc"));
1008 
1009 	/* Clean up what we allocated. */
1010 	if (bp->bio_pflags & GV_BIO_MALLOC)
1011 		g_free(bp->bio_data);
1012 	g_destroy_bio(bp);
1013 
1014 	if (error) {
1015 		g_topology_assert_not();
1016 		g_topology_lock();
1017 		gv_access(p->vol_sc->provider, -1, -1, 0);
1018 		g_topology_unlock();
1019 
1020 		G_VINUM_DEBUG(0, "rebuild of %s failed at offset %jd errno: %d",
1021 		    p->name, (intmax_t)offset, error);
1022 		p->flags &= ~GV_PLEX_REBUILDING;
1023 		p->synced = 0;
1024 		gv_plex_flush(p); /* Flush out remaining rebuild BIOs. */
1025 		return;
1026 	}
1027 
1028 	offset += (p->stripesize * (gv_sdcount(p, 1) - 1));
1029 	if (offset >= p->size) {
1030 		/* We're finished. */
1031 		g_topology_assert_not();
1032 		g_topology_lock();
1033 		gv_access(p->vol_sc->provider, -1, -1, 0);
1034 		g_topology_unlock();
1035 
1036 		G_VINUM_DEBUG(1, "rebuild of %s finished", p->name);
1037 		gv_save_config(p->vinumconf);
1038 		p->flags &= ~GV_PLEX_REBUILDING;
1039 		p->synced = 0;
1040 		/* Try to up all subdisks. */
1041 		LIST_FOREACH(s, &p->subdisks, in_plex)
1042 			gv_update_sd_state(s);
1043 		gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0);
1044 		gv_plex_flush(p); /* Flush out remaining rebuild BIOs. */
1045 		return;
1046 	}
1047 
1048 	/* Send down next. It will determine if we need to itself. */
1049 	gv_parity_request(p, flags, offset);
1050 }
1051