xref: /freebsd/sys/geom/vinum/geom_vinum_raid5.c (revision 39beb93c3f8bdbf72a61fda42300b5ebed7390c8)
1 /*-
2  * Copyright (c) 2004 Lukas Ertl
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29 
30 #include <sys/param.h>
31 #include <sys/bio.h>
32 #include <sys/conf.h>
33 #include <sys/errno.h>
34 #include <sys/kernel.h>
35 #include <sys/kthread.h>
36 #include <sys/libkern.h>
37 #include <sys/lock.h>
38 #include <sys/malloc.h>
39 #include <sys/mutex.h>
40 #include <sys/systm.h>
41 
42 #include <geom/geom.h>
43 #include <geom/vinum/geom_vinum_var.h>
44 #include <geom/vinum/geom_vinum_raid5.h>
45 #include <geom/vinum/geom_vinum.h>
46 
/*
 * Forward declaration; the function is defined further down in this file.
 * Arguments, in order: plex, request offset, request length, then the output
 * pointers for the subdisk offset, the subdisk length, the data subdisk
 * index and the parity subdisk index.
 */
int	gv_raid5_offset(struct gv_plex *, off_t, off_t, off_t *, off_t *,
	    int *, int *);
49 
50 /*
51  * Check if the stripe that the work packet wants is already being used by
52  * some other work packet.
53  */
54 int
55 gv_stripe_active(struct gv_plex *p, struct bio *bp)
56 {
57 	struct gv_raid5_packet *wp, *owp;
58 	int overlap;
59 
60 	wp = bp->bio_driver1;
61 	if (wp->lockbase == -1)
62 		return (0);
63 
64 	overlap = 0;
65 	TAILQ_FOREACH(owp, &p->packets, list) {
66 		if (owp == wp)
67 			break;
68 		if ((wp->lockbase >= owp->lockbase) &&
69 		    (wp->lockbase <= owp->lockbase + owp->length)) {
70 			overlap++;
71 			break;
72 		}
73 		if ((wp->lockbase <= owp->lockbase) &&
74 		    (wp->lockbase + wp->length >= owp->lockbase)) {
75 			overlap++;
76 			break;
77 		}
78 	}
79 
80 	return (overlap);
81 }
82 
83 int
84 gv_check_raid5(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
85     caddr_t addr, off_t boff, off_t bcount)
86 {
87 	struct gv_sd *parity, *s;
88 	struct gv_bioq *bq;
89 	struct bio *cbp, *pbp;
90 	int i, psdno;
91 	off_t real_len, real_off;
92 
93 	if (p == NULL || LIST_EMPTY(&p->subdisks))
94 		return (ENXIO);
95 
96 	gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, &psdno);
97 
98 	/* Find the right subdisk. */
99 	parity = NULL;
100 	i = 0;
101 	LIST_FOREACH(s, &p->subdisks, in_plex) {
102 		if (i == psdno) {
103 			parity = s;
104 			break;
105 		}
106 		i++;
107 	}
108 
109 	/* Parity stripe not found. */
110 	if (parity == NULL)
111 		return (ENXIO);
112 
113 	if (parity->state != GV_SD_UP)
114 		return (ENXIO);
115 
116 	wp->length = real_len;
117 	wp->data = addr;
118 	wp->lockbase = real_off;
119 
120 	/* Read all subdisks. */
121 	LIST_FOREACH(s, &p->subdisks, in_plex) {
122 		/* Skip the parity subdisk. */
123 		if (s == parity)
124 			continue;
125 
126 		cbp = g_clone_bio(bp);
127 		if (cbp == NULL)
128 			return (ENOMEM);
129 		cbp->bio_cmd = BIO_READ;
130 		cbp->bio_data = g_malloc(real_len, M_WAITOK);
131 		cbp->bio_cflags |= GV_BIO_MALLOC;
132 		cbp->bio_offset = real_off;
133 		cbp->bio_length = real_len;
134 		cbp->bio_done = gv_plex_done;
135 		cbp->bio_caller2 = s->consumer;
136 		cbp->bio_driver1 = wp;
137 
138 		GV_ENQUEUE(bp, cbp, pbp);
139 
140 		bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
141 		bq->bp = cbp;
142 		TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
143 	}
144 
145 	/* Read the parity data. */
146 	cbp = g_clone_bio(bp);
147 	if (cbp == NULL)
148 		return (ENOMEM);
149 	cbp->bio_cmd = BIO_READ;
150 	cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO);
151 	cbp->bio_cflags |= GV_BIO_MALLOC;
152 	cbp->bio_offset = real_off;
153 	cbp->bio_length = real_len;
154 	cbp->bio_done = gv_plex_done;
155 	cbp->bio_caller2 = parity->consumer;
156 	cbp->bio_driver1 = wp;
157 	wp->waiting = cbp;
158 
159 	/*
160 	 * In case we want to rebuild the parity, create an extra BIO to write
161 	 * it out.  It also acts as buffer for the XOR operations.
162 	 */
163 	cbp = g_clone_bio(bp);
164 	if (cbp == NULL)
165 		return (ENOMEM);
166 	cbp->bio_data = addr;
167 	cbp->bio_offset = real_off;
168 	cbp->bio_length = real_len;
169 	cbp->bio_done = gv_plex_done;
170 	cbp->bio_caller2 = parity->consumer;
171 	cbp->bio_driver1 = wp;
172 	wp->parity = cbp;
173 
174 	return (0);
175 }
176 
177 /* Rebuild a degraded RAID5 plex. */
178 int
179 gv_rebuild_raid5(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp,
180     caddr_t addr, off_t boff, off_t bcount)
181 {
182 	struct gv_sd *broken, *s;
183 	struct gv_bioq *bq;
184 	struct bio *cbp, *pbp;
185 	off_t real_len, real_off;
186 
187 	if (p == NULL || LIST_EMPTY(&p->subdisks))
188 		return (ENXIO);
189 
190 	gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, NULL);
191 
192 	/* Find the right subdisk. */
193 	broken = NULL;
194 	LIST_FOREACH(s, &p->subdisks, in_plex) {
195 		if (s->state != GV_SD_UP)
196 			broken = s;
197 	}
198 
199 	/* Broken stripe not found. */
200 	if (broken == NULL)
201 		return (ENXIO);
202 
203 	switch (broken->state) {
204 	case GV_SD_UP:
205 		return (EINVAL);
206 
207 	case GV_SD_STALE:
208 		if (!(bp->bio_cflags & GV_BIO_REBUILD))
209 			return (ENXIO);
210 
211 		G_VINUM_DEBUG(1, "sd %s is reviving", broken->name);
212 		gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE);
213 		break;
214 
215 	case GV_SD_REVIVING:
216 		break;
217 
218 	default:
219 		/* All other subdisk states mean it's not accessible. */
220 		return (ENXIO);
221 	}
222 
223 	wp->length = real_len;
224 	wp->data = addr;
225 	wp->lockbase = real_off;
226 
227 	KASSERT(wp->length >= 0, ("gv_rebuild_raid5: wp->length < 0"));
228 
229 	/* Read all subdisks. */
230 	LIST_FOREACH(s, &p->subdisks, in_plex) {
231 		/* Skip the broken subdisk. */
232 		if (s == broken)
233 			continue;
234 
235 		cbp = g_clone_bio(bp);
236 		if (cbp == NULL)
237 			return (ENOMEM);
238 		cbp->bio_cmd = BIO_READ;
239 		cbp->bio_data = g_malloc(real_len, M_WAITOK);
240 		cbp->bio_cflags |= GV_BIO_MALLOC;
241 		cbp->bio_offset = real_off;
242 		cbp->bio_length = real_len;
243 		cbp->bio_done = gv_plex_done;
244 		cbp->bio_caller2 = s->consumer;
245 		cbp->bio_driver1 = wp;
246 
247 		GV_ENQUEUE(bp, cbp, pbp);
248 
249 		bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
250 		bq->bp = cbp;
251 		TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
252 	}
253 
254 	/* Write the parity data. */
255 	cbp = g_clone_bio(bp);
256 	if (cbp == NULL)
257 		return (ENOMEM);
258 	cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO);
259 	cbp->bio_cflags |= GV_BIO_MALLOC;
260 	cbp->bio_offset = real_off;
261 	cbp->bio_length = real_len;
262 	cbp->bio_done = gv_plex_done;
263 	cbp->bio_caller2 = broken->consumer;
264 	cbp->bio_driver1 = wp;
265 	cbp->bio_cflags |= GV_BIO_REBUILD;
266 	wp->parity = cbp;
267 
268 	p->synced = boff;
269 
270 	return (0);
271 }
272 
273 /* Build a request group to perform (part of) a RAID5 request. */
274 int
275 gv_build_raid5_req(struct gv_plex *p, struct gv_raid5_packet *wp,
276     struct bio *bp, caddr_t addr, off_t boff, off_t bcount)
277 {
278 	struct g_geom *gp;
279 	struct gv_sd *broken, *original, *parity, *s;
280 	struct gv_bioq *bq;
281 	struct bio *cbp, *pbp;
282 	int i, psdno, sdno, type;
283 	off_t real_len, real_off;
284 
285 	gp = bp->bio_to->geom;
286 
287 	if (p == NULL || LIST_EMPTY(&p->subdisks))
288 		return (ENXIO);
289 
290 	/* We are optimistic and assume that this request will be OK. */
291 #define	REQ_TYPE_NORMAL		0
292 #define	REQ_TYPE_DEGRADED	1
293 #define	REQ_TYPE_NOPARITY	2
294 
295 	type = REQ_TYPE_NORMAL;
296 	original = parity = broken = NULL;
297 
298 	gv_raid5_offset(p, boff, bcount, &real_off, &real_len, &sdno, &psdno);
299 
300 	/* Find the right subdisks. */
301 	i = 0;
302 	LIST_FOREACH(s, &p->subdisks, in_plex) {
303 		if (i == sdno)
304 			original = s;
305 		if (i == psdno)
306 			parity = s;
307 		if (s->state != GV_SD_UP)
308 			broken = s;
309 		i++;
310 	}
311 
312 	if ((original == NULL) || (parity == NULL))
313 		return (ENXIO);
314 
315 	/* Our data stripe is missing. */
316 	if (original->state != GV_SD_UP)
317 		type = REQ_TYPE_DEGRADED;
318 	/* Our parity stripe is missing. */
319 	if (parity->state != GV_SD_UP) {
320 		/* We cannot take another failure if we're already degraded. */
321 		if (type != REQ_TYPE_NORMAL)
322 			return (ENXIO);
323 		else
324 			type = REQ_TYPE_NOPARITY;
325 	}
326 
327 	wp->length = real_len;
328 	wp->data = addr;
329 	wp->lockbase = real_off;
330 
331 	KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0"));
332 
333 	if ((p->flags & GV_PLEX_SYNCING) && (boff + real_len < p->synced))
334 		type = REQ_TYPE_NORMAL;
335 
336 	switch (bp->bio_cmd) {
337 	case BIO_READ:
338 		/*
339 		 * For a degraded read we need to read in all stripes except
340 		 * the broken one plus the parity stripe and then recalculate
341 		 * the desired data.
342 		 */
343 		if (type == REQ_TYPE_DEGRADED) {
344 			bzero(wp->data, wp->length);
345 			LIST_FOREACH(s, &p->subdisks, in_plex) {
346 				/* Skip the broken subdisk. */
347 				if (s == broken)
348 					continue;
349 				cbp = g_clone_bio(bp);
350 				if (cbp == NULL)
351 					return (ENOMEM);
352 				cbp->bio_data = g_malloc(real_len, M_WAITOK);
353 				cbp->bio_cflags |= GV_BIO_MALLOC;
354 				cbp->bio_offset = real_off;
355 				cbp->bio_length = real_len;
356 				cbp->bio_done = gv_plex_done;
357 				cbp->bio_caller2 = s->consumer;
358 				cbp->bio_driver1 = wp;
359 
360 				GV_ENQUEUE(bp, cbp, pbp);
361 
362 				bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
363 				bq->bp = cbp;
364 				TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
365 			}
366 
367 		/* A normal read can be fulfilled with the original subdisk. */
368 		} else {
369 			cbp = g_clone_bio(bp);
370 			if (cbp == NULL)
371 				return (ENOMEM);
372 			cbp->bio_offset = real_off;
373 			cbp->bio_length = real_len;
374 			cbp->bio_data = addr;
375 			cbp->bio_done = g_std_done;
376 			cbp->bio_caller2 = original->consumer;
377 
378 			GV_ENQUEUE(bp, cbp, pbp);
379 		}
380 		wp->lockbase = -1;
381 
382 		break;
383 
384 	case BIO_WRITE:
385 		/*
386 		 * A degraded write means we cannot write to the original data
387 		 * subdisk.  Thus we need to read in all valid stripes,
388 		 * recalculate the parity from the original data, and then
389 		 * write the parity stripe back out.
390 		 */
391 		if (type == REQ_TYPE_DEGRADED) {
392 			/* Read all subdisks. */
393 			LIST_FOREACH(s, &p->subdisks, in_plex) {
394 				/* Skip the broken and the parity subdisk. */
395 				if ((s == broken) || (s == parity))
396 					continue;
397 
398 				cbp = g_clone_bio(bp);
399 				if (cbp == NULL)
400 					return (ENOMEM);
401 				cbp->bio_cmd = BIO_READ;
402 				cbp->bio_data = g_malloc(real_len, M_WAITOK);
403 				cbp->bio_cflags |= GV_BIO_MALLOC;
404 				cbp->bio_offset = real_off;
405 				cbp->bio_length = real_len;
406 				cbp->bio_done = gv_plex_done;
407 				cbp->bio_caller2 = s->consumer;
408 				cbp->bio_driver1 = wp;
409 
410 				GV_ENQUEUE(bp, cbp, pbp);
411 
412 				bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
413 				bq->bp = cbp;
414 				TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
415 			}
416 
417 			/* Write the parity data. */
418 			cbp = g_clone_bio(bp);
419 			if (cbp == NULL)
420 				return (ENOMEM);
421 			cbp->bio_data = g_malloc(real_len, M_WAITOK);
422 			cbp->bio_cflags |= GV_BIO_MALLOC;
423 			bcopy(addr, cbp->bio_data, real_len);
424 			cbp->bio_offset = real_off;
425 			cbp->bio_length = real_len;
426 			cbp->bio_done = gv_plex_done;
427 			cbp->bio_caller2 = parity->consumer;
428 			cbp->bio_driver1 = wp;
429 			wp->parity = cbp;
430 
431 		/*
432 		 * When the parity stripe is missing we just write out the data.
433 		 */
434 		} else if (type == REQ_TYPE_NOPARITY) {
435 			cbp = g_clone_bio(bp);
436 			if (cbp == NULL)
437 				return (ENOMEM);
438 			cbp->bio_offset = real_off;
439 			cbp->bio_length = real_len;
440 			cbp->bio_data = addr;
441 			cbp->bio_done = gv_plex_done;
442 			cbp->bio_caller2 = original->consumer;
443 			cbp->bio_driver1 = wp;
444 
445 			GV_ENQUEUE(bp, cbp, pbp);
446 
447 			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
448 			bq->bp = cbp;
449 			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
450 
451 		/*
452 		 * A normal write request goes to the original subdisk, then we
453 		 * read in all other stripes, recalculate the parity and write
454 		 * out the parity again.
455 		 */
456 		} else {
457 			/* Read old parity. */
458 			cbp = g_clone_bio(bp);
459 			if (cbp == NULL)
460 				return (ENOMEM);
461 			cbp->bio_cmd = BIO_READ;
462 			cbp->bio_data = g_malloc(real_len, M_WAITOK);
463 			cbp->bio_cflags |= GV_BIO_MALLOC;
464 			cbp->bio_offset = real_off;
465 			cbp->bio_length = real_len;
466 			cbp->bio_done = gv_plex_done;
467 			cbp->bio_caller2 = parity->consumer;
468 			cbp->bio_driver1 = wp;
469 
470 			GV_ENQUEUE(bp, cbp, pbp);
471 
472 			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
473 			bq->bp = cbp;
474 			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
475 
476 			/* Read old data. */
477 			cbp = g_clone_bio(bp);
478 			if (cbp == NULL)
479 				return (ENOMEM);
480 			cbp->bio_cmd = BIO_READ;
481 			cbp->bio_data = g_malloc(real_len, M_WAITOK);
482 			cbp->bio_cflags |= GV_BIO_MALLOC;
483 			cbp->bio_offset = real_off;
484 			cbp->bio_length = real_len;
485 			cbp->bio_done = gv_plex_done;
486 			cbp->bio_caller2 = original->consumer;
487 			cbp->bio_driver1 = wp;
488 
489 			GV_ENQUEUE(bp, cbp, pbp);
490 
491 			bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO);
492 			bq->bp = cbp;
493 			TAILQ_INSERT_TAIL(&wp->bits, bq, queue);
494 
495 			/* Write new data. */
496 			cbp = g_clone_bio(bp);
497 			if (cbp == NULL)
498 				return (ENOMEM);
499 			cbp->bio_data = addr;
500 			cbp->bio_offset = real_off;
501 			cbp->bio_length = real_len;
502 			cbp->bio_done = gv_plex_done;
503 			cbp->bio_caller2 = original->consumer;
504 
505 			cbp->bio_driver1 = wp;
506 
507 			/*
508 			 * We must not write the new data until the old data
509 			 * was read, so hold this BIO back until we're ready
510 			 * for it.
511 			 */
512 			wp->waiting = cbp;
513 
514 			/* The final bio for the parity. */
515 			cbp = g_clone_bio(bp);
516 			if (cbp == NULL)
517 				return (ENOMEM);
518 			cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO);
519 			cbp->bio_cflags |= GV_BIO_MALLOC;
520 			cbp->bio_offset = real_off;
521 			cbp->bio_length = real_len;
522 			cbp->bio_done = gv_plex_done;
523 			cbp->bio_caller2 = parity->consumer;
524 			cbp->bio_driver1 = wp;
525 
526 			/* Remember that this is the BIO for the parity data. */
527 			wp->parity = cbp;
528 		}
529 		break;
530 
531 	default:
532 		return (EINVAL);
533 	}
534 
535 	return (0);
536 }
537 
538 /* Calculate the offsets in the various subdisks for a RAID5 request. */
539 int
540 gv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off,
541     off_t *real_len, int *sdno, int *psdno)
542 {
543 	int sd, psd;
544 	off_t len_left, stripeend, stripeoff, stripestart;
545 
546 	/* The number of the subdisk containing the parity stripe. */
547 	psd = p->sdcount - 1 - ( boff / (p->stripesize * (p->sdcount - 1))) %
548 	    p->sdcount;
549 	KASSERT(psdno >= 0, ("gv_raid5_offset: psdno < 0"));
550 
551 	/* Offset of the start address from the start of the stripe. */
552 	stripeoff = boff % (p->stripesize * (p->sdcount - 1));
553 	KASSERT(stripeoff >= 0, ("gv_raid5_offset: stripeoff < 0"));
554 
555 	/* The number of the subdisk where the stripe resides. */
556 	sd = stripeoff / p->stripesize;
557 	KASSERT(sdno >= 0, ("gv_raid5_offset: sdno < 0"));
558 
559 	/* At or past parity subdisk. */
560 	if (sd >= psd)
561 		sd++;
562 
563 	/* The offset of the stripe on this subdisk. */
564 	stripestart = (boff - stripeoff) / (p->sdcount - 1);
565 	KASSERT(stripestart >= 0, ("gv_raid5_offset: stripestart < 0"));
566 
567 	stripeoff %= p->stripesize;
568 
569 	/* The offset of the request on this subdisk. */
570 	*real_off = stripestart + stripeoff;
571 
572 	stripeend = stripestart + p->stripesize;
573 	len_left = stripeend - *real_off;
574 	KASSERT(len_left >= 0, ("gv_raid5_offset: len_left < 0"));
575 
576 	*real_len = (bcount <= len_left) ? bcount : len_left;
577 
578 	if (sdno != NULL)
579 		*sdno = sd;
580 	if (psdno != NULL)
581 		*psdno = psd;
582 
583 	return (0);
584 }
585