1 /*- 2 * Copyright (c) 2004, 2007 Lukas Ertl 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27 #include <sys/cdefs.h> 28 __FBSDID("$FreeBSD$"); 29 30 #include <sys/param.h> 31 #include <sys/bio.h> 32 #include <sys/lock.h> 33 #include <sys/malloc.h> 34 #include <sys/systm.h> 35 36 #include <geom/geom.h> 37 #include <geom/vinum/geom_vinum_var.h> 38 #include <geom/vinum/geom_vinum_raid5.h> 39 #include <geom/vinum/geom_vinum.h> 40 41 static int gv_raid5_offset(struct gv_plex *, off_t, off_t, 42 off_t *, off_t *, int *, int *, int); 43 static struct bio * gv_raid5_clone_bio(struct bio *, struct gv_sd *, 44 struct gv_raid5_packet *, caddr_t, int); 45 static int gv_raid5_request(struct gv_plex *, struct gv_raid5_packet *, 46 struct bio *, caddr_t, off_t, off_t, int *); 47 static int gv_raid5_check(struct gv_plex *, struct gv_raid5_packet *, 48 struct bio *, caddr_t, off_t, off_t); 49 static int gv_raid5_rebuild(struct gv_plex *, struct gv_raid5_packet *, 50 struct bio *, caddr_t, off_t, off_t); 51 52 struct gv_raid5_packet * 53 gv_raid5_start(struct gv_plex *p, struct bio *bp, caddr_t addr, off_t boff, 54 off_t bcount) 55 { 56 struct bio *cbp; 57 struct gv_raid5_packet *wp, *wp2; 58 struct gv_bioq *bq, *bq2; 59 int err, delay; 60 61 delay = 0; 62 wp = g_malloc(sizeof(*wp), M_WAITOK | M_ZERO); 63 wp->bio = bp; 64 wp->waiting = NULL; 65 wp->parity = NULL; 66 TAILQ_INIT(&wp->bits); 67 68 if (bp->bio_pflags & GV_BIO_REBUILD) 69 err = gv_raid5_rebuild(p, wp, bp, addr, boff, bcount); 70 else if (bp->bio_pflags & GV_BIO_CHECK) 71 err = gv_raid5_check(p, wp, bp, addr, boff, bcount); 72 else 73 err = gv_raid5_request(p, wp, bp, addr, boff, bcount, &delay); 74 75 /* Means we have a delayed request. */ 76 if (delay) { 77 g_free(wp); 78 return (NULL); 79 } 80 81 /* 82 * Building the sub-request failed, we probably need to clean up a lot. 83 */ 84 if (err) { 85 G_VINUM_LOGREQ(0, bp, "raid5 plex request failed."); 86 TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { 87 TAILQ_REMOVE(&wp->bits, bq, queue); 88 g_free(bq); 89 } 90 if (wp->waiting != NULL) { 91 if (wp->waiting->bio_cflags & GV_BIO_MALLOC) 92 g_free(wp->waiting->bio_data); 93 g_destroy_bio(wp->waiting); 94 } 95 if (wp->parity != NULL) { 96 if (wp->parity->bio_cflags & GV_BIO_MALLOC) 97 g_free(wp->parity->bio_data); 98 g_destroy_bio(wp->parity); 99 } 100 g_free(wp); 101 102 TAILQ_FOREACH_SAFE(wp, &p->packets, list, wp2) { 103 if (wp->bio != bp) 104 continue; 105 106 TAILQ_REMOVE(&p->packets, wp, list); 107 TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { 108 TAILQ_REMOVE(&wp->bits, bq, queue); 109 g_free(bq); 110 } 111 g_free(wp); 112 } 113 114 cbp = bioq_takefirst(p->bqueue); 115 while (cbp != NULL) { 116 if (cbp->bio_cflags & GV_BIO_MALLOC) 117 g_free(cbp->bio_data); 118 g_destroy_bio(cbp); 119 cbp = bioq_takefirst(p->bqueue); 120 } 121 122 /* If internal, stop and reset state. */ 123 if (bp->bio_pflags & GV_BIO_INTERNAL) { 124 if (bp->bio_pflags & GV_BIO_MALLOC) 125 g_free(bp->bio_data); 126 g_destroy_bio(bp); 127 /* Reset flags. */ 128 p->flags &= ~(GV_PLEX_SYNCING | GV_PLEX_REBUILDING | 129 GV_PLEX_GROWING); 130 return (NULL); 131 } 132 g_io_deliver(bp, err); 133 return (NULL); 134 } 135 136 return (wp); 137 } 138 139 /* 140 * Check if the stripe that the work packet wants is already being used by 141 * some other work packet. 142 */ 143 int 144 gv_stripe_active(struct gv_plex *p, struct bio *bp) 145 { 146 struct gv_raid5_packet *wp, *owp; 147 int overlap; 148 149 wp = bp->bio_caller2; 150 if (wp->lockbase == -1) 151 return (0); 152 153 overlap = 0; 154 TAILQ_FOREACH(owp, &p->packets, list) { 155 if (owp == wp) 156 break; 157 if ((wp->lockbase >= owp->lockbase) && 158 (wp->lockbase <= owp->lockbase + owp->length)) { 159 overlap++; 160 break; 161 } 162 if ((wp->lockbase <= owp->lockbase) && 163 (wp->lockbase + wp->length >= owp->lockbase)) { 164 overlap++; 165 break; 166 } 167 } 168 169 return (overlap); 170 } 171 172 static int 173 gv_raid5_check(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, 174 caddr_t addr, off_t boff, off_t bcount) 175 { 176 struct gv_sd *parity, *s; 177 struct gv_bioq *bq; 178 struct bio *cbp; 179 int i, psdno; 180 off_t real_len, real_off; 181 182 if (p == NULL || LIST_EMPTY(&p->subdisks)) 183 return (ENXIO); 184 185 gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, &psdno, 1); 186 187 /* Find the right subdisk. */ 188 parity = NULL; 189 i = 0; 190 LIST_FOREACH(s, &p->subdisks, in_plex) { 191 if (i == psdno) { 192 parity = s; 193 break; 194 } 195 i++; 196 } 197 198 /* Parity stripe not found. */ 199 if (parity == NULL) 200 return (ENXIO); 201 202 if (parity->state != GV_SD_UP) 203 return (ENXIO); 204 205 wp->length = real_len; 206 wp->data = addr; 207 wp->lockbase = real_off; 208 209 /* Read all subdisks. */ 210 LIST_FOREACH(s, &p->subdisks, in_plex) { 211 /* Skip the parity subdisk. */ 212 if (s == parity) 213 continue; 214 /* Skip growing subdisks. */ 215 if (s->flags & GV_SD_GROW) 216 continue; 217 218 cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); 219 if (cbp == NULL) 220 return (ENOMEM); 221 cbp->bio_cmd = BIO_READ; 222 223 bioq_insert_tail(p->bqueue, cbp); 224 225 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 226 bq->bp = cbp; 227 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 228 } 229 230 /* Read the parity data. */ 231 cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); 232 if (cbp == NULL) 233 return (ENOMEM); 234 cbp->bio_cmd = BIO_READ; 235 wp->waiting = cbp; 236 237 /* 238 * In case we want to rebuild the parity, create an extra BIO to write 239 * it out. It also acts as buffer for the XOR operations. 240 */ 241 cbp = gv_raid5_clone_bio(bp, parity, wp, addr, 1); 242 if (cbp == NULL) 243 return (ENOMEM); 244 wp->parity = cbp; 245 246 return (0); 247 } 248 249 /* Rebuild a degraded RAID5 plex. */ 250 static int 251 gv_raid5_rebuild(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, 252 caddr_t addr, off_t boff, off_t bcount) 253 { 254 struct gv_sd *broken, *s; 255 struct gv_bioq *bq; 256 struct bio *cbp; 257 off_t real_len, real_off; 258 259 if (p == NULL || LIST_EMPTY(&p->subdisks)) 260 return (ENXIO); 261 262 gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, NULL, 1); 263 264 /* Find the right subdisk. */ 265 broken = NULL; 266 LIST_FOREACH(s, &p->subdisks, in_plex) { 267 if (s->state != GV_SD_UP) 268 broken = s; 269 } 270 271 /* Broken stripe not found. */ 272 if (broken == NULL) 273 return (ENXIO); 274 275 switch (broken->state) { 276 case GV_SD_UP: 277 return (EINVAL); 278 279 case GV_SD_STALE: 280 if (!(bp->bio_pflags & GV_BIO_REBUILD)) 281 return (ENXIO); 282 283 G_VINUM_DEBUG(1, "sd %s is reviving", broken->name); 284 gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE); 285 /* Set this bit now, but should be set at end. */ 286 broken->flags |= GV_SD_CANGOUP; 287 break; 288 289 case GV_SD_REVIVING: 290 break; 291 292 default: 293 /* All other subdisk states mean it's not accessible. */ 294 return (ENXIO); 295 } 296 297 wp->length = real_len; 298 wp->data = addr; 299 wp->lockbase = real_off; 300 301 KASSERT(wp->length >= 0, ("gv_rebuild_raid5: wp->length < 0")); 302 303 /* Read all subdisks. */ 304 LIST_FOREACH(s, &p->subdisks, in_plex) { 305 /* Skip the broken subdisk. */ 306 if (s == broken) 307 continue; 308 309 /* Skip growing subdisks. */ 310 if (s->flags & GV_SD_GROW) 311 continue; 312 313 cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); 314 if (cbp == NULL) 315 return (ENOMEM); 316 cbp->bio_cmd = BIO_READ; 317 318 bioq_insert_tail(p->bqueue, cbp); 319 320 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 321 bq->bp = cbp; 322 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 323 } 324 325 /* Write the parity data. */ 326 cbp = gv_raid5_clone_bio(bp, broken, wp, NULL, 1); 327 if (cbp == NULL) 328 return (ENOMEM); 329 wp->parity = cbp; 330 331 p->synced = boff; 332 333 /* Post notification that we're finished. */ 334 return (0); 335 } 336 337 /* Build a request group to perform (part of) a RAID5 request. */ 338 static int 339 gv_raid5_request(struct gv_plex *p, struct gv_raid5_packet *wp, 340 struct bio *bp, caddr_t addr, off_t boff, off_t bcount, int *delay) 341 { 342 struct g_geom *gp; 343 struct gv_sd *broken, *original, *parity, *s; 344 struct gv_bioq *bq; 345 struct bio *cbp; 346 int i, psdno, sdno, type, grow; 347 off_t real_len, real_off; 348 349 gp = bp->bio_to->geom; 350 351 if (p == NULL || LIST_EMPTY(&p->subdisks)) 352 return (ENXIO); 353 354 /* We are optimistic and assume that this request will be OK. */ 355 #define REQ_TYPE_NORMAL 0 356 #define REQ_TYPE_DEGRADED 1 357 #define REQ_TYPE_NOPARITY 2 358 359 type = REQ_TYPE_NORMAL; 360 original = parity = broken = NULL; 361 362 /* XXX: The resize won't crash with rebuild or sync, but we should still 363 * be aware of it. Also this should perhaps be done on rebuild/check as 364 * well? 365 */ 366 /* If we're over, we must use the old. */ 367 if (boff >= p->synced) { 368 grow = 1; 369 /* Or if over the resized offset, we use all drives. */ 370 } else if (boff + bcount <= p->synced) { 371 grow = 0; 372 /* Else, we're in the middle, and must wait a bit. */ 373 } else { 374 bioq_disksort(p->rqueue, bp); 375 *delay = 1; 376 return (0); 377 } 378 gv_raid5_offset(p, boff, bcount, &real_off, &real_len, 379 &sdno, &psdno, grow); 380 381 /* Find the right subdisks. */ 382 i = 0; 383 LIST_FOREACH(s, &p->subdisks, in_plex) { 384 if (i == sdno) 385 original = s; 386 if (i == psdno) 387 parity = s; 388 if (s->state != GV_SD_UP) 389 broken = s; 390 i++; 391 } 392 393 if ((original == NULL) || (parity == NULL)) 394 return (ENXIO); 395 396 /* Our data stripe is missing. */ 397 if (original->state != GV_SD_UP) 398 type = REQ_TYPE_DEGRADED; 399 400 /* If synchronizing request, just write it if disks are stale. */ 401 if (original->state == GV_SD_STALE && parity->state == GV_SD_STALE && 402 bp->bio_pflags & GV_BIO_SYNCREQ && bp->bio_cmd == BIO_WRITE) { 403 type = REQ_TYPE_NORMAL; 404 /* Our parity stripe is missing. */ 405 } else if (parity->state != GV_SD_UP) { 406 /* We cannot take another failure if we're already degraded. */ 407 if (type != REQ_TYPE_NORMAL) 408 return (ENXIO); 409 else 410 type = REQ_TYPE_NOPARITY; 411 } 412 413 wp->length = real_len; 414 wp->data = addr; 415 wp->lockbase = real_off; 416 417 KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0")); 418 419 if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len < p->synced)) 420 type = REQ_TYPE_NORMAL; 421 422 if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len >= p->synced)) { 423 bioq_disksort(p->rqueue, bp); 424 *delay = 1; 425 return (0); 426 } 427 428 switch (bp->bio_cmd) { 429 case BIO_READ: 430 /* 431 * For a degraded read we need to read in all stripes except 432 * the broken one plus the parity stripe and then recalculate 433 * the desired data. 434 */ 435 if (type == REQ_TYPE_DEGRADED) { 436 bzero(wp->data, wp->length); 437 LIST_FOREACH(s, &p->subdisks, in_plex) { 438 /* Skip the broken subdisk. */ 439 if (s == broken) 440 continue; 441 /* Skip growing if within offset. */ 442 if (grow && s->flags & GV_SD_GROW) 443 continue; 444 cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); 445 if (cbp == NULL) 446 return (ENOMEM); 447 448 bioq_insert_tail(p->bqueue, cbp); 449 450 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 451 bq->bp = cbp; 452 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 453 } 454 455 /* A normal read can be fulfilled with the original subdisk. */ 456 } else { 457 cbp = gv_raid5_clone_bio(bp, original, wp, addr, 0); 458 if (cbp == NULL) 459 return (ENOMEM); 460 461 bioq_insert_tail(p->bqueue, cbp); 462 } 463 wp->lockbase = -1; 464 465 break; 466 467 case BIO_WRITE: 468 /* 469 * A degraded write means we cannot write to the original data 470 * subdisk. Thus we need to read in all valid stripes, 471 * recalculate the parity from the original data, and then 472 * write the parity stripe back out. 473 */ 474 if (type == REQ_TYPE_DEGRADED) { 475 /* Read all subdisks. */ 476 LIST_FOREACH(s, &p->subdisks, in_plex) { 477 /* Skip the broken and the parity subdisk. */ 478 if ((s == broken) || (s == parity)) 479 continue; 480 /* Skip growing if within offset. */ 481 if (grow && s->flags & GV_SD_GROW) 482 continue; 483 484 cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); 485 if (cbp == NULL) 486 return (ENOMEM); 487 cbp->bio_cmd = BIO_READ; 488 489 bioq_insert_tail(p->bqueue, cbp); 490 491 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 492 bq->bp = cbp; 493 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 494 } 495 496 /* Write the parity data. */ 497 cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); 498 if (cbp == NULL) 499 return (ENOMEM); 500 bcopy(addr, cbp->bio_data, wp->length); 501 wp->parity = cbp; 502 503 /* 504 * When the parity stripe is missing we just write out the data. 505 */ 506 } else if (type == REQ_TYPE_NOPARITY) { 507 cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1); 508 if (cbp == NULL) 509 return (ENOMEM); 510 511 bioq_insert_tail(p->bqueue, cbp); 512 513 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 514 bq->bp = cbp; 515 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 516 517 /* 518 * A normal write request goes to the original subdisk, then we 519 * read in all other stripes, recalculate the parity and write 520 * out the parity again. 521 */ 522 } else { 523 /* Read old parity. */ 524 cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); 525 if (cbp == NULL) 526 return (ENOMEM); 527 cbp->bio_cmd = BIO_READ; 528 529 bioq_insert_tail(p->bqueue, cbp); 530 531 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 532 bq->bp = cbp; 533 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 534 535 /* Read old data. */ 536 cbp = gv_raid5_clone_bio(bp, original, wp, NULL, 1); 537 if (cbp == NULL) 538 return (ENOMEM); 539 cbp->bio_cmd = BIO_READ; 540 541 bioq_insert_tail(p->bqueue, cbp); 542 543 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 544 bq->bp = cbp; 545 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 546 547 /* Write new data. */ 548 cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1); 549 if (cbp == NULL) 550 return (ENOMEM); 551 552 /* 553 * We must not write the new data until the old data 554 * was read, so hold this BIO back until we're ready 555 * for it. 556 */ 557 wp->waiting = cbp; 558 559 /* The final bio for the parity. */ 560 cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); 561 if (cbp == NULL) 562 return (ENOMEM); 563 564 /* Remember that this is the BIO for the parity data. */ 565 wp->parity = cbp; 566 } 567 break; 568 569 default: 570 return (EINVAL); 571 } 572 573 return (0); 574 } 575 576 /* 577 * Calculate the offsets in the various subdisks for a RAID5 request. Also take 578 * care of new subdisks in an expanded RAID5 array. 579 * XXX: This assumes that the new subdisks are inserted after the others (which 580 * is okay as long as plex_offset is larger). If subdisks are inserted into the 581 * plexlist before, we get problems. 582 */ 583 static int 584 gv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off, 585 off_t *real_len, int *sdno, int *psdno, int growing) 586 { 587 struct gv_sd *s; 588 int sd, psd, sdcount; 589 off_t len_left, stripeend, stripeoff, stripestart; 590 591 sdcount = p->sdcount; 592 if (growing) { 593 LIST_FOREACH(s, &p->subdisks, in_plex) { 594 if (s->flags & GV_SD_GROW) 595 sdcount--; 596 } 597 } 598 599 /* The number of the subdisk containing the parity stripe. */ 600 psd = sdcount - 1 - ( boff / (p->stripesize * (sdcount - 1))) % 601 sdcount; 602 KASSERT(psdno >= 0, ("gv_raid5_offset: psdno < 0")); 603 604 /* Offset of the start address from the start of the stripe. */ 605 stripeoff = boff % (p->stripesize * (sdcount - 1)); 606 KASSERT(stripeoff >= 0, ("gv_raid5_offset: stripeoff < 0")); 607 608 /* The number of the subdisk where the stripe resides. */ 609 sd = stripeoff / p->stripesize; 610 KASSERT(sdno >= 0, ("gv_raid5_offset: sdno < 0")); 611 612 /* At or past parity subdisk. */ 613 if (sd >= psd) 614 sd++; 615 616 /* The offset of the stripe on this subdisk. */ 617 stripestart = (boff - stripeoff) / (sdcount - 1); 618 KASSERT(stripestart >= 0, ("gv_raid5_offset: stripestart < 0")); 619 620 stripeoff %= p->stripesize; 621 622 /* The offset of the request on this subdisk. */ 623 *real_off = stripestart + stripeoff; 624 625 stripeend = stripestart + p->stripesize; 626 len_left = stripeend - *real_off; 627 KASSERT(len_left >= 0, ("gv_raid5_offset: len_left < 0")); 628 629 *real_len = (bcount <= len_left) ? bcount : len_left; 630 631 if (sdno != NULL) 632 *sdno = sd; 633 if (psdno != NULL) 634 *psdno = psd; 635 636 return (0); 637 } 638 639 static struct bio * 640 gv_raid5_clone_bio(struct bio *bp, struct gv_sd *s, struct gv_raid5_packet *wp, 641 caddr_t addr, int use_wp) 642 { 643 struct bio *cbp; 644 645 cbp = g_clone_bio(bp); 646 if (cbp == NULL) 647 return (NULL); 648 if (addr == NULL) { 649 cbp->bio_data = g_malloc(wp->length, M_WAITOK | M_ZERO); 650 cbp->bio_cflags |= GV_BIO_MALLOC; 651 } else 652 cbp->bio_data = addr; 653 cbp->bio_offset = wp->lockbase + s->drive_offset; 654 cbp->bio_length = wp->length; 655 cbp->bio_done = gv_done; 656 cbp->bio_caller1 = s; 657 if (use_wp) 658 cbp->bio_caller2 = wp; 659 660 return (cbp); 661 } 662