1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2004, 2007 Lukas Ertl 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/cdefs.h> 30 __FBSDID("$FreeBSD$"); 31 32 #include <sys/param.h> 33 #include <sys/bio.h> 34 #include <sys/lock.h> 35 #include <sys/malloc.h> 36 #include <sys/systm.h> 37 38 #include <geom/geom.h> 39 #include <geom/vinum/geom_vinum_var.h> 40 #include <geom/vinum/geom_vinum_raid5.h> 41 #include <geom/vinum/geom_vinum.h> 42 43 static int gv_raid5_offset(struct gv_plex *, off_t, off_t, 44 off_t *, off_t *, int *, int *, int); 45 static struct bio * gv_raid5_clone_bio(struct bio *, struct gv_sd *, 46 struct gv_raid5_packet *, caddr_t, int); 47 static int gv_raid5_request(struct gv_plex *, struct gv_raid5_packet *, 48 struct bio *, caddr_t, off_t, off_t, int *); 49 static int gv_raid5_check(struct gv_plex *, struct gv_raid5_packet *, 50 struct bio *, caddr_t, off_t, off_t); 51 static int gv_raid5_rebuild(struct gv_plex *, struct gv_raid5_packet *, 52 struct bio *, caddr_t, off_t, off_t); 53 54 struct gv_raid5_packet * 55 gv_raid5_start(struct gv_plex *p, struct bio *bp, caddr_t addr, off_t boff, 56 off_t bcount) 57 { 58 struct bio *cbp; 59 struct gv_raid5_packet *wp, *wp2; 60 struct gv_bioq *bq, *bq2; 61 int err, delay; 62 63 delay = 0; 64 wp = g_malloc(sizeof(*wp), M_WAITOK | M_ZERO); 65 wp->bio = bp; 66 wp->waiting = NULL; 67 wp->parity = NULL; 68 TAILQ_INIT(&wp->bits); 69 70 if (bp->bio_pflags & GV_BIO_REBUILD) 71 err = gv_raid5_rebuild(p, wp, bp, addr, boff, bcount); 72 else if (bp->bio_pflags & GV_BIO_CHECK) 73 err = gv_raid5_check(p, wp, bp, addr, boff, bcount); 74 else 75 err = gv_raid5_request(p, wp, bp, addr, boff, bcount, &delay); 76 77 /* Means we have a delayed request. */ 78 if (delay) { 79 g_free(wp); 80 return (NULL); 81 } 82 83 /* 84 * Building the sub-request failed, we probably need to clean up a lot. 85 */ 86 if (err) { 87 G_VINUM_LOGREQ(0, bp, "raid5 plex request failed."); 88 TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { 89 TAILQ_REMOVE(&wp->bits, bq, queue); 90 g_free(bq); 91 } 92 if (wp->waiting != NULL) { 93 if (wp->waiting->bio_cflags & GV_BIO_MALLOC) 94 g_free(wp->waiting->bio_data); 95 g_destroy_bio(wp->waiting); 96 } 97 if (wp->parity != NULL) { 98 if (wp->parity->bio_cflags & GV_BIO_MALLOC) 99 g_free(wp->parity->bio_data); 100 g_destroy_bio(wp->parity); 101 } 102 g_free(wp); 103 104 TAILQ_FOREACH_SAFE(wp, &p->packets, list, wp2) { 105 if (wp->bio != bp) 106 continue; 107 108 TAILQ_REMOVE(&p->packets, wp, list); 109 TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { 110 TAILQ_REMOVE(&wp->bits, bq, queue); 111 g_free(bq); 112 } 113 g_free(wp); 114 } 115 116 cbp = bioq_takefirst(p->bqueue); 117 while (cbp != NULL) { 118 if (cbp->bio_cflags & GV_BIO_MALLOC) 119 g_free(cbp->bio_data); 120 g_destroy_bio(cbp); 121 cbp = bioq_takefirst(p->bqueue); 122 } 123 124 /* If internal, stop and reset state. */ 125 if (bp->bio_pflags & GV_BIO_INTERNAL) { 126 if (bp->bio_pflags & GV_BIO_MALLOC) 127 g_free(bp->bio_data); 128 g_destroy_bio(bp); 129 /* Reset flags. */ 130 p->flags &= ~(GV_PLEX_SYNCING | GV_PLEX_REBUILDING | 131 GV_PLEX_GROWING); 132 return (NULL); 133 } 134 g_io_deliver(bp, err); 135 return (NULL); 136 } 137 138 return (wp); 139 } 140 141 /* 142 * Check if the stripe that the work packet wants is already being used by 143 * some other work packet. 144 */ 145 int 146 gv_stripe_active(struct gv_plex *p, struct bio *bp) 147 { 148 struct gv_raid5_packet *wp, *owp; 149 int overlap; 150 151 wp = bp->bio_caller2; 152 if (wp->lockbase == -1) 153 return (0); 154 155 overlap = 0; 156 TAILQ_FOREACH(owp, &p->packets, list) { 157 if (owp == wp) 158 break; 159 if ((wp->lockbase >= owp->lockbase) && 160 (wp->lockbase <= owp->lockbase + owp->length)) { 161 overlap++; 162 break; 163 } 164 if ((wp->lockbase <= owp->lockbase) && 165 (wp->lockbase + wp->length >= owp->lockbase)) { 166 overlap++; 167 break; 168 } 169 } 170 171 return (overlap); 172 } 173 174 static int 175 gv_raid5_check(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, 176 caddr_t addr, off_t boff, off_t bcount) 177 { 178 struct gv_sd *parity, *s; 179 struct gv_bioq *bq; 180 struct bio *cbp; 181 int i, psdno; 182 off_t real_len, real_off; 183 184 if (p == NULL || LIST_EMPTY(&p->subdisks)) 185 return (ENXIO); 186 187 gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, &psdno, 1); 188 189 /* Find the right subdisk. */ 190 parity = NULL; 191 i = 0; 192 LIST_FOREACH(s, &p->subdisks, in_plex) { 193 if (i == psdno) { 194 parity = s; 195 break; 196 } 197 i++; 198 } 199 200 /* Parity stripe not found. */ 201 if (parity == NULL) 202 return (ENXIO); 203 204 if (parity->state != GV_SD_UP) 205 return (ENXIO); 206 207 wp->length = real_len; 208 wp->data = addr; 209 wp->lockbase = real_off; 210 211 /* Read all subdisks. */ 212 LIST_FOREACH(s, &p->subdisks, in_plex) { 213 /* Skip the parity subdisk. */ 214 if (s == parity) 215 continue; 216 /* Skip growing subdisks. */ 217 if (s->flags & GV_SD_GROW) 218 continue; 219 220 cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); 221 if (cbp == NULL) 222 return (ENOMEM); 223 cbp->bio_cmd = BIO_READ; 224 225 bioq_insert_tail(p->bqueue, cbp); 226 227 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 228 bq->bp = cbp; 229 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 230 } 231 232 /* Read the parity data. */ 233 cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); 234 if (cbp == NULL) 235 return (ENOMEM); 236 cbp->bio_cmd = BIO_READ; 237 wp->waiting = cbp; 238 239 /* 240 * In case we want to rebuild the parity, create an extra BIO to write 241 * it out. It also acts as buffer for the XOR operations. 242 */ 243 cbp = gv_raid5_clone_bio(bp, parity, wp, addr, 1); 244 if (cbp == NULL) 245 return (ENOMEM); 246 wp->parity = cbp; 247 248 return (0); 249 } 250 251 /* Rebuild a degraded RAID5 plex. */ 252 static int 253 gv_raid5_rebuild(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, 254 caddr_t addr, off_t boff, off_t bcount) 255 { 256 struct gv_sd *broken, *s; 257 struct gv_bioq *bq; 258 struct bio *cbp; 259 off_t real_len, real_off; 260 261 if (p == NULL || LIST_EMPTY(&p->subdisks)) 262 return (ENXIO); 263 264 gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, NULL, 1); 265 266 /* Find the right subdisk. */ 267 broken = NULL; 268 LIST_FOREACH(s, &p->subdisks, in_plex) { 269 if (s->state != GV_SD_UP) 270 broken = s; 271 } 272 273 /* Broken stripe not found. */ 274 if (broken == NULL) 275 return (ENXIO); 276 277 switch (broken->state) { 278 case GV_SD_UP: 279 return (EINVAL); 280 281 case GV_SD_STALE: 282 if (!(bp->bio_pflags & GV_BIO_REBUILD)) 283 return (ENXIO); 284 285 G_VINUM_DEBUG(1, "sd %s is reviving", broken->name); 286 gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE); 287 /* Set this bit now, but should be set at end. */ 288 broken->flags |= GV_SD_CANGOUP; 289 break; 290 291 case GV_SD_REVIVING: 292 break; 293 294 default: 295 /* All other subdisk states mean it's not accessible. */ 296 return (ENXIO); 297 } 298 299 wp->length = real_len; 300 wp->data = addr; 301 wp->lockbase = real_off; 302 303 KASSERT(wp->length >= 0, ("gv_rebuild_raid5: wp->length < 0")); 304 305 /* Read all subdisks. */ 306 LIST_FOREACH(s, &p->subdisks, in_plex) { 307 /* Skip the broken subdisk. */ 308 if (s == broken) 309 continue; 310 311 /* Skip growing subdisks. */ 312 if (s->flags & GV_SD_GROW) 313 continue; 314 315 cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); 316 if (cbp == NULL) 317 return (ENOMEM); 318 cbp->bio_cmd = BIO_READ; 319 320 bioq_insert_tail(p->bqueue, cbp); 321 322 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 323 bq->bp = cbp; 324 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 325 } 326 327 /* Write the parity data. */ 328 cbp = gv_raid5_clone_bio(bp, broken, wp, NULL, 1); 329 if (cbp == NULL) 330 return (ENOMEM); 331 wp->parity = cbp; 332 333 p->synced = boff; 334 335 /* Post notification that we're finished. */ 336 return (0); 337 } 338 339 /* Build a request group to perform (part of) a RAID5 request. */ 340 static int 341 gv_raid5_request(struct gv_plex *p, struct gv_raid5_packet *wp, 342 struct bio *bp, caddr_t addr, off_t boff, off_t bcount, int *delay) 343 { 344 struct g_geom *gp; 345 struct gv_sd *broken, *original, *parity, *s; 346 struct gv_bioq *bq; 347 struct bio *cbp; 348 int i, psdno, sdno, type, grow; 349 off_t real_len, real_off; 350 351 gp = bp->bio_to->geom; 352 353 if (p == NULL || LIST_EMPTY(&p->subdisks)) 354 return (ENXIO); 355 356 /* We are optimistic and assume that this request will be OK. */ 357 #define REQ_TYPE_NORMAL 0 358 #define REQ_TYPE_DEGRADED 1 359 #define REQ_TYPE_NOPARITY 2 360 361 type = REQ_TYPE_NORMAL; 362 original = parity = broken = NULL; 363 364 /* XXX: The resize won't crash with rebuild or sync, but we should still 365 * be aware of it. Also this should perhaps be done on rebuild/check as 366 * well? 367 */ 368 /* If we're over, we must use the old. */ 369 if (boff >= p->synced) { 370 grow = 1; 371 /* Or if over the resized offset, we use all drives. */ 372 } else if (boff + bcount <= p->synced) { 373 grow = 0; 374 /* Else, we're in the middle, and must wait a bit. */ 375 } else { 376 bioq_disksort(p->rqueue, bp); 377 *delay = 1; 378 return (0); 379 } 380 gv_raid5_offset(p, boff, bcount, &real_off, &real_len, 381 &sdno, &psdno, grow); 382 383 /* Find the right subdisks. */ 384 i = 0; 385 LIST_FOREACH(s, &p->subdisks, in_plex) { 386 if (i == sdno) 387 original = s; 388 if (i == psdno) 389 parity = s; 390 if (s->state != GV_SD_UP) 391 broken = s; 392 i++; 393 } 394 395 if ((original == NULL) || (parity == NULL)) 396 return (ENXIO); 397 398 /* Our data stripe is missing. */ 399 if (original->state != GV_SD_UP) 400 type = REQ_TYPE_DEGRADED; 401 402 /* If synchronizing request, just write it if disks are stale. */ 403 if (original->state == GV_SD_STALE && parity->state == GV_SD_STALE && 404 bp->bio_pflags & GV_BIO_SYNCREQ && bp->bio_cmd == BIO_WRITE) { 405 type = REQ_TYPE_NORMAL; 406 /* Our parity stripe is missing. */ 407 } else if (parity->state != GV_SD_UP) { 408 /* We cannot take another failure if we're already degraded. */ 409 if (type != REQ_TYPE_NORMAL) 410 return (ENXIO); 411 else 412 type = REQ_TYPE_NOPARITY; 413 } 414 415 wp->length = real_len; 416 wp->data = addr; 417 wp->lockbase = real_off; 418 419 KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0")); 420 421 if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len < p->synced)) 422 type = REQ_TYPE_NORMAL; 423 424 if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len >= p->synced)) { 425 bioq_disksort(p->rqueue, bp); 426 *delay = 1; 427 return (0); 428 } 429 430 switch (bp->bio_cmd) { 431 case BIO_READ: 432 /* 433 * For a degraded read we need to read in all stripes except 434 * the broken one plus the parity stripe and then recalculate 435 * the desired data. 436 */ 437 if (type == REQ_TYPE_DEGRADED) { 438 bzero(wp->data, wp->length); 439 LIST_FOREACH(s, &p->subdisks, in_plex) { 440 /* Skip the broken subdisk. */ 441 if (s == broken) 442 continue; 443 /* Skip growing if within offset. */ 444 if (grow && s->flags & GV_SD_GROW) 445 continue; 446 cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); 447 if (cbp == NULL) 448 return (ENOMEM); 449 450 bioq_insert_tail(p->bqueue, cbp); 451 452 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 453 bq->bp = cbp; 454 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 455 } 456 457 /* A normal read can be fulfilled with the original subdisk. */ 458 } else { 459 cbp = gv_raid5_clone_bio(bp, original, wp, addr, 0); 460 if (cbp == NULL) 461 return (ENOMEM); 462 463 bioq_insert_tail(p->bqueue, cbp); 464 } 465 wp->lockbase = -1; 466 467 break; 468 469 case BIO_WRITE: 470 /* 471 * A degraded write means we cannot write to the original data 472 * subdisk. Thus we need to read in all valid stripes, 473 * recalculate the parity from the original data, and then 474 * write the parity stripe back out. 475 */ 476 if (type == REQ_TYPE_DEGRADED) { 477 /* Read all subdisks. */ 478 LIST_FOREACH(s, &p->subdisks, in_plex) { 479 /* Skip the broken and the parity subdisk. */ 480 if ((s == broken) || (s == parity)) 481 continue; 482 /* Skip growing if within offset. */ 483 if (grow && s->flags & GV_SD_GROW) 484 continue; 485 486 cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); 487 if (cbp == NULL) 488 return (ENOMEM); 489 cbp->bio_cmd = BIO_READ; 490 491 bioq_insert_tail(p->bqueue, cbp); 492 493 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 494 bq->bp = cbp; 495 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 496 } 497 498 /* Write the parity data. */ 499 cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); 500 if (cbp == NULL) 501 return (ENOMEM); 502 bcopy(addr, cbp->bio_data, wp->length); 503 wp->parity = cbp; 504 505 /* 506 * When the parity stripe is missing we just write out the data. 507 */ 508 } else if (type == REQ_TYPE_NOPARITY) { 509 cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1); 510 if (cbp == NULL) 511 return (ENOMEM); 512 513 bioq_insert_tail(p->bqueue, cbp); 514 515 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 516 bq->bp = cbp; 517 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 518 519 /* 520 * A normal write request goes to the original subdisk, then we 521 * read in all other stripes, recalculate the parity and write 522 * out the parity again. 523 */ 524 } else { 525 /* Read old parity. */ 526 cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); 527 if (cbp == NULL) 528 return (ENOMEM); 529 cbp->bio_cmd = BIO_READ; 530 531 bioq_insert_tail(p->bqueue, cbp); 532 533 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 534 bq->bp = cbp; 535 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 536 537 /* Read old data. */ 538 cbp = gv_raid5_clone_bio(bp, original, wp, NULL, 1); 539 if (cbp == NULL) 540 return (ENOMEM); 541 cbp->bio_cmd = BIO_READ; 542 543 bioq_insert_tail(p->bqueue, cbp); 544 545 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 546 bq->bp = cbp; 547 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 548 549 /* Write new data. */ 550 cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1); 551 if (cbp == NULL) 552 return (ENOMEM); 553 554 /* 555 * We must not write the new data until the old data 556 * was read, so hold this BIO back until we're ready 557 * for it. 558 */ 559 wp->waiting = cbp; 560 561 /* The final bio for the parity. */ 562 cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); 563 if (cbp == NULL) 564 return (ENOMEM); 565 566 /* Remember that this is the BIO for the parity data. */ 567 wp->parity = cbp; 568 } 569 break; 570 571 default: 572 return (EINVAL); 573 } 574 575 return (0); 576 } 577 578 /* 579 * Calculate the offsets in the various subdisks for a RAID5 request. Also take 580 * care of new subdisks in an expanded RAID5 array. 581 * XXX: This assumes that the new subdisks are inserted after the others (which 582 * is okay as long as plex_offset is larger). If subdisks are inserted into the 583 * plexlist before, we get problems. 584 */ 585 static int 586 gv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off, 587 off_t *real_len, int *sdno, int *psdno, int growing) 588 { 589 struct gv_sd *s; 590 int sd, psd, sdcount; 591 off_t len_left, stripeend, stripeoff, stripestart; 592 593 sdcount = p->sdcount; 594 if (growing) { 595 LIST_FOREACH(s, &p->subdisks, in_plex) { 596 if (s->flags & GV_SD_GROW) 597 sdcount--; 598 } 599 } 600 601 /* The number of the subdisk containing the parity stripe. */ 602 psd = sdcount - 1 - ( boff / (p->stripesize * (sdcount - 1))) % 603 sdcount; 604 KASSERT(psdno >= 0, ("gv_raid5_offset: psdno < 0")); 605 606 /* Offset of the start address from the start of the stripe. */ 607 stripeoff = boff % (p->stripesize * (sdcount - 1)); 608 KASSERT(stripeoff >= 0, ("gv_raid5_offset: stripeoff < 0")); 609 610 /* The number of the subdisk where the stripe resides. */ 611 sd = stripeoff / p->stripesize; 612 KASSERT(sdno >= 0, ("gv_raid5_offset: sdno < 0")); 613 614 /* At or past parity subdisk. */ 615 if (sd >= psd) 616 sd++; 617 618 /* The offset of the stripe on this subdisk. */ 619 stripestart = (boff - stripeoff) / (sdcount - 1); 620 KASSERT(stripestart >= 0, ("gv_raid5_offset: stripestart < 0")); 621 622 stripeoff %= p->stripesize; 623 624 /* The offset of the request on this subdisk. */ 625 *real_off = stripestart + stripeoff; 626 627 stripeend = stripestart + p->stripesize; 628 len_left = stripeend - *real_off; 629 KASSERT(len_left >= 0, ("gv_raid5_offset: len_left < 0")); 630 631 *real_len = (bcount <= len_left) ? bcount : len_left; 632 633 if (sdno != NULL) 634 *sdno = sd; 635 if (psdno != NULL) 636 *psdno = psd; 637 638 return (0); 639 } 640 641 static struct bio * 642 gv_raid5_clone_bio(struct bio *bp, struct gv_sd *s, struct gv_raid5_packet *wp, 643 caddr_t addr, int use_wp) 644 { 645 struct bio *cbp; 646 647 cbp = g_clone_bio(bp); 648 if (cbp == NULL) 649 return (NULL); 650 if (addr == NULL) { 651 cbp->bio_data = g_malloc(wp->length, M_WAITOK | M_ZERO); 652 cbp->bio_cflags |= GV_BIO_MALLOC; 653 } else 654 cbp->bio_data = addr; 655 cbp->bio_offset = wp->lockbase + s->drive_offset; 656 cbp->bio_length = wp->length; 657 cbp->bio_done = gv_done; 658 cbp->bio_caller1 = s; 659 if (use_wp) 660 cbp->bio_caller2 = wp; 661 662 return (cbp); 663 } 664