1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2004, 2007 Lukas Ertl 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/cdefs.h> 30 #include <sys/param.h> 31 #include <sys/bio.h> 32 #include <sys/lock.h> 33 #include <sys/malloc.h> 34 #include <sys/systm.h> 35 36 #include <geom/geom.h> 37 #include <geom/geom_dbg.h> 38 #include <geom/vinum/geom_vinum_var.h> 39 #include <geom/vinum/geom_vinum_raid5.h> 40 #include <geom/vinum/geom_vinum.h> 41 42 static int gv_raid5_offset(struct gv_plex *, off_t, off_t, 43 off_t *, off_t *, int *, int *, int); 44 static struct bio * gv_raid5_clone_bio(struct bio *, struct gv_sd *, 45 struct gv_raid5_packet *, caddr_t, int); 46 static int gv_raid5_request(struct gv_plex *, struct gv_raid5_packet *, 47 struct bio *, caddr_t, off_t, off_t, int *); 48 static int gv_raid5_check(struct gv_plex *, struct gv_raid5_packet *, 49 struct bio *, caddr_t, off_t, off_t); 50 static int gv_raid5_rebuild(struct gv_plex *, struct gv_raid5_packet *, 51 struct bio *, caddr_t, off_t, off_t); 52 53 struct gv_raid5_packet * 54 gv_raid5_start(struct gv_plex *p, struct bio *bp, caddr_t addr, off_t boff, 55 off_t bcount) 56 { 57 struct bio *cbp; 58 struct gv_raid5_packet *wp, *wp2; 59 struct gv_bioq *bq, *bq2; 60 int err, delay; 61 62 delay = 0; 63 wp = g_malloc(sizeof(*wp), M_WAITOK | M_ZERO); 64 wp->bio = bp; 65 wp->waiting = NULL; 66 wp->parity = NULL; 67 TAILQ_INIT(&wp->bits); 68 69 if (bp->bio_pflags & GV_BIO_REBUILD) 70 err = gv_raid5_rebuild(p, wp, bp, addr, boff, bcount); 71 else if (bp->bio_pflags & GV_BIO_CHECK) 72 err = gv_raid5_check(p, wp, bp, addr, boff, bcount); 73 else 74 err = gv_raid5_request(p, wp, bp, addr, boff, bcount, &delay); 75 76 /* Means we have a delayed request. */ 77 if (delay) { 78 g_free(wp); 79 return (NULL); 80 } 81 82 /* 83 * Building the sub-request failed, we probably need to clean up a lot. 84 */ 85 if (err) { 86 G_VINUM_LOGREQ(0, bp, "raid5 plex request failed."); 87 TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { 88 TAILQ_REMOVE(&wp->bits, bq, queue); 89 g_free(bq); 90 } 91 if (wp->waiting != NULL) { 92 if (wp->waiting->bio_cflags & GV_BIO_MALLOC) 93 g_free(wp->waiting->bio_data); 94 gv_drive_done(wp->waiting->bio_caller1); 95 g_destroy_bio(wp->waiting); 96 } 97 if (wp->parity != NULL) { 98 if (wp->parity->bio_cflags & GV_BIO_MALLOC) 99 g_free(wp->parity->bio_data); 100 gv_drive_done(wp->parity->bio_caller1); 101 g_destroy_bio(wp->parity); 102 } 103 g_free(wp); 104 105 TAILQ_FOREACH_SAFE(wp, &p->packets, list, wp2) { 106 if (wp->bio != bp) 107 continue; 108 109 TAILQ_REMOVE(&p->packets, wp, list); 110 TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { 111 TAILQ_REMOVE(&wp->bits, bq, queue); 112 g_free(bq); 113 } 114 g_free(wp); 115 } 116 117 cbp = bioq_takefirst(p->bqueue); 118 while (cbp != NULL) { 119 if (cbp->bio_cflags & GV_BIO_MALLOC) 120 g_free(cbp->bio_data); 121 gv_drive_done(cbp->bio_caller1); 122 g_destroy_bio(cbp); 123 cbp = bioq_takefirst(p->bqueue); 124 } 125 126 /* If internal, stop and reset state. */ 127 if (bp->bio_pflags & GV_BIO_INTERNAL) { 128 if (bp->bio_pflags & GV_BIO_MALLOC) 129 g_free(bp->bio_data); 130 g_destroy_bio(bp); 131 /* Reset flags. */ 132 p->flags &= ~(GV_PLEX_SYNCING | GV_PLEX_REBUILDING | 133 GV_PLEX_GROWING); 134 return (NULL); 135 } 136 g_io_deliver(bp, err); 137 return (NULL); 138 } 139 140 return (wp); 141 } 142 143 /* 144 * Check if the stripe that the work packet wants is already being used by 145 * some other work packet. 146 */ 147 int 148 gv_stripe_active(struct gv_plex *p, struct bio *bp) 149 { 150 struct gv_raid5_packet *wp, *owp; 151 int overlap; 152 153 wp = bp->bio_caller2; 154 if (wp->lockbase == -1) 155 return (0); 156 157 overlap = 0; 158 TAILQ_FOREACH(owp, &p->packets, list) { 159 if (owp == wp) 160 break; 161 if ((wp->lockbase >= owp->lockbase) && 162 (wp->lockbase <= owp->lockbase + owp->length)) { 163 overlap++; 164 break; 165 } 166 if ((wp->lockbase <= owp->lockbase) && 167 (wp->lockbase + wp->length >= owp->lockbase)) { 168 overlap++; 169 break; 170 } 171 } 172 173 return (overlap); 174 } 175 176 static int 177 gv_raid5_check(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, 178 caddr_t addr, off_t boff, off_t bcount) 179 { 180 struct gv_sd *parity, *s; 181 struct gv_bioq *bq; 182 struct bio *cbp; 183 int i, psdno; 184 off_t real_len, real_off; 185 186 if (p == NULL || LIST_EMPTY(&p->subdisks)) 187 return (ENXIO); 188 189 gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, &psdno, 1); 190 191 /* Find the right subdisk. */ 192 parity = NULL; 193 i = 0; 194 LIST_FOREACH(s, &p->subdisks, in_plex) { 195 if (i == psdno) { 196 parity = s; 197 break; 198 } 199 i++; 200 } 201 202 /* Parity stripe not found. */ 203 if (parity == NULL) 204 return (ENXIO); 205 206 if (parity->state != GV_SD_UP) 207 return (ENXIO); 208 209 wp->length = real_len; 210 wp->data = addr; 211 wp->lockbase = real_off; 212 213 /* Read all subdisks. */ 214 LIST_FOREACH(s, &p->subdisks, in_plex) { 215 /* Skip the parity subdisk. */ 216 if (s == parity) 217 continue; 218 /* Skip growing subdisks. */ 219 if (s->flags & GV_SD_GROW) 220 continue; 221 222 cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); 223 if (cbp == NULL) 224 return (ENOMEM); 225 cbp->bio_cmd = BIO_READ; 226 227 bioq_insert_tail(p->bqueue, cbp); 228 229 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 230 bq->bp = cbp; 231 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 232 } 233 234 /* Read the parity data. */ 235 cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); 236 if (cbp == NULL) 237 return (ENOMEM); 238 cbp->bio_cmd = BIO_READ; 239 wp->waiting = cbp; 240 241 /* 242 * In case we want to rebuild the parity, create an extra BIO to write 243 * it out. It also acts as buffer for the XOR operations. 244 */ 245 cbp = gv_raid5_clone_bio(bp, parity, wp, addr, 1); 246 if (cbp == NULL) 247 return (ENOMEM); 248 wp->parity = cbp; 249 250 return (0); 251 } 252 253 /* Rebuild a degraded RAID5 plex. */ 254 static int 255 gv_raid5_rebuild(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, 256 caddr_t addr, off_t boff, off_t bcount) 257 { 258 struct gv_sd *broken, *s; 259 struct gv_bioq *bq; 260 struct bio *cbp; 261 off_t real_len, real_off; 262 263 if (p == NULL || LIST_EMPTY(&p->subdisks)) 264 return (ENXIO); 265 266 gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, NULL, 1); 267 268 /* Find the right subdisk. */ 269 broken = NULL; 270 LIST_FOREACH(s, &p->subdisks, in_plex) { 271 if (s->state != GV_SD_UP) 272 broken = s; 273 } 274 275 /* Broken stripe not found. */ 276 if (broken == NULL) 277 return (ENXIO); 278 279 switch (broken->state) { 280 case GV_SD_UP: 281 return (EINVAL); 282 283 case GV_SD_STALE: 284 if (!(bp->bio_pflags & GV_BIO_REBUILD)) 285 return (ENXIO); 286 287 G_VINUM_DEBUG(1, "sd %s is reviving", broken->name); 288 gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE); 289 /* Set this bit now, but should be set at end. */ 290 broken->flags |= GV_SD_CANGOUP; 291 break; 292 293 case GV_SD_REVIVING: 294 break; 295 296 default: 297 /* All other subdisk states mean it's not accessible. */ 298 return (ENXIO); 299 } 300 301 wp->length = real_len; 302 wp->data = addr; 303 wp->lockbase = real_off; 304 305 KASSERT(wp->length >= 0, ("gv_rebuild_raid5: wp->length < 0")); 306 307 /* Read all subdisks. */ 308 LIST_FOREACH(s, &p->subdisks, in_plex) { 309 /* Skip the broken subdisk. */ 310 if (s == broken) 311 continue; 312 313 /* Skip growing subdisks. */ 314 if (s->flags & GV_SD_GROW) 315 continue; 316 317 cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); 318 if (cbp == NULL) 319 return (ENOMEM); 320 cbp->bio_cmd = BIO_READ; 321 322 bioq_insert_tail(p->bqueue, cbp); 323 324 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 325 bq->bp = cbp; 326 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 327 } 328 329 /* Write the parity data. */ 330 cbp = gv_raid5_clone_bio(bp, broken, wp, NULL, 1); 331 if (cbp == NULL) 332 return (ENOMEM); 333 wp->parity = cbp; 334 335 p->synced = boff; 336 337 /* Post notification that we're finished. */ 338 return (0); 339 } 340 341 /* Build a request group to perform (part of) a RAID5 request. */ 342 static int 343 gv_raid5_request(struct gv_plex *p, struct gv_raid5_packet *wp, 344 struct bio *bp, caddr_t addr, off_t boff, off_t bcount, int *delay) 345 { 346 struct gv_sd *broken, *original, *parity, *s; 347 struct gv_bioq *bq; 348 struct bio *cbp; 349 int i, psdno, sdno, type, grow; 350 off_t real_len, real_off; 351 352 if (p == NULL || LIST_EMPTY(&p->subdisks)) 353 return (ENXIO); 354 355 /* We are optimistic and assume that this request will be OK. */ 356 #define REQ_TYPE_NORMAL 0 357 #define REQ_TYPE_DEGRADED 1 358 #define REQ_TYPE_NOPARITY 2 359 360 type = REQ_TYPE_NORMAL; 361 original = parity = broken = NULL; 362 363 /* XXX: The resize won't crash with rebuild or sync, but we should still 364 * be aware of it. Also this should perhaps be done on rebuild/check as 365 * well? 366 */ 367 /* If we're over, we must use the old. */ 368 if (boff >= p->synced) { 369 grow = 1; 370 /* Or if over the resized offset, we use all drives. */ 371 } else if (boff + bcount <= p->synced) { 372 grow = 0; 373 /* Else, we're in the middle, and must wait a bit. */ 374 } else { 375 bioq_disksort(p->rqueue, bp); 376 *delay = 1; 377 return (0); 378 } 379 gv_raid5_offset(p, boff, bcount, &real_off, &real_len, 380 &sdno, &psdno, grow); 381 382 /* Find the right subdisks. */ 383 i = 0; 384 LIST_FOREACH(s, &p->subdisks, in_plex) { 385 if (i == sdno) 386 original = s; 387 if (i == psdno) 388 parity = s; 389 if (s->state != GV_SD_UP) 390 broken = s; 391 i++; 392 } 393 394 if ((original == NULL) || (parity == NULL)) 395 return (ENXIO); 396 397 /* Our data stripe is missing. */ 398 if (original->state != GV_SD_UP) 399 type = REQ_TYPE_DEGRADED; 400 401 /* If synchronizing request, just write it if disks are stale. */ 402 if (original->state == GV_SD_STALE && parity->state == GV_SD_STALE && 403 bp->bio_pflags & GV_BIO_SYNCREQ && bp->bio_cmd == BIO_WRITE) { 404 type = REQ_TYPE_NORMAL; 405 /* Our parity stripe is missing. */ 406 } else if (parity->state != GV_SD_UP) { 407 /* We cannot take another failure if we're already degraded. */ 408 if (type != REQ_TYPE_NORMAL) 409 return (ENXIO); 410 else 411 type = REQ_TYPE_NOPARITY; 412 } 413 414 wp->length = real_len; 415 wp->data = addr; 416 wp->lockbase = real_off; 417 418 KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0")); 419 420 if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len < p->synced)) 421 type = REQ_TYPE_NORMAL; 422 423 if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len >= p->synced)) { 424 bioq_disksort(p->rqueue, bp); 425 *delay = 1; 426 return (0); 427 } 428 429 switch (bp->bio_cmd) { 430 case BIO_READ: 431 /* 432 * For a degraded read we need to read in all stripes except 433 * the broken one plus the parity stripe and then recalculate 434 * the desired data. 435 */ 436 if (type == REQ_TYPE_DEGRADED) { 437 bzero(wp->data, wp->length); 438 LIST_FOREACH(s, &p->subdisks, in_plex) { 439 /* Skip the broken subdisk. */ 440 if (s == broken) 441 continue; 442 /* Skip growing if within offset. */ 443 if (grow && s->flags & GV_SD_GROW) 444 continue; 445 cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); 446 if (cbp == NULL) 447 return (ENOMEM); 448 449 bioq_insert_tail(p->bqueue, cbp); 450 451 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 452 bq->bp = cbp; 453 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 454 } 455 456 /* A normal read can be fulfilled with the original subdisk. */ 457 } else { 458 cbp = gv_raid5_clone_bio(bp, original, wp, addr, 0); 459 if (cbp == NULL) 460 return (ENOMEM); 461 462 bioq_insert_tail(p->bqueue, cbp); 463 } 464 wp->lockbase = -1; 465 466 break; 467 468 case BIO_WRITE: 469 /* 470 * A degraded write means we cannot write to the original data 471 * subdisk. Thus we need to read in all valid stripes, 472 * recalculate the parity from the original data, and then 473 * write the parity stripe back out. 474 */ 475 if (type == REQ_TYPE_DEGRADED) { 476 /* Read all subdisks. */ 477 LIST_FOREACH(s, &p->subdisks, in_plex) { 478 /* Skip the broken and the parity subdisk. */ 479 if ((s == broken) || (s == parity)) 480 continue; 481 /* Skip growing if within offset. */ 482 if (grow && s->flags & GV_SD_GROW) 483 continue; 484 485 cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); 486 if (cbp == NULL) 487 return (ENOMEM); 488 cbp->bio_cmd = BIO_READ; 489 490 bioq_insert_tail(p->bqueue, cbp); 491 492 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 493 bq->bp = cbp; 494 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 495 } 496 497 /* Write the parity data. */ 498 cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); 499 if (cbp == NULL) 500 return (ENOMEM); 501 bcopy(addr, cbp->bio_data, wp->length); 502 wp->parity = cbp; 503 504 /* 505 * When the parity stripe is missing we just write out the data. 506 */ 507 } else if (type == REQ_TYPE_NOPARITY) { 508 cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1); 509 if (cbp == NULL) 510 return (ENOMEM); 511 512 bioq_insert_tail(p->bqueue, cbp); 513 514 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 515 bq->bp = cbp; 516 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 517 518 /* 519 * A normal write request goes to the original subdisk, then we 520 * read in all other stripes, recalculate the parity and write 521 * out the parity again. 522 */ 523 } else { 524 /* Read old parity. */ 525 cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); 526 if (cbp == NULL) 527 return (ENOMEM); 528 cbp->bio_cmd = BIO_READ; 529 530 bioq_insert_tail(p->bqueue, cbp); 531 532 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 533 bq->bp = cbp; 534 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 535 536 /* Read old data. */ 537 cbp = gv_raid5_clone_bio(bp, original, wp, NULL, 1); 538 if (cbp == NULL) 539 return (ENOMEM); 540 cbp->bio_cmd = BIO_READ; 541 542 bioq_insert_tail(p->bqueue, cbp); 543 544 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 545 bq->bp = cbp; 546 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 547 548 /* Write new data. */ 549 cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1); 550 if (cbp == NULL) 551 return (ENOMEM); 552 553 /* 554 * We must not write the new data until the old data 555 * was read, so hold this BIO back until we're ready 556 * for it. 557 */ 558 wp->waiting = cbp; 559 560 /* The final bio for the parity. */ 561 cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); 562 if (cbp == NULL) 563 return (ENOMEM); 564 565 /* Remember that this is the BIO for the parity data. */ 566 wp->parity = cbp; 567 } 568 break; 569 570 default: 571 return (EINVAL); 572 } 573 574 return (0); 575 } 576 577 /* 578 * Calculate the offsets in the various subdisks for a RAID5 request. Also take 579 * care of new subdisks in an expanded RAID5 array. 580 * XXX: This assumes that the new subdisks are inserted after the others (which 581 * is okay as long as plex_offset is larger). If subdisks are inserted into the 582 * plexlist before, we get problems. 583 */ 584 static int 585 gv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off, 586 off_t *real_len, int *sdno, int *psdno, int growing) 587 { 588 struct gv_sd *s; 589 int sd, psd, sdcount; 590 off_t len_left, stripeend, stripeoff, stripestart; 591 592 sdcount = p->sdcount; 593 if (growing) { 594 LIST_FOREACH(s, &p->subdisks, in_plex) { 595 if (s->flags & GV_SD_GROW) 596 sdcount--; 597 } 598 } 599 600 /* The number of the subdisk containing the parity stripe. */ 601 psd = sdcount - 1 - ( boff / (p->stripesize * (sdcount - 1))) % 602 sdcount; 603 KASSERT(psd >= 0, ("gv_raid5_offset: psdno < 0")); 604 605 /* Offset of the start address from the start of the stripe. */ 606 stripeoff = boff % (p->stripesize * (sdcount - 1)); 607 KASSERT(stripeoff >= 0, ("gv_raid5_offset: stripeoff < 0")); 608 609 /* The number of the subdisk where the stripe resides. */ 610 sd = stripeoff / p->stripesize; 611 KASSERT(sd >= 0, ("gv_raid5_offset: sdno < 0")); 612 613 /* At or past parity subdisk. */ 614 if (sd >= psd) 615 sd++; 616 617 /* The offset of the stripe on this subdisk. */ 618 stripestart = (boff - stripeoff) / (sdcount - 1); 619 KASSERT(stripestart >= 0, ("gv_raid5_offset: stripestart < 0")); 620 621 stripeoff %= p->stripesize; 622 623 /* The offset of the request on this subdisk. */ 624 *real_off = stripestart + stripeoff; 625 626 stripeend = stripestart + p->stripesize; 627 len_left = stripeend - *real_off; 628 KASSERT(len_left >= 0, ("gv_raid5_offset: len_left < 0")); 629 630 *real_len = (bcount <= len_left) ? bcount : len_left; 631 632 if (sdno != NULL) 633 *sdno = sd; 634 if (psdno != NULL) 635 *psdno = psd; 636 637 return (0); 638 } 639 640 static struct bio * 641 gv_raid5_clone_bio(struct bio *bp, struct gv_sd *s, struct gv_raid5_packet *wp, 642 caddr_t addr, int use_wp) 643 { 644 struct bio *cbp; 645 646 cbp = g_clone_bio(bp); 647 if (cbp == NULL) 648 return (NULL); 649 if (addr == NULL) { 650 cbp->bio_data = g_malloc(wp->length, M_WAITOK | M_ZERO); 651 cbp->bio_cflags |= GV_BIO_MALLOC; 652 } else 653 cbp->bio_data = addr; 654 cbp->bio_offset = wp->lockbase + s->drive_offset; 655 cbp->bio_length = wp->length; 656 cbp->bio_done = gv_done; 657 cbp->bio_caller1 = s; 658 s->drive_sc->active++; 659 if (use_wp) 660 cbp->bio_caller2 = wp; 661 662 return (cbp); 663 } 664