1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2004, 2007 Lukas Ertl 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 */ 28 29 #include <sys/cdefs.h> 30 __FBSDID("$FreeBSD$"); 31 32 #include <sys/param.h> 33 #include <sys/bio.h> 34 #include <sys/lock.h> 35 #include <sys/malloc.h> 36 #include <sys/systm.h> 37 38 #include <geom/geom.h> 39 #include <geom/geom_dbg.h> 40 #include <geom/vinum/geom_vinum_var.h> 41 #include <geom/vinum/geom_vinum_raid5.h> 42 #include <geom/vinum/geom_vinum.h> 43 44 static int gv_raid5_offset(struct gv_plex *, off_t, off_t, 45 off_t *, off_t *, int *, int *, int); 46 static struct bio * gv_raid5_clone_bio(struct bio *, struct gv_sd *, 47 struct gv_raid5_packet *, caddr_t, int); 48 static int gv_raid5_request(struct gv_plex *, struct gv_raid5_packet *, 49 struct bio *, caddr_t, off_t, off_t, int *); 50 static int gv_raid5_check(struct gv_plex *, struct gv_raid5_packet *, 51 struct bio *, caddr_t, off_t, off_t); 52 static int gv_raid5_rebuild(struct gv_plex *, struct gv_raid5_packet *, 53 struct bio *, caddr_t, off_t, off_t); 54 55 struct gv_raid5_packet * 56 gv_raid5_start(struct gv_plex *p, struct bio *bp, caddr_t addr, off_t boff, 57 off_t bcount) 58 { 59 struct bio *cbp; 60 struct gv_raid5_packet *wp, *wp2; 61 struct gv_bioq *bq, *bq2; 62 int err, delay; 63 64 delay = 0; 65 wp = g_malloc(sizeof(*wp), M_WAITOK | M_ZERO); 66 wp->bio = bp; 67 wp->waiting = NULL; 68 wp->parity = NULL; 69 TAILQ_INIT(&wp->bits); 70 71 if (bp->bio_pflags & GV_BIO_REBUILD) 72 err = gv_raid5_rebuild(p, wp, bp, addr, boff, bcount); 73 else if (bp->bio_pflags & GV_BIO_CHECK) 74 err = gv_raid5_check(p, wp, bp, addr, boff, bcount); 75 else 76 err = gv_raid5_request(p, wp, bp, addr, boff, bcount, &delay); 77 78 /* Means we have a delayed request. */ 79 if (delay) { 80 g_free(wp); 81 return (NULL); 82 } 83 84 /* 85 * Building the sub-request failed, we probably need to clean up a lot. 86 */ 87 if (err) { 88 G_VINUM_LOGREQ(0, bp, "raid5 plex request failed."); 89 TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { 90 TAILQ_REMOVE(&wp->bits, bq, queue); 91 g_free(bq); 92 } 93 if (wp->waiting != NULL) { 94 if (wp->waiting->bio_cflags & GV_BIO_MALLOC) 95 g_free(wp->waiting->bio_data); 96 gv_drive_done(wp->waiting->bio_caller1); 97 g_destroy_bio(wp->waiting); 98 } 99 if (wp->parity != NULL) { 100 if (wp->parity->bio_cflags & GV_BIO_MALLOC) 101 g_free(wp->parity->bio_data); 102 gv_drive_done(wp->parity->bio_caller1); 103 g_destroy_bio(wp->parity); 104 } 105 g_free(wp); 106 107 TAILQ_FOREACH_SAFE(wp, &p->packets, list, wp2) { 108 if (wp->bio != bp) 109 continue; 110 111 TAILQ_REMOVE(&p->packets, wp, list); 112 TAILQ_FOREACH_SAFE(bq, &wp->bits, queue, bq2) { 113 TAILQ_REMOVE(&wp->bits, bq, queue); 114 g_free(bq); 115 } 116 g_free(wp); 117 } 118 119 cbp = bioq_takefirst(p->bqueue); 120 while (cbp != NULL) { 121 if (cbp->bio_cflags & GV_BIO_MALLOC) 122 g_free(cbp->bio_data); 123 gv_drive_done(cbp->bio_caller1); 124 g_destroy_bio(cbp); 125 cbp = bioq_takefirst(p->bqueue); 126 } 127 128 /* If internal, stop and reset state. */ 129 if (bp->bio_pflags & GV_BIO_INTERNAL) { 130 if (bp->bio_pflags & GV_BIO_MALLOC) 131 g_free(bp->bio_data); 132 g_destroy_bio(bp); 133 /* Reset flags. */ 134 p->flags &= ~(GV_PLEX_SYNCING | GV_PLEX_REBUILDING | 135 GV_PLEX_GROWING); 136 return (NULL); 137 } 138 g_io_deliver(bp, err); 139 return (NULL); 140 } 141 142 return (wp); 143 } 144 145 /* 146 * Check if the stripe that the work packet wants is already being used by 147 * some other work packet. 148 */ 149 int 150 gv_stripe_active(struct gv_plex *p, struct bio *bp) 151 { 152 struct gv_raid5_packet *wp, *owp; 153 int overlap; 154 155 wp = bp->bio_caller2; 156 if (wp->lockbase == -1) 157 return (0); 158 159 overlap = 0; 160 TAILQ_FOREACH(owp, &p->packets, list) { 161 if (owp == wp) 162 break; 163 if ((wp->lockbase >= owp->lockbase) && 164 (wp->lockbase <= owp->lockbase + owp->length)) { 165 overlap++; 166 break; 167 } 168 if ((wp->lockbase <= owp->lockbase) && 169 (wp->lockbase + wp->length >= owp->lockbase)) { 170 overlap++; 171 break; 172 } 173 } 174 175 return (overlap); 176 } 177 178 static int 179 gv_raid5_check(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, 180 caddr_t addr, off_t boff, off_t bcount) 181 { 182 struct gv_sd *parity, *s; 183 struct gv_bioq *bq; 184 struct bio *cbp; 185 int i, psdno; 186 off_t real_len, real_off; 187 188 if (p == NULL || LIST_EMPTY(&p->subdisks)) 189 return (ENXIO); 190 191 gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, &psdno, 1); 192 193 /* Find the right subdisk. */ 194 parity = NULL; 195 i = 0; 196 LIST_FOREACH(s, &p->subdisks, in_plex) { 197 if (i == psdno) { 198 parity = s; 199 break; 200 } 201 i++; 202 } 203 204 /* Parity stripe not found. */ 205 if (parity == NULL) 206 return (ENXIO); 207 208 if (parity->state != GV_SD_UP) 209 return (ENXIO); 210 211 wp->length = real_len; 212 wp->data = addr; 213 wp->lockbase = real_off; 214 215 /* Read all subdisks. */ 216 LIST_FOREACH(s, &p->subdisks, in_plex) { 217 /* Skip the parity subdisk. */ 218 if (s == parity) 219 continue; 220 /* Skip growing subdisks. */ 221 if (s->flags & GV_SD_GROW) 222 continue; 223 224 cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); 225 if (cbp == NULL) 226 return (ENOMEM); 227 cbp->bio_cmd = BIO_READ; 228 229 bioq_insert_tail(p->bqueue, cbp); 230 231 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 232 bq->bp = cbp; 233 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 234 } 235 236 /* Read the parity data. */ 237 cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); 238 if (cbp == NULL) 239 return (ENOMEM); 240 cbp->bio_cmd = BIO_READ; 241 wp->waiting = cbp; 242 243 /* 244 * In case we want to rebuild the parity, create an extra BIO to write 245 * it out. It also acts as buffer for the XOR operations. 246 */ 247 cbp = gv_raid5_clone_bio(bp, parity, wp, addr, 1); 248 if (cbp == NULL) 249 return (ENOMEM); 250 wp->parity = cbp; 251 252 return (0); 253 } 254 255 /* Rebuild a degraded RAID5 plex. */ 256 static int 257 gv_raid5_rebuild(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, 258 caddr_t addr, off_t boff, off_t bcount) 259 { 260 struct gv_sd *broken, *s; 261 struct gv_bioq *bq; 262 struct bio *cbp; 263 off_t real_len, real_off; 264 265 if (p == NULL || LIST_EMPTY(&p->subdisks)) 266 return (ENXIO); 267 268 gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, NULL, 1); 269 270 /* Find the right subdisk. */ 271 broken = NULL; 272 LIST_FOREACH(s, &p->subdisks, in_plex) { 273 if (s->state != GV_SD_UP) 274 broken = s; 275 } 276 277 /* Broken stripe not found. */ 278 if (broken == NULL) 279 return (ENXIO); 280 281 switch (broken->state) { 282 case GV_SD_UP: 283 return (EINVAL); 284 285 case GV_SD_STALE: 286 if (!(bp->bio_pflags & GV_BIO_REBUILD)) 287 return (ENXIO); 288 289 G_VINUM_DEBUG(1, "sd %s is reviving", broken->name); 290 gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE); 291 /* Set this bit now, but should be set at end. */ 292 broken->flags |= GV_SD_CANGOUP; 293 break; 294 295 case GV_SD_REVIVING: 296 break; 297 298 default: 299 /* All other subdisk states mean it's not accessible. */ 300 return (ENXIO); 301 } 302 303 wp->length = real_len; 304 wp->data = addr; 305 wp->lockbase = real_off; 306 307 KASSERT(wp->length >= 0, ("gv_rebuild_raid5: wp->length < 0")); 308 309 /* Read all subdisks. */ 310 LIST_FOREACH(s, &p->subdisks, in_plex) { 311 /* Skip the broken subdisk. */ 312 if (s == broken) 313 continue; 314 315 /* Skip growing subdisks. */ 316 if (s->flags & GV_SD_GROW) 317 continue; 318 319 cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); 320 if (cbp == NULL) 321 return (ENOMEM); 322 cbp->bio_cmd = BIO_READ; 323 324 bioq_insert_tail(p->bqueue, cbp); 325 326 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 327 bq->bp = cbp; 328 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 329 } 330 331 /* Write the parity data. */ 332 cbp = gv_raid5_clone_bio(bp, broken, wp, NULL, 1); 333 if (cbp == NULL) 334 return (ENOMEM); 335 wp->parity = cbp; 336 337 p->synced = boff; 338 339 /* Post notification that we're finished. */ 340 return (0); 341 } 342 343 /* Build a request group to perform (part of) a RAID5 request. */ 344 static int 345 gv_raid5_request(struct gv_plex *p, struct gv_raid5_packet *wp, 346 struct bio *bp, caddr_t addr, off_t boff, off_t bcount, int *delay) 347 { 348 struct gv_sd *broken, *original, *parity, *s; 349 struct gv_bioq *bq; 350 struct bio *cbp; 351 int i, psdno, sdno, type, grow; 352 off_t real_len, real_off; 353 354 if (p == NULL || LIST_EMPTY(&p->subdisks)) 355 return (ENXIO); 356 357 /* We are optimistic and assume that this request will be OK. */ 358 #define REQ_TYPE_NORMAL 0 359 #define REQ_TYPE_DEGRADED 1 360 #define REQ_TYPE_NOPARITY 2 361 362 type = REQ_TYPE_NORMAL; 363 original = parity = broken = NULL; 364 365 /* XXX: The resize won't crash with rebuild or sync, but we should still 366 * be aware of it. Also this should perhaps be done on rebuild/check as 367 * well? 368 */ 369 /* If we're over, we must use the old. */ 370 if (boff >= p->synced) { 371 grow = 1; 372 /* Or if over the resized offset, we use all drives. */ 373 } else if (boff + bcount <= p->synced) { 374 grow = 0; 375 /* Else, we're in the middle, and must wait a bit. */ 376 } else { 377 bioq_disksort(p->rqueue, bp); 378 *delay = 1; 379 return (0); 380 } 381 gv_raid5_offset(p, boff, bcount, &real_off, &real_len, 382 &sdno, &psdno, grow); 383 384 /* Find the right subdisks. */ 385 i = 0; 386 LIST_FOREACH(s, &p->subdisks, in_plex) { 387 if (i == sdno) 388 original = s; 389 if (i == psdno) 390 parity = s; 391 if (s->state != GV_SD_UP) 392 broken = s; 393 i++; 394 } 395 396 if ((original == NULL) || (parity == NULL)) 397 return (ENXIO); 398 399 /* Our data stripe is missing. */ 400 if (original->state != GV_SD_UP) 401 type = REQ_TYPE_DEGRADED; 402 403 /* If synchronizing request, just write it if disks are stale. */ 404 if (original->state == GV_SD_STALE && parity->state == GV_SD_STALE && 405 bp->bio_pflags & GV_BIO_SYNCREQ && bp->bio_cmd == BIO_WRITE) { 406 type = REQ_TYPE_NORMAL; 407 /* Our parity stripe is missing. */ 408 } else if (parity->state != GV_SD_UP) { 409 /* We cannot take another failure if we're already degraded. */ 410 if (type != REQ_TYPE_NORMAL) 411 return (ENXIO); 412 else 413 type = REQ_TYPE_NOPARITY; 414 } 415 416 wp->length = real_len; 417 wp->data = addr; 418 wp->lockbase = real_off; 419 420 KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0")); 421 422 if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len < p->synced)) 423 type = REQ_TYPE_NORMAL; 424 425 if ((p->flags & GV_PLEX_REBUILDING) && (boff + real_len >= p->synced)) { 426 bioq_disksort(p->rqueue, bp); 427 *delay = 1; 428 return (0); 429 } 430 431 switch (bp->bio_cmd) { 432 case BIO_READ: 433 /* 434 * For a degraded read we need to read in all stripes except 435 * the broken one plus the parity stripe and then recalculate 436 * the desired data. 437 */ 438 if (type == REQ_TYPE_DEGRADED) { 439 bzero(wp->data, wp->length); 440 LIST_FOREACH(s, &p->subdisks, in_plex) { 441 /* Skip the broken subdisk. */ 442 if (s == broken) 443 continue; 444 /* Skip growing if within offset. */ 445 if (grow && s->flags & GV_SD_GROW) 446 continue; 447 cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); 448 if (cbp == NULL) 449 return (ENOMEM); 450 451 bioq_insert_tail(p->bqueue, cbp); 452 453 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 454 bq->bp = cbp; 455 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 456 } 457 458 /* A normal read can be fulfilled with the original subdisk. */ 459 } else { 460 cbp = gv_raid5_clone_bio(bp, original, wp, addr, 0); 461 if (cbp == NULL) 462 return (ENOMEM); 463 464 bioq_insert_tail(p->bqueue, cbp); 465 } 466 wp->lockbase = -1; 467 468 break; 469 470 case BIO_WRITE: 471 /* 472 * A degraded write means we cannot write to the original data 473 * subdisk. Thus we need to read in all valid stripes, 474 * recalculate the parity from the original data, and then 475 * write the parity stripe back out. 476 */ 477 if (type == REQ_TYPE_DEGRADED) { 478 /* Read all subdisks. */ 479 LIST_FOREACH(s, &p->subdisks, in_plex) { 480 /* Skip the broken and the parity subdisk. */ 481 if ((s == broken) || (s == parity)) 482 continue; 483 /* Skip growing if within offset. */ 484 if (grow && s->flags & GV_SD_GROW) 485 continue; 486 487 cbp = gv_raid5_clone_bio(bp, s, wp, NULL, 1); 488 if (cbp == NULL) 489 return (ENOMEM); 490 cbp->bio_cmd = BIO_READ; 491 492 bioq_insert_tail(p->bqueue, cbp); 493 494 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 495 bq->bp = cbp; 496 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 497 } 498 499 /* Write the parity data. */ 500 cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); 501 if (cbp == NULL) 502 return (ENOMEM); 503 bcopy(addr, cbp->bio_data, wp->length); 504 wp->parity = cbp; 505 506 /* 507 * When the parity stripe is missing we just write out the data. 508 */ 509 } else if (type == REQ_TYPE_NOPARITY) { 510 cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1); 511 if (cbp == NULL) 512 return (ENOMEM); 513 514 bioq_insert_tail(p->bqueue, cbp); 515 516 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 517 bq->bp = cbp; 518 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 519 520 /* 521 * A normal write request goes to the original subdisk, then we 522 * read in all other stripes, recalculate the parity and write 523 * out the parity again. 524 */ 525 } else { 526 /* Read old parity. */ 527 cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); 528 if (cbp == NULL) 529 return (ENOMEM); 530 cbp->bio_cmd = BIO_READ; 531 532 bioq_insert_tail(p->bqueue, cbp); 533 534 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 535 bq->bp = cbp; 536 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 537 538 /* Read old data. */ 539 cbp = gv_raid5_clone_bio(bp, original, wp, NULL, 1); 540 if (cbp == NULL) 541 return (ENOMEM); 542 cbp->bio_cmd = BIO_READ; 543 544 bioq_insert_tail(p->bqueue, cbp); 545 546 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 547 bq->bp = cbp; 548 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 549 550 /* Write new data. */ 551 cbp = gv_raid5_clone_bio(bp, original, wp, addr, 1); 552 if (cbp == NULL) 553 return (ENOMEM); 554 555 /* 556 * We must not write the new data until the old data 557 * was read, so hold this BIO back until we're ready 558 * for it. 559 */ 560 wp->waiting = cbp; 561 562 /* The final bio for the parity. */ 563 cbp = gv_raid5_clone_bio(bp, parity, wp, NULL, 1); 564 if (cbp == NULL) 565 return (ENOMEM); 566 567 /* Remember that this is the BIO for the parity data. */ 568 wp->parity = cbp; 569 } 570 break; 571 572 default: 573 return (EINVAL); 574 } 575 576 return (0); 577 } 578 579 /* 580 * Calculate the offsets in the various subdisks for a RAID5 request. Also take 581 * care of new subdisks in an expanded RAID5 array. 582 * XXX: This assumes that the new subdisks are inserted after the others (which 583 * is okay as long as plex_offset is larger). If subdisks are inserted into the 584 * plexlist before, we get problems. 585 */ 586 static int 587 gv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off, 588 off_t *real_len, int *sdno, int *psdno, int growing) 589 { 590 struct gv_sd *s; 591 int sd, psd, sdcount; 592 off_t len_left, stripeend, stripeoff, stripestart; 593 594 sdcount = p->sdcount; 595 if (growing) { 596 LIST_FOREACH(s, &p->subdisks, in_plex) { 597 if (s->flags & GV_SD_GROW) 598 sdcount--; 599 } 600 } 601 602 /* The number of the subdisk containing the parity stripe. */ 603 psd = sdcount - 1 - ( boff / (p->stripesize * (sdcount - 1))) % 604 sdcount; 605 KASSERT(psdno >= 0, ("gv_raid5_offset: psdno < 0")); 606 607 /* Offset of the start address from the start of the stripe. */ 608 stripeoff = boff % (p->stripesize * (sdcount - 1)); 609 KASSERT(stripeoff >= 0, ("gv_raid5_offset: stripeoff < 0")); 610 611 /* The number of the subdisk where the stripe resides. */ 612 sd = stripeoff / p->stripesize; 613 KASSERT(sdno >= 0, ("gv_raid5_offset: sdno < 0")); 614 615 /* At or past parity subdisk. */ 616 if (sd >= psd) 617 sd++; 618 619 /* The offset of the stripe on this subdisk. */ 620 stripestart = (boff - stripeoff) / (sdcount - 1); 621 KASSERT(stripestart >= 0, ("gv_raid5_offset: stripestart < 0")); 622 623 stripeoff %= p->stripesize; 624 625 /* The offset of the request on this subdisk. */ 626 *real_off = stripestart + stripeoff; 627 628 stripeend = stripestart + p->stripesize; 629 len_left = stripeend - *real_off; 630 KASSERT(len_left >= 0, ("gv_raid5_offset: len_left < 0")); 631 632 *real_len = (bcount <= len_left) ? bcount : len_left; 633 634 if (sdno != NULL) 635 *sdno = sd; 636 if (psdno != NULL) 637 *psdno = psd; 638 639 return (0); 640 } 641 642 static struct bio * 643 gv_raid5_clone_bio(struct bio *bp, struct gv_sd *s, struct gv_raid5_packet *wp, 644 caddr_t addr, int use_wp) 645 { 646 struct bio *cbp; 647 648 cbp = g_clone_bio(bp); 649 if (cbp == NULL) 650 return (NULL); 651 if (addr == NULL) { 652 cbp->bio_data = g_malloc(wp->length, M_WAITOK | M_ZERO); 653 cbp->bio_cflags |= GV_BIO_MALLOC; 654 } else 655 cbp->bio_data = addr; 656 cbp->bio_offset = wp->lockbase + s->drive_offset; 657 cbp->bio_length = wp->length; 658 cbp->bio_done = gv_done; 659 cbp->bio_caller1 = s; 660 s->drive_sc->active++; 661 if (use_wp) 662 cbp->bio_caller2 = wp; 663 664 return (cbp); 665 } 666