1 /*- 2 * Copyright (c) 2004 Lukas Ertl 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27 #include <sys/cdefs.h> 28 __FBSDID("$FreeBSD$"); 29 30 #include <sys/param.h> 31 #include <sys/bio.h> 32 #include <sys/conf.h> 33 #include <sys/errno.h> 34 #include <sys/kernel.h> 35 #include <sys/kthread.h> 36 #include <sys/libkern.h> 37 #include <sys/lock.h> 38 #include <sys/malloc.h> 39 #include <sys/mutex.h> 40 #include <sys/systm.h> 41 42 #include <geom/geom.h> 43 #include <geom/vinum/geom_vinum_var.h> 44 #include <geom/vinum/geom_vinum_raid5.h> 45 #include <geom/vinum/geom_vinum.h> 46 47 int gv_raid5_offset(struct gv_plex *, off_t, off_t, off_t *, off_t *, 48 int *, int *); 49 50 /* 51 * Check if the stripe that the work packet wants is already being used by 52 * some other work packet. 53 */ 54 int 55 gv_stripe_active(struct gv_plex *p, struct bio *bp) 56 { 57 struct gv_raid5_packet *wp, *owp; 58 int overlap; 59 60 wp = bp->bio_driver1; 61 if (wp->lockbase == -1) 62 return (0); 63 64 overlap = 0; 65 TAILQ_FOREACH(owp, &p->packets, list) { 66 if (owp == wp) 67 break; 68 if ((wp->lockbase >= owp->lockbase) && 69 (wp->lockbase <= owp->lockbase + owp->length)) { 70 overlap++; 71 break; 72 } 73 if ((wp->lockbase <= owp->lockbase) && 74 (wp->lockbase + wp->length >= owp->lockbase)) { 75 overlap++; 76 break; 77 } 78 } 79 80 return (overlap); 81 } 82 83 int 84 gv_check_raid5(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, 85 caddr_t addr, off_t boff, off_t bcount) 86 { 87 struct gv_sd *parity, *s; 88 struct gv_bioq *bq; 89 struct bio *cbp, *pbp; 90 int i, psdno; 91 off_t real_len, real_off; 92 93 if (p == NULL || LIST_EMPTY(&p->subdisks)) 94 return (ENXIO); 95 96 gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, &psdno); 97 98 /* Find the right subdisk. */ 99 parity = NULL; 100 i = 0; 101 LIST_FOREACH(s, &p->subdisks, in_plex) { 102 if (i == psdno) { 103 parity = s; 104 break; 105 } 106 i++; 107 } 108 109 /* Parity stripe not found. */ 110 if (parity == NULL) 111 return (ENXIO); 112 113 if (parity->state != GV_SD_UP) 114 return (ENXIO); 115 116 wp->length = real_len; 117 wp->data = addr; 118 wp->lockbase = real_off; 119 120 /* Read all subdisks. */ 121 LIST_FOREACH(s, &p->subdisks, in_plex) { 122 /* Skip the parity subdisk. */ 123 if (s == parity) 124 continue; 125 126 cbp = g_clone_bio(bp); 127 if (cbp == NULL) 128 return (ENOMEM); 129 cbp->bio_cmd = BIO_READ; 130 cbp->bio_data = g_malloc(real_len, M_WAITOK); 131 cbp->bio_cflags |= GV_BIO_MALLOC; 132 cbp->bio_offset = real_off; 133 cbp->bio_length = real_len; 134 cbp->bio_done = gv_plex_done; 135 cbp->bio_caller2 = s->consumer; 136 cbp->bio_driver1 = wp; 137 138 GV_ENQUEUE(bp, cbp, pbp); 139 140 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 141 bq->bp = cbp; 142 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 143 } 144 145 /* Read the parity data. */ 146 cbp = g_clone_bio(bp); 147 if (cbp == NULL) 148 return (ENOMEM); 149 cbp->bio_cmd = BIO_READ; 150 cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO); 151 cbp->bio_cflags |= GV_BIO_MALLOC; 152 cbp->bio_offset = real_off; 153 cbp->bio_length = real_len; 154 cbp->bio_done = gv_plex_done; 155 cbp->bio_caller2 = parity->consumer; 156 cbp->bio_driver1 = wp; 157 wp->waiting = cbp; 158 159 /* 160 * In case we want to rebuild the parity, create an extra BIO to write 161 * it out. It also acts as buffer for the XOR operations. 162 */ 163 cbp = g_clone_bio(bp); 164 if (cbp == NULL) 165 return (ENOMEM); 166 cbp->bio_data = addr; 167 cbp->bio_offset = real_off; 168 cbp->bio_length = real_len; 169 cbp->bio_done = gv_plex_done; 170 cbp->bio_caller2 = parity->consumer; 171 cbp->bio_driver1 = wp; 172 wp->parity = cbp; 173 174 return (0); 175 } 176 177 /* Rebuild a degraded RAID5 plex. */ 178 int 179 gv_rebuild_raid5(struct gv_plex *p, struct gv_raid5_packet *wp, struct bio *bp, 180 caddr_t addr, off_t boff, off_t bcount) 181 { 182 struct gv_sd *broken, *s; 183 struct gv_bioq *bq; 184 struct bio *cbp, *pbp; 185 off_t real_len, real_off; 186 187 if (p == NULL || LIST_EMPTY(&p->subdisks)) 188 return (ENXIO); 189 190 gv_raid5_offset(p, boff, bcount, &real_off, &real_len, NULL, NULL); 191 192 /* Find the right subdisk. */ 193 broken = NULL; 194 LIST_FOREACH(s, &p->subdisks, in_plex) { 195 if (s->state != GV_SD_UP) 196 broken = s; 197 } 198 199 /* Broken stripe not found. */ 200 if (broken == NULL) 201 return (ENXIO); 202 203 switch (broken->state) { 204 case GV_SD_UP: 205 return (EINVAL); 206 207 case GV_SD_STALE: 208 if (!(bp->bio_cflags & GV_BIO_REBUILD)) 209 return (ENXIO); 210 211 printf("GEOM_VINUM: sd %s is reviving\n", broken->name); 212 gv_set_sd_state(broken, GV_SD_REVIVING, GV_SETSTATE_FORCE); 213 break; 214 215 case GV_SD_REVIVING: 216 break; 217 218 default: 219 /* All other subdisk states mean it's not accessible. */ 220 return (ENXIO); 221 } 222 223 wp->length = real_len; 224 wp->data = addr; 225 wp->lockbase = real_off; 226 227 KASSERT(wp->length >= 0, ("gv_rebuild_raid5: wp->length < 0")); 228 229 /* Read all subdisks. */ 230 LIST_FOREACH(s, &p->subdisks, in_plex) { 231 /* Skip the broken subdisk. */ 232 if (s == broken) 233 continue; 234 235 cbp = g_clone_bio(bp); 236 if (cbp == NULL) 237 return (ENOMEM); 238 cbp->bio_cmd = BIO_READ; 239 cbp->bio_data = g_malloc(real_len, M_WAITOK); 240 cbp->bio_cflags |= GV_BIO_MALLOC; 241 cbp->bio_offset = real_off; 242 cbp->bio_length = real_len; 243 cbp->bio_done = gv_plex_done; 244 cbp->bio_caller2 = s->consumer; 245 cbp->bio_driver1 = wp; 246 247 GV_ENQUEUE(bp, cbp, pbp); 248 249 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 250 bq->bp = cbp; 251 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 252 } 253 254 /* Write the parity data. */ 255 cbp = g_clone_bio(bp); 256 if (cbp == NULL) 257 return (ENOMEM); 258 cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO); 259 cbp->bio_cflags |= GV_BIO_MALLOC; 260 cbp->bio_offset = real_off; 261 cbp->bio_length = real_len; 262 cbp->bio_done = gv_plex_done; 263 cbp->bio_caller2 = broken->consumer; 264 cbp->bio_driver1 = wp; 265 cbp->bio_cflags |= GV_BIO_REBUILD; 266 wp->parity = cbp; 267 268 p->synced = boff; 269 270 return (0); 271 } 272 273 /* Build a request group to perform (part of) a RAID5 request. */ 274 int 275 gv_build_raid5_req(struct gv_plex *p, struct gv_raid5_packet *wp, 276 struct bio *bp, caddr_t addr, off_t boff, off_t bcount) 277 { 278 struct g_geom *gp; 279 struct gv_sd *broken, *original, *parity, *s; 280 struct gv_bioq *bq; 281 struct bio *cbp, *pbp; 282 int i, psdno, sdno, type; 283 off_t real_len, real_off; 284 285 gp = bp->bio_to->geom; 286 287 if (p == NULL || LIST_EMPTY(&p->subdisks)) 288 return (ENXIO); 289 290 /* We are optimistic and assume that this request will be OK. */ 291 #define REQ_TYPE_NORMAL 0 292 #define REQ_TYPE_DEGRADED 1 293 #define REQ_TYPE_NOPARITY 2 294 295 type = REQ_TYPE_NORMAL; 296 original = parity = broken = NULL; 297 298 gv_raid5_offset(p, boff, bcount, &real_off, &real_len, &sdno, &psdno); 299 300 /* Find the right subdisks. */ 301 i = 0; 302 LIST_FOREACH(s, &p->subdisks, in_plex) { 303 if (i == sdno) 304 original = s; 305 if (i == psdno) 306 parity = s; 307 if (s->state != GV_SD_UP) 308 broken = s; 309 i++; 310 } 311 312 if ((original == NULL) || (parity == NULL)) 313 return (ENXIO); 314 315 /* Our data stripe is missing. */ 316 if (original->state != GV_SD_UP) 317 type = REQ_TYPE_DEGRADED; 318 /* Our parity stripe is missing. */ 319 if (parity->state != GV_SD_UP) { 320 /* We cannot take another failure if we're already degraded. */ 321 if (type != REQ_TYPE_NORMAL) 322 return (ENXIO); 323 else 324 type = REQ_TYPE_NOPARITY; 325 } 326 327 wp->length = real_len; 328 wp->data = addr; 329 wp->lockbase = real_off; 330 331 KASSERT(wp->length >= 0, ("gv_build_raid5_request: wp->length < 0")); 332 333 if ((p->flags & GV_PLEX_SYNCING) && (boff + real_len < p->synced)) 334 type = REQ_TYPE_NORMAL; 335 336 switch (bp->bio_cmd) { 337 case BIO_READ: 338 /* 339 * For a degraded read we need to read in all stripes except 340 * the broken one plus the parity stripe and then recalculate 341 * the desired data. 342 */ 343 if (type == REQ_TYPE_DEGRADED) { 344 bzero(wp->data, wp->length); 345 LIST_FOREACH(s, &p->subdisks, in_plex) { 346 /* Skip the broken subdisk. */ 347 if (s == broken) 348 continue; 349 cbp = g_clone_bio(bp); 350 if (cbp == NULL) 351 return (ENOMEM); 352 cbp->bio_data = g_malloc(real_len, M_WAITOK); 353 cbp->bio_cflags |= GV_BIO_MALLOC; 354 cbp->bio_offset = real_off; 355 cbp->bio_length = real_len; 356 cbp->bio_done = gv_plex_done; 357 cbp->bio_caller2 = s->consumer; 358 cbp->bio_driver1 = wp; 359 360 GV_ENQUEUE(bp, cbp, pbp); 361 362 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 363 bq->bp = cbp; 364 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 365 } 366 367 /* A normal read can be fulfilled with the original subdisk. */ 368 } else { 369 cbp = g_clone_bio(bp); 370 if (cbp == NULL) 371 return (ENOMEM); 372 cbp->bio_offset = real_off; 373 cbp->bio_length = real_len; 374 cbp->bio_data = addr; 375 cbp->bio_done = g_std_done; 376 cbp->bio_caller2 = original->consumer; 377 378 GV_ENQUEUE(bp, cbp, pbp); 379 } 380 wp->lockbase = -1; 381 382 break; 383 384 case BIO_WRITE: 385 /* 386 * A degraded write means we cannot write to the original data 387 * subdisk. Thus we need to read in all valid stripes, 388 * recalculate the parity from the original data, and then 389 * write the parity stripe back out. 390 */ 391 if (type == REQ_TYPE_DEGRADED) { 392 /* Read all subdisks. */ 393 LIST_FOREACH(s, &p->subdisks, in_plex) { 394 /* Skip the broken and the parity subdisk. */ 395 if ((s == broken) || (s == parity)) 396 continue; 397 398 cbp = g_clone_bio(bp); 399 if (cbp == NULL) 400 return (ENOMEM); 401 cbp->bio_cmd = BIO_READ; 402 cbp->bio_data = g_malloc(real_len, M_WAITOK); 403 cbp->bio_cflags |= GV_BIO_MALLOC; 404 cbp->bio_offset = real_off; 405 cbp->bio_length = real_len; 406 cbp->bio_done = gv_plex_done; 407 cbp->bio_caller2 = s->consumer; 408 cbp->bio_driver1 = wp; 409 410 GV_ENQUEUE(bp, cbp, pbp); 411 412 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 413 bq->bp = cbp; 414 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 415 } 416 417 /* Write the parity data. */ 418 cbp = g_clone_bio(bp); 419 if (cbp == NULL) 420 return (ENOMEM); 421 cbp->bio_data = g_malloc(real_len, M_WAITOK); 422 cbp->bio_cflags |= GV_BIO_MALLOC; 423 bcopy(addr, cbp->bio_data, real_len); 424 cbp->bio_offset = real_off; 425 cbp->bio_length = real_len; 426 cbp->bio_done = gv_plex_done; 427 cbp->bio_caller2 = parity->consumer; 428 cbp->bio_driver1 = wp; 429 wp->parity = cbp; 430 431 /* 432 * When the parity stripe is missing we just write out the data. 433 */ 434 } else if (type == REQ_TYPE_NOPARITY) { 435 cbp = g_clone_bio(bp); 436 if (cbp == NULL) 437 return (ENOMEM); 438 cbp->bio_offset = real_off; 439 cbp->bio_length = real_len; 440 cbp->bio_data = addr; 441 cbp->bio_done = gv_plex_done; 442 cbp->bio_caller2 = original->consumer; 443 cbp->bio_driver1 = wp; 444 445 GV_ENQUEUE(bp, cbp, pbp); 446 447 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 448 bq->bp = cbp; 449 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 450 451 /* 452 * A normal write request goes to the original subdisk, then we 453 * read in all other stripes, recalculate the parity and write 454 * out the parity again. 455 */ 456 } else { 457 /* Read old parity. */ 458 cbp = g_clone_bio(bp); 459 if (cbp == NULL) 460 return (ENOMEM); 461 cbp->bio_cmd = BIO_READ; 462 cbp->bio_data = g_malloc(real_len, M_WAITOK); 463 cbp->bio_cflags |= GV_BIO_MALLOC; 464 cbp->bio_offset = real_off; 465 cbp->bio_length = real_len; 466 cbp->bio_done = gv_plex_done; 467 cbp->bio_caller2 = parity->consumer; 468 cbp->bio_driver1 = wp; 469 470 GV_ENQUEUE(bp, cbp, pbp); 471 472 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 473 bq->bp = cbp; 474 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 475 476 /* Read old data. */ 477 cbp = g_clone_bio(bp); 478 if (cbp == NULL) 479 return (ENOMEM); 480 cbp->bio_cmd = BIO_READ; 481 cbp->bio_data = g_malloc(real_len, M_WAITOK); 482 cbp->bio_cflags |= GV_BIO_MALLOC; 483 cbp->bio_offset = real_off; 484 cbp->bio_length = real_len; 485 cbp->bio_done = gv_plex_done; 486 cbp->bio_caller2 = original->consumer; 487 cbp->bio_driver1 = wp; 488 489 GV_ENQUEUE(bp, cbp, pbp); 490 491 bq = g_malloc(sizeof(*bq), M_WAITOK | M_ZERO); 492 bq->bp = cbp; 493 TAILQ_INSERT_TAIL(&wp->bits, bq, queue); 494 495 /* Write new data. */ 496 cbp = g_clone_bio(bp); 497 if (cbp == NULL) 498 return (ENOMEM); 499 cbp->bio_data = addr; 500 cbp->bio_offset = real_off; 501 cbp->bio_length = real_len; 502 cbp->bio_done = gv_plex_done; 503 cbp->bio_caller2 = original->consumer; 504 505 cbp->bio_driver1 = wp; 506 507 /* 508 * We must not write the new data until the old data 509 * was read, so hold this BIO back until we're ready 510 * for it. 511 */ 512 wp->waiting = cbp; 513 514 /* The final bio for the parity. */ 515 cbp = g_clone_bio(bp); 516 if (cbp == NULL) 517 return (ENOMEM); 518 cbp->bio_data = g_malloc(real_len, M_WAITOK | M_ZERO); 519 cbp->bio_cflags |= GV_BIO_MALLOC; 520 cbp->bio_offset = real_off; 521 cbp->bio_length = real_len; 522 cbp->bio_done = gv_plex_done; 523 cbp->bio_caller2 = parity->consumer; 524 cbp->bio_driver1 = wp; 525 526 /* Remember that this is the BIO for the parity data. */ 527 wp->parity = cbp; 528 } 529 break; 530 531 default: 532 return (EINVAL); 533 } 534 535 return (0); 536 } 537 538 /* Calculate the offsets in the various subdisks for a RAID5 request. */ 539 int 540 gv_raid5_offset(struct gv_plex *p, off_t boff, off_t bcount, off_t *real_off, 541 off_t *real_len, int *sdno, int *psdno) 542 { 543 int sd, psd; 544 off_t len_left, stripeend, stripeoff, stripestart; 545 546 /* The number of the subdisk containing the parity stripe. */ 547 psd = p->sdcount - 1 - ( boff / (p->stripesize * (p->sdcount - 1))) % 548 p->sdcount; 549 KASSERT(psdno >= 0, ("gv_raid5_offset: psdno < 0")); 550 551 /* Offset of the start address from the start of the stripe. */ 552 stripeoff = boff % (p->stripesize * (p->sdcount - 1)); 553 KASSERT(stripeoff >= 0, ("gv_raid5_offset: stripeoff < 0")); 554 555 /* The number of the subdisk where the stripe resides. */ 556 sd = stripeoff / p->stripesize; 557 KASSERT(sdno >= 0, ("gv_raid5_offset: sdno < 0")); 558 559 /* At or past parity subdisk. */ 560 if (sd >= psd) 561 sd++; 562 563 /* The offset of the stripe on this subdisk. */ 564 stripestart = (boff - stripeoff) / (p->sdcount - 1); 565 KASSERT(stripestart >= 0, ("gv_raid5_offset: stripestart < 0")); 566 567 stripeoff %= p->stripesize; 568 569 /* The offset of the request on this subdisk. */ 570 *real_off = stripestart + stripeoff; 571 572 stripeend = stripestart + p->stripesize; 573 len_left = stripeend - *real_off; 574 KASSERT(len_left >= 0, ("gv_raid5_offset: len_left < 0")); 575 576 *real_len = (bcount <= len_left) ? bcount : len_left; 577 578 if (sdno != NULL) 579 *sdno = sd; 580 if (psdno != NULL) 581 *psdno = psd; 582 583 return (0); 584 } 585