/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/fs/zfs.h>

/*
 * Virtual device vector for RAID-Z.
 */

/*
 * We currently allow up to two-way replication (i.e. single-fault
 * reconstruction) models in RAID-Z vdevs.  The blocks in such vdevs
 * must all be multiples of two times the leaf vdev blocksize.
 */
#define	VDEV_RAIDZ_ALIGN	2ULL

typedef struct raidz_col {
	uint64_t rc_col;		/* child vdev this column maps to */
	uint64_t rc_offset;		/* device offset of this column's data */
	uint64_t rc_size;		/* size of this column's data */
	void *rc_data;			/* I/O buffer for this column */
	int rc_error;			/* last I/O error for this column */
	short rc_tried;			/* whether I/O has been issued */
	short rc_skipped;		/* whether the I/O was skipped */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* number of columns in this map */
	uint64_t rm_bigcols;		/* columns carrying one extra unit */
	uint64_t rm_asize;		/* allocated size, after alignment */
	int rm_missing_child;		/* column with known-bad child, or -1 */
	int rm_type;			/* RAIDZ_SINGLE or RAIDZ_PARITY */
	int rm_firstdatacol;		/* index of the first data column */
	raidz_col_t rm_col[1];		/* flexible array of rm_cols columns */
} raidz_map_t;

#define	RAIDZ_SINGLE	0
#define	RAIDZ_PARITY	1

static raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
    int raid_type)
{
	raidz_map_t *rm;
	uint64_t b = zio->io_offset >> unit_shift;
	uint64_t s = zio->io_size >> unit_shift;
	uint64_t f = b % dcols;
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, coff;
	int firstdatacol;

	switch (raid_type) {
	case RAIDZ_SINGLE:
		q = s / dcols;
		r = s - q * dcols;
		bc = r;
		firstdatacol = 0;
		break;
	case RAIDZ_PARITY:
		q = s / (dcols - 1);
		r = s - q * (dcols - 1);
		bc = r + !!r;
		firstdatacol = 1;
		break;
	}

	acols = (q == 0 ? bc : dcols);
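
	/*
	 * A worked example with illustrative (hypothetical) values: for
	 * a 5-wide RAIDZ_PARITY vdev with unit_shift = 9 (512-byte
	 * units), a 3072-byte I/O has s = 6 units, so q = 6 / 4 = 1,
	 * r = 2, bc = 3, and acols = 5.  Columns 0-2 (parity plus two
	 * data columns) each get q + 1 = 2 units; columns 3-4 get 1
	 * unit.  That totals 8 units, already a multiple of
	 * VDEV_RAIDZ_ALIGN, so the P2ROUNDUP below leaves it unchanged.
	 */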

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_bigcols = bc;
	rm->rm_asize = 0;
	rm->rm_missing_child = -1;
	rm->rm_type = raid_type;
	rm->rm_firstdatacol = firstdatacol;

	for (c = 0; c < acols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_col = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;
		rm->rm_asize += rm->rm_col[c].rc_size;
	}

	rm->rm_asize = P2ROUNDUP(rm->rm_asize, VDEV_RAIDZ_ALIGN << unit_shift);

	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);

	rm->rm_col[c].rc_data = zio->io_data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	if (raid_type == RAIDZ_PARITY) {
		/*
		 * To prevent hot parity disks, switch the parity and data
		 * columns every 1MB.
		 */
		ASSERT(rm->rm_cols >= 2);
		ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

		if (zio->io_offset & (1ULL << 20)) {
			col = rm->rm_col[0].rc_col;
			o = rm->rm_col[0].rc_offset;
			rm->rm_col[0].rc_col = rm->rm_col[1].rc_col;
			rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
			rm->rm_col[1].rc_col = col;
			rm->rm_col[1].rc_offset = o;
		}
	}

	zio->io_vsd = rm;
	return (rm);
}

static void
vdev_raidz_map_free(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;
	int c;

	for (c = 0; c < rm->rm_firstdatacol; c++)
		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);

	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
	zio->io_vsd = NULL;
}

static void
vdev_raidz_reconstruct(raidz_map_t *rm, int x)
{
	uint64_t *dst, *src, count, xsize, csize;
	int i, c;

	for (c = 0; c < rm->rm_cols; c++) {
		if (c == x)
			continue;
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;
		csize = rm->rm_col[c].rc_size;
		xsize = rm->rm_col[x].rc_size;
		count = MIN(csize, xsize) / sizeof (uint64_t);
		if (c == !x) {
			/*
			 * The initial copy happens at either c == 0 or c == 1.
			 * Both of these columns are 'big' columns, so we'll
			 * definitely initialize all of column x.
			 */
			ASSERT3U(xsize, <=, csize);
			for (i = 0; i < count; i++)
				*dst++ = *src++;
		} else {
			for (i = 0; i < count; i++)
				*dst++ ^= *src++;
		}
	}
}
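
/*
 * A sketch of why the XOR above suffices, with illustrative column
 * names: with one parity column P and data columns D0..D2, the write
 * path computes
 *
 *	P = D0 ^ D1 ^ D2
 *
 * and if, say, D1 is later lost, vdev_raidz_reconstruct(rm, 1)
 * recomputes it as
 *
 *	D1 = P ^ D0 ^ D2
 *
 * because XOR is associative, commutative, and its own inverse.
 */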

static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
{
	vdev_t *cvd;
	int c, error;
	int lasterror = 0;
	int numerrors = 0;

	/*
	 * XXX -- minimum children should be raid-type-specific
	 */
	if (vd->vdev_children < 2) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (EINVAL);
	}

	for (c = 0; c < vd->vdev_children; c++) {
		cvd = vd->vdev_child[c];

		if ((error = vdev_open(cvd)) != 0) {
			lasterror = error;
			numerrors++;
			continue;
		}

		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*ashift = cvd->vdev_ashift;
	}

	*asize *= vd->vdev_children;

	if (numerrors > 1) {
		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}

static void
vdev_raidz_close(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_close(vd->vdev_child[c]);
}

static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize;
	uint64_t cols = vd->vdev_children;

	/*
	 * These calculations assume RAIDZ_PARITY.
	 */
	asize = psize >> vd->vdev_ashift;
	asize += (asize + cols - 2) / (cols - 1);
	asize = P2ROUNDUP(asize, VDEV_RAIDZ_ALIGN) << vd->vdev_ashift;

	return (asize);
}
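
/*
 * Worked example with illustrative values: psize = 3072, ashift = 9,
 * cols = 5.  Then asize starts at 6 data units; parity adds
 * (6 + 3) / 4 = 2 units, for 8 total; P2ROUNDUP(8, 2) leaves 8, so
 * the function returns 8 << 9 = 4096 bytes -- matching the rm_asize
 * that vdev_raidz_map_alloc() computes for the same I/O.
 */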

static void
vdev_raidz_child_done(zio_t *zio)
{
	raidz_col_t *rc = zio->io_private;

	rc->rc_error = zio->io_error;
	rc->rc_tried = 1;
	rc->rc_skipped = 0;
}

static void
vdev_raidz_repair_done(zio_t *zio)
{
	zio_buf_free(zio->io_data, zio->io_size);
}

static void
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	blkptr_t *bp = zio->io_bp;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c;

	rm = vdev_raidz_map_alloc(zio, vd->vdev_ashift, vd->vdev_children,
	    RAIDZ_PARITY);

	if (DVA_GET_GANG(ZIO_GET_DVA(zio))) {
		ASSERT3U(rm->rm_asize, ==,
		    vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE));
		ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
	} else {
		ASSERT3U(rm->rm_asize, ==, DVA_GET_ASIZE(ZIO_GET_DVA(zio)));
		ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {

		/*
		 * Generate RAID parity in virtual column 0.
		 */
		vdev_raidz_reconstruct(rm, 0);

		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_col];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
			    vdev_raidz_child_done, rc));
		}
		zio_wait_children_done(zio);
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_col];
		if (vdev_is_dead(cvd)) {
			rm->rm_missing_child = c;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
			rm->rm_missing_child = c;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
		if (c >= rm->rm_firstdatacol || rm->rm_missing_child != -1 ||
		    (zio->io_flags & ZIO_FLAG_SCRUB)) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
			    vdev_raidz_child_done, rc));
		}
	}

	zio_wait_children_done(zio);
}
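
/*
 * The read completion logic below escalates in cost.  First, if no
 * child failed and the data checksums, optionally verify and repair
 * the parity we happened to read.  Second, if exactly the one
 * expected child failed, reconstruct it and re-verify.  Third, if
 * some columns were never read (e.g. parity on a clean read), issue
 * those reads and retry.  Fourth, if the error count is within what
 * parity can correct, reconstruct the failed column.  Finally, with
 * no I/O errors but a bad checksum, reconstruct each column in turn
 * until some combination checksums, identifying a silently corrupt
 * child.
 */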

static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	blkptr_t *bp = zio->io_bp;
	int unexpected_errors = 0;
	int c;

	ASSERT(bp != NULL);	/* XXX need to add code to enforce this */

	zio->io_error = 0;
	zio->io_numerrors = 0;

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		/*
		 * We preserve any EIOs because those may be worth retrying;
		 * whereas ECKSUM and ENXIO are more likely to be persistent.
		 */
		if (rc->rc_error) {
			if (zio->io_error != EIO)
				zio->io_error = rc->rc_error;
			if (!rc->rc_skipped)
				unexpected_errors++;
			zio->io_numerrors++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * If this is not a failfast write, and we were able to
		 * write enough columns to reconstruct the data, good enough.
		 */
		/* XXPOLICY */
		if (zio->io_numerrors <= rm->rm_firstdatacol &&
		    !(zio->io_flags & ZIO_FLAG_FAILFAST))
			zio->io_error = 0;

		vdev_raidz_map_free(zio);
		zio_next_stage(zio);
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * If there were no I/O errors, and the data checksums correctly,
	 * the read is complete.
	 */
	/* XXPOLICY */
	if (zio->io_numerrors == 0 && zio_checksum_error(zio) == 0) {
		ASSERT(unexpected_errors == 0);
		ASSERT(zio->io_error == 0);

		/*
		 * We know the data's good.  If we read the parity,
		 * verify that it's good as well.  If not, fix it.
		 */
		for (c = 0; c < rm->rm_firstdatacol; c++) {
			void *orig;
			rc = &rm->rm_col[c];
			if (!rc->rc_tried)
				continue;
			orig = zio_buf_alloc(rc->rc_size);
			bcopy(rc->rc_data, orig, rc->rc_size);
			vdev_raidz_reconstruct(rm, c);
			if (bcmp(orig, rc->rc_data, rc->rc_size) != 0) {
				vdev_checksum_error(zio,
				    vd->vdev_child[rc->rc_col]);
				rc->rc_error = ECKSUM;
				unexpected_errors++;
			}
			zio_buf_free(orig, rc->rc_size);
		}
		goto done;
	}

	/*
	 * If there was exactly one I/O error, it's the one we expected,
	 * and the reconstructed data checksums, the read is complete.
	 * This happens when one child is offline and vdev_fault_assess()
	 * knows it, or when one child has stale data and the DTL knows it.
	 */
	if (zio->io_numerrors == 1 && (c = rm->rm_missing_child) != -1) {
		rc = &rm->rm_col[c];
		ASSERT(unexpected_errors == 0);
		ASSERT(rc->rc_error == ENXIO || rc->rc_error == ESTALE);
		vdev_raidz_reconstruct(rm, c);
		if (zio_checksum_error(zio) == 0) {
			zio->io_error = 0;
			goto done;
		}
	}

	/*
	 * This isn't a typical error -- either we got a read error or
	 * more than one child claimed a problem.  Read every block we
	 * haven't already so we can try combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missing_child = -1;

	for (c = 0; c < rm->rm_cols; c++)
		if (!rm->rm_col[c].rc_tried)
			break;

	if (c != rm->rm_cols) {
		zio->io_error = 0;
		zio_vdev_io_redone(zio);
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_col],
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
			    vdev_raidz_child_done, rc));
		}
		zio_wait_children_done(zio);
		return;
	}

	/*
	 * If there were more errors than parity disks, give up.
	 */
	if (zio->io_numerrors > rm->rm_firstdatacol) {
		ASSERT(zio->io_error != 0);
		goto done;
	}

	/*
	 * The number of I/O errors is correctable.  Correct them here.
	 */
	ASSERT(zio->io_numerrors <= rm->rm_firstdatacol);
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];
		ASSERT(rc->rc_tried);
		if (rc->rc_error) {
			vdev_raidz_reconstruct(rm, c);
			if (zio_checksum_error(zio) == 0)
				zio->io_error = 0;
			else
				zio->io_error = rc->rc_error;
			goto done;
		}
	}

	/*
	 * There were no I/O errors, but the data doesn't checksum.
	 * Try all permutations to see if we can find one that does.
	 */
	ASSERT(zio->io_numerrors == 0);
	for (c = 0; c < rm->rm_cols; c++) {
		void *orig;
		rc = &rm->rm_col[c];

		orig = zio_buf_alloc(rc->rc_size);
		bcopy(rc->rc_data, orig, rc->rc_size);
		vdev_raidz_reconstruct(rm, c);

		if (zio_checksum_error(zio) == 0) {
			zio_buf_free(orig, rc->rc_size);
			zio->io_error = 0;
			/*
			 * If this child didn't know that it returned bad data,
			 * inform it.
			 */
			if (rc->rc_tried && rc->rc_error == 0)
				vdev_checksum_error(zio,
				    vd->vdev_child[rc->rc_col]);
			rc->rc_error = ECKSUM;
			goto done;
		}

		bcopy(orig, rc->rc_data, rc->rc_size);
		zio_buf_free(orig, rc->rc_size);
	}

	/*
	 * All combinations failed to checksum.
	 */
	zio->io_error = ECKSUM;

done:
	zio_checksum_verified(zio);

	if (zio->io_error == 0 && (spa_mode & FWRITE) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_col];

			if (rc->rc_error) {
				/*
				 * Make a copy of the data because we're
				 * going to free the RAID-Z map below.
				 */
				void *data = zio_buf_alloc(rc->rc_size);
				bcopy(rc->rc_data, data, rc->rc_size);

				dprintf("%s resilvered %s @ 0x%llx error %d\n",
				    vdev_description(vd),
				    vdev_description(cvd),
				    zio->io_offset, rc->rc_error);

				zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
				    rc->rc_offset, data, rc->rc_size,
				    ZIO_TYPE_WRITE, zio->io_priority,
				    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL |
				    ZIO_FLAG_DONT_PROPAGATE,
				    vdev_raidz_repair_done, NULL));
			}
		}
	}

	vdev_raidz_map_free(zio);
	zio_next_stage(zio);
}
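
/*
 * With single-fault parity, one faulted child leaves the vdev degraded
 * but fully readable via reconstruction; two or more faulted children
 * exceed what one parity column can rebuild, so the vdev can't open.
 */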
static void
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
{
	if (faulted > 1)
		vdev_set_state(vd, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS);
	else if (degraded + faulted != 0)
		vdev_set_state(vd, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	else
		vdev_set_state(vd, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}

vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};