1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include <sys/zfs_context.h> 29 #include <sys/spa.h> 30 #include <sys/vdev_impl.h> 31 #include <sys/zio.h> 32 #include <sys/zio_checksum.h> 33 #include <sys/fs/zfs.h> 34 #include <sys/fm/fs/zfs.h> 35 36 /* 37 * Virtual device vector for RAID-Z. 38 */ 39 40 /* 41 * We currently allow up to two-way replication (i.e. single-fault 42 * reconstruction) models in RAID-Z vdevs. The blocks in such vdevs 43 * must all be multiples of two times the leaf vdev blocksize. 44 */ 45 #define VDEV_RAIDZ_ALIGN 2ULL 46 47 typedef struct raidz_col { 48 uint64_t rc_col; 49 uint64_t rc_offset; 50 uint64_t rc_size; 51 void *rc_data; 52 int rc_error; 53 short rc_tried; 54 short rc_skipped; 55 } raidz_col_t; 56 57 typedef struct raidz_map { 58 uint64_t rm_cols; 59 uint64_t rm_bigcols; 60 uint64_t rm_asize; 61 int rm_missing_child; 62 int rm_firstdatacol; 63 raidz_col_t rm_col[1]; 64 } raidz_map_t; 65 66 static raidz_map_t * 67 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols) 68 { 69 raidz_map_t *rm; 70 uint64_t b = zio->io_offset >> unit_shift; 71 uint64_t s = zio->io_size >> unit_shift; 72 uint64_t f = b % dcols; 73 uint64_t o = (b / dcols) << unit_shift; 74 uint64_t q, r, c, bc, col, acols, coff; 75 int firstdatacol; 76 77 q = s / (dcols - 1); 78 r = s - q * (dcols - 1); 79 bc = r + !!r; 80 firstdatacol = 1; 81 82 acols = (q == 0 ? bc : dcols); 83 84 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP); 85 86 rm->rm_cols = acols; 87 rm->rm_bigcols = bc; 88 rm->rm_asize = 0; 89 rm->rm_missing_child = -1; 90 rm->rm_firstdatacol = firstdatacol; 91 92 for (c = 0; c < acols; c++) { 93 col = f + c; 94 coff = o; 95 if (col >= dcols) { 96 col -= dcols; 97 coff += 1ULL << unit_shift; 98 } 99 rm->rm_col[c].rc_col = col; 100 rm->rm_col[c].rc_offset = coff; 101 rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift; 102 rm->rm_col[c].rc_data = NULL; 103 rm->rm_col[c].rc_error = 0; 104 rm->rm_col[c].rc_tried = 0; 105 rm->rm_col[c].rc_skipped = 0; 106 rm->rm_asize += rm->rm_col[c].rc_size; 107 } 108 109 rm->rm_asize = P2ROUNDUP(rm->rm_asize, VDEV_RAIDZ_ALIGN << unit_shift); 110 111 for (c = 0; c < rm->rm_firstdatacol; c++) 112 rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); 113 114 rm->rm_col[c].rc_data = zio->io_data; 115 116 for (c = c + 1; c < acols; c++) 117 rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + 118 rm->rm_col[c - 1].rc_size; 119 120 /* 121 * To prevent hot parity disks, switch the parity and data 122 * columns every 1MB. 123 */ 124 ASSERT(rm->rm_cols >= 2); 125 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 126 127 if (zio->io_offset & (1ULL << 20)) { 128 col = rm->rm_col[0].rc_col; 129 o = rm->rm_col[0].rc_offset; 130 rm->rm_col[0].rc_col = rm->rm_col[1].rc_col; 131 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 132 rm->rm_col[1].rc_col = col; 133 rm->rm_col[1].rc_offset = o; 134 } 135 136 zio->io_vsd = rm; 137 return (rm); 138 } 139 140 static void 141 vdev_raidz_map_free(zio_t *zio) 142 { 143 raidz_map_t *rm = zio->io_vsd; 144 int c; 145 146 for (c = 0; c < rm->rm_firstdatacol; c++) 147 zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); 148 149 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols])); 150 zio->io_vsd = NULL; 151 } 152 153 static void 154 vdev_raidz_reconstruct(raidz_map_t *rm, int x) 155 { 156 uint64_t *dst, *src, count, xsize, csize; 157 int i, c; 158 159 for (c = 0; c < rm->rm_cols; c++) { 160 if (c == x) 161 continue; 162 src = rm->rm_col[c].rc_data; 163 dst = rm->rm_col[x].rc_data; 164 csize = rm->rm_col[c].rc_size; 165 xsize = rm->rm_col[x].rc_size; 166 count = MIN(csize, xsize) / sizeof (uint64_t); 167 if (c == !x) { 168 /* 169 * The initial copy happens at either c == 0 or c == 1. 170 * Both of these columns are 'big' columns, so we'll 171 * definitely initialize all of column x. 172 */ 173 ASSERT3U(xsize, <=, csize); 174 for (i = 0; i < count; i++) 175 *dst++ = *src++; 176 } else { 177 for (i = 0; i < count; i++) 178 *dst++ ^= *src++; 179 } 180 } 181 } 182 183 static int 184 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) 185 { 186 vdev_t *cvd; 187 int c, error; 188 int lasterror = 0; 189 int numerrors = 0; 190 191 /* 192 * XXX -- minimum children should be raid-type-specific 193 */ 194 if (vd->vdev_children < 2) { 195 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 196 return (EINVAL); 197 } 198 199 for (c = 0; c < vd->vdev_children; c++) { 200 cvd = vd->vdev_child[c]; 201 202 if ((error = vdev_open(cvd)) != 0) { 203 lasterror = error; 204 numerrors++; 205 continue; 206 } 207 208 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; 209 *ashift = MAX(*ashift, cvd->vdev_ashift); 210 } 211 212 *asize *= vd->vdev_children; 213 214 if (numerrors > 1) { 215 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; 216 return (lasterror); 217 } 218 219 return (0); 220 } 221 222 static void 223 vdev_raidz_close(vdev_t *vd) 224 { 225 int c; 226 227 for (c = 0; c < vd->vdev_children; c++) 228 vdev_close(vd->vdev_child[c]); 229 } 230 231 static uint64_t 232 vdev_raidz_asize(vdev_t *vd, uint64_t psize) 233 { 234 uint64_t asize; 235 uint64_t ashift = vd->vdev_top->vdev_ashift; 236 uint64_t cols = vd->vdev_children; 237 238 asize = ((psize - 1) >> ashift) + 1; 239 asize += (asize + cols - 2) / (cols - 1); 240 asize = P2ROUNDUP(asize, VDEV_RAIDZ_ALIGN) << ashift; 241 242 return (asize); 243 } 244 245 static void 246 vdev_raidz_child_done(zio_t *zio) 247 { 248 raidz_col_t *rc = zio->io_private; 249 250 rc->rc_error = zio->io_error; 251 rc->rc_tried = 1; 252 rc->rc_skipped = 0; 253 } 254 255 static void 256 vdev_raidz_repair_done(zio_t *zio) 257 { 258 ASSERT(zio->io_private == zio->io_parent); 259 vdev_raidz_map_free(zio->io_private); 260 } 261 262 static void 263 vdev_raidz_io_start(zio_t *zio) 264 { 265 vdev_t *vd = zio->io_vd; 266 vdev_t *tvd = vd->vdev_top; 267 vdev_t *cvd; 268 blkptr_t *bp = zio->io_bp; 269 raidz_map_t *rm; 270 raidz_col_t *rc; 271 int c; 272 273 rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children); 274 275 if (DVA_GET_GANG(ZIO_GET_DVA(zio))) { 276 ASSERT3U(rm->rm_asize, ==, 277 vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE)); 278 } else { 279 ASSERT3U(rm->rm_asize, ==, DVA_GET_ASIZE(ZIO_GET_DVA(zio))); 280 } 281 282 if (zio->io_type == ZIO_TYPE_WRITE) { 283 284 /* 285 * Generate RAID parity in virtual column 0. 286 */ 287 vdev_raidz_reconstruct(rm, 0); 288 289 for (c = 0; c < rm->rm_cols; c++) { 290 rc = &rm->rm_col[c]; 291 cvd = vd->vdev_child[rc->rc_col]; 292 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 293 rc->rc_offset, rc->rc_data, rc->rc_size, 294 zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, 295 vdev_raidz_child_done, rc)); 296 } 297 zio_wait_children_done(zio); 298 return; 299 } 300 301 ASSERT(zio->io_type == ZIO_TYPE_READ); 302 303 for (c = rm->rm_cols - 1; c >= 0; c--) { 304 rc = &rm->rm_col[c]; 305 cvd = vd->vdev_child[rc->rc_col]; 306 if (vdev_is_dead(cvd)) { 307 rm->rm_missing_child = c; 308 rc->rc_error = ENXIO; 309 rc->rc_tried = 1; /* don't even try */ 310 rc->rc_skipped = 1; 311 continue; 312 } 313 if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) { 314 rm->rm_missing_child = c; 315 rc->rc_error = ESTALE; 316 rc->rc_skipped = 1; 317 continue; 318 } 319 if (c >= rm->rm_firstdatacol || rm->rm_missing_child != -1 || 320 (zio->io_flags & ZIO_FLAG_SCRUB)) { 321 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, 322 rc->rc_offset, rc->rc_data, rc->rc_size, 323 zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, 324 vdev_raidz_child_done, rc)); 325 } 326 } 327 328 zio_wait_children_done(zio); 329 } 330 331 /* 332 * Report a checksum error for a child of a RAID-Z device. 333 */ 334 static void 335 raidz_checksum_error(zio_t *zio, raidz_col_t *rc) 336 { 337 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_col]; 338 dprintf_bp(zio->io_bp, "imputed checksum error on %s: ", 339 vdev_description(vd)); 340 341 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 342 mutex_enter(&vd->vdev_stat_lock); 343 vd->vdev_stat.vs_checksum_errors++; 344 mutex_exit(&vd->vdev_stat_lock); 345 } 346 347 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) 348 zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, 349 zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size); 350 } 351 352 353 static void 354 vdev_raidz_io_done(zio_t *zio) 355 { 356 vdev_t *vd = zio->io_vd; 357 vdev_t *cvd; 358 raidz_map_t *rm = zio->io_vsd; 359 raidz_col_t *rc; 360 blkptr_t *bp = zio->io_bp; 361 int unexpected_errors = 0; 362 int c; 363 364 ASSERT(bp != NULL); /* XXX need to add code to enforce this */ 365 366 zio->io_error = 0; 367 zio->io_numerrors = 0; 368 369 for (c = 0; c < rm->rm_cols; c++) { 370 rc = &rm->rm_col[c]; 371 372 /* 373 * We preserve any EIOs because those may be worth retrying; 374 * whereas ECKSUM and ENXIO are more likely to be persistent. 375 */ 376 if (rc->rc_error) { 377 if (zio->io_error != EIO) 378 zio->io_error = rc->rc_error; 379 if (!rc->rc_skipped) 380 unexpected_errors++; 381 zio->io_numerrors++; 382 } 383 } 384 385 if (zio->io_type == ZIO_TYPE_WRITE) { 386 /* 387 * If this is not a failfast write, and we were able to 388 * write enough columns to reconstruct the data, good enough. 389 */ 390 /* XXPOLICY */ 391 if (zio->io_numerrors <= rm->rm_firstdatacol && 392 !(zio->io_flags & ZIO_FLAG_FAILFAST)) 393 zio->io_error = 0; 394 395 vdev_raidz_map_free(zio); 396 zio_next_stage(zio); 397 return; 398 } 399 400 ASSERT(zio->io_type == ZIO_TYPE_READ); 401 402 /* 403 * If there were no I/O errors, and the data checksums correctly, 404 * the read is complete. 405 */ 406 /* XXPOLICY */ 407 if (zio->io_numerrors == 0 && zio_checksum_error(zio) == 0) { 408 ASSERT(unexpected_errors == 0); 409 ASSERT(zio->io_error == 0); 410 411 /* 412 * We know the data's good. If we read the parity, 413 * verify that it's good as well. If not, fix it. 414 */ 415 for (c = 0; c < rm->rm_firstdatacol; c++) { 416 void *orig; 417 rc = &rm->rm_col[c]; 418 if (!rc->rc_tried) 419 continue; 420 orig = zio_buf_alloc(rc->rc_size); 421 bcopy(rc->rc_data, orig, rc->rc_size); 422 vdev_raidz_reconstruct(rm, c); 423 if (bcmp(orig, rc->rc_data, rc->rc_size) != 0) { 424 raidz_checksum_error(zio, rc); 425 rc->rc_error = ECKSUM; 426 unexpected_errors++; 427 } 428 zio_buf_free(orig, rc->rc_size); 429 } 430 goto done; 431 } 432 433 /* 434 * If there was exactly one I/O error, it's the one we expected, 435 * and the reconstructed data checksums, the read is complete. 436 * This happens when one child is offline and vdev_fault_assess() 437 * knows it, or when one child has stale data and the DTL knows it. 438 */ 439 if (zio->io_numerrors == 1 && (c = rm->rm_missing_child) != -1) { 440 rc = &rm->rm_col[c]; 441 ASSERT(unexpected_errors == 0); 442 ASSERT(rc->rc_error == ENXIO || rc->rc_error == ESTALE); 443 vdev_raidz_reconstruct(rm, c); 444 if (zio_checksum_error(zio) == 0) { 445 zio->io_error = 0; 446 goto done; 447 } 448 } 449 450 /* 451 * This isn't a typical error -- either we got a read error or 452 * more than one child claimed a problem. Read every block we 453 * haven't already so we can try combinatorial reconstruction. 454 */ 455 unexpected_errors = 1; 456 rm->rm_missing_child = -1; 457 458 for (c = 0; c < rm->rm_cols; c++) 459 if (!rm->rm_col[c].rc_tried) 460 break; 461 462 if (c != rm->rm_cols) { 463 zio->io_error = 0; 464 zio_vdev_io_redone(zio); 465 for (c = 0; c < rm->rm_cols; c++) { 466 rc = &rm->rm_col[c]; 467 if (rc->rc_tried) 468 continue; 469 zio_nowait(zio_vdev_child_io(zio, NULL, 470 vd->vdev_child[rc->rc_col], 471 rc->rc_offset, rc->rc_data, rc->rc_size, 472 zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, 473 vdev_raidz_child_done, rc)); 474 } 475 zio_wait_children_done(zio); 476 return; 477 } 478 479 /* 480 * If there were more errors than parity disks, give up. 481 */ 482 if (zio->io_numerrors > rm->rm_firstdatacol) { 483 ASSERT(zio->io_error != 0); 484 goto done; 485 } 486 487 /* 488 * The number of I/O errors is correctable. Correct them here. 489 */ 490 ASSERT(zio->io_numerrors <= rm->rm_firstdatacol); 491 for (c = 0; c < rm->rm_cols; c++) { 492 rc = &rm->rm_col[c]; 493 ASSERT(rc->rc_tried); 494 if (rc->rc_error) { 495 vdev_raidz_reconstruct(rm, c); 496 if (zio_checksum_error(zio) == 0) 497 zio->io_error = 0; 498 else 499 zio->io_error = rc->rc_error; 500 goto done; 501 } 502 } 503 504 /* 505 * There were no I/O errors, but the data doesn't checksum. 506 * Try all permutations to see if we can find one that does. 507 */ 508 ASSERT(zio->io_numerrors == 0); 509 for (c = 0; c < rm->rm_cols; c++) { 510 void *orig; 511 rc = &rm->rm_col[c]; 512 513 orig = zio_buf_alloc(rc->rc_size); 514 bcopy(rc->rc_data, orig, rc->rc_size); 515 vdev_raidz_reconstruct(rm, c); 516 517 if (zio_checksum_error(zio) == 0) { 518 zio_buf_free(orig, rc->rc_size); 519 zio->io_error = 0; 520 /* 521 * If this child didn't know that it returned bad data, 522 * inform it. 523 */ 524 if (rc->rc_tried && rc->rc_error == 0) 525 raidz_checksum_error(zio, rc); 526 rc->rc_error = ECKSUM; 527 goto done; 528 } 529 530 bcopy(orig, rc->rc_data, rc->rc_size); 531 zio_buf_free(orig, rc->rc_size); 532 } 533 534 /* 535 * All combinations failed to checksum. Generate checksum ereports for 536 * every one. 537 */ 538 zio->io_error = ECKSUM; 539 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { 540 for (c = 0; c < rm->rm_cols; c++) { 541 rc = &rm->rm_col[c]; 542 zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, 543 zio->io_spa, vd->vdev_child[rc->rc_col], zio, 544 rc->rc_offset, rc->rc_size); 545 } 546 } 547 548 done: 549 zio_checksum_verified(zio); 550 551 if (zio->io_error == 0 && (spa_mode & FWRITE) && 552 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { 553 zio_t *rio; 554 555 /* 556 * Use the good data we have in hand to repair damaged children. 557 * 558 * We issue all repair I/Os as children of 'rio' to arrange 559 * that vdev_raidz_map_free(zio) will be invoked after all 560 * repairs complete, but before we advance to the next stage. 561 */ 562 rio = zio_null(zio, zio->io_spa, 563 vdev_raidz_repair_done, zio, ZIO_FLAG_CANFAIL); 564 565 for (c = 0; c < rm->rm_cols; c++) { 566 rc = &rm->rm_col[c]; 567 cvd = vd->vdev_child[rc->rc_col]; 568 569 if (rc->rc_error == 0) 570 continue; 571 572 dprintf("%s resilvered %s @ 0x%llx error %d\n", 573 vdev_description(vd), 574 vdev_description(cvd), 575 zio->io_offset, rc->rc_error); 576 577 zio_nowait(zio_vdev_child_io(rio, NULL, cvd, 578 rc->rc_offset, rc->rc_data, rc->rc_size, 579 ZIO_TYPE_WRITE, zio->io_priority, 580 ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL | 581 ZIO_FLAG_DONT_PROPAGATE, NULL, NULL)); 582 } 583 584 zio_nowait(rio); 585 zio_wait_children_done(zio); 586 return; 587 } 588 589 vdev_raidz_map_free(zio); 590 zio_next_stage(zio); 591 } 592 593 static void 594 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) 595 { 596 if (faulted > 1) 597 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 598 VDEV_AUX_NO_REPLICAS); 599 else if (degraded + faulted != 0) 600 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); 601 else 602 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); 603 } 604 605 vdev_ops_t vdev_raidz_ops = { 606 vdev_raidz_open, 607 vdev_raidz_close, 608 vdev_raidz_asize, 609 vdev_raidz_io_start, 610 vdev_raidz_io_done, 611 vdev_raidz_state_change, 612 VDEV_TYPE_RAIDZ, /* name of this vdev type */ 613 B_FALSE /* not a leaf vdev */ 614 }; 615