/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" #include #include #include #include #include #include #include /* * Virtual device vector for RAID-Z. */ /* * We currently allow up to two-way replication (i.e. single-fault * reconstruction) models in RAID-Z vdevs. The blocks in such vdevs * must all be multiples of two times the leaf vdev blocksize. 
*/ #define VDEV_RAIDZ_ALIGN 2ULL typedef struct raidz_col { uint64_t rc_col; uint64_t rc_offset; uint64_t rc_size; void *rc_data; int rc_error; short rc_tried; short rc_skipped; } raidz_col_t; typedef struct raidz_map { uint64_t rm_cols; uint64_t rm_bigcols; uint64_t rm_asize; int rm_missing_child; int rm_firstdatacol; raidz_col_t rm_col[1]; } raidz_map_t; static raidz_map_t * vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols) { raidz_map_t *rm; uint64_t b = zio->io_offset >> unit_shift; uint64_t s = zio->io_size >> unit_shift; uint64_t f = b % dcols; uint64_t o = (b / dcols) << unit_shift; uint64_t q, r, c, bc, col, acols, coff; int firstdatacol; q = s / (dcols - 1); r = s - q * (dcols - 1); bc = r + !!r; firstdatacol = 1; acols = (q == 0 ? bc : dcols); rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP); rm->rm_cols = acols; rm->rm_bigcols = bc; rm->rm_asize = 0; rm->rm_missing_child = -1; rm->rm_firstdatacol = firstdatacol; for (c = 0; c < acols; c++) { col = f + c; coff = o; if (col >= dcols) { col -= dcols; coff += 1ULL << unit_shift; } rm->rm_col[c].rc_col = col; rm->rm_col[c].rc_offset = coff; rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift; rm->rm_col[c].rc_data = NULL; rm->rm_col[c].rc_error = 0; rm->rm_col[c].rc_tried = 0; rm->rm_col[c].rc_skipped = 0; rm->rm_asize += rm->rm_col[c].rc_size; } rm->rm_asize = P2ROUNDUP(rm->rm_asize, VDEV_RAIDZ_ALIGN << unit_shift); for (c = 0; c < rm->rm_firstdatacol; c++) rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); rm->rm_col[c].rc_data = zio->io_data; for (c = c + 1; c < acols; c++) rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + rm->rm_col[c - 1].rc_size; /* * To prevent hot parity disks, switch the parity and data * columns every 1MB. 
*/ ASSERT(rm->rm_cols >= 2); ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); if (zio->io_offset & (1ULL << 20)) { col = rm->rm_col[0].rc_col; o = rm->rm_col[0].rc_offset; rm->rm_col[0].rc_col = rm->rm_col[1].rc_col; rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; rm->rm_col[1].rc_col = col; rm->rm_col[1].rc_offset = o; } zio->io_vsd = rm; return (rm); } static void vdev_raidz_map_free(zio_t *zio) { raidz_map_t *rm = zio->io_vsd; int c; for (c = 0; c < rm->rm_firstdatacol; c++) zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols])); zio->io_vsd = NULL; } static void vdev_raidz_reconstruct(raidz_map_t *rm, int x) { uint64_t *dst, *src, count, xsize, csize; int i, c; for (c = 0; c < rm->rm_cols; c++) { if (c == x) continue; src = rm->rm_col[c].rc_data; dst = rm->rm_col[x].rc_data; csize = rm->rm_col[c].rc_size; xsize = rm->rm_col[x].rc_size; count = MIN(csize, xsize) / sizeof (uint64_t); if (c == !x) { /* * The initial copy happens at either c == 0 or c == 1. * Both of these columns are 'big' columns, so we'll * definitely initialize all of column x. 
*/ ASSERT3U(xsize, <=, csize); for (i = 0; i < count; i++) *dst++ = *src++; } else { for (i = 0; i < count; i++) *dst++ ^= *src++; } } } static int vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) { vdev_t *cvd; int c, error; int lasterror = 0; int numerrors = 0; /* * XXX -- minimum children should be raid-type-specific */ if (vd->vdev_children < 2) { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return (EINVAL); } for (c = 0; c < vd->vdev_children; c++) { cvd = vd->vdev_child[c]; if ((error = vdev_open(cvd)) != 0) { lasterror = error; numerrors++; continue; } *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; *ashift = MAX(*ashift, cvd->vdev_ashift); } *asize *= vd->vdev_children; if (numerrors > 1) { vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; return (lasterror); } return (0); } static void vdev_raidz_close(vdev_t *vd) { int c; for (c = 0; c < vd->vdev_children; c++) vdev_close(vd->vdev_child[c]); } static uint64_t vdev_raidz_asize(vdev_t *vd, uint64_t psize) { uint64_t asize; uint64_t ashift = vd->vdev_top->vdev_ashift; uint64_t cols = vd->vdev_children; asize = ((psize - 1) >> ashift) + 1; asize += (asize + cols - 2) / (cols - 1); asize = P2ROUNDUP(asize, VDEV_RAIDZ_ALIGN) << ashift; return (asize); } static void vdev_raidz_child_done(zio_t *zio) { raidz_col_t *rc = zio->io_private; rc->rc_error = zio->io_error; rc->rc_tried = 1; rc->rc_skipped = 0; } static void vdev_raidz_repair_done(zio_t *zio) { ASSERT(zio->io_private == zio->io_parent); vdev_raidz_map_free(zio->io_private); } static void vdev_raidz_io_start(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_t *tvd = vd->vdev_top; vdev_t *cvd; blkptr_t *bp = zio->io_bp; raidz_map_t *rm; raidz_col_t *rc; int c; rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children); ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); if (zio->io_type == ZIO_TYPE_WRITE) { /* * Generate RAID parity in virtual column 0. 
*/ vdev_raidz_reconstruct(rm, 0); for (c = 0; c < rm->rm_cols; c++) { rc = &rm->rm_col[c]; cvd = vd->vdev_child[rc->rc_col]; zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_data, rc->rc_size, zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, vdev_raidz_child_done, rc)); } zio_wait_children_done(zio); return; } ASSERT(zio->io_type == ZIO_TYPE_READ); for (c = rm->rm_cols - 1; c >= 0; c--) { rc = &rm->rm_col[c]; cvd = vd->vdev_child[rc->rc_col]; if (vdev_is_dead(cvd)) { rm->rm_missing_child = c; rc->rc_error = ENXIO; rc->rc_tried = 1; /* don't even try */ rc->rc_skipped = 1; continue; } if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) { rm->rm_missing_child = c; rc->rc_error = ESTALE; rc->rc_skipped = 1; continue; } if (c >= rm->rm_firstdatacol || rm->rm_missing_child != -1 || (zio->io_flags & ZIO_FLAG_SCRUB)) { zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_data, rc->rc_size, zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, vdev_raidz_child_done, rc)); } } zio_wait_children_done(zio); } /* * Report a checksum error for a child of a RAID-Z device. 
*/
/*
 * 'rc' identifies the column whose child device is blamed.  A debug message
 * is always emitted; the child's checksum error counter is bumped and an
 * ereport is posted only when the zio is not speculative.
 */
static void
raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
{
	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_col];
	dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
	    vdev_description(vd));

	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_checksum_errors++;
		mutex_exit(&vd->vdev_stat_lock);
	}

	/* Same flag test as above; the ereport is posted outside the lock. */
	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
		zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
		    zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
}

/*
 * Completion handler for a RAID-Z I/O.
 *
 * Tallies the per-column errors, then:
 *   - for writes, clears the error when few enough columns failed (and the
 *     write is not failfast), frees the map, and moves on;
 *   - for reads, tries in turn: plain success, reconstruction of the one
 *     column known to be missing, re-reading every column not yet tried
 *     (in which case this function runs again later), reconstruction of
 *     a column that reported an error, and finally reconstruction of each
 *     column in turn until the checksum matches.
 *
 * At 'done', when the read succeeded and the pool is writable, columns that
 * still carry an error are rewritten if unexpected errors were seen or the
 * zio is a resilver.
 */
static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	int unexpected_errors = 0;
	int c;

	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */

	zio->io_error = 0;
	zio->io_numerrors = 0;

	/* Count column errors; skipped columns are errors we expected. */
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		/*
		 * We preserve any EIOs because those may be worth retrying;
		 * whereas ECKSUM and ENXIO are more likely to be persistent.
		 */
		if (rc->rc_error) {
			if (zio->io_error != EIO)
				zio->io_error = rc->rc_error;
			if (!rc->rc_skipped)
				unexpected_errors++;
			zio->io_numerrors++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * If this is not a failfast write, and we were able to
		 * write enough columns to reconstruct the data, good enough.
		 */
		/* XXPOLICY */
		if (zio->io_numerrors <= rm->rm_firstdatacol &&
		    !(zio->io_flags & ZIO_FLAG_FAILFAST))
			zio->io_error = 0;

		vdev_raidz_map_free(zio);
		zio_next_stage(zio);
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * If there were no I/O errors, and the data checksums correctly,
	 * the read is complete.
	 */
	/* XXPOLICY */
	if (zio->io_numerrors == 0 && zio_checksum_error(zio) == 0) {
		ASSERT(unexpected_errors == 0);
		ASSERT(zio->io_error == 0);

		/*
		 * We know the data's good.  If we read the parity,
		 * verify that it's good as well.  If not, fix it.
		 */
		for (c = 0; c < rm->rm_firstdatacol; c++) {
			void *orig;
			rc = &rm->rm_col[c];
			if (!rc->rc_tried)
				continue;
			/*
			 * Keep a copy of the parity as read, regenerate it
			 * in place from the data, and compare the two.
			 */
			orig = zio_buf_alloc(rc->rc_size);
			bcopy(rc->rc_data, orig, rc->rc_size);
			vdev_raidz_reconstruct(rm, c);

			if (bcmp(orig, rc->rc_data, rc->rc_size) != 0) {
				raidz_checksum_error(zio, rc);
				rc->rc_error = ECKSUM;
				unexpected_errors++;
			}

			zio_buf_free(orig, rc->rc_size);
		}
		goto done;
	}

	/*
	 * If there was exactly one I/O error, it's the one we expected,
	 * and the reconstructed data checksums, the read is complete.
	 * This happens when one child is offline and vdev_fault_assess()
	 * knows it, or when one child has stale data and the DTL knows it.
	 */
	if (zio->io_numerrors == 1 && (c = rm->rm_missing_child) != -1) {
		rc = &rm->rm_col[c];
		ASSERT(unexpected_errors == 0);
		ASSERT(rc->rc_error == ENXIO || rc->rc_error == ESTALE);
		vdev_raidz_reconstruct(rm, c);

		if (zio_checksum_error(zio) == 0) {
			zio->io_error = 0;
			goto done;
		}
	}

	/*
	 * This isn't a typical error -- either we got a read error or
	 * more than one child claimed a problem.  Read every block we
	 * haven't already so we can try combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missing_child = -1;

	/* Find the first column that has not been tried, if any. */
	for (c = 0; c < rm->rm_cols; c++)
		if (!rm->rm_col[c].rc_tried)
			break;

	if (c != rm->rm_cols) {
		/*
		 * At least one column is still unread: issue reads for all
		 * such columns and return.  The zio is marked as redone
		 * first.
		 * NOTE(review): presumably this handler is re-entered once
		 * those reads finish -- confirm against zio_vdev_io_redone().
		 */
		zio->io_error = 0;
		zio_vdev_io_redone(zio);
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_col],
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
			    vdev_raidz_child_done, rc));
		}
		zio_wait_children_done(zio);
		return;
	}

	/*
	 * If there were more errors than parity disks, give up.
	 */
	if (zio->io_numerrors > rm->rm_firstdatacol) {
		ASSERT(zio->io_error != 0);
		goto done;
	}

	/*
	 * The number of I/O errors is correctable.  Correct them here.
	 */
	ASSERT(zio->io_numerrors <= rm->rm_firstdatacol);
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];
		ASSERT(rc->rc_tried);
		/* Only the first column found with an error is rebuilt. */
		if (rc->rc_error) {
			vdev_raidz_reconstruct(rm, c);
			if (zio_checksum_error(zio) == 0)
				zio->io_error = 0;
			else
				zio->io_error = rc->rc_error;
			goto done;
		}
	}

	/*
	 * There were no I/O errors, but the data doesn't checksum.
	 * Try all permutations to see if we can find one that does.
	 */
	ASSERT(zio->io_numerrors == 0);
	for (c = 0; c < rm->rm_cols; c++) {
		void *orig;
		rc = &rm->rm_col[c];

		/* Save the column so it can be restored if this guess fails. */
		orig = zio_buf_alloc(rc->rc_size);
		bcopy(rc->rc_data, orig, rc->rc_size);
		vdev_raidz_reconstruct(rm, c);

		if (zio_checksum_error(zio) == 0) {
			zio_buf_free(orig, rc->rc_size);
			zio->io_error = 0;
			/*
			 * If this child didn't know that it returned bad data,
			 * inform it.
			 */
			if (rc->rc_tried && rc->rc_error == 0)
				raidz_checksum_error(zio, rc);
			rc->rc_error = ECKSUM;
			goto done;
		}

		bcopy(orig, rc->rc_data, rc->rc_size);
		zio_buf_free(orig, rc->rc_size);
	}

	/*
	 * All combinations failed to checksum. Generate checksum ereports for
	 * every one.
	 */
	zio->io_error = ECKSUM;
	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
			    zio->io_spa, vd->vdev_child[rc->rc_col], zio,
			    rc->rc_offset, rc->rc_size);
		}
	}

done:
	zio_checksum_verified(zio);

	if (zio->io_error == 0 && (spa_mode & FWRITE) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		zio_t *rio;
		/*
		 * Use the good data we have in hand to repair damaged children.
		 *
		 * We issue all repair I/Os as children of 'rio' to arrange
		 * that vdev_raidz_map_free(zio) will be invoked after all
		 * repairs complete, but before we advance to the next stage.
		 */
		rio = zio_null(zio, zio->io_spa,
		    vdev_raidz_repair_done, zio, ZIO_FLAG_CANFAIL);

		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_col];

			/* Columns without a recorded error need no repair. */
			if (rc->rc_error == 0)
				continue;

			dprintf("%s resilvered %s @ 0x%llx error %d\n",
			    vdev_description(vd),
			    vdev_description(cvd),
			    zio->io_offset, rc->rc_error);

			zio_nowait(zio_vdev_child_io(rio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    ZIO_TYPE_WRITE, zio->io_priority,
			    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL |
			    ZIO_FLAG_DONT_PROPAGATE, NULL, NULL));
		}

		/* The map is freed by vdev_raidz_repair_done(), not here. */
		zio_nowait(rio);
		zio_wait_children_done(zio);
		return;
	}

	vdev_raidz_map_free(zio);
	zio_next_stage(zio);
}

/*
 * Set the RAID-Z vdev's state from the counts of faulted and degraded
 * children: more than one faulted child makes it unopenable, any faulted or
 * degraded child makes it degraded, otherwise it is healthy.
 */
static void
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
{
	if (faulted > 1)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_NO_REPLICAS);
	else if (degraded + faulted != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	else
		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}

/*
 * Operations vector for RAID-Z vdevs.  The initializers are positional, so
 * their order must match the member order of vdev_ops_t.
 */
vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};