1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/cdefs.h> 27 28 static uint64_t zfs_crc64_table[256]; 29 30 #define ECKSUM 666 31 32 #define ASSERT3S(x, y, z) ((void)0) 33 #define ASSERT3U(x, y, z) ((void)0) 34 #define ASSERT3P(x, y, z) ((void)0) 35 #define ASSERT0(x) ((void)0) 36 #define ASSERT(x) ((void)0) 37 38 #define panic(...) do { \ 39 printf(__VA_ARGS__); \ 40 for (;;) ; \ 41 } while (0) 42 43 #define kmem_alloc(size, flag) zfs_alloc((size)) 44 #define kmem_free(ptr, size) zfs_free((ptr), (size)) 45 46 static void 47 zfs_init_crc(void) 48 { 49 int i, j; 50 uint64_t *ct; 51 52 /* 53 * Calculate the crc64 table (used for the zap hash 54 * function). 
55 */ 56 if (zfs_crc64_table[128] != ZFS_CRC64_POLY) { 57 memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table)); 58 for (i = 0; i < 256; i++) 59 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 60 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 61 } 62 } 63 64 static void 65 zio_checksum_off(const void *buf, uint64_t size, 66 const void *ctx_template, zio_cksum_t *zcp) 67 { 68 ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); 69 } 70 71 /* 72 * Signature for checksum functions. 73 */ 74 typedef void zio_checksum_t(const void *data, uint64_t size, 75 const void *ctx_template, zio_cksum_t *zcp); 76 typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt); 77 typedef void zio_checksum_tmpl_free_t(void *ctx_template); 78 79 typedef enum zio_checksum_flags { 80 /* Strong enough for metadata? */ 81 ZCHECKSUM_FLAG_METADATA = (1 << 1), 82 /* ZIO embedded checksum */ 83 ZCHECKSUM_FLAG_EMBEDDED = (1 << 2), 84 /* Strong enough for dedup (without verification)? */ 85 ZCHECKSUM_FLAG_DEDUP = (1 << 3), 86 /* Uses salt value */ 87 ZCHECKSUM_FLAG_SALTED = (1 << 4), 88 /* Strong enough for nopwrite? */ 89 ZCHECKSUM_FLAG_NOPWRITE = (1 << 5) 90 } zio_checksum_flags_t; 91 92 /* 93 * Information about each checksum function. 
94 */ 95 typedef struct zio_checksum_info { 96 /* checksum function for each byteorder */ 97 zio_checksum_t *ci_func[2]; 98 zio_checksum_tmpl_init_t *ci_tmpl_init; 99 zio_checksum_tmpl_free_t *ci_tmpl_free; 100 zio_checksum_flags_t ci_flags; 101 const char *ci_name; /* descriptive name */ 102 } zio_checksum_info_t; 103 104 #include "blkptr.c" 105 106 #include "fletcher.c" 107 #include "sha256.c" 108 109 static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { 110 {{NULL, NULL}, NULL, NULL, 0, "inherit"}, 111 {{NULL, NULL}, NULL, NULL, 0, "on"}, 112 {{zio_checksum_off, zio_checksum_off}, NULL, NULL, 0, "off"}, 113 {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL, 114 ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"}, 115 {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL, 116 ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"}, 117 {{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL, 118 ZCHECKSUM_FLAG_EMBEDDED, "zilog"}, 119 {{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL, 120 0, "fletcher2"}, 121 {{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL, 122 ZCHECKSUM_FLAG_METADATA, "fletcher4"}, 123 {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL, 124 ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | 125 ZCHECKSUM_FLAG_NOPWRITE, "SHA256"}, 126 {{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL, 127 ZCHECKSUM_FLAG_EMBEDDED, "zillog2"}, 128 {{zio_checksum_off, zio_checksum_off}, NULL, NULL, 129 0, "noparity"}, 130 {{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap}, 131 NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | 132 ZCHECKSUM_FLAG_NOPWRITE, "SHA512"}, 133 /* no skein and edonr for now */ 134 {{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | 135 ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_SALTED | 136 ZCHECKSUM_FLAG_NOPWRITE, "skein"}, 137 {{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | 138 ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"}, 139 }; 140 141 /* 142 * 
Common signature for all zio compress/decompress functions. 143 */ 144 typedef size_t zio_compress_func_t(void *src, void *dst, 145 size_t s_len, size_t d_len, int); 146 typedef int zio_decompress_func_t(void *src, void *dst, 147 size_t s_len, size_t d_len, int); 148 149 extern int gzip_decompress(void *src, void *dst, 150 size_t s_len, size_t d_len, int); 151 /* 152 * Information about each compression function. 153 */ 154 typedef struct zio_compress_info { 155 zio_compress_func_t *ci_compress; /* compression function */ 156 zio_decompress_func_t *ci_decompress; /* decompression function */ 157 int ci_level; /* level parameter */ 158 const char *ci_name; /* algorithm name */ 159 } zio_compress_info_t; 160 161 #include "lzjb.c" 162 #include "zle.c" 163 #include "lz4.c" 164 165 /* 166 * Compression vectors. 167 */ 168 static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { 169 {NULL, NULL, 0, "inherit"}, 170 {NULL, NULL, 0, "on"}, 171 {NULL, NULL, 0, "uncompressed"}, 172 {NULL, lzjb_decompress, 0, "lzjb"}, 173 {NULL, NULL, 0, "empty"}, 174 {NULL, gzip_decompress, 1, "gzip-1"}, 175 {NULL, gzip_decompress, 2, "gzip-2"}, 176 {NULL, gzip_decompress, 3, "gzip-3"}, 177 {NULL, gzip_decompress, 4, "gzip-4"}, 178 {NULL, gzip_decompress, 5, "gzip-5"}, 179 {NULL, gzip_decompress, 6, "gzip-6"}, 180 {NULL, gzip_decompress, 7, "gzip-7"}, 181 {NULL, gzip_decompress, 8, "gzip-8"}, 182 {NULL, gzip_decompress, 9, "gzip-9"}, 183 {NULL, zle_decompress, 64, "zle"}, 184 {NULL, lz4_decompress, 0, "lz4"}, 185 }; 186 187 static void 188 byteswap_uint64_array(void *vbuf, size_t size) 189 { 190 uint64_t *buf = vbuf; 191 size_t count = size >> 3; 192 int i; 193 194 ASSERT((size & 7) == 0); 195 196 for (i = 0; i < count; i++) 197 buf[i] = BSWAP_64(buf[i]); 198 } 199 200 /* 201 * Set the external verifier for a gang block based on <vdev, offset, txg>, 202 * a tuple which is guaranteed to be unique for the life of the pool. 
203 */ 204 static void 205 zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp) 206 { 207 const dva_t *dva = BP_IDENTITY(bp); 208 uint64_t txg = BP_PHYSICAL_BIRTH(bp); 209 210 ASSERT(BP_IS_GANG(bp)); 211 212 ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0); 213 } 214 215 /* 216 * Set the external verifier for a label block based on its offset. 217 * The vdev is implicit, and the txg is unknowable at pool open time -- 218 * hence the logic in vdev_uberblock_load() to find the most recent copy. 219 */ 220 static void 221 zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset) 222 { 223 ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0); 224 } 225 226 /* 227 * Calls the template init function of a checksum which supports context 228 * templates and installs the template into the spa_t. 229 */ 230 static void 231 zio_checksum_template_init(enum zio_checksum checksum, const spa_t *spa) 232 { 233 zio_checksum_info_t *ci = &zio_checksum_table[checksum]; 234 235 if (ci->ci_tmpl_init == NULL) 236 return; 237 #if 0 /* for now we dont have anything here */ 238 if (spa->spa_cksum_tmpls[checksum] != NULL) 239 return; 240 241 VERIFY(ci->ci_tmpl_free != NULL); 242 mutex_enter(&spa->spa_cksum_tmpls_lock); 243 if (spa->spa_cksum_tmpls[checksum] == NULL) { 244 spa->spa_cksum_tmpls[checksum] = 245 ci->ci_tmpl_init(&spa->spa_cksum_salt); 246 VERIFY(spa->spa_cksum_tmpls[checksum] != NULL); 247 } 248 mutex_exit(&spa->spa_cksum_tmpls_lock); 249 #endif 250 } 251 252 static int 253 zio_checksum_verify(const blkptr_t *bp, void *data) 254 { 255 uint64_t size; 256 unsigned int checksum; 257 zio_checksum_info_t *ci; 258 zio_cksum_t actual_cksum, expected_cksum, verifier; 259 int byteswap; 260 261 checksum = BP_GET_CHECKSUM(bp); 262 size = BP_GET_PSIZE(bp); 263 264 if (checksum >= ZIO_CHECKSUM_FUNCTIONS) 265 return (EINVAL); 266 ci = &zio_checksum_table[checksum]; 267 if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL) 268 return (EINVAL); 269 270 
zio_checksum_template_init(checksum, NULL); 271 if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { 272 zio_eck_t *eck; 273 274 ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER || 275 checksum == ZIO_CHECKSUM_LABEL); 276 277 eck = (zio_eck_t *)((char *)data + size) - 1; 278 279 if (checksum == ZIO_CHECKSUM_GANG_HEADER) 280 zio_checksum_gang_verifier(&verifier, bp); 281 else if (checksum == ZIO_CHECKSUM_LABEL) 282 zio_checksum_label_verifier(&verifier, 283 DVA_GET_OFFSET(BP_IDENTITY(bp))); 284 else 285 verifier = bp->blk_cksum; 286 287 byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC)); 288 289 if (byteswap) 290 byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); 291 292 expected_cksum = eck->zec_cksum; 293 eck->zec_cksum = verifier; 294 ci->ci_func[byteswap](data, size, NULL, &actual_cksum); 295 eck->zec_cksum = expected_cksum; 296 297 if (byteswap) 298 byteswap_uint64_array(&expected_cksum, 299 sizeof (zio_cksum_t)); 300 } else { 301 expected_cksum = bp->blk_cksum; 302 ci->ci_func[0](data, size, NULL, &actual_cksum); 303 } 304 305 if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) { 306 /*printf("ZFS: read checksum failed\n");*/ 307 return (EIO); 308 } 309 310 return (0); 311 } 312 313 static int 314 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize, 315 void *dest, uint64_t destsize) 316 { 317 zio_compress_info_t *ci; 318 319 if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) { 320 printf("ZFS: unsupported compression algorithm %u\n", cpfunc); 321 return (EIO); 322 } 323 324 ci = &zio_compress_table[cpfunc]; 325 if (!ci->ci_decompress) { 326 printf("ZFS: unsupported compression algorithm %s\n", 327 ci->ci_name); 328 return (EIO); 329 } 330 331 return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level)); 332 } 333 334 static uint64_t 335 zap_hash(uint64_t salt, const char *name) 336 { 337 const uint8_t *cp; 338 uint8_t c; 339 uint64_t crc = salt; 340 341 ASSERT(crc != 0); 342 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 343 for (cp = (const uint8_t *)name; 
(c = *cp) != '\0'; cp++) 344 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF]; 345 346 /* 347 * Only use 28 bits, since we need 4 bits in the cookie for the 348 * collision differentiator. We MUST use the high bits, since 349 * those are the onces that we first pay attention to when 350 * chosing the bucket. 351 */ 352 crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1); 353 354 return (crc); 355 } 356 357 static void *zfs_alloc(size_t size); 358 static void zfs_free(void *ptr, size_t size); 359 360 typedef struct raidz_col { 361 uint64_t rc_devidx; /* child device index for I/O */ 362 uint64_t rc_offset; /* device offset */ 363 uint64_t rc_size; /* I/O size */ 364 void *rc_data; /* I/O data */ 365 int rc_error; /* I/O error for this device */ 366 uint8_t rc_tried; /* Did we attempt this I/O column? */ 367 uint8_t rc_skipped; /* Did we skip this I/O column? */ 368 } raidz_col_t; 369 370 typedef struct raidz_map { 371 uint64_t rm_cols; /* Regular column count */ 372 uint64_t rm_scols; /* Count including skipped columns */ 373 uint64_t rm_bigcols; /* Number of oversized columns */ 374 uint64_t rm_asize; /* Actual total I/O size */ 375 uint64_t rm_missingdata; /* Count of missing data devices */ 376 uint64_t rm_missingparity; /* Count of missing parity devices */ 377 uint64_t rm_firstdatacol; /* First data column/parity count */ 378 uint64_t rm_nskip; /* Skipped sectors for padding */ 379 uint64_t rm_skipstart; /* Column index of padding start */ 380 uintptr_t rm_reports; /* # of referencing checksum reports */ 381 uint8_t rm_freed; /* map no longer has referencing ZIO */ 382 uint8_t rm_ecksuminjected; /* checksum error was injected */ 383 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ 384 } raidz_map_t; 385 386 #define VDEV_RAIDZ_P 0 387 #define VDEV_RAIDZ_Q 1 388 #define VDEV_RAIDZ_R 2 389 390 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 
0x1d : 0)) 391 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) 392 393 /* 394 * We provide a mechanism to perform the field multiplication operation on a 395 * 64-bit value all at once rather than a byte at a time. This works by 396 * creating a mask from the top bit in each byte and using that to 397 * conditionally apply the XOR of 0x1d. 398 */ 399 #define VDEV_RAIDZ_64MUL_2(x, mask) \ 400 { \ 401 (mask) = (x) & 0x8080808080808080ULL; \ 402 (mask) = ((mask) << 1) - ((mask) >> 7); \ 403 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ 404 ((mask) & 0x1d1d1d1d1d1d1d1dULL); \ 405 } 406 407 #define VDEV_RAIDZ_64MUL_4(x, mask) \ 408 { \ 409 VDEV_RAIDZ_64MUL_2((x), mask); \ 410 VDEV_RAIDZ_64MUL_2((x), mask); \ 411 } 412 413 /* 414 * These two tables represent powers and logs of 2 in the Galois field defined 415 * above. These values were computed by repeatedly multiplying by 2 as above. 416 */ 417 static const uint8_t vdev_raidz_pow2[256] = { 418 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 419 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 420 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 421 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 422 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 423 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 424 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, 425 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 426 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 427 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, 428 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 429 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 430 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 431 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 432 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 433 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 434 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 435 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 436 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 437 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 
0xb7, 0x73, 438 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 439 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 440 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 441 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 442 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, 443 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 444 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 445 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, 446 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 447 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, 448 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 449 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 450 }; 451 static const uint8_t vdev_raidz_log2[256] = { 452 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 453 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, 454 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, 455 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, 456 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, 457 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, 458 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, 459 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, 460 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, 461 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, 462 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, 463 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, 464 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, 465 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, 466 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, 467 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, 468 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, 469 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, 470 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, 471 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, 472 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, 473 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, 474 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, 475 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, 476 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, 477 
0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, 478 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, 479 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, 480 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, 481 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, 482 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, 483 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, 484 }; 485 486 /* 487 * Multiply a given number by 2 raised to the given power. 488 */ 489 static uint8_t 490 vdev_raidz_exp2(uint8_t a, int exp) 491 { 492 if (a == 0) 493 return (0); 494 495 ASSERT(exp >= 0); 496 ASSERT(vdev_raidz_log2[a] > 0 || a == 1); 497 498 exp += vdev_raidz_log2[a]; 499 if (exp > 255) 500 exp -= 255; 501 502 return (vdev_raidz_pow2[exp]); 503 } 504 505 static void 506 vdev_raidz_generate_parity_p(raidz_map_t *rm) 507 { 508 uint64_t *p, *src, pcount __attribute__((unused)), ccount, i; 509 int c; 510 511 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 512 513 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 514 src = rm->rm_col[c].rc_data; 515 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 516 ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 517 518 if (c == rm->rm_firstdatacol) { 519 ASSERT(ccount == pcount); 520 for (i = 0; i < ccount; i++, src++, p++) { 521 *p = *src; 522 } 523 } else { 524 ASSERT(ccount <= pcount); 525 for (i = 0; i < ccount; i++, src++, p++) { 526 *p ^= *src; 527 } 528 } 529 } 530 } 531 532 static void 533 vdev_raidz_generate_parity_pq(raidz_map_t *rm) 534 { 535 uint64_t *p, *q, *src, pcnt, ccnt, mask, i; 536 int c; 537 538 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 539 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 540 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 541 542 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 543 src = rm->rm_col[c].rc_data; 544 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 545 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 546 547 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 548 549 if (c == rm->rm_firstdatacol) { 550 ASSERT(ccnt == pcnt 
|| ccnt == 0); 551 for (i = 0; i < ccnt; i++, src++, p++, q++) { 552 *p = *src; 553 *q = *src; 554 } 555 for (; i < pcnt; i++, src++, p++, q++) { 556 *p = 0; 557 *q = 0; 558 } 559 } else { 560 ASSERT(ccnt <= pcnt); 561 562 /* 563 * Apply the algorithm described above by multiplying 564 * the previous result and adding in the new value. 565 */ 566 for (i = 0; i < ccnt; i++, src++, p++, q++) { 567 *p ^= *src; 568 569 VDEV_RAIDZ_64MUL_2(*q, mask); 570 *q ^= *src; 571 } 572 573 /* 574 * Treat short columns as though they are full of 0s. 575 * Note that there's therefore nothing needed for P. 576 */ 577 for (; i < pcnt; i++, q++) { 578 VDEV_RAIDZ_64MUL_2(*q, mask); 579 } 580 } 581 } 582 } 583 584 static void 585 vdev_raidz_generate_parity_pqr(raidz_map_t *rm) 586 { 587 uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; 588 int c; 589 590 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 591 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 592 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 593 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 594 rm->rm_col[VDEV_RAIDZ_R].rc_size); 595 596 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 597 src = rm->rm_col[c].rc_data; 598 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 599 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 600 r = rm->rm_col[VDEV_RAIDZ_R].rc_data; 601 602 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 603 604 if (c == rm->rm_firstdatacol) { 605 ASSERT(ccnt == pcnt || ccnt == 0); 606 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 607 *p = *src; 608 *q = *src; 609 *r = *src; 610 } 611 for (; i < pcnt; i++, src++, p++, q++, r++) { 612 *p = 0; 613 *q = 0; 614 *r = 0; 615 } 616 } else { 617 ASSERT(ccnt <= pcnt); 618 619 /* 620 * Apply the algorithm described above by multiplying 621 * the previous result and adding in the new value. 
622 */ 623 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 624 *p ^= *src; 625 626 VDEV_RAIDZ_64MUL_2(*q, mask); 627 *q ^= *src; 628 629 VDEV_RAIDZ_64MUL_4(*r, mask); 630 *r ^= *src; 631 } 632 633 /* 634 * Treat short columns as though they are full of 0s. 635 * Note that there's therefore nothing needed for P. 636 */ 637 for (; i < pcnt; i++, q++, r++) { 638 VDEV_RAIDZ_64MUL_2(*q, mask); 639 VDEV_RAIDZ_64MUL_4(*r, mask); 640 } 641 } 642 } 643 } 644 645 /* 646 * Generate RAID parity in the first virtual columns according to the number of 647 * parity columns available. 648 */ 649 static void 650 vdev_raidz_generate_parity(raidz_map_t *rm) 651 { 652 switch (rm->rm_firstdatacol) { 653 case 1: 654 vdev_raidz_generate_parity_p(rm); 655 break; 656 case 2: 657 vdev_raidz_generate_parity_pq(rm); 658 break; 659 case 3: 660 vdev_raidz_generate_parity_pqr(rm); 661 break; 662 default: 663 panic("invalid RAID-Z configuration"); 664 } 665 } 666 667 /* BEGIN CSTYLED */ 668 /* 669 * In the general case of reconstruction, we must solve the system of linear 670 * equations defined by the coeffecients used to generate parity as well as 671 * the contents of the data and parity disks. This can be expressed with 672 * vectors for the original data (D) and the actual data (d) and parity (p) 673 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 674 * 675 * __ __ __ __ 676 * | | __ __ | p_0 | 677 * | V | | D_0 | | p_m-1 | 678 * | | x | : | = | d_0 | 679 * | I | | D_n-1 | | : | 680 * | | ~~ ~~ | d_n-1 | 681 * ~~ ~~ ~~ ~~ 682 * 683 * I is simply a square identity matrix of size n, and V is a vandermonde 684 * matrix defined by the coeffecients we chose for the various parity columns 685 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy 686 * computation as well as linear separability. 687 * 688 * __ __ __ __ 689 * | 1 .. 1 1 1 | | p_0 | 690 * | 2^n-1 .. 4 2 1 | __ __ | : | 691 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 692 * | 1 .. 
0 0 0 | | D_1 | | d_0 | 693 * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 694 * | : : : : | | : | | d_2 | 695 * | 0 .. 1 0 0 | | D_n-1 | | : | 696 * | 0 .. 0 1 0 | ~~ ~~ | : | 697 * | 0 .. 0 0 1 | | d_n-1 | 698 * ~~ ~~ ~~ ~~ 699 * 700 * Note that I, V, d, and p are known. To compute D, we must invert the 701 * matrix and use the known data and parity values to reconstruct the unknown 702 * data values. We begin by removing the rows in V|I and d|p that correspond 703 * to failed or missing columns; we then make V|I square (n x n) and d|p 704 * sized n by removing rows corresponding to unused parity from the bottom up 705 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 706 * using Gauss-Jordan elimination. In the example below we use m=3 parity 707 * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 708 * __ __ 709 * | 1 1 1 1 1 1 1 1 | 710 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 711 * | 19 205 116 29 64 16 4 1 | / / 712 * | 1 0 0 0 0 0 0 0 | / / 713 * | 0 1 0 0 0 0 0 0 | <--' / 714 * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 715 * | 0 0 0 1 0 0 0 0 | 716 * | 0 0 0 0 1 0 0 0 | 717 * | 0 0 0 0 0 1 0 0 | 718 * | 0 0 0 0 0 0 1 0 | 719 * | 0 0 0 0 0 0 0 1 | 720 * ~~ ~~ 721 * __ __ 722 * | 1 1 1 1 1 1 1 1 | 723 * | 128 64 32 16 8 4 2 1 | 724 * | 19 205 116 29 64 16 4 1 | 725 * | 1 0 0 0 0 0 0 0 | 726 * | 0 1 0 0 0 0 0 0 | 727 * (V|I)' = | 0 0 1 0 0 0 0 0 | 728 * | 0 0 0 1 0 0 0 0 | 729 * | 0 0 0 0 1 0 0 0 | 730 * | 0 0 0 0 0 1 0 0 | 731 * | 0 0 0 0 0 0 1 0 | 732 * | 0 0 0 0 0 0 0 1 | 733 * ~~ ~~ 734 * 735 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 736 * have carefully chosen the seed values 1, 2, and 4 to ensure that this 737 * matrix is not singular. 
738 * __ __ 739 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 740 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 741 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 742 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 743 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 744 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 745 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 746 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 747 * ~~ ~~ 748 * __ __ 749 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 750 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 751 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 752 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 753 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 754 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 755 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 756 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 757 * ~~ ~~ 758 * __ __ 759 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 760 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 761 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 762 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 763 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 764 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 765 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 766 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 767 * ~~ ~~ 768 * __ __ 769 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 770 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 771 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 772 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 773 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 774 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 775 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 776 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 777 * ~~ ~~ 778 * __ __ 779 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 780 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 781 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 782 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 783 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 784 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 785 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 786 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 787 * ~~ ~~ 788 * __ __ 789 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 790 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 791 * | 0 0 1 0 0 0 0 0 166 100 4 40 
158 168 216 209 | 792 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 793 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 794 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 795 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 796 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 797 * ~~ ~~ 798 * __ __ 799 * | 0 0 1 0 0 0 0 0 | 800 * | 167 100 5 41 159 169 217 208 | 801 * | 166 100 4 40 158 168 216 209 | 802 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 803 * | 0 0 0 0 1 0 0 0 | 804 * | 0 0 0 0 0 1 0 0 | 805 * | 0 0 0 0 0 0 1 0 | 806 * | 0 0 0 0 0 0 0 1 | 807 * ~~ ~~ 808 * 809 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 810 * of the missing data. 811 * 812 * As is apparent from the example above, the only non-trivial rows in the 813 * inverse matrix correspond to the data disks that we're trying to 814 * reconstruct. Indeed, those are the only rows we need as the others would 815 * only be useful for reconstructing data known or assumed to be valid. For 816 * that reason, we only build the coefficients in the rows that correspond to 817 * targeted columns. 818 */ 819 /* END CSTYLED */ 820 821 static void 822 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, 823 uint8_t **rows) 824 { 825 int i, j; 826 int pow; 827 828 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); 829 830 /* 831 * Fill in the missing rows of interest. 
832 */ 833 for (i = 0; i < nmap; i++) { 834 ASSERT3S(0, <=, map[i]); 835 ASSERT3S(map[i], <=, 2); 836 837 pow = map[i] * n; 838 if (pow > 255) 839 pow -= 255; 840 ASSERT(pow <= 255); 841 842 for (j = 0; j < n; j++) { 843 pow -= map[i]; 844 if (pow < 0) 845 pow += 255; 846 rows[i][j] = vdev_raidz_pow2[pow]; 847 } 848 } 849 } 850 851 static void 852 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, 853 uint8_t **rows, uint8_t **invrows, const uint8_t *used) 854 { 855 int i, j, ii, jj; 856 uint8_t log; 857 858 /* 859 * Assert that the first nmissing entries from the array of used 860 * columns correspond to parity columns and that subsequent entries 861 * correspond to data columns. 862 */ 863 for (i = 0; i < nmissing; i++) { 864 ASSERT3S(used[i], <, rm->rm_firstdatacol); 865 } 866 for (; i < n; i++) { 867 ASSERT3S(used[i], >=, rm->rm_firstdatacol); 868 } 869 870 /* 871 * First initialize the storage where we'll compute the inverse rows. 872 */ 873 for (i = 0; i < nmissing; i++) { 874 for (j = 0; j < n; j++) { 875 invrows[i][j] = (i == j) ? 1 : 0; 876 } 877 } 878 879 /* 880 * Subtract all trivial rows from the rows of consequence. 881 */ 882 for (i = 0; i < nmissing; i++) { 883 for (j = nmissing; j < n; j++) { 884 ASSERT3U(used[j], >=, rm->rm_firstdatacol); 885 jj = used[j] - rm->rm_firstdatacol; 886 ASSERT3S(jj, <, n); 887 invrows[i][j] = rows[i][jj]; 888 rows[i][jj] = 0; 889 } 890 } 891 892 /* 893 * For each of the rows of interest, we must normalize it and subtract 894 * a multiple of it from the other rows. 895 */ 896 for (i = 0; i < nmissing; i++) { 897 for (j = 0; j < missing[i]; j++) { 898 ASSERT3U(rows[i][j], ==, 0); 899 } 900 ASSERT3U(rows[i][missing[i]], !=, 0); 901 902 /* 903 * Compute the inverse of the first element and multiply each 904 * element in the row by that value. 
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		/*
		 * Subtract this row (suitably scaled) from every other row
		 * that still has a non-zero entry in this pivot column,
		 * driving the column to zero everywhere but the pivot.
		 */
		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}

/*
 * Regenerate the data in the columns listed in missing[] by multiplying the
 * surviving columns (indexed through used[]) by the inverted rows produced
 * by vdev_raidz_matrix_invert().  All arithmetic is in GF(2^8), carried out
 * with the log/exp lookup tables so a multiply becomes an addition of logs
 * modulo 255.
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	log = 0;	/* gcc */
	/* One scratch allocation holds the log of every inverse entry. */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = zfs_alloc(psize);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	/*
	 * Precompute the logs of the inverse-matrix entries so the hot loop
	 * below only does table lookups and additions.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				/* Short columns end before the big ones. */
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					/* GF multiply: add logs mod 255. */
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/*
				 * The first contributing column initializes
				 * the destination; later ones XOR in.
				 */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	zfs_free(p, psize);
}

/*
 * General-case reconstruction: recover the columns listed (sorted) in
 * tgts[] via matrix inversion over GF(2^8).  Returns a non-zero bitmask
 * ("code") of the parity columns that were used for the reconstruction.
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;

	/* n is the number of data columns in the map. */
	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/*
	 * A single allocation backs the rows, the inverse rows, and the
	 * used[] column map; it is carved up just below.
	 */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	/* used[] begins with the parity columns selected above... */
	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	/* ...followed by every data column that is not being rebuilt. */
	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}

/*
 * Reconstruct the columns in the (sorted) target list t[], plus any columns
 * already marked in error on the map, and return the parity-usage code from
 * the general reconstruction routine.
 */
static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	/*
	 * Merge the caller's target list with any columns already flagged
	 * with rc_error, counting the surviving parity and data columns as
	 * we go.
	 */
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}

/*
 * Lay out a logical block of `size' bytes at `offset' across the dcols
 * children of a RAID-Z vdev with `nparity' parity columns and a minimum
 * allocation unit of (1 << unit_shift) bytes.  Returns a freshly allocated
 * raidz_map_t describing each column's device index, device offset, size,
 * and data buffer.  Parity columns get private buffers; data columns alias
 * the caller's `data' buffer.
 */
static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/* q: full rows per data column; r: sectors left over. */
	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	/* bc: number of columns that get one extra ("big") sector. */
	bc = (r == 0 ? 0 : r + nparity);
	/* tot: total sectors consumed, parity included. */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	if (q == 0) {
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		/* Wrap to the first child, one row deeper on the device. */
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	/* Parity columns get private buffers... */
	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);

	/* ...while data columns slice up the caller's buffer in order. */
	rm->rm_col[c].rc_data = data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}

/*
 * Release a map built by vdev_raidz_map_alloc().  Only the parity buffers
 * were allocated by the map; the data columns alias the caller's buffer
 * and are not freed here.
 */
static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
		zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);

	zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

/*
 * Return the child vdev of pvd whose id is devidx, or NULL if the list is
 * exhausted without a match (STAILQ_FOREACH leaves cvd NULL at the end).
 */
static vdev_t *
vdev_child(vdev_t *pvd, uint64_t devidx)
{
	vdev_t *cvd;

	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
		if (cvd->v_id == devidx)
			break;
	}

	return (cvd);
}

/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we
 * generate can note it.
 */
static int
raidz_checksum_verify(const blkptr_t *bp, void *data, uint64_t size)
{

	/*
	 * NOTE(review): `size' is unused here; the boot code delegates
	 * verification entirely to zio_checksum_verify().
	 */
	return (zio_checksum_verify(bp, data));
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number such failures.
 */
static int
raidz_parity_verify(raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	/* Save a copy of each parity column that was read successfully. */
	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zfs_alloc(rc->rc_size);
		bcopy(rc->rc_data, orig[c], rc->rc_size);
	}

	/* Recompute parity from the data columns in place... */
	vdev_raidz_generate_parity(rm);

	/* ...and compare it with what was actually read from disk. */
	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			rc->rc_error = ECKSUM;
			ret++;
		}
		zfs_free(orig[c], rc->rc_size);
	}

	return (ret);
}

/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 */
static int
vdev_raidz_combrec(raidz_map_t *rm, const blkptr_t *bp, void *data,
    off_t offset, uint64_t bytes, int total_errors, int data_errors)
{
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	/* tgts points one past tstore[0], so tgts[-1] is valid storage. */
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		/* One new save-buffer per outer iteration. */
		orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(bp, data, bytes) == 0) {
				/* Flag the rebuilt columns as the bad ones. */
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position..
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
1490 */ 1491 for (c = tgts[current - 1] + 1; 1492 rm->rm_col[c].rc_error != 0; c++) 1493 continue; 1494 1495 tgts[current] = c; 1496 current++; 1497 1498 } while (current != n); 1499 } 1500 } 1501 n--; 1502 done: 1503 for (i = n - 1; i >= 0; i--) { 1504 zfs_free(orig[i], rm->rm_col[0].rc_size); 1505 } 1506 1507 return (ret); 1508 } 1509 1510 static int 1511 vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data, 1512 off_t offset, size_t bytes) 1513 { 1514 vdev_t *tvd = vd->v_top; 1515 vdev_t *cvd; 1516 raidz_map_t *rm; 1517 raidz_col_t *rc; 1518 int c, error; 1519 int unexpected_errors; 1520 int parity_errors; 1521 int parity_untried; 1522 int data_errors; 1523 int total_errors; 1524 int n; 1525 int tgts[VDEV_RAIDZ_MAXPARITY]; 1526 int code; 1527 1528 rc = NULL; /* gcc */ 1529 error = 0; 1530 1531 rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift, 1532 vd->v_nchildren, vd->v_nparity); 1533 1534 /* 1535 * Iterate over the columns in reverse order so that we hit the parity 1536 * last -- any errors along the way will force us to read the parity. 1537 */ 1538 for (c = rm->rm_cols - 1; c >= 0; c--) { 1539 rc = &rm->rm_col[c]; 1540 cvd = vdev_child(vd, rc->rc_devidx); 1541 if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) { 1542 if (c >= rm->rm_firstdatacol) 1543 rm->rm_missingdata++; 1544 else 1545 rm->rm_missingparity++; 1546 rc->rc_error = ENXIO; 1547 rc->rc_tried = 1; /* don't even try */ 1548 rc->rc_skipped = 1; 1549 continue; 1550 } 1551 #if 0 /* XXX: Too hard for the boot code. 
*/ 1552 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 1553 if (c >= rm->rm_firstdatacol) 1554 rm->rm_missingdata++; 1555 else 1556 rm->rm_missingparity++; 1557 rc->rc_error = ESTALE; 1558 rc->rc_skipped = 1; 1559 continue; 1560 } 1561 #endif 1562 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) { 1563 rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data, 1564 rc->rc_offset, rc->rc_size); 1565 rc->rc_tried = 1; 1566 rc->rc_skipped = 0; 1567 } 1568 } 1569 1570 reconstruct: 1571 unexpected_errors = 0; 1572 parity_errors = 0; 1573 parity_untried = 0; 1574 data_errors = 0; 1575 total_errors = 0; 1576 1577 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); 1578 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); 1579 1580 for (c = 0; c < rm->rm_cols; c++) { 1581 rc = &rm->rm_col[c]; 1582 1583 if (rc->rc_error) { 1584 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 1585 1586 if (c < rm->rm_firstdatacol) 1587 parity_errors++; 1588 else 1589 data_errors++; 1590 1591 if (!rc->rc_skipped) 1592 unexpected_errors++; 1593 1594 total_errors++; 1595 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { 1596 parity_untried++; 1597 } 1598 } 1599 1600 /* 1601 * There are three potential phases for a read: 1602 * 1. produce valid data from the columns read 1603 * 2. read all disks and try again 1604 * 3. perform combinatorial reconstruction 1605 * 1606 * Each phase is progressively both more expensive and less likely to 1607 * occur. If we encounter more errors than we can repair or all phases 1608 * fail, we have no choice but to return an error. 1609 */ 1610 1611 /* 1612 * If the number of errors we saw was correctable -- less than or equal 1613 * to the number of parity disks read -- attempt to produce data that 1614 * has a valid checksum. Naturally, this case applies in the absence of 1615 * any errors. 
1616 */ 1617 if (total_errors <= rm->rm_firstdatacol - parity_untried) { 1618 if (data_errors == 0) { 1619 if (raidz_checksum_verify(bp, data, bytes) == 0) { 1620 /* 1621 * If we read parity information (unnecessarily 1622 * as it happens since no reconstruction was 1623 * needed) regenerate and verify the parity. 1624 * We also regenerate parity when resilvering 1625 * so we can write it out to the failed device 1626 * later. 1627 */ 1628 if (parity_errors + parity_untried < 1629 rm->rm_firstdatacol) { 1630 n = raidz_parity_verify(rm); 1631 unexpected_errors += n; 1632 ASSERT(parity_errors + n <= 1633 rm->rm_firstdatacol); 1634 } 1635 goto done; 1636 } 1637 } else { 1638 /* 1639 * We either attempt to read all the parity columns or 1640 * none of them. If we didn't try to read parity, we 1641 * wouldn't be here in the correctable case. There must 1642 * also have been fewer parity errors than parity 1643 * columns or, again, we wouldn't be in this code path. 1644 */ 1645 ASSERT(parity_untried == 0); 1646 ASSERT(parity_errors < rm->rm_firstdatacol); 1647 1648 /* 1649 * Identify the data columns that reported an error. 1650 */ 1651 n = 0; 1652 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1653 rc = &rm->rm_col[c]; 1654 if (rc->rc_error != 0) { 1655 ASSERT(n < VDEV_RAIDZ_MAXPARITY); 1656 tgts[n++] = c; 1657 } 1658 } 1659 1660 ASSERT(rm->rm_firstdatacol >= n); 1661 1662 code = vdev_raidz_reconstruct(rm, tgts, n); 1663 1664 if (raidz_checksum_verify(bp, data, bytes) == 0) { 1665 /* 1666 * If we read more parity disks than were used 1667 * for reconstruction, confirm that the other 1668 * parity disks produced correct data. This 1669 * routine is suboptimal in that it regenerates 1670 * the parity that we already used in addition 1671 * to the parity that we're attempting to 1672 * verify, but this should be a relatively 1673 * uncommon case, and can be optimized if it 1674 * becomes a problem. 
Note that we regenerate 1675 * parity when resilvering so we can write it 1676 * out to failed devices later. 1677 */ 1678 if (parity_errors < rm->rm_firstdatacol - n) { 1679 n = raidz_parity_verify(rm); 1680 unexpected_errors += n; 1681 ASSERT(parity_errors + n <= 1682 rm->rm_firstdatacol); 1683 } 1684 1685 goto done; 1686 } 1687 } 1688 } 1689 1690 /* 1691 * This isn't a typical situation -- either we got a read 1692 * error or a child silently returned bad data. Read every 1693 * block so we can try again with as much data and parity as 1694 * we can track down. If we've already been through once 1695 * before, all children will be marked as tried so we'll 1696 * proceed to combinatorial reconstruction. 1697 */ 1698 unexpected_errors = 1; 1699 rm->rm_missingdata = 0; 1700 rm->rm_missingparity = 0; 1701 1702 n = 0; 1703 for (c = 0; c < rm->rm_cols; c++) { 1704 rc = &rm->rm_col[c]; 1705 1706 if (rc->rc_tried) 1707 continue; 1708 1709 cvd = vdev_child(vd, rc->rc_devidx); 1710 ASSERT(cvd != NULL); 1711 rc->rc_error = cvd->v_read(cvd, NULL, 1712 rc->rc_data, rc->rc_offset, rc->rc_size); 1713 if (rc->rc_error == 0) 1714 n++; 1715 rc->rc_tried = 1; 1716 rc->rc_skipped = 0; 1717 } 1718 /* 1719 * If we managed to read anything more, retry the 1720 * reconstruction. 1721 */ 1722 if (n > 0) 1723 goto reconstruct; 1724 1725 /* 1726 * At this point we've attempted to reconstruct the data given the 1727 * errors we detected, and we've attempted to read all columns. There 1728 * must, therefore, be one or more additional problems -- silent errors 1729 * resulting in invalid data rather than explicit I/O errors resulting 1730 * in absent data. We check if there is enough additional data to 1731 * possibly reconstruct the data and then perform combinatorial 1732 * reconstruction over all possible combinations. If that fails, 1733 * we're cooked. 
1734 */ 1735 if (total_errors > rm->rm_firstdatacol) { 1736 error = EIO; 1737 } else if (total_errors < rm->rm_firstdatacol && 1738 (code = vdev_raidz_combrec(rm, bp, data, offset, bytes, 1739 total_errors, data_errors)) != 0) { 1740 /* 1741 * If we didn't use all the available parity for the 1742 * combinatorial reconstruction, verify that the remaining 1743 * parity is correct. 1744 */ 1745 if (code != (1 << rm->rm_firstdatacol) - 1) 1746 (void) raidz_parity_verify(rm); 1747 } else { 1748 /* 1749 * We're here because either: 1750 * 1751 * total_errors == rm_first_datacol, or 1752 * vdev_raidz_combrec() failed 1753 * 1754 * In either case, there is enough bad data to prevent 1755 * reconstruction. 1756 * 1757 * Start checksum ereports for all children which haven't 1758 * failed, and the IO wasn't speculative. 1759 */ 1760 error = ECKSUM; 1761 } 1762 1763 done: 1764 vdev_raidz_map_free(rm); 1765 1766 return (error); 1767 } 1768