/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <lz4.h>

static uint64_t zfs_crc64_table[256];

#define	ECKSUM	666

#define	ASSERT3S(x, y, z)	((void)0)
#define	ASSERT3U(x, y, z)	((void)0)
#define	ASSERT3P(x, y, z)	((void)0)
#define	ASSERT0(x)		((void)0)
#define	ASSERT(x)		((void)0)

#define	panic(...)	do {						\
	printf(__VA_ARGS__);						\
	for (;;) ;							\
} while (0)

static void
zfs_init_crc(void)
{
	int i, j;
	uint64_t *ct;

	/*
	 * Calculate the crc64 table (used for the zap hash
	 * function).
	 */
	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
		memset(zfs_crc64_table, 0, sizeof (zfs_crc64_table));
		for (i = 0; i < 256; i++)
			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
	}
}
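/*
 * For example, table entry 128 (0x80) reduces after its eight shift steps
 * to exactly ZFS_CRC64_POLY, which is why comparing zfs_crc64_table[128]
 * against the polynomial is a reliable "table already built" sentinel:
 * the comparison is false only for a zeroed (uninitialized) table.
 */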
static void
zio_checksum_off(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}

/*
 * Signature for checksum functions.
 */
typedef void zio_checksum_t(const void *data, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp);
typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
typedef void zio_checksum_tmpl_free_t(void *ctx_template);

typedef enum zio_checksum_flags {
	/* Strong enough for metadata? */
	ZCHECKSUM_FLAG_METADATA = (1 << 1),
	/* ZIO embedded checksum */
	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
	/* Strong enough for dedup (without verification)? */
	ZCHECKSUM_FLAG_DEDUP = (1 << 3),
	/* Uses salt value */
	ZCHECKSUM_FLAG_SALTED = (1 << 4),
	/* Strong enough for nopwrite? */
	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
} zio_checksum_flags_t;

/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
	/* checksum function for each byteorder */
	zio_checksum_t	*ci_func[2];
	zio_checksum_tmpl_init_t *ci_tmpl_init;
	zio_checksum_tmpl_free_t *ci_tmpl_free;
	zio_checksum_flags_t ci_flags;
	const char	*ci_name;	/* descriptive name */
} zio_checksum_info_t;

#include "blkptr.c"

#include "fletcher.c"
#include "sha256.c"
#include "skein_zfs.c"

static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
	{{NULL, NULL}, NULL, NULL, 0, "on"},
	{{zio_checksum_off, zio_checksum_off}, NULL, NULL, 0, "off"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
	{{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
	{{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
	    0, "fletcher2"},
	{{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA, "fletcher4"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
	{{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
	{{zio_checksum_off, zio_checksum_off}, NULL, NULL,
	    0, "noparity"},
	{{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap},
	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
	{{zio_checksum_skein_native, zio_checksum_skein_byteswap},
	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
	/* no edonr for now */
	{{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"}
};
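/*
 * The rows above line up with the enum zio_checksum values that
 * BP_GET_CHECKSUM() returns; for instance, a fletcher4 block pointer is
 * verified with zio_checksum_table[checksum].ci_func[0] (native) or
 * ci_func[1] (byteswapped), i.e. fletcher_4_native or fletcher_4_byteswap.
 */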
/*
 * Common signature for all zio compress/decompress functions.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
	zio_compress_func_t	*ci_compress;	/* compression function */
	zio_decompress_func_t	*ci_decompress;	/* decompression function */
	int			ci_level;	/* level parameter */
	const char		*ci_name;	/* algorithm name */
} zio_compress_info_t;

#include "lzjb.c"
#include "zle.c"

/*
 * Compression vectors.
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
	{NULL, NULL, 0, "inherit"},
	{NULL, NULL, 0, "on"},
	{NULL, NULL, 0, "uncompressed"},
	{NULL, lzjb_decompress, 0, "lzjb"},
	{NULL, NULL, 0, "empty"},
	{NULL, NULL, 1, "gzip-1"},
	{NULL, NULL, 2, "gzip-2"},
	{NULL, NULL, 3, "gzip-3"},
	{NULL, NULL, 4, "gzip-4"},
	{NULL, NULL, 5, "gzip-5"},
	{NULL, NULL, 6, "gzip-6"},
	{NULL, NULL, 7, "gzip-7"},
	{NULL, NULL, 8, "gzip-8"},
	{NULL, NULL, 9, "gzip-9"},
	{NULL, zle_decompress, 64, "zle"},
	{NULL, lz4_decompress, 0, "lz4"},
};

static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	int i;

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}

/*
 * Set the external verifier for a gang block based on <vdev, offset, txg>,
 * a tuple which is guaranteed to be unique for the life of the pool.
 */
static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
{
	const dva_t *dva = BP_IDENTITY(bp);
	uint64_t txg = BP_PHYSICAL_BIRTH(bp);

	ASSERT(BP_IS_GANG(bp));

	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
}

/*
 * Set the external verifier for a label block based on its offset.
 * The vdev is implicit, and the txg is unknowable at pool open time --
 * hence the logic in vdev_uberblock_load() to find the most recent copy.
 */
static void
zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
{
	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
}

/*
 * Calls the template init function of a checksum which supports context
 * templates and installs the template into the spa_t.
 */
static void
zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
{
	zio_checksum_info_t *ci = &zio_checksum_table[checksum];

	if (ci->ci_tmpl_init == NULL)
		return;

	if (spa->spa_cksum_tmpls[checksum] == NULL) {
		spa->spa_cksum_tmpls[checksum] =
		    ci->ci_tmpl_init(&spa->spa_cksum_salt);
	}
}
/*
 * Called by a spa_t that's about to be deallocated. This steps through
 * all of the checksum context templates and deallocates any that were
 * initialized using the algorithm-specific template init function.
 */
static void __unused
zio_checksum_templates_free(spa_t *spa)
{
	for (enum zio_checksum checksum = 0;
	    checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
		if (spa->spa_cksum_tmpls[checksum] != NULL) {
			zio_checksum_info_t *ci = &zio_checksum_table[checksum];

			ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
			spa->spa_cksum_tmpls[checksum] = NULL;
		}
	}
}

static int
zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
{
	uint64_t size;
	unsigned int checksum;
	zio_checksum_info_t *ci;
	void *ctx = NULL;
	zio_cksum_t actual_cksum, expected_cksum, verifier;
	int byteswap;

	checksum = BP_GET_CHECKSUM(bp);
	size = BP_GET_PSIZE(bp);

	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
		return (EINVAL);
	ci = &zio_checksum_table[checksum];
	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
		return (EINVAL);

	if (spa != NULL) {
		zio_checksum_template_init(checksum, __DECONST(spa_t *, spa));
		ctx = spa->spa_cksum_tmpls[checksum];
	}

	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
		zio_eck_t *eck;

		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
		    checksum == ZIO_CHECKSUM_LABEL);

		eck = (zio_eck_t *)((char *)data + size) - 1;

		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
			zio_checksum_gang_verifier(&verifier, bp);
		else if (checksum == ZIO_CHECKSUM_LABEL)
			zio_checksum_label_verifier(&verifier,
			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
		else
			verifier = bp->blk_cksum;

		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

		if (byteswap)
			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

		expected_cksum = eck->zec_cksum;
		eck->zec_cksum = verifier;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
		eck->zec_cksum = expected_cksum;

		if (byteswap)
			byteswap_uint64_array(&expected_cksum,
			    sizeof (zio_cksum_t));
	} else {
		byteswap = BP_SHOULD_BYTESWAP(bp);
		expected_cksum = bp->blk_cksum;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
	}

	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
		/* printf("ZFS: read checksum %s failed\n", ci->ci_name); */
		return (EIO);
	}

	return (0);
}

static int
zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
    void *dest, uint64_t destsize)
{
	zio_compress_info_t *ci;

	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
		return (EIO);
	}

	ci = &zio_compress_table[cpfunc];
	if (!ci->ci_decompress) {
		printf("ZFS: unsupported compression algorithm %s\n",
		    ci->ci_name);
		return (EIO);
	}

	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
}
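/*
 * A typical caller (a sketch, not code from this file) feeds the
 * compression type straight from the block pointer, decompressing the
 * physical bytes into a logical-size buffer:
 *
 *	if (zio_decompress_data(BP_GET_COMPRESS(bp), pbuf,
 *	    BP_GET_PSIZE(bp), lbuf, BP_GET_LSIZE(bp)) != 0)
 *		return (EIO);
 */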
static uint64_t
zap_hash(uint64_t salt, const char *name)
{
	const uint8_t *cp;
	uint8_t c;
	uint64_t crc = salt;

	ASSERT(crc != 0);
	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];

	/*
	 * Only use 28 bits, since we need 4 bits in the cookie for the
	 * collision differentiator. We MUST use the high bits, since
	 * those are the ones that we first pay attention to when
	 * choosing the bucket.
	 */
	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);

	return (crc);
}

typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t rm_freed;		/* map no longer has referencing ZIO */
	uint8_t rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P	0
#define	VDEV_RAIDZ_Q	1
#define	VDEV_RAIDZ_R	2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
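/*
 * A worked example of the field arithmetic above: multiplying 0x80 by 2
 * overflows GF(2^8) and is reduced by the generator polynomial
 * x^8 + x^4 + x^3 + x^2 + 1, i.e. VDEV_RAIDZ_MUL_2(0x80) == 0x1d; the
 * same value appears as vdev_raidz_pow2[8] in the table below. The
 * 64-bit macros simply apply that byte-wise operation to all eight bytes
 * of a word in parallel.
 */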
/*
 * These two tables represent powers and logs of 2 in the Galois field
 * defined above. These values were computed by repeatedly multiplying
 * by 2 as above.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
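/*
 * A minimal sketch (not part of the original file) of how the two tables
 * combine into a general GF(2^8) multiply:
 * a * b == pow2[(log2[a] + log2[b]) mod 255] for nonzero a and b.
 */
static uint8_t __unused
vdev_raidz_mul_example(uint8_t a, uint8_t b)
{
	int l;

	if (a == 0 || b == 0)
		return (0);
	l = vdev_raidz_log2[a] + vdev_raidz_log2[b];
	if (l > 255)
		l -= 255;	/* the table wraps: pow2[255] == pow2[0] */
	return (vdev_raidz_pow2[l]);
}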
/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint8_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p, *src, pcount, ccount, i;
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccount == pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p = *src;
			}
		} else {
			ASSERT(ccount <= pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p ^= *src;
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}
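/*
 * Equivalently, the Q column above is evaluated by Horner's rule: after
 * processing data columns d_0 .. d_{n-1} in order,
 * Q = 2^{n-1}*d_0 + ... + 2*d_{n-2} + d_{n-1}, with all sums and products
 * taken in GF(2^8); P is the plain XOR of the data columns. The R column
 * in the triple-parity routine below is the same recurrence with a
 * multiplier of 4 instead of 2.
 */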
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		panic("invalid RAID-Z configuration");
	}
}

/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity, speedy
 * computation, and linear separability.
 *
 *      __                   __                  __     __
 *      |   1   ..   1  1  1  |                  |  p_0  |
 *      | 2^n-1 ..   4  2  1  |     __     __    |   :   |
 *      | 4^n-1 ..  16  4  1  |     |  D_0  |    | p_m-1 |
 *      |   1   ..   0  0  0  |     |  D_1  |    |  d_0  |
 *      |   0   ..   0  0  0  |  x  |  D_2  |  = |  d_1  |
 *      |   :        :  :  :  |     |   :   |    |  d_2  |
 *      |   0   ..   1  0  0  |     | D_n-1 |    |   :   |
 *      |   0   ..   0  1  0  |     ~~     ~~    |   :   |
 *      |   0   ..   0  0  1  |                  | d_n-1 |
 *      ~~                   ~~                  ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination.
 * In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *
 *               __                               __
 *               |  1   1   1   1   1   1   1   1  |
 *               | 128  64  32  16   8   4   2   1  | <-----+-+-- missing disks
 *               |  19 205 116  29  64  16   4   1  |      / /
 *               |  1   0   0   0   0   0   0   0  |     / /
 *               |  0   1   0   0   0   0   0   0  | <--'  /
 *     (V|I)  =  |  0   0   1   0   0   0   0   0  | <---'
 *               |  0   0   0   1   0   0   0   0  |
 *               |  0   0   0   0   1   0   0   0  |
 *               |  0   0   0   0   0   1   0   0  |
 *               |  0   0   0   0   0   0   1   0  |
 *               |  0   0   0   0   0   0   0   1  |
 *               ~~                               ~~
 *               __                               __
 *               |  1   1   1   1   1   1   1   1  |
 *               |  19 205 116  29  64  16   4   1  |
 *               |  1   0   0   0   0   0   0   0  |
 *     (V|I)' =  |  0   0   0   1   0   0   0   0  |
 *               |  0   0   0   0   1   0   0   0  |
 *               |  0   0   0   0   0   1   0   0  |
 *               |  0   0   0   0   0   0   1   0  |
 *               |  0   0   0   0   0   0   0   1  |
 *               ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 *      __                                                                 __
 *      |   1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 *      |  19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0  |
 *      |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *      |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *      |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *      |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *      |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *      |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *      ~~                                                                 ~~
 *      __                                                                 __
 *      |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *      |   1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 *      |  19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0  |
 *      |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *      |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *      |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *      |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *      |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *      ~~                                                                 ~~
 *      __                                                                 __
 *      |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *      |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 *      |   0 205 116   0   0   0   0   0     0   1  19  29  64  16   4   1  |
 *      |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *      |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *      |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *      |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *      |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *      ~~                                                                 ~~
 *      __                                                                 __
 *      |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *      |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 *      |   0   0 185   0   0   0   0   0   205   1 222 208 141 221 201 204  |
 *      |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *      |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *      |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *      |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *      |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *      ~~                                                                 ~~
 *      __                                                                 __
 *      |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *      |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 *      |   0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209  |
 *      |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *      |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *      |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *      |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *      |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *      ~~                                                                 ~~
 *      __                                                                 __
 *      |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *      |   0   1   0   0   0   0   0   0   167 100   5  41 159 169 217 208  |
 *      |   0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209  |
 *      |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *      |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *      |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *      |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *      |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *      ~~                                                                 ~~
 *                  __                               __
 *                  |  0   0   1   0   0   0   0   0  |
 *                  | 167 100   5  41 159 169 217 208  |
 *                  | 166 100   4  40 158 168 216 209  |
 *     (V|I)'^-1 =  |  0   0   0   1   0   0   0   0  |
 *                  |  0   0   0   0   1   0   0   0  |
 *                  |  0   0   0   0   0   1   0   0  |
 *                  |  0   0   0   0   0   0   1   0  |
 *                  |  0   0   0   0   0   0   0   1  |
 *                  ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */

static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
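/*
 * The rows generated above are exactly the Vandermonde rows from the
 * example: entry j of the row for parity index map[i] is
 * 2^(map[i] * (n - 1 - j)), so map[i] == 0 yields the all-ones P row and
 * map[i] == 1 with n == 8 yields 128 64 32 16 8 4 2 1.
 */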
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows is properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}

static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	log = 0;	/* gcc */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = malloc(psize);
	if (p == NULL) {
		printf("Out of memory\n");
		return;
	}

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	free(p);
}
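/*
 * A worked view of the inner loop above: each reconstructed byte is
 * sum_i invrow[j][i] * src_i over GF(2^8), and each multiply is done in
 * the log domain, val = pow2[(log2[*src] + invlog[cc][i]) mod 255], so
 * only one table lookup and one addition are needed per byte.
 */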
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;

	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = malloc(psize);
	if (p == NULL) {
		printf("Out of memory\n");
		return (code);
	}

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	free(p);

	return (code);
}

static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}
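/*
 * As a worked example of the geometry computed below (illustrative
 * numbers, not from the original source): a 16 KB block on a 5-wide
 * raidz1 with 512-byte sectors has s = 32, q = 8, r = 0, and bc = 0, so
 * the map holds one 4 KB parity column followed by four 4 KB data
 * columns, and tot = 40 sectors in all.
 */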
static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	uint64_t b = offset >> unit_shift;
	uint64_t s = size >> unit_shift;
	uint64_t f = b % dcols;
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	bc = (r == 0 ? 0 : r + nparity);
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	if (q == 0) {
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = malloc(offsetof(raidz_map_t, rm_col[scols]));
	if (rm == NULL)
		return (rm);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rm->rm_col[c].rc_data = malloc(rm->rm_col[c].rc_size);
		if (rm->rm_col[c].rc_data == NULL) {
			c++;
			while (c != 0)
				free(rm->rm_col[--c].rc_data);
			free(rm);
			return (NULL);
		}
	}

	rm->rm_col[c].rc_data = data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}

static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
		free(rm->rm_col[c].rc_data);

	free(rm);
}

static vdev_t *
vdev_child(vdev_t *pvd, uint64_t devidx)
{
	vdev_t *cvd;

	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
		if (cvd->v_id == devidx)
			break;
	}

	return (cvd);
}

/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
    uint64_t size)
{
	return (zio_checksum_verify(spa, bp, data));
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = malloc(rc->rc_size);
		if (orig[c] != NULL) {
			bcopy(rc->rc_data, orig[c], rc->rc_size);
		} else {
			printf("Out of memory\n");
		}
	}

	vdev_raidz_generate_parity(rm);

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (orig[c] == NULL ||
		    bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			rc->rc_error = ECKSUM;
			ret++;
		}
		free(orig[c]);
	}

	return (ret);
}
/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 */
static int
vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
    void *data, off_t offset, uint64_t bytes, int total_errors, int data_errors)
{
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = malloc(rm->rm_col[0].rc_size);
		if (orig[n - 1] == NULL) {
			ret = ENOMEM;
			goto done;
		}

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	n--;
done:
	for (i = n - 1; i >= 0; i--) {
		free(orig[i]);
	}

	return (ret);
}
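/*
 * An illustrative trace (not original commentary): with rm_cols == 5,
 * no column carrying a known error, and n == 2, the do/while above walks
 * tgts through (0,1), (0,2), (1,2), (0,3), (1,3), (2,3), (0,4), (1,4),
 * (2,4), (3,4) -- every 2-combination of columns -- retrying
 * reconstruction until some combination yields a valid checksum.
 */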
static int
vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
    off_t offset, size_t bytes)
{
	vdev_t *tvd = vd->v_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, error;
	int unexpected_errors;
	int parity_errors;
	int parity_untried;
	int data_errors;
	int total_errors;
	int n;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	rc = NULL;	/* gcc */
	error = 0;

	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
	    vd->v_nchildren, vd->v_nparity);
	if (rm == NULL)
		return (ENOMEM);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vdev_child(vd, rc->rc_devidx);
		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
#if 0	/* XXX: Too hard for the boot code. */
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
#endif
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
			    rc->rc_offset, rc->rc_size);
			rc->rc_tried = 1;
			rc->rc_skipped = 0;
		}
	}

reconstruct:
	unexpected_errors = 0;
	parity_errors = 0;
	parity_untried = 0;
	data_errors = 0;
	total_errors = 0;

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		int rv;

		if (data_errors == 0) {
			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed), regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read
	 * error or a child silently returned bad data. Read every
	 * block so we can try again with as much data and parity as
	 * we can track down. If we've already been through once
	 * before, all children will be marked as tried so we'll
	 * proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	n = 0;
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_tried)
			continue;

		cvd = vdev_child(vd, rc->rc_devidx);
		ASSERT(cvd != NULL);
		rc->rc_error = cvd->v_read(cvd, NULL,
		    rc->rc_data, rc->rc_offset, rc->rc_size);
		if (rc->rc_error == 0)
			n++;
		rc->rc_tried = 1;
		rc->rc_skipped = 0;
	}
	/*
	 * If we managed to read anything more, retry the
	 * reconstruction.
	 */
	if (n > 0)
		goto reconstruct;

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns.
	 * There must, therefore, be one or more additional problems -- silent
	 * errors resulting in invalid data rather than explicit I/O errors
	 * resulting in absent data. We check if there is enough additional
	 * data to possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		error = EIO;
	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(vd->v_spa, rm, bp, data, offset, bytes,
	    total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, provided the I/O wasn't speculative.
		 */
		error = ECKSUM;
	}

done:
	vdev_raidz_map_free(rm);

	return (error);
}
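/*
 * A usage sketch (based on how the boot code is typically wired up, not
 * part of this file): pool-discovery code points a raidz vdev's read
 * callback at this routine, so a top-level read becomes
 *
 *	vdev->v_read = vdev_raidz_read;
 *	...
 *	error = vdev->v_read(vdev, bp, buf, offset, psize);
 *
 * with parity regeneration and combinatorial recovery handled
 * transparently by the machinery above.
 */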