/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

static uint64_t zfs_crc64_table[256];

#define ECKSUM  666

#define ASSERT3S(x, y, z)       ((void)0)
#define ASSERT3U(x, y, z)       ((void)0)
#define ASSERT3P(x, y, z)       ((void)0)
#define ASSERT0(x)              ((void)0)
#define ASSERT(x)               ((void)0)

#define panic(...)      do {                                            \
        printf(__VA_ARGS__);                                            \
        for (;;) ;                                                      \
} while (0)

#define kmem_alloc(size, flag)  zfs_alloc((size))
#define kmem_free(ptr, size)    zfs_free((ptr), (size))

static void
zfs_init_crc(void)
{
        int i, j;
        uint64_t *ct;

        /*
         * Calculate the crc64 table (used for the zap hash
         * function).
         */
        if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
                memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
                for (i = 0; i < 256; i++)
                        for (ct = zfs_crc64_table + i, *ct = i, j = 8;
                            j > 0; j--)
                                *ct = (*ct >> 1) ^
                                    (-(*ct & 1) & ZFS_CRC64_POLY);
        }
}

static void
zio_checksum_off(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
        ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}

/*
 * Signature for checksum functions.
 */
typedef void zio_checksum_t(const void *data, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp);
typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
typedef void zio_checksum_tmpl_free_t(void *ctx_template);

typedef enum zio_checksum_flags {
        /* Strong enough for metadata? */
        ZCHECKSUM_FLAG_METADATA = (1 << 1),
        /* ZIO embedded checksum */
        ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
        /* Strong enough for dedup (without verification)? */
        ZCHECKSUM_FLAG_DEDUP = (1 << 3),
        /* Uses salt value */
        ZCHECKSUM_FLAG_SALTED = (1 << 4),
        /* Strong enough for nopwrite? */
        ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
} zio_checksum_flags_t;
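/*
 * Illustrative note (not from upstream): the branchless table update above,
 * (-(*ct & 1) & ZFS_CRC64_POLY), is equivalent to the usual reflected CRC
 * step written with a branch:
 */
#if 0
        for (j = 8; j > 0; j--) {
                if (*ct & 1)
                        *ct = (*ct >> 1) ^ ZFS_CRC64_POLY;
                else
                        *ct = *ct >> 1;
        }
#endif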
/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
        /* checksum function for each byteorder */
        zio_checksum_t          *ci_func[2];
        zio_checksum_tmpl_init_t *ci_tmpl_init;
        zio_checksum_tmpl_free_t *ci_tmpl_free;
        zio_checksum_flags_t    ci_flags;
        const char              *ci_name;       /* descriptive name */
} zio_checksum_info_t;

#include "blkptr.c"

#include "fletcher.c"
#include "sha256.c"
#include "skein_zfs.c"

static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
        {{NULL, NULL}, NULL, NULL, 0, "inherit"},
        {{NULL, NULL}, NULL, NULL, 0, "on"},
        {{zio_checksum_off, zio_checksum_off}, NULL, NULL, 0, "off"},
        {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
            ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
        {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
            ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
        {{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
            ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
        {{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
            0, "fletcher2"},
        {{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
            ZCHECKSUM_FLAG_METADATA, "fletcher4"},
        {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
            ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
            ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
        {{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
            ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
        {{zio_checksum_off, zio_checksum_off}, NULL, NULL,
            0, "noparity"},
        {{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap},
            NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
            ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
        {{zio_checksum_skein_native, zio_checksum_skein_byteswap},
            zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
            ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
            ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
        /* no edonr for now */
        {{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA |
            ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"}
};
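/*
 * Usage sketch (not from upstream): a block's checksum vector is selected by
 * the enum stored in the block pointer, and the second slot handles blocks
 * written in the other byteorder.  ZIO_CHECKSUM_FLETCHER_4 is assumed to be
 * one of the enum zio_checksum values defined in the surrounding headers.
 */
#if 0
        zio_checksum_info_t *ci = &zio_checksum_table[ZIO_CHECKSUM_FLETCHER_4];
        zio_cksum_t zc;

        ci->ci_func[0](data, size, NULL, &zc);  /* native byteorder */
        ci->ci_func[1](data, size, NULL, &zc);  /* byteswapped data */
#endif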
/*
 * Common signature for all zio compress/decompress functions.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
        zio_compress_func_t     *ci_compress;   /* compression function */
        zio_decompress_func_t   *ci_decompress; /* decompression function */
        int                     ci_level;       /* level parameter */
        const char              *ci_name;       /* algorithm name */
} zio_compress_info_t;

#include "lzjb.c"
#include "zle.c"
#include "lz4.c"

/*
 * Compression vectors.
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
        {NULL, NULL, 0, "inherit"},
        {NULL, NULL, 0, "on"},
        {NULL, NULL, 0, "uncompressed"},
        {NULL, lzjb_decompress, 0, "lzjb"},
        {NULL, NULL, 0, "empty"},
        {NULL, NULL, 1, "gzip-1"},
        {NULL, NULL, 2, "gzip-2"},
        {NULL, NULL, 3, "gzip-3"},
        {NULL, NULL, 4, "gzip-4"},
        {NULL, NULL, 5, "gzip-5"},
        {NULL, NULL, 6, "gzip-6"},
        {NULL, NULL, 7, "gzip-7"},
        {NULL, NULL, 8, "gzip-8"},
        {NULL, NULL, 9, "gzip-9"},
        {NULL, zle_decompress, 64, "zle"},
        {NULL, lz4_decompress, 0, "lz4"},
};

static void
byteswap_uint64_array(void *vbuf, size_t size)
{
        uint64_t *buf = vbuf;
        size_t count = size >> 3;
        int i;

        ASSERT((size & 7) == 0);

        for (i = 0; i < count; i++)
                buf[i] = BSWAP_64(buf[i]);
}

/*
 * Set the external verifier for a gang block based on <vdev, offset, txg>,
 * a tuple which is guaranteed to be unique for the life of the pool.
 */
static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
{
        const dva_t *dva = BP_IDENTITY(bp);
        uint64_t txg = BP_PHYSICAL_BIRTH(bp);

        ASSERT(BP_IS_GANG(bp));

        ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
}

/*
 * Set the external verifier for a label block based on its offset.
 * The vdev is implicit, and the txg is unknowable at pool open time --
 * hence the logic in vdev_uberblock_load() to find the most recent copy.
 */
static void
zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
{
        ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
}

/*
 * Calls the template init function of a checksum which supports context
 * templates and installs the template into the spa_t.
 */
static void
zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
{
        zio_checksum_info_t *ci = &zio_checksum_table[checksum];

        if (ci->ci_tmpl_init == NULL)
                return;

        if (spa->spa_cksum_tmpls[checksum] == NULL) {
                spa->spa_cksum_tmpls[checksum] =
                    ci->ci_tmpl_init(&spa->spa_cksum_salt);
        }
}
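/*
 * Lifecycle sketch (not from upstream): salted algorithms such as skein keep
 * one lazily-built context template per pool, reused across blocks and torn
 * down with the spa.  ZIO_CHECKSUM_SKEIN is assumed to come from the
 * surrounding headers.
 */
#if 0
        zio_checksum_template_init(ZIO_CHECKSUM_SKEIN, spa); /* built once */
        /* ... zio_checksum_verify() reuses spa->spa_cksum_tmpls[...] ... */
        zio_checksum_templates_free(spa);       /* on spa teardown */
#endif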
/*
 * Called by a spa_t that's about to be deallocated. This steps through
 * all of the checksum context templates and deallocates any that were
 * initialized using the algorithm-specific template init function.
 */
static void __unused
zio_checksum_templates_free(spa_t *spa)
{
        for (enum zio_checksum checksum = 0;
            checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
                if (spa->spa_cksum_tmpls[checksum] != NULL) {
                        zio_checksum_info_t *ci = &zio_checksum_table[checksum];

                        ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
                        spa->spa_cksum_tmpls[checksum] = NULL;
                }
        }
}

static int
zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
{
        uint64_t size;
        unsigned int checksum;
        zio_checksum_info_t *ci;
        void *ctx = NULL;
        zio_cksum_t actual_cksum, expected_cksum, verifier;
        int byteswap;

        checksum = BP_GET_CHECKSUM(bp);
        size = BP_GET_PSIZE(bp);

        if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
                return (EINVAL);
        ci = &zio_checksum_table[checksum];
        if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
                return (EINVAL);

        if (spa != NULL) {
                zio_checksum_template_init(checksum, __DECONST(spa_t *, spa));
                ctx = spa->spa_cksum_tmpls[checksum];
        }

        if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
                zio_eck_t *eck;

                ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
                    checksum == ZIO_CHECKSUM_LABEL);

                eck = (zio_eck_t *)((char *)data + size) - 1;

                if (checksum == ZIO_CHECKSUM_GANG_HEADER)
                        zio_checksum_gang_verifier(&verifier, bp);
                else if (checksum == ZIO_CHECKSUM_LABEL)
                        zio_checksum_label_verifier(&verifier,
                            DVA_GET_OFFSET(BP_IDENTITY(bp)));
                else
                        verifier = bp->blk_cksum;

                byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

                if (byteswap)
                        byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

                expected_cksum = eck->zec_cksum;
                eck->zec_cksum = verifier;
                ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
                eck->zec_cksum = expected_cksum;

                if (byteswap)
                        byteswap_uint64_array(&expected_cksum,
                            sizeof (zio_cksum_t));
        } else {
                expected_cksum = bp->blk_cksum;
                ci->ci_func[0](data, size, ctx, &actual_cksum);
        }

        if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
                /*printf("ZFS: read checksum %s failed\n", ci->ci_name);*/
                return (EIO);
        }

        return (0);
}

static int
zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
    void *dest, uint64_t destsize)
{
        zio_compress_info_t *ci;

        if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
                printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
                return (EIO);
        }

        ci = &zio_compress_table[cpfunc];
        if (!ci->ci_decompress) {
                printf("ZFS: unsupported compression algorithm %s\n",
                    ci->ci_name);
                return (EIO);
        }

        return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
}

static uint64_t
zap_hash(uint64_t salt, const char *name)
{
        const uint8_t *cp;
        uint8_t c;
        uint64_t crc = salt;

        ASSERT(crc != 0);
        ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
        for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
                crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];

        /*
         * Only use 28 bits, since we need 4 bits in the cookie for the
         * collision differentiator. We MUST use the high bits, since
         * those are the ones that we first pay attention to when
         * choosing the bucket.
         */
        crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);

        return (crc);
}
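/*
 * Usage sketch (not from upstream): a fat ZAP picks its bucket from the
 * high-order bits of the masked hash, which is why the mask above keeps the
 * top ZAP_HASHBITS bits.  zap_salt and prefix_len here are hypothetical
 * names for the on-disk salt and bucket prefix length.
 */
#if 0
        uint64_t h = zap_hash(zap_salt, "com.example:propname");
        uint64_t bucket = h >> (64 - prefix_len);
#endif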
static void *zfs_alloc(size_t size);
static void zfs_free(void *ptr, size_t size);

typedef struct raidz_col {
        uint64_t rc_devidx;     /* child device index for I/O */
        uint64_t rc_offset;     /* device offset */
        uint64_t rc_size;       /* I/O size */
        void *rc_data;          /* I/O data */
        int rc_error;           /* I/O error for this device */
        uint8_t rc_tried;       /* Did we attempt this I/O column? */
        uint8_t rc_skipped;     /* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
        uint64_t rm_cols;               /* Regular column count */
        uint64_t rm_scols;              /* Count including skipped columns */
        uint64_t rm_bigcols;            /* Number of oversized columns */
        uint64_t rm_asize;              /* Actual total I/O size */
        uint64_t rm_missingdata;        /* Count of missing data devices */
        uint64_t rm_missingparity;      /* Count of missing parity devices */
        uint64_t rm_firstdatacol;       /* First data column/parity count */
        uint64_t rm_nskip;              /* Skipped sectors for padding */
        uint64_t rm_skipstart;          /* Column index of padding start */
        uintptr_t rm_reports;           /* # of referencing checksum reports */
        uint8_t rm_freed;               /* map no longer has referencing ZIO */
        uint8_t rm_ecksuminjected;      /* checksum error was injected */
        raidz_col_t rm_col[1];          /* Flexible array of I/O columns */
} raidz_map_t;

#define VDEV_RAIDZ_P    0
#define VDEV_RAIDZ_Q    1
#define VDEV_RAIDZ_R    2

#define VDEV_RAIDZ_MUL_2(x)     (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define VDEV_RAIDZ_MUL_4(x)     (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define VDEV_RAIDZ_64MUL_2(x, mask)                                     \
{                                                                       \
        (mask) = (x) & 0x8080808080808080ULL;                           \
        (mask) = ((mask) << 1) - ((mask) >> 7);                         \
        (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^                    \
            ((mask) & 0x1d1d1d1d1d1d1d1dULL);                           \
}

#define VDEV_RAIDZ_64MUL_4(x, mask)                                     \
{                                                                       \
        VDEV_RAIDZ_64MUL_2((x), mask);                                  \
        VDEV_RAIDZ_64MUL_2((x), mask);                                  \
}
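/*
 * Equivalence sketch (not from upstream): the 64-bit form multiplies all
 * eight byte lanes by 2 in GF(2^8) at once; the 0xfe mask keeps the
 * shifted-out bit of one lane from leaking into the next.
 */
#if 0
        uint64_t x64 = x, mask;
        int i;

        VDEV_RAIDZ_64MUL_2(x64, mask);
        for (i = 0; i < 8; i++)
                ASSERT(((x64 >> (8 * i)) & 0xff) ==
                    (VDEV_RAIDZ_MUL_2((x >> (8 * i)) & 0xff) & 0xff));
#endif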
/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 */
static const uint8_t vdev_raidz_pow2[256] = {
        0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
        0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
        0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
        0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
        0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
        0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
        0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
        0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
        0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
        0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
        0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
        0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
        0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
        0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
        0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
        0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
        0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
        0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
        0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
        0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
        0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
        0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
        0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
        0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
        0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
        0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
        0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
        0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
        0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
        0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
        0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
        0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
        0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
        0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
        0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
        0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
        0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
        0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
        0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
        0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
        0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
        0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
        0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
        0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
        0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
        0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
        0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
        0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
        0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
        0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
        0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
        0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
        0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
        0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
        0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
        0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
        0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
        0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
        0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
        0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
        0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
        0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
        0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
        0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
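/*
 * Sketch (not from upstream): general GF(2^8) multiplication via the tables
 * above -- add the logs mod 255 and exponentiate.  gf_mul is a hypothetical
 * helper; only vdev_raidz_exp2() below is actually used here.
 */
#if 0
static uint8_t
gf_mul(uint8_t a, uint8_t b)
{
        int lg;

        if (a == 0 || b == 0)
                return (0);
        lg = vdev_raidz_log2[a] + vdev_raidz_log2[b];
        if (lg >= 255)
                lg -= 255;
        return (vdev_raidz_pow2[lg]);
}
#endif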
/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint8_t a, int exp)
{
        if (a == 0)
                return (0);

        ASSERT(exp >= 0);
        ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

        exp += vdev_raidz_log2[a];
        if (exp > 255)
                exp -= 255;

        return (vdev_raidz_pow2[exp]);
}

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
        uint64_t *p, *src, pcount, ccount, i;
        int c;

        pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

        for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
                src = rm->rm_col[c].rc_data;
                p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
                ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

                if (c == rm->rm_firstdatacol) {
                        ASSERT(ccount == pcount);
                        for (i = 0; i < ccount; i++, src++, p++) {
                                *p = *src;
                        }
                } else {
                        ASSERT(ccount <= pcount);
                        for (i = 0; i < ccount; i++, src++, p++) {
                                *p ^= *src;
                        }
                }
        }
}

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
        uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
        int c;

        pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
        ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
            rm->rm_col[VDEV_RAIDZ_Q].rc_size);

        for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
                src = rm->rm_col[c].rc_data;
                p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
                q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

                ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

                if (c == rm->rm_firstdatacol) {
                        ASSERT(ccnt == pcnt || ccnt == 0);
                        for (i = 0; i < ccnt; i++, src++, p++, q++) {
                                *p = *src;
                                *q = *src;
                        }
                        for (; i < pcnt; i++, src++, p++, q++) {
                                *p = 0;
                                *q = 0;
                        }
                } else {
                        ASSERT(ccnt <= pcnt);

                        /*
                         * Apply the algorithm described above by multiplying
                         * the previous result and adding in the new value.
                         */
                        for (i = 0; i < ccnt; i++, src++, p++, q++) {
                                *p ^= *src;

                                VDEV_RAIDZ_64MUL_2(*q, mask);
                                *q ^= *src;
                        }

                        /*
                         * Treat short columns as though they are full of 0s.
                         * Note that there's therefore nothing needed for P.
                         */
                        for (; i < pcnt; i++, q++) {
                                VDEV_RAIDZ_64MUL_2(*q, mask);
                        }
                }
        }
}
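/*
 * Worked example (not from upstream): the Q column is a Horner-style
 * evaluation, so for three data columns d0, d1, d2 the loop above computes
 * q = 4*d0 ^ 2*d1 ^ d2 in GF(2^8) -- the (4 2 1) row of the Vandermonde
 * matrix described below.
 */
#if 0
        uint8_t q;

        q = d0;
        q = VDEV_RAIDZ_MUL_2(q) ^ d1;
        q = VDEV_RAIDZ_MUL_2(q) ^ d2;
#endif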
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
        uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
        int c;

        pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
        ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
            rm->rm_col[VDEV_RAIDZ_Q].rc_size);
        ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
            rm->rm_col[VDEV_RAIDZ_R].rc_size);

        for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
                src = rm->rm_col[c].rc_data;
                p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
                q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
                r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

                ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

                if (c == rm->rm_firstdatacol) {
                        ASSERT(ccnt == pcnt || ccnt == 0);
                        for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
                                *p = *src;
                                *q = *src;
                                *r = *src;
                        }
                        for (; i < pcnt; i++, src++, p++, q++, r++) {
                                *p = 0;
                                *q = 0;
                                *r = 0;
                        }
                } else {
                        ASSERT(ccnt <= pcnt);

                        /*
                         * Apply the algorithm described above by multiplying
                         * the previous result and adding in the new value.
                         */
                        for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
                                *p ^= *src;

                                VDEV_RAIDZ_64MUL_2(*q, mask);
                                *q ^= *src;

                                VDEV_RAIDZ_64MUL_4(*r, mask);
                                *r ^= *src;
                        }

                        /*
                         * Treat short columns as though they are full of 0s.
                         * Note that there's therefore nothing needed for P.
                         */
                        for (; i < pcnt; i++, q++, r++) {
                                VDEV_RAIDZ_64MUL_2(*q, mask);
                                VDEV_RAIDZ_64MUL_4(*r, mask);
                        }
                }
        }
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
        switch (rm->rm_firstdatacol) {
        case 1:
                vdev_raidz_generate_parity_p(rm);
                break;
        case 2:
                vdev_raidz_generate_parity_pq(rm);
                break;
        case 3:
                vdev_raidz_generate_parity_pqr(rm);
                break;
        default:
                panic("invalid RAID-Z configuration");
        }
}

/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen both for simplicity and speed
 * of computation as well as linear separability.
 *
 *      __         __                     __     __
 *      |   1 ..  1 1 1 |                 |  p_0  |
 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 *      |   1 ..  0 0 0 |   |  D_1  |   |  d_0  |
 *      |   0 ..  0 0 0 | x |  D_2  | = |  d_1  |
 *      |   : :  : : : |   |   :   |   |  d_2  |
 *      |   0 ..  1 0 0 |   | D_n-1 |   |   :   |
 *      |   0 ..  0 1 0 |   ~~     ~~   |   :   |
 *      |   0 ..  0 0 1 |               | d_n-1 |
 *      ~~         ~~                     ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination.
 In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           | 128  64  32  16   8   4   2   1  | <-----+-+-- missing disks
 *           |  19 205 116  29  64  16   4   1  |      / /
 *           |  1   0   0   0   0   0   0   0  |     / /
 *           |  0   1   0   0   0   0   0   0  | <--' /
 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
 *           |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           |  19 205 116  29  64  16   4   1  |
 *           |  1   0   0   0   0   0   0   0  |
 *  (V|I)' = |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 * __                                                                 __
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0  |
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0  205 116  0   0   0   0   0     0   1  19  29  64  16   4   1  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0  185  0   0   0   0   0    205   1  222 208 141 221 201 204 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0   1   0   0   0   0   0    166 100  4  40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   0   0   0   0   0   0    167 100  5  41 159 169 217 208 |
 * |  0   0   1   0   0   0   0   0    166 100  4  40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 *                __                               __
 *                |  0   0   1   0   0   0   0   0  |
 *                | 167 100  5  41 159 169 217 208  |
 *                | 166 100  4  40 158 168 216 209  |
 *  (V|I)'^-1 =   |  0   0   0   1   0   0   0   0  |
 *                |  0   0   0   0   1   0   0   0  |
 *                |  0   0   0   0   0   1   0   0  |
 *                |  0   0   0   0   0   0   1   0  |
 *                |  0   0   0   0   0   0   0   1  |
 *                ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */

static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
        int i, j;
        int pow;

        ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

        /*
         * Fill in the missing rows of interest.
         */
        for (i = 0; i < nmap; i++) {
                ASSERT3S(0, <=, map[i]);
                ASSERT3S(map[i], <=, 2);

                pow = map[i] * n;
                if (pow > 255)
                        pow -= 255;
                ASSERT(pow <= 255);

                for (j = 0; j < n; j++) {
                        pow -= map[i];
                        if (pow < 0)
                                pow += 255;
                        rows[i][j] = vdev_raidz_pow2[pow];
                }
        }
}
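/*
 * Worked example (not from upstream): for n = 4 data columns and map[i] = 1
 * (the Q row), the loop above walks pow = 3, 2, 1, 0 and produces the row
 * (2^3 2^2 2^1 2^0) = (8 4 2 1), i.e. the Q line of the Vandermonde matrix
 * pictured above.  P (map 0) yields (1 1 1 1) and R (map 2) yields
 * (64 16 4 1).
 */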
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
        int i, j, ii, jj;
        uint8_t log;

        /*
         * Assert that the first nmissing entries from the array of used
         * columns correspond to parity columns and that subsequent entries
         * correspond to data columns.
         */
        for (i = 0; i < nmissing; i++) {
                ASSERT3S(used[i], <, rm->rm_firstdatacol);
        }
        for (; i < n; i++) {
                ASSERT3S(used[i], >=, rm->rm_firstdatacol);
        }

        /*
         * First initialize the storage where we'll compute the inverse rows.
         */
        for (i = 0; i < nmissing; i++) {
                for (j = 0; j < n; j++) {
                        invrows[i][j] = (i == j) ? 1 : 0;
                }
        }

        /*
         * Subtract all trivial rows from the rows of consequence.
         */
        for (i = 0; i < nmissing; i++) {
                for (j = nmissing; j < n; j++) {
                        ASSERT3U(used[j], >=, rm->rm_firstdatacol);
                        jj = used[j] - rm->rm_firstdatacol;
                        ASSERT3S(jj, <, n);
                        invrows[i][j] = rows[i][jj];
                        rows[i][jj] = 0;
                }
        }

        /*
         * For each of the rows of interest, we must normalize it and subtract
         * a multiple of it from the other rows.
         */
        for (i = 0; i < nmissing; i++) {
                for (j = 0; j < missing[i]; j++) {
                        ASSERT3U(rows[i][j], ==, 0);
                }
                ASSERT3U(rows[i][missing[i]], !=, 0);

                /*
                 * Compute the inverse of the first element and multiply each
                 * element in the row by that value.
                 */
                log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

                for (j = 0; j < n; j++) {
                        rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
                        invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
                }

                for (ii = 0; ii < nmissing; ii++) {
                        if (i == ii)
                                continue;

                        ASSERT3U(rows[ii][missing[i]], !=, 0);

                        log = vdev_raidz_log2[rows[ii][missing[i]]];

                        for (j = 0; j < n; j++) {
                                rows[ii][j] ^=
                                    vdev_raidz_exp2(rows[i][j], log);
                                invrows[ii][j] ^=
                                    vdev_raidz_exp2(invrows[i][j], log);
                        }
                }
        }

        /*
         * Verify that the data left in the rows is properly part of
         * an identity matrix.
         */
        for (i = 0; i < nmissing; i++) {
                for (j = 0; j < n; j++) {
                        if (j == missing[i]) {
                                ASSERT3U(rows[i][j], ==, 1);
                        } else {
                                ASSERT3U(rows[i][j], ==, 0);
                        }
                }
        }
}

static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
        int i, j, x, cc, c;
        uint8_t *src;
        uint64_t ccount;
        uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
        uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
        uint8_t log, val;
        int ll;
        uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
        uint8_t *p, *pp;
        size_t psize;

        log = 0;        /* gcc */
        psize = sizeof (invlog[0][0]) * n * nmissing;
        p = zfs_alloc(psize);

        for (pp = p, i = 0; i < nmissing; i++) {
                invlog[i] = pp;
                pp += n;
        }

        for (i = 0; i < nmissing; i++) {
                for (j = 0; j < n; j++) {
                        ASSERT3U(invrows[i][j], !=, 0);
                        invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
                }
        }

        for (i = 0; i < n; i++) {
                c = used[i];
                ASSERT3U(c, <, rm->rm_cols);

                src = rm->rm_col[c].rc_data;
                ccount = rm->rm_col[c].rc_size;
                for (j = 0; j < nmissing; j++) {
                        cc = missing[j] + rm->rm_firstdatacol;
                        ASSERT3U(cc, >=, rm->rm_firstdatacol);
                        ASSERT3U(cc, <, rm->rm_cols);
                        ASSERT3U(cc, !=, c);

                        dst[j] = rm->rm_col[cc].rc_data;
                        dcount[j] = rm->rm_col[cc].rc_size;
                }

                ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

                for (x = 0; x < ccount; x++, src++) {
                        if (*src != 0)
                                log = vdev_raidz_log2[*src];

                        for (cc = 0; cc < nmissing; cc++) {
                                if (x >= dcount[cc])
                                        continue;

                                if (*src == 0) {
                                        val = 0;
                                } else {
                                        if ((ll = log + invlog[cc][i]) >= 255)
                                                ll -= 255;
                                        val = vdev_raidz_pow2[ll];
                                }

                                if (i == 0)
                                        dst[cc][x] = val;
                                else
                                        dst[cc][x] ^= val;
                        }
                }
        }

        zfs_free(p, psize);
}
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
        int n, i, c, t, tt;
        int nmissing_rows;
        int missing_rows[VDEV_RAIDZ_MAXPARITY];
        int parity_map[VDEV_RAIDZ_MAXPARITY];

        uint8_t *p, *pp;
        size_t psize;

        uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
        uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
        uint8_t *used;

        int code = 0;

        n = rm->rm_cols - rm->rm_firstdatacol;

        /*
         * Figure out which data columns are missing.
         */
        nmissing_rows = 0;
        for (t = 0; t < ntgts; t++) {
                if (tgts[t] >= rm->rm_firstdatacol) {
                        missing_rows[nmissing_rows++] =
                            tgts[t] - rm->rm_firstdatacol;
                }
        }

        /*
         * Figure out which parity columns to use to help generate the missing
         * data columns.
         */
        for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
                ASSERT(tt < ntgts);
                ASSERT(c < rm->rm_firstdatacol);

                /*
                 * Skip any targeted parity columns.
                 */
                if (c == tgts[tt]) {
                        tt++;
                        continue;
                }

                code |= 1 << c;

                parity_map[i] = c;
                i++;
        }

        ASSERT(code != 0);
        ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

        psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
            nmissing_rows * n + sizeof (used[0]) * n;
        p = kmem_alloc(psize, KM_SLEEP);

        for (pp = p, i = 0; i < nmissing_rows; i++) {
                rows[i] = pp;
                pp += n;
                invrows[i] = pp;
                pp += n;
        }
        used = pp;

        for (i = 0; i < nmissing_rows; i++) {
                used[i] = parity_map[i];
        }

        for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
                if (tt < nmissing_rows &&
                    c == missing_rows[tt] + rm->rm_firstdatacol) {
                        tt++;
                        continue;
                }

                ASSERT3S(i, <, n);
                used[i] = c;
                i++;
        }

        /*
         * Initialize the interesting rows of the matrix.
         */
        vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

        /*
         * Invert the matrix.
         */
        vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
            invrows, used);

        /*
         * Reconstruct the missing data using the generated matrix.
         */
        vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
            invrows, used);

        kmem_free(p, psize);

        return (code);
}

static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
        int tgts[VDEV_RAIDZ_MAXPARITY];
        int ntgts;
        int i, c;
        int code;
        int nbadparity, nbaddata;

        /*
         * The tgts list must already be sorted.
         */
        for (i = 1; i < nt; i++) {
                ASSERT(t[i] > t[i - 1]);
        }

        nbadparity = rm->rm_firstdatacol;
        nbaddata = rm->rm_cols - nbadparity;
        ntgts = 0;
        for (i = 0, c = 0; c < rm->rm_cols; c++) {
                if (i < nt && c == t[i]) {
                        tgts[ntgts++] = c;
                        i++;
                } else if (rm->rm_col[c].rc_error != 0) {
                        tgts[ntgts++] = c;
                } else if (c >= rm->rm_firstdatacol) {
                        nbaddata--;
                } else {
                        nbadparity--;
                }
        }

        ASSERT(ntgts >= nt);
        ASSERT(nbaddata >= 0);
        ASSERT(nbaddata + nbadparity == ntgts);

        code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
        ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
        ASSERT(code > 0);
        return (code);
}
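/*
 * Usage sketch (not from upstream): rebuild the third data column of a map
 * (assuming its device already has rc_error set or is listed as a target);
 * the return code encodes which parity columns were used.
 */
#if 0
        int tgts[1] = { rm->rm_firstdatacol + 2 };

        (void) vdev_raidz_reconstruct(rm, tgts, 1);
#endif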
static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size,
    uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
{
        raidz_map_t *rm;
        uint64_t b = offset >> unit_shift;
        uint64_t s = size >> unit_shift;
        uint64_t f = b % dcols;
        uint64_t o = (b / dcols) << unit_shift;
        uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

        q = s / (dcols - nparity);
        r = s - q * (dcols - nparity);
        bc = (r == 0 ? 0 : r + nparity);
        tot = s + nparity * (q + (r == 0 ? 0 : 1));

        if (q == 0) {
                acols = bc;
                scols = MIN(dcols, roundup(bc, nparity + 1));
        } else {
                acols = dcols;
                scols = dcols;
        }

        ASSERT3U(acols, <=, scols);

        rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));

        rm->rm_cols = acols;
        rm->rm_scols = scols;
        rm->rm_bigcols = bc;
        rm->rm_skipstart = bc;
        rm->rm_missingdata = 0;
        rm->rm_missingparity = 0;
        rm->rm_firstdatacol = nparity;
        rm->rm_reports = 0;
        rm->rm_freed = 0;
        rm->rm_ecksuminjected = 0;

        asize = 0;

        for (c = 0; c < scols; c++) {
                col = f + c;
                coff = o;
                if (col >= dcols) {
                        col -= dcols;
                        coff += 1ULL << unit_shift;
                }
                rm->rm_col[c].rc_devidx = col;
                rm->rm_col[c].rc_offset = coff;
                rm->rm_col[c].rc_data = NULL;
                rm->rm_col[c].rc_error = 0;
                rm->rm_col[c].rc_tried = 0;
                rm->rm_col[c].rc_skipped = 0;

                if (c >= acols)
                        rm->rm_col[c].rc_size = 0;
                else if (c < bc)
                        rm->rm_col[c].rc_size = (q + 1) << unit_shift;
                else
                        rm->rm_col[c].rc_size = q << unit_shift;

                asize += rm->rm_col[c].rc_size;
        }

        ASSERT3U(asize, ==, tot << unit_shift);
        rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
        rm->rm_nskip = roundup(tot, nparity + 1) - tot;
        ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
        ASSERT3U(rm->rm_nskip, <=, nparity);

        for (c = 0; c < rm->rm_firstdatacol; c++)
                rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);

        rm->rm_col[c].rc_data = data;

        for (c = c + 1; c < acols; c++)
                rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
                    rm->rm_col[c - 1].rc_size;

        /*
         * If all data stored spans all columns, there's a danger that parity
         * will always be on the same device and, since parity isn't read
         * during normal operation, that that device's I/O bandwidth won't be
         * used effectively. We therefore switch the parity every 1MB.
         *
         * ... at least that was, ostensibly, the theory. As a practical
         * matter unless we juggle the parity between all devices evenly, we
         * won't see any benefit. Further, occasional writes that aren't a
         * multiple of the LCM of the number of children and the minimum
         * stripe width are sufficient to avoid pessimal behavior.
         * Unfortunately, this decision created an implicit on-disk format
         * requirement that we need to support for all eternity, but only
         * for single-parity RAID-Z.
         *
         * If we intend to skip a sector in the zeroth column for padding
         * we must make sure to note this swap. We will never intend to
         * skip the first column since at least one data and one parity
         * column must appear in each row.
         */
        ASSERT(rm->rm_cols >= 2);
        ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

        if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
                devidx = rm->rm_col[0].rc_devidx;
                o = rm->rm_col[0].rc_offset;
                rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
                rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
                rm->rm_col[1].rc_devidx = devidx;
                rm->rm_col[1].rc_offset = o;

                if (rm->rm_skipstart == 0)
                        rm->rm_skipstart = 1;
        }

        return (rm);
}
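/*
 * Worked example (not from upstream): a 4+1 raidz1 vdev with 512-byte
 * sectors (unit_shift = 9) mapping an 8-sector block:
 *
 *      s = 8, q = 8 / 4 = 2, r = 0, bc = 0, tot = 8 + 1 * 2 = 10
 *
 * so the map has five columns (one parity, four data) of two sectors each,
 * and no skip sectors (roundup(10, 2) - 10 == 0).
 */
#if 0
        rm = vdev_raidz_map_alloc(data, offset, 8 << 9, 9, 5, 1);
#endif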
static void
vdev_raidz_map_free(raidz_map_t *rm)
{
        int c;

        for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
                zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);

        zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

static vdev_t *
vdev_child(vdev_t *pvd, uint64_t devidx)
{
        vdev_t *cvd;

        STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
                if (cvd->v_id == devidx)
                        break;
        }

        return (cvd);
}

/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
    uint64_t size)
{
        return (zio_checksum_verify(spa, bp, data));
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(raidz_map_t *rm)
{
        void *orig[VDEV_RAIDZ_MAXPARITY];
        int c, ret = 0;
        raidz_col_t *rc;

        for (c = 0; c < rm->rm_firstdatacol; c++) {
                rc = &rm->rm_col[c];
                if (!rc->rc_tried || rc->rc_error != 0)
                        continue;
                orig[c] = zfs_alloc(rc->rc_size);
                bcopy(rc->rc_data, orig[c], rc->rc_size);
        }

        vdev_raidz_generate_parity(rm);

        for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
                rc = &rm->rm_col[c];
                if (!rc->rc_tried || rc->rc_error != 0)
                        continue;
                if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
                        rc->rc_error = ECKSUM;
                        ret++;
                }
                zfs_free(orig[c], rc->rc_size);
        }

        return (ret);
}

/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 */
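/*
 * Illustration (not from upstream): for a raidz3 block with no known-bad
 * columns, the loop below first tries every single-column target set, then
 * every pair, then every triple, stopping as soon as a reconstruction
 * passes the block's checksum.
 */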
static int
vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
    void *data, off_t offset, uint64_t bytes, int total_errors,
    int data_errors)
{
        raidz_col_t *rc;
        void *orig[VDEV_RAIDZ_MAXPARITY];
        int tstore[VDEV_RAIDZ_MAXPARITY + 2];
        int *tgts = &tstore[1];
        int current, next, i, c, n;
        int code, ret = 0;

        ASSERT(total_errors < rm->rm_firstdatacol);

        /*
         * This simplifies one edge condition.
         */
        tgts[-1] = -1;

        for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
                /*
                 * Initialize the targets array by finding the first n columns
                 * that contain no error.
                 *
                 * If there were no data errors, we need to ensure that we're
                 * always explicitly attempting to reconstruct at least one
                 * data column. To do this, we simply push the highest target
                 * up into the data columns.
                 */
                for (c = 0, i = 0; i < n; i++) {
                        if (i == n - 1 && data_errors == 0 &&
                            c < rm->rm_firstdatacol) {
                                c = rm->rm_firstdatacol;
                        }

                        while (rm->rm_col[c].rc_error != 0) {
                                c++;
                                ASSERT3S(c, <, rm->rm_cols);
                        }

                        tgts[i] = c++;
                }

                /*
                 * Setting tgts[n] simplifies the other edge condition.
                 */
                tgts[n] = rm->rm_cols;

                /*
                 * These buffers were allocated in previous iterations.
                 */
                for (i = 0; i < n - 1; i++) {
                        ASSERT(orig[i] != NULL);
                }

                orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);

                current = 0;
                next = tgts[current];

                while (current != n) {
                        tgts[current] = next;
                        current = 0;

                        /*
                         * Save off the original data that we're going to
                         * attempt to reconstruct.
                         */
                        for (i = 0; i < n; i++) {
                                ASSERT(orig[i] != NULL);
                                c = tgts[i];
                                ASSERT3S(c, >=, 0);
                                ASSERT3S(c, <, rm->rm_cols);
                                rc = &rm->rm_col[c];
                                bcopy(rc->rc_data, orig[i], rc->rc_size);
                        }

                        /*
                         * Attempt a reconstruction and exit the outer loop on
                         * success.
                         */
                        code = vdev_raidz_reconstruct(rm, tgts, n);
                        if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
                                for (i = 0; i < n; i++) {
                                        c = tgts[i];
                                        rc = &rm->rm_col[c];
                                        ASSERT(rc->rc_error == 0);
                                        rc->rc_error = ECKSUM;
                                }

                                ret = code;
                                goto done;
                        }

                        /*
                         * Restore the original data.
                         */
                        for (i = 0; i < n; i++) {
                                c = tgts[i];
                                rc = &rm->rm_col[c];
                                bcopy(orig[i], rc->rc_data, rc->rc_size);
                        }

                        do {
                                /*
                                 * Find the next valid column after the current
                                 * position.
                                 */
                                for (next = tgts[current] + 1;
                                    next < rm->rm_cols &&
                                    rm->rm_col[next].rc_error != 0; next++)
                                        continue;

                                ASSERT(next <= tgts[current + 1]);

                                /*
                                 * If that spot is available, we're done here.
                                 */
                                if (next != tgts[current + 1])
                                        break;

                                /*
                                 * Otherwise, find the next valid column after
                                 * the previous position.
                                 */
                                for (c = tgts[current - 1] + 1;
                                    rm->rm_col[c].rc_error != 0; c++)
                                        continue;

                                tgts[current] = c;
                                current++;

                        } while (current != n);
                }
        }
        n--;
done:
        for (i = n - 1; i >= 0; i--) {
                zfs_free(orig[i], rm->rm_col[0].rc_size);
        }

        return (ret);
}
static int
vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
    off_t offset, size_t bytes)
{
        vdev_t *tvd = vd->v_top;
        vdev_t *cvd;
        raidz_map_t *rm;
        raidz_col_t *rc;
        int c, error;
        int unexpected_errors;
        int parity_errors;
        int parity_untried;
        int data_errors;
        int total_errors;
        int n;
        int tgts[VDEV_RAIDZ_MAXPARITY];
        int code;

        rc = NULL;      /* gcc */
        error = 0;

        rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
            vd->v_nchildren, vd->v_nparity);

        /*
         * Iterate over the columns in reverse order so that we hit the parity
         * last -- any errors along the way will force us to read the parity.
         */
        for (c = rm->rm_cols - 1; c >= 0; c--) {
                rc = &rm->rm_col[c];
                cvd = vdev_child(vd, rc->rc_devidx);
                if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
                        if (c >= rm->rm_firstdatacol)
                                rm->rm_missingdata++;
                        else
                                rm->rm_missingparity++;
                        rc->rc_error = ENXIO;
                        rc->rc_tried = 1;       /* don't even try */
                        rc->rc_skipped = 1;
                        continue;
                }
#if 0   /* XXX: Too hard for the boot code. */
                if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
                        if (c >= rm->rm_firstdatacol)
                                rm->rm_missingdata++;
                        else
                                rm->rm_missingparity++;
                        rc->rc_error = ESTALE;
                        rc->rc_skipped = 1;
                        continue;
                }
#endif
                if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
                        rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
                            rc->rc_offset, rc->rc_size);
                        rc->rc_tried = 1;
                        rc->rc_skipped = 0;
                }
        }

reconstruct:
        unexpected_errors = 0;
        parity_errors = 0;
        parity_untried = 0;
        data_errors = 0;
        total_errors = 0;

        ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
        ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

        for (c = 0; c < rm->rm_cols; c++) {
                rc = &rm->rm_col[c];

                if (rc->rc_error) {
                        ASSERT(rc->rc_error != ECKSUM); /* child has no bp */

                        if (c < rm->rm_firstdatacol)
                                parity_errors++;
                        else
                                data_errors++;

                        if (!rc->rc_skipped)
                                unexpected_errors++;

                        total_errors++;
                } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
                        parity_untried++;
                }
        }

        /*
         * There are three potential phases for a read:
         *      1. produce valid data from the columns read
         *      2. read all disks and try again
         *      3. perform combinatorial reconstruction
         *
         * Each phase is progressively both more expensive and less likely to
         * occur. If we encounter more errors than we can repair or all phases
         * fail, we have no choice but to return an error.
         */

        /*
         * If the number of errors we saw was correctable -- less than or equal
         * to the number of parity disks read -- attempt to produce data that
         * has a valid checksum. Naturally, this case applies in the absence of
         * any errors.
         */
        if (total_errors <= rm->rm_firstdatacol - parity_untried) {
                if (data_errors == 0) {
                        if (raidz_checksum_verify(vd->spa, bp, data,
                            bytes) == 0) {
                                /*
                                 * If we read parity information (unnecessarily
                                 * as it happens since no reconstruction was
                                 * needed) regenerate and verify the parity.
                                 * We also regenerate parity when resilvering
                                 * so we can write it out to the failed device
                                 * later.
                                 */
                                if (parity_errors + parity_untried <
                                    rm->rm_firstdatacol) {
                                        n = raidz_parity_verify(rm);
                                        unexpected_errors += n;
                                        ASSERT(parity_errors + n <=
                                            rm->rm_firstdatacol);
                                }
                                goto done;
                        }
                } else {
                        /*
                         * We either attempt to read all the parity columns or
                         * none of them. If we didn't try to read parity, we
                         * wouldn't be here in the correctable case. There must
                         * also have been fewer parity errors than parity
                         * columns or, again, we wouldn't be in this code path.
                         */
                        ASSERT(parity_untried == 0);
                        ASSERT(parity_errors < rm->rm_firstdatacol);

                        /*
                         * Identify the data columns that reported an error.
                         */
                        n = 0;
                        for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
                                rc = &rm->rm_col[c];
                                if (rc->rc_error != 0) {
                                        ASSERT(n < VDEV_RAIDZ_MAXPARITY);
                                        tgts[n++] = c;
                                }
                        }

                        ASSERT(rm->rm_firstdatacol >= n);

                        code = vdev_raidz_reconstruct(rm, tgts, n);

                        if (raidz_checksum_verify(vd->spa, bp, data,
                            bytes) == 0) {
                                /*
                                 * If we read more parity disks than were used
                                 * for reconstruction, confirm that the other
                                 * parity disks produced correct data. This
                                 * routine is suboptimal in that it regenerates
                                 * the parity that we already used in addition
                                 * to the parity that we're attempting to
                                 * verify, but this should be a relatively
                                 * uncommon case, and can be optimized if it
                                 * becomes a problem. Note that we regenerate
                                 * parity when resilvering so we can write it
                                 * out to failed devices later.
                                 */
                                if (parity_errors < rm->rm_firstdatacol - n) {
                                        n = raidz_parity_verify(rm);
                                        unexpected_errors += n;
                                        ASSERT(parity_errors + n <=
                                            rm->rm_firstdatacol);
                                }

                                goto done;
                        }
                }
        }

        /*
         * This isn't a typical situation -- either we got a read
         * error or a child silently returned bad data. Read every
         * block so we can try again with as much data and parity as
         * we can track down. If we've already been through once
         * before, all children will be marked as tried so we'll
         * proceed to combinatorial reconstruction.
         */
        unexpected_errors = 1;
        rm->rm_missingdata = 0;
        rm->rm_missingparity = 0;

        n = 0;
        for (c = 0; c < rm->rm_cols; c++) {
                rc = &rm->rm_col[c];

                if (rc->rc_tried)
                        continue;

                cvd = vdev_child(vd, rc->rc_devidx);
                ASSERT(cvd != NULL);
                rc->rc_error = cvd->v_read(cvd, NULL,
                    rc->rc_data, rc->rc_offset, rc->rc_size);
                if (rc->rc_error == 0)
                        n++;
                rc->rc_tried = 1;
                rc->rc_skipped = 0;
        }
        /*
         * If we managed to read anything more, retry the
         * reconstruction.
         */
        if (n > 0)
                goto reconstruct;

        /*
         * At this point we've attempted to reconstruct the data given the
         * errors we detected, and we've attempted to read all columns.
         * There must, therefore, be one or more additional problems --
         * silent errors resulting in invalid data rather than explicit I/O
         * errors resulting in absent data. We check if there is enough
         * additional data to possibly reconstruct the data and then perform
         * combinatorial reconstruction over all possible combinations. If
         * that fails, we're cooked.
         */
        if (total_errors > rm->rm_firstdatacol) {
                error = EIO;
        } else if (total_errors < rm->rm_firstdatacol &&
            (code = vdev_raidz_combrec(vd->spa, rm, bp, data, offset, bytes,
             total_errors, data_errors)) != 0) {
                /*
                 * If we didn't use all the available parity for the
                 * combinatorial reconstruction, verify that the remaining
                 * parity is correct.
                 */
                if (code != (1 << rm->rm_firstdatacol) - 1)
                        (void) raidz_parity_verify(rm);
        } else {
                /*
                 * We're here because either:
                 *
                 *      total_errors == rm_firstdatacol, or
                 *      vdev_raidz_combrec() failed
                 *
                 * In either case, there is enough bad data to prevent
                 * reconstruction.
                 *
                 * Start checksum ereports for all children which haven't
                 * failed, and the IO wasn't speculative.
                 */
                error = ECKSUM;
        }

done:
        vdev_raidz_map_free(rm);

        return (error);
}