1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include <sys/cdefs.h> 27 __FBSDID("$FreeBSD$"); 28 29 #include <lz4.h> 30 31 static uint64_t zfs_crc64_table[256]; 32 33 #define ASSERT3S(x, y, z) ((void)0) 34 #define ASSERT3U(x, y, z) ((void)0) 35 #define ASSERT3P(x, y, z) ((void)0) 36 #define ASSERT0(x) ((void)0) 37 #define ASSERT(x) ((void)0) 38 39 #define panic(...) do { \ 40 printf(__VA_ARGS__); \ 41 for (;;) ; \ 42 } while (0) 43 44 static void 45 zfs_init_crc(void) 46 { 47 int i, j; 48 uint64_t *ct; 49 50 /* 51 * Calculate the crc64 table (used for the zap hash 52 * function). 53 */ 54 if (zfs_crc64_table[128] != ZFS_CRC64_POLY) { 55 memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table)); 56 for (i = 0; i < 256; i++) 57 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 58 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 59 } 60 } 61 62 static void 63 zio_checksum_off(const void *buf, uint64_t size, 64 const void *ctx_template, zio_cksum_t *zcp) 65 { 66 ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); 67 } 68 69 /* 70 * Signature for checksum functions. 71 */ 72 typedef void zio_checksum_t(const void *data, uint64_t size, 73 const void *ctx_template, zio_cksum_t *zcp); 74 typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt); 75 typedef void zio_checksum_tmpl_free_t(void *ctx_template); 76 77 typedef enum zio_checksum_flags { 78 /* Strong enough for metadata? */ 79 ZCHECKSUM_FLAG_METADATA = (1 << 1), 80 /* ZIO embedded checksum */ 81 ZCHECKSUM_FLAG_EMBEDDED = (1 << 2), 82 /* Strong enough for dedup (without verification)? */ 83 ZCHECKSUM_FLAG_DEDUP = (1 << 3), 84 /* Uses salt value */ 85 ZCHECKSUM_FLAG_SALTED = (1 << 4), 86 /* Strong enough for nopwrite? */ 87 ZCHECKSUM_FLAG_NOPWRITE = (1 << 5) 88 } zio_checksum_flags_t; 89 90 /* 91 * Information about each checksum function. 92 */ 93 typedef struct zio_checksum_info { 94 /* checksum function for each byteorder */ 95 zio_checksum_t *ci_func[2]; 96 zio_checksum_tmpl_init_t *ci_tmpl_init; 97 zio_checksum_tmpl_free_t *ci_tmpl_free; 98 zio_checksum_flags_t ci_flags; 99 const char *ci_name; /* descriptive name */ 100 } zio_checksum_info_t; 101 102 #include "blkptr.c" 103 104 #include "fletcher.c" 105 #include "sha256.c" 106 #include "skein_zfs.c" 107 108 extern int zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len, 109 size_t d_len, int n); 110 111 112 static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { 113 {{NULL, NULL}, NULL, NULL, 0, "inherit"}, 114 {{NULL, NULL}, NULL, NULL, 0, "on"}, 115 {{zio_checksum_off, zio_checksum_off}, NULL, NULL, 0, "off"}, 116 {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL, 117 ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"}, 118 {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL, 119 ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"}, 120 {{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL, 121 ZCHECKSUM_FLAG_EMBEDDED, "zilog"}, 122 {{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL, 123 0, "fletcher2"}, 124 {{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL, 125 ZCHECKSUM_FLAG_METADATA, "fletcher4"}, 126 {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL, 127 ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | 128 ZCHECKSUM_FLAG_NOPWRITE, "SHA256"}, 129 {{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL, 130 ZCHECKSUM_FLAG_EMBEDDED, "zillog2"}, 131 {{zio_checksum_off, zio_checksum_off}, NULL, NULL, 132 0, "noparity"}, 133 {{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap}, 134 NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | 135 ZCHECKSUM_FLAG_NOPWRITE, "SHA512"}, 136 {{zio_checksum_skein_native, zio_checksum_skein_byteswap}, 137 zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free, 138 ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | 139 ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"}, 140 /* no edonr for now */ 141 {{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA | 142 ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"} 143 }; 144 145 /* 146 * Common signature for all zio compress/decompress functions. 147 */ 148 typedef size_t zio_compress_func_t(void *src, void *dst, 149 size_t s_len, size_t d_len, int); 150 typedef int zio_decompress_func_t(void *src, void *dst, 151 size_t s_len, size_t d_len, int); 152 153 /* 154 * Information about each compression function. 155 */ 156 typedef struct zio_compress_info { 157 zio_compress_func_t *ci_compress; /* compression function */ 158 zio_decompress_func_t *ci_decompress; /* decompression function */ 159 int ci_level; /* level parameter */ 160 const char *ci_name; /* algorithm name */ 161 } zio_compress_info_t; 162 163 #include "lzjb.c" 164 #include "zle.c" 165 166 /* 167 * Compression vectors. 168 */ 169 static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { 170 {NULL, NULL, 0, "inherit"}, 171 {NULL, NULL, 0, "on"}, 172 {NULL, NULL, 0, "uncompressed"}, 173 {NULL, lzjb_decompress, 0, "lzjb"}, 174 {NULL, NULL, 0, "empty"}, 175 {NULL, NULL, 1, "gzip-1"}, 176 {NULL, NULL, 2, "gzip-2"}, 177 {NULL, NULL, 3, "gzip-3"}, 178 {NULL, NULL, 4, "gzip-4"}, 179 {NULL, NULL, 5, "gzip-5"}, 180 {NULL, NULL, 6, "gzip-6"}, 181 {NULL, NULL, 7, "gzip-7"}, 182 {NULL, NULL, 8, "gzip-8"}, 183 {NULL, NULL, 9, "gzip-9"}, 184 {NULL, zle_decompress, 64, "zle"}, 185 {NULL, lz4_decompress, 0, "lz4"}, 186 {NULL, zfs_zstd_decompress, ZIO_ZSTD_LEVEL_DEFAULT, "zstd"} 187 }; 188 189 static void 190 byteswap_uint64_array(void *vbuf, size_t size) 191 { 192 uint64_t *buf = vbuf; 193 size_t count = size >> 3; 194 int i; 195 196 ASSERT((size & 7) == 0); 197 198 for (i = 0; i < count; i++) 199 buf[i] = BSWAP_64(buf[i]); 200 } 201 202 /* 203 * Set the external verifier for a gang block based on <vdev, offset, txg>, 204 * a tuple which is guaranteed to be unique for the life of the pool. 205 */ 206 static void 207 zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp) 208 { 209 const dva_t *dva = BP_IDENTITY(bp); 210 uint64_t txg = BP_PHYSICAL_BIRTH(bp); 211 212 ASSERT(BP_IS_GANG(bp)); 213 214 ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0); 215 } 216 217 /* 218 * Set the external verifier for a label block based on its offset. 219 * The vdev is implicit, and the txg is unknowable at pool open time -- 220 * hence the logic in vdev_uberblock_load() to find the most recent copy. 221 */ 222 static void 223 zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset) 224 { 225 ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0); 226 } 227 228 /* 229 * Calls the template init function of a checksum which supports context 230 * templates and installs the template into the spa_t. 231 */ 232 static void 233 zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa) 234 { 235 zio_checksum_info_t *ci = &zio_checksum_table[checksum]; 236 237 if (ci->ci_tmpl_init == NULL) 238 return; 239 240 if (spa->spa_cksum_tmpls[checksum] != NULL) 241 return; 242 243 if (spa->spa_cksum_tmpls[checksum] == NULL) { 244 spa->spa_cksum_tmpls[checksum] = 245 ci->ci_tmpl_init(&spa->spa_cksum_salt); 246 } 247 } 248 249 /* 250 * Called by a spa_t that's about to be deallocated. This steps through 251 * all of the checksum context templates and deallocates any that were 252 * initialized using the algorithm-specific template init function. 253 */ 254 static void __unused 255 zio_checksum_templates_free(spa_t *spa) 256 { 257 for (enum zio_checksum checksum = 0; 258 checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) { 259 if (spa->spa_cksum_tmpls[checksum] != NULL) { 260 zio_checksum_info_t *ci = &zio_checksum_table[checksum]; 261 262 ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]); 263 spa->spa_cksum_tmpls[checksum] = NULL; 264 } 265 } 266 } 267 268 static int 269 zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data) 270 { 271 uint64_t size; 272 unsigned int checksum; 273 zio_checksum_info_t *ci; 274 void *ctx = NULL; 275 zio_cksum_t actual_cksum, expected_cksum, verifier; 276 int byteswap; 277 278 checksum = BP_GET_CHECKSUM(bp); 279 size = BP_GET_PSIZE(bp); 280 281 if (checksum >= ZIO_CHECKSUM_FUNCTIONS) 282 return (EINVAL); 283 ci = &zio_checksum_table[checksum]; 284 if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL) 285 return (EINVAL); 286 287 if (spa != NULL) { 288 zio_checksum_template_init(checksum, __DECONST(spa_t *,spa)); 289 ctx = spa->spa_cksum_tmpls[checksum]; 290 } 291 292 if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { 293 zio_eck_t *eck; 294 295 ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER || 296 checksum == ZIO_CHECKSUM_LABEL); 297 298 eck = (zio_eck_t *)((char *)data + size) - 1; 299 300 if (checksum == ZIO_CHECKSUM_GANG_HEADER) 301 zio_checksum_gang_verifier(&verifier, bp); 302 else if (checksum == ZIO_CHECKSUM_LABEL) 303 zio_checksum_label_verifier(&verifier, 304 DVA_GET_OFFSET(BP_IDENTITY(bp))); 305 else 306 verifier = bp->blk_cksum; 307 308 byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC)); 309 310 if (byteswap) 311 byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); 312 313 expected_cksum = eck->zec_cksum; 314 eck->zec_cksum = verifier; 315 ci->ci_func[byteswap](data, size, ctx, &actual_cksum); 316 eck->zec_cksum = expected_cksum; 317 318 if (byteswap) 319 byteswap_uint64_array(&expected_cksum, 320 sizeof (zio_cksum_t)); 321 } else { 322 byteswap = BP_SHOULD_BYTESWAP(bp); 323 expected_cksum = bp->blk_cksum; 324 ci->ci_func[byteswap](data, size, ctx, &actual_cksum); 325 } 326 327 if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) { 328 /*printf("ZFS: read checksum %s failed\n", ci->ci_name);*/ 329 return (EIO); 330 } 331 332 return (0); 333 } 334 335 static int 336 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize, 337 void *dest, uint64_t destsize) 338 { 339 zio_compress_info_t *ci; 340 341 if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) { 342 printf("ZFS: unsupported compression algorithm %u\n", cpfunc); 343 return (EIO); 344 } 345 346 ci = &zio_compress_table[cpfunc]; 347 if (!ci->ci_decompress) { 348 printf("ZFS: unsupported compression algorithm %s\n", 349 ci->ci_name); 350 return (EIO); 351 } 352 353 return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level)); 354 } 355 356 static uint64_t 357 zap_hash(uint64_t salt, const char *name) 358 { 359 const uint8_t *cp; 360 uint8_t c; 361 uint64_t crc = salt; 362 363 ASSERT(crc != 0); 364 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 365 for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++) 366 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF]; 367 368 /* 369 * Only use 28 bits, since we need 4 bits in the cookie for the 370 * collision differentiator. We MUST use the high bits, since 371 * those are the onces that we first pay attention to when 372 * chosing the bucket. 373 */ 374 crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1); 375 376 return (crc); 377 } 378 379 typedef struct raidz_col { 380 uint64_t rc_devidx; /* child device index for I/O */ 381 uint64_t rc_offset; /* device offset */ 382 uint64_t rc_size; /* I/O size */ 383 void *rc_data; /* I/O data */ 384 int rc_error; /* I/O error for this device */ 385 uint8_t rc_tried; /* Did we attempt this I/O column? */ 386 uint8_t rc_skipped; /* Did we skip this I/O column? */ 387 } raidz_col_t; 388 389 typedef struct raidz_map { 390 uint64_t rm_cols; /* Regular column count */ 391 uint64_t rm_scols; /* Count including skipped columns */ 392 uint64_t rm_bigcols; /* Number of oversized columns */ 393 uint64_t rm_asize; /* Actual total I/O size */ 394 uint64_t rm_missingdata; /* Count of missing data devices */ 395 uint64_t rm_missingparity; /* Count of missing parity devices */ 396 uint64_t rm_firstdatacol; /* First data column/parity count */ 397 uint64_t rm_nskip; /* Skipped sectors for padding */ 398 uint64_t rm_skipstart; /* Column index of padding start */ 399 uintptr_t rm_reports; /* # of referencing checksum reports */ 400 uint8_t rm_freed; /* map no longer has referencing ZIO */ 401 uint8_t rm_ecksuminjected; /* checksum error was injected */ 402 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ 403 } raidz_map_t; 404 405 #define VDEV_RAIDZ_P 0 406 #define VDEV_RAIDZ_Q 1 407 #define VDEV_RAIDZ_R 2 408 409 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) 410 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) 411 412 /* 413 * We provide a mechanism to perform the field multiplication operation on a 414 * 64-bit value all at once rather than a byte at a time. This works by 415 * creating a mask from the top bit in each byte and using that to 416 * conditionally apply the XOR of 0x1d. 417 */ 418 #define VDEV_RAIDZ_64MUL_2(x, mask) \ 419 { \ 420 (mask) = (x) & 0x8080808080808080ULL; \ 421 (mask) = ((mask) << 1) - ((mask) >> 7); \ 422 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ 423 ((mask) & 0x1d1d1d1d1d1d1d1dULL); \ 424 } 425 426 #define VDEV_RAIDZ_64MUL_4(x, mask) \ 427 { \ 428 VDEV_RAIDZ_64MUL_2((x), mask); \ 429 VDEV_RAIDZ_64MUL_2((x), mask); \ 430 } 431 432 /* 433 * These two tables represent powers and logs of 2 in the Galois field defined 434 * above. These values were computed by repeatedly multiplying by 2 as above. 435 */ 436 static const uint8_t vdev_raidz_pow2[256] = { 437 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 438 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, 439 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, 440 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, 441 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, 442 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, 443 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, 444 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, 445 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, 446 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, 447 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, 448 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, 449 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, 450 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, 451 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, 452 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, 453 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, 454 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, 455 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, 456 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, 457 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, 458 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, 459 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, 460 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, 461 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, 462 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, 463 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, 464 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, 465 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, 466 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, 467 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 468 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 469 }; 470 static const uint8_t vdev_raidz_log2[256] = { 471 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 472 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, 473 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, 474 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, 475 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, 476 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, 477 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, 478 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, 479 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, 480 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, 481 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, 482 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, 483 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, 484 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, 485 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, 486 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, 487 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, 488 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, 489 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, 490 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, 491 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, 492 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, 493 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, 494 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, 495 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, 496 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, 497 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, 498 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, 499 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, 500 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, 501 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, 502 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, 503 }; 504 505 /* 506 * Multiply a given number by 2 raised to the given power. 507 */ 508 static uint8_t 509 vdev_raidz_exp2(uint8_t a, int exp) 510 { 511 if (a == 0) 512 return (0); 513 514 ASSERT(exp >= 0); 515 ASSERT(vdev_raidz_log2[a] > 0 || a == 1); 516 517 exp += vdev_raidz_log2[a]; 518 if (exp > 255) 519 exp -= 255; 520 521 return (vdev_raidz_pow2[exp]); 522 } 523 524 static void 525 vdev_raidz_generate_parity_p(raidz_map_t *rm) 526 { 527 uint64_t *p, *src, pcount, ccount, i; 528 int c; 529 530 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 531 532 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 533 src = rm->rm_col[c].rc_data; 534 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 535 ccount = rm->rm_col[c].rc_size / sizeof (src[0]); 536 537 if (c == rm->rm_firstdatacol) { 538 ASSERT(ccount == pcount); 539 for (i = 0; i < ccount; i++, src++, p++) { 540 *p = *src; 541 } 542 } else { 543 ASSERT(ccount <= pcount); 544 for (i = 0; i < ccount; i++, src++, p++) { 545 *p ^= *src; 546 } 547 } 548 } 549 } 550 551 static void 552 vdev_raidz_generate_parity_pq(raidz_map_t *rm) 553 { 554 uint64_t *p, *q, *src, pcnt, ccnt, mask, i; 555 int c; 556 557 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 558 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 559 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 560 561 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 562 src = rm->rm_col[c].rc_data; 563 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 564 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 565 566 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 567 568 if (c == rm->rm_firstdatacol) { 569 ASSERT(ccnt == pcnt || ccnt == 0); 570 for (i = 0; i < ccnt; i++, src++, p++, q++) { 571 *p = *src; 572 *q = *src; 573 } 574 for (; i < pcnt; i++, src++, p++, q++) { 575 *p = 0; 576 *q = 0; 577 } 578 } else { 579 ASSERT(ccnt <= pcnt); 580 581 /* 582 * Apply the algorithm described above by multiplying 583 * the previous result and adding in the new value. 584 */ 585 for (i = 0; i < ccnt; i++, src++, p++, q++) { 586 *p ^= *src; 587 588 VDEV_RAIDZ_64MUL_2(*q, mask); 589 *q ^= *src; 590 } 591 592 /* 593 * Treat short columns as though they are full of 0s. 594 * Note that there's therefore nothing needed for P. 595 */ 596 for (; i < pcnt; i++, q++) { 597 VDEV_RAIDZ_64MUL_2(*q, mask); 598 } 599 } 600 } 601 } 602 603 static void 604 vdev_raidz_generate_parity_pqr(raidz_map_t *rm) 605 { 606 uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; 607 int c; 608 609 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); 610 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 611 rm->rm_col[VDEV_RAIDZ_Q].rc_size); 612 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == 613 rm->rm_col[VDEV_RAIDZ_R].rc_size); 614 615 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 616 src = rm->rm_col[c].rc_data; 617 p = rm->rm_col[VDEV_RAIDZ_P].rc_data; 618 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; 619 r = rm->rm_col[VDEV_RAIDZ_R].rc_data; 620 621 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); 622 623 if (c == rm->rm_firstdatacol) { 624 ASSERT(ccnt == pcnt || ccnt == 0); 625 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 626 *p = *src; 627 *q = *src; 628 *r = *src; 629 } 630 for (; i < pcnt; i++, src++, p++, q++, r++) { 631 *p = 0; 632 *q = 0; 633 *r = 0; 634 } 635 } else { 636 ASSERT(ccnt <= pcnt); 637 638 /* 639 * Apply the algorithm described above by multiplying 640 * the previous result and adding in the new value. 641 */ 642 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { 643 *p ^= *src; 644 645 VDEV_RAIDZ_64MUL_2(*q, mask); 646 *q ^= *src; 647 648 VDEV_RAIDZ_64MUL_4(*r, mask); 649 *r ^= *src; 650 } 651 652 /* 653 * Treat short columns as though they are full of 0s. 654 * Note that there's therefore nothing needed for P. 655 */ 656 for (; i < pcnt; i++, q++, r++) { 657 VDEV_RAIDZ_64MUL_2(*q, mask); 658 VDEV_RAIDZ_64MUL_4(*r, mask); 659 } 660 } 661 } 662 } 663 664 /* 665 * Generate RAID parity in the first virtual columns according to the number of 666 * parity columns available. 667 */ 668 static void 669 vdev_raidz_generate_parity(raidz_map_t *rm) 670 { 671 switch (rm->rm_firstdatacol) { 672 case 1: 673 vdev_raidz_generate_parity_p(rm); 674 break; 675 case 2: 676 vdev_raidz_generate_parity_pq(rm); 677 break; 678 case 3: 679 vdev_raidz_generate_parity_pqr(rm); 680 break; 681 default: 682 panic("invalid RAID-Z configuration"); 683 } 684 } 685 686 /* BEGIN CSTYLED */ 687 /* 688 * In the general case of reconstruction, we must solve the system of linear 689 * equations defined by the coeffecients used to generate parity as well as 690 * the contents of the data and parity disks. This can be expressed with 691 * vectors for the original data (D) and the actual data (d) and parity (p) 692 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): 693 * 694 * __ __ __ __ 695 * | | __ __ | p_0 | 696 * | V | | D_0 | | p_m-1 | 697 * | | x | : | = | d_0 | 698 * | I | | D_n-1 | | : | 699 * | | ~~ ~~ | d_n-1 | 700 * ~~ ~~ ~~ ~~ 701 * 702 * I is simply a square identity matrix of size n, and V is a vandermonde 703 * matrix defined by the coeffecients we chose for the various parity columns 704 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy 705 * computation as well as linear separability. 706 * 707 * __ __ __ __ 708 * | 1 .. 1 1 1 | | p_0 | 709 * | 2^n-1 .. 4 2 1 | __ __ | : | 710 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | 711 * | 1 .. 0 0 0 | | D_1 | | d_0 | 712 * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 713 * | : : : : | | : | | d_2 | 714 * | 0 .. 1 0 0 | | D_n-1 | | : | 715 * | 0 .. 0 1 0 | ~~ ~~ | : | 716 * | 0 .. 0 0 1 | | d_n-1 | 717 * ~~ ~~ ~~ ~~ 718 * 719 * Note that I, V, d, and p are known. To compute D, we must invert the 720 * matrix and use the known data and parity values to reconstruct the unknown 721 * data values. We begin by removing the rows in V|I and d|p that correspond 722 * to failed or missing columns; we then make V|I square (n x n) and d|p 723 * sized n by removing rows corresponding to unused parity from the bottom up 724 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 725 * using Gauss-Jordan elimination. In the example below we use m=3 parity 726 * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 727 * __ __ 728 * | 1 1 1 1 1 1 1 1 | 729 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 730 * | 19 205 116 29 64 16 4 1 | / / 731 * | 1 0 0 0 0 0 0 0 | / / 732 * | 0 1 0 0 0 0 0 0 | <--' / 733 * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 734 * | 0 0 0 1 0 0 0 0 | 735 * | 0 0 0 0 1 0 0 0 | 736 * | 0 0 0 0 0 1 0 0 | 737 * | 0 0 0 0 0 0 1 0 | 738 * | 0 0 0 0 0 0 0 1 | 739 * ~~ ~~ 740 * __ __ 741 * | 1 1 1 1 1 1 1 1 | 742 * | 128 64 32 16 8 4 2 1 | 743 * | 19 205 116 29 64 16 4 1 | 744 * | 1 0 0 0 0 0 0 0 | 745 * | 0 1 0 0 0 0 0 0 | 746 * (V|I)' = | 0 0 1 0 0 0 0 0 | 747 * | 0 0 0 1 0 0 0 0 | 748 * | 0 0 0 0 1 0 0 0 | 749 * | 0 0 0 0 0 1 0 0 | 750 * | 0 0 0 0 0 0 1 0 | 751 * | 0 0 0 0 0 0 0 1 | 752 * ~~ ~~ 753 * 754 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 755 * have carefully chosen the seed values 1, 2, and 4 to ensure that this 756 * matrix is not singular. 757 * __ __ 758 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 759 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 760 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 761 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 762 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 763 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 764 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 765 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 766 * ~~ ~~ 767 * __ __ 768 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 769 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 770 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 771 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 772 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 773 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 774 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 775 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 776 * ~~ ~~ 777 * __ __ 778 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 779 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 780 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 781 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 782 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 783 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 784 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 785 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 786 * ~~ ~~ 787 * __ __ 788 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 789 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 790 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 791 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 792 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 793 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 794 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 795 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 796 * ~~ ~~ 797 * __ __ 798 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 799 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 800 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 801 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 802 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 803 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 804 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 805 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 806 * ~~ ~~ 807 * __ __ 808 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 809 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 810 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 811 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 812 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 813 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 814 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 815 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 816 * ~~ ~~ 817 * __ __ 818 * | 0 0 1 0 0 0 0 0 | 819 * | 167 100 5 41 159 169 217 208 | 820 * | 166 100 4 40 158 168 216 209 | 821 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | 822 * | 0 0 0 0 1 0 0 0 | 823 * | 0 0 0 0 0 1 0 0 | 824 * | 0 0 0 0 0 0 1 0 | 825 * | 0 0 0 0 0 0 0 1 | 826 * ~~ ~~ 827 * 828 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values 829 * of the missing data. 830 * 831 * As is apparent from the example above, the only non-trivial rows in the 832 * inverse matrix correspond to the data disks that we're trying to 833 * reconstruct. Indeed, those are the only rows we need as the others would 834 * only be useful for reconstructing data known or assumed to be valid. For 835 * that reason, we only build the coefficients in the rows that correspond to 836 * targeted columns. 837 */ 838 /* END CSTYLED */ 839 840 static void 841 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, 842 uint8_t **rows) 843 { 844 int i, j; 845 int pow; 846 847 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); 848 849 /* 850 * Fill in the missing rows of interest. 851 */ 852 for (i = 0; i < nmap; i++) { 853 ASSERT3S(0, <=, map[i]); 854 ASSERT3S(map[i], <=, 2); 855 856 pow = map[i] * n; 857 if (pow > 255) 858 pow -= 255; 859 ASSERT(pow <= 255); 860 861 for (j = 0; j < n; j++) { 862 pow -= map[i]; 863 if (pow < 0) 864 pow += 255; 865 rows[i][j] = vdev_raidz_pow2[pow]; 866 } 867 } 868 } 869 870 static void 871 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, 872 uint8_t **rows, uint8_t **invrows, const uint8_t *used) 873 { 874 int i, j, ii, jj; 875 uint8_t log; 876 877 /* 878 * Assert that the first nmissing entries from the array of used 879 * columns correspond to parity columns and that subsequent entries 880 * correspond to data columns. 881 */ 882 for (i = 0; i < nmissing; i++) { 883 ASSERT3S(used[i], <, rm->rm_firstdatacol); 884 } 885 for (; i < n; i++) { 886 ASSERT3S(used[i], >=, rm->rm_firstdatacol); 887 } 888 889 /* 890 * First initialize the storage where we'll compute the inverse rows. 891 */ 892 for (i = 0; i < nmissing; i++) { 893 for (j = 0; j < n; j++) { 894 invrows[i][j] = (i == j) ? 1 : 0; 895 } 896 } 897 898 /* 899 * Subtract all trivial rows from the rows of consequence. 900 */ 901 for (i = 0; i < nmissing; i++) { 902 for (j = nmissing; j < n; j++) { 903 ASSERT3U(used[j], >=, rm->rm_firstdatacol); 904 jj = used[j] - rm->rm_firstdatacol; 905 ASSERT3S(jj, <, n); 906 invrows[i][j] = rows[i][jj]; 907 rows[i][jj] = 0; 908 } 909 } 910 911 /* 912 * For each of the rows of interest, we must normalize it and subtract 913 * a multiple of it from the other rows. 914 */ 915 for (i = 0; i < nmissing; i++) { 916 for (j = 0; j < missing[i]; j++) { 917 ASSERT3U(rows[i][j], ==, 0); 918 } 919 ASSERT3U(rows[i][missing[i]], !=, 0); 920 921 /* 922 * Compute the inverse of the first element and multiply each 923 * element in the row by that value. 924 */ 925 log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; 926 927 for (j = 0; j < n; j++) { 928 rows[i][j] = vdev_raidz_exp2(rows[i][j], log); 929 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); 930 } 931 932 for (ii = 0; ii < nmissing; ii++) { 933 if (i == ii) 934 continue; 935 936 ASSERT3U(rows[ii][missing[i]], !=, 0); 937 938 log = vdev_raidz_log2[rows[ii][missing[i]]]; 939 940 for (j = 0; j < n; j++) { 941 rows[ii][j] ^= 942 vdev_raidz_exp2(rows[i][j], log); 943 invrows[ii][j] ^= 944 vdev_raidz_exp2(invrows[i][j], log); 945 } 946 } 947 } 948 949 /* 950 * Verify that the data that is left in the rows are properly part of 951 * an identity matrix. 952 */ 953 for (i = 0; i < nmissing; i++) { 954 for (j = 0; j < n; j++) { 955 if (j == missing[i]) { 956 ASSERT3U(rows[i][j], ==, 1); 957 } else { 958 ASSERT3U(rows[i][j], ==, 0); 959 } 960 } 961 } 962 } 963 964 static void 965 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, 966 int *missing, uint8_t **invrows, const uint8_t *used) 967 { 968 int i, j, x, cc, c; 969 uint8_t *src; 970 uint64_t ccount; 971 uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; 972 uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; 973 uint8_t log, val; 974 int ll; 975 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; 976 uint8_t *p, *pp; 977 size_t psize; 978 979 log = 0; /* gcc */ 980 psize = sizeof (invlog[0][0]) * n * nmissing; 981 p = malloc(psize); 982 if (p == NULL) { 983 printf("Out of memory\n"); 984 return; 985 } 986 987 for (pp = p, i = 0; i < nmissing; i++) { 988 invlog[i] = pp; 989 pp += n; 990 } 991 992 for (i = 0; i < nmissing; i++) { 993 for (j = 0; j < n; j++) { 994 ASSERT3U(invrows[i][j], !=, 0); 995 invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; 996 } 997 } 998 999 for (i = 0; i < n; i++) { 1000 c = used[i]; 1001 ASSERT3U(c, <, rm->rm_cols); 1002 1003 src = rm->rm_col[c].rc_data; 1004 ccount = rm->rm_col[c].rc_size; 1005 for (j = 0; j < nmissing; j++) { 1006 cc = missing[j] + rm->rm_firstdatacol; 1007 ASSERT3U(cc, >=, rm->rm_firstdatacol); 1008 ASSERT3U(cc, <, rm->rm_cols); 1009 ASSERT3U(cc, !=, c); 1010 1011 dst[j] = rm->rm_col[cc].rc_data; 1012 dcount[j] = rm->rm_col[cc].rc_size; 1013 } 1014 1015 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); 1016 1017 for (x = 0; x < ccount; x++, src++) { 1018 if (*src != 0) 1019 log = vdev_raidz_log2[*src]; 1020 1021 for (cc = 0; cc < nmissing; cc++) { 1022 if (x >= dcount[cc]) 1023 continue; 1024 1025 if (*src == 0) { 1026 val = 0; 1027 } else { 1028 if ((ll = log + invlog[cc][i]) >= 255) 1029 ll -= 255; 1030 val = vdev_raidz_pow2[ll]; 1031 } 1032 1033 if (i == 0) 1034 dst[cc][x] = val; 1035 else 1036 dst[cc][x] ^= val; 1037 } 1038 } 1039 } 1040 1041 free(p); 1042 } 1043 1044 static int 1045 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) 1046 { 1047 int n, i, c, t, tt; 1048 int nmissing_rows; 1049 int missing_rows[VDEV_RAIDZ_MAXPARITY]; 1050 int parity_map[VDEV_RAIDZ_MAXPARITY]; 1051 1052 uint8_t *p, *pp; 1053 size_t psize; 1054 1055 uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; 1056 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; 1057 uint8_t *used; 1058 1059 int code = 0; 1060 1061 1062 n = rm->rm_cols - rm->rm_firstdatacol; 1063 1064 /* 1065 * Figure out which data columns are missing. 1066 */ 1067 nmissing_rows = 0; 1068 for (t = 0; t < ntgts; t++) { 1069 if (tgts[t] >= rm->rm_firstdatacol) { 1070 missing_rows[nmissing_rows++] = 1071 tgts[t] - rm->rm_firstdatacol; 1072 } 1073 } 1074 1075 /* 1076 * Figure out which parity columns to use to help generate the missing 1077 * data columns. 1078 */ 1079 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { 1080 ASSERT(tt < ntgts); 1081 ASSERT(c < rm->rm_firstdatacol); 1082 1083 /* 1084 * Skip any targeted parity columns. 1085 */ 1086 if (c == tgts[tt]) { 1087 tt++; 1088 continue; 1089 } 1090 1091 code |= 1 << c; 1092 1093 parity_map[i] = c; 1094 i++; 1095 } 1096 1097 ASSERT(code != 0); 1098 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); 1099 1100 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1101 nmissing_rows * n + sizeof (used[0]) * n; 1102 p = malloc(psize); 1103 if (p == NULL) { 1104 printf("Out of memory\n"); 1105 return (code); 1106 } 1107 1108 for (pp = p, i = 0; i < nmissing_rows; i++) { 1109 rows[i] = pp; 1110 pp += n; 1111 invrows[i] = pp; 1112 pp += n; 1113 } 1114 used = pp; 1115 1116 for (i = 0; i < nmissing_rows; i++) { 1117 used[i] = parity_map[i]; 1118 } 1119 1120 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1121 if (tt < nmissing_rows && 1122 c == missing_rows[tt] + rm->rm_firstdatacol) { 1123 tt++; 1124 continue; 1125 } 1126 1127 ASSERT3S(i, <, n); 1128 used[i] = c; 1129 i++; 1130 } 1131 1132 /* 1133 * Initialize the interesting rows of the matrix. 1134 */ 1135 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); 1136 1137 /* 1138 * Invert the matrix. 1139 */ 1140 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, 1141 invrows, used); 1142 1143 /* 1144 * Reconstruct the missing data using the generated matrix. 1145 */ 1146 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, 1147 invrows, used); 1148 1149 free(p); 1150 1151 return (code); 1152 } 1153 1154 static int 1155 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) 1156 { 1157 int tgts[VDEV_RAIDZ_MAXPARITY]; 1158 int ntgts; 1159 int i, c; 1160 int code; 1161 int nbadparity, nbaddata; 1162 1163 /* 1164 * The tgts list must already be sorted. 1165 */ 1166 for (i = 1; i < nt; i++) { 1167 ASSERT(t[i] > t[i - 1]); 1168 } 1169 1170 nbadparity = rm->rm_firstdatacol; 1171 nbaddata = rm->rm_cols - nbadparity; 1172 ntgts = 0; 1173 for (i = 0, c = 0; c < rm->rm_cols; c++) { 1174 if (i < nt && c == t[i]) { 1175 tgts[ntgts++] = c; 1176 i++; 1177 } else if (rm->rm_col[c].rc_error != 0) { 1178 tgts[ntgts++] = c; 1179 } else if (c >= rm->rm_firstdatacol) { 1180 nbaddata--; 1181 } else { 1182 nbadparity--; 1183 } 1184 } 1185 1186 ASSERT(ntgts >= nt); 1187 ASSERT(nbaddata >= 0); 1188 ASSERT(nbaddata + nbadparity == ntgts); 1189 1190 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); 1191 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); 1192 ASSERT(code > 0); 1193 return (code); 1194 } 1195 1196 static raidz_map_t * 1197 vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift, 1198 uint64_t dcols, uint64_t nparity) 1199 { 1200 raidz_map_t *rm; 1201 uint64_t b = offset >> unit_shift; 1202 uint64_t s = size >> unit_shift; 1203 uint64_t f = b % dcols; 1204 uint64_t o = (b / dcols) << unit_shift; 1205 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; 1206 1207 q = s / (dcols - nparity); 1208 r = s - q * (dcols - nparity); 1209 bc = (r == 0 ? 0 : r + nparity); 1210 tot = s + nparity * (q + (r == 0 ? 0 : 1)); 1211 1212 if (q == 0) { 1213 acols = bc; 1214 scols = MIN(dcols, roundup(bc, nparity + 1)); 1215 } else { 1216 acols = dcols; 1217 scols = dcols; 1218 } 1219 1220 ASSERT3U(acols, <=, scols); 1221 1222 rm = malloc(offsetof(raidz_map_t, rm_col[scols])); 1223 if (rm == NULL) 1224 return (rm); 1225 1226 rm->rm_cols = acols; 1227 rm->rm_scols = scols; 1228 rm->rm_bigcols = bc; 1229 rm->rm_skipstart = bc; 1230 rm->rm_missingdata = 0; 1231 rm->rm_missingparity = 0; 1232 rm->rm_firstdatacol = nparity; 1233 rm->rm_reports = 0; 1234 rm->rm_freed = 0; 1235 rm->rm_ecksuminjected = 0; 1236 1237 asize = 0; 1238 1239 for (c = 0; c < scols; c++) { 1240 col = f + c; 1241 coff = o; 1242 if (col >= dcols) { 1243 col -= dcols; 1244 coff += 1ULL << unit_shift; 1245 } 1246 rm->rm_col[c].rc_devidx = col; 1247 rm->rm_col[c].rc_offset = coff; 1248 rm->rm_col[c].rc_data = NULL; 1249 rm->rm_col[c].rc_error = 0; 1250 rm->rm_col[c].rc_tried = 0; 1251 rm->rm_col[c].rc_skipped = 0; 1252 1253 if (c >= acols) 1254 rm->rm_col[c].rc_size = 0; 1255 else if (c < bc) 1256 rm->rm_col[c].rc_size = (q + 1) << unit_shift; 1257 else 1258 rm->rm_col[c].rc_size = q << unit_shift; 1259 1260 asize += rm->rm_col[c].rc_size; 1261 } 1262 1263 ASSERT3U(asize, ==, tot << unit_shift); 1264 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); 1265 rm->rm_nskip = roundup(tot, nparity + 1) - tot; 1266 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); 1267 ASSERT3U(rm->rm_nskip, <=, nparity); 1268 1269 for (c = 0; c < rm->rm_firstdatacol; c++) { 1270 rm->rm_col[c].rc_data = malloc(rm->rm_col[c].rc_size); 1271 if (rm->rm_col[c].rc_data == NULL) { 1272 c++; 1273 while (c != 0) 1274 free(rm->rm_col[--c].rc_data); 1275 free(rm); 1276 return (NULL); 1277 } 1278 } 1279 1280 rm->rm_col[c].rc_data = data; 1281 1282 for (c = c + 1; c < acols; c++) 1283 rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + 1284 rm->rm_col[c - 1].rc_size; 1285 1286 /* 1287 * If all data stored spans all columns, there's a danger that parity 1288 * will always be on the same device and, since parity isn't read 1289 * during normal operation, that that device's I/O bandwidth won't be 1290 * used effectively. We therefore switch the parity every 1MB. 1291 * 1292 * ... at least that was, ostensibly, the theory. As a practical 1293 * matter unless we juggle the parity between all devices evenly, we 1294 * won't see any benefit. Further, occasional writes that aren't a 1295 * multiple of the LCM of the number of children and the minimum 1296 * stripe width are sufficient to avoid pessimal behavior. 1297 * Unfortunately, this decision created an implicit on-disk format 1298 * requirement that we need to support for all eternity, but only 1299 * for single-parity RAID-Z. 1300 * 1301 * If we intend to skip a sector in the zeroth column for padding 1302 * we must make sure to note this swap. We will never intend to 1303 * skip the first column since at least one data and one parity 1304 * column must appear in each row. 1305 */ 1306 ASSERT(rm->rm_cols >= 2); 1307 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 1308 1309 if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) { 1310 devidx = rm->rm_col[0].rc_devidx; 1311 o = rm->rm_col[0].rc_offset; 1312 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; 1313 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 1314 rm->rm_col[1].rc_devidx = devidx; 1315 rm->rm_col[1].rc_offset = o; 1316 1317 if (rm->rm_skipstart == 0) 1318 rm->rm_skipstart = 1; 1319 } 1320 1321 return (rm); 1322 } 1323 1324 static void 1325 vdev_raidz_map_free(raidz_map_t *rm) 1326 { 1327 int c; 1328 1329 for (c = rm->rm_firstdatacol - 1; c >= 0; c--) 1330 free(rm->rm_col[c].rc_data); 1331 1332 free(rm); 1333 } 1334 1335 static vdev_t * 1336 vdev_child(vdev_t *pvd, uint64_t devidx) 1337 { 1338 vdev_t *cvd; 1339 1340 STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) { 1341 if (cvd->v_id == devidx) 1342 break; 1343 } 1344 1345 return (cvd); 1346 } 1347 1348 /* 1349 * We keep track of whether or not there were any injected errors, so that 1350 * any ereports we generate can note it. 1351 */ 1352 static int 1353 raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data, 1354 uint64_t size) 1355 { 1356 return (zio_checksum_verify(spa, bp, data)); 1357 } 1358 1359 /* 1360 * Generate the parity from the data columns. If we tried and were able to 1361 * read the parity without error, verify that the generated parity matches the 1362 * data we read. If it doesn't, we fire off a checksum error. Return the 1363 * number such failures. 1364 */ 1365 static int 1366 raidz_parity_verify(raidz_map_t *rm) 1367 { 1368 void *orig[VDEV_RAIDZ_MAXPARITY]; 1369 int c, ret = 0; 1370 raidz_col_t *rc; 1371 1372 for (c = 0; c < rm->rm_firstdatacol; c++) { 1373 rc = &rm->rm_col[c]; 1374 if (!rc->rc_tried || rc->rc_error != 0) 1375 continue; 1376 orig[c] = malloc(rc->rc_size); 1377 if (orig[c] != NULL) { 1378 bcopy(rc->rc_data, orig[c], rc->rc_size); 1379 } else { 1380 printf("Out of memory\n"); 1381 } 1382 } 1383 1384 vdev_raidz_generate_parity(rm); 1385 1386 for (c = rm->rm_firstdatacol - 1; c >= 0; c--) { 1387 rc = &rm->rm_col[c]; 1388 if (!rc->rc_tried || rc->rc_error != 0) 1389 continue; 1390 if (orig[c] == NULL || 1391 bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { 1392 rc->rc_error = ECKSUM; 1393 ret++; 1394 } 1395 free(orig[c]); 1396 } 1397 1398 return (ret); 1399 } 1400 1401 /* 1402 * Iterate over all combinations of bad data and attempt a reconstruction. 1403 * Note that the algorithm below is non-optimal because it doesn't take into 1404 * account how reconstruction is actually performed. For example, with 1405 * triple-parity RAID-Z the reconstruction procedure is the same if column 4 1406 * is targeted as invalid as if columns 1 and 4 are targeted since in both 1407 * cases we'd only use parity information in column 0. 1408 */ 1409 static int 1410 vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp, 1411 void *data, off_t offset, uint64_t bytes, int total_errors, int data_errors) 1412 { 1413 raidz_col_t *rc; 1414 void *orig[VDEV_RAIDZ_MAXPARITY]; 1415 int tstore[VDEV_RAIDZ_MAXPARITY + 2]; 1416 int *tgts = &tstore[1]; 1417 int current, next, i, c, n; 1418 int code, ret = 0; 1419 1420 ASSERT(total_errors < rm->rm_firstdatacol); 1421 1422 /* 1423 * This simplifies one edge condition. 1424 */ 1425 tgts[-1] = -1; 1426 1427 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { 1428 /* 1429 * Initialize the targets array by finding the first n columns 1430 * that contain no error. 1431 * 1432 * If there were no data errors, we need to ensure that we're 1433 * always explicitly attempting to reconstruct at least one 1434 * data column. To do this, we simply push the highest target 1435 * up into the data columns. 1436 */ 1437 for (c = 0, i = 0; i < n; i++) { 1438 if (i == n - 1 && data_errors == 0 && 1439 c < rm->rm_firstdatacol) { 1440 c = rm->rm_firstdatacol; 1441 } 1442 1443 while (rm->rm_col[c].rc_error != 0) { 1444 c++; 1445 ASSERT3S(c, <, rm->rm_cols); 1446 } 1447 1448 tgts[i] = c++; 1449 } 1450 1451 /* 1452 * Setting tgts[n] simplifies the other edge condition. 1453 */ 1454 tgts[n] = rm->rm_cols; 1455 1456 /* 1457 * These buffers were allocated in previous iterations. 1458 */ 1459 for (i = 0; i < n - 1; i++) { 1460 ASSERT(orig[i] != NULL); 1461 } 1462 1463 orig[n - 1] = malloc(rm->rm_col[0].rc_size); 1464 if (orig[n - 1] == NULL) { 1465 ret = ENOMEM; 1466 goto done; 1467 } 1468 1469 current = 0; 1470 next = tgts[current]; 1471 1472 while (current != n) { 1473 tgts[current] = next; 1474 current = 0; 1475 1476 /* 1477 * Save off the original data that we're going to 1478 * attempt to reconstruct. 1479 */ 1480 for (i = 0; i < n; i++) { 1481 ASSERT(orig[i] != NULL); 1482 c = tgts[i]; 1483 ASSERT3S(c, >=, 0); 1484 ASSERT3S(c, <, rm->rm_cols); 1485 rc = &rm->rm_col[c]; 1486 bcopy(rc->rc_data, orig[i], rc->rc_size); 1487 } 1488 1489 /* 1490 * Attempt a reconstruction and exit the outer loop on 1491 * success. 1492 */ 1493 code = vdev_raidz_reconstruct(rm, tgts, n); 1494 if (raidz_checksum_verify(spa, bp, data, bytes) == 0) { 1495 for (i = 0; i < n; i++) { 1496 c = tgts[i]; 1497 rc = &rm->rm_col[c]; 1498 ASSERT(rc->rc_error == 0); 1499 rc->rc_error = ECKSUM; 1500 } 1501 1502 ret = code; 1503 goto done; 1504 } 1505 1506 /* 1507 * Restore the original data. 1508 */ 1509 for (i = 0; i < n; i++) { 1510 c = tgts[i]; 1511 rc = &rm->rm_col[c]; 1512 bcopy(orig[i], rc->rc_data, rc->rc_size); 1513 } 1514 1515 do { 1516 /* 1517 * Find the next valid column after the current 1518 * position.. 1519 */ 1520 for (next = tgts[current] + 1; 1521 next < rm->rm_cols && 1522 rm->rm_col[next].rc_error != 0; next++) 1523 continue; 1524 1525 ASSERT(next <= tgts[current + 1]); 1526 1527 /* 1528 * If that spot is available, we're done here. 1529 */ 1530 if (next != tgts[current + 1]) 1531 break; 1532 1533 /* 1534 * Otherwise, find the next valid column after 1535 * the previous position. 1536 */ 1537 for (c = tgts[current - 1] + 1; 1538 rm->rm_col[c].rc_error != 0; c++) 1539 continue; 1540 1541 tgts[current] = c; 1542 current++; 1543 1544 } while (current != n); 1545 } 1546 } 1547 n--; 1548 done: 1549 for (i = n - 1; i >= 0; i--) { 1550 free(orig[i]); 1551 } 1552 1553 return (ret); 1554 } 1555 1556 static int 1557 vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data, 1558 off_t offset, size_t bytes) 1559 { 1560 vdev_t *tvd = vd->v_top; 1561 vdev_t *cvd; 1562 raidz_map_t *rm; 1563 raidz_col_t *rc; 1564 int c, error; 1565 int unexpected_errors; 1566 int parity_errors; 1567 int parity_untried; 1568 int data_errors; 1569 int total_errors; 1570 int n; 1571 int tgts[VDEV_RAIDZ_MAXPARITY]; 1572 int code; 1573 1574 rc = NULL; /* gcc */ 1575 error = 0; 1576 1577 rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift, 1578 vd->v_nchildren, vd->v_nparity); 1579 if (rm == NULL) 1580 return (ENOMEM); 1581 1582 /* 1583 * Iterate over the columns in reverse order so that we hit the parity 1584 * last -- any errors along the way will force us to read the parity. 1585 */ 1586 for (c = rm->rm_cols - 1; c >= 0; c--) { 1587 rc = &rm->rm_col[c]; 1588 cvd = vdev_child(vd, rc->rc_devidx); 1589 if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) { 1590 if (c >= rm->rm_firstdatacol) 1591 rm->rm_missingdata++; 1592 else 1593 rm->rm_missingparity++; 1594 rc->rc_error = ENXIO; 1595 rc->rc_tried = 1; /* don't even try */ 1596 rc->rc_skipped = 1; 1597 continue; 1598 } 1599 #if 0 /* XXX: Too hard for the boot code. */ 1600 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { 1601 if (c >= rm->rm_firstdatacol) 1602 rm->rm_missingdata++; 1603 else 1604 rm->rm_missingparity++; 1605 rc->rc_error = ESTALE; 1606 rc->rc_skipped = 1; 1607 continue; 1608 } 1609 #endif 1610 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) { 1611 rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data, 1612 rc->rc_offset, rc->rc_size); 1613 rc->rc_tried = 1; 1614 rc->rc_skipped = 0; 1615 } 1616 } 1617 1618 reconstruct: 1619 unexpected_errors = 0; 1620 parity_errors = 0; 1621 parity_untried = 0; 1622 data_errors = 0; 1623 total_errors = 0; 1624 1625 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); 1626 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); 1627 1628 for (c = 0; c < rm->rm_cols; c++) { 1629 rc = &rm->rm_col[c]; 1630 1631 if (rc->rc_error) { 1632 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ 1633 1634 if (c < rm->rm_firstdatacol) 1635 parity_errors++; 1636 else 1637 data_errors++; 1638 1639 if (!rc->rc_skipped) 1640 unexpected_errors++; 1641 1642 total_errors++; 1643 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { 1644 parity_untried++; 1645 } 1646 } 1647 1648 /* 1649 * There are three potential phases for a read: 1650 * 1. produce valid data from the columns read 1651 * 2. read all disks and try again 1652 * 3. perform combinatorial reconstruction 1653 * 1654 * Each phase is progressively both more expensive and less likely to 1655 * occur. If we encounter more errors than we can repair or all phases 1656 * fail, we have no choice but to return an error. 1657 */ 1658 1659 /* 1660 * If the number of errors we saw was correctable -- less than or equal 1661 * to the number of parity disks read -- attempt to produce data that 1662 * has a valid checksum. Naturally, this case applies in the absence of 1663 * any errors. 1664 */ 1665 if (total_errors <= rm->rm_firstdatacol - parity_untried) { 1666 int rv; 1667 1668 if (data_errors == 0) { 1669 rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes); 1670 if (rv == 0) { 1671 /* 1672 * If we read parity information (unnecessarily 1673 * as it happens since no reconstruction was 1674 * needed) regenerate and verify the parity. 1675 * We also regenerate parity when resilvering 1676 * so we can write it out to the failed device 1677 * later. 1678 */ 1679 if (parity_errors + parity_untried < 1680 rm->rm_firstdatacol) { 1681 n = raidz_parity_verify(rm); 1682 unexpected_errors += n; 1683 ASSERT(parity_errors + n <= 1684 rm->rm_firstdatacol); 1685 } 1686 goto done; 1687 } 1688 } else { 1689 /* 1690 * We either attempt to read all the parity columns or 1691 * none of them. If we didn't try to read parity, we 1692 * wouldn't be here in the correctable case. There must 1693 * also have been fewer parity errors than parity 1694 * columns or, again, we wouldn't be in this code path. 1695 */ 1696 ASSERT(parity_untried == 0); 1697 ASSERT(parity_errors < rm->rm_firstdatacol); 1698 1699 /* 1700 * Identify the data columns that reported an error. 1701 */ 1702 n = 0; 1703 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1704 rc = &rm->rm_col[c]; 1705 if (rc->rc_error != 0) { 1706 ASSERT(n < VDEV_RAIDZ_MAXPARITY); 1707 tgts[n++] = c; 1708 } 1709 } 1710 1711 ASSERT(rm->rm_firstdatacol >= n); 1712 1713 code = vdev_raidz_reconstruct(rm, tgts, n); 1714 1715 rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes); 1716 if (rv == 0) { 1717 /* 1718 * If we read more parity disks than were used 1719 * for reconstruction, confirm that the other 1720 * parity disks produced correct data. This 1721 * routine is suboptimal in that it regenerates 1722 * the parity that we already used in addition 1723 * to the parity that we're attempting to 1724 * verify, but this should be a relatively 1725 * uncommon case, and can be optimized if it 1726 * becomes a problem. Note that we regenerate 1727 * parity when resilvering so we can write it 1728 * out to failed devices later. 1729 */ 1730 if (parity_errors < rm->rm_firstdatacol - n) { 1731 n = raidz_parity_verify(rm); 1732 unexpected_errors += n; 1733 ASSERT(parity_errors + n <= 1734 rm->rm_firstdatacol); 1735 } 1736 1737 goto done; 1738 } 1739 } 1740 } 1741 1742 /* 1743 * This isn't a typical situation -- either we got a read 1744 * error or a child silently returned bad data. Read every 1745 * block so we can try again with as much data and parity as 1746 * we can track down. If we've already been through once 1747 * before, all children will be marked as tried so we'll 1748 * proceed to combinatorial reconstruction. 1749 */ 1750 unexpected_errors = 1; 1751 rm->rm_missingdata = 0; 1752 rm->rm_missingparity = 0; 1753 1754 n = 0; 1755 for (c = 0; c < rm->rm_cols; c++) { 1756 rc = &rm->rm_col[c]; 1757 1758 if (rc->rc_tried) 1759 continue; 1760 1761 cvd = vdev_child(vd, rc->rc_devidx); 1762 ASSERT(cvd != NULL); 1763 rc->rc_error = cvd->v_read(cvd, NULL, 1764 rc->rc_data, rc->rc_offset, rc->rc_size); 1765 if (rc->rc_error == 0) 1766 n++; 1767 rc->rc_tried = 1; 1768 rc->rc_skipped = 0; 1769 } 1770 /* 1771 * If we managed to read anything more, retry the 1772 * reconstruction. 1773 */ 1774 if (n > 0) 1775 goto reconstruct; 1776 1777 /* 1778 * At this point we've attempted to reconstruct the data given the 1779 * errors we detected, and we've attempted to read all columns. There 1780 * must, therefore, be one or more additional problems -- silent errors 1781 * resulting in invalid data rather than explicit I/O errors resulting 1782 * in absent data. We check if there is enough additional data to 1783 * possibly reconstruct the data and then perform combinatorial 1784 * reconstruction over all possible combinations. If that fails, 1785 * we're cooked. 1786 */ 1787 if (total_errors > rm->rm_firstdatacol) { 1788 error = EIO; 1789 } else if (total_errors < rm->rm_firstdatacol && 1790 (code = vdev_raidz_combrec(vd->v_spa, rm, bp, data, offset, bytes, 1791 total_errors, data_errors)) != 0) { 1792 /* 1793 * If we didn't use all the available parity for the 1794 * combinatorial reconstruction, verify that the remaining 1795 * parity is correct. 1796 */ 1797 if (code != (1 << rm->rm_firstdatacol) - 1) 1798 (void) raidz_parity_verify(rm); 1799 } else { 1800 /* 1801 * We're here because either: 1802 * 1803 * total_errors == rm_first_datacol, or 1804 * vdev_raidz_combrec() failed 1805 * 1806 * In either case, there is enough bad data to prevent 1807 * reconstruction. 1808 * 1809 * Start checksum ereports for all children which haven't 1810 * failed, and the IO wasn't speculative. 1811 */ 1812 error = ECKSUM; 1813 } 1814 1815 done: 1816 vdev_raidz_map_free(rm); 1817 1818 return (error); 1819 } 1820