/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <lz4.h>

static uint64_t zfs_crc64_table[256];

#define	ASSERT3S(x, y, z)	((void)0)
#define	ASSERT3U(x, y, z)	((void)0)
#define	ASSERT3P(x, y, z)	((void)0)
#define	ASSERT0(x)		((void)0)
#define	ASSERT(x)		((void)0)

#define	panic(...)	do {						\
	printf(__VA_ARGS__);						\
	for (;;) ;							\
} while (0)

static void
zfs_init_crc(void)
{
	int i, j;
	uint64_t *ct;

	/*
	 * Calculate the crc64 table (used for the zap hash
	 * function).
	 */
	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
		for (i = 0; i < 256; i++)
			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
	}
}
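/*
 * Illustration of how the table above is consumed: one step of the
 * byte-at-a-time CRC fold that zap_hash() below performs over a name.
 * This helper is illustrative only and is not called by the loader.
 */
static uint64_t __unused
zfs_crc64_step(uint64_t crc, uint8_t c)
{

	return ((crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF]);
}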
static void
zio_checksum_off(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}

/*
 * Signature for checksum functions.
 */
typedef void zio_checksum_t(const void *data, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp);
typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
typedef void zio_checksum_tmpl_free_t(void *ctx_template);

typedef enum zio_checksum_flags {
	/* Strong enough for metadata? */
	ZCHECKSUM_FLAG_METADATA = (1 << 1),
	/* ZIO embedded checksum */
	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
	/* Strong enough for dedup (without verification)? */
	ZCHECKSUM_FLAG_DEDUP = (1 << 3),
	/* Uses salt value */
	ZCHECKSUM_FLAG_SALTED = (1 << 4),
	/* Strong enough for nopwrite? */
	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
} zio_checksum_flags_t;

/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
	/* checksum function for each byteorder */
	zio_checksum_t			*ci_func[2];
	zio_checksum_tmpl_init_t	*ci_tmpl_init;
	zio_checksum_tmpl_free_t	*ci_tmpl_free;
	zio_checksum_flags_t		ci_flags;
	const char			*ci_name;	/* descriptive name */
} zio_checksum_info_t;

#include "blkptr.c"

#include "fletcher.c"
#include "sha256.c"
#include "skein_zfs.c"

#ifdef HAS_ZSTD_ZFS
extern int zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int n);
#endif

static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
	{{NULL, NULL}, NULL, NULL, 0, "on"},
	{{zio_checksum_off, zio_checksum_off}, NULL, NULL, 0, "off"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
	{{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
	{{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
	    0, "fletcher2"},
	{{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA, "fletcher4"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
	{{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
	{{zio_checksum_off, zio_checksum_off}, NULL, NULL,
	    0, "noparity"},
	{{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap},
	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
	{{zio_checksum_skein_native, zio_checksum_skein_byteswap},
	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
	/* no edonr for now */
	{{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"}
};

/*
 * Common signature for all zio compress/decompress functions.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
	zio_compress_func_t	*ci_compress;	/* compression function */
	zio_decompress_func_t	*ci_decompress;	/* decompression function */
	int			ci_level;	/* level parameter */
	const char		*ci_name;	/* algorithm name */
} zio_compress_info_t;

#include "lzjb.c"
#include "zle.c"
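/*
 * A hypothetical decompressor illustrating the zio_decompress_func_t
 * contract assumed by zio_decompress_data() below: consume s_len bytes
 * from src, write the decompressed result into the d_len-byte dst, and
 * return 0 on success or an errno value on failure. The final argument
 * carries ci_level from the table below; most algorithms ignore it.
 * Illustrative only; nothing registers this function.
 */
static int __unused
zio_copy_decompress(void *src, void *dst, size_t s_len, size_t d_len,
    int level __unused)
{

	if (s_len > d_len)
		return (EIO);
	memcpy(dst, src, s_len);
	return (0);
}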
/*
 * Compression vectors.
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
	{NULL, NULL, 0, "inherit"},
	{NULL, NULL, 0, "on"},
	{NULL, NULL, 0, "uncompressed"},
	{NULL, lzjb_decompress, 0, "lzjb"},
	{NULL, NULL, 0, "empty"},
	{NULL, NULL, 1, "gzip-1"},
	{NULL, NULL, 2, "gzip-2"},
	{NULL, NULL, 3, "gzip-3"},
	{NULL, NULL, 4, "gzip-4"},
	{NULL, NULL, 5, "gzip-5"},
	{NULL, NULL, 6, "gzip-6"},
	{NULL, NULL, 7, "gzip-7"},
	{NULL, NULL, 8, "gzip-8"},
	{NULL, NULL, 9, "gzip-9"},
	{NULL, zle_decompress, 64, "zle"},
	{NULL, lz4_decompress, 0, "lz4"},
#ifdef HAS_ZSTD_ZFS
	{NULL, zfs_zstd_decompress, ZIO_ZSTD_LEVEL_DEFAULT, "zstd"}
#endif
};

static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	int i;

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}

/*
 * Set the external verifier for a gang block based on <vdev, offset, txg>,
 * a tuple which is guaranteed to be unique for the life of the pool.
 */
static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
{
	const dva_t *dva = BP_IDENTITY(bp);
	uint64_t txg = BP_PHYSICAL_BIRTH(bp);

	ASSERT(BP_IS_GANG(bp));

	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
}

/*
 * Set the external verifier for a label block based on its offset.
 * The vdev is implicit, and the txg is unknowable at pool open time --
 * hence the logic in vdev_uberblock_load() to find the most recent copy.
 */
static void
zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
{
	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
}

/*
 * Calls the template init function of a checksum which supports context
 * templates and installs the template into the spa_t.
 */
static void
zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
{
	zio_checksum_info_t *ci = &zio_checksum_table[checksum];

	if (ci->ci_tmpl_init == NULL)
		return;

	if (spa->spa_cksum_tmpls[checksum] != NULL)
		return;

	spa->spa_cksum_tmpls[checksum] =
	    ci->ci_tmpl_init(&spa->spa_cksum_salt);
}
/*
 * Called by a spa_t that's about to be deallocated. This steps through
 * all of the checksum context templates and deallocates any that were
 * initialized using the algorithm-specific template init function.
 */
static void __unused
zio_checksum_templates_free(spa_t *spa)
{
	for (enum zio_checksum checksum = 0;
	    checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
		if (spa->spa_cksum_tmpls[checksum] != NULL) {
			zio_checksum_info_t *ci = &zio_checksum_table[checksum];

			ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
			spa->spa_cksum_tmpls[checksum] = NULL;
		}
	}
}

static int
zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
{
	uint64_t size;
	unsigned int checksum;
	zio_checksum_info_t *ci;
	void *ctx = NULL;
	zio_cksum_t actual_cksum, expected_cksum, verifier;
	int byteswap;

	checksum = BP_GET_CHECKSUM(bp);
	size = BP_GET_PSIZE(bp);

	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
		return (EINVAL);
	ci = &zio_checksum_table[checksum];
	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
		return (EINVAL);

	if (spa != NULL) {
		zio_checksum_template_init(checksum, __DECONST(spa_t *, spa));
		ctx = spa->spa_cksum_tmpls[checksum];
	}

	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
		zio_eck_t *eck;

		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
		    checksum == ZIO_CHECKSUM_LABEL);

		eck = (zio_eck_t *)((char *)data + size) - 1;

		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
			zio_checksum_gang_verifier(&verifier, bp);
		else if (checksum == ZIO_CHECKSUM_LABEL)
			zio_checksum_label_verifier(&verifier,
			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
		else
			verifier = bp->blk_cksum;

		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

		if (byteswap)
			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

		expected_cksum = eck->zec_cksum;
		eck->zec_cksum = verifier;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
		eck->zec_cksum = expected_cksum;

		if (byteswap)
			byteswap_uint64_array(&expected_cksum,
			    sizeof (zio_cksum_t));
	} else {
		byteswap = BP_SHOULD_BYTESWAP(bp);
		expected_cksum = bp->blk_cksum;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
	}

	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
		/*printf("ZFS: read checksum %s failed\n", ci->ci_name);*/
		return (EIO);
	}

	return (0);
}

static int
zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
    void *dest, uint64_t destsize)
{
	zio_compress_info_t *ci;

	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
		return (EIO);
	}

	ci = &zio_compress_table[cpfunc];
	if (!ci->ci_decompress) {
		printf("ZFS: unsupported compression algorithm %s\n",
		    ci->ci_name);
		return (EIO);
	}

	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
}

static uint64_t
zap_hash(uint64_t salt, const char *name)
{
	const uint8_t *cp;
	uint8_t c;
	uint64_t crc = salt;

	ASSERT(crc != 0);
	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];

	/*
	 * Only use 28 bits, since we need 4 bits in the cookie for the
	 * collision differentiator. We MUST use the high bits, since
	 * those are the ones that we first pay attention to when
	 * choosing the bucket.
	 */
	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);

	return (crc);
}
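/*
 * Example: with ZAP_HASHBITS == 28, the mask above is ~((1ULL << 36) - 1),
 * i.e. 0xFFFFFFF000000000, so exactly the top 28 bits of the CRC survive.
 * A consumer deriving a bucket index would shift those bits back down,
 * along the lines of (illustrative, not the actual ZAP code):
 *
 *	bucket = zap_hash(salt, name) >> (64 - ZAP_HASHBITS);
 */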
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t rm_freed;		/* map no longer has referencing ZIO */
	uint8_t rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P	0
#define	VDEV_RAIDZ_Q	1
#define	VDEV_RAIDZ_R	2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
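/*
 * The identity behind the macros above, checked lane by lane: multiplying
 * the packed 64-bit value by 2 must agree with VDEV_RAIDZ_MUL_2 applied to
 * each byte. For example, 0x80 * 2 == 0x1d, since the top bit shifts out
 * and the reduction polynomial 0x1d is folded back in. This self-test is
 * illustrative only and is not called by the loader.
 */
static int __unused
vdev_raidz_64mul_selftest(void)
{
	uint64_t x = 0x8001234580ff00aaULL, y = x, mask;
	int i;

	VDEV_RAIDZ_64MUL_2(y, mask);
	for (i = 0; i < 8; i++) {
		uint8_t b = x >> (i * 8);

		if ((uint8_t)(y >> (i * 8)) != (uint8_t)VDEV_RAIDZ_MUL_2(b))
			return (1);
	}
	return (0);
}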
/*
 * These two tables represent powers and logs of 2 in the Galois field
 * defined above. These values were computed by repeatedly multiplying by
 * 2 as above.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
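/*
 * With these tables, general multiplication in the field is log/antilog
 * lookup: for nonzero a and b, a * b == pow2[(log2[a] + log2[b]) % 255].
 * A minimal sketch (the reconstruction code only ever needs the b == 2^exp
 * special case, vdev_raidz_exp2() below); illustrative only:
 */
static uint8_t __unused
vdev_raidz_gf_mul(uint8_t a, uint8_t b)
{

	if (a == 0 || b == 0)
		return (0);
	return (vdev_raidz_pow2[(vdev_raidz_log2[a] +
	    vdev_raidz_log2[b]) % 255]);
}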
/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint8_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p, *src, pcount, ccount, i;
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccount == pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p = *src;
			}
		} else {
			ASSERT(ccount <= pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p ^= *src;
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}
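/*
 * Unrolled, the loops above evaluate Q Horner-style over the data columns.
 * For three data columns D0, D1, D2 (in column order), each 64-bit word of
 * parity ends up as
 *
 *	P = D0 ^ D1 ^ D2
 *	Q = 2 * (2 * D0 ^ D1) ^ D2 = 4*D0 ^ 2*D1 ^ D2
 *
 * with all arithmetic in GF(2^8) applied to eight byte lanes at a time,
 * matching the Vandermonde coefficients (4, 2, 1) in the comment below.
 */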
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		panic("invalid RAID-Z configuration");
	}
}

/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *           |     |   __       __       |  p_0  |
 *           |  V  |  |  D_0  |          | p_m-1 |
 *           |     | x |   :   |    =    |  d_0  |
 *           |  I  |  | D_n-1 |          |   :   |
 *           |     |   ~~       ~~       | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
 * computation as well as linear separability.
 *
 *            __       __                     __     __
 *           |  1  ..  1  1  1 |             |  p_0  |
 *           | 2^n-1 .. 4 2 1 |    __  __    |   :   |
 *           | 4^n-1 .. 16 4 1 |  |  D_0  |  | p_m-1 |
 *           |  1  ..  0  0  0 |  |  D_1  |  |  d_0  |
 *           |  0  ..  0  0  0 | x|  D_2  | =|  d_1  |
 *           |  :      :  :  : |  |   :   |  |  d_2  |
 *           |  0  ..  1  0  0 |  | D_n-1 |  |   :   |
 *           |  0  ..  0  1  0 |   ~~  ~~    |   :   |
 *           |  0  ..  0  0  1 |             | d_n-1 |
 *            ~~       ~~                     ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination.
 * In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *            __                               __
 *           |   1   1   1   1   1   1   1   1 |
 *           | 128  64  32  16   8   4   2   1 | <-----+-+-- missing disks
 *           |  19 205 116  29  64  16   4   1 |      / /
 *           |   1   0   0   0   0   0   0   0 |     / /
 *           |   0   1   0   0   0   0   0   0 | <--'  /
 *  (V|I)  = |   0   0   1   0   0   0   0   0 | <---'
 *           |   0   0   0   1   0   0   0   0 |
 *           |   0   0   0   0   1   0   0   0 |
 *           |   0   0   0   0   0   1   0   0 |
 *           |   0   0   0   0   0   0   1   0 |
 *           |   0   0   0   0   0   0   0   1 |
 *            ~~                               ~~
 *            __                               __
 *           |   1   1   1   1   1   1   1   1 |
 *           |  19 205 116  29  64  16   4   1 |
 *           |   1   0   0   0   0   0   0   0 |
 *  (V|I)' = |   0   0   0   1   0   0   0   0 |
 *           |   0   0   0   0   1   0   0   0 |
 *           |   0   0   0   0   0   1   0   0 |
 *           |   0   0   0   0   0   0   1   0 |
 *           |   0   0   0   0   0   0   0   1 |
 *            ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 *  __                                                                 __
 * |   1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0 |
 * |  19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0 |
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 *  ~~                                                                 ~~
 *  __                                                                 __
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |   1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0 |
 * |  19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0 |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 *  ~~                                                                 ~~
 *  __                                                                 __
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1 |
 * |   0 205 116   0   0   0   0   0     0   1  19  29  64  16   4   1 |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 *  ~~                                                                 ~~
 *  __                                                                 __
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1 |
 * |   0   0 185   0   0   0   0   0   205   1 222 208 141 221 201 204 |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 *  ~~                                                                 ~~
 *  __                                                                 __
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1 |
 * |   0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209 |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 *  ~~                                                                 ~~
 *  __                                                                 __
 * |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 * |   0   1   0   0   0   0   0   0   167 100   5  41 159 169 217 208 |
 * |   0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209 |
 * |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 * |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 * |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 * |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 * |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 *  ~~                                                                 ~~
 *               __                               __
 *              |   0   0   1   0   0   0   0   0 |
 *              | 167 100   5  41 159 169 217 208 |
 *              | 166 100   4  40 158 168 216 209 |
 * (V|I)'^-1  = |   0   0   0   1   0   0   0   0 |
 *              |   0   0   0   0   1   0   0   0 |
 *              |   0   0   0   0   0   1   0   0 |
 *              |   0   0   0   0   0   0   1   0 |
 *              |   0   0   0   0   0   0   0   1 |
 *               ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */

static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
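/*
 * The rows built above are the Vandermonde rows of the surviving parity
 * columns: entry j of row i is 2^(map[i] * (n - 1 - j)) in GF(2^8). For
 * map[i] == 1 (the Q column) and n == 8, the walk of "pow" produces
 *
 *	128 64 32 16 8 4 2 1
 *
 * which is exactly the second row of the example matrix above.
 */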
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows is properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}

static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	log = 0;	/* gcc */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = malloc(psize);
	if (p == NULL) {
		printf("Out of memory\n");
		return;
	}

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	free(p);
}
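/*
 * The inner loop above is the matrix product from the block comment,
 * D = (V|I)'^-1 x (d|p)', evaluated a byte at a time in log space: each
 * surviving column i contributes invrows[cc][i] * src to missing column
 * cc, the multiply done as pow2[(log2[*src] + invlog[cc][i]) % 255] and
 * the sum accumulated with XOR.
 */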
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;

	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = malloc(psize);
	if (p == NULL) {
		printf("Out of memory\n");
		return (code);
	}

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	free(p);

	return (code);
}

static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}
static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	uint64_t b = offset >> unit_shift;
	uint64_t s = size >> unit_shift;
	uint64_t f = b % dcols;
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	bc = (r == 0 ? 0 : r + nparity);
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	if (q == 0) {
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = malloc(offsetof(raidz_map_t, rm_col[scols]));
	if (rm == NULL)
		return (rm);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rm->rm_col[c].rc_data = malloc(rm->rm_col[c].rc_size);
		if (rm->rm_col[c].rc_data == NULL) {
			c++;
			while (c != 0)
				free(rm->rm_col[--c].rc_data);
			free(rm);
			return (NULL);
		}
	}

	rm->rm_col[c].rc_data = data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
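/*
 * A worked example of the geometry above, assuming 512-byte sectors
 * (unit_shift == 9) on a 4+1 raidz1 (dcols == 5, nparity == 1): a 4.5KB
 * block is s == 9 sectors, so q == 9 / 4 == 2, r == 1, bc == r + nparity
 * == 2 and tot == 9 + 1 * (2 + 1) == 12. Columns 0 (parity) and 1 get
 * q + 1 == 3 sectors each, columns 2-4 get q == 2, and rm_nskip ==
 * roundup(12, 2) - 12 == 0, so no padding sectors follow.
 */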
static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
		free(rm->rm_col[c].rc_data);

	free(rm);
}

static vdev_t *
vdev_child(vdev_t *pvd, uint64_t devidx)
{
	vdev_t *cvd;

	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
		if (cvd->v_id == devidx)
			break;
	}

	return (cvd);
}

/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
    uint64_t size)
{
	return (zio_checksum_verify(spa, bp, data));
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches
 * the data we read. If it doesn't, we fire off a checksum error. Return
 * the number of such failures.
 */
static int
raidz_parity_verify(raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = malloc(rc->rc_size);
		if (orig[c] != NULL) {
			bcopy(rc->rc_data, orig[c], rc->rc_size);
		} else {
			printf("Out of memory\n");
		}
	}

	vdev_raidz_generate_parity(rm);

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (orig[c] == NULL ||
		    bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			rc->rc_error = ECKSUM;
			ret++;
		}
		free(orig[c]);
	}

	return (ret);
}
/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 */
static int
vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
    void *data, off_t offset, uint64_t bytes, int total_errors,
    int data_errors)
{
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = malloc(rm->rm_col[0].rc_size);
		if (orig[n - 1] == NULL) {
			ret = ENOMEM;
			goto done;
		}

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	n--;
done:
	for (i = n - 1; i >= 0; i--) {
		free(orig[i]);
	}

	return (ret);
}
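/*
 * For example, on a raidz2 map with five columns (rm_firstdatacol == 2)
 * and no columns already marked bad, the loops above try the target sets
 *
 *	n == 1: (2) (3) (4)
 *	n == 2: (0,2) (1,2) (0,3) (1,3) (2,3) (0,4) (1,4) (2,4) (3,4)
 *
 * i.e. every combination whose highest member is a data column, since
 * with data_errors == 0 the highest target is pushed into the data
 * columns and re-targeting parity alone could not change the data.
 */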
static int
vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
    off_t offset, size_t bytes)
{
	vdev_t *tvd = vd->v_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, error;
	int unexpected_errors;
	int parity_errors;
	int parity_untried;
	int data_errors;
	int total_errors;
	int n;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	rc = NULL;	/* gcc */
	error = 0;

	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
	    vd->v_nchildren, vd->v_nparity);
	if (rm == NULL)
		return (ENOMEM);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vdev_child(vd, rc->rc_devidx);
		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
#if 0	/* XXX: Too hard for the boot code. */
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
#endif
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
			    rc->rc_offset, rc->rc_size);
			rc->rc_tried = 1;
			rc->rc_skipped = 0;
		}
	}

reconstruct:
	unexpected_errors = 0;
	parity_errors = 0;
	parity_untried = 0;
	data_errors = 0;
	total_errors = 0;

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		int rv;

		if (data_errors == 0) {
			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read
	 * error or a child silently returned bad data. Read every
	 * block so we can try again with as much data and parity as
	 * we can track down. If we've already been through once
	 * before, all children will be marked as tried so we'll
	 * proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	n = 0;
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_tried)
			continue;

		cvd = vdev_child(vd, rc->rc_devidx);
		ASSERT(cvd != NULL);
		rc->rc_error = cvd->v_read(cvd, NULL,
		    rc->rc_data, rc->rc_offset, rc->rc_size);
		if (rc->rc_error == 0)
			n++;
		rc->rc_tried = 1;
		rc->rc_skipped = 0;
	}
	/*
	 * If we managed to read anything more, retry the
	 * reconstruction.
	 */
	if (n > 0)
		goto reconstruct;
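	/*
	 * Summary of the exits below for a raidz2 map (rm_firstdatacol == 2):
	 * more than two failed columns is unrecoverable (EIO); zero or one
	 * failed column leaves spare parity for the combinatorial search;
	 * exactly two failures, or a search that exhausts every combination
	 * without producing a valid checksum, means the data cannot be
	 * reconstructed (ECKSUM).
	 */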
	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		error = EIO;
	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(vd->v_spa, rm, bp, data, offset, bytes,
	    total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		error = ECKSUM;
	}

done:
	vdev_raidz_map_free(rm);

	return (error);
}