/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <lz4.h>

static uint64_t zfs_crc64_table[256];

#define	ASSERT3S(x, y, z)	((void)0)
#define	ASSERT3U(x, y, z)	((void)0)
#define	ASSERT3P(x, y, z)	((void)0)
#define	ASSERT0(x)		((void)0)
#define	ASSERT(x)		((void)0)

#define	panic(...)	do {						\
	printf(__VA_ARGS__);						\
	for (;;) ;							\
} while (0)

static void
zfs_init_crc(void)
{
	int i, j;
	uint64_t *ct;

	/*
	 * Calculate the crc64 table (used for the zap hash
	 * function).
	 */
	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
		for (i = 0; i < 256; i++)
			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
	}
}
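
/*
 * Worked example of the sentinel test above (editorial note, derived from
 * the loop itself): for i = 128 (0x80) the low bit is clear for the first
 * seven of the eight rounds, so the entry is simply shifted right:
 * 0x80 -> 0x40 -> 0x20 -> 0x10 -> 0x08 -> 0x04 -> 0x02 -> 0x01. On the
 * eighth round the low bit is finally set, so
 *
 *	*ct = (0x01 >> 1) ^ (-(uint64_t)1 & ZFS_CRC64_POLY) = ZFS_CRC64_POLY.
 *
 * Hence zfs_crc64_table[128] == ZFS_CRC64_POLY exactly when the table has
 * been initialized, which is why that slot doubles as the "already done"
 * flag here and in the assertion in zap_hash() below.
 */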

static void
zio_checksum_off(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}

/*
 * Signature for checksum functions.
 */
typedef void zio_checksum_t(const void *data, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp);
typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
typedef void zio_checksum_tmpl_free_t(void *ctx_template);

typedef enum zio_checksum_flags {
	/* Strong enough for metadata? */
	ZCHECKSUM_FLAG_METADATA = (1 << 1),
	/* ZIO embedded checksum */
	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
	/* Strong enough for dedup (without verification)? */
	ZCHECKSUM_FLAG_DEDUP = (1 << 3),
	/* Uses salt value */
	ZCHECKSUM_FLAG_SALTED = (1 << 4),
	/* Strong enough for nopwrite? */
	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
} zio_checksum_flags_t;

/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
	/* checksum function for each byteorder */
	zio_checksum_t	*ci_func[2];
	zio_checksum_tmpl_init_t *ci_tmpl_init;
	zio_checksum_tmpl_free_t *ci_tmpl_free;
	zio_checksum_flags_t ci_flags;
	const char	*ci_name;	/* descriptive name */
} zio_checksum_info_t;

#include "blkptr.c"

#include "fletcher.c"
#include "sha256.c"
#include "skein_zfs.c"

#ifdef HAS_ZSTD_ZFS
extern int zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int n);
#endif

static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
	{{NULL, NULL}, NULL, NULL, 0, "on"},
	{{zio_checksum_off, zio_checksum_off}, NULL, NULL, 0, "off"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
	{{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
	{{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
	    0, "fletcher2"},
	{{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA, "fletcher4"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
	{{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
	{{zio_checksum_off, zio_checksum_off}, NULL, NULL,
	    0, "noparity"},
	{{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap},
	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
	{{zio_checksum_skein_native, zio_checksum_skein_byteswap},
	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
	/* no edonr for now */
	{{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"}
};
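
/*
 * Illustrative sketch (editorial, not used by the loader): the table above
 * is indexed by the checksum enum stored in the block pointer, and a
 * checksum's properties can be read straight off its ci_flags. The helper
 * name below is ours, for illustration only.
 */
static int __unused
zio_checksum_dedup_capable(unsigned int checksum)
{

	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
		return (0);
	return ((zio_checksum_table[checksum].ci_flags &
	    ZCHECKSUM_FLAG_DEDUP) != 0);
}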

/*
 * Common signature for all zio compress/decompress functions.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
	zio_compress_func_t	*ci_compress;	/* compression function */
	zio_decompress_func_t	*ci_decompress;	/* decompression function */
	int			ci_level;	/* level parameter */
	const char		*ci_name;	/* algorithm name */
} zio_compress_info_t;

#include "lzjb.c"
#include "zle.c"

/*
 * Compression vectors.
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
	{NULL, NULL, 0, "inherit"},
	{NULL, NULL, 0, "on"},
	{NULL, NULL, 0, "uncompressed"},
	{NULL, lzjb_decompress, 0, "lzjb"},
	{NULL, NULL, 0, "empty"},
	{NULL, NULL, 1, "gzip-1"},
	{NULL, NULL, 2, "gzip-2"},
	{NULL, NULL, 3, "gzip-3"},
	{NULL, NULL, 4, "gzip-4"},
	{NULL, NULL, 5, "gzip-5"},
	{NULL, NULL, 6, "gzip-6"},
	{NULL, NULL, 7, "gzip-7"},
	{NULL, NULL, 8, "gzip-8"},
	{NULL, NULL, 9, "gzip-9"},
	{NULL, zle_decompress, 64, "zle"},
	{NULL, lz4_decompress, 0, "lz4"},
#ifdef HAS_ZSTD_ZFS
	{NULL, zfs_zstd_decompress, ZIO_ZSTD_LEVEL_DEFAULT, "zstd"}
#endif
};

static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	int i;

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}

/*
 * Set the external verifier for a gang block based on <vdev, offset, txg>,
 * a tuple which is guaranteed to be unique for the life of the pool.
 */
static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
{
	const dva_t *dva = BP_IDENTITY(bp);
	uint64_t txg = BP_PHYSICAL_BIRTH(bp);

	ASSERT(BP_IS_GANG(bp));

	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
}

/*
 * Set the external verifier for a label block based on its offset.
 * The vdev is implicit, and the txg is unknowable at pool open time --
 * hence the logic in vdev_uberblock_load() to find the most recent copy.
 */
static void
zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
{
	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
}

/*
 * Calls the template init function of a checksum which supports context
 * templates and installs the template into the spa_t.
 */
static void
zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
{
	zio_checksum_info_t *ci = &zio_checksum_table[checksum];

	if (ci->ci_tmpl_init == NULL)
		return;

	if (spa->spa_cksum_tmpls[checksum] == NULL) {
		spa->spa_cksum_tmpls[checksum] =
		    ci->ci_tmpl_init(&spa->spa_cksum_salt);
	}
}

/*
 * Called by a spa_t that's about to be deallocated. This steps through
 * all of the checksum context templates and deallocates any that were
 * initialized using the algorithm-specific template init function.
 */
static void __unused
zio_checksum_templates_free(spa_t *spa)
{
	for (enum zio_checksum checksum = 0;
	    checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
		if (spa->spa_cksum_tmpls[checksum] != NULL) {
			zio_checksum_info_t *ci = &zio_checksum_table[checksum];

			ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
			spa->spa_cksum_tmpls[checksum] = NULL;
		}
	}
}

static int
zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
{
	uint64_t size;
	unsigned int checksum;
	zio_checksum_info_t *ci;
	void *ctx = NULL;
	zio_cksum_t actual_cksum, expected_cksum, verifier;
	int byteswap;

	checksum = BP_GET_CHECKSUM(bp);
	size = BP_GET_PSIZE(bp);

	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
		return (EINVAL);
	ci = &zio_checksum_table[checksum];
	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
		return (EINVAL);

	if (spa != NULL) {
		zio_checksum_template_init(checksum, __DECONST(spa_t *, spa));
		ctx = spa->spa_cksum_tmpls[checksum];
	}

	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
		zio_eck_t *eck;

		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
		    checksum == ZIO_CHECKSUM_LABEL);

		eck = (zio_eck_t *)((char *)data + size) - 1;

		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
			zio_checksum_gang_verifier(&verifier, bp);
		else if (checksum == ZIO_CHECKSUM_LABEL)
			zio_checksum_label_verifier(&verifier,
			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
		else
			verifier = bp->blk_cksum;

		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

		if (byteswap)
			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

		expected_cksum = eck->zec_cksum;
		eck->zec_cksum = verifier;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
		eck->zec_cksum = expected_cksum;

		if (byteswap)
			byteswap_uint64_array(&expected_cksum,
			    sizeof (zio_cksum_t));
	} else {
		byteswap = BP_SHOULD_BYTESWAP(bp);
		expected_cksum = bp->blk_cksum;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
	}

	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
		/*printf("ZFS: read checksum %s failed\n", ci->ci_name);*/
		return (EIO);
	}

	return (0);
}

static int
zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
    void *dest, uint64_t destsize)
{
	zio_compress_info_t *ci;

	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
		return (EIO);
	}

	ci = &zio_compress_table[cpfunc];
	if (!ci->ci_decompress) {
		printf("ZFS: unsupported compression algorithm %s\n",
		    ci->ci_name);
		return (EIO);
	}

	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
}

static uint64_t
zap_hash(uint64_t salt, const char *name)
{
	const uint8_t *cp;
	uint8_t c;
	uint64_t crc = salt;

	ASSERT(crc != 0);
	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];

	/*
	 * Only use 28 bits, since we need 4 bits in the cookie for the
	 * collision differentiator. We MUST use the high bits, since
	 * those are the ones that we first pay attention to when
	 * choosing the bucket.
	 */
	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);

	return (crc);
}
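
/*
 * Worked example of the masking above (editorial note, assuming the 28-bit
 * ZAP_HASHBITS value the comment refers to): with ZAP_HASHBITS == 28 the
 * expression clears the low 64 - 28 = 36 bits,
 *
 *	crc &= ~((1ULL << 36) - 1), i.e. crc &= 0xFFFFFFF000000000ULL,
 *
 * leaving only the top 28 bits of the CRC to select the hash bucket and
 * freeing the low bits of the 64-bit cookie for the collision
 * differentiator and other bookkeeping.
 */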

typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t rm_freed;		/* map no longer has referencing ZIO */
	uint8_t rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P	0
#define	VDEV_RAIDZ_Q	1
#define	VDEV_RAIDZ_R	2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
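
/*
 * Worked example (editorial) of the mask trick on a single lane: take a
 * 64-bit word whose low byte is 0x80 and whose other bytes are zero. Then
 *
 *	mask = x & 0x8080808080808080ULL	== 0x80
 *	mask = (mask << 1) - (mask >> 7)	== 0x100 - 0x01 == 0xff
 *	x = ((x << 1) & 0xfe..fe) ^ (mask & 0x1d..1d)
 *	  = 0x00 ^ 0x1d			== 0x1d
 *
 * which matches VDEV_RAIDZ_MUL_2(0x80) == 0x1d, i.e. multiplication by 2 in
 * GF(2^8) with the reducing polynomial x^8 + x^4 + x^3 + x^2 + 1 (0x11d).
 * A byte without its top bit set, e.g. 0x01, leaves mask == 0 in its lane
 * and simply shifts to 0x02.
 */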

/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
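
/*
 * Illustrative sketch (editorial, not used by the loader): with the two
 * tables above, multiplying arbitrary field elements reduces to adding
 * logarithms mod 255. vdev_raidz_exp2() below is the special case where one
 * operand is already given as a power of 2; this shows the general case.
 * The function name is ours, for illustration only.
 */
static uint8_t __unused
vdev_raidz_gf_mul(uint8_t a, uint8_t b)
{
	int exp;

	if (a == 0 || b == 0)
		return (0);

	exp = vdev_raidz_log2[a] + vdev_raidz_log2[b];
	if (exp > 255)
		exp -= 255;

	/* Example: a = 0x02, b = 0x80 gives vdev_raidz_pow2[1 + 7] == 0x1d. */
	return (vdev_raidz_pow2[exp]);
}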

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint8_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p, *src, ccount, i;
	uint64_t pcount __unused;
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccount == pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p = *src;
			}
		} else {
			ASSERT(ccount <= pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p ^= *src;
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}
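
/*
 * Worked example (editorial) for two data columns D0 and D1: the first pass
 * seeds p = q = D0; the second pass XORs D1 into p and multiplies q by 2
 * before XORing, so
 *
 *	P = D0 ^ D1
 *	Q = 2 * D0 ^ D1		(GF(2^8) arithmetic, per byte lane)
 *
 * e.g. with single-byte lanes D0 = 0x01 and D1 = 0x03: P = 0x02 and
 * Q = 0x02 ^ 0x03 = 0x01.
 */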

static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		panic("invalid RAID-Z configuration");
	}
}
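
/*
 * Editorial summary of the three generators above, in closed form: with data
 * columns D_0 .. D_n-1 (D_0 in the first data column) and all arithmetic in
 * GF(2^8),
 *
 *	P = D_0 ^ D_1 ^ ... ^ D_n-1
 *	Q = 2^n-1 * D_0 ^ 2^n-2 * D_1 ^ ... ^ D_n-1
 *	R = 4^n-1 * D_0 ^ 4^n-2 * D_1 ^ ... ^ D_n-1
 *
 * These coefficients (powers of 1, 2, and 4) are exactly the rows of the
 * dispersal matrix V in the reconstruction discussion below.
 */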
/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |    __     __        |  p_0  |
 *            |  V  |    |  D_0  |        | p_m-1 |
 *            |     | x  |   :   |   =    |  d_0  |
 *            |  I  |    | D_n-1 |        |   :   |
 *            |     |    ~~     ~~        | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity, speedy
 * computation, and linear separability.
 *
 *            __                 __                   __     __
 *            |  1    ..  1  1  1  |                  |  p_0  |
 *            | 2^n-1 ..  4  2  1  |      __     __   |   :   |
 *            | 4^n-1 .. 16  4  1  |      |  D_0  |   | p_m-1 |
 *            |  1    ..  0  0  0  |      |  D_1  |   |  d_0  |
 *            |  0    ..  0  0  0  |  x   |  D_2  | = |  d_1  |
 *            |  :    ..  :  :  :  |      |   :   |   |  d_2  |
 *            |  0    ..  1  0  0  |      | D_n-1 |   |   :   |
 *            |  0    ..  0  1  0  |      ~~     ~~   |   :   |
 *            |  0    ..  0  0  1  |                  | d_n-1 |
 *            ~~                 ~~                   ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *
 *            __                               __
 *            |  1    1    1    1    1    1    1    1  |
 *            | 128   64   32   16    8    4    2    1 | <-----+-+-- missing disks
 *            |  19  205  116   29   64   16    4    1 |      / /
 *            |  1    0    0    0    0    0    0    0  |     / /
 *            |  0    1    0    0    0    0    0    0  | <--' /
 *  (V|I)  =  |  0    0    1    0    0    0    0    0  | <---'
 *            |  0    0    0    1    0    0    0    0  |
 *            |  0    0    0    0    1    0    0    0  |
 *            |  0    0    0    0    0    1    0    0  |
 *            |  0    0    0    0    0    0    1    0  |
 *            |  0    0    0    0    0    0    0    1  |
 *            ~~                               ~~
 *            __                               __
 *            |  1    1    1    1    1    1    1    1  |
 *            |  19  205  116   29   64   16    4    1 |
 *            |  1    0    0    0    0    0    0    0  |
 *  (V|I)' =  |  0    0    0    1    0    0    0    0  |
 *            |  0    0    0    0    1    0    0    0  |
 *            |  0    0    0    0    0    1    0    0  |
 *            |  0    0    0    0    0    0    1    0  |
 *            |  0    0    0    0    0    0    0    1  |
 *            ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 *            __                                                                 __
 *            |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 *            | 19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0  |
 *            |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *            |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *            |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *            |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *            |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *            |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *            ~~                                                                 ~~
 *            __                                                                 __
 *            |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *            |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 *            | 19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0  |
 *            |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *            |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *            |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *            |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *            |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *            ~~                                                                 ~~
 *            __                                                                 __
 *            |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *            |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 *            |  0 205 116   0   0   0   0   0     0   1  19  29  64  16   4   1  |
 *            |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *            |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *            |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *            |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *            |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *            ~~                                                                 ~~
 *            __                                                                 __
 *            |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *            |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 *            |  0   0 185   0   0   0   0   0   205   1 222 208 141 221 201 204  |
 *            |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *            |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *            |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *            |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *            |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *            ~~                                                                 ~~
 *            __                                                                 __
 *            |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *            |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 *            |  0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209  |
 *            |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *            |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *            |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *            |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *            |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *            ~~                                                                 ~~
 *            __                                                                 __
 *            |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *            |  0   1   0   0   0   0   0   0   167 100   5  41 159 169 217 208  |
 *            |  0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209  |
 *            |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *            |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *            |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *            |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *            |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *            ~~                                                                 ~~
 *                        __                               __
 *                        |  0    0    1    0    0    0    0    0  |
 *                        | 167  100    5   41  159  169  217  208 |
 *                        | 166  100    4   40  158  168  216  209 |
 *           (V|I)'^-1 =  |  0    0    0    1    0    0    0    0  |
 *                        |  0    0    0    0    1    0    0    0  |
 *                        |  0    0    0    0    0    1    0    0  |
 *                        |  0    0    0    0    0    0    1    0  |
 *                        |  0    0    0    0    0    0    0    1  |
 *                        ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */

static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
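
/*
 * Worked example (editorial): for a map entry map[i] == 0 (the P row), pow
 * stays 0 around the whole loop, so the row is all vdev_raidz_pow2[0] == 1
 * entries. For map[i] == 1 (the Q row), pow starts at n and drops by 1 each
 * column, generating 2^n-1, 2^n-2, ..., 2, 1 -- precisely the rows of the
 * dispersal matrix V shown in the discussion above.
 */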

static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data left in the rows is properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}

static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	log = 0;	/* gcc */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = malloc(psize);
	if (p == NULL) {
		printf("Out of memory\n");
		return;
	}

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	free(p);
}
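
/*
 * Editorial note on the loops above: for each missing column x and byte
 * offset j, the reconstruction computes (in GF(2^8), with XOR-accumulation
 * over the n surviving columns i)
 *
 *	D_x[j] = XOR over i of ( invrows[x][i] * (d|p)'_i[j] )
 *
 * The multiplications are performed in the log domain via invlog and
 * vdev_raidz_pow2, with *src == 0 short-circuited to 0 since log(0) is
 * undefined.
 */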

static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;

	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = malloc(psize);
	if (p == NULL) {
		printf("Out of memory\n");
		return (code);
	}

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	free(p);

	return (code);
}

static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}
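
/*
 * Worked example (editorial) of the geometry computed below, for a RAID-Z1
 * vdev with dcols = 5 children, 512-byte sectors (unit_shift = 9), and a
 * 3-sector block at offset 0: s = 3, q = 3 / 4 = 0, r = 3, so
 * bc = r + nparity = 4 and tot = 3 + 1 * (0 + 1) = 4. Since q == 0,
 * acols = bc = 4 and scols = MIN(5, roundup(4, 2)) = 4: one parity column
 * and three data columns of one sector each, with no skip sectors.
 */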
static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	uint64_t b = offset >> unit_shift;
	uint64_t s = size >> unit_shift;
	uint64_t f = b % dcols;
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	bc = (r == 0 ? 0 : r + nparity);
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	if (q == 0) {
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = malloc(offsetof(raidz_map_t, rm_col[scols]));
	if (rm == NULL)
		return (rm);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rm->rm_col[c].rc_data = malloc(rm->rm_col[c].rc_size);
		if (rm->rm_col[c].rc_data == NULL) {
			c++;
			while (c != 0)
				free(rm->rm_col[--c].rc_data);
			free(rm);
			return (NULL);
		}
	}

	rm->rm_col[c].rc_data = data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}

static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
		free(rm->rm_col[c].rc_data);

	free(rm);
}
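
/*
 * Illustrative sketch (editorial, never called by the loader): the typical
 * write-side lifecycle of a map would be alloc, generate parity, then free.
 * The boot code only reads, so nothing below uses this; the function name
 * and the geometry values are ours, for illustration only.
 */
static void __unused
vdev_raidz_map_example(void *buf, size_t size)
{
	raidz_map_t *rm;

	/* 512-byte sectors, 5 children, single parity (RAID-Z1). */
	rm = vdev_raidz_map_alloc(buf, 0, size, 9, 5, 1);
	if (rm == NULL)
		return;
	vdev_raidz_generate_parity(rm);
	vdev_raidz_map_free(rm);
}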

static vdev_t *
vdev_child(vdev_t *pvd, uint64_t devidx)
{
	vdev_t *cvd;

	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
		if (cvd->v_id == devidx)
			break;
	}

	return (cvd);
}

/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
    uint64_t size)
{
	return (zio_checksum_verify(spa, bp, data));
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = malloc(rc->rc_size);
		if (orig[c] != NULL) {
			bcopy(rc->rc_data, orig[c], rc->rc_size);
		} else {
			printf("Out of memory\n");
		}
	}

	vdev_raidz_generate_parity(rm);

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (orig[c] == NULL ||
		    bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			rc->rc_error = ECKSUM;
			ret++;
		}
		free(orig[c]);
	}

	return (ret);
}

/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 */
static int
vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
    void *data, off_t offset, uint64_t bytes, int total_errors, int data_errors)
{
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = malloc(rm->rm_col[0].rc_size);
		if (orig[n - 1] == NULL) {
			ret = ENOMEM;
			goto done;
		}

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	n--;
done:
	for (i = n - 1; i >= 0; i--) {
		free(orig[i]);
	}

	return (ret);
}

static int
vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
    off_t offset, size_t bytes)
{
	vdev_t *tvd = vd->v_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, error;
	int unexpected_errors;
	int parity_errors;
	int parity_untried;
	int data_errors;
	int total_errors;
	int n;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	rc = NULL;	/* gcc */
	error = 0;

	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
	    vd->v_nchildren, vd->v_nparity);
	if (rm == NULL)
		return (ENOMEM);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vdev_child(vd, rc->rc_devidx);
		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
#if 0	/* XXX: Too hard for the boot code. */
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
#endif
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
			    rc->rc_offset, rc->rc_size);
			rc->rc_tried = 1;
			rc->rc_skipped = 0;
		}
	}

reconstruct:
	unexpected_errors = 0;
	parity_errors = 0;
	parity_untried = 0;
	data_errors = 0;
	total_errors = 0;

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		int rv;

		if (data_errors == 0) {
			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed), regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read
	 * error or a child silently returned bad data. Read every
	 * block so we can try again with as much data and parity as
	 * we can track down. If we've already been through once
	 * before, all children will be marked as tried so we'll
	 * proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	n = 0;
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_tried)
			continue;

		cvd = vdev_child(vd, rc->rc_devidx);
		ASSERT(cvd != NULL);
		rc->rc_error = cvd->v_read(cvd, NULL,
		    rc->rc_data, rc->rc_offset, rc->rc_size);
		if (rc->rc_error == 0)
			n++;
		rc->rc_tried = 1;
		rc->rc_skipped = 0;
	}
	/*
	 * If we managed to read anything more, retry the
	 * reconstruction.
	 */
	if (n > 0)
		goto reconstruct;

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		error = EIO;
	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(vd->v_spa, rm, bp, data, offset, bytes,
	     total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, assuming the I/O wasn't speculative.
		 */
		error = ECKSUM;
	}

done:
	vdev_raidz_map_free(rm);

	return (error);
}