/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <lz4.h>

static uint64_t zfs_crc64_table[256];

#ifndef ASSERT3S	/* Proxy for all the assert defines */
#define	ASSERT3S(x, y, z)	((void)0)
#define	ASSERT3U(x, y, z)	((void)0)
#define	ASSERT3P(x, y, z)	((void)0)
#define	ASSERT0(x)		((void)0)
#define	ASSERT(x)		((void)0)
#endif

#define	panic(...)	do {		\
	printf(__VA_ARGS__);		\
	for (;;) ;			\
} while (0)

static void
zfs_init_crc(void)
{
	int i, j;
	uint64_t *ct;

	/*
	 * Calculate the crc64 table (used for the zap hash
	 * function).
	 */
	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
		for (i = 0; i < 256; i++)
			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
	}
}

static void
zio_checksum_off(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}

/*
 * Signature for checksum functions.
 */
typedef void zio_checksum_t(const void *data, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp);
typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
typedef void zio_checksum_tmpl_free_t(void *ctx_template);

typedef enum zio_checksum_flags {
	/* Strong enough for metadata? */
	ZCHECKSUM_FLAG_METADATA = (1 << 1),
	/* ZIO embedded checksum */
	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
	/* Strong enough for dedup (without verification)? */
	ZCHECKSUM_FLAG_DEDUP = (1 << 3),
	/* Uses salt value */
	ZCHECKSUM_FLAG_SALTED = (1 << 4),
	/* Strong enough for nopwrite? */
	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
} zio_checksum_flags_t;
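
/*
 * Illustrative sketch, not part of the loader and not compiled: how the
 * table built by zfs_init_crc() is consumed.  The per-byte update below is
 * the same one zap_hash() uses further down; the function name and the
 * all-ones seed are hypothetical.
 */
#if 0
static uint64_t
example_crc64(const void *buf, size_t len)
{
	const uint8_t *p = buf;
	uint64_t crc = -1ULL;	/* zap_hash() seeds with the pool's salt */

	while (len-- != 0)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ *p++) & 0xFF];
	return (crc);
}
#endif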

/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
	/* checksum function for each byteorder */
	zio_checksum_t	*ci_func[2];
	zio_checksum_tmpl_init_t *ci_tmpl_init;
	zio_checksum_tmpl_free_t *ci_tmpl_free;
	zio_checksum_flags_t ci_flags;
	const char	*ci_name;	/* descriptive name */
} zio_checksum_info_t;

#include "blkptr.c"

#include "fletcher.c"
#include "blake3_zfs.c"
#include "sha256.c"
#include "skein_zfs.c"

#ifdef HAS_ZSTD_ZFS
extern int zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int n);
#endif

static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
	{{NULL, NULL}, NULL, NULL, 0, "on"},
	{{zio_checksum_off, zio_checksum_off}, NULL, NULL, 0, "off"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
	{{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
	{{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
	    0, "fletcher2"},
	{{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA, "fletcher4"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
	{{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
	{{zio_checksum_off, zio_checksum_off}, NULL, NULL,
	    0, "noparity"},
	{{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap},
	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
	{{zio_checksum_skein_native, zio_checksum_skein_byteswap},
	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
	/* no edonr for now */
	{{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
	{{zio_checksum_blake3_native, zio_checksum_blake3_byteswap},
	    zio_checksum_blake3_tmpl_init, zio_checksum_blake3_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "blake3"}
};

/*
 * Common signature for all zio compress/decompress functions.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
	zio_compress_func_t	*ci_compress;	/* compression function */
	zio_decompress_func_t	*ci_decompress;	/* decompression function */
	int			ci_level;	/* level parameter */
	const char		*ci_name;	/* algorithm name */
} zio_compress_info_t;

#include "lzjb.c"
#include "zle.c"
#include "gzip.c"
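
/*
 * Illustrative sketch, not compiled: the ci_func[2] slots above hold one
 * implementation per byte order -- index 0 for native, index 1 for
 * byteswapped blocks.  zio_checksum_verify() below picks the slot from the
 * block pointer; the helper name here is hypothetical.
 */
#if 0
static void
example_checksum(unsigned int checksum, int byteswap, void *data,
    uint64_t size, zio_cksum_t *zcp)
{
	zio_checksum_info_t *ci = &zio_checksum_table[checksum];

	ci->ci_func[byteswap](data, size, NULL, zcp);
}
#endif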

/*
 * Compression vectors.
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
	{NULL, NULL, 0, "inherit"},
	{NULL, NULL, 0, "on"},
	{NULL, NULL, 0, "uncompressed"},
	{NULL, lzjb_decompress, 0, "lzjb"},
	{NULL, NULL, 0, "empty"},
	{NULL, gzip_decompress, 1, "gzip-1"},
	{NULL, gzip_decompress, 2, "gzip-2"},
	{NULL, gzip_decompress, 3, "gzip-3"},
	{NULL, gzip_decompress, 4, "gzip-4"},
	{NULL, gzip_decompress, 5, "gzip-5"},
	{NULL, gzip_decompress, 6, "gzip-6"},
	{NULL, gzip_decompress, 7, "gzip-7"},
	{NULL, gzip_decompress, 8, "gzip-8"},
	{NULL, gzip_decompress, 9, "gzip-9"},
	{NULL, zle_decompress, 64, "zle"},
	{NULL, lz4_decompress, 0, "lz4"},
#ifdef HAS_ZSTD_ZFS
	{NULL, zfs_zstd_decompress, ZIO_ZSTD_LEVEL_DEFAULT, "zstd"}
#endif
};

static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	int i;

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}

/*
 * Set the external verifier for a gang block based on <vdev, offset, txg>,
 * a tuple which is guaranteed to be unique for the life of the pool.
 */
static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
{
	const dva_t *dva = BP_IDENTITY(bp);
	uint64_t txg = BP_PHYSICAL_BIRTH(bp);

	ASSERT(BP_IS_GANG(bp));

	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
}

/*
 * Set the external verifier for a label block based on its offset.
 * The vdev is implicit, and the txg is unknowable at pool open time --
 * hence the logic in vdev_uberblock_load() to find the most recent copy.
 */
static void
zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
{
	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
}

/*
 * Calls the template init function of a checksum which supports context
 * templates and installs the template into the spa_t.
 */
static void
zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
{
	zio_checksum_info_t *ci = &zio_checksum_table[checksum];

	if (ci->ci_tmpl_init == NULL)
		return;

	if (spa->spa_cksum_tmpls[checksum] != NULL)
		return;

	if (spa->spa_cksum_tmpls[checksum] == NULL) {
		spa->spa_cksum_tmpls[checksum] =
		    ci->ci_tmpl_init(&spa->spa_cksum_salt);
	}
}
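
/*
 * Illustrative sketch, not compiled: the intended pairing of the template
 * helpers.  Templates are created lazily, the first time a salted checksum
 * is verified for a pool, and torn down with the spa; the function name and
 * the use of ZIO_CHECKSUM_BLAKE3 here are hypothetical.
 */
#if 0
static void
example_template_lifecycle(spa_t *spa)
{
	zio_checksum_template_init(ZIO_CHECKSUM_BLAKE3, spa);
	/* ... verify any number of blake3-checksummed blocks ... */
	zio_checksum_templates_free(spa);
}
#endif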

/*
 * Called by a spa_t that's about to be deallocated. This steps through
 * all of the checksum context templates and deallocates any that were
 * initialized using the algorithm-specific template init function.
 */
static void __unused
zio_checksum_templates_free(spa_t *spa)
{
	for (enum zio_checksum checksum = 0;
	    checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
		if (spa->spa_cksum_tmpls[checksum] != NULL) {
			zio_checksum_info_t *ci = &zio_checksum_table[checksum];

			ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
			spa->spa_cksum_tmpls[checksum] = NULL;
		}
	}
}

static int
zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
{
	uint64_t size;
	unsigned int checksum;
	zio_checksum_info_t *ci;
	void *ctx = NULL;
	zio_cksum_t actual_cksum, expected_cksum, verifier;
	int byteswap;

	checksum = BP_GET_CHECKSUM(bp);
	size = BP_GET_PSIZE(bp);

	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
		return (EINVAL);
	ci = &zio_checksum_table[checksum];
	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
		return (EINVAL);

	if (spa != NULL) {
		zio_checksum_template_init(checksum, __DECONST(spa_t *, spa));
		ctx = spa->spa_cksum_tmpls[checksum];
	}

	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
		zio_eck_t *eck;

		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
		    checksum == ZIO_CHECKSUM_LABEL);

		eck = (zio_eck_t *)((char *)data + size) - 1;

		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
			zio_checksum_gang_verifier(&verifier, bp);
		else if (checksum == ZIO_CHECKSUM_LABEL)
			zio_checksum_label_verifier(&verifier,
			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
		else
			verifier = bp->blk_cksum;

		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

		if (byteswap)
			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

		expected_cksum = eck->zec_cksum;
		eck->zec_cksum = verifier;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
		eck->zec_cksum = expected_cksum;

		if (byteswap)
			byteswap_uint64_array(&expected_cksum,
			    sizeof (zio_cksum_t));
	} else {
		byteswap = BP_SHOULD_BYTESWAP(bp);
		expected_cksum = bp->blk_cksum;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
	}

	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
		/*printf("ZFS: read checksum %s failed\n", ci->ci_name);*/
		return (EIO);
	}

	return (0);
}

static int
zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
    void *dest, uint64_t destsize)
{
	zio_compress_info_t *ci;

	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
		return (EIO);
	}

	ci = &zio_compress_table[cpfunc];
	if (!ci->ci_decompress) {
		printf("ZFS: unsupported compression algorithm %s\n",
		    ci->ci_name);
		return (EIO);
	}

	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
}
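
/*
 * Illustrative sketch, not compiled: decompressing a block's physical bytes
 * into its logical size using the dispatch above.  The BP_GET_* accessors
 * are the usual block-pointer macros; the function and buffer names are
 * hypothetical.
 */
#if 0
static int
example_decompress(const blkptr_t *bp, void *pbuf, void *lbuf)
{
	return (zio_decompress_data(BP_GET_COMPRESS(bp), pbuf,
	    BP_GET_PSIZE(bp), lbuf, BP_GET_LSIZE(bp)));
}
#endif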

static uint64_t
zap_hash(uint64_t salt, const char *name)
{
	const uint8_t *cp;
	uint8_t c;
	uint64_t crc = salt;

	ASSERT(crc != 0);
	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];

	/*
	 * Only use 28 bits, since we need 4 bits in the cookie for the
	 * collision differentiator. We MUST use the high bits, since
	 * those are the ones that we first pay attention to when
	 * choosing the bucket.
	 */
	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);

	return (crc);
}

typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P	0
#define	VDEV_RAIDZ_Q	1
#define	VDEV_RAIDZ_R	2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
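
/*
 * Illustrative sketch, not compiled: VDEV_RAIDZ_64MUL_2() applied to a
 * 64-bit word is equivalent to applying VDEV_RAIDZ_MUL_2() to each of its
 * eight bytes independently; the function name is hypothetical.
 */
#if 0
static void
example_64mul_check(uint64_t x)
{
	uint64_t y = x, mask;
	int i;

	VDEV_RAIDZ_64MUL_2(y, mask);
	for (i = 0; i < 8; i++) {
		uint8_t b = (x >> (8 * i)) & 0xff;

		ASSERT3U((y >> (8 * i)) & 0xff, ==,
		    VDEV_RAIDZ_MUL_2(b) & 0xff);
	}
}
#endif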

/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
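
/*
 * Illustrative sketch, not compiled: general multiplication in the field
 * using the two tables above -- take logs, add them modulo 255, and
 * exponentiate.  This is the identity the reconstruction code below leans
 * on; the function name is hypothetical.
 */
#if 0
static uint8_t
example_gf_mul(uint8_t a, uint8_t b)
{
	int log;

	if (a == 0 || b == 0)
		return (0);

	log = vdev_raidz_log2[a] + vdev_raidz_log2[b];
	if (log >= 255)
		log -= 255;
	return (vdev_raidz_pow2[log]);
}
#endif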

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint8_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p, *src, ccount, i;
	uint64_t pcount __unused;
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccount == pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p = *src;
			}
		} else {
			ASSERT(ccount <= pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p ^= *src;
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}
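
/*
 * Illustrative sketch, not compiled: the P and Q values the loops above
 * accumulate, expressed for a single 64-bit word across n equally sized
 * data columns d[0..n-1].  P is a plain XOR; Q folds in a multiplication
 * by 2 before each new column.  All names here are hypothetical.
 */
#if 0
static void
example_pq_word(const uint64_t *d, int n, uint64_t *pp, uint64_t *qp)
{
	uint64_t p = 0, q = 0, mask;
	int c;

	for (c = 0; c < n; c++) {
		p ^= d[c];
		VDEV_RAIDZ_64MUL_2(q, mask);
		q ^= d[c];
	}
	*pp = p;
	*qp = q;
}
#endif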

static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		panic("invalid RAID-Z configuration");
	}
}

/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity and speedy
 * computation, as well as for linear separability.
 *
 *      __               __               __     __
 *      |   1   ..  1 1 1 |               |  p_0  |
 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
 *      |   :       : : : |   |   :   |   |  d_2  |
 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
 *      |   0   ..  0 0 1 |               | d_n-1 |
 *      ~~               ~~               ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *
 *               __                               __
 *               |  1   1   1   1   1   1   1   1  |
 *               | 128  64  32  16   8   4   2   1 | <-----+-+-- missing disks
 *               |  19 205 116  29  64  16   4   1 |      / /
 *               |  1   0   0   0   0   0   0   0  |     / /
 *               |  0   1   0   0   0   0   0   0  | <--' /
 *  (V|I)  =     |  0   0   1   0   0   0   0   0  | <---'
 *               |  0   0   0   1   0   0   0   0  |
 *               |  0   0   0   0   1   0   0   0  |
 *               |  0   0   0   0   0   1   0   0  |
 *               |  0   0   0   0   0   0   1   0  |
 *               |  0   0   0   0   0   0   0   1  |
 *               ~~                               ~~
 *               __                               __
 *               |  1   1   1   1   1   1   1   1  |
 *               |  19 205 116  29  64  16   4   1 |
 *               |  1   0   0   0   0   0   0   0  |
 *  (V|I)' =     |  0   0   0   1   0   0   0   0  |
 *               |  0   0   0   0   1   0   0   0  |
 *               |  0   0   0   0   0   1   0   0  |
 *               |  0   0   0   0   0   0   1   0  |
 *               |  0   0   0   0   0   0   0   1  |
 *               ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 *      __                                                                 __
 *      |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 *      | 19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0  |
 *      |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *      |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *      |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *      |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *      |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *      |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *      ~~                                                                 ~~
 *      __                                                                 __
 *      |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *      |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 *      | 19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0  |
 *      |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *      |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *      |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *      |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *      |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *      ~~                                                                 ~~
 *      __                                                                 __
 *      |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *      |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 *      |  0 205 116   0   0   0   0   0     0   1  19  29  64  16   4   1  |
 *      |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *      |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *      |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *      |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *      |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *      ~~                                                                 ~~
 *      __                                                                 __
 *      |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *      |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 *      |  0   0 185   0   0   0   0   0   205   1 222 208 141 221 201 204  |
 *      |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *      |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *      |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *      |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *      |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *      ~~                                                                 ~~
 *      __                                                                 __
 *      |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *      |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 *      |  0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209  |
 *      |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *      |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *      |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *      |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *      |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *      ~~                                                                 ~~
 *      __                                                                 __
 *      |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *      |  0   1   0   0   0   0   0   0   167 100   5  41 159 169 217 208  |
 *      |  0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209  |
 *      |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *      |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *      |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *      |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *      |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *      ~~                                                                 ~~
 *                   __                               __
 *                   |  0   0   1   0   0   0   0   0  |
 *                   | 167 100   5  41 159 169 217 208 |
 *                   | 166 100   4  40 158 168 216 209 |
 *  (V|I)'^-1 =      |  0   0   0   1   0   0   0   0  |
 *                   |  0   0   0   0   1   0   0   0  |
 *                   |  0   0   0   0   0   1   0   0  |
 *                   |  0   0   0   0   0   0   1   0  |
 *                   |  0   0   0   0   0   0   0   1  |
 *                   ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */

static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}

static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows is properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}

static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	log = 0;	/* gcc */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = malloc(psize);
	if (p == NULL) {
		printf("Out of memory\n");
		return;
	}

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	free(p);
}

static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;

	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = malloc(psize);
	if (p == NULL) {
		printf("Out of memory\n");
		return (code);
	}

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	free(p);

	return (code);
}

static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}
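
/*
 * Illustrative sketch, not compiled: the geometry arithmetic performed at
 * the top of vdev_raidz_map_alloc() below, worked for one concrete layout
 * -- a 16 KB block on a 5-wide single-parity raidz with 512-byte sectors.
 * The function name is hypothetical.
 */
#if 0
static void
example_map_geometry(void)
{
	uint64_t s = 16384 >> 9;	/* 32 data sectors */
	uint64_t dcols = 5, nparity = 1;
	uint64_t q = s / (dcols - nparity);		/* 8 full rows */
	uint64_t r = s - q * (dcols - nparity);		/* no remainder */
	uint64_t bc = (r == 0 ? 0 : r + nparity);	/* no oversized cols */
	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));

	ASSERT0(bc);
	ASSERT3U(tot, ==, 40);		/* 32 data + 8 parity sectors */
}
#endif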

static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	uint64_t b = offset >> unit_shift;
	uint64_t s = size >> unit_shift;
	uint64_t f = b % dcols;
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	bc = (r == 0 ? 0 : r + nparity);
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	if (q == 0) {
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = malloc(offsetof(raidz_map_t, rm_col[scols]));
	if (rm == NULL)
		return (rm);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rm->rm_col[c].rc_data = malloc(rm->rm_col[c].rc_size);
		if (rm->rm_col[c].rc_data == NULL) {
			c++;
			while (c != 0)
				free(rm->rm_col[--c].rc_data);
			free(rm);
			return (NULL);
		}
	}

	rm->rm_col[c].rc_data = data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}

static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
		free(rm->rm_col[c].rc_data);

	free(rm);
}

static vdev_t *
vdev_child(vdev_t *pvd, uint64_t devidx)
{
	vdev_t *cvd;

	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
		if (cvd->v_id == devidx)
			break;
	}

	return (cvd);
}

/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
    uint64_t size)
{
	return (zio_checksum_verify(spa, bp, data));
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = malloc(rc->rc_size);
		if (orig[c] != NULL) {
			bcopy(rc->rc_data, orig[c], rc->rc_size);
		} else {
			printf("Out of memory\n");
		}
	}

	vdev_raidz_generate_parity(rm);

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (orig[c] == NULL ||
		    bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			rc->rc_error = ECKSUM;
			ret++;
		}
		free(orig[c]);
	}

	return (ret);
}

/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 */
static int
vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
    void *data, off_t offset, uint64_t bytes, int total_errors, int data_errors)
{
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = malloc(rm->rm_col[0].rc_size);
		if (orig[n - 1] == NULL) {
			ret = ENOMEM;
			goto done;
		}

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	n--;
done:
	for (i = n - 1; i >= 0; i--) {
		free(orig[i]);
	}

	return (ret);
}

static int
vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
    off_t offset, size_t bytes)
{
	vdev_t *tvd = vd->v_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, error;
	int unexpected_errors;
	int parity_errors;
	int parity_untried;
	int data_errors;
	int total_errors;
	int n;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	rc = NULL;	/* gcc */
	error = 0;

	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
	    vd->v_nchildren, vd->v_nparity);
	if (rm == NULL)
		return (ENOMEM);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vdev_child(vd, rc->rc_devidx);
		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
#if 0	/* XXX: Too hard for the boot code. */
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
#endif
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
			    rc->rc_offset, rc->rc_size);
			rc->rc_tried = 1;
			rc->rc_skipped = 0;
		}
	}

reconstruct:
	unexpected_errors = 0;
	parity_errors = 0;
	parity_untried = 0;
	data_errors = 0;
	total_errors = 0;

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		int rv;

		if (data_errors == 0) {
			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read
	 * error or a child silently returned bad data. Read every
	 * block so we can try again with as much data and parity as
	 * we can track down. If we've already been through once
	 * before, all children will be marked as tried so we'll
	 * proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	n = 0;
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_tried)
			continue;

		cvd = vdev_child(vd, rc->rc_devidx);
		ASSERT(cvd != NULL);
		rc->rc_error = cvd->v_read(cvd, NULL,
		    rc->rc_data, rc->rc_offset, rc->rc_size);
		if (rc->rc_error == 0)
			n++;
		rc->rc_tried = 1;
		rc->rc_skipped = 0;
	}
	/*
	 * If we managed to read anything more, retry the
	 * reconstruction.
	 */
	if (n > 0)
		goto reconstruct;

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		error = EIO;
	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(vd->v_spa, rm, bp, data, offset, bytes,
	     total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, provided the I/O wasn't speculative.
		 */
		error = ECKSUM;
	}

done:
	vdev_raidz_map_free(rm);

	return (error);
}
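
/*
 * Illustrative sketch, not compiled: in the boot code a raidz top-level
 * vdev's read op is pointed at the function above, so that generic block
 * reads are routed through the map/verify/reconstruct machinery; the
 * function and variable names here are hypothetical.
 */
#if 0
static void
example_wire_raidz(vdev_t *raidz_vd)
{
	raidz_vd->v_read = vdev_raidz_read;
}
#endif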