/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <lz4.h>

/* Lazily initialized CRC64 table used by zap_hash(); see zfs_init_crc(). */
static uint64_t zfs_crc64_table[256];

/* Assertions are compiled away in the boot environment. */
#define	ASSERT3S(x, y, z)	((void)0)
#define	ASSERT3U(x, y, z)	((void)0)
#define	ASSERT3P(x, y, z)	((void)0)
#define	ASSERT0(x)		((void)0)
#define	ASSERT(x)		((void)0)

/*
 * Fatal error: print the message and spin forever.  The boot loader has
 * no panic/reboot machinery to fall back on here.
 */
#define	panic(...)	do {						\
	printf(__VA_ARGS__);						\
	for (;;) ;							\
} while (0)

/*
 * Populate zfs_crc64_table for the CRC64 used by the ZAP hash.
 * Idempotent: entry 128 equals ZFS_CRC64_POLY once the table has been
 * generated, so that entry doubles as an "already initialized" flag.
 */
static void
zfs_init_crc(void)
{
	int i, j;
	uint64_t *ct;

	/*
	 * Calculate the crc64 table (used for the zap hash
	 * function).
	 */
	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
		for (i = 0; i < 256; i++)
			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
	}
}

/*
 * Checksum vector for "off": always produces an all-zero checksum.
 */
static void
zio_checksum_off(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}

/*
 * Signature for checksum functions.
 */
typedef void zio_checksum_t(const void *data, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp);
typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
typedef void zio_checksum_tmpl_free_t(void *ctx_template);

typedef enum zio_checksum_flags {
	/* Strong enough for metadata? */
	ZCHECKSUM_FLAG_METADATA = (1 << 1),
	/* ZIO embedded checksum */
	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
	/* Strong enough for dedup (without verification)? */
	ZCHECKSUM_FLAG_DEDUP = (1 << 3),
	/* Uses salt value */
	ZCHECKSUM_FLAG_SALTED = (1 << 4),
	/* Strong enough for nopwrite? */
	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
} zio_checksum_flags_t;

/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
	/* checksum function for each byteorder (native, byteswapped) */
	zio_checksum_t	*ci_func[2];
	zio_checksum_tmpl_init_t	*ci_tmpl_init;
	zio_checksum_tmpl_free_t	*ci_tmpl_free;
	zio_checksum_flags_t	ci_flags;
	const char	*ci_name;	/* descriptive name */
} zio_checksum_info_t;

#include "blkptr.c"

#include "fletcher.c"
#include "sha256.c"
#include "skein_zfs.c"

#ifdef HAS_ZSTD_ZFS
extern int zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int n);
#endif

/*
 * Checksum vectors, indexed by the on-disk ZIO_CHECKSUM_* enum value.
 * The order of entries must therefore not be changed.
 */
static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
	{{NULL, NULL}, NULL, NULL, 0, "on"},
	{{zio_checksum_off, zio_checksum_off}, NULL, NULL, 0, "off"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
	{{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
	{{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
	    0, "fletcher2"},
	{{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA, "fletcher4"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
	/*
	 * NOTE(review): "zillog2" spelling appears intentional (it matches
	 * the name used by the full ZFS implementation) -- do not "fix" it.
	 */
	{{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zillog2"},
	{{zio_checksum_off, zio_checksum_off}, NULL, NULL,
	    0, "noparity"},
	{{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap},
	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
	{{zio_checksum_skein_native, zio_checksum_skein_byteswap},
	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
	/* no edonr for now */
	{{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"}
};

/*
 * Common signature for all zio compress/decompress functions.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
	zio_compress_func_t	*ci_compress;	/* compression function */
	zio_decompress_func_t	*ci_decompress;	/* decompression function */
	int			ci_level;	/* level parameter */
	const char		*ci_name;	/* algorithm name */
} zio_compress_info_t;

#include "lzjb.c"
#include "zle.c"
#include "gzip.c"

/*
 * Compression vectors, indexed by the on-disk ZIO_COMPRESS_* enum value.
 * The boot code only ever decompresses, so ci_compress is always NULL.
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
	{NULL, NULL, 0, "inherit"},
	{NULL, NULL, 0, "on"},
	{NULL, NULL, 0, "uncompressed"},
	{NULL, lzjb_decompress, 0, "lzjb"},
	{NULL, NULL, 0, "empty"},
	{NULL, gzip_decompress, 1, "gzip-1"},
	{NULL, gzip_decompress, 2, "gzip-2"},
	{NULL, gzip_decompress, 3, "gzip-3"},
	{NULL, gzip_decompress, 4, "gzip-4"},
	{NULL, gzip_decompress, 5, "gzip-5"},
	{NULL, gzip_decompress, 6, "gzip-6"},
	{NULL, gzip_decompress, 7, "gzip-7"},
	{NULL, gzip_decompress, 8, "gzip-8"},
	{NULL, gzip_decompress, 9, "gzip-9"},
	{NULL, zle_decompress, 64, "zle"},
	{NULL, lz4_decompress, 0, "lz4"},
#ifdef HAS_ZSTD_ZFS
	{NULL, zfs_zstd_decompress, ZIO_ZSTD_LEVEL_DEFAULT, "zstd"}
#endif
};

/*
 * Byte-swap an array of 64-bit words in place.  size is in bytes and
 * must be a multiple of 8.
 */
static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	int i;

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}

/*
 * Set the external verifier for a gang block based on <vdev, offset, txg>,
 * a tuple which is guaranteed to be unique for the life of the pool.
 */
static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
{
	const dva_t *dva = BP_IDENTITY(bp);
	uint64_t txg = BP_PHYSICAL_BIRTH(bp);

	ASSERT(BP_IS_GANG(bp));

	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
}

/*
 * Set the external verifier for a label block based on its offset.
 * The vdev is implicit, and the txg is unknowable at pool open time --
 * hence the logic in vdev_uberblock_load() to find the most recent copy.
225 */ 226 static void 227 zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset) 228 { 229 ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0); 230 } 231 232 /* 233 * Calls the template init function of a checksum which supports context 234 * templates and installs the template into the spa_t. 235 */ 236 static void 237 zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa) 238 { 239 zio_checksum_info_t *ci = &zio_checksum_table[checksum]; 240 241 if (ci->ci_tmpl_init == NULL) 242 return; 243 244 if (spa->spa_cksum_tmpls[checksum] != NULL) 245 return; 246 247 if (spa->spa_cksum_tmpls[checksum] == NULL) { 248 spa->spa_cksum_tmpls[checksum] = 249 ci->ci_tmpl_init(&spa->spa_cksum_salt); 250 } 251 } 252 253 /* 254 * Called by a spa_t that's about to be deallocated. This steps through 255 * all of the checksum context templates and deallocates any that were 256 * initialized using the algorithm-specific template init function. 257 */ 258 static void __unused 259 zio_checksum_templates_free(spa_t *spa) 260 { 261 for (enum zio_checksum checksum = 0; 262 checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) { 263 if (spa->spa_cksum_tmpls[checksum] != NULL) { 264 zio_checksum_info_t *ci = &zio_checksum_table[checksum]; 265 266 ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]); 267 spa->spa_cksum_tmpls[checksum] = NULL; 268 } 269 } 270 } 271 272 static int 273 zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data) 274 { 275 uint64_t size; 276 unsigned int checksum; 277 zio_checksum_info_t *ci; 278 void *ctx = NULL; 279 zio_cksum_t actual_cksum, expected_cksum, verifier; 280 int byteswap; 281 282 checksum = BP_GET_CHECKSUM(bp); 283 size = BP_GET_PSIZE(bp); 284 285 if (checksum >= ZIO_CHECKSUM_FUNCTIONS) 286 return (EINVAL); 287 ci = &zio_checksum_table[checksum]; 288 if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL) 289 return (EINVAL); 290 291 if (spa != NULL) { 292 zio_checksum_template_init(checksum, __DECONST(spa_t *,spa)); 293 ctx = 
spa->spa_cksum_tmpls[checksum]; 294 } 295 296 if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { 297 zio_eck_t *eck; 298 299 ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER || 300 checksum == ZIO_CHECKSUM_LABEL); 301 302 eck = (zio_eck_t *)((char *)data + size) - 1; 303 304 if (checksum == ZIO_CHECKSUM_GANG_HEADER) 305 zio_checksum_gang_verifier(&verifier, bp); 306 else if (checksum == ZIO_CHECKSUM_LABEL) 307 zio_checksum_label_verifier(&verifier, 308 DVA_GET_OFFSET(BP_IDENTITY(bp))); 309 else 310 verifier = bp->blk_cksum; 311 312 byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC)); 313 314 if (byteswap) 315 byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); 316 317 expected_cksum = eck->zec_cksum; 318 eck->zec_cksum = verifier; 319 ci->ci_func[byteswap](data, size, ctx, &actual_cksum); 320 eck->zec_cksum = expected_cksum; 321 322 if (byteswap) 323 byteswap_uint64_array(&expected_cksum, 324 sizeof (zio_cksum_t)); 325 } else { 326 byteswap = BP_SHOULD_BYTESWAP(bp); 327 expected_cksum = bp->blk_cksum; 328 ci->ci_func[byteswap](data, size, ctx, &actual_cksum); 329 } 330 331 if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) { 332 /*printf("ZFS: read checksum %s failed\n", ci->ci_name);*/ 333 return (EIO); 334 } 335 336 return (0); 337 } 338 339 static int 340 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize, 341 void *dest, uint64_t destsize) 342 { 343 zio_compress_info_t *ci; 344 345 if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) { 346 printf("ZFS: unsupported compression algorithm %u\n", cpfunc); 347 return (EIO); 348 } 349 350 ci = &zio_compress_table[cpfunc]; 351 if (!ci->ci_decompress) { 352 printf("ZFS: unsupported compression algorithm %s\n", 353 ci->ci_name); 354 return (EIO); 355 } 356 357 return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level)); 358 } 359 360 static uint64_t 361 zap_hash(uint64_t salt, const char *name) 362 { 363 const uint8_t *cp; 364 uint8_t c; 365 uint64_t crc = salt; 366 367 ASSERT(crc != 0); 368 
	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];

	/*
	 * Only use 28 bits, since we need 4 bits in the cookie for the
	 * collision differentiator.  We MUST use the high bits, since
	 * those are the ones that we first pay attention to when
	 * choosing the bucket.
	 */
	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);

	return (crc);
}

/* One device ("column") of a RAID-Z stripe. */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

/*
 * Layout of one logical RAID-Z I/O across its columns.  Columns
 * [0, rm_firstdatacol) hold parity (P, then Q, then R); the rest hold data.
 */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/* Multiply by 2 in GF(2^8): shift left, conditionally XOR the polynomial. */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}

/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

/*
 * Multiply a given number by 2 raised to the given power.
 * Computed as pow2[log2[a] + exp] in GF(2^8), with the exponent reduced
 * mod 255 (the multiplicative group order).
 */
static uint8_t
vdev_raidz_exp2(uint8_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}

/*
 * Generate single parity: P is the XOR of all data columns.
 */
static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p, *src, ccount, i;
	uint64_t pcount __unused;
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First data column initializes P. */
			ASSERT(ccount == pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p = *src;
			}
		} else {
			ASSERT(ccount <= pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p ^= *src;
			}
		}
	}
}

/*
 * Generate P and Q parity.  P is plain XOR; Q is built Horner-style by
 * multiplying the running value by 2 in GF(2^8) before XORing in each
 * successive data column.
 */
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}

/*
 * Generate P, Q and R parity.  Like PQ above, with R using the
 * multiplier 4 (= 2*2) in GF(2^8).
 */
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		panic("invalid RAID-Z configuration");
	}
}

/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
 * computation as well as linear separability.
 *
 *      __         __                   __     __
 *      |   1   ..  1 1 1  |            |  p_0  |
 *      | 2^n-1 ..  4 2 1  |  __     __ |   :   |
 *      | 4^n-1 .. 16 4 1  |  |  D_0  | | p_m-1 |
 *      |   1   .. 
0 0 0 | | D_1 | | d_0 | 717 * | 0 .. 0 0 0 | x | D_2 | = | d_1 | 718 * | : : : : | | : | | d_2 | 719 * | 0 .. 1 0 0 | | D_n-1 | | : | 720 * | 0 .. 0 1 0 | ~~ ~~ | : | 721 * | 0 .. 0 0 1 | | d_n-1 | 722 * ~~ ~~ ~~ ~~ 723 * 724 * Note that I, V, d, and p are known. To compute D, we must invert the 725 * matrix and use the known data and parity values to reconstruct the unknown 726 * data values. We begin by removing the rows in V|I and d|p that correspond 727 * to failed or missing columns; we then make V|I square (n x n) and d|p 728 * sized n by removing rows corresponding to unused parity from the bottom up 729 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' 730 * using Gauss-Jordan elimination. In the example below we use m=3 parity 731 * columns, n=8 data columns, with errors in d_1, d_2, and p_1: 732 * __ __ 733 * | 1 1 1 1 1 1 1 1 | 734 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks 735 * | 19 205 116 29 64 16 4 1 | / / 736 * | 1 0 0 0 0 0 0 0 | / / 737 * | 0 1 0 0 0 0 0 0 | <--' / 738 * (V|I) = | 0 0 1 0 0 0 0 0 | <---' 739 * | 0 0 0 1 0 0 0 0 | 740 * | 0 0 0 0 1 0 0 0 | 741 * | 0 0 0 0 0 1 0 0 | 742 * | 0 0 0 0 0 0 1 0 | 743 * | 0 0 0 0 0 0 0 1 | 744 * ~~ ~~ 745 * __ __ 746 * | 1 1 1 1 1 1 1 1 | 747 * | 128 64 32 16 8 4 2 1 | 748 * | 19 205 116 29 64 16 4 1 | 749 * | 1 0 0 0 0 0 0 0 | 750 * | 0 1 0 0 0 0 0 0 | 751 * (V|I)' = | 0 0 1 0 0 0 0 0 | 752 * | 0 0 0 1 0 0 0 0 | 753 * | 0 0 0 0 1 0 0 0 | 754 * | 0 0 0 0 0 1 0 0 | 755 * | 0 0 0 0 0 0 1 0 | 756 * | 0 0 0 0 0 0 0 1 | 757 * ~~ ~~ 758 * 759 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We 760 * have carefully chosen the seed values 1, 2, and 4 to ensure that this 761 * matrix is not singular. 
762 * __ __ 763 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 764 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 765 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 766 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 767 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 768 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 769 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 770 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 771 * ~~ ~~ 772 * __ __ 773 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 774 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | 775 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | 776 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 777 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 778 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 779 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 780 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 781 * ~~ ~~ 782 * __ __ 783 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 784 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 785 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | 786 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 787 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 788 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 789 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 790 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 791 * ~~ ~~ 792 * __ __ 793 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 794 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 795 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | 796 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 797 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 798 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 799 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 800 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 801 * ~~ ~~ 802 * __ __ 803 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 804 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | 805 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | 806 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | 807 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | 808 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | 809 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | 810 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | 811 * ~~ ~~ 812 * __ __ 813 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | 814 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | 815 * | 0 0 1 0 0 0 0 0 166 100 4 40 
158  168  216  209 |
 *                  |  0    0    0    1    0    0    0    0  |
 *                  |  0    0    0    0    1    0    0    0  |
 *                  |  0    0    0    0    0    1    0    0  |
 *                  |  0    0    0    0    0    0    1    0  |
 *                  |  0    0    0    0    0    0    0    1  |
 *                  ~~                                      ~~
 *                  __                                      __
 *                  |  0    0    1    0    0    0    0    0  |
 *                  | 167  100    5   41  159  169  217  208 |
 *                  | 166  100    4   40  158  168  216  209 |
 *   (V|I)'^-1 =    |  0    0    0    1    0    0    0    0  |
 *                  |  0    0    0    0    1    0    0    0  |
 *                  |  0    0    0    0    0    1    0    0  |
 *                  |  0    0    0    0    0    0    1    0  |
 *                  |  0    0    0    0    0    0    0    1  |
 *                  ~~                                      ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */

/*
 * Build the rows of the Vandermonde-like dispersal matrix for the parity
 * columns listed in map (0 = P, 1 = Q, 2 = R).  Row i, column j is
 * 2^(map[i] * (n - 1 - j)) in GF(2^8), generated via the pow2 table with
 * the exponent kept reduced mod 255.
 */
static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}

/*
 * Invert the (already trivially reduced) rows of interest in place using
 * Gauss-Jordan elimination over GF(2^8), accumulating the inverse in
 * invrows.  rows is destroyed (reduced to identity rows) in the process.
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		/* In GF(2^8), 1/x = 2^(255 - log2(x)). */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}

/*
 * Recompute the missing data columns as invrows x (d|p)', byte by byte,
 * using log/exp tables so each GF(2^8) multiply is two lookups and an add.
 * Columns shorter than the current offset are simply skipped.
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	log = 0;	/* gcc */
	/* One scratch log-row per missing column, carved out of one buffer. */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = malloc(psize);
	if (p == NULL) {
		printf("Out of memory\n");
		return;
	}

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	/* Precompute logs of the inverse-matrix entries. */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					/* val = *src * invrows[cc][i] */
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/* First used column seeds dst; rest XOR in. */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	free(p);
}

/*
 * Reconstruct arbitrary combinations of missing columns listed in tgts
 * (ntgts entries) via matrix inversion.  Returns a bitmask of the parity
 * columns that were used.
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;


	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
1090 */ 1091 if (c == tgts[tt]) { 1092 tt++; 1093 continue; 1094 } 1095 1096 code |= 1 << c; 1097 1098 parity_map[i] = c; 1099 i++; 1100 } 1101 1102 ASSERT(code != 0); 1103 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); 1104 1105 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * 1106 nmissing_rows * n + sizeof (used[0]) * n; 1107 p = malloc(psize); 1108 if (p == NULL) { 1109 printf("Out of memory\n"); 1110 return (code); 1111 } 1112 1113 for (pp = p, i = 0; i < nmissing_rows; i++) { 1114 rows[i] = pp; 1115 pp += n; 1116 invrows[i] = pp; 1117 pp += n; 1118 } 1119 used = pp; 1120 1121 for (i = 0; i < nmissing_rows; i++) { 1122 used[i] = parity_map[i]; 1123 } 1124 1125 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { 1126 if (tt < nmissing_rows && 1127 c == missing_rows[tt] + rm->rm_firstdatacol) { 1128 tt++; 1129 continue; 1130 } 1131 1132 ASSERT3S(i, <, n); 1133 used[i] = c; 1134 i++; 1135 } 1136 1137 /* 1138 * Initialize the interesting rows of the matrix. 1139 */ 1140 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); 1141 1142 /* 1143 * Invert the matrix. 1144 */ 1145 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, 1146 invrows, used); 1147 1148 /* 1149 * Reconstruct the missing data using the generated matrix. 1150 */ 1151 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, 1152 invrows, used); 1153 1154 free(p); 1155 1156 return (code); 1157 } 1158 1159 static int 1160 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) 1161 { 1162 int tgts[VDEV_RAIDZ_MAXPARITY]; 1163 int ntgts; 1164 int i, c; 1165 int code; 1166 int nbadparity, nbaddata; 1167 1168 /* 1169 * The tgts list must already be sorted. 
1170 */ 1171 for (i = 1; i < nt; i++) { 1172 ASSERT(t[i] > t[i - 1]); 1173 } 1174 1175 nbadparity = rm->rm_firstdatacol; 1176 nbaddata = rm->rm_cols - nbadparity; 1177 ntgts = 0; 1178 for (i = 0, c = 0; c < rm->rm_cols; c++) { 1179 if (i < nt && c == t[i]) { 1180 tgts[ntgts++] = c; 1181 i++; 1182 } else if (rm->rm_col[c].rc_error != 0) { 1183 tgts[ntgts++] = c; 1184 } else if (c >= rm->rm_firstdatacol) { 1185 nbaddata--; 1186 } else { 1187 nbadparity--; 1188 } 1189 } 1190 1191 ASSERT(ntgts >= nt); 1192 ASSERT(nbaddata >= 0); 1193 ASSERT(nbaddata + nbadparity == ntgts); 1194 1195 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); 1196 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); 1197 ASSERT(code > 0); 1198 return (code); 1199 } 1200 1201 static raidz_map_t * 1202 vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift, 1203 uint64_t dcols, uint64_t nparity) 1204 { 1205 raidz_map_t *rm; 1206 uint64_t b = offset >> unit_shift; 1207 uint64_t s = size >> unit_shift; 1208 uint64_t f = b % dcols; 1209 uint64_t o = (b / dcols) << unit_shift; 1210 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; 1211 1212 q = s / (dcols - nparity); 1213 r = s - q * (dcols - nparity); 1214 bc = (r == 0 ? 0 : r + nparity); 1215 tot = s + nparity * (q + (r == 0 ? 
0 : 1)); 1216 1217 if (q == 0) { 1218 acols = bc; 1219 scols = MIN(dcols, roundup(bc, nparity + 1)); 1220 } else { 1221 acols = dcols; 1222 scols = dcols; 1223 } 1224 1225 ASSERT3U(acols, <=, scols); 1226 1227 rm = malloc(offsetof(raidz_map_t, rm_col[scols])); 1228 if (rm == NULL) 1229 return (rm); 1230 1231 rm->rm_cols = acols; 1232 rm->rm_scols = scols; 1233 rm->rm_bigcols = bc; 1234 rm->rm_skipstart = bc; 1235 rm->rm_missingdata = 0; 1236 rm->rm_missingparity = 0; 1237 rm->rm_firstdatacol = nparity; 1238 rm->rm_reports = 0; 1239 rm->rm_freed = 0; 1240 rm->rm_ecksuminjected = 0; 1241 1242 asize = 0; 1243 1244 for (c = 0; c < scols; c++) { 1245 col = f + c; 1246 coff = o; 1247 if (col >= dcols) { 1248 col -= dcols; 1249 coff += 1ULL << unit_shift; 1250 } 1251 rm->rm_col[c].rc_devidx = col; 1252 rm->rm_col[c].rc_offset = coff; 1253 rm->rm_col[c].rc_data = NULL; 1254 rm->rm_col[c].rc_error = 0; 1255 rm->rm_col[c].rc_tried = 0; 1256 rm->rm_col[c].rc_skipped = 0; 1257 1258 if (c >= acols) 1259 rm->rm_col[c].rc_size = 0; 1260 else if (c < bc) 1261 rm->rm_col[c].rc_size = (q + 1) << unit_shift; 1262 else 1263 rm->rm_col[c].rc_size = q << unit_shift; 1264 1265 asize += rm->rm_col[c].rc_size; 1266 } 1267 1268 ASSERT3U(asize, ==, tot << unit_shift); 1269 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); 1270 rm->rm_nskip = roundup(tot, nparity + 1) - tot; 1271 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); 1272 ASSERT3U(rm->rm_nskip, <=, nparity); 1273 1274 for (c = 0; c < rm->rm_firstdatacol; c++) { 1275 rm->rm_col[c].rc_data = malloc(rm->rm_col[c].rc_size); 1276 if (rm->rm_col[c].rc_data == NULL) { 1277 c++; 1278 while (c != 0) 1279 free(rm->rm_col[--c].rc_data); 1280 free(rm); 1281 return (NULL); 1282 } 1283 } 1284 1285 rm->rm_col[c].rc_data = data; 1286 1287 for (c = c + 1; c < acols; c++) 1288 rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + 1289 rm->rm_col[c - 1].rc_size; 1290 1291 /* 1292 * If all data stored spans all columns, 
there's a danger that parity 1293 * will always be on the same device and, since parity isn't read 1294 * during normal operation, that that device's I/O bandwidth won't be 1295 * used effectively. We therefore switch the parity every 1MB. 1296 * 1297 * ... at least that was, ostensibly, the theory. As a practical 1298 * matter unless we juggle the parity between all devices evenly, we 1299 * won't see any benefit. Further, occasional writes that aren't a 1300 * multiple of the LCM of the number of children and the minimum 1301 * stripe width are sufficient to avoid pessimal behavior. 1302 * Unfortunately, this decision created an implicit on-disk format 1303 * requirement that we need to support for all eternity, but only 1304 * for single-parity RAID-Z. 1305 * 1306 * If we intend to skip a sector in the zeroth column for padding 1307 * we must make sure to note this swap. We will never intend to 1308 * skip the first column since at least one data and one parity 1309 * column must appear in each row. 
1310 */ 1311 ASSERT(rm->rm_cols >= 2); 1312 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); 1313 1314 if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) { 1315 devidx = rm->rm_col[0].rc_devidx; 1316 o = rm->rm_col[0].rc_offset; 1317 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; 1318 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; 1319 rm->rm_col[1].rc_devidx = devidx; 1320 rm->rm_col[1].rc_offset = o; 1321 1322 if (rm->rm_skipstart == 0) 1323 rm->rm_skipstart = 1; 1324 } 1325 1326 return (rm); 1327 } 1328 1329 static void 1330 vdev_raidz_map_free(raidz_map_t *rm) 1331 { 1332 int c; 1333 1334 for (c = rm->rm_firstdatacol - 1; c >= 0; c--) 1335 free(rm->rm_col[c].rc_data); 1336 1337 free(rm); 1338 } 1339 1340 static vdev_t * 1341 vdev_child(vdev_t *pvd, uint64_t devidx) 1342 { 1343 vdev_t *cvd; 1344 1345 STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) { 1346 if (cvd->v_id == devidx) 1347 break; 1348 } 1349 1350 return (cvd); 1351 } 1352 1353 /* 1354 * We keep track of whether or not there were any injected errors, so that 1355 * any ereports we generate can note it. 1356 */ 1357 static int 1358 raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data, 1359 uint64_t size) 1360 { 1361 return (zio_checksum_verify(spa, bp, data)); 1362 } 1363 1364 /* 1365 * Generate the parity from the data columns. If we tried and were able to 1366 * read the parity without error, verify that the generated parity matches the 1367 * data we read. If it doesn't, we fire off a checksum error. Return the 1368 * number such failures. 
 */
static int
raidz_parity_verify(raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	/* Save a copy of each parity column that was actually read. */
	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = malloc(rc->rc_size);
		if (orig[c] != NULL) {
			bcopy(rc->rc_data, orig[c], rc->rc_size);
		} else {
			printf("Out of memory\n");
		}
	}

	/* Regenerate parity into the rc_data buffers from the data columns. */
	vdev_raidz_generate_parity(rm);

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		/* A failed save buffer is counted as a mismatch, too. */
		if (orig[c] == NULL ||
		    bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			rc->rc_error = ECKSUM;
			ret++;
		}
		free(orig[c]);
	}

	return (ret);
}

/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 *
 * Returns the parity-usage code on success, 0 if every combination failed.
 * NOTE(review): on malloc failure this returns ENOMEM, which the caller
 * cannot distinguish from a success code -- confirm against the caller's
 * handling in vdev_raidz_read().
 */
static int
vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
    void *data, off_t offset, uint64_t bytes, int total_errors, int data_errors)
{
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		/* Column 0 is at least as large as any other column. */
		orig[n - 1] = malloc(rm->rm_col[0].rc_size);
		if (orig[n - 1] == NULL) {
			ret = ENOMEM;
			goto done;
		}

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
				/* Mark the reconstructed columns as bad. */
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position..
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	/* Loop exhausted: n overshot by one; adjust before freeing. */
	n--;
done:
	for (i = n - 1; i >= 0; i--) {
		free(orig[i]);
	}

	return (ret);
}

/*
 * Read a block through a RAID-Z vdev: map it onto the children, read
 * the data (and, if anything is already known-missing, the parity),
 * then try progressively more expensive recovery -- straight checksum,
 * reconstruction from the errors seen, a full re-read of every column,
 * and finally combinatorial reconstruction.  Returns 0 on success or an
 * errno (ENOMEM, EIO, ECKSUM).
 */
static int
vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
    off_t offset, size_t bytes)
{
	vdev_t *tvd = vd->v_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, error;
	int unexpected_errors;
	int parity_errors;
	int parity_untried;
	int data_errors;
	int total_errors;
	int n;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	rc = NULL;	/* gcc */
	error = 0;

	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
	    vd->v_nchildren, vd->v_nparity);
	if (rm == NULL)
		return (ENOMEM);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vdev_child(vd, rc->rc_devidx);
		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
#if 0		/* XXX: Too hard for the boot code. */
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
#endif
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
			    rc->rc_offset, rc->rc_size);
			rc->rc_tried = 1;
			rc->rc_skipped = 0;
		}
	}

reconstruct:
	unexpected_errors = 0;
	parity_errors = 0;
	parity_untried = 0;
	data_errors = 0;
	total_errors = 0;

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	/* Tally errors seen so far, split by parity vs. data columns. */
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		int rv;

		if (data_errors == 0) {
			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read
	 * error or a child silently returned bad data. Read every
	 * block so we can try again with as much data and parity as
	 * we can track down. If we've already been through once
	 * before, all children will be marked as tried so we'll
	 * proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	n = 0;
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_tried)
			continue;

		cvd = vdev_child(vd, rc->rc_devidx);
		ASSERT(cvd != NULL);
		rc->rc_error = cvd->v_read(cvd, NULL,
		    rc->rc_data, rc->rc_offset, rc->rc_size);
		if (rc->rc_error == 0)
			n++;
		rc->rc_tried = 1;
		rc->rc_skipped = 0;
	}
	/*
	 * If we managed to read anything more, retry the
	 * reconstruction.
	 */
	if (n > 0)
		goto reconstruct;

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		error = EIO;
	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(vd->v_spa, rm, bp, data, offset, bytes,
	     total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_first_datacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		error = ECKSUM;
	}

done:
	vdev_raidz_map_free(rm);

	return (error);
}