/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/cdefs.h>
#include <lz4.h>

static uint64_t zfs_crc64_table[256];

#ifndef ASSERT3S	/* Proxy for all the assert defines */
#define	ASSERT3S(x, y, z)	((void)0)
#define	ASSERT3U(x, y, z)	((void)0)
#define	ASSERT3P(x, y, z)	((void)0)
#define	ASSERT0(x)		((void)0)
#define	ASSERT(x)		((void)0)
#endif

#define	panic(...)	do {						\
	printf(__VA_ARGS__);						\
	for (;;) ;							\
} while (0)

static void
zfs_init_crc(void)
{
	int i, j;
	uint64_t *ct;

	/*
	 * Calculate the crc64 table (used for the zap hash
	 * function).
	 */
	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
		for (i = 0; i < 256; i++)
			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
	}
}
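
/*
 * Illustrative sketch only (zfs_crc64_example is a hypothetical helper,
 * not part of the original loader code): it shows how the table built
 * above is consumed to checksum an arbitrary byte buffer, one byte per
 * step, in the same reflected form used by zap_hash() below.  It assumes
 * zfs_init_crc() has already populated the table; the -1 seed is an
 * assumption, as any nonzero seed works.
 */
static uint64_t __unused
zfs_crc64_example(const void *buf, size_t len)
{
	const uint8_t *cp = buf;
	uint64_t crc = -1ULL;

	while (len-- != 0)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ *cp++) & 0xFF];

	return (crc);
}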

static void
zio_checksum_off(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}

/*
 * Signature for checksum functions.
 */
typedef void zio_checksum_t(const void *data, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp);
typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
typedef void zio_checksum_tmpl_free_t(void *ctx_template);

typedef enum zio_checksum_flags {
	/* Strong enough for metadata? */
	ZCHECKSUM_FLAG_METADATA = (1 << 1),
	/* ZIO embedded checksum */
	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
	/* Strong enough for dedup (without verification)? */
	ZCHECKSUM_FLAG_DEDUP = (1 << 3),
	/* Uses salt value */
	ZCHECKSUM_FLAG_SALTED = (1 << 4),
	/* Strong enough for nopwrite? */
	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
} zio_checksum_flags_t;

/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
	/* checksum function for each byteorder */
	zio_checksum_t	*ci_func[2];
	zio_checksum_tmpl_init_t *ci_tmpl_init;
	zio_checksum_tmpl_free_t *ci_tmpl_free;
	zio_checksum_flags_t ci_flags;
	const char	*ci_name;	/* descriptive name */
} zio_checksum_info_t;

#include "blkptr.c"

#include "fletcher.c"
#include "blake3_zfs.c"
#include "sha256.c"
#include "skein_zfs.c"

#ifdef HAS_ZSTD_ZFS
extern int zfs_zstd_decompress_buf(void *s_start, void *d_start, size_t s_len,
    size_t d_len, int n);
#endif

static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
	{{NULL, NULL}, NULL, NULL, 0, "on"},
	{{zio_checksum_off, zio_checksum_off}, NULL, NULL, 0, "off"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
	{{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
	{{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
	    0, "fletcher2"},
	{{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA, "fletcher4"},
	{{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
	{{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
	{{zio_checksum_off, zio_checksum_off}, NULL, NULL,
	    0, "noparity"},
	{{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap},
	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
	{{zio_checksum_skein_native, zio_checksum_skein_byteswap},
	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
	/* no edonr for now */
	{{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
	{{zio_checksum_blake3_native, zio_checksum_blake3_byteswap},
	    zio_checksum_blake3_tmpl_init, zio_checksum_blake3_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "blake3"}
};

/*
 * Common signature for all zio compress/decompress functions.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
	zio_compress_func_t	*ci_compress;	/* compression function */
	zio_decompress_func_t	*ci_decompress;	/* decompression function */
	int			ci_level;	/* level parameter */
	const char		*ci_name;	/* algorithm name */
} zio_compress_info_t;

#include "lzjb.c"
#include "zle.c"
#include "gzip.c"

/*
 * Compression vectors.
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
	{NULL, NULL, 0, "inherit"},
	{NULL, NULL, 0, "on"},
	{NULL, NULL, 0, "uncompressed"},
	{NULL, lzjb_decompress, 0, "lzjb"},
	{NULL, NULL, 0, "empty"},
	{NULL, gzip_decompress, 1, "gzip-1"},
	{NULL, gzip_decompress, 2, "gzip-2"},
	{NULL, gzip_decompress, 3, "gzip-3"},
	{NULL, gzip_decompress, 4, "gzip-4"},
	{NULL, gzip_decompress, 5, "gzip-5"},
	{NULL, gzip_decompress, 6, "gzip-6"},
	{NULL, gzip_decompress, 7, "gzip-7"},
	{NULL, gzip_decompress, 8, "gzip-8"},
	{NULL, gzip_decompress, 9, "gzip-9"},
	{NULL, zle_decompress, 64, "zle"},
	{NULL, lz4_decompress, 0, "lz4"},
#ifdef HAS_ZSTD_ZFS
	{NULL, zfs_zstd_decompress_buf, ZIO_ZSTD_LEVEL_DEFAULT, "zstd"}
#endif
};

static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	int i;

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}

/*
 * Set the external verifier for a gang block based on <vdev, offset, txg>,
 * a tuple which is guaranteed to be unique for the life of the pool.
 */
static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
{
	const dva_t *dva = BP_IDENTITY(bp);
	uint64_t txg = BP_PHYSICAL_BIRTH(bp);

	ASSERT(BP_IS_GANG(bp));

	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
}

/*
 * Set the external verifier for a label block based on its offset.
 * The vdev is implicit, and the txg is unknowable at pool open time --
 * hence the logic in vdev_uberblock_load() to find the most recent copy.
 */
static void
zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
{
	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
}

/*
 * Calls the template init function of a checksum which supports context
 * templates and installs the template into the spa_t.
 */
static void
zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
{
	zio_checksum_info_t *ci = &zio_checksum_table[checksum];

	if (ci->ci_tmpl_init == NULL)
		return;

	if (spa->spa_cksum_tmpls[checksum] == NULL) {
		spa->spa_cksum_tmpls[checksum] =
		    ci->ci_tmpl_init(&spa->spa_cksum_salt);
	}
}

/*
 * Called by a spa_t that's about to be deallocated. This steps through
 * all of the checksum context templates and deallocates any that were
 * initialized using the algorithm-specific template init function.
 */
static void __unused
zio_checksum_templates_free(spa_t *spa)
{
	for (enum zio_checksum checksum = 0;
	    checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
		if (spa->spa_cksum_tmpls[checksum] != NULL) {
			zio_checksum_info_t *ci = &zio_checksum_table[checksum];

			ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
			spa->spa_cksum_tmpls[checksum] = NULL;
		}
	}
}

static int
zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
{
	uint64_t size;
	unsigned int checksum;
	zio_checksum_info_t *ci;
	void *ctx = NULL;
	zio_cksum_t actual_cksum, expected_cksum, verifier;
	int byteswap;

	checksum = BP_GET_CHECKSUM(bp);
	size = BP_GET_PSIZE(bp);

	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
		return (EINVAL);
	ci = &zio_checksum_table[checksum];
	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
		return (EINVAL);

	if (spa != NULL) {
		zio_checksum_template_init(checksum, __DECONST(spa_t *, spa));
		ctx = spa->spa_cksum_tmpls[checksum];
	}

	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
		zio_eck_t *eck;

		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
		    checksum == ZIO_CHECKSUM_LABEL);

		eck = (zio_eck_t *)((char *)data + size) - 1;

		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
			zio_checksum_gang_verifier(&verifier, bp);
		else if (checksum == ZIO_CHECKSUM_LABEL)
			zio_checksum_label_verifier(&verifier,
			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
		else
			verifier = bp->blk_cksum;

		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

		if (byteswap)
			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

		expected_cksum = eck->zec_cksum;
		eck->zec_cksum = verifier;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
		eck->zec_cksum = expected_cksum;

		if (byteswap)
			byteswap_uint64_array(&expected_cksum,
			    sizeof (zio_cksum_t));
	} else {
		byteswap = BP_SHOULD_BYTESWAP(bp);
		expected_cksum = bp->blk_cksum;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
	}

	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
		/*printf("ZFS: read checksum %s failed\n", ci->ci_name);*/
		return (EIO);
	}

	return (0);
}

static int
zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
    void *dest, uint64_t destsize)
{
	zio_compress_info_t *ci;

	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
		return (EIO);
	}

	ci = &zio_compress_table[cpfunc];
	if (!ci->ci_decompress) {
		printf("ZFS: unsupported compression algorithm %s\n",
		    ci->ci_name);
		return (EIO);
	}

	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
}
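
/*
 * Illustrative call only (a sketch; pbuf and lbuf are hypothetical
 * buffers sized from the block pointer, not names used elsewhere in
 * this file): decompressing the physical bytes of a block pointer "bp"
 * back to its logical size would look like
 *
 *	zio_decompress_data(BP_GET_COMPRESS(bp), pbuf, BP_GET_PSIZE(bp),
 *	    lbuf, BP_GET_LSIZE(bp));
 */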

static uint64_t
zap_hash(uint64_t salt, const char *name)
{
	const uint8_t *cp;
	uint8_t c;
	uint64_t crc = salt;

	ASSERT(crc != 0);
	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];

	/*
	 * Only use 28 bits, since we need 4 bits in the cookie for the
	 * collision differentiator. We MUST use the high bits, since
	 * those are the ones that we first pay attention to when
	 * choosing the bucket.
	 */
	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);

	return (crc);
}
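
/*
 * Example (illustrative): with ZAP_HASHBITS == 28, the mask above keeps
 * bits 63..36 of the CRC, so a caller choosing among 2^k hash buckets
 * would take the top k bits, e.g. bucket = zap_hash(salt, name) >> (64 - k).
 */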

typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t rm_freed;		/* map no longer has referencing ZIO */
	uint8_t rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P	0
#define	VDEV_RAIDZ_Q	1
#define	VDEV_RAIDZ_R	2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
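
/*
 * Worked example (illustrative) for VDEV_RAIDZ_64MUL_2: with x = 0x8001,
 * mask becomes 0x8000 and then 0xff00, so x maps to 0x0002 ^ 0x1d00 =
 * 0x1d02.  That matches VDEV_RAIDZ_MUL_2 applied byte by byte: 0x80
 * shifts its top bit out and picks up the 0x1d reduction, while 0x01
 * simply shifts to 0x02.
 */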

/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint8_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}
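
/*
 * Example (illustrative): vdev_raidz_exp2(a, 1) multiplies a by 2 and so
 * agrees with VDEV_RAIDZ_MUL_2 above; for a = 0x80, vdev_raidz_log2[0x80]
 * is 7, 7 + 1 = 8, and vdev_raidz_pow2[8] == 0x1d.
 */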

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p, *src, ccount, i;
	uint64_t pcount __unused;
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccount == pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p = *src;
			}
		} else {
			ASSERT(ccount <= pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p ^= *src;
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		panic("invalid RAID-Z configuration");
	}
}
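
/*
 * Note that rm_firstdatacol doubles as the parity-column count, so e.g.
 * a raidz2 map (rm_firstdatacol == 2) dispatches to the P + Q generator
 * above.
 */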

/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity, for speed of
 * computation, and for linear separability.
 *
 *            __                 __                  __     __
 *            | 1     ..  1  1  1 |                  |  p_0  |
 *            | 2^n-1 ..  4  2  1 |      __     __   |   :   |
 *            | 4^n-1 .. 16  4  1 |      |  D_0  |   | p_m-1 |
 *            | 1     ..  0  0  0 |      |  D_1  |   |  d_0  |
 *            | 0     ..  0  0  0 |  x   |  D_2  | = |  d_1  |
 *            | :         :  :  : |      |   :   |   |  d_2  |
 *            | 0     ..  1  0  0 |      | D_n-1 |   |   :   |
 *            | 0     ..  0  1  0 |      ~~     ~~   |   :   |
 *            | 0     ..  0  0  1 |                  | d_n-1 |
 *            ~~                 ~~                  ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *            __                               __
 *            |  1   1   1   1   1   1   1   1  |
 *            | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
 *            |  19 205 116  29  64  16  4   1  |      / /
 *            |  1   0   0   0   0   0   0   0  |     / /
 *            |  0   1   0   0   0   0   0   0  | <--'  /
 *  (V|I)  =  |  0   0   1   0   0   0   0   0  | <---'
 *            |  0   0   0   1   0   0   0   0  |
 *            |  0   0   0   0   1   0   0   0  |
 *            |  0   0   0   0   0   1   0   0  |
 *            |  0   0   0   0   0   0   1   0  |
 *            |  0   0   0   0   0   0   0   1  |
 *            ~~                               ~~
 *            __                               __
 *            |  1   1   1   1   1   1   1   1  |
 *            |  19 205 116  29  64  16  4   1  |
 *            |  1   0   0   0   0   0   0   0  |
 *  (V|I)' =  |  0   0   0   1   0   0   0   0  |
 *            |  0   0   0   0   1   0   0   0  |
 *            |  0   0   0   0   0   1   0   0  |
 *            |  0   0   0   0   0   0   1   0  |
 *            |  0   0   0   0   0   0   0   1  |
 *            ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 *             __                                                                 __
 *             |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 *             |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 *             |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *             |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *             |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *             |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *             |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *             |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *             ~~                                                                 ~~
 *             __                                                                 __
 *             |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *             |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 *             |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 *             |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *             |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *             |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *             |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *             |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *             ~~                                                                 ~~
 *             __                                                                 __
 *             |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *             |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 *             |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
 *             |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *             |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *             |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *             |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *             |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *             ~~                                                                 ~~
 *             __                                                                 __
 *             |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *             |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 *             |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
 *             |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *             |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *             |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *             |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *             |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *             ~~                                                                 ~~
 *             __                                                                 __
 *             |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *             |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 *             |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
 *             |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *             |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *             |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *             |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *             |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *             ~~                                                                 ~~
 *             __                                                                 __
 *             |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 *             |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
 *             |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
 *             |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 *             |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 *             |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 *             |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 *             |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 *             ~~                                                                 ~~
 *                     __                               __
 *                     |  0   0   1   0   0   0   0   0  |
 *                     | 167 100  5   41 159 169 217 208 |
 *                     | 166 100  4   40 158 168 216 209 |
 *       (V|I)'^-1 =   |  0   0   0   1   0   0   0   0  |
 *                     |  0   0   0   0   1   0   0   0  |
 *                     |  0   0   0   0   0   1   0   0  |
 *                     |  0   0   0   0   0   0   1   0  |
 *                     |  0   0   0   0   0   0   0   1  |
 *                     ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */

static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
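
/*
 * Example (illustrative): for the Q row (map[i] == 1) with n data
 * columns, the loop above emits 2^(n-1), 2^(n-2), ..., 2^0 -- the
 * Vandermonde coefficients that vdev_raidz_generate_parity_pq() applies
 * to successive data columns.
 */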

static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data left in the rows is properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}

static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	log = 0;	/* gcc */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = malloc(psize);
	if (p == NULL) {
		printf("Out of memory\n");
		return;
	}

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	free(p);
}

static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;

	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = malloc(psize);
	if (p == NULL) {
		printf("Out of memory\n");
		return (code);
	}

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	free(p);

	return (code);
}

static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}

static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	uint64_t b = offset >> unit_shift;
	uint64_t s = size >> unit_shift;
	uint64_t f = b % dcols;
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	bc = (r == 0 ? 0 : r + nparity);
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	if (q == 0) {
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = malloc(offsetof(raidz_map_t, rm_col[scols]));
	if (rm == NULL)
		return (rm);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rm->rm_col[c].rc_data = malloc(rm->rm_col[c].rc_size);
		if (rm->rm_col[c].rc_data == NULL) {
			c++;
			while (c != 0)
				free(rm->rm_col[--c].rc_data);
			free(rm);
			return (NULL);
		}
	}

	rm->rm_col[c].rc_data = data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
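
/*
 * Worked example (illustrative): a 6-sector read from a 5-disk raidz1
 * (dcols = 5, nparity = 1) gives q = 6 / 4 = 1, r = 2, bc = 3 and
 * tot = 6 + 1 * 2 = 8, so columns 0..2 carry q + 1 = 2 sectors each
 * (column 0 being parity) and columns 3..4 carry one sector each,
 * for asize = 8 sectors in all.
 */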

static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
		free(rm->rm_col[c].rc_data);

	free(rm);
}

static vdev_t *
vdev_child(vdev_t *pvd, uint64_t devidx)
{
	vdev_t *cvd;

	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
		if (cvd->v_id == devidx)
			break;
	}

	return (cvd);
}

/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
    uint64_t size)
{
	return (zio_checksum_verify(spa, bp, data));
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = malloc(rc->rc_size);
		if (orig[c] != NULL) {
			bcopy(rc->rc_data, orig[c], rc->rc_size);
		} else {
			printf("Out of memory\n");
		}
	}

	vdev_raidz_generate_parity(rm);

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (orig[c] == NULL ||
		    bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			rc->rc_error = ECKSUM;
			ret++;
		}
		free(orig[c]);
	}

	return (ret);
}

/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 */
static int
vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
    void *data, off_t offset, uint64_t bytes, int total_errors, int data_errors)
{
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = malloc(rm->rm_col[0].rc_size);
		if (orig[n - 1] == NULL) {
			ret = ENOMEM;
			goto done;
		}

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the
				 * current position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	n--;
done:
	for (i = n - 1; i >= 0; i--) {
		free(orig[i]);
	}

	return (ret);
}

static int
vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
    off_t offset, size_t bytes)
{
	vdev_t *tvd = vd->v_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, error;
	int unexpected_errors __unused;
	int parity_errors;
	int parity_untried;
	int data_errors;
	int total_errors;
	int n;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	rc = NULL;	/* gcc */
	error = 0;

	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
	    vd->v_nchildren, vd->v_nparity);
	if (rm == NULL)
		return (ENOMEM);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vdev_child(vd, rc->rc_devidx);
		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
#if 0	/* XXX: Too hard for the boot code. */
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
#endif
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
			    rc->rc_offset, rc->rc_size);
			rc->rc_tried = 1;
			rc->rc_skipped = 0;
		}
	}

reconstruct:
	unexpected_errors = 0;
	parity_errors = 0;
	parity_untried = 0;
	data_errors = 0;
	total_errors = 0;

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		int rv;

		if (data_errors == 0) {
			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read
	 * error or a child silently returned bad data. Read every
	 * block so we can try again with as much data and parity as
	 * we can track down. If we've already been through once
	 * before, all children will be marked as tried so we'll
	 * proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	n = 0;
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_tried)
			continue;

		cvd = vdev_child(vd, rc->rc_devidx);
		ASSERT(cvd != NULL);
		rc->rc_error = cvd->v_read(cvd, NULL,
		    rc->rc_data, rc->rc_offset, rc->rc_size);
		if (rc->rc_error == 0)
			n++;
		rc->rc_tried = 1;
		rc->rc_skipped = 0;
	}
	/*
	 * If we managed to read anything more, retry the
	 * reconstruction.
	 */
	if (n > 0)
		goto reconstruct;

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		error = EIO;
	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(vd->v_spa, rm, bp, data, offset, bytes,
	    total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, provided the I/O wasn't speculative.
		 */
		error = ECKSUM;
	}

done:
	vdev_raidz_map_free(rm);

	return (error);
}