/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/cdefs.h>

static uint64_t zfs_crc64_table[256];

#define ECKSUM 666	/* local stand-in errno for checksum errors */

#define ASSERT3S(x, y, z) ((void)0)
#define ASSERT3U(x, y, z) ((void)0)
#define ASSERT3P(x, y, z) ((void)0)
#define ASSERT0(x) ((void)0)
#define ASSERT(x) ((void)0)

#define panic(...) do { \
    printf(__VA_ARGS__); \
    for (;;) ; \
} while (0)

#define kmem_alloc(size, flag) zfs_alloc((size))
#define kmem_free(ptr, size) zfs_free((ptr), (size))

static void
zfs_init_crc(void)
{
    int i, j;
    uint64_t *ct;

    /*
     * Calculate the crc64 table (used for the zap hash
     * function).
     */
    if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
        memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
        for (i = 0; i < 256; i++)
            for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
                *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
    }
}
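
/*
 * Illustrative sketch (not part of the loader): once zfs_init_crc() has
 * populated zfs_crc64_table, a reflected CRC-64 over an arbitrary buffer
 * can be computed byte-by-byte exactly the way zap_hash() does further
 * below. The -1ULL seed here is an assumption for the example only; the
 * ZAP code seeds with the pool's salt instead.
 */
#if 0
static uint64_t
zfs_crc64_example(const void *buf, size_t len)
{
    const uint8_t *p = buf;
    uint64_t crc = -1ULL;	/* example seed, not an on-disk value */

    while (len-- != 0)
        crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ *p++) & 0xFF];
    return (crc);
}
#endif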

static void
zio_checksum_off(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
    ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}

/*
 * Signature for checksum functions.
 */
typedef void zio_checksum_t(const void *data, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp);
typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
typedef void zio_checksum_tmpl_free_t(void *ctx_template);

typedef enum zio_checksum_flags {
    /* Strong enough for metadata? */
    ZCHECKSUM_FLAG_METADATA = (1 << 1),
    /* ZIO embedded checksum */
    ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
    /* Strong enough for dedup (without verification)? */
    ZCHECKSUM_FLAG_DEDUP = (1 << 3),
    /* Uses salt value */
    ZCHECKSUM_FLAG_SALTED = (1 << 4),
    /* Strong enough for nopwrite? */
    ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
} zio_checksum_flags_t;

/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
    /* checksum function for each byteorder */
    zio_checksum_t *ci_func[2];
    zio_checksum_tmpl_init_t *ci_tmpl_init;
    zio_checksum_tmpl_free_t *ci_tmpl_free;
    zio_checksum_flags_t ci_flags;
    const char *ci_name;	/* descriptive name */
} zio_checksum_info_t;

#include "blkptr.c"

#include "fletcher.c"
#include "sha256.c"

static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
    {{NULL, NULL}, NULL, NULL, 0, "inherit"},
    {{NULL, NULL}, NULL, NULL, 0, "on"},
    {{zio_checksum_off, zio_checksum_off}, NULL, NULL, 0, "off"},
    {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
        ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
    {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
        ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
    {{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
        ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
    {{fletcher_2_native, fletcher_2_byteswap}, NULL, NULL,
        0, "fletcher2"},
    {{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
        ZCHECKSUM_FLAG_METADATA, "fletcher4"},
    {{zio_checksum_SHA256, zio_checksum_SHA256}, NULL, NULL,
        ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
        ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
    {{fletcher_4_native, fletcher_4_byteswap}, NULL, NULL,
        ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
    {{zio_checksum_off, zio_checksum_off}, NULL, NULL,
        0, "noparity"},
    {{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap},
        NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
        ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
    /* no skein and edonr for now */
    {{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA |
        ZCHECKSUM_FLAG_DEDUP | ZCHECKSUM_FLAG_SALTED |
        ZCHECKSUM_FLAG_NOPWRITE, "skein"},
    {{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA |
        ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
};

/*
 * Common signature for all zio compress/decompress functions.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

extern int gzip_decompress(void *src, void *dst,
    size_t s_len, size_t d_len, int);
/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
    zio_compress_func_t *ci_compress;	/* compression function */
    zio_decompress_func_t *ci_decompress;	/* decompression function */
    int ci_level;	/* level parameter */
    const char *ci_name;	/* algorithm name */
} zio_compress_info_t;

#include "lzjb.c"
#include "zle.c"
#include "lz4.c"

/*
 * Compression vectors.
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
    {NULL, NULL, 0, "inherit"},
    {NULL, NULL, 0, "on"},
    {NULL, NULL, 0, "uncompressed"},
    {NULL, lzjb_decompress, 0, "lzjb"},
    {NULL, NULL, 0, "empty"},
    {NULL, gzip_decompress, 1, "gzip-1"},
    {NULL, gzip_decompress, 2, "gzip-2"},
    {NULL, gzip_decompress, 3, "gzip-3"},
    {NULL, gzip_decompress, 4, "gzip-4"},
    {NULL, gzip_decompress, 5, "gzip-5"},
    {NULL, gzip_decompress, 6, "gzip-6"},
    {NULL, gzip_decompress, 7, "gzip-7"},
    {NULL, gzip_decompress, 8, "gzip-8"},
    {NULL, gzip_decompress, 9, "gzip-9"},
    {NULL, zle_decompress, 64, "zle"},
    {NULL, lz4_decompress, 0, "lz4"},
};

static void
byteswap_uint64_array(void *vbuf, size_t size)
{
    uint64_t *buf = vbuf;
    size_t count = size >> 3;
    int i;

    ASSERT((size & 7) == 0);

    for (i = 0; i < count; i++)
        buf[i] = BSWAP_64(buf[i]);
}

/*
 * Set the external verifier for a gang block based on <vdev, offset, txg>,
 * a tuple which is guaranteed to be unique for the life of the pool.
 */
static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
{
    const dva_t *dva = BP_IDENTITY(bp);
    uint64_t txg = BP_PHYSICAL_BIRTH(bp);

    ASSERT(BP_IS_GANG(bp));

    ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
}

/*
 * Set the external verifier for a label block based on its offset.
 * The vdev is implicit, and the txg is unknowable at pool open time --
 * hence the logic in vdev_uberblock_load() to find the most recent copy.
 */
static void
zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
{
    ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
}

/*
 * Calls the template init function of a checksum which supports context
 * templates and installs the template into the spa_t.
 */
static void
zio_checksum_template_init(enum zio_checksum checksum, const spa_t *spa)
{
    zio_checksum_info_t *ci = &zio_checksum_table[checksum];

    if (ci->ci_tmpl_init == NULL)
        return;
#if 0 /* for now we don't have anything here */
    if (spa->spa_cksum_tmpls[checksum] != NULL)
        return;

    VERIFY(ci->ci_tmpl_free != NULL);
    mutex_enter(&spa->spa_cksum_tmpls_lock);
    if (spa->spa_cksum_tmpls[checksum] == NULL) {
        spa->spa_cksum_tmpls[checksum] =
            ci->ci_tmpl_init(&spa->spa_cksum_salt);
        VERIFY(spa->spa_cksum_tmpls[checksum] != NULL);
    }
    mutex_exit(&spa->spa_cksum_tmpls_lock);
#endif
}

static int
zio_checksum_verify(const blkptr_t *bp, void *data)
{
    uint64_t size;
    unsigned int checksum;
    zio_checksum_info_t *ci;
    zio_cksum_t actual_cksum, expected_cksum, verifier;
    int byteswap;

    checksum = BP_GET_CHECKSUM(bp);
    size = BP_GET_PSIZE(bp);

    if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
        return (EINVAL);
    ci = &zio_checksum_table[checksum];
    if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
        return (EINVAL);

    zio_checksum_template_init(checksum, NULL);
    if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
        zio_eck_t *eck;

        ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
            checksum == ZIO_CHECKSUM_LABEL);

        eck = (zio_eck_t *)((char *)data + size) - 1;

        if (checksum == ZIO_CHECKSUM_GANG_HEADER)
            zio_checksum_gang_verifier(&verifier, bp);
        else if (checksum == ZIO_CHECKSUM_LABEL)
            zio_checksum_label_verifier(&verifier,
                DVA_GET_OFFSET(BP_IDENTITY(bp)));
        else
            verifier = bp->blk_cksum;

        byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

        if (byteswap)
            byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

        expected_cksum = eck->zec_cksum;
        eck->zec_cksum = verifier;
        ci->ci_func[byteswap](data, size, NULL, &actual_cksum);
        eck->zec_cksum = expected_cksum;

        if (byteswap)
            byteswap_uint64_array(&expected_cksum,
                sizeof (zio_cksum_t));
    } else {
        expected_cksum = bp->blk_cksum;
        ci->ci_func[0](data, size, NULL, &actual_cksum);
    }

    if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
        /*printf("ZFS: read checksum failed\n");*/
        return (EIO);
    }

    return (0);
}
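
/*
 * Illustrative sketch (not part of the loader): for embedded checksums the
 * zio_eck_t trailer occupies the last sizeof (zio_eck_t) bytes of the
 * buffer, so a label or gang header is laid out as payload followed by
 * zec_magic and zec_cksum. zec_magic doubles as the byte-order detector,
 * exactly as in zio_checksum_verify() above.
 */
#if 0
    zio_eck_t *eck = (zio_eck_t *)((char *)data + size) - 1;
    int byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
#endif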

static int
zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
    void *dest, uint64_t destsize)
{
    zio_compress_info_t *ci;

    if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
        printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
        return (EIO);
    }

    ci = &zio_compress_table[cpfunc];
    if (!ci->ci_decompress) {
        printf("ZFS: unsupported compression algorithm %s\n",
            ci->ci_name);
        return (EIO);
    }

    return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
}

static uint64_t
zap_hash(uint64_t salt, const char *name)
{
    const uint8_t *cp;
    uint8_t c;
    uint64_t crc = salt;

    ASSERT(crc != 0);
    ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
    for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
        crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];

    /*
     * Only use 28 bits, since we need 4 bits in the cookie for the
     * collision differentiator. We MUST use the high bits, since
     * those are the ones that we first pay attention to when
     * choosing the bucket.
     */
    crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);

    return (crc);
}
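
/*
 * Illustrative sketch (not part of the loader): the ZAP code elsewhere
 * consumes zap_hash() by taking the topmost bits of the hash as the
 * bucket index, which is why the low bits are masked off above. "shift"
 * below is hypothetical; the real prefix length comes from the ZAP
 * pointer table.
 */
#if 0
static uint64_t
zap_bucket_example(uint64_t hash, int shift)
{
    /* use the high-order bits, matching the comment in zap_hash() */
    return (hash >> (64 - shift));
}
#endif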

static void *zfs_alloc(size_t size);
static void zfs_free(void *ptr, size_t size);

typedef struct raidz_col {
    uint64_t rc_devidx;	/* child device index for I/O */
    uint64_t rc_offset;	/* device offset */
    uint64_t rc_size;	/* I/O size */
    void *rc_data;	/* I/O data */
    int rc_error;	/* I/O error for this device */
    uint8_t rc_tried;	/* Did we attempt this I/O column? */
    uint8_t rc_skipped;	/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
    uint64_t rm_cols;	/* Regular column count */
    uint64_t rm_scols;	/* Count including skipped columns */
    uint64_t rm_bigcols;	/* Number of oversized columns */
    uint64_t rm_asize;	/* Actual total I/O size */
    uint64_t rm_missingdata;	/* Count of missing data devices */
    uint64_t rm_missingparity;	/* Count of missing parity devices */
    uint64_t rm_firstdatacol;	/* First data column/parity count */
    uint64_t rm_nskip;	/* Skipped sectors for padding */
    uint64_t rm_skipstart;	/* Column index of padding start */
    uintptr_t rm_reports;	/* # of referencing checksum reports */
    uint8_t rm_freed;	/* map no longer has referencing ZIO */
    uint8_t rm_ecksuminjected;	/* checksum error was injected */
    raidz_col_t rm_col[1];	/* Flexible array of I/O columns */
} raidz_map_t;

#define VDEV_RAIDZ_P 0
#define VDEV_RAIDZ_Q 1
#define VDEV_RAIDZ_R 2

#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
    (mask) = (x) & 0x8080808080808080ULL; \
    (mask) = ((mask) << 1) - ((mask) >> 7); \
    (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
        ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
    VDEV_RAIDZ_64MUL_2((x), mask); \
    VDEV_RAIDZ_64MUL_2((x), mask); \
}
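
/*
 * Illustrative sketch (not part of the loader): VDEV_RAIDZ_64MUL_2() must
 * agree with applying VDEV_RAIDZ_MUL_2() independently to each of the
 * eight bytes packed into a 64-bit word. A throwaway self-check under
 * that assumption might look like this.
 */
#if 0
static int
vdev_raidz_64mul_check(uint64_t x)
{
    uint64_t fast = x, mask, slow = 0;
    int i;

    VDEV_RAIDZ_64MUL_2(fast, mask);
    for (i = 0; i < 8; i++) {
        uint8_t b = (x >> (i * 8)) & 0xff;

        /* multiply each byte lane by 2 in GF(2^8) individually */
        slow |= (uint64_t)(uint8_t)VDEV_RAIDZ_MUL_2(b) << (i * 8);
    }
    return (fast == slow);	/* expected to hold for all x */
}
#endif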

/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 */
static const uint8_t vdev_raidz_pow2[256] = {
    0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
    0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
    0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
    0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
    0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
    0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
    0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
    0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
    0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
    0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
    0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
    0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
    0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
    0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
    0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
    0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
    0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
    0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
    0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
    0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
    0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
    0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
    0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
    0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
    0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
    0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
    0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
    0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
    0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
    0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
    0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
    0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
    0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
    0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
    0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
    0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
    0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
    0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
    0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
    0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
    0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
    0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
    0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
    0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
    0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
    0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
    0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
    0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
    0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
    0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
    0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
    0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
    0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
    0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
    0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
    0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
    0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
    0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
    0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
    0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
    0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
    0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
    0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
    0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint8_t a, int exp)
{
    if (a == 0)
        return (0);

    ASSERT(exp >= 0);
    ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

    exp += vdev_raidz_log2[a];
    if (exp > 255)
        exp -= 255;

    return (vdev_raidz_pow2[exp]);
}
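
/*
 * Illustrative sketch (not part of the loader): a full GF(2^8) multiply
 * falls out of the same tables, since log2(a*b) = log2(a) + log2(b)
 * (mod 255). vdev_raidz_exp2() above is the special case where one
 * factor is already expressed as a power of 2.
 */
#if 0
static uint8_t
vdev_raidz_gf_mul(uint8_t a, uint8_t b)
{
    if (a == 0 || b == 0)
        return (0);
    /* pow2[(log2[a] + log2[b]) mod 255] */
    return (vdev_raidz_exp2(a, vdev_raidz_log2[b]));
}
#endif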

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
    uint64_t *p, *src, pcount __attribute__((unused)), ccount, i;
    int c;

    pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

    for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
        src = rm->rm_col[c].rc_data;
        p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
        ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

        if (c == rm->rm_firstdatacol) {
            ASSERT(ccount == pcount);
            for (i = 0; i < ccount; i++, src++, p++) {
                *p = *src;
            }
        } else {
            ASSERT(ccount <= pcount);
            for (i = 0; i < ccount; i++, src++, p++) {
                *p ^= *src;
            }
        }
    }
}

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
    uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
    int c;

    pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
    ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
        rm->rm_col[VDEV_RAIDZ_Q].rc_size);

    for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
        src = rm->rm_col[c].rc_data;
        p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
        q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

        ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

        if (c == rm->rm_firstdatacol) {
            ASSERT(ccnt == pcnt || ccnt == 0);
            for (i = 0; i < ccnt; i++, src++, p++, q++) {
                *p = *src;
                *q = *src;
            }
            for (; i < pcnt; i++, src++, p++, q++) {
                *p = 0;
                *q = 0;
            }
        } else {
            ASSERT(ccnt <= pcnt);

            /*
             * Apply the algorithm described above by multiplying
             * the previous result and adding in the new value.
             */
            for (i = 0; i < ccnt; i++, src++, p++, q++) {
                *p ^= *src;

                VDEV_RAIDZ_64MUL_2(*q, mask);
                *q ^= *src;
            }

            /*
             * Treat short columns as though they are full of 0s.
             * Note that there's therefore nothing needed for P.
             */
            for (; i < pcnt; i++, q++) {
                VDEV_RAIDZ_64MUL_2(*q, mask);
            }
        }
    }
}

static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
    uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
    int c;

    pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
    ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
        rm->rm_col[VDEV_RAIDZ_Q].rc_size);
    ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
        rm->rm_col[VDEV_RAIDZ_R].rc_size);

    for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
        src = rm->rm_col[c].rc_data;
        p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
        q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
        r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

        ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

        if (c == rm->rm_firstdatacol) {
            ASSERT(ccnt == pcnt || ccnt == 0);
            for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
                *p = *src;
                *q = *src;
                *r = *src;
            }
            for (; i < pcnt; i++, src++, p++, q++, r++) {
                *p = 0;
                *q = 0;
                *r = 0;
            }
        } else {
            ASSERT(ccnt <= pcnt);

            /*
             * Apply the algorithm described above by multiplying
             * the previous result and adding in the new value.
             */
            for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
                *p ^= *src;

                VDEV_RAIDZ_64MUL_2(*q, mask);
                *q ^= *src;

                VDEV_RAIDZ_64MUL_4(*r, mask);
                *r ^= *src;
            }

            /*
             * Treat short columns as though they are full of 0s.
             * Note that there's therefore nothing needed for P.
             */
            for (; i < pcnt; i++, q++, r++) {
                VDEV_RAIDZ_64MUL_2(*q, mask);
                VDEV_RAIDZ_64MUL_4(*r, mask);
            }
        }
    }
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
    switch (rm->rm_firstdatacol) {
    case 1:
        vdev_raidz_generate_parity_p(rm);
        break;
    case 2:
        vdev_raidz_generate_parity_pq(rm);
        break;
    case 3:
        vdev_raidz_generate_parity_pqr(rm);
        break;
    default:
        panic("invalid RAID-Z configuration");
    }
}
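
/*
 * Illustrative sketch (not part of the loader): at the level of a single
 * byte per column, the parity generated above reduces to
 *
 *	P = d_0 ^ d_1 ^ ... ^ d_n-1
 *	Q = 2^(n-1) * d_0 ^ ... ^ 2 * d_n-2 ^ d_n-1	(GF(2^8) arithmetic)
 *
 * which the Horner-style loop below makes explicit.
 */
#if 0
static void
raidz_pq_byte_example(const uint8_t *d, int n, uint8_t *p, uint8_t *q)
{
    int i;

    *p = *q = 0;
    for (i = 0; i < n; i++) {
        *p ^= d[i];
        /* Horner's rule: multiply the running Q by 2, then add d[i] */
        *q = VDEV_RAIDZ_MUL_2(*q) ^ d[i];
    }
}
#endif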

/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *             __   __                      __     __
 *             |     |    __     __         |  p_0   |
 *             |  V  |    | D_0   |         |  p_m-1 |
 *             |     |  x |   :   |    =    |  d_0   |
 *             |  I  |    | D_n-1 |         |   :    |
 *             |     |    ~~     ~~         |  d_n-1 |
 *             ~~   ~~                      ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity and speedy
 * computation as well as for linear separability.
 *
 *             __                     __                __     __
 *             | 1     ..  1   1   1   |                |  p_0   |
 *             | 2^n-1 ..  4   2   1   |   __     __    |   :    |
 *             | 4^n-1 ..  16  4   1   |   | D_0   |    |  p_m-1 |
 *             | 1     ..  0   0   0   |   | D_1   |    |  d_0   |
 *             | 0     ..  0   0   0   | x | D_2   | =  |  d_1   |
 *             | :         :   :   :   |   |  :    |    |  d_2   |
 *             | 0     ..  1   0   0   |   | D_n-1 |    |   :    |
 *             | 0     ..  0   1   0   |   ~~     ~~    |   :    |
 *             | 0     ..  0   0   1   |                |  d_n-1 |
 *             ~~                     ~~                ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *
 *             __                                 __
 *             |   1   1   1   1   1   1   1   1 |
 *             | 128  64  32  16   8   4   2   1 | <-----+-+-- missing disks
 *             |  19 205 116  29  64  16   4   1 |      / /
 *             |   1   0   0   0   0   0   0   0 |     / /
 *             |   0   1   0   0   0   0   0   0 | <--'  /
 *  (V|I)  =   |   0   0   1   0   0   0   0   0 | <---'
 *             |   0   0   0   1   0   0   0   0 |
 *             |   0   0   0   0   1   0   0   0 |
 *             |   0   0   0   0   0   1   0   0 |
 *             |   0   0   0   0   0   0   1   0 |
 *             |   0   0   0   0   0   0   0   1 |
 *             ~~                                 ~~
 *             __                                 __
 *             |   1   1   1   1   1   1   1   1 |
 *             |  19 205 116  29  64  16   4   1 |
 *             |   1   0   0   0   0   0   0   0 |
 *  (V|I)' =   |   0   0   0   1   0   0   0   0 |
 *             |   0   0   0   0   1   0   0   0 |
 *             |   0   0   0   0   0   1   0   0 |
 *             |   0   0   0   0   0   0   1   0 |
 *             |   0   0   0   0   0   0   0   1 |
 *             ~~                                 ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 *             __                                                                 __
 *             |   1   1   1   1   1   1   1   1   1   0   0   0   0   0   0   0 |
 *             |  19 205 116  29  64  16   4   1   0   1   0   0   0   0   0   0 |
 *             |   1   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0 |
 *             |   0   0   0   1   0   0   0   0   0   0   0   1   0   0   0   0 |
 *             |   0   0   0   0   1   0   0   0   0   0   0   0   1   0   0   0 |
 *             |   0   0   0   0   0   1   0   0   0   0   0   0   0   1   0   0 |
 *             |   0   0   0   0   0   0   1   0   0   0   0   0   0   0   1   0 |
 *             |   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   1 |
 *             ~~                                                                 ~~
 *             __                                                                 __
 *             |   1   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0 |
 *             |   1   1   1   1   1   1   1   1   1   0   0   0   0   0   0   0 |
 *             |  19 205 116  29  64  16   4   1   0   1   0   0   0   0   0   0 |
 *             |   0   0   0   1   0   0   0   0   0   0   0   1   0   0   0   0 |
 *             |   0   0   0   0   1   0   0   0   0   0   0   0   1   0   0   0 |
 *             |   0   0   0   0   0   1   0   0   0   0   0   0   0   1   0   0 |
 *             |   0   0   0   0   0   0   1   0   0   0   0   0   0   0   1   0 |
 *             |   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   1 |
 *             ~~                                                                 ~~
 *             __                                                                 __
 *             |   1   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0 |
 *             |   0   1   1   0   0   0   0   0   1   0   1   1   1   1   1   1 |
 *             |   0 205 116   0   0   0   0   0   0   1  19  29  64  16   4   1 |
 *             |   0   0   0   1   0   0   0   0   0   0   0   1   0   0   0   0 |
 *             |   0   0   0   0   1   0   0   0   0   0   0   0   1   0   0   0 |
 *             |   0   0   0   0   0   1   0   0   0   0   0   0   0   1   0   0 |
 *             |   0   0   0   0   0   0   1   0   0   0   0   0   0   0   1   0 |
 *             |   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   1 |
 *             ~~                                                                 ~~
 *             __                                                                 __
 *             |   1   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0 |
 *             |   0   1   1   0   0   0   0   0   1   0   1   1   1   1   1   1 |
 *             |   0   0 185   0   0   0   0   0 205   1 222 208 141 221 201 204 |
 *             |   0   0   0   1   0   0   0   0   0   0   0   1   0   0   0   0 |
 *             |   0   0   0   0   1   0   0   0   0   0   0   0   1   0   0   0 |
 *             |   0   0   0   0   0   1   0   0   0   0   0   0   0   1   0   0 |
 *             |   0   0   0   0   0   0   1   0   0   0   0   0   0   0   1   0 |
 *             |   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   1 |
 *             ~~                                                                 ~~
 *             __                                                                 __
 *             |   1   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0 |
 *             |   0   1   1   0   0   0   0   0   1   0   1   1   1   1   1   1 |
 *             |   0   0   1   0   0   0   0   0 166 100   4  40 158 168 216 209 |
 *             |   0   0   0   1   0   0   0   0   0   0   0   1   0   0   0   0 |
 *             |   0   0   0   0   1   0   0   0   0   0   0   0   1   0   0   0 |
 *             |   0   0   0   0   0   1   0   0   0   0   0   0   0   1   0   0 |
 *             |   0   0   0   0   0   0   1   0   0   0   0   0   0   0   1   0 |
 *             |   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   1 |
 *             ~~                                                                 ~~
 *             __                                                                 __
 *             |   1   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0 |
 *             |   0   1   0   0   0   0   0   0 167 100   5  41 159 169 217 208 |
 *             |   0   0   1   0   0   0   0   0 166 100   4  40 158 168 216 209 |
 *             |   0   0   0   1   0   0   0   0   0   0   0   1   0   0   0   0 |
 *             |   0   0   0   0   1   0   0   0   0   0   0   0   1   0   0   0 |
 *             |   0   0   0   0   0   1   0   0   0   0   0   0   0   1   0   0 |
 *             |   0   0   0   0   0   0   1   0   0   0   0   0   0   0   1   0 |
 *             |   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   1 |
 *             ~~                                                                 ~~
 *                __                                 __
 *                |   0   0   1   0   0   0   0   0 |
 *                | 167 100   5  41 159 169 217 208 |
 *                | 166 100   4  40 158 168 216 209 |
 *  (V|I)'^-1 =   |   0   0   0   1   0   0   0   0 |
 *                |   0   0   0   0   1   0   0   0 |
 *                |   0   0   0   0   0   1   0   0 |
 *                |   0   0   0   0   0   0   1   0 |
 *                |   0   0   0   0   0   0   0   1 |
 *                ~~                                 ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */
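
/*
 * Illustrative sketch (not part of the loader): the parity row for map
 * value m (0 for P, 1 for Q, 2 for R) over n data columns is simply
 * (2^m)^(n-1), ..., (2^m)^1, (2^m)^0 evaluated in GF(2^8), which is what
 * vdev_raidz_matrix_init() below computes with a running exponent.
 */
#if 0
static void
raidz_vandermonde_row_example(int m, int n, uint8_t *row)
{
    int j, pow;

    for (j = 0; j < n; j++) {
        pow = (m * (n - 1 - j)) % 255;	/* exponent of 2 for column j */
        row[j] = vdev_raidz_pow2[pow];
    }
}
#endif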

static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
    int i, j;
    int pow;

    ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

    /*
     * Fill in the missing rows of interest.
     */
    for (i = 0; i < nmap; i++) {
        ASSERT3S(0, <=, map[i]);
        ASSERT3S(map[i], <=, 2);

        pow = map[i] * n;
        if (pow > 255)
            pow -= 255;
        ASSERT(pow <= 255);

        for (j = 0; j < n; j++) {
            pow -= map[i];
            if (pow < 0)
                pow += 255;
            rows[i][j] = vdev_raidz_pow2[pow];
        }
    }
}

static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
    int i, j, ii, jj;
    uint8_t log;

    /*
     * Assert that the first nmissing entries from the array of used
     * columns correspond to parity columns and that subsequent entries
     * correspond to data columns.
     */
    for (i = 0; i < nmissing; i++) {
        ASSERT3S(used[i], <, rm->rm_firstdatacol);
    }
    for (; i < n; i++) {
        ASSERT3S(used[i], >=, rm->rm_firstdatacol);
    }

    /*
     * First initialize the storage where we'll compute the inverse rows.
     */
    for (i = 0; i < nmissing; i++) {
        for (j = 0; j < n; j++) {
            invrows[i][j] = (i == j) ? 1 : 0;
        }
    }

    /*
     * Subtract all trivial rows from the rows of consequence.
     */
    for (i = 0; i < nmissing; i++) {
        for (j = nmissing; j < n; j++) {
            ASSERT3U(used[j], >=, rm->rm_firstdatacol);
            jj = used[j] - rm->rm_firstdatacol;
            ASSERT3S(jj, <, n);
            invrows[i][j] = rows[i][jj];
            rows[i][jj] = 0;
        }
    }

    /*
     * For each of the rows of interest, we must normalize it and subtract
     * a multiple of it from the other rows.
     */
    for (i = 0; i < nmissing; i++) {
        for (j = 0; j < missing[i]; j++) {
            ASSERT3U(rows[i][j], ==, 0);
        }
        ASSERT3U(rows[i][missing[i]], !=, 0);

        /*
         * Compute the inverse of the first element and multiply each
         * element in the row by that value.
         */
        log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

        for (j = 0; j < n; j++) {
            rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
            invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
        }

        for (ii = 0; ii < nmissing; ii++) {
            if (i == ii)
                continue;

            ASSERT3U(rows[ii][missing[i]], !=, 0);

            log = vdev_raidz_log2[rows[ii][missing[i]]];

            for (j = 0; j < n; j++) {
                rows[ii][j] ^=
                    vdev_raidz_exp2(rows[i][j], log);
                invrows[ii][j] ^=
                    vdev_raidz_exp2(invrows[i][j], log);
            }
        }
    }

    /*
     * Verify that the data that is left in the rows is properly part of
     * an identity matrix.
     */
    for (i = 0; i < nmissing; i++) {
        for (j = 0; j < n; j++) {
            if (j == missing[i]) {
                ASSERT3U(rows[i][j], ==, 1);
            } else {
                ASSERT3U(rows[i][j], ==, 0);
            }
        }
    }
}

static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
    int i, j, x, cc, c;
    uint8_t *src;
    uint64_t ccount;
    uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
    uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
    uint8_t log, val;
    int ll;
    uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
    uint8_t *p, *pp;
    size_t psize;

    log = 0;	/* gcc */
    psize = sizeof (invlog[0][0]) * n * nmissing;
    p = zfs_alloc(psize);

    for (pp = p, i = 0; i < nmissing; i++) {
        invlog[i] = pp;
        pp += n;
    }

    for (i = 0; i < nmissing; i++) {
        for (j = 0; j < n; j++) {
            ASSERT3U(invrows[i][j], !=, 0);
            invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
        }
    }

    for (i = 0; i < n; i++) {
        c = used[i];
        ASSERT3U(c, <, rm->rm_cols);

        src = rm->rm_col[c].rc_data;
        ccount = rm->rm_col[c].rc_size;
        for (j = 0; j < nmissing; j++) {
            cc = missing[j] + rm->rm_firstdatacol;
            ASSERT3U(cc, >=, rm->rm_firstdatacol);
            ASSERT3U(cc, <, rm->rm_cols);
            ASSERT3U(cc, !=, c);

            dst[j] = rm->rm_col[cc].rc_data;
            dcount[j] = rm->rm_col[cc].rc_size;
        }

        ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

        for (x = 0; x < ccount; x++, src++) {
            if (*src != 0)
                log = vdev_raidz_log2[*src];

            for (cc = 0; cc < nmissing; cc++) {
                if (x >= dcount[cc])
                    continue;

                if (*src == 0) {
                    val = 0;
                } else {
                    if ((ll = log + invlog[cc][i]) >= 255)
                        ll -= 255;
                    val = vdev_raidz_pow2[ll];
                }

                if (i == 0)
                    dst[cc][x] = val;
                else
                    dst[cc][x] ^= val;
            }
        }
    }

    zfs_free(p, psize);
}

static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
    int n, i, c, t, tt;
    int nmissing_rows;
    int missing_rows[VDEV_RAIDZ_MAXPARITY];
    int parity_map[VDEV_RAIDZ_MAXPARITY];

    uint8_t *p, *pp;
    size_t psize;

    uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
    uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
    uint8_t *used;

    int code = 0;

    n = rm->rm_cols - rm->rm_firstdatacol;

    /*
     * Figure out which data columns are missing.
     */
    nmissing_rows = 0;
    for (t = 0; t < ntgts; t++) {
        if (tgts[t] >= rm->rm_firstdatacol) {
            missing_rows[nmissing_rows++] =
                tgts[t] - rm->rm_firstdatacol;
        }
    }

    /*
     * Figure out which parity columns to use to help generate the missing
     * data columns.
     */
    for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
        ASSERT(tt < ntgts);
        ASSERT(c < rm->rm_firstdatacol);

        /*
         * Skip any targeted parity columns.
         */
        if (c == tgts[tt]) {
            tt++;
            continue;
        }

        code |= 1 << c;

        parity_map[i] = c;
        i++;
    }

    ASSERT(code != 0);
    ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

    psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
        nmissing_rows * n + sizeof (used[0]) * n;
    p = kmem_alloc(psize, KM_SLEEP);

    for (pp = p, i = 0; i < nmissing_rows; i++) {
        rows[i] = pp;
        pp += n;
        invrows[i] = pp;
        pp += n;
    }
    used = pp;

    for (i = 0; i < nmissing_rows; i++) {
        used[i] = parity_map[i];
    }

    for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
        if (tt < nmissing_rows &&
            c == missing_rows[tt] + rm->rm_firstdatacol) {
            tt++;
            continue;
        }

        ASSERT3S(i, <, n);
        used[i] = c;
        i++;
    }

    /*
     * Initialize the interesting rows of the matrix.
     */
    vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

    /*
     * Invert the matrix.
     */
    vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
        invrows, used);

    /*
     * Reconstruct the missing data using the generated matrix.
     */
    vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
        invrows, used);

    kmem_free(p, psize);

    return (code);
}

static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
    int tgts[VDEV_RAIDZ_MAXPARITY];
    int ntgts;
    int i, c;
    int code;
    int nbadparity, nbaddata;

    /*
     * The tgts list must already be sorted.
     */
    for (i = 1; i < nt; i++) {
        ASSERT(t[i] > t[i - 1]);
    }

    nbadparity = rm->rm_firstdatacol;
    nbaddata = rm->rm_cols - nbadparity;
    ntgts = 0;
    for (i = 0, c = 0; c < rm->rm_cols; c++) {
        if (i < nt && c == t[i]) {
            tgts[ntgts++] = c;
            i++;
        } else if (rm->rm_col[c].rc_error != 0) {
            tgts[ntgts++] = c;
        } else if (c >= rm->rm_firstdatacol) {
            nbaddata--;
        } else {
            nbadparity--;
        }
    }

    ASSERT(ntgts >= nt);
    ASSERT(nbaddata >= 0);
    ASSERT(nbaddata + nbadparity == ntgts);

    code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
    ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
    ASSERT(code > 0);
    return (code);
}

static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
{
    raidz_map_t *rm;
    uint64_t b = offset >> unit_shift;
    uint64_t s = size >> unit_shift;
    uint64_t f = b % dcols;
    uint64_t o = (b / dcols) << unit_shift;
    uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

    q = s / (dcols - nparity);
    r = s - q * (dcols - nparity);
    bc = (r == 0 ? 0 : r + nparity);
    tot = s + nparity * (q + (r == 0 ? 0 : 1));

    if (q == 0) {
        acols = bc;
        scols = MIN(dcols, roundup(bc, nparity + 1));
    } else {
        acols = dcols;
        scols = dcols;
    }

    ASSERT3U(acols, <=, scols);

    rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));

    rm->rm_cols = acols;
    rm->rm_scols = scols;
    rm->rm_bigcols = bc;
    rm->rm_skipstart = bc;
    rm->rm_missingdata = 0;
    rm->rm_missingparity = 0;
    rm->rm_firstdatacol = nparity;
    rm->rm_reports = 0;
    rm->rm_freed = 0;
    rm->rm_ecksuminjected = 0;

    asize = 0;

    for (c = 0; c < scols; c++) {
        col = f + c;
        coff = o;
        if (col >= dcols) {
            col -= dcols;
            coff += 1ULL << unit_shift;
        }
        rm->rm_col[c].rc_devidx = col;
        rm->rm_col[c].rc_offset = coff;
        rm->rm_col[c].rc_data = NULL;
        rm->rm_col[c].rc_error = 0;
        rm->rm_col[c].rc_tried = 0;
        rm->rm_col[c].rc_skipped = 0;

        if (c >= acols)
            rm->rm_col[c].rc_size = 0;
        else if (c < bc)
            rm->rm_col[c].rc_size = (q + 1) << unit_shift;
        else
            rm->rm_col[c].rc_size = q << unit_shift;

        asize += rm->rm_col[c].rc_size;
    }

    ASSERT3U(asize, ==, tot << unit_shift);
    rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
    rm->rm_nskip = roundup(tot, nparity + 1) - tot;
    ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
    ASSERT3U(rm->rm_nskip, <=, nparity);

    for (c = 0; c < rm->rm_firstdatacol; c++)
        rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);

    rm->rm_col[c].rc_data = data;

    for (c = c + 1; c < acols; c++)
        rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
            rm->rm_col[c - 1].rc_size;

    /*
     * If all data stored spans all columns, there's a danger that parity
     * will always be on the same device and, since parity isn't read
     * during normal operation, that that device's I/O bandwidth won't be
     * used effectively. We therefore switch the parity every 1MB.
     *
     * ... at least that was, ostensibly, the theory. As a practical
     * matter unless we juggle the parity between all devices evenly, we
     * won't see any benefit. Further, occasional writes that aren't a
     * multiple of the LCM of the number of children and the minimum
     * stripe width are sufficient to avoid pessimal behavior.
     * Unfortunately, this decision created an implicit on-disk format
     * requirement that we need to support for all eternity, but only
     * for single-parity RAID-Z.
     *
     * If we intend to skip a sector in the zeroth column for padding
     * we must make sure to note this swap. We will never intend to
     * skip the first column since at least one data and one parity
     * column must appear in each row.
     */
    ASSERT(rm->rm_cols >= 2);
    ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

    if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
        devidx = rm->rm_col[0].rc_devidx;
        o = rm->rm_col[0].rc_offset;
        rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
        rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
        rm->rm_col[1].rc_devidx = devidx;
        rm->rm_col[1].rc_offset = o;

        if (rm->rm_skipstart == 0)
            rm->rm_skipstart = 1;
    }

    return (rm);
}
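
/*
 * Illustrative sketch (not part of the loader): a worked example of the
 * geometry above. For a 16 KB read from a 5-wide RAIDZ1 (dcols = 5,
 * nparity = 1) with 512-byte sectors (unit_shift = 9): s = 32 sectors,
 * q = 32 / 4 = 8 and r = 0, so every column carries 8 sectors and
 * column 0 is the parity column (rm_firstdatacol = 1). "buf" and
 * "offset" below are hypothetical placeholders.
 */
#if 0
    rm = vdev_raidz_map_alloc(buf, offset, 16384, 9, 5, 1);
    /* rm->rm_cols == 5; rm->rm_col[c].rc_size == 8 * 512 for every c */
    vdev_raidz_map_free(rm);
#endif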

static void
vdev_raidz_map_free(raidz_map_t *rm)
{
    int c;

    for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
        zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);

    zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

static vdev_t *
vdev_child(vdev_t *pvd, uint64_t devidx)
{
    vdev_t *cvd;

    STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
        if (cvd->v_id == devidx)
            break;
    }

    return (cvd);
}

/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(const blkptr_t *bp, void *data, uint64_t size)
{

    return (zio_checksum_verify(bp, data));
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(raidz_map_t *rm)
{
    void *orig[VDEV_RAIDZ_MAXPARITY];
    int c, ret = 0;
    raidz_col_t *rc;

    for (c = 0; c < rm->rm_firstdatacol; c++) {
        rc = &rm->rm_col[c];
        if (!rc->rc_tried || rc->rc_error != 0)
            continue;
        orig[c] = zfs_alloc(rc->rc_size);
        bcopy(rc->rc_data, orig[c], rc->rc_size);
    }

    vdev_raidz_generate_parity(rm);

    for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
        rc = &rm->rm_col[c];
        if (!rc->rc_tried || rc->rc_error != 0)
            continue;
        if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
            rc->rc_error = ECKSUM;
            ret++;
        }
        zfs_free(orig[c], rc->rc_size);
    }

    return (ret);
}

/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 */
static int
vdev_raidz_combrec(raidz_map_t *rm, const blkptr_t *bp, void *data,
    off_t offset, uint64_t bytes, int total_errors, int data_errors)
{
    raidz_col_t *rc;
    void *orig[VDEV_RAIDZ_MAXPARITY];
    int tstore[VDEV_RAIDZ_MAXPARITY + 2];
    int *tgts = &tstore[1];
    int current, next, i, c, n;
    int code, ret = 0;

    ASSERT(total_errors < rm->rm_firstdatacol);

    /*
     * This simplifies one edge condition.
     */
    tgts[-1] = -1;

    for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
        /*
         * Initialize the targets array by finding the first n columns
         * that contain no error.
         *
         * If there were no data errors, we need to ensure that we're
         * always explicitly attempting to reconstruct at least one
         * data column. To do this, we simply push the highest target
         * up into the data columns.
         */
        for (c = 0, i = 0; i < n; i++) {
            if (i == n - 1 && data_errors == 0 &&
                c < rm->rm_firstdatacol) {
                c = rm->rm_firstdatacol;
            }

            while (rm->rm_col[c].rc_error != 0) {
                c++;
                ASSERT3S(c, <, rm->rm_cols);
            }

            tgts[i] = c++;
        }

        /*
         * Setting tgts[n] simplifies the other edge condition.
         */
        tgts[n] = rm->rm_cols;

        /*
         * These buffers were allocated in previous iterations.
         */
        for (i = 0; i < n - 1; i++) {
            ASSERT(orig[i] != NULL);
        }

        orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);

        current = 0;
        next = tgts[current];

        while (current != n) {
            tgts[current] = next;
            current = 0;

            /*
             * Save off the original data that we're going to
             * attempt to reconstruct.
             */
            for (i = 0; i < n; i++) {
                ASSERT(orig[i] != NULL);
                c = tgts[i];
                ASSERT3S(c, >=, 0);
                ASSERT3S(c, <, rm->rm_cols);
                rc = &rm->rm_col[c];
                bcopy(rc->rc_data, orig[i], rc->rc_size);
            }

            /*
             * Attempt a reconstruction and exit the outer loop on
             * success.
             */
            code = vdev_raidz_reconstruct(rm, tgts, n);
            if (raidz_checksum_verify(bp, data, bytes) == 0) {
                for (i = 0; i < n; i++) {
                    c = tgts[i];
                    rc = &rm->rm_col[c];
                    ASSERT(rc->rc_error == 0);
                    rc->rc_error = ECKSUM;
                }

                ret = code;
                goto done;
            }

            /*
             * Restore the original data.
             */
            for (i = 0; i < n; i++) {
                c = tgts[i];
                rc = &rm->rm_col[c];
                bcopy(orig[i], rc->rc_data, rc->rc_size);
            }

            do {
                /*
                 * Find the next valid column after the current
                 * position.
                 */
                for (next = tgts[current] + 1;
                    next < rm->rm_cols &&
                    rm->rm_col[next].rc_error != 0; next++)
                    continue;

                ASSERT(next <= tgts[current + 1]);

                /*
                 * If that spot is available, we're done here.
                 */
                if (next != tgts[current + 1])
                    break;

                /*
                 * Otherwise, find the next valid column after
                 * the previous position.
                 */
                for (c = tgts[current - 1] + 1;
                    rm->rm_col[c].rc_error != 0; c++)
                    continue;

                tgts[current] = c;
                current++;

            } while (current != n);
        }
    }
    n--;
done:
    for (i = n - 1; i >= 0; i--) {
        zfs_free(orig[i], rm->rm_col[0].rc_size);
    }

    return (ret);
}

static int
vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
    off_t offset, size_t bytes)
{
    vdev_t *tvd = vd->v_top;
    vdev_t *cvd;
    raidz_map_t *rm;
    raidz_col_t *rc;
    int c, error;
    int unexpected_errors;
    int parity_errors;
    int parity_untried;
    int data_errors;
    int total_errors;
    int n;
    int tgts[VDEV_RAIDZ_MAXPARITY];
    int code;

    rc = NULL;	/* gcc */
    error = 0;

    rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
        vd->v_nchildren, vd->v_nparity);

    /*
     * Iterate over the columns in reverse order so that we hit the parity
     * last -- any errors along the way will force us to read the parity.
     */
    for (c = rm->rm_cols - 1; c >= 0; c--) {
        rc = &rm->rm_col[c];
        cvd = vdev_child(vd, rc->rc_devidx);
        if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
            if (c >= rm->rm_firstdatacol)
                rm->rm_missingdata++;
            else
                rm->rm_missingparity++;
            rc->rc_error = ENXIO;
            rc->rc_tried = 1;	/* don't even try */
            rc->rc_skipped = 1;
            continue;
        }
#if 0 /* XXX: Too hard for the boot code. */
        if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
            if (c >= rm->rm_firstdatacol)
                rm->rm_missingdata++;
            else
                rm->rm_missingparity++;
            rc->rc_error = ESTALE;
            rc->rc_skipped = 1;
            continue;
        }
#endif
        if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
            rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
                rc->rc_offset, rc->rc_size);
            rc->rc_tried = 1;
            rc->rc_skipped = 0;
        }
    }

reconstruct:
    unexpected_errors = 0;
    parity_errors = 0;
    parity_untried = 0;
    data_errors = 0;
    total_errors = 0;

    ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
    ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

    for (c = 0; c < rm->rm_cols; c++) {
        rc = &rm->rm_col[c];

        if (rc->rc_error) {
            ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

            if (c < rm->rm_firstdatacol)
                parity_errors++;
            else
                data_errors++;

            if (!rc->rc_skipped)
                unexpected_errors++;

            total_errors++;
        } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
            parity_untried++;
        }
    }

    /*
     * There are three potential phases for a read:
     *	1. produce valid data from the columns read
     *	2. read all disks and try again
     *	3. perform combinatorial reconstruction
     *
     * Each phase is progressively both more expensive and less likely to
     * occur. If we encounter more errors than we can repair or all phases
     * fail, we have no choice but to return an error.
     */

    /*
     * If the number of errors we saw was correctable -- less than or equal
     * to the number of parity disks read -- attempt to produce data that
     * has a valid checksum. Naturally, this case applies in the absence of
     * any errors.
     */
    if (total_errors <= rm->rm_firstdatacol - parity_untried) {
        if (data_errors == 0) {
            if (raidz_checksum_verify(bp, data, bytes) == 0) {
                /*
                 * If we read parity information (unnecessarily
                 * as it happens since no reconstruction was
                 * needed), regenerate and verify the parity.
                 * We also regenerate parity when resilvering
                 * so we can write it out to the failed device
                 * later.
                 */
                if (parity_errors + parity_untried <
                    rm->rm_firstdatacol) {
                    n = raidz_parity_verify(rm);
                    unexpected_errors += n;
                    ASSERT(parity_errors + n <=
                        rm->rm_firstdatacol);
                }
                goto done;
            }
        } else {
            /*
             * We either attempt to read all the parity columns or
             * none of them. If we didn't try to read parity, we
             * wouldn't be here in the correctable case. There must
             * also have been fewer parity errors than parity
             * columns or, again, we wouldn't be in this code path.
             */
            ASSERT(parity_untried == 0);
            ASSERT(parity_errors < rm->rm_firstdatacol);

            /*
             * Identify the data columns that reported an error.
             */
            n = 0;
            for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
                rc = &rm->rm_col[c];
                if (rc->rc_error != 0) {
                    ASSERT(n < VDEV_RAIDZ_MAXPARITY);
                    tgts[n++] = c;
                }
            }

            ASSERT(rm->rm_firstdatacol >= n);

            code = vdev_raidz_reconstruct(rm, tgts, n);

            if (raidz_checksum_verify(bp, data, bytes) == 0) {
                /*
                 * If we read more parity disks than were used
                 * for reconstruction, confirm that the other
                 * parity disks produced correct data. This
                 * routine is suboptimal in that it regenerates
                 * the parity that we already used in addition
                 * to the parity that we're attempting to
                 * verify, but this should be a relatively
                 * uncommon case, and can be optimized if it
                 * becomes a problem. Note that we regenerate
                 * parity when resilvering so we can write it
                 * out to failed devices later.
                 */
                if (parity_errors < rm->rm_firstdatacol - n) {
                    n = raidz_parity_verify(rm);
                    unexpected_errors += n;
                    ASSERT(parity_errors + n <=
                        rm->rm_firstdatacol);
                }

                goto done;
            }
        }
    }

    /*
     * This isn't a typical situation -- either we got a read
     * error or a child silently returned bad data. Read every
     * block so we can try again with as much data and parity as
     * we can track down. If we've already been through once
     * before, all children will be marked as tried so we'll
     * proceed to combinatorial reconstruction.
     */
    unexpected_errors = 1;
    rm->rm_missingdata = 0;
    rm->rm_missingparity = 0;

    n = 0;
    for (c = 0; c < rm->rm_cols; c++) {
        rc = &rm->rm_col[c];

        if (rc->rc_tried)
            continue;

        cvd = vdev_child(vd, rc->rc_devidx);
        ASSERT(cvd != NULL);
        rc->rc_error = cvd->v_read(cvd, NULL,
            rc->rc_data, rc->rc_offset, rc->rc_size);
        if (rc->rc_error == 0)
            n++;
        rc->rc_tried = 1;
        rc->rc_skipped = 0;
    }
    /*
     * If we managed to read anything more, retry the
     * reconstruction.
     */
    if (n > 0)
        goto reconstruct;

    /*
     * At this point we've attempted to reconstruct the data given the
     * errors we detected, and we've attempted to read all columns. There
     * must, therefore, be one or more additional problems -- silent errors
     * resulting in invalid data rather than explicit I/O errors resulting
     * in absent data. We check if there is enough additional data to
     * possibly reconstruct the data and then perform combinatorial
     * reconstruction over all possible combinations. If that fails,
     * we're cooked.
     */
    if (total_errors > rm->rm_firstdatacol) {
        error = EIO;
    } else if (total_errors < rm->rm_firstdatacol &&
        (code = vdev_raidz_combrec(rm, bp, data, offset, bytes,
        total_errors, data_errors)) != 0) {
        /*
         * If we didn't use all the available parity for the
         * combinatorial reconstruction, verify that the remaining
         * parity is correct.
         */
        if (code != (1 << rm->rm_firstdatacol) - 1)
            (void) raidz_parity_verify(rm);
    } else {
        /*
         * We're here because either:
         *
         *	total_errors == rm_firstdatacol, or
         *	vdev_raidz_combrec() failed
         *
         * In either case, there is enough bad data to prevent
         * reconstruction.
         *
         * Start checksum ereports for all children which haven't
         * failed, and the IO wasn't speculative.
         */
        error = ECKSUM;
    }

done:
    vdev_raidz_map_free(rm);

    return (error);
}