xref: /freebsd/sys/cddl/boot/zfs/zfssubr.c (revision 2284664ef9fcb0baaf59f1ef7df877c0b0f2b187)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/cdefs.h>
27 __FBSDID("$FreeBSD$");
28 
29 static uint64_t zfs_crc64_table[256];
30 
31 #define	ECKSUM	666
32 
33 #define	ASSERT3S(x, y, z)	((void)0)
34 #define	ASSERT3U(x, y, z)	((void)0)
35 #define	ASSERT3P(x, y, z)	((void)0)
36 #define	ASSERT0(x)		((void)0)
37 #define	ASSERT(x)		((void)0)
38 
39 #define	panic(...)	do {						\
40 	printf(__VA_ARGS__);						\
41 	for (;;) ;							\
42 } while (0)
43 
44 #define	kmem_alloc(size, flag)	zfs_alloc((size))
45 #define	kmem_free(ptr, size)	zfs_free((ptr), (size))
46 
47 static void
48 zfs_init_crc(void)
49 {
50 	int i, j;
51 	uint64_t *ct;
52 
53 	/*
54 	 * Calculate the crc64 table (used for the zap hash
55 	 * function).
56 	 */
57 	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
58 		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
59 		for (i = 0; i < 256; i++)
60 			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
61 				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
62 	}
63 }
64 
65 static void
66 zio_checksum_off(const void *buf, uint64_t size,
67     const void *ctx_template, zio_cksum_t *zcp)
68 {
69 	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
70 }
71 
72 /*
73  * Signature for checksum functions.
74  */
75 typedef void zio_checksum_t(const void *data, uint64_t size,
76     const void *ctx_template, zio_cksum_t *zcp);
77 typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
78 typedef void zio_checksum_tmpl_free_t(void *ctx_template);
79 
80 typedef enum zio_checksum_flags {
81 	/* Strong enough for metadata? */
82 	ZCHECKSUM_FLAG_METADATA = (1 << 1),
83 	/* ZIO embedded checksum */
84 	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
85 	/* Strong enough for dedup (without verification)? */
86 	ZCHECKSUM_FLAG_DEDUP = (1 << 3),
87 	/* Uses salt value */
88 	ZCHECKSUM_FLAG_SALTED = (1 << 4),
89 	/* Strong enough for nopwrite? */
90 	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
91 } zio_checksum_flags_t;
92 
93 /*
94  * Information about each checksum function.
95  */
96 typedef struct zio_checksum_info {
97 	/* checksum function for each byteorder */
98 	zio_checksum_t			*ci_func[2];
99 	zio_checksum_tmpl_init_t	*ci_tmpl_init;
100 	zio_checksum_tmpl_free_t	*ci_tmpl_free;
101 	zio_checksum_flags_t		ci_flags;
102 	const char			*ci_name;	/* descriptive name */
103 } zio_checksum_info_t;
104 
105 #include "blkptr.c"
106 
107 #include "fletcher.c"
108 #include "sha256.c"
109 #include "skein_zfs.c"
110 
111 static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
112 	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
113 	{{NULL, NULL}, NULL, NULL, 0, "on"},
114 	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL, 0, "off"},
115 	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
116 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
117 	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
118 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
119 	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
120 	    ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
121 	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
122 	    0, "fletcher2"},
123 	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
124 	    ZCHECKSUM_FLAG_METADATA, "fletcher4"},
125 	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
126 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
127 	    ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
128 	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
129 	    ZCHECKSUM_FLAG_EMBEDDED, "zillog2"},
130 	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL,
131 	    0, "noparity"},
132 	{{zio_checksum_SHA512_native,	zio_checksum_SHA512_byteswap},
133 	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
134 	    ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
135 	{{zio_checksum_skein_native, zio_checksum_skein_byteswap},
136 	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
137 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
138 	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
139 	/* no edonr for now */
140 	{{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA |
141 	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"}
142 };
143 
144 /*
145  * Common signature for all zio compress/decompress functions.
146  */
147 typedef size_t zio_compress_func_t(void *src, void *dst,
148     size_t s_len, size_t d_len, int);
149 typedef int zio_decompress_func_t(void *src, void *dst,
150     size_t s_len, size_t d_len, int);
151 
152 /*
153  * Information about each compression function.
154  */
155 typedef struct zio_compress_info {
156 	zio_compress_func_t	*ci_compress;	/* compression function */
157 	zio_decompress_func_t	*ci_decompress;	/* decompression function */
158 	int			ci_level;	/* level parameter */
159 	const char		*ci_name;	/* algorithm name */
160 } zio_compress_info_t;
161 
162 #include "lzjb.c"
163 #include "zle.c"
164 #include "lz4.c"
165 
166 /*
167  * Compression vectors.
168  */
169 static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
170 	{NULL,			NULL,			0,	"inherit"},
171 	{NULL,			NULL,			0,	"on"},
172 	{NULL,			NULL,			0,	"uncompressed"},
173 	{NULL,			lzjb_decompress,	0,	"lzjb"},
174 	{NULL,			NULL,			0,	"empty"},
175 	{NULL,			NULL,			1,	"gzip-1"},
176 	{NULL,			NULL,			2,	"gzip-2"},
177 	{NULL,			NULL,			3,	"gzip-3"},
178 	{NULL,			NULL,			4,	"gzip-4"},
179 	{NULL,			NULL,			5,	"gzip-5"},
180 	{NULL,			NULL,			6,	"gzip-6"},
181 	{NULL,			NULL,			7,	"gzip-7"},
182 	{NULL,			NULL,			8,	"gzip-8"},
183 	{NULL,			NULL,			9,	"gzip-9"},
184 	{NULL,			zle_decompress,		64,	"zle"},
185 	{NULL,			lz4_decompress,		0,	"lz4"},
186 };
187 
188 static void
189 byteswap_uint64_array(void *vbuf, size_t size)
190 {
191 	uint64_t *buf = vbuf;
192 	size_t count = size >> 3;
193 	int i;
194 
195 	ASSERT((size & 7) == 0);
196 
197 	for (i = 0; i < count; i++)
198 		buf[i] = BSWAP_64(buf[i]);
199 }
200 
201 /*
202  * Set the external verifier for a gang block based on <vdev, offset, txg>,
203  * a tuple which is guaranteed to be unique for the life of the pool.
204  */
205 static void
206 zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
207 {
208 	const dva_t *dva = BP_IDENTITY(bp);
209 	uint64_t txg = BP_PHYSICAL_BIRTH(bp);
210 
211 	ASSERT(BP_IS_GANG(bp));
212 
213 	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
214 }
215 
216 /*
217  * Set the external verifier for a label block based on its offset.
218  * The vdev is implicit, and the txg is unknowable at pool open time --
219  * hence the logic in vdev_uberblock_load() to find the most recent copy.
220  */
221 static void
222 zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
223 {
224 	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
225 }
226 
227 /*
228  * Calls the template init function of a checksum which supports context
229  * templates and installs the template into the spa_t.
230  */
231 static void
232 zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
233 {
234 	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
235 
236 	if (ci->ci_tmpl_init == NULL)
237 		return;
238 
239 	if (spa->spa_cksum_tmpls[checksum] != NULL)
240 		return;
241 
242 	if (spa->spa_cksum_tmpls[checksum] == NULL) {
243 		spa->spa_cksum_tmpls[checksum] =
244 		    ci->ci_tmpl_init(&spa->spa_cksum_salt);
245 	}
246 }
247 
248 /*
249  * Called by a spa_t that's about to be deallocated. This steps through
250  * all of the checksum context templates and deallocates any that were
251  * initialized using the algorithm-specific template init function.
252  */
253 static void __unused
254 zio_checksum_templates_free(spa_t *spa)
255 {
256 	for (enum zio_checksum checksum = 0;
257 	    checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
258 		if (spa->spa_cksum_tmpls[checksum] != NULL) {
259 			zio_checksum_info_t *ci = &zio_checksum_table[checksum];
260 
261 			ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
262 			spa->spa_cksum_tmpls[checksum] = NULL;
263 		}
264 	}
265 }
266 
267 static int
268 zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
269 {
270 	uint64_t size;
271 	unsigned int checksum;
272 	zio_checksum_info_t *ci;
273 	void *ctx = NULL;
274 	zio_cksum_t actual_cksum, expected_cksum, verifier;
275 	int byteswap;
276 
277 	checksum = BP_GET_CHECKSUM(bp);
278 	size = BP_GET_PSIZE(bp);
279 
280 	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
281 		return (EINVAL);
282 	ci = &zio_checksum_table[checksum];
283 	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
284 		return (EINVAL);
285 
286 	if (spa != NULL) {
287 		zio_checksum_template_init(checksum, __DECONST(spa_t *,spa));
288 		ctx = spa->spa_cksum_tmpls[checksum];
289 	}
290 
291 	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
292 		zio_eck_t *eck;
293 
294 		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
295 		    checksum == ZIO_CHECKSUM_LABEL);
296 
297 		eck = (zio_eck_t *)((char *)data + size) - 1;
298 
299 		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
300 			zio_checksum_gang_verifier(&verifier, bp);
301 		else if (checksum == ZIO_CHECKSUM_LABEL)
302 			zio_checksum_label_verifier(&verifier,
303 			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
304 		else
305 			verifier = bp->blk_cksum;
306 
307 		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
308 
309 		if (byteswap)
310 			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
311 
312 		expected_cksum = eck->zec_cksum;
313 		eck->zec_cksum = verifier;
314 		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
315 		eck->zec_cksum = expected_cksum;
316 
317 		if (byteswap)
318 			byteswap_uint64_array(&expected_cksum,
319 			    sizeof (zio_cksum_t));
320 	} else {
321 		expected_cksum = bp->blk_cksum;
322 		ci->ci_func[0](data, size, ctx, &actual_cksum);
323 	}
324 
325 	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
326 		/*printf("ZFS: read checksum %s failed\n", ci->ci_name);*/
327 		return (EIO);
328 	}
329 
330 	return (0);
331 }
332 
333 static int
334 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
335 	void *dest, uint64_t destsize)
336 {
337 	zio_compress_info_t *ci;
338 
339 	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
340 		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
341 		return (EIO);
342 	}
343 
344 	ci = &zio_compress_table[cpfunc];
345 	if (!ci->ci_decompress) {
346 		printf("ZFS: unsupported compression algorithm %s\n",
347 		    ci->ci_name);
348 		return (EIO);
349 	}
350 
351 	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
352 }
353 
354 static uint64_t
355 zap_hash(uint64_t salt, const char *name)
356 {
357 	const uint8_t *cp;
358 	uint8_t c;
359 	uint64_t crc = salt;
360 
361 	ASSERT(crc != 0);
362 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
363 	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
364 		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
365 
366 	/*
367 	 * Only use 28 bits, since we need 4 bits in the cookie for the
368 	 * collision differentiator.  We MUST use the high bits, since
369 	 * those are the onces that we first pay attention to when
370 	 * chosing the bucket.
371 	 */
372 	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
373 
374 	return (crc);
375 }
376 
377 static void *zfs_alloc(size_t size);
378 static void zfs_free(void *ptr, size_t size);
379 
380 typedef struct raidz_col {
381 	uint64_t rc_devidx;		/* child device index for I/O */
382 	uint64_t rc_offset;		/* device offset */
383 	uint64_t rc_size;		/* I/O size */
384 	void *rc_data;			/* I/O data */
385 	int rc_error;			/* I/O error for this device */
386 	uint8_t rc_tried;		/* Did we attempt this I/O column? */
387 	uint8_t rc_skipped;		/* Did we skip this I/O column? */
388 } raidz_col_t;
389 
390 typedef struct raidz_map {
391 	uint64_t rm_cols;		/* Regular column count */
392 	uint64_t rm_scols;		/* Count including skipped columns */
393 	uint64_t rm_bigcols;		/* Number of oversized columns */
394 	uint64_t rm_asize;		/* Actual total I/O size */
395 	uint64_t rm_missingdata;	/* Count of missing data devices */
396 	uint64_t rm_missingparity;	/* Count of missing parity devices */
397 	uint64_t rm_firstdatacol;	/* First data column/parity count */
398 	uint64_t rm_nskip;		/* Skipped sectors for padding */
399 	uint64_t rm_skipstart;		/* Column index of padding start */
400 	uintptr_t rm_reports;		/* # of referencing checksum reports */
401 	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
402 	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
403 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
404 } raidz_map_t;
405 
406 #define	VDEV_RAIDZ_P		0
407 #define	VDEV_RAIDZ_Q		1
408 #define	VDEV_RAIDZ_R		2
409 
410 #define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
411 #define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
412 
413 /*
414  * We provide a mechanism to perform the field multiplication operation on a
415  * 64-bit value all at once rather than a byte at a time. This works by
416  * creating a mask from the top bit in each byte and using that to
417  * conditionally apply the XOR of 0x1d.
418  */
419 #define	VDEV_RAIDZ_64MUL_2(x, mask) \
420 { \
421 	(mask) = (x) & 0x8080808080808080ULL; \
422 	(mask) = ((mask) << 1) - ((mask) >> 7); \
423 	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
424 	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
425 }
426 
427 #define	VDEV_RAIDZ_64MUL_4(x, mask) \
428 { \
429 	VDEV_RAIDZ_64MUL_2((x), mask); \
430 	VDEV_RAIDZ_64MUL_2((x), mask); \
431 }
432 
433 /*
434  * These two tables represent powers and logs of 2 in the Galois field defined
435  * above. These values were computed by repeatedly multiplying by 2 as above.
436  */
437 static const uint8_t vdev_raidz_pow2[256] = {
438 	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
439 	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
440 	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
441 	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
442 	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
443 	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
444 	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
445 	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
446 	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
447 	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
448 	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
449 	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
450 	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
451 	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
452 	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
453 	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
454 	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
455 	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
456 	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
457 	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
458 	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
459 	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
460 	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
461 	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
462 	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
463 	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
464 	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
465 	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
466 	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
467 	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
468 	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
469 	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
470 };
471 static const uint8_t vdev_raidz_log2[256] = {
472 	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
473 	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
474 	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
475 	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
476 	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
477 	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
478 	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
479 	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
480 	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
481 	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
482 	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
483 	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
484 	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
485 	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
486 	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
487 	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
488 	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
489 	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
490 	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
491 	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
492 	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
493 	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
494 	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
495 	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
496 	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
497 	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
498 	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
499 	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
500 	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
501 	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
502 	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
503 	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
504 };
505 
506 /*
507  * Multiply a given number by 2 raised to the given power.
508  */
509 static uint8_t
510 vdev_raidz_exp2(uint8_t a, int exp)
511 {
512 	if (a == 0)
513 		return (0);
514 
515 	ASSERT(exp >= 0);
516 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
517 
518 	exp += vdev_raidz_log2[a];
519 	if (exp > 255)
520 		exp -= 255;
521 
522 	return (vdev_raidz_pow2[exp]);
523 }
524 
525 static void
526 vdev_raidz_generate_parity_p(raidz_map_t *rm)
527 {
528 	uint64_t *p, *src, pcount, ccount, i;
529 	int c;
530 
531 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
532 
533 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
534 		src = rm->rm_col[c].rc_data;
535 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
536 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
537 
538 		if (c == rm->rm_firstdatacol) {
539 			ASSERT(ccount == pcount);
540 			for (i = 0; i < ccount; i++, src++, p++) {
541 				*p = *src;
542 			}
543 		} else {
544 			ASSERT(ccount <= pcount);
545 			for (i = 0; i < ccount; i++, src++, p++) {
546 				*p ^= *src;
547 			}
548 		}
549 	}
550 }
551 
552 static void
553 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
554 {
555 	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
556 	int c;
557 
558 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
559 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
560 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
561 
562 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
563 		src = rm->rm_col[c].rc_data;
564 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
565 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
566 
567 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
568 
569 		if (c == rm->rm_firstdatacol) {
570 			ASSERT(ccnt == pcnt || ccnt == 0);
571 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
572 				*p = *src;
573 				*q = *src;
574 			}
575 			for (; i < pcnt; i++, src++, p++, q++) {
576 				*p = 0;
577 				*q = 0;
578 			}
579 		} else {
580 			ASSERT(ccnt <= pcnt);
581 
582 			/*
583 			 * Apply the algorithm described above by multiplying
584 			 * the previous result and adding in the new value.
585 			 */
586 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
587 				*p ^= *src;
588 
589 				VDEV_RAIDZ_64MUL_2(*q, mask);
590 				*q ^= *src;
591 			}
592 
593 			/*
594 			 * Treat short columns as though they are full of 0s.
595 			 * Note that there's therefore nothing needed for P.
596 			 */
597 			for (; i < pcnt; i++, q++) {
598 				VDEV_RAIDZ_64MUL_2(*q, mask);
599 			}
600 		}
601 	}
602 }
603 
604 static void
605 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
606 {
607 	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
608 	int c;
609 
610 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
611 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
612 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
613 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
614 	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
615 
616 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
617 		src = rm->rm_col[c].rc_data;
618 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
619 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
620 		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
621 
622 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
623 
624 		if (c == rm->rm_firstdatacol) {
625 			ASSERT(ccnt == pcnt || ccnt == 0);
626 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
627 				*p = *src;
628 				*q = *src;
629 				*r = *src;
630 			}
631 			for (; i < pcnt; i++, src++, p++, q++, r++) {
632 				*p = 0;
633 				*q = 0;
634 				*r = 0;
635 			}
636 		} else {
637 			ASSERT(ccnt <= pcnt);
638 
639 			/*
640 			 * Apply the algorithm described above by multiplying
641 			 * the previous result and adding in the new value.
642 			 */
643 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
644 				*p ^= *src;
645 
646 				VDEV_RAIDZ_64MUL_2(*q, mask);
647 				*q ^= *src;
648 
649 				VDEV_RAIDZ_64MUL_4(*r, mask);
650 				*r ^= *src;
651 			}
652 
653 			/*
654 			 * Treat short columns as though they are full of 0s.
655 			 * Note that there's therefore nothing needed for P.
656 			 */
657 			for (; i < pcnt; i++, q++, r++) {
658 				VDEV_RAIDZ_64MUL_2(*q, mask);
659 				VDEV_RAIDZ_64MUL_4(*r, mask);
660 			}
661 		}
662 	}
663 }
664 
665 /*
666  * Generate RAID parity in the first virtual columns according to the number of
667  * parity columns available.
668  */
669 static void
670 vdev_raidz_generate_parity(raidz_map_t *rm)
671 {
672 	switch (rm->rm_firstdatacol) {
673 	case 1:
674 		vdev_raidz_generate_parity_p(rm);
675 		break;
676 	case 2:
677 		vdev_raidz_generate_parity_pq(rm);
678 		break;
679 	case 3:
680 		vdev_raidz_generate_parity_pqr(rm);
681 		break;
682 	default:
683 		panic("invalid RAID-Z configuration");
684 	}
685 }
686 
687 /* BEGIN CSTYLED */
688 /*
689  * In the general case of reconstruction, we must solve the system of linear
690  * equations defined by the coeffecients used to generate parity as well as
691  * the contents of the data and parity disks. This can be expressed with
692  * vectors for the original data (D) and the actual data (d) and parity (p)
693  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
694  *
695  *            __   __                     __     __
696  *            |     |         __     __   |  p_0  |
697  *            |  V  |         |  D_0  |   | p_m-1 |
698  *            |     |    x    |   :   | = |  d_0  |
699  *            |  I  |         | D_n-1 |   |   :   |
700  *            |     |         ~~     ~~   | d_n-1 |
701  *            ~~   ~~                     ~~     ~~
702  *
703  * I is simply a square identity matrix of size n, and V is a vandermonde
704  * matrix defined by the coeffecients we chose for the various parity columns
705  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
706  * computation as well as linear separability.
707  *
708  *      __               __               __     __
709  *      |   1   ..  1 1 1 |               |  p_0  |
710  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
711  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
712  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
713  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
714  *      |   :       : : : |   |   :   |   |  d_2  |
715  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
716  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
717  *      |   0   ..  0 0 1 |               | d_n-1 |
718  *      ~~               ~~               ~~     ~~
719  *
720  * Note that I, V, d, and p are known. To compute D, we must invert the
721  * matrix and use the known data and parity values to reconstruct the unknown
722  * data values. We begin by removing the rows in V|I and d|p that correspond
723  * to failed or missing columns; we then make V|I square (n x n) and d|p
724  * sized n by removing rows corresponding to unused parity from the bottom up
725  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
726  * using Gauss-Jordan elimination. In the example below we use m=3 parity
727  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
728  *           __                               __
729  *           |  1   1   1   1   1   1   1   1  |
730  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
731  *           |  19 205 116  29  64  16  4   1  |      / /
732  *           |  1   0   0   0   0   0   0   0  |     / /
733  *           |  0   1   0   0   0   0   0   0  | <--' /
734  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
735  *           |  0   0   0   1   0   0   0   0  |
736  *           |  0   0   0   0   1   0   0   0  |
737  *           |  0   0   0   0   0   1   0   0  |
738  *           |  0   0   0   0   0   0   1   0  |
739  *           |  0   0   0   0   0   0   0   1  |
740  *           ~~                               ~~
741  *           __                               __
742  *           |  1   1   1   1   1   1   1   1  |
743  *           | 128  64  32  16  8   4   2   1  |
744  *           |  19 205 116  29  64  16  4   1  |
745  *           |  1   0   0   0   0   0   0   0  |
746  *           |  0   1   0   0   0   0   0   0  |
747  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
748  *           |  0   0   0   1   0   0   0   0  |
749  *           |  0   0   0   0   1   0   0   0  |
750  *           |  0   0   0   0   0   1   0   0  |
751  *           |  0   0   0   0   0   0   1   0  |
752  *           |  0   0   0   0   0   0   0   1  |
753  *           ~~                               ~~
754  *
755  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
756  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
757  * matrix is not singular.
758  * __                                                                 __
759  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
760  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
761  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
762  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
763  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
764  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
765  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
766  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
767  * ~~                                                                 ~~
768  * __                                                                 __
769  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
770  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
771  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
772  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
773  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
774  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
775  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
776  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
777  * ~~                                                                 ~~
778  * __                                                                 __
779  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
780  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
781  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
782  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
783  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
784  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
785  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
786  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
787  * ~~                                                                 ~~
788  * __                                                                 __
789  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
790  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
791  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
792  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
793  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
794  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
795  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
796  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
797  * ~~                                                                 ~~
798  * __                                                                 __
799  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
800  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
801  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
802  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
803  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
804  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
805  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
806  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
807  * ~~                                                                 ~~
808  * __                                                                 __
809  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
810  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
811  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
812  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
813  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
814  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
815  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
816  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
817  * ~~                                                                 ~~
818  *                   __                               __
819  *                   |  0   0   1   0   0   0   0   0  |
820  *                   | 167 100  5   41 159 169 217 208 |
821  *                   | 166 100  4   40 158 168 216 209 |
822  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
823  *                   |  0   0   0   0   1   0   0   0  |
824  *                   |  0   0   0   0   0   1   0   0  |
825  *                   |  0   0   0   0   0   0   1   0  |
826  *                   |  0   0   0   0   0   0   0   1  |
827  *                   ~~                               ~~
828  *
829  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
830  * of the missing data.
831  *
832  * As is apparent from the example above, the only non-trivial rows in the
833  * inverse matrix correspond to the data disks that we're trying to
834  * reconstruct. Indeed, those are the only rows we need as the others would
835  * only be useful for reconstructing data known or assumed to be valid. For
836  * that reason, we only build the coefficients in the rows that correspond to
837  * targeted columns.
838  */
839 /* END CSTYLED */
840 
841 static void
842 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
843     uint8_t **rows)
844 {
845 	int i, j;
846 	int pow;
847 
848 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
849 
850 	/*
851 	 * Fill in the missing rows of interest.
852 	 */
853 	for (i = 0; i < nmap; i++) {
854 		ASSERT3S(0, <=, map[i]);
855 		ASSERT3S(map[i], <=, 2);
856 
857 		pow = map[i] * n;
858 		if (pow > 255)
859 			pow -= 255;
860 		ASSERT(pow <= 255);
861 
862 		for (j = 0; j < n; j++) {
863 			pow -= map[i];
864 			if (pow < 0)
865 				pow += 255;
866 			rows[i][j] = vdev_raidz_pow2[pow];
867 		}
868 	}
869 }
870 
871 static void
872 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
873     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
874 {
875 	int i, j, ii, jj;
876 	uint8_t log;
877 
878 	/*
879 	 * Assert that the first nmissing entries from the array of used
880 	 * columns correspond to parity columns and that subsequent entries
881 	 * correspond to data columns.
882 	 */
883 	for (i = 0; i < nmissing; i++) {
884 		ASSERT3S(used[i], <, rm->rm_firstdatacol);
885 	}
886 	for (; i < n; i++) {
887 		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
888 	}
889 
890 	/*
891 	 * First initialize the storage where we'll compute the inverse rows.
892 	 */
893 	for (i = 0; i < nmissing; i++) {
894 		for (j = 0; j < n; j++) {
895 			invrows[i][j] = (i == j) ? 1 : 0;
896 		}
897 	}
898 
899 	/*
900 	 * Subtract all trivial rows from the rows of consequence.
901 	 */
902 	for (i = 0; i < nmissing; i++) {
903 		for (j = nmissing; j < n; j++) {
904 			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
905 			jj = used[j] - rm->rm_firstdatacol;
906 			ASSERT3S(jj, <, n);
907 			invrows[i][j] = rows[i][jj];
908 			rows[i][jj] = 0;
909 		}
910 	}
911 
912 	/*
913 	 * For each of the rows of interest, we must normalize it and subtract
914 	 * a multiple of it from the other rows.
915 	 */
916 	for (i = 0; i < nmissing; i++) {
917 		for (j = 0; j < missing[i]; j++) {
918 			ASSERT3U(rows[i][j], ==, 0);
919 		}
920 		ASSERT3U(rows[i][missing[i]], !=, 0);
921 
922 		/*
923 		 * Compute the inverse of the first element and multiply each
924 		 * element in the row by that value.
925 		 */
926 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
927 
928 		for (j = 0; j < n; j++) {
929 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
930 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
931 		}
932 
933 		for (ii = 0; ii < nmissing; ii++) {
934 			if (i == ii)
935 				continue;
936 
937 			ASSERT3U(rows[ii][missing[i]], !=, 0);
938 
939 			log = vdev_raidz_log2[rows[ii][missing[i]]];
940 
941 			for (j = 0; j < n; j++) {
942 				rows[ii][j] ^=
943 				    vdev_raidz_exp2(rows[i][j], log);
944 				invrows[ii][j] ^=
945 				    vdev_raidz_exp2(invrows[i][j], log);
946 			}
947 		}
948 	}
949 
950 	/*
951 	 * Verify that the data that is left in the rows are properly part of
952 	 * an identity matrix.
953 	 */
954 	for (i = 0; i < nmissing; i++) {
955 		for (j = 0; j < n; j++) {
956 			if (j == missing[i]) {
957 				ASSERT3U(rows[i][j], ==, 1);
958 			} else {
959 				ASSERT3U(rows[i][j], ==, 0);
960 			}
961 		}
962 	}
963 }
964 
965 static void
966 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
967     int *missing, uint8_t **invrows, const uint8_t *used)
968 {
969 	int i, j, x, cc, c;
970 	uint8_t *src;
971 	uint64_t ccount;
972 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
973 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
974 	uint8_t log, val;
975 	int ll;
976 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
977 	uint8_t *p, *pp;
978 	size_t psize;
979 
980 	log = 0;	/* gcc */
981 	psize = sizeof (invlog[0][0]) * n * nmissing;
982 	p = zfs_alloc(psize);
983 
984 	for (pp = p, i = 0; i < nmissing; i++) {
985 		invlog[i] = pp;
986 		pp += n;
987 	}
988 
989 	for (i = 0; i < nmissing; i++) {
990 		for (j = 0; j < n; j++) {
991 			ASSERT3U(invrows[i][j], !=, 0);
992 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
993 		}
994 	}
995 
996 	for (i = 0; i < n; i++) {
997 		c = used[i];
998 		ASSERT3U(c, <, rm->rm_cols);
999 
1000 		src = rm->rm_col[c].rc_data;
1001 		ccount = rm->rm_col[c].rc_size;
1002 		for (j = 0; j < nmissing; j++) {
1003 			cc = missing[j] + rm->rm_firstdatacol;
1004 			ASSERT3U(cc, >=, rm->rm_firstdatacol);
1005 			ASSERT3U(cc, <, rm->rm_cols);
1006 			ASSERT3U(cc, !=, c);
1007 
1008 			dst[j] = rm->rm_col[cc].rc_data;
1009 			dcount[j] = rm->rm_col[cc].rc_size;
1010 		}
1011 
1012 		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1013 
1014 		for (x = 0; x < ccount; x++, src++) {
1015 			if (*src != 0)
1016 				log = vdev_raidz_log2[*src];
1017 
1018 			for (cc = 0; cc < nmissing; cc++) {
1019 				if (x >= dcount[cc])
1020 					continue;
1021 
1022 				if (*src == 0) {
1023 					val = 0;
1024 				} else {
1025 					if ((ll = log + invlog[cc][i]) >= 255)
1026 						ll -= 255;
1027 					val = vdev_raidz_pow2[ll];
1028 				}
1029 
1030 				if (i == 0)
1031 					dst[cc][x] = val;
1032 				else
1033 					dst[cc][x] ^= val;
1034 			}
1035 		}
1036 	}
1037 
1038 	zfs_free(p, psize);
1039 }
1040 
1041 static int
1042 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1043 {
1044 	int n, i, c, t, tt;
1045 	int nmissing_rows;
1046 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1047 	int parity_map[VDEV_RAIDZ_MAXPARITY];
1048 
1049 	uint8_t *p, *pp;
1050 	size_t psize;
1051 
1052 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1053 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1054 	uint8_t *used;
1055 
1056 	int code = 0;
1057 
1058 
1059 	n = rm->rm_cols - rm->rm_firstdatacol;
1060 
1061 	/*
1062 	 * Figure out which data columns are missing.
1063 	 */
1064 	nmissing_rows = 0;
1065 	for (t = 0; t < ntgts; t++) {
1066 		if (tgts[t] >= rm->rm_firstdatacol) {
1067 			missing_rows[nmissing_rows++] =
1068 			    tgts[t] - rm->rm_firstdatacol;
1069 		}
1070 	}
1071 
1072 	/*
1073 	 * Figure out which parity columns to use to help generate the missing
1074 	 * data columns.
1075 	 */
1076 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1077 		ASSERT(tt < ntgts);
1078 		ASSERT(c < rm->rm_firstdatacol);
1079 
1080 		/*
1081 		 * Skip any targeted parity columns.
1082 		 */
1083 		if (c == tgts[tt]) {
1084 			tt++;
1085 			continue;
1086 		}
1087 
1088 		code |= 1 << c;
1089 
1090 		parity_map[i] = c;
1091 		i++;
1092 	}
1093 
1094 	ASSERT(code != 0);
1095 	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1096 
1097 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1098 	    nmissing_rows * n + sizeof (used[0]) * n;
1099 	p = kmem_alloc(psize, KM_SLEEP);
1100 
1101 	for (pp = p, i = 0; i < nmissing_rows; i++) {
1102 		rows[i] = pp;
1103 		pp += n;
1104 		invrows[i] = pp;
1105 		pp += n;
1106 	}
1107 	used = pp;
1108 
1109 	for (i = 0; i < nmissing_rows; i++) {
1110 		used[i] = parity_map[i];
1111 	}
1112 
1113 	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1114 		if (tt < nmissing_rows &&
1115 		    c == missing_rows[tt] + rm->rm_firstdatacol) {
1116 			tt++;
1117 			continue;
1118 		}
1119 
1120 		ASSERT3S(i, <, n);
1121 		used[i] = c;
1122 		i++;
1123 	}
1124 
1125 	/*
1126 	 * Initialize the interesting rows of the matrix.
1127 	 */
1128 	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1129 
1130 	/*
1131 	 * Invert the matrix.
1132 	 */
1133 	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1134 	    invrows, used);
1135 
1136 	/*
1137 	 * Reconstruct the missing data using the generated matrix.
1138 	 */
1139 	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1140 	    invrows, used);
1141 
1142 	kmem_free(p, psize);
1143 
1144 	return (code);
1145 }
1146 
1147 static int
1148 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1149 {
1150 	int tgts[VDEV_RAIDZ_MAXPARITY];
1151 	int ntgts;
1152 	int i, c;
1153 	int code;
1154 	int nbadparity, nbaddata;
1155 
1156 	/*
1157 	 * The tgts list must already be sorted.
1158 	 */
1159 	for (i = 1; i < nt; i++) {
1160 		ASSERT(t[i] > t[i - 1]);
1161 	}
1162 
1163 	nbadparity = rm->rm_firstdatacol;
1164 	nbaddata = rm->rm_cols - nbadparity;
1165 	ntgts = 0;
1166 	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1167 		if (i < nt && c == t[i]) {
1168 			tgts[ntgts++] = c;
1169 			i++;
1170 		} else if (rm->rm_col[c].rc_error != 0) {
1171 			tgts[ntgts++] = c;
1172 		} else if (c >= rm->rm_firstdatacol) {
1173 			nbaddata--;
1174 		} else {
1175 			nbadparity--;
1176 		}
1177 	}
1178 
1179 	ASSERT(ntgts >= nt);
1180 	ASSERT(nbaddata >= 0);
1181 	ASSERT(nbaddata + nbadparity == ntgts);
1182 
1183 	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1184 	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1185 	ASSERT(code > 0);
1186 	return (code);
1187 }
1188 
1189 static raidz_map_t *
1190 vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
1191     uint64_t dcols, uint64_t nparity)
1192 {
1193 	raidz_map_t *rm;
1194 	uint64_t b = offset >> unit_shift;
1195 	uint64_t s = size >> unit_shift;
1196 	uint64_t f = b % dcols;
1197 	uint64_t o = (b / dcols) << unit_shift;
1198 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
1199 
1200 	q = s / (dcols - nparity);
1201 	r = s - q * (dcols - nparity);
1202 	bc = (r == 0 ? 0 : r + nparity);
1203 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
1204 
1205 	if (q == 0) {
1206 		acols = bc;
1207 		scols = MIN(dcols, roundup(bc, nparity + 1));
1208 	} else {
1209 		acols = dcols;
1210 		scols = dcols;
1211 	}
1212 
1213 	ASSERT3U(acols, <=, scols);
1214 
1215 	rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));
1216 
1217 	rm->rm_cols = acols;
1218 	rm->rm_scols = scols;
1219 	rm->rm_bigcols = bc;
1220 	rm->rm_skipstart = bc;
1221 	rm->rm_missingdata = 0;
1222 	rm->rm_missingparity = 0;
1223 	rm->rm_firstdatacol = nparity;
1224 	rm->rm_reports = 0;
1225 	rm->rm_freed = 0;
1226 	rm->rm_ecksuminjected = 0;
1227 
1228 	asize = 0;
1229 
1230 	for (c = 0; c < scols; c++) {
1231 		col = f + c;
1232 		coff = o;
1233 		if (col >= dcols) {
1234 			col -= dcols;
1235 			coff += 1ULL << unit_shift;
1236 		}
1237 		rm->rm_col[c].rc_devidx = col;
1238 		rm->rm_col[c].rc_offset = coff;
1239 		rm->rm_col[c].rc_data = NULL;
1240 		rm->rm_col[c].rc_error = 0;
1241 		rm->rm_col[c].rc_tried = 0;
1242 		rm->rm_col[c].rc_skipped = 0;
1243 
1244 		if (c >= acols)
1245 			rm->rm_col[c].rc_size = 0;
1246 		else if (c < bc)
1247 			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
1248 		else
1249 			rm->rm_col[c].rc_size = q << unit_shift;
1250 
1251 		asize += rm->rm_col[c].rc_size;
1252 	}
1253 
1254 	ASSERT3U(asize, ==, tot << unit_shift);
1255 	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
1256 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
1257 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
1258 	ASSERT3U(rm->rm_nskip, <=, nparity);
1259 
1260 	for (c = 0; c < rm->rm_firstdatacol; c++)
1261 		rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);
1262 
1263 	rm->rm_col[c].rc_data = data;
1264 
1265 	for (c = c + 1; c < acols; c++)
1266 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
1267 		    rm->rm_col[c - 1].rc_size;
1268 
1269 	/*
1270 	 * If all data stored spans all columns, there's a danger that parity
1271 	 * will always be on the same device and, since parity isn't read
1272 	 * during normal operation, that that device's I/O bandwidth won't be
1273 	 * used effectively. We therefore switch the parity every 1MB.
1274 	 *
1275 	 * ... at least that was, ostensibly, the theory. As a practical
1276 	 * matter unless we juggle the parity between all devices evenly, we
1277 	 * won't see any benefit. Further, occasional writes that aren't a
1278 	 * multiple of the LCM of the number of children and the minimum
1279 	 * stripe width are sufficient to avoid pessimal behavior.
1280 	 * Unfortunately, this decision created an implicit on-disk format
1281 	 * requirement that we need to support for all eternity, but only
1282 	 * for single-parity RAID-Z.
1283 	 *
1284 	 * If we intend to skip a sector in the zeroth column for padding
1285 	 * we must make sure to note this swap. We will never intend to
1286 	 * skip the first column since at least one data and one parity
1287 	 * column must appear in each row.
1288 	 */
1289 	ASSERT(rm->rm_cols >= 2);
1290 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
1291 
1292 	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
1293 		devidx = rm->rm_col[0].rc_devidx;
1294 		o = rm->rm_col[0].rc_offset;
1295 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
1296 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
1297 		rm->rm_col[1].rc_devidx = devidx;
1298 		rm->rm_col[1].rc_offset = o;
1299 
1300 		if (rm->rm_skipstart == 0)
1301 			rm->rm_skipstart = 1;
1302 	}
1303 
1304 	return (rm);
1305 }
1306 
1307 static void
1308 vdev_raidz_map_free(raidz_map_t *rm)
1309 {
1310 	int c;
1311 
1312 	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
1313 		zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
1314 
1315 	zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
1316 }
1317 
1318 static vdev_t *
1319 vdev_child(vdev_t *pvd, uint64_t devidx)
1320 {
1321 	vdev_t *cvd;
1322 
1323 	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
1324 		if (cvd->v_id == devidx)
1325 			break;
1326 	}
1327 
1328 	return (cvd);
1329 }
1330 
1331 /*
1332  * We keep track of whether or not there were any injected errors, so that
1333  * any ereports we generate can note it.
1334  */
1335 static int
1336 raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
1337     uint64_t size)
1338 {
1339 	return (zio_checksum_verify(spa, bp, data));
1340 }
1341 
1342 /*
1343  * Generate the parity from the data columns. If we tried and were able to
1344  * read the parity without error, verify that the generated parity matches the
1345  * data we read. If it doesn't, we fire off a checksum error. Return the
1346  * number such failures.
1347  */
1348 static int
1349 raidz_parity_verify(raidz_map_t *rm)
1350 {
1351 	void *orig[VDEV_RAIDZ_MAXPARITY];
1352 	int c, ret = 0;
1353 	raidz_col_t *rc;
1354 
1355 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1356 		rc = &rm->rm_col[c];
1357 		if (!rc->rc_tried || rc->rc_error != 0)
1358 			continue;
1359 		orig[c] = zfs_alloc(rc->rc_size);
1360 		bcopy(rc->rc_data, orig[c], rc->rc_size);
1361 	}
1362 
1363 	vdev_raidz_generate_parity(rm);
1364 
1365 	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
1366 		rc = &rm->rm_col[c];
1367 		if (!rc->rc_tried || rc->rc_error != 0)
1368 			continue;
1369 		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1370 			rc->rc_error = ECKSUM;
1371 			ret++;
1372 		}
1373 		zfs_free(orig[c], rc->rc_size);
1374 	}
1375 
1376 	return (ret);
1377 }
1378 
1379 /*
1380  * Iterate over all combinations of bad data and attempt a reconstruction.
1381  * Note that the algorithm below is non-optimal because it doesn't take into
1382  * account how reconstruction is actually performed. For example, with
1383  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1384  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1385  * cases we'd only use parity information in column 0.
1386  */
1387 static int
1388 vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
1389     void *data, off_t offset, uint64_t bytes, int total_errors, int data_errors)
1390 {
1391 	raidz_col_t *rc;
1392 	void *orig[VDEV_RAIDZ_MAXPARITY];
1393 	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1394 	int *tgts = &tstore[1];
1395 	int current, next, i, c, n;
1396 	int code, ret = 0;
1397 
1398 	ASSERT(total_errors < rm->rm_firstdatacol);
1399 
1400 	/*
1401 	 * This simplifies one edge condition.
1402 	 */
1403 	tgts[-1] = -1;
1404 
1405 	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1406 		/*
1407 		 * Initialize the targets array by finding the first n columns
1408 		 * that contain no error.
1409 		 *
1410 		 * If there were no data errors, we need to ensure that we're
1411 		 * always explicitly attempting to reconstruct at least one
1412 		 * data column. To do this, we simply push the highest target
1413 		 * up into the data columns.
1414 		 */
1415 		for (c = 0, i = 0; i < n; i++) {
1416 			if (i == n - 1 && data_errors == 0 &&
1417 			    c < rm->rm_firstdatacol) {
1418 				c = rm->rm_firstdatacol;
1419 			}
1420 
1421 			while (rm->rm_col[c].rc_error != 0) {
1422 				c++;
1423 				ASSERT3S(c, <, rm->rm_cols);
1424 			}
1425 
1426 			tgts[i] = c++;
1427 		}
1428 
1429 		/*
1430 		 * Setting tgts[n] simplifies the other edge condition.
1431 		 */
1432 		tgts[n] = rm->rm_cols;
1433 
1434 		/*
1435 		 * These buffers were allocated in previous iterations.
1436 		 */
1437 		for (i = 0; i < n - 1; i++) {
1438 			ASSERT(orig[i] != NULL);
1439 		}
1440 
1441 		orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);
1442 
1443 		current = 0;
1444 		next = tgts[current];
1445 
1446 		while (current != n) {
1447 			tgts[current] = next;
1448 			current = 0;
1449 
1450 			/*
1451 			 * Save off the original data that we're going to
1452 			 * attempt to reconstruct.
1453 			 */
1454 			for (i = 0; i < n; i++) {
1455 				ASSERT(orig[i] != NULL);
1456 				c = tgts[i];
1457 				ASSERT3S(c, >=, 0);
1458 				ASSERT3S(c, <, rm->rm_cols);
1459 				rc = &rm->rm_col[c];
1460 				bcopy(rc->rc_data, orig[i], rc->rc_size);
1461 			}
1462 
1463 			/*
1464 			 * Attempt a reconstruction and exit the outer loop on
1465 			 * success.
1466 			 */
1467 			code = vdev_raidz_reconstruct(rm, tgts, n);
1468 			if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
1469 				for (i = 0; i < n; i++) {
1470 					c = tgts[i];
1471 					rc = &rm->rm_col[c];
1472 					ASSERT(rc->rc_error == 0);
1473 					rc->rc_error = ECKSUM;
1474 				}
1475 
1476 				ret = code;
1477 				goto done;
1478 			}
1479 
1480 			/*
1481 			 * Restore the original data.
1482 			 */
1483 			for (i = 0; i < n; i++) {
1484 				c = tgts[i];
1485 				rc = &rm->rm_col[c];
1486 				bcopy(orig[i], rc->rc_data, rc->rc_size);
1487 			}
1488 
1489 			do {
1490 				/*
1491 				 * Find the next valid column after the current
1492 				 * position..
1493 				 */
1494 				for (next = tgts[current] + 1;
1495 				    next < rm->rm_cols &&
1496 				    rm->rm_col[next].rc_error != 0; next++)
1497 					continue;
1498 
1499 				ASSERT(next <= tgts[current + 1]);
1500 
1501 				/*
1502 				 * If that spot is available, we're done here.
1503 				 */
1504 				if (next != tgts[current + 1])
1505 					break;
1506 
1507 				/*
1508 				 * Otherwise, find the next valid column after
1509 				 * the previous position.
1510 				 */
1511 				for (c = tgts[current - 1] + 1;
1512 				    rm->rm_col[c].rc_error != 0; c++)
1513 					continue;
1514 
1515 				tgts[current] = c;
1516 				current++;
1517 
1518 			} while (current != n);
1519 		}
1520 	}
1521 	n--;
1522 done:
1523 	for (i = n - 1; i >= 0; i--) {
1524 		zfs_free(orig[i], rm->rm_col[0].rc_size);
1525 	}
1526 
1527 	return (ret);
1528 }
1529 
1530 static int
1531 vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
1532     off_t offset, size_t bytes)
1533 {
1534 	vdev_t *tvd = vd->v_top;
1535 	vdev_t *cvd;
1536 	raidz_map_t *rm;
1537 	raidz_col_t *rc;
1538 	int c, error;
1539 	int unexpected_errors;
1540 	int parity_errors;
1541 	int parity_untried;
1542 	int data_errors;
1543 	int total_errors;
1544 	int n;
1545 	int tgts[VDEV_RAIDZ_MAXPARITY];
1546 	int code;
1547 
1548 	rc = NULL;	/* gcc */
1549 	error = 0;
1550 
1551 	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
1552 	    vd->v_nchildren, vd->v_nparity);
1553 
1554 	/*
1555 	 * Iterate over the columns in reverse order so that we hit the parity
1556 	 * last -- any errors along the way will force us to read the parity.
1557 	 */
1558 	for (c = rm->rm_cols - 1; c >= 0; c--) {
1559 		rc = &rm->rm_col[c];
1560 		cvd = vdev_child(vd, rc->rc_devidx);
1561 		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
1562 			if (c >= rm->rm_firstdatacol)
1563 				rm->rm_missingdata++;
1564 			else
1565 				rm->rm_missingparity++;
1566 			rc->rc_error = ENXIO;
1567 			rc->rc_tried = 1;	/* don't even try */
1568 			rc->rc_skipped = 1;
1569 			continue;
1570 		}
1571 #if 0		/* XXX: Too hard for the boot code. */
1572 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1573 			if (c >= rm->rm_firstdatacol)
1574 				rm->rm_missingdata++;
1575 			else
1576 				rm->rm_missingparity++;
1577 			rc->rc_error = ESTALE;
1578 			rc->rc_skipped = 1;
1579 			continue;
1580 		}
1581 #endif
1582 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
1583 			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
1584 			    rc->rc_offset, rc->rc_size);
1585 			rc->rc_tried = 1;
1586 			rc->rc_skipped = 0;
1587 		}
1588 	}
1589 
1590 reconstruct:
1591 	unexpected_errors = 0;
1592 	parity_errors = 0;
1593 	parity_untried = 0;
1594 	data_errors = 0;
1595 	total_errors = 0;
1596 
1597 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1598 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1599 
1600 	for (c = 0; c < rm->rm_cols; c++) {
1601 		rc = &rm->rm_col[c];
1602 
1603 		if (rc->rc_error) {
1604 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
1605 
1606 			if (c < rm->rm_firstdatacol)
1607 				parity_errors++;
1608 			else
1609 				data_errors++;
1610 
1611 			if (!rc->rc_skipped)
1612 				unexpected_errors++;
1613 
1614 			total_errors++;
1615 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1616 			parity_untried++;
1617 		}
1618 	}
1619 
1620 	/*
1621 	 * There are three potential phases for a read:
1622 	 *	1. produce valid data from the columns read
1623 	 *	2. read all disks and try again
1624 	 *	3. perform combinatorial reconstruction
1625 	 *
1626 	 * Each phase is progressively both more expensive and less likely to
1627 	 * occur. If we encounter more errors than we can repair or all phases
1628 	 * fail, we have no choice but to return an error.
1629 	 */
1630 
1631 	/*
1632 	 * If the number of errors we saw was correctable -- less than or equal
1633 	 * to the number of parity disks read -- attempt to produce data that
1634 	 * has a valid checksum. Naturally, this case applies in the absence of
1635 	 * any errors.
1636 	 */
1637 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1638 		if (data_errors == 0) {
1639 			if (raidz_checksum_verify(vd->spa, bp, data, bytes) == 0) {
1640 				/*
1641 				 * If we read parity information (unnecessarily
1642 				 * as it happens since no reconstruction was
1643 				 * needed) regenerate and verify the parity.
1644 				 * We also regenerate parity when resilvering
1645 				 * so we can write it out to the failed device
1646 				 * later.
1647 				 */
1648 				if (parity_errors + parity_untried <
1649 				    rm->rm_firstdatacol) {
1650 					n = raidz_parity_verify(rm);
1651 					unexpected_errors += n;
1652 					ASSERT(parity_errors + n <=
1653 					    rm->rm_firstdatacol);
1654 				}
1655 				goto done;
1656 			}
1657 		} else {
1658 			/*
1659 			 * We either attempt to read all the parity columns or
1660 			 * none of them. If we didn't try to read parity, we
1661 			 * wouldn't be here in the correctable case. There must
1662 			 * also have been fewer parity errors than parity
1663 			 * columns or, again, we wouldn't be in this code path.
1664 			 */
1665 			ASSERT(parity_untried == 0);
1666 			ASSERT(parity_errors < rm->rm_firstdatacol);
1667 
1668 			/*
1669 			 * Identify the data columns that reported an error.
1670 			 */
1671 			n = 0;
1672 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1673 				rc = &rm->rm_col[c];
1674 				if (rc->rc_error != 0) {
1675 					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1676 					tgts[n++] = c;
1677 				}
1678 			}
1679 
1680 			ASSERT(rm->rm_firstdatacol >= n);
1681 
1682 			code = vdev_raidz_reconstruct(rm, tgts, n);
1683 
1684 			if (raidz_checksum_verify(vd->spa, bp, data, bytes) == 0) {
1685 				/*
1686 				 * If we read more parity disks than were used
1687 				 * for reconstruction, confirm that the other
1688 				 * parity disks produced correct data. This
1689 				 * routine is suboptimal in that it regenerates
1690 				 * the parity that we already used in addition
1691 				 * to the parity that we're attempting to
1692 				 * verify, but this should be a relatively
1693 				 * uncommon case, and can be optimized if it
1694 				 * becomes a problem. Note that we regenerate
1695 				 * parity when resilvering so we can write it
1696 				 * out to failed devices later.
1697 				 */
1698 				if (parity_errors < rm->rm_firstdatacol - n) {
1699 					n = raidz_parity_verify(rm);
1700 					unexpected_errors += n;
1701 					ASSERT(parity_errors + n <=
1702 					    rm->rm_firstdatacol);
1703 				}
1704 
1705 				goto done;
1706 			}
1707 		}
1708 	}
1709 
1710 	/*
1711 	 * This isn't a typical situation -- either we got a read
1712 	 * error or a child silently returned bad data. Read every
1713 	 * block so we can try again with as much data and parity as
1714 	 * we can track down. If we've already been through once
1715 	 * before, all children will be marked as tried so we'll
1716 	 * proceed to combinatorial reconstruction.
1717 	 */
1718 	unexpected_errors = 1;
1719 	rm->rm_missingdata = 0;
1720 	rm->rm_missingparity = 0;
1721 
1722 	n = 0;
1723 	for (c = 0; c < rm->rm_cols; c++) {
1724 		rc = &rm->rm_col[c];
1725 
1726 		if (rc->rc_tried)
1727 			continue;
1728 
1729 		cvd = vdev_child(vd, rc->rc_devidx);
1730 		ASSERT(cvd != NULL);
1731 		rc->rc_error = cvd->v_read(cvd, NULL,
1732 		    rc->rc_data, rc->rc_offset, rc->rc_size);
1733 		if (rc->rc_error == 0)
1734 			n++;
1735 		rc->rc_tried = 1;
1736 		rc->rc_skipped = 0;
1737 	}
1738 	/*
1739 	 * If we managed to read anything more, retry the
1740 	 * reconstruction.
1741 	 */
1742 	if (n > 0)
1743 		goto reconstruct;
1744 
1745 	/*
1746 	 * At this point we've attempted to reconstruct the data given the
1747 	 * errors we detected, and we've attempted to read all columns. There
1748 	 * must, therefore, be one or more additional problems -- silent errors
1749 	 * resulting in invalid data rather than explicit I/O errors resulting
1750 	 * in absent data. We check if there is enough additional data to
1751 	 * possibly reconstruct the data and then perform combinatorial
1752 	 * reconstruction over all possible combinations. If that fails,
1753 	 * we're cooked.
1754 	 */
1755 	if (total_errors > rm->rm_firstdatacol) {
1756 		error = EIO;
1757 	} else if (total_errors < rm->rm_firstdatacol &&
1758 	    (code = vdev_raidz_combrec(vd->spa, rm, bp, data, offset, bytes,
1759 	     total_errors, data_errors)) != 0) {
1760 		/*
1761 		 * If we didn't use all the available parity for the
1762 		 * combinatorial reconstruction, verify that the remaining
1763 		 * parity is correct.
1764 		 */
1765 		if (code != (1 << rm->rm_firstdatacol) - 1)
1766 			(void) raidz_parity_verify(rm);
1767 	} else {
1768 		/*
1769 		 * We're here because either:
1770 		 *
1771 		 *	total_errors == rm_first_datacol, or
1772 		 *	vdev_raidz_combrec() failed
1773 		 *
1774 		 * In either case, there is enough bad data to prevent
1775 		 * reconstruction.
1776 		 *
1777 		 * Start checksum ereports for all children which haven't
1778 		 * failed, and the IO wasn't speculative.
1779 		 */
1780 		error = ECKSUM;
1781 	}
1782 
1783 done:
1784 	vdev_raidz_map_free(rm);
1785 
1786 	return (error);
1787 }
1788