xref: /freebsd/sys/cddl/boot/zfs/zfssubr.c (revision 2fec3ae8964c8864a9e75d6a2f0ed137ede489a7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/cdefs.h>
27 __FBSDID("$FreeBSD$");
28 
29 #include <lz4.h>
30 
31 static uint64_t zfs_crc64_table[256];
32 
/*
 * Stand-alone boot environment: assertions are compiled out entirely,
 * so ASSERT arguments must not rely on side effects being evaluated.
 */
#define	ASSERT3S(x, y, z)	((void)0)
#define	ASSERT3U(x, y, z)	((void)0)
#define	ASSERT3P(x, y, z)	((void)0)
#define	ASSERT0(x)		((void)0)
#define	ASSERT(x)		((void)0)

/* Fatal error: print the message and halt by spinning forever. */
#define	panic(...)	do {						\
	printf(__VA_ARGS__);						\
	for (;;) ;							\
} while (0)
43 
44 static void
45 zfs_init_crc(void)
46 {
47 	int i, j;
48 	uint64_t *ct;
49 
50 	/*
51 	 * Calculate the crc64 table (used for the zap hash
52 	 * function).
53 	 */
54 	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
55 		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
56 		for (i = 0; i < 256; i++)
57 			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
58 				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
59 	}
60 }
61 
/*
 * "off" checksum vector: unconditionally produce an all-zero checksum.
 * buf, size, and ctx_template are intentionally unused.
 */
static void
zio_checksum_off(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}
68 
/*
 * Signature for checksum functions.
 */
typedef void zio_checksum_t(const void *data, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp);
/* Allocate a per-pool checksum context template from the pool's salt. */
typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
/* Release a context template created by the matching init function. */
typedef void zio_checksum_tmpl_free_t(void *ctx_template);

typedef enum zio_checksum_flags {
	/* Strong enough for metadata? */
	ZCHECKSUM_FLAG_METADATA = (1 << 1),
	/* ZIO embedded checksum */
	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
	/* Strong enough for dedup (without verification)? */
	ZCHECKSUM_FLAG_DEDUP = (1 << 3),
	/* Uses salt value */
	ZCHECKSUM_FLAG_SALTED = (1 << 4),
	/* Strong enough for nopwrite? */
	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
} zio_checksum_flags_t;
89 
/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
	/* checksum function per byteorder: [0] = native, [1] = byteswapped */
	zio_checksum_t			*ci_func[2];
	zio_checksum_tmpl_init_t	*ci_tmpl_init;	/* NULL if no template */
	zio_checksum_tmpl_free_t	*ci_tmpl_free;
	zio_checksum_flags_t		ci_flags;
	const char			*ci_name;	/* descriptive name */
} zio_checksum_info_t;
101 
102 #include "blkptr.c"
103 
104 #include "fletcher.c"
105 #include "sha256.c"
106 #include "skein_zfs.c"
107 
108 extern int zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len,
109     size_t d_len, int n);
110 
111 
/*
 * Checksum vectors, indexed by the on-disk checksum algorithm number
 * (BP_GET_CHECKSUM); entry order must therefore not be changed.
 */
static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
	{{NULL, NULL}, NULL, NULL, 0, "on"},
	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL, 0, "off"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
	    0, "fletcher2"},
	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA, "fletcher4"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zillog2"},
	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL,
	    0, "noparity"},
	{{zio_checksum_SHA512_native,	zio_checksum_SHA512_byteswap},
	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
	/* skein is the only salted/templated algorithm supported here */
	{{zio_checksum_skein_native, zio_checksum_skein_byteswap},
	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
	/* no edonr for now */
	{{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"}
};
144 
/*
 * Common signature for all zio compress/decompress functions.
 * The trailing int is the algorithm's level parameter (ci_level).
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
	zio_compress_func_t	*ci_compress;	/* compression function */
	zio_decompress_func_t	*ci_decompress;	/* decompression function */
	int			ci_level;	/* level parameter */
	const char		*ci_name;	/* algorithm name */
} zio_compress_info_t;
162 
163 #include "lzjb.c"
164 #include "zle.c"
165 
/*
 * Compression vectors, indexed by on-disk compression algorithm number.
 * The boot code only reads pools, so no ci_compress entries are
 * provided — decompressors only.
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
	{NULL,			NULL,			0,	"inherit"},
	{NULL,			NULL,			0,	"on"},
	{NULL,			NULL,			0,	"uncompressed"},
	{NULL,			lzjb_decompress,	0,	"lzjb"},
	{NULL,			NULL,			0,	"empty"},
	{NULL,			NULL,			1,	"gzip-1"},
	{NULL,			NULL,			2,	"gzip-2"},
	{NULL,			NULL,			3,	"gzip-3"},
	{NULL,			NULL,			4,	"gzip-4"},
	{NULL,			NULL,			5,	"gzip-5"},
	{NULL,			NULL,			6,	"gzip-6"},
	{NULL,			NULL,			7,	"gzip-7"},
	{NULL,			NULL,			8,	"gzip-8"},
	{NULL,			NULL,			9,	"gzip-9"},
	{NULL,			zle_decompress,		64,	"zle"},
	{NULL,			lz4_decompress,		0,	"lz4"},
	{NULL,			zfs_zstd_decompress, ZIO_ZSTD_LEVEL_DEFAULT, "zstd"}
};
188 
/*
 * Byte-swap an array of 64-bit words in place.  size is in bytes and
 * must be a multiple of 8.
 */
static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	/*
	 * Use size_t for the index: the old 'int i' was compared against
	 * a size_t count (signed/unsigned mismatch) and would truncate
	 * for buffers with more than INT_MAX words.
	 */
	size_t i;

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}
201 
/*
 * Set the external verifier for a gang block based on <vdev, offset, txg>,
 * a tuple which is guaranteed to be unique for the life of the pool.
 */
static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
{
	const dva_t *dva = BP_IDENTITY(bp);
	uint64_t txg = BP_PHYSICAL_BIRTH(bp);

	ASSERT(BP_IS_GANG(bp));

	/* Fourth checksum word is unused and left zero. */
	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
}
216 
/*
 * Set the external verifier for a label block based on its offset.
 * The vdev is implicit, and the txg is unknowable at pool open time --
 * hence the logic in vdev_uberblock_load() to find the most recent copy.
 * Only the first checksum word carries information; the rest are zero.
 */
static void
zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
{
	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
}
227 
228 /*
229  * Calls the template init function of a checksum which supports context
230  * templates and installs the template into the spa_t.
231  */
232 static void
233 zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
234 {
235 	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
236 
237 	if (ci->ci_tmpl_init == NULL)
238 		return;
239 
240 	if (spa->spa_cksum_tmpls[checksum] != NULL)
241 		return;
242 
243 	if (spa->spa_cksum_tmpls[checksum] == NULL) {
244 		spa->spa_cksum_tmpls[checksum] =
245 		    ci->ci_tmpl_init(&spa->spa_cksum_salt);
246 	}
247 }
248 
249 /*
250  * Called by a spa_t that's about to be deallocated. This steps through
251  * all of the checksum context templates and deallocates any that were
252  * initialized using the algorithm-specific template init function.
253  */
254 static void __unused
255 zio_checksum_templates_free(spa_t *spa)
256 {
257 	for (enum zio_checksum checksum = 0;
258 	    checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
259 		if (spa->spa_cksum_tmpls[checksum] != NULL) {
260 			zio_checksum_info_t *ci = &zio_checksum_table[checksum];
261 
262 			ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
263 			spa->spa_cksum_tmpls[checksum] = NULL;
264 		}
265 	}
266 }
267 
/*
 * Verify the checksum of a block against the expected value stored in
 * its block pointer (or, for embedded checksums, in the block's own
 * trailer).  Returns 0 on match, EINVAL for an unknown/unusable
 * checksum algorithm, and EIO on mismatch.
 */
static int
zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
{
	uint64_t size;
	unsigned int checksum;
	zio_checksum_info_t *ci;
	void *ctx = NULL;
	zio_cksum_t actual_cksum, expected_cksum, verifier;
	int byteswap;

	checksum = BP_GET_CHECKSUM(bp);
	size = BP_GET_PSIZE(bp);

	/* Reject algorithm numbers we have no vector for. */
	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
		return (EINVAL);
	ci = &zio_checksum_table[checksum];
	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
		return (EINVAL);

	/* Lazily build the algorithm's context template for this pool. */
	if (spa != NULL) {
		zio_checksum_template_init(checksum, __DECONST(spa_t *,spa));
		ctx = spa->spa_cksum_tmpls[checksum];
	}

	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
		/*
		 * Embedded checksum: the expected value lives in a
		 * zio_eck_t trailer at the end of the block, and the
		 * checksum is computed with a per-block "verifier"
		 * value swapped into that slot.
		 */
		zio_eck_t *eck;

		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
		    checksum == ZIO_CHECKSUM_LABEL);

		eck = (zio_eck_t *)((char *)data + size) - 1;

		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
			zio_checksum_gang_verifier(&verifier, bp);
		else if (checksum == ZIO_CHECKSUM_LABEL)
			zio_checksum_label_verifier(&verifier,
			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
		else
			verifier = bp->blk_cksum;

		/* Writer's endianness is detected from the trailer magic. */
		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

		if (byteswap)
			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

		/*
		 * Temporarily substitute the verifier for the stored
		 * checksum while computing, then restore it.  The order
		 * of these four statements is load-bearing.
		 */
		expected_cksum = eck->zec_cksum;
		eck->zec_cksum = verifier;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
		eck->zec_cksum = expected_cksum;

		if (byteswap)
			byteswap_uint64_array(&expected_cksum,
			    sizeof (zio_cksum_t));
	} else {
		/* Normal case: expected checksum lives in the blkptr. */
		byteswap = BP_SHOULD_BYTESWAP(bp);
		expected_cksum = bp->blk_cksum;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
	}

	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
		/*printf("ZFS: read checksum %s failed\n", ci->ci_name);*/
		return (EIO);
	}

	return (0);
}
334 
335 static int
336 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
337 	void *dest, uint64_t destsize)
338 {
339 	zio_compress_info_t *ci;
340 
341 	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
342 		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
343 		return (EIO);
344 	}
345 
346 	ci = &zio_compress_table[cpfunc];
347 	if (!ci->ci_decompress) {
348 		printf("ZFS: unsupported compression algorithm %s\n",
349 		    ci->ci_name);
350 		return (EIO);
351 	}
352 
353 	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
354 }
355 
356 static uint64_t
357 zap_hash(uint64_t salt, const char *name)
358 {
359 	const uint8_t *cp;
360 	uint8_t c;
361 	uint64_t crc = salt;
362 
363 	ASSERT(crc != 0);
364 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
365 	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
366 		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
367 
368 	/*
369 	 * Only use 28 bits, since we need 4 bits in the cookie for the
370 	 * collision differentiator.  We MUST use the high bits, since
371 	 * those are the onces that we first pay attention to when
372 	 * chosing the bucket.
373 	 */
374 	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
375 
376 	return (crc);
377 }
378 
/* Per-column state of a RAID-Z I/O (one parity or data device). */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

/* Geometry and per-column layout of one logical RAID-Z I/O. */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	/* Trailing variable-length array (pre-C99 "[1]" idiom). */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;
404 
/* Parity column indices within rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/* Multiply one GF(2^8) byte by 2: shift left, XOR 0x1d on overflow. */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

/* Multiply each byte of x by 4: two successive doublings. */
#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
431 
432 /*
433  * These two tables represent powers and logs of 2 in the Galois field defined
434  * above. These values were computed by repeatedly multiplying by 2 as above.
435  */
436 static const uint8_t vdev_raidz_pow2[256] = {
437 	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
438 	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
439 	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
440 	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
441 	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
442 	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
443 	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
444 	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
445 	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
446 	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
447 	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
448 	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
449 	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
450 	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
451 	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
452 	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
453 	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
454 	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
455 	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
456 	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
457 	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
458 	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
459 	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
460 	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
461 	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
462 	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
463 	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
464 	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
465 	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
466 	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
467 	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
468 	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
469 };
470 static const uint8_t vdev_raidz_log2[256] = {
471 	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
472 	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
473 	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
474 	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
475 	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
476 	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
477 	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
478 	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
479 	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
480 	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
481 	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
482 	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
483 	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
484 	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
485 	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
486 	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
487 	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
488 	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
489 	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
490 	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
491 	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
492 	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
493 	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
494 	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
495 	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
496 	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
497 	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
498 	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
499 	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
500 	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
501 	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
502 	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
503 };
504 
505 /*
506  * Multiply a given number by 2 raised to the given power.
507  */
508 static uint8_t
509 vdev_raidz_exp2(uint8_t a, int exp)
510 {
511 	if (a == 0)
512 		return (0);
513 
514 	ASSERT(exp >= 0);
515 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
516 
517 	exp += vdev_raidz_log2[a];
518 	if (exp > 255)
519 		exp -= 255;
520 
521 	return (vdev_raidz_pow2[exp]);
522 }
523 
524 static void
525 vdev_raidz_generate_parity_p(raidz_map_t *rm)
526 {
527 	uint64_t *p, *src, pcount, ccount, i;
528 	int c;
529 
530 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
531 
532 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
533 		src = rm->rm_col[c].rc_data;
534 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
535 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
536 
537 		if (c == rm->rm_firstdatacol) {
538 			ASSERT(ccount == pcount);
539 			for (i = 0; i < ccount; i++, src++, p++) {
540 				*p = *src;
541 			}
542 		} else {
543 			ASSERT(ccount <= pcount);
544 			for (i = 0; i < ccount; i++, src++, p++) {
545 				*p ^= *src;
546 			}
547 		}
548 	}
549 }
550 
/*
 * Generate P (XOR) and Q parity.  Q is built Horner-style: before each
 * data column is XORed in, the running Q is multiplied by 2 in GF(2^8),
 * giving each column a distinct power-of-2 coefficient.  P and Q
 * columns must be the same size.
 */
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First column seeds P and Q directly. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			/* Zero-fill the remainder if the column is short. */
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}
602 
/*
 * Generate P, Q, and R parity (RAID-Z3).  Like the PQ case, but R is
 * accumulated with a multiply-by-4 per column (vs. 2 for Q), keeping
 * the three parity equations linearly independent.  P, Q, and R
 * columns must all be the same size.
 */
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First column seeds all three parity buffers. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			/* Zero-fill the remainder if the column is short. */
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}
663 
664 /*
665  * Generate RAID parity in the first virtual columns according to the number of
666  * parity columns available.
667  */
668 static void
669 vdev_raidz_generate_parity(raidz_map_t *rm)
670 {
671 	switch (rm->rm_firstdatacol) {
672 	case 1:
673 		vdev_raidz_generate_parity_p(rm);
674 		break;
675 	case 2:
676 		vdev_raidz_generate_parity_pq(rm);
677 		break;
678 	case 3:
679 		vdev_raidz_generate_parity_pqr(rm);
680 		break;
681 	default:
682 		panic("invalid RAID-Z configuration");
683 	}
684 }
685 
686 /* BEGIN CSTYLED */
687 /*
688  * In the general case of reconstruction, we must solve the system of linear
689  * equations defined by the coefficients used to generate parity as well as
690  * the contents of the data and parity disks. This can be expressed with
691  * vectors for the original data (D) and the actual data (d) and parity (p)
692  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
693  *
694  *            __   __                     __     __
695  *            |     |         __     __   |  p_0  |
696  *            |  V  |         |  D_0  |   | p_m-1 |
697  *            |     |    x    |   :   | = |  d_0  |
698  *            |  I  |         | D_n-1 |   |   :   |
699  *            |     |         ~~     ~~   | d_n-1 |
700  *            ~~   ~~                     ~~     ~~
701  *
702  * I is simply a square identity matrix of size n, and V is a Vandermonde
703  * matrix defined by the coefficients we chose for the various parity columns
704  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
705  * computation as well as linear separability.
706  *
707  *      __               __               __     __
708  *      |   1   ..  1 1 1 |               |  p_0  |
709  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
710  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
711  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
712  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
713  *      |   :       : : : |   |   :   |   |  d_2  |
714  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
715  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
716  *      |   0   ..  0 0 1 |               | d_n-1 |
717  *      ~~               ~~               ~~     ~~
718  *
719  * Note that I, V, d, and p are known. To compute D, we must invert the
720  * matrix and use the known data and parity values to reconstruct the unknown
721  * data values. We begin by removing the rows in V|I and d|p that correspond
722  * to failed or missing columns; we then make V|I square (n x n) and d|p
723  * sized n by removing rows corresponding to unused parity from the bottom up
724  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
725  * using Gauss-Jordan elimination. In the example below we use m=3 parity
726  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
727  *           __                               __
728  *           |  1   1   1   1   1   1   1   1  |
729  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
730  *           |  19 205 116  29  64  16  4   1  |      / /
731  *           |  1   0   0   0   0   0   0   0  |     / /
732  *           |  0   1   0   0   0   0   0   0  | <--' /
733  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
734  *           |  0   0   0   1   0   0   0   0  |
735  *           |  0   0   0   0   1   0   0   0  |
736  *           |  0   0   0   0   0   1   0   0  |
737  *           |  0   0   0   0   0   0   1   0  |
738  *           |  0   0   0   0   0   0   0   1  |
739  *           ~~                               ~~
740  *           __                               __
741  *           |  1   1   1   1   1   1   1   1  |
742  *           | 128  64  32  16  8   4   2   1  |
743  *           |  19 205 116  29  64  16  4   1  |
744  *           |  1   0   0   0   0   0   0   0  |
745  *           |  0   1   0   0   0   0   0   0  |
746  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
747  *           |  0   0   0   1   0   0   0   0  |
748  *           |  0   0   0   0   1   0   0   0  |
749  *           |  0   0   0   0   0   1   0   0  |
750  *           |  0   0   0   0   0   0   1   0  |
751  *           |  0   0   0   0   0   0   0   1  |
752  *           ~~                               ~~
753  *
754  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
755  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
756  * matrix is not singular.
757  * __                                                                 __
758  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
759  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
760  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
761  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
762  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
763  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
764  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
765  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
766  * ~~                                                                 ~~
767  * __                                                                 __
768  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
769  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
770  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
771  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
772  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
773  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
774  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
775  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
776  * ~~                                                                 ~~
777  * __                                                                 __
778  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
779  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
780  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
781  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
782  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
783  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
784  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
785  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
786  * ~~                                                                 ~~
787  * __                                                                 __
788  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
789  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
790  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
791  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
792  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
793  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
794  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
795  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
796  * ~~                                                                 ~~
797  * __                                                                 __
798  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
799  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
800  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
801  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
802  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
803  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
804  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
805  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
806  * ~~                                                                 ~~
807  * __                                                                 __
808  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
809  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
810  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
811  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
812  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
813  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
814  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
815  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
816  * ~~                                                                 ~~
817  *                   __                               __
818  *                   |  0   0   1   0   0   0   0   0  |
819  *                   | 167 100  5   41 159 169 217 208 |
820  *                   | 166 100  4   40 158 168 216 209 |
821  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
822  *                   |  0   0   0   0   1   0   0   0  |
823  *                   |  0   0   0   0   0   1   0   0  |
824  *                   |  0   0   0   0   0   0   1   0  |
825  *                   |  0   0   0   0   0   0   0   1  |
826  *                   ~~                               ~~
827  *
828  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
829  * of the missing data.
830  *
831  * As is apparent from the example above, the only non-trivial rows in the
832  * inverse matrix correspond to the data disks that we're trying to
833  * reconstruct. Indeed, those are the only rows we need as the others would
834  * only be useful for reconstructing data known or assumed to be valid. For
835  * that reason, we only build the coefficients in the rows that correspond to
836  * targeted columns.
837  */
838 /* END CSTYLED */
839 
/*
 * Fill in the rows of the decode matrix corresponding to the parity
 * columns selected in map[].  Row i is the generator row for parity
 * column map[i] (0 = P, 1 = Q, 2 = R per the asserts) restricted to the
 * n data columns: entry j is 2^(map[i] * (n - 1 - j)) in GF(2^8),
 * produced via the power table with explicit mod-255 exponent
 * arithmetic (the multiplicative group of GF(2^8) has order 255).
 */
static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;	/* running exponent, kept within [0, 255) */

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		/*
		 * Start at map[i] * n; the inner loop decrements by map[i]
		 * before each use, so column j ends up with exponent
		 * map[i] * (n - 1 - j).  Since map[i] <= 2 and n < 256,
		 * a single subtraction suffices to reduce mod 255.
		 */
		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
869 
/*
 * Invert the square submatrix that maps the missing data columns onto
 * the surviving columns, using Gauss-Jordan elimination over GF(2^8).
 * On entry rows[] holds the nmissing generator rows built by
 * vdev_raidz_matrix_init(); on exit invrows[] holds the matching rows
 * of the inverse matrix.  used[] lists the surviving columns: the
 * first nmissing entries are parity columns, the remainder are data
 * columns.  Note that in GF(2^8) addition and subtraction are both XOR.
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 * (Each surviving data column contributes an identity row, so its
	 * coefficient moves directly into the inverse and is cleared from
	 * the working row.)
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		/*
		 * Entries left of the pivot must already be zero and the
		 * pivot itself nonzero.
		 */
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.  (255 - log2(x) is the
		 * exponent of x's multiplicative inverse, since the nonzero
		 * elements form a group of order 255.)
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		/* Eliminate the pivot column from every other row. */
		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}
963 
/*
 * Regenerate the missing data columns by multiplying the inverted
 * matrix rows (invrows) into the surviving columns (listed in used[]).
 * All GF(2^8) multiplications are done in the log domain: invlog[][]
 * caches the log of each inverse-matrix coefficient, so each source
 * byte costs one log lookup, one addition reduced mod 255, and one
 * pow lookup.
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	log = 0;	/* gcc */
	/* One scratch row of n log-coefficients per missing column. */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = malloc(psize);
	if (p == NULL) {
		printf("Out of memory\n");
		return;
	}

	/* Carve the single allocation into per-row pointers. */
	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	/* Precompute the logs of the inverse-matrix coefficients. */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	/* Accumulate each surviving column into every missing column. */
	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		/*
		 * The first surviving column must cover every byte of the
		 * first missing column, since it initializes (rather than
		 * accumulates into) the destinations below.
		 */
		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				/* Shorter destination columns end early. */
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					/* Zero times anything is zero. */
					val = 0;
				} else {
					/* val = *src * invrows[cc][i]. */
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/*
				 * The first column initializes each
				 * destination byte; subsequent columns
				 * accumulate via XOR (GF(2^8) addition).
				 */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	free(p);
}
1043 
/*
 * Reconstruct an arbitrary combination of missing columns via general
 * matrix inversion over GF(2^8): select one surviving parity column per
 * missing data column, build the corresponding generator rows, invert
 * the submatrix, and multiply it into the surviving columns.  Returns a
 * bitmask identifying which parity columns were selected; on allocation
 * failure the same code is returned but no reconstruction is performed.
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;


	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/*
	 * Single allocation carved up below: nmissing_rows generator rows,
	 * nmissing_rows inverse rows (n bytes each), plus the n-entry
	 * used[] column list.
	 */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = malloc(psize);
	if (p == NULL) {
		printf("Out of memory\n");
		return (code);
	}

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	/* The first nmissing_rows used[] entries are the chosen parity. */
	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	/*
	 * Fill the remaining used[] entries with the surviving data
	 * columns; note that i carries over from the loop above (it is
	 * the count of entries already placed).
	 */
	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	free(p);

	return (code);
}
1153 
/*
 * Reconstruct the columns listed in t[0..nt-1] (which must be sorted
 * ascending), together with any other columns already marked with an
 * error, and return the parity-usage code produced by
 * vdev_raidz_reconstruct_general().
 */
static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	/* Assume all columns bad; subtract the good ones as we scan. */
	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	/* Merge the caller's target list with the columns in error. */
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}
1195 
/*
 * Map a logical I/O (offset/size on the RAID-Z vdev) onto per-child
 * columns.  Parity column buffers are allocated here; data columns
 * alias consecutive slices of the caller's buffer.  Returns NULL on
 * allocation failure.
 */
static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The I/O size in units of the vdev's minimum sector size. */
	uint64_t s = size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/* Complete rows of data, and leftover data sectors. */
	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	/* Columns in the trailing partial row, including parity. */
	bc = (r == 0 ? 0 : r + nparity);
	/* Total sectors, data plus parity. */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	if (q == 0) {
		/* Small I/O: fewer columns than the full stripe width. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = malloc(offsetof(raidz_map_t, rm_col[scols]));
	if (rm == NULL)
		return (rm);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			/* Wrap to the next row on the child devices. */
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			/* "Big" columns get one extra sector. */
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	/* Allocate scratch buffers for the parity columns. */
	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rm->rm_col[c].rc_data = malloc(rm->rm_col[c].rc_size);
		if (rm->rm_col[c].rc_data == NULL) {
			/*
			 * Unwind the columns allocated so far; the failed
			 * slot is NULL and free(NULL) is a no-op.
			 */
			c++;
			while (c != 0)
				free(rm->rm_col[--c].rc_data);
			free(rm);
			return (NULL);
		}
	}

	/*
	 * Data columns point at consecutive slices of the caller's buffer;
	 * c == rm_firstdatacol when the loop above completes.
	 */
	rm->rm_col[c].rc_data = data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
1323 
1324 static void
1325 vdev_raidz_map_free(raidz_map_t *rm)
1326 {
1327 	int c;
1328 
1329 	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
1330 		free(rm->rm_col[c].rc_data);
1331 
1332 	free(rm);
1333 }
1334 
1335 static vdev_t *
1336 vdev_child(vdev_t *pvd, uint64_t devidx)
1337 {
1338 	vdev_t *cvd;
1339 
1340 	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
1341 		if (cvd->v_id == devidx)
1342 			break;
1343 	}
1344 
1345 	return (cvd);
1346 }
1347 
/*
 * In the kernel this routine also tracks whether any injected errors
 * were encountered so ereports can note it; the boot code has no error
 * injection, so this is a thin wrapper that checksums the assembled
 * block.  The size argument is unused here -- presumably
 * zio_checksum_verify() derives the length from the block pointer;
 * the parameter is retained for signature compatibility.
 */
static int
raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
    uint64_t size)
{
	return (zio_checksum_verify(spa, bp, data));
}
1358 
/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	/* Snapshot each successfully-read parity column. */
	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = malloc(rc->rc_size);
		if (orig[c] != NULL) {
			bcopy(rc->rc_data, orig[c], rc->rc_size);
		} else {
			printf("Out of memory\n");
		}
	}

	/* Regenerate parity in place from the data columns. */
	vdev_raidz_generate_parity(rm);

	/*
	 * Compare the regenerated parity against the snapshots.  A failed
	 * snapshot allocation (orig[c] == NULL) is conservatively treated
	 * as a mismatch.  free(NULL) is a no-op, so the NULL case is safe.
	 */
	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (orig[c] == NULL ||
		    bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			rc->rc_error = ECKSUM;
			ret++;
		}
		free(orig[c]);
	}

	return (ret);
}
1400 
/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 *
 * Returns the reconstruction code (the bitmask of parity columns used)
 * on success, ENOMEM if a scratch buffer cannot be allocated, or 0 if
 * no combination produced data with a valid checksum.
 */
static int
vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
    void *data, off_t offset, uint64_t bytes, int total_errors, int data_errors)
{
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	/* tstore leaves room for the tgts[-1] and tgts[n] sentinels. */
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	/* Try 1, then 2, ... additional failures beyond the known errors. */
	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		/*
		 * Column 0 is at least as large as any other column, so
		 * its size is a safe bound for the snapshot buffer.
		 */
		orig[n - 1] = malloc(rm->rm_col[0].rc_size);
		if (orig[n - 1] == NULL) {
			ret = ENOMEM;
			goto done;
		}

		current = 0;
		next = tgts[current];

		/* Enumerate all n-column combinations of error-free columns. */
		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	n--;
done:
	/* Free all snapshot buffers; the last one may be NULL on ENOMEM. */
	for (i = n - 1; i >= 0; i--) {
		free(orig[i]);
	}

	return (ret);
}
1555 
/*
 * Read and, if necessary, reconstruct a block from a RAID-Z vdev.
 *
 * Mirrors the kernel raidz reader's three escalating phases (see the
 * comment inside): produce valid data from the columns read, re-read
 * everything and retry, and finally combinatorial reconstruction.
 *
 * Returns 0 on success, ENOMEM if the map cannot be allocated, EIO if
 * there are more hard errors than parity can cover, or ECKSUM if no
 * combination of columns yields data with a valid checksum.
 */
static int
vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
    off_t offset, size_t bytes)
{
	vdev_t *tvd = vd->v_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, error;
	int unexpected_errors;
	int parity_errors;
	int parity_untried;
	int data_errors;
	int total_errors;
	int n;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	rc = NULL;	/* gcc */
	error = 0;

	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
	    vd->v_nchildren, vd->v_nparity);
	if (rm == NULL)
		return (ENOMEM);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vdev_child(vd, rc->rc_devidx);
		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
			/* Child absent or unhealthy: don't issue a read. */
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
#if 0		/* XXX: Too hard for the boot code. */
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
#endif
		/*
		 * Always read the data columns; read parity too only when
		 * some data column is already known to be missing.
		 */
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
			    rc->rc_offset, rc->rc_size);
			rc->rc_tried = 1;
			rc->rc_skipped = 0;
		}
	}

reconstruct:
	/* Tally the error state of all columns. */
	unexpected_errors = 0;
	parity_errors = 0;
	parity_untried = 0;
	data_errors = 0;
	total_errors = 0;

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		int rv;

		if (data_errors == 0) {
			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read
	 * error or a child silently returned bad data. Read every
	 * block so we can try again with as much data and parity as
	 * we can track down. If we've already been through once
	 * before, all children will be marked as tried so we'll
	 * proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	n = 0;
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_tried)
			continue;

		cvd = vdev_child(vd, rc->rc_devidx);
		ASSERT(cvd != NULL);
		rc->rc_error = cvd->v_read(cvd, NULL,
		    rc->rc_data, rc->rc_offset, rc->rc_size);
		if (rc->rc_error == 0)
			n++;
		rc->rc_tried = 1;
		rc->rc_skipped = 0;
	}
	/*
	 * If we managed to read anything more, retry the
	 * reconstruction.
	 */
	if (n > 0)
		goto reconstruct;

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		error = EIO;
	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(vd->v_spa, rm, bp, data, offset, bytes,
	     total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_first_datacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		error = ECKSUM;
	}

done:
	vdev_raidz_map_free(rm);

	return (error);
}
1820