xref: /freebsd/sys/cddl/boot/zfs/zfssubr.c (revision 1f1e2261e341e6ca6862f82261066ef1705f0a7a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/cdefs.h>
27 __FBSDID("$FreeBSD$");
28 
29 #include <lz4.h>
30 
/* CRC64 lookup table for the ZAP hash; filled in lazily by zfs_init_crc(). */
static uint64_t zfs_crc64_table[256];
32 
/*
 * Assertion stubs: assertions are compiled out in this environment, but
 * these definitions let the shared ZFS sources included below build
 * unchanged.
 */
#define	ASSERT3S(x, y, z)	((void)0)
#define	ASSERT3U(x, y, z)	((void)0)
#define	ASSERT3P(x, y, z)	((void)0)
#define	ASSERT0(x)		((void)0)
#define	ASSERT(x)		((void)0)

/* Report a fatal condition and spin forever; there is nothing to unwind to. */
#define	panic(...)	do {						\
	printf(__VA_ARGS__);						\
	for (;;) ;							\
} while (0)
43 
44 static void
45 zfs_init_crc(void)
46 {
47 	int i, j;
48 	uint64_t *ct;
49 
50 	/*
51 	 * Calculate the crc64 table (used for the zap hash
52 	 * function).
53 	 */
54 	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
55 		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
56 		for (i = 0; i < 256; i++)
57 			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
58 				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
59 	}
60 }
61 
/*
 * "Checksum" used when checksumming is disabled: stores an all-zero
 * checksum unconditionally, ignoring the data entirely.
 */
static void
zio_checksum_off(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}
68 
/*
 * Signature for checksum functions: checksum `size' bytes at `data',
 * optionally using a pre-built context template, storing the result in *zcp.
 */
typedef void zio_checksum_t(const void *data, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp);
/* Builds a reusable per-pool context from the pool's checksum salt. */
typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
/* Releases a context built by the matching zio_checksum_tmpl_init_t. */
typedef void zio_checksum_tmpl_free_t(void *ctx_template);

/* Property flags describing the strength and behavior of each checksum. */
typedef enum zio_checksum_flags {
	/* Strong enough for metadata? */
	ZCHECKSUM_FLAG_METADATA = (1 << 1),
	/* ZIO embedded checksum */
	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
	/* Strong enough for dedup (without verification)? */
	ZCHECKSUM_FLAG_DEDUP = (1 << 3),
	/* Uses salt value */
	ZCHECKSUM_FLAG_SALTED = (1 << 4),
	/* Strong enough for nopwrite? */
	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
} zio_checksum_flags_t;

/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
	/* checksum function for each byteorder */
	zio_checksum_t			*ci_func[2];
	zio_checksum_tmpl_init_t	*ci_tmpl_init;
	zio_checksum_tmpl_free_t	*ci_tmpl_free;
	zio_checksum_flags_t		ci_flags;
	const char			*ci_name;	/* descriptive name */
} zio_checksum_info_t;
101 
102 #include "blkptr.c"
103 
104 #include "fletcher.c"
105 #include "sha256.c"
106 #include "skein_zfs.c"
107 
108 #ifdef HAS_ZSTD_ZFS
109 extern int zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len,
110     size_t d_len, int n);
111 #endif
112 
/*
 * Checksum implementations, indexed by the on-disk ZIO_CHECKSUM_* value
 * (see zio_checksum_verify()).  ci_func[0] is the native-endian variant,
 * ci_func[1] the byteswapped one.  Entries with NULL functions cannot be
 * verified by this code.
 */
static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
	{{NULL, NULL}, NULL, NULL, 0, "on"},
	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL, 0, "off"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
	    0, "fletcher2"},
	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA, "fletcher4"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zillog2"},
	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL,
	    0, "noparity"},
	{{zio_checksum_SHA512_native,	zio_checksum_SHA512_byteswap},
	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
	{{zio_checksum_skein_native, zio_checksum_skein_byteswap},
	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
	/* no edonr for now */
	{{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"}
};
145 
/*
 * Common signature for all zio compress/decompress functions.
 * The final int is the algorithm level parameter (ci_level below).
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
/* Decompressors return 0 on success, non-zero on corrupt input. */
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
	zio_compress_func_t	*ci_compress;	/* compression function */
	zio_decompress_func_t	*ci_decompress;	/* decompression function */
	int			ci_level;	/* level parameter */
	const char		*ci_name;	/* algorithm name */
} zio_compress_info_t;
163 
164 #include "lzjb.c"
165 #include "zle.c"
166 #include "gzip.c"
167 
/*
 * Compression vectors, indexed by the on-disk ZIO_COMPRESS_* value.
 * Only decompression is implemented here (all ci_compress entries are
 * NULL); this code only ever reads pools.
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
	{NULL,			NULL,			0,	"inherit"},
	{NULL,			NULL,			0,	"on"},
	{NULL,			NULL,			0,	"uncompressed"},
	{NULL,			lzjb_decompress,	0,	"lzjb"},
	{NULL,			NULL,			0,	"empty"},
	{NULL,			gzip_decompress,	1,	"gzip-1"},
	{NULL,			gzip_decompress,	2,	"gzip-2"},
	{NULL,			gzip_decompress,	3,	"gzip-3"},
	{NULL,			gzip_decompress,	4,	"gzip-4"},
	{NULL,			gzip_decompress,	5,	"gzip-5"},
	{NULL,			gzip_decompress,	6,	"gzip-6"},
	{NULL,			gzip_decompress,	7,	"gzip-7"},
	{NULL,			gzip_decompress,	8,	"gzip-8"},
	{NULL,			gzip_decompress,	9,	"gzip-9"},
	{NULL,			zle_decompress,		64,	"zle"},
	{NULL,			lz4_decompress,		0,	"lz4"},
#ifdef HAS_ZSTD_ZFS
	{NULL,			zfs_zstd_decompress, ZIO_ZSTD_LEVEL_DEFAULT, "zstd"}
#endif
};
192 
/*
 * Byte-swap an array of 64-bit words in place.  `size' is in bytes and
 * must be a multiple of 8 (asserted, but assertions are compiled out).
 */
static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	size_t i;	/* size_t, not int: count is size_t (avoids
			 * signed/unsigned comparison and truncation) */

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}
205 
/*
 * Set the external verifier for a gang block based on <vdev, offset, txg>,
 * a tuple which is guaranteed to be unique for the life of the pool.
 */
static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
{
	const dva_t *dva = BP_IDENTITY(bp);
	uint64_t txg = BP_PHYSICAL_BIRTH(bp);

	ASSERT(BP_IS_GANG(bp));

	/* Fourth word is unused (zero) for gang verifiers. */
	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
}
220 
/*
 * Set the external verifier for a label block based on its offset.
 * The vdev is implicit, and the txg is unknowable at pool open time --
 * hence the logic in vdev_uberblock_load() to find the most recent copy.
 */
static void
zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
{
	/* Only the first word carries information for label verifiers. */
	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
}
231 
232 /*
233  * Calls the template init function of a checksum which supports context
234  * templates and installs the template into the spa_t.
235  */
236 static void
237 zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
238 {
239 	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
240 
241 	if (ci->ci_tmpl_init == NULL)
242 		return;
243 
244 	if (spa->spa_cksum_tmpls[checksum] != NULL)
245 		return;
246 
247 	if (spa->spa_cksum_tmpls[checksum] == NULL) {
248 		spa->spa_cksum_tmpls[checksum] =
249 		    ci->ci_tmpl_init(&spa->spa_cksum_salt);
250 	}
251 }
252 
253 /*
254  * Called by a spa_t that's about to be deallocated. This steps through
255  * all of the checksum context templates and deallocates any that were
256  * initialized using the algorithm-specific template init function.
257  */
258 static void __unused
259 zio_checksum_templates_free(spa_t *spa)
260 {
261 	for (enum zio_checksum checksum = 0;
262 	    checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
263 		if (spa->spa_cksum_tmpls[checksum] != NULL) {
264 			zio_checksum_info_t *ci = &zio_checksum_table[checksum];
265 
266 			ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
267 			spa->spa_cksum_tmpls[checksum] = NULL;
268 		}
269 	}
270 }
271 
/*
 * Verify the checksum of a block against its block pointer.
 *
 * Returns 0 on success, EINVAL for an unknown/unsupported checksum
 * algorithm, or EIO on checksum mismatch.  `spa' may be NULL, in which
 * case no context template is used.
 */
static int
zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
{
	uint64_t size;
	unsigned int checksum;
	zio_checksum_info_t *ci;
	void *ctx = NULL;
	zio_cksum_t actual_cksum, expected_cksum, verifier;
	int byteswap;

	checksum = BP_GET_CHECKSUM(bp);
	size = BP_GET_PSIZE(bp);

	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
		return (EINVAL);
	ci = &zio_checksum_table[checksum];
	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
		return (EINVAL);

	if (spa != NULL) {
		/* Lazily build the per-pool context template, if any. */
		zio_checksum_template_init(checksum, __DECONST(spa_t *,spa));
		ctx = spa->spa_cksum_tmpls[checksum];
	}

	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
		/*
		 * Embedded checksums live in a zio_eck_t trailer at the end
		 * of the block; the stored checksum was computed with the
		 * trailer's cksum field holding an algorithm-specific
		 * verifier, so reproduce that layout before recomputing.
		 */
		zio_eck_t *eck;

		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
		    checksum == ZIO_CHECKSUM_LABEL);

		eck = (zio_eck_t *)((char *)data + size) - 1;

		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
			zio_checksum_gang_verifier(&verifier, bp);
		else if (checksum == ZIO_CHECKSUM_LABEL)
			zio_checksum_label_verifier(&verifier,
			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
		else
			verifier = bp->blk_cksum;

		/* Detect a block written on the opposite-endian host. */
		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

		if (byteswap)
			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

		/*
		 * Temporarily substitute the verifier for the stored
		 * checksum, recompute, then restore the original bytes.
		 */
		expected_cksum = eck->zec_cksum;
		eck->zec_cksum = verifier;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
		eck->zec_cksum = expected_cksum;

		if (byteswap)
			byteswap_uint64_array(&expected_cksum,
			    sizeof (zio_cksum_t));
	} else {
		/* Non-embedded: the expected checksum is in the blkptr. */
		byteswap = BP_SHOULD_BYTESWAP(bp);
		expected_cksum = bp->blk_cksum;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
	}

	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
		/*printf("ZFS: read checksum %s failed\n", ci->ci_name);*/
		return (EIO);
	}

	return (0);
}
338 
339 static int
340 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
341 	void *dest, uint64_t destsize)
342 {
343 	zio_compress_info_t *ci;
344 
345 	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
346 		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
347 		return (EIO);
348 	}
349 
350 	ci = &zio_compress_table[cpfunc];
351 	if (!ci->ci_decompress) {
352 		printf("ZFS: unsupported compression algorithm %s\n",
353 		    ci->ci_name);
354 		return (EIO);
355 	}
356 
357 	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
358 }
359 
/*
 * CRC64-based hash for ZAP object names.  `salt' seeds the CRC and must
 * be non-zero; zfs_crc64_table must already have been initialized by
 * zfs_init_crc().
 */
static uint64_t
zap_hash(uint64_t salt, const char *name)
{
	const uint8_t *cp;
	uint8_t c;
	uint64_t crc = salt;

	ASSERT(crc != 0);
	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];

	/*
	 * Only use 28 bits, since we need 4 bits in the cookie for the
	 * collision differentiator.  We MUST use the high bits, since
	 * those are the ones that we first pay attention to when
	 * choosing the bucket.
	 */
	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);

	return (crc);
}
382 
/* Per-column state of a RAID-Z I/O: one entry per child vdev touched. */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

/*
 * Geometry of one logical I/O mapped onto a RAID-Z vdev.  Columns
 * [0, rm_firstdatacol) are parity (P, Q, R); the rest are data.
 */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;
408 
/* Indices of the parity columns within rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/* Multiply one byte by 2 (and by 4) in GF(2^8) with polynomial 0x11d. */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
435 
/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

/*
 * Multiply a given number by 2 raised to the given power, i.e. compute
 * a * 2^exp in GF(2^8), via the discrete log/antilog tables above.
 */
static uint8_t
vdev_raidz_exp2(uint8_t a, int exp)
{
	int logsum;

	if (a == 0)
		return (0);

	/* Invariants: exp >= 0, and log2[a] is valid for any nonzero a. */
	logsum = exp + vdev_raidz_log2[a];
	if (logsum > 255)
		logsum -= 255;	/* exponents are mod 255 (group order) */

	return (vdev_raidz_pow2[logsum]);
}
527 
528 static void
529 vdev_raidz_generate_parity_p(raidz_map_t *rm)
530 {
531 	uint64_t *p, *src, ccount, i;
532 	uint64_t pcount __unused;
533 	int c;
534 
535 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
536 
537 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
538 		src = rm->rm_col[c].rc_data;
539 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
540 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
541 
542 		if (c == rm->rm_firstdatacol) {
543 			ASSERT(ccount == pcount);
544 			for (i = 0; i < ccount; i++, src++, p++) {
545 				*p = *src;
546 			}
547 		} else {
548 			ASSERT(ccount <= pcount);
549 			for (i = 0; i < ccount; i++, src++, p++) {
550 				*p ^= *src;
551 			}
552 		}
553 	}
554 }
555 
/*
 * Compute double parity (P and Q).  P is the plain XOR of the data
 * columns; Q accumulates each column multiplied by increasing powers of
 * 2 in GF(2^8) (Horner's scheme: multiply the running Q by 2, then XOR
 * in the next column).
 */
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First column: initialize P and Q, zero-fill tails. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}
607 
/*
 * Compute triple parity (P, Q and R).  As in the PQ case, P is the XOR
 * of the data columns while Q and R accumulate columns scaled by powers
 * of 2 and 4 respectively in GF(2^8).
 */
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First column: initialize all three, zero-fill tails. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}
668 
669 /*
670  * Generate RAID parity in the first virtual columns according to the number of
671  * parity columns available.
672  */
673 static void
674 vdev_raidz_generate_parity(raidz_map_t *rm)
675 {
676 	switch (rm->rm_firstdatacol) {
677 	case 1:
678 		vdev_raidz_generate_parity_p(rm);
679 		break;
680 	case 2:
681 		vdev_raidz_generate_parity_pq(rm);
682 		break;
683 	case 3:
684 		vdev_raidz_generate_parity_pqr(rm);
685 		break;
686 	default:
687 		panic("invalid RAID-Z configuration");
688 	}
689 }
690 
691 /* BEGIN CSTYLED */
692 /*
693  * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
695  * the contents of the data and parity disks. This can be expressed with
696  * vectors for the original data (D) and the actual data (d) and parity (p)
697  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
698  *
699  *            __   __                     __     __
700  *            |     |         __     __   |  p_0  |
701  *            |  V  |         |  D_0  |   | p_m-1 |
702  *            |     |    x    |   :   | = |  d_0  |
703  *            |  I  |         | D_n-1 |   |   :   |
704  *            |     |         ~~     ~~   | d_n-1 |
705  *            ~~   ~~                     ~~     ~~
706  *
707  * I is simply a square identity matrix of size n, and V is a vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
709  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
710  * computation as well as linear separability.
711  *
712  *      __               __               __     __
713  *      |   1   ..  1 1 1 |               |  p_0  |
714  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
715  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
716  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
717  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
718  *      |   :       : : : |   |   :   |   |  d_2  |
719  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
720  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
721  *      |   0   ..  0 0 1 |               | d_n-1 |
722  *      ~~               ~~               ~~     ~~
723  *
724  * Note that I, V, d, and p are known. To compute D, we must invert the
725  * matrix and use the known data and parity values to reconstruct the unknown
726  * data values. We begin by removing the rows in V|I and d|p that correspond
727  * to failed or missing columns; we then make V|I square (n x n) and d|p
728  * sized n by removing rows corresponding to unused parity from the bottom up
729  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
730  * using Gauss-Jordan elimination. In the example below we use m=3 parity
731  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
732  *           __                               __
733  *           |  1   1   1   1   1   1   1   1  |
734  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
735  *           |  19 205 116  29  64  16  4   1  |      / /
736  *           |  1   0   0   0   0   0   0   0  |     / /
737  *           |  0   1   0   0   0   0   0   0  | <--' /
738  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
739  *           |  0   0   0   1   0   0   0   0  |
740  *           |  0   0   0   0   1   0   0   0  |
741  *           |  0   0   0   0   0   1   0   0  |
742  *           |  0   0   0   0   0   0   1   0  |
743  *           |  0   0   0   0   0   0   0   1  |
744  *           ~~                               ~~
745  *           __                               __
746  *           |  1   1   1   1   1   1   1   1  |
747  *           | 128  64  32  16  8   4   2   1  |
748  *           |  19 205 116  29  64  16  4   1  |
749  *           |  1   0   0   0   0   0   0   0  |
750  *           |  0   1   0   0   0   0   0   0  |
751  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
752  *           |  0   0   0   1   0   0   0   0  |
753  *           |  0   0   0   0   1   0   0   0  |
754  *           |  0   0   0   0   0   1   0   0  |
755  *           |  0   0   0   0   0   0   1   0  |
756  *           |  0   0   0   0   0   0   0   1  |
757  *           ~~                               ~~
758  *
759  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
760  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
761  * matrix is not singular.
762  * __                                                                 __
763  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
764  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
765  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
766  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
767  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
768  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
769  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
770  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
771  * ~~                                                                 ~~
772  * __                                                                 __
773  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
774  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
775  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
776  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
777  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
778  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
779  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
780  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
781  * ~~                                                                 ~~
782  * __                                                                 __
783  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
784  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
785  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
786  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
787  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
788  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
789  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
790  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
791  * ~~                                                                 ~~
792  * __                                                                 __
793  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
794  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
795  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
796  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
797  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
798  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
799  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
800  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
801  * ~~                                                                 ~~
802  * __                                                                 __
803  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
804  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
805  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
806  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
807  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
808  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
809  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
810  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
811  * ~~                                                                 ~~
812  * __                                                                 __
813  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
814  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
815  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
816  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
817  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
818  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
819  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
820  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
821  * ~~                                                                 ~~
822  *                   __                               __
823  *                   |  0   0   1   0   0   0   0   0  |
824  *                   | 167 100  5   41 159 169 217 208 |
825  *                   | 166 100  4   40 158 168 216 209 |
826  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
827  *                   |  0   0   0   0   1   0   0   0  |
828  *                   |  0   0   0   0   0   1   0   0  |
829  *                   |  0   0   0   0   0   0   1   0  |
830  *                   |  0   0   0   0   0   0   0   1  |
831  *                   ~~                               ~~
832  *
833  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
834  * of the missing data.
835  *
836  * As is apparent from the example above, the only non-trivial rows in the
837  * inverse matrix correspond to the data disks that we're trying to
838  * reconstruct. Indeed, those are the only rows we need as the others would
839  * only be useful for reconstructing data known or assumed to be valid. For
840  * that reason, we only build the coefficients in the rows that correspond to
841  * targeted columns.
842  */
843 /* END CSTYLED */
844 
/*
 * Populate the rows of the reconstruction matrix that correspond to the
 * parity columns we will use.  Row i is built for parity column map[i]
 * (0 = P, 1 = Q, 2 = R); entry j of that row is 2^(map[i] * (n - 1 - j))
 * in GF(2^8), realized by walking an exponent down by map[i] per column
 * and indexing the vdev_raidz_pow2[] table.
 */
static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;	/* running exponent of 2, kept within [0, 255) */

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		/*
		 * Start at map[i] * n and reduce modulo 255, the order of
		 * the multiplicative group of GF(2^8); a single subtraction
		 * suffices since map[i] <= 2 and n < 256.
		 */
		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			/* Decrement first so entry j gets exponent map[i]*(n-1-j). */
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
874 
/*
 * Invert, via Gauss-Jordan elimination over GF(2^8), the matrix whose rows
 * were initialized by vdev_raidz_matrix_init().  On return, invrows[][]
 * holds the rows of the inverse that are needed to recompute the missing
 * data columns; the trailing asserts check that rows[][] has been reduced
 * to the corresponding rows of an identity matrix.  Addition in GF(2^8) is
 * XOR; multiplication is done through the log/exp tables.
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			/* Move the known-good column's coefficient across. */
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		/* Eliminate this pivot column from every other row. */
		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}
968 
/*
 * Multiply the inverted matrix (invrows, from vdev_raidz_matrix_invert())
 * by the surviving columns to regenerate the missing data columns in
 * place.  For speed, the logs of the inverse coefficients are precomputed
 * into invlog[][] so each byte multiply is a log-table add plus one
 * vdev_raidz_pow2[] lookup.  On malloc failure this simply returns without
 * reconstructing; the caller's subsequent checksum verification will fail.
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	log = 0;	/* gcc */
	/* One nmissing-by-n byte matrix holding the logs of invrows. */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = malloc(psize);
	if (p == NULL) {
		printf("Out of memory\n");
		return;
	}

	/* Carve the single allocation into nmissing row pointers. */
	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	/* Accumulate each surviving column into every missing column. */
	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				/* Columns may be shorter than this source. */
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					/* GF multiply: add logs mod 255. */
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/* First column seeds; the rest XOR in. */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	free(p);
}
1048 
/*
 * General matrix-based reconstruction: rebuild the targeted columns in
 * tgts[]/ntgts using whichever parity columns were not themselves
 * targeted.  Returns a bitmask ("code") of the parity columns used, which
 * the caller can inspect to see whether all parity participated.  On
 * malloc failure the code is returned without any reconstruction; the
 * caller's checksum verification will catch the unrepaired data.
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;


	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/*
	 * Single allocation carved into the rows, inverse rows, and the
	 * "used" column-index array.
	 */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = malloc(psize);
	if (p == NULL) {
		printf("Out of memory\n");
		return (code);
	}

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	/* Parity columns occupy the first nmissing_rows "used" slots... */
	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	/* ...followed by every surviving (non-missing) data column. */
	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	free(p);

	return (code);
}
1158 
1159 static int
1160 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1161 {
1162 	int tgts[VDEV_RAIDZ_MAXPARITY];
1163 	int ntgts;
1164 	int i, c;
1165 	int code;
1166 	int nbadparity, nbaddata;
1167 
1168 	/*
1169 	 * The tgts list must already be sorted.
1170 	 */
1171 	for (i = 1; i < nt; i++) {
1172 		ASSERT(t[i] > t[i - 1]);
1173 	}
1174 
1175 	nbadparity = rm->rm_firstdatacol;
1176 	nbaddata = rm->rm_cols - nbadparity;
1177 	ntgts = 0;
1178 	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1179 		if (i < nt && c == t[i]) {
1180 			tgts[ntgts++] = c;
1181 			i++;
1182 		} else if (rm->rm_col[c].rc_error != 0) {
1183 			tgts[ntgts++] = c;
1184 		} else if (c >= rm->rm_firstdatacol) {
1185 			nbaddata--;
1186 		} else {
1187 			nbadparity--;
1188 		}
1189 	}
1190 
1191 	ASSERT(ntgts >= nt);
1192 	ASSERT(nbaddata >= 0);
1193 	ASSERT(nbaddata + nbadparity == ntgts);
1194 
1195 	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1196 	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1197 	ASSERT(code > 0);
1198 	return (code);
1199 }
1200 
/*
 * Build the logical-to-physical map for one RAID-Z I/O: split the request
 * at 'offset' for 'size' bytes across 'dcols' children with 'nparity'
 * parity columns, each device sector being 1 << unit_shift bytes.  Parity
 * column buffers are allocated here; data columns point into the caller's
 * 'data' buffer.  Returns NULL on allocation failure.
 */
static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	uint64_t b = offset >> unit_shift;	/* starting sector */
	uint64_t s = size >> unit_shift;	/* sectors of data */
	uint64_t f = b % dcols;			/* first column of this row */
	uint64_t o = (b / dcols) << unit_shift;	/* byte offset on each child */
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	q = s / (dcols - nparity);	/* full rows of data */
	r = s - q * (dcols - nparity);	/* leftover data sectors */
	/* Columns that get one extra sector (leftovers plus their parity). */
	bc = (r == 0 ? 0 : r + nparity);
	/* Total sectors consumed, data plus parity. */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	if (q == 0) {
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = malloc(offsetof(raidz_map_t, rm_col[scols]));
	if (rm == NULL)
		return (rm);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		/* Wrap to the next row: next device index, next sector. */
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	/*
	 * Allocate buffers for the parity columns; on failure, unwind the
	 * allocations made so far (entries not yet reached are NULL, and
	 * free(NULL) is harmless).
	 */
	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rm->rm_col[c].rc_data = malloc(rm->rm_col[c].rc_size);
		if (rm->rm_col[c].rc_data == NULL) {
			c++;
			while (c != 0)
				free(rm->rm_col[--c].rc_data);
			free(rm);
			return (NULL);
		}
	}

	/* c == rm_firstdatacol here: data columns slice the caller's buffer. */
	rm->rm_col[c].rc_data = data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
1328 
1329 static void
1330 vdev_raidz_map_free(raidz_map_t *rm)
1331 {
1332 	int c;
1333 
1334 	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
1335 		free(rm->rm_col[c].rc_data);
1336 
1337 	free(rm);
1338 }
1339 
1340 static vdev_t *
1341 vdev_child(vdev_t *pvd, uint64_t devidx)
1342 {
1343 	vdev_t *cvd;
1344 
1345 	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
1346 		if (cvd->v_id == devidx)
1347 			break;
1348 	}
1349 
1350 	return (cvd);
1351 }
1352 
1353 /*
1354  * We keep track of whether or not there were any injected errors, so that
1355  * any ereports we generate can note it.
1356  */
static int
raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
    uint64_t size)
{
	/*
	 * Thin wrapper: verify the block-pointer checksum over 'data'.
	 * 'size' is accepted but unused here -- presumably retained so the
	 * signature mirrors the full ZFS implementation; confirm against
	 * upstream vdev_raidz.c.
	 */
	return (zio_checksum_verify(spa, bp, data));
}
1363 
1364 /*
1365  * Generate the parity from the data columns. If we tried and were able to
1366  * read the parity without error, verify that the generated parity matches the
1367  * data we read. If it doesn't, we fire off a checksum error. Return the
1368  * number such failures.
1369  */
static int
raidz_parity_verify(raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	/*
	 * Snapshot every parity column that was actually read without error
	 * so it can be compared against freshly generated parity below.  On
	 * malloc failure orig[c] stays NULL and that column is counted as a
	 * checksum failure rather than silently passing.
	 */
	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = malloc(rc->rc_size);
		if (orig[c] != NULL) {
			bcopy(rc->rc_data, orig[c], rc->rc_size);
		} else {
			printf("Out of memory\n");
		}
	}

	/* Regenerate parity in place from the (now trusted) data columns. */
	vdev_raidz_generate_parity(rm);

	/* Compare each saved copy with the regenerated parity. */
	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (orig[c] == NULL ||
		    bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			rc->rc_error = ECKSUM;
			ret++;
		}
		free(orig[c]);
	}

	return (ret);
}
1405 
1406 /*
1407  * Iterate over all combinations of bad data and attempt a reconstruction.
1408  * Note that the algorithm below is non-optimal because it doesn't take into
1409  * account how reconstruction is actually performed. For example, with
1410  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1411  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1412  * cases we'd only use parity information in column 0.
1413  */
static int
vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
    void *data, off_t offset, uint64_t bytes, int total_errors, int data_errors)
{
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	/* tstore has a sentinel slot below tgts[0] and one above tgts[n-1]. */
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	/* Try ever-larger sets of presumed-bad columns, one at a time. */
	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = malloc(rm->rm_col[0].rc_size);
		if (orig[n - 1] == NULL) {
			ret = ENOMEM;
			goto done;
		}

		current = 0;
		next = tgts[current];

		/* Enumerate every n-combination of non-errored columns. */
		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
				/* Mark the rebuilt columns as having failed. */
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			/* Advance to the next combination (odometer-style). */
			do {
				/*
				 * Find the next valid column after the current
				 * position..
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	/* All combinations exhausted: n buffers were allocated; ret == 0. */
	n--;
done:
	for (i = n - 1; i >= 0; i--) {
		free(orig[i]);
	}

	return (ret);
}
1560 
/*
 * Read a block through a RAID-Z vdev.  First issue reads for the data
 * columns (parity too, once any data is known missing), then try
 * progressively more expensive recovery phases: verify the assembled
 * data's checksum, reconstruct from parity, re-read every untried column,
 * and finally attempt combinatorial reconstruction.  Returns 0 on success
 * or EIO/ECKSUM/ENOMEM on failure.
 */
static int
vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
    off_t offset, size_t bytes)
{
	vdev_t *tvd = vd->v_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, error;
	int unexpected_errors;
	int parity_errors;
	int parity_untried;
	int data_errors;
	int total_errors;
	int n;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	rc = NULL;	/* gcc */
	error = 0;

	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
	    vd->v_nchildren, vd->v_nparity);
	if (rm == NULL)
		return (ENOMEM);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vdev_child(vd, rc->rc_devidx);
		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
#if 0		/* XXX: Too hard for the boot code. */
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
#endif
		/* Read data columns always; parity only if data is missing. */
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
			    rc->rc_offset, rc->rc_size);
			rc->rc_tried = 1;
			rc->rc_skipped = 0;
		}
	}

reconstruct:
	/* Re-tally the per-column error state on every pass. */
	unexpected_errors = 0;
	parity_errors = 0;
	parity_untried = 0;
	data_errors = 0;
	total_errors = 0;

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		int rv;

		if (data_errors == 0) {
			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
			if (rv == 0) {
				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read
	 * error or a child silently returned bad data. Read every
	 * block so we can try again with as much data and parity as
	 * we can track down. If we've already been through once
	 * before, all children will be marked as tried so we'll
	 * proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	n = 0;
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_tried)
			continue;

		cvd = vdev_child(vd, rc->rc_devidx);
		ASSERT(cvd != NULL);
		rc->rc_error = cvd->v_read(cvd, NULL,
		    rc->rc_data, rc->rc_offset, rc->rc_size);
		if (rc->rc_error == 0)
			n++;
		rc->rc_tried = 1;
		rc->rc_skipped = 0;
	}
	/*
	 * If we managed to read anything more, retry the
	 * reconstruction.
	 */
	if (n > 0)
		goto reconstruct;

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		error = EIO;
	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(vd->v_spa, rm, bp, data, offset, bytes,
	     total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_first_datacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		error = ECKSUM;
	}

done:
	vdev_raidz_map_free(rm);

	return (error);
}
1825