xref: /freebsd/sys/cddl/boot/zfs/zfssubr.c (revision b214fcceacad6b842545150664bd2695c1c2b34f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/cdefs.h>
27 __FBSDID("$FreeBSD$");
28 
29 #include <lz4.h>
30 
/*
 * CRC-64 lookup table read by zap_hash() and populated by zfs_init_crc().
 */
31 static uint64_t zfs_crc64_table[256];
32 
/*
 * Every assertion macro expands to a no-op here, so its arguments are never
 * evaluated: do not rely on side effects inside an ASSERT*().
 */
33 #define	ASSERT3S(x, y, z)	((void)0)
34 #define	ASSERT3U(x, y, z)	((void)0)
35 #define	ASSERT3P(x, y, z)	((void)0)
36 #define	ASSERT0(x)		((void)0)
37 #define	ASSERT(x)		((void)0)
38 
/*
 * panic() prints its printf-style arguments and then spins forever; it never
 * returns to the caller.
 */
39 #define	panic(...)	do {						\
40 	printf(__VA_ARGS__);						\
41 	for (;;) ;							\
42 } while (0)
43 
44 static void
45 zfs_init_crc(void)
46 {
47 	int i, j;
48 	uint64_t *ct;
49 
50 	/*
51 	 * Calculate the crc64 table (used for the zap hash
52 	 * function).
53 	 */
54 	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
55 		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
56 		for (i = 0; i < 256; i++)
57 			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
58 				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
59 	}
60 }
61 
62 static void
63 zio_checksum_off(const void *buf, uint64_t size,
64     const void *ctx_template, zio_cksum_t *zcp)
65 {
66 	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
67 }
68 
69 /*
70  * Signature for checksum functions.
71  */
/*
 * A zio_checksum_t checksums size bytes at data and stores the result through
 * zcp.  ctx_template is the value kept in spa_cksum_tmpls[] for the algorithm
 * and is NULL when no template was set up.
 */
72 typedef void zio_checksum_t(const void *data, uint64_t size,
73     const void *ctx_template, zio_cksum_t *zcp);
/* Creates a context template; it is called with the pool's checksum salt. */
74 typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
/* Releases a context template previously stored in spa_cksum_tmpls[]. */
75 typedef void zio_checksum_tmpl_free_t(void *ctx_template);
76 
/*
 * Property bits kept in zio_checksum_info_t.ci_flags.  Note that the first
 * bit in use is bit 1, not bit 0.
 */
77 typedef enum zio_checksum_flags {
78 	/* Strong enough for metadata? */
79 	ZCHECKSUM_FLAG_METADATA = (1 << 1),
80 	/* ZIO embedded checksum */
81 	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
82 	/* Strong enough for dedup (without verification)? */
83 	ZCHECKSUM_FLAG_DEDUP = (1 << 3),
84 	/* Uses salt value */
85 	ZCHECKSUM_FLAG_SALTED = (1 << 4),
86 	/* Strong enough for nopwrite? */
87 	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
88 } zio_checksum_flags_t;
89 
90 /*
91  * Information about each checksum function.
92  */
93 typedef struct zio_checksum_info {
94 	/* checksum function for each byteorder */
	/* (zio_checksum_verify() indexes it with its 0/1 byteswap flag) */
95 	zio_checksum_t			*ci_func[2];
	/* optional template hooks; NULL when the algorithm has no template */
96 	zio_checksum_tmpl_init_t	*ci_tmpl_init;
97 	zio_checksum_tmpl_free_t	*ci_tmpl_free;
98 	zio_checksum_flags_t		ci_flags;	/* ZCHECKSUM_FLAG_* bits */
99 	const char			*ci_name;	/* descriptive name */
100 } zio_checksum_info_t;
101 
102 #include "blkptr.c"
103 
104 #include "fletcher.c"
105 #include "sha256.c"
106 #include "skein_zfs.c"
107 
108 #ifdef HAS_ZSTD_ZFS
109 extern int zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len,
110     size_t d_len, int n);
111 #endif
112 
113 static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
114 	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
115 	{{NULL, NULL}, NULL, NULL, 0, "on"},
116 	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL, 0, "off"},
117 	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
118 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
119 	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
120 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
121 	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
122 	    ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
123 	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
124 	    0, "fletcher2"},
125 	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
126 	    ZCHECKSUM_FLAG_METADATA, "fletcher4"},
127 	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
128 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
129 	    ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
130 	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
131 	    ZCHECKSUM_FLAG_EMBEDDED, "zillog2"},
132 	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL,
133 	    0, "noparity"},
134 	{{zio_checksum_SHA512_native,	zio_checksum_SHA512_byteswap},
135 	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
136 	    ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
137 	{{zio_checksum_skein_native, zio_checksum_skein_byteswap},
138 	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
139 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
140 	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
141 	/* no edonr for now */
142 	{{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA |
143 	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"}
144 };
145 
146 /*
147  * Common signature for all zio compress/decompress functions.
148  */
/*
 * src/s_len describe the input buffer and dst/d_len the output buffer.  The
 * trailing unnamed int receives the table entry's ci_level when called from
 * zio_decompress_data(), which returns the decompressor's int result as-is.
 */
149 typedef size_t zio_compress_func_t(void *src, void *dst,
150     size_t s_len, size_t d_len, int);
151 typedef int zio_decompress_func_t(void *src, void *dst,
152     size_t s_len, size_t d_len, int);
153 
154 /*
155  * Information about each compression function.
156  */
/*
 * A NULL ci_decompress marks an algorithm that zio_decompress_data() reports
 * as unsupported.
 */
157 typedef struct zio_compress_info {
158 	zio_compress_func_t	*ci_compress;	/* compression function */
159 	zio_decompress_func_t	*ci_decompress;	/* decompression function */
160 	int			ci_level;	/* level parameter */
161 	const char		*ci_name;	/* algorithm name */
162 } zio_compress_info_t;
163 
164 #include "lzjb.c"
165 #include "zle.c"
166 
167 /*
168  * Compression vectors.
169  */
/*
 * Indexed by the cpfunc value handed to zio_decompress_data().  No entry
 * provides a compressor; only lzjb, zle, lz4 and (when HAS_ZSTD_ZFS is
 * defined) zstd provide a decompressor.  Slots past the last initializer are
 * zero-filled, so their ci_name is NULL.
 */
170 static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
171 	{NULL,			NULL,			0,	"inherit"},
172 	{NULL,			NULL,			0,	"on"},
173 	{NULL,			NULL,			0,	"uncompressed"},
174 	{NULL,			lzjb_decompress,	0,	"lzjb"},
175 	{NULL,			NULL,			0,	"empty"},
176 	{NULL,			NULL,			1,	"gzip-1"},
177 	{NULL,			NULL,			2,	"gzip-2"},
178 	{NULL,			NULL,			3,	"gzip-3"},
179 	{NULL,			NULL,			4,	"gzip-4"},
180 	{NULL,			NULL,			5,	"gzip-5"},
181 	{NULL,			NULL,			6,	"gzip-6"},
182 	{NULL,			NULL,			7,	"gzip-7"},
183 	{NULL,			NULL,			8,	"gzip-8"},
184 	{NULL,			NULL,			9,	"gzip-9"},
185 	{NULL,			zle_decompress,		64,	"zle"},
186 	{NULL,			lz4_decompress,		0,	"lz4"},
187 #ifdef HAS_ZSTD_ZFS
188 	{NULL,			zfs_zstd_decompress, ZIO_ZSTD_LEVEL_DEFAULT, "zstd"}
189 #endif
190 };
191 
/*
 * Byte-swap, in place, every 64-bit word of the buffer at vbuf.
 *
 * size is the buffer length in bytes and is expected to be a multiple of 8;
 * any trailing partial word is left untouched.
 */
static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	size_t i;	/* same type as count: no signed/unsigned mismatch */

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}
204 
205 /*
206  * Set the external verifier for a gang block based on <vdev, offset, txg>,
207  * a tuple which is guaranteed to be unique for the life of the pool.
208  */
209 static void
210 zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
211 {
212 	const dva_t *dva = BP_IDENTITY(bp);
213 	uint64_t txg = BP_PHYSICAL_BIRTH(bp);
214 
215 	ASSERT(BP_IS_GANG(bp));
216 
217 	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
218 }
219 
220 /*
221  * Set the external verifier for a label block based on its offset.
222  * The vdev is implicit, and the txg is unknowable at pool open time --
223  * hence the logic in vdev_uberblock_load() to find the most recent copy.
224  */
/*
 * zcp receives the verifier; offset is the value zio_checksum_verify() reads
 * from the block pointer's identity DVA.
 */
225 static void
226 zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
227 {
228 	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);	/* words 1-3 are zero */
229 }
230 
231 /*
232  * Calls the template init function of a checksum which supports context
233  * templates and installs the template into the spa_t.
234  */
235 static void
236 zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
237 {
238 	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
239 
240 	if (ci->ci_tmpl_init == NULL)
241 		return;
242 
243 	if (spa->spa_cksum_tmpls[checksum] != NULL)
244 		return;
245 
246 	if (spa->spa_cksum_tmpls[checksum] == NULL) {
247 		spa->spa_cksum_tmpls[checksum] =
248 		    ci->ci_tmpl_init(&spa->spa_cksum_salt);
249 	}
250 }
251 
252 /*
253  * Called by a spa_t that's about to be deallocated. This steps through
254  * all of the checksum context templates and deallocates any that were
255  * initialized using the algorithm-specific template init function.
256  */
257 static void __unused
258 zio_checksum_templates_free(spa_t *spa)
259 {
260 	for (enum zio_checksum checksum = 0;
261 	    checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
262 		if (spa->spa_cksum_tmpls[checksum] != NULL) {
263 			zio_checksum_info_t *ci = &zio_checksum_table[checksum];
264 
265 			ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
266 			spa->spa_cksum_tmpls[checksum] = NULL;
267 		}
268 	}
269 }
270 
271 static int
272 zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
273 {
274 	uint64_t size;
275 	unsigned int checksum;
276 	zio_checksum_info_t *ci;
277 	void *ctx = NULL;
278 	zio_cksum_t actual_cksum, expected_cksum, verifier;
279 	int byteswap;
280 
281 	checksum = BP_GET_CHECKSUM(bp);
282 	size = BP_GET_PSIZE(bp);
283 
284 	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
285 		return (EINVAL);
286 	ci = &zio_checksum_table[checksum];
287 	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
288 		return (EINVAL);
289 
290 	if (spa != NULL) {
291 		zio_checksum_template_init(checksum, __DECONST(spa_t *,spa));
292 		ctx = spa->spa_cksum_tmpls[checksum];
293 	}
294 
295 	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
296 		zio_eck_t *eck;
297 
298 		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
299 		    checksum == ZIO_CHECKSUM_LABEL);
300 
301 		eck = (zio_eck_t *)((char *)data + size) - 1;
302 
303 		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
304 			zio_checksum_gang_verifier(&verifier, bp);
305 		else if (checksum == ZIO_CHECKSUM_LABEL)
306 			zio_checksum_label_verifier(&verifier,
307 			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
308 		else
309 			verifier = bp->blk_cksum;
310 
311 		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
312 
313 		if (byteswap)
314 			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
315 
316 		expected_cksum = eck->zec_cksum;
317 		eck->zec_cksum = verifier;
318 		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
319 		eck->zec_cksum = expected_cksum;
320 
321 		if (byteswap)
322 			byteswap_uint64_array(&expected_cksum,
323 			    sizeof (zio_cksum_t));
324 	} else {
325 		byteswap = BP_SHOULD_BYTESWAP(bp);
326 		expected_cksum = bp->blk_cksum;
327 		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
328 	}
329 
330 	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
331 		/*printf("ZFS: read checksum %s failed\n", ci->ci_name);*/
332 		return (EIO);
333 	}
334 
335 	return (0);
336 }
337 
338 static int
339 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
340 	void *dest, uint64_t destsize)
341 {
342 	zio_compress_info_t *ci;
343 
344 	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
345 		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
346 		return (EIO);
347 	}
348 
349 	ci = &zio_compress_table[cpfunc];
350 	if (!ci->ci_decompress) {
351 		printf("ZFS: unsupported compression algorithm %s\n",
352 		    ci->ci_name);
353 		return (EIO);
354 	}
355 
356 	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
357 }
358 
359 static uint64_t
360 zap_hash(uint64_t salt, const char *name)
361 {
362 	const uint8_t *cp;
363 	uint8_t c;
364 	uint64_t crc = salt;
365 
366 	ASSERT(crc != 0);
367 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
368 	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
369 		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
370 
371 	/*
372 	 * Only use 28 bits, since we need 4 bits in the cookie for the
373 	 * collision differentiator.  We MUST use the high bits, since
374 	 * those are the onces that we first pay attention to when
375 	 * chosing the bucket.
376 	 */
377 	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
378 
379 	return (crc);
380 }
381 
/*
 * One column of a RAID-Z I/O map.  rc_data is accessed as an array of 64-bit
 * words by the parity generators, and rc_size is its length in bytes.
 */
382 typedef struct raidz_col {
383 	uint64_t rc_devidx;		/* child device index for I/O */
384 	uint64_t rc_offset;		/* device offset */
385 	uint64_t rc_size;		/* I/O size */
386 	void *rc_data;			/* I/O data */
387 	int rc_error;			/* I/O error for this device */
388 	uint8_t rc_tried;		/* Did we attempt this I/O column? */
389 	uint8_t rc_skipped;		/* Did we skip this I/O column? */
390 } raidz_col_t;
391 
/*
 * Layout of one RAID-Z I/O.  Columns [0, rm_firstdatacol) hold parity and
 * columns [rm_firstdatacol, rm_cols) hold data.
 * NOTE(review): rm_col is declared with one element but is indexed up to
 * rm_cols; presumably the map is over-allocated where it is created --
 * confirm against the allocation code.
 */
392 typedef struct raidz_map {
393 	uint64_t rm_cols;		/* Regular column count */
394 	uint64_t rm_scols;		/* Count including skipped columns */
395 	uint64_t rm_bigcols;		/* Number of oversized columns */
396 	uint64_t rm_asize;		/* Actual total I/O size */
397 	uint64_t rm_missingdata;	/* Count of missing data devices */
398 	uint64_t rm_missingparity;	/* Count of missing parity devices */
399 	uint64_t rm_firstdatacol;	/* First data column/parity count */
400 	uint64_t rm_nskip;		/* Skipped sectors for padding */
401 	uint64_t rm_skipstart;		/* Column index of padding start */
402 	uintptr_t rm_reports;		/* # of referencing checksum reports */
403 	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
404 	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
405 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
406 } raidz_map_t;
407 
/* Indices of the P, Q and R parity columns within raidz_map_t.rm_col[]. */
408 #define	VDEV_RAIDZ_P		0
409 #define	VDEV_RAIDZ_Q		1
410 #define	VDEV_RAIDZ_R		2
411 
/*
 * Multiply a single byte by 2 (or 4): shift left and XOR in 0x1d when the
 * top bit was set.  The result is not truncated, so the caller must narrow
 * it back to 8 bits.  x is evaluated more than once.
 */
412 #define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
413 #define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
414 
415 /*
416  * We provide a mechanism to perform the field multiplication operation on a
417  * 64-bit value all at once rather than a byte at a time. This works by
418  * creating a mask from the top bit in each byte and using that to
419  * conditionally apply the XOR of 0x1d.
420  */
/*
 * Both macros update x in place and use mask as a scratch lvalue.  They
 * expand to a brace block rather than an expression, and x is evaluated
 * several times, so it must be free of side effects.
 */
421 #define	VDEV_RAIDZ_64MUL_2(x, mask) \
422 { \
423 	(mask) = (x) & 0x8080808080808080ULL; \
424 	(mask) = ((mask) << 1) - ((mask) >> 7); \
425 	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
426 	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
427 }
428 
429 #define	VDEV_RAIDZ_64MUL_4(x, mask) \
430 { \
431 	VDEV_RAIDZ_64MUL_2((x), mask); \
432 	VDEV_RAIDZ_64MUL_2((x), mask); \
433 }
434 
435 /*
436  * These two tables represent powers and logs of 2 in the Galois field defined
437  * above. These values were computed by repeatedly multiplying by 2 as above.
438  */
/*
 * vdev_raidz_pow2[i] is 2 raised to the power i; the final entry (index 255)
 * is 0x01 again, matching index 0.
 */
439 static const uint8_t vdev_raidz_pow2[256] = {
440 	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
441 	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
442 	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
443 	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
444 	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
445 	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
446 	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
447 	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
448 	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
449 	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
450 	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
451 	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
452 	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
453 	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
454 	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
455 	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
456 	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
457 	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
458 	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
459 	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
460 	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
461 	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
462 	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
463 	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
464 	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
465 	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
466 	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
467 	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
468 	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
469 	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
470 	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
471 	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
472 };
/*
 * vdev_raidz_log2[v] is the exponent i for which vdev_raidz_pow2[i] == v.
 * Entry 0 is a placeholder: zero has no logarithm, and vdev_raidz_exp2()
 * handles a zero operand before consulting this table.
 */
473 static const uint8_t vdev_raidz_log2[256] = {
474 	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
475 	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
476 	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
477 	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
478 	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
479 	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
480 	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
481 	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
482 	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
483 	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
484 	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
485 	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
486 	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
487 	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
488 	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
489 	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
490 	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
491 	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
492 	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
493 	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
494 	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
495 	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
496 	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
497 	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
498 	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
499 	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
500 	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
501 	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
502 	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
503 	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
504 	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
505 	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
506 };
507 
508 /*
509  * Multiply a given number by 2 raised to the given power.
510  */
511 static uint8_t
512 vdev_raidz_exp2(uint8_t a, int exp)
513 {
514 	if (a == 0)
515 		return (0);
516 
517 	ASSERT(exp >= 0);
518 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
519 
520 	exp += vdev_raidz_log2[a];
521 	if (exp > 255)
522 		exp -= 255;
523 
524 	return (vdev_raidz_pow2[exp]);
525 }
526 
527 static void
528 vdev_raidz_generate_parity_p(raidz_map_t *rm)
529 {
530 	uint64_t *p, *src, ccount, i;
531 	uint64_t pcount __unused;
532 	int c;
533 
534 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
535 
536 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
537 		src = rm->rm_col[c].rc_data;
538 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
539 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
540 
541 		if (c == rm->rm_firstdatacol) {
542 			ASSERT(ccount == pcount);
543 			for (i = 0; i < ccount; i++, src++, p++) {
544 				*p = *src;
545 			}
546 		} else {
547 			ASSERT(ccount <= pcount);
548 			for (i = 0; i < ccount; i++, src++, p++) {
549 				*p ^= *src;
550 			}
551 		}
552 	}
553 }
554 
555 static void
556 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
557 {
558 	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
559 	int c;
560 
561 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
562 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
563 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
564 
565 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
566 		src = rm->rm_col[c].rc_data;
567 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
568 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
569 
570 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
571 
572 		if (c == rm->rm_firstdatacol) {
573 			ASSERT(ccnt == pcnt || ccnt == 0);
574 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
575 				*p = *src;
576 				*q = *src;
577 			}
578 			for (; i < pcnt; i++, src++, p++, q++) {
579 				*p = 0;
580 				*q = 0;
581 			}
582 		} else {
583 			ASSERT(ccnt <= pcnt);
584 
585 			/*
586 			 * Apply the algorithm described above by multiplying
587 			 * the previous result and adding in the new value.
588 			 */
589 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
590 				*p ^= *src;
591 
592 				VDEV_RAIDZ_64MUL_2(*q, mask);
593 				*q ^= *src;
594 			}
595 
596 			/*
597 			 * Treat short columns as though they are full of 0s.
598 			 * Note that there's therefore nothing needed for P.
599 			 */
600 			for (; i < pcnt; i++, q++) {
601 				VDEV_RAIDZ_64MUL_2(*q, mask);
602 			}
603 		}
604 	}
605 }
606 
607 static void
608 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
609 {
610 	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
611 	int c;
612 
613 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
614 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
615 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
616 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
617 	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
618 
619 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
620 		src = rm->rm_col[c].rc_data;
621 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
622 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
623 		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
624 
625 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
626 
627 		if (c == rm->rm_firstdatacol) {
628 			ASSERT(ccnt == pcnt || ccnt == 0);
629 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
630 				*p = *src;
631 				*q = *src;
632 				*r = *src;
633 			}
634 			for (; i < pcnt; i++, src++, p++, q++, r++) {
635 				*p = 0;
636 				*q = 0;
637 				*r = 0;
638 			}
639 		} else {
640 			ASSERT(ccnt <= pcnt);
641 
642 			/*
643 			 * Apply the algorithm described above by multiplying
644 			 * the previous result and adding in the new value.
645 			 */
646 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
647 				*p ^= *src;
648 
649 				VDEV_RAIDZ_64MUL_2(*q, mask);
650 				*q ^= *src;
651 
652 				VDEV_RAIDZ_64MUL_4(*r, mask);
653 				*r ^= *src;
654 			}
655 
656 			/*
657 			 * Treat short columns as though they are full of 0s.
658 			 * Note that there's therefore nothing needed for P.
659 			 */
660 			for (; i < pcnt; i++, q++, r++) {
661 				VDEV_RAIDZ_64MUL_2(*q, mask);
662 				VDEV_RAIDZ_64MUL_4(*r, mask);
663 			}
664 		}
665 	}
666 }
667 
668 /*
669  * Generate RAID parity in the first virtual columns according to the number of
670  * parity columns available.
671  */
672 static void
673 vdev_raidz_generate_parity(raidz_map_t *rm)
674 {
675 	switch (rm->rm_firstdatacol) {
676 	case 1:
677 		vdev_raidz_generate_parity_p(rm);
678 		break;
679 	case 2:
680 		vdev_raidz_generate_parity_pq(rm);
681 		break;
682 	case 3:
683 		vdev_raidz_generate_parity_pqr(rm);
684 		break;
685 	default:
686 		panic("invalid RAID-Z configuration");
687 	}
688 }
689 
690 /* BEGIN CSTYLED */
691 /*
692  * In the general case of reconstruction, we must solve the system of linear
693  * equations defined by the coefficients used to generate parity as well as
694  * the contents of the data and parity disks. This can be expressed with
695  * vectors for the original data (D) and the actual data (d) and parity (p)
696  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
697  *
698  *            __   __                     __     __
699  *            |     |         __     __   |  p_0  |
700  *            |  V  |         |  D_0  |   | p_m-1 |
701  *            |     |    x    |   :   | = |  d_0  |
702  *            |  I  |         | D_n-1 |   |   :   |
703  *            |     |         ~~     ~~   | d_n-1 |
704  *            ~~   ~~                     ~~     ~~
705  *
706  * I is simply a square identity matrix of size n, and V is a vandermonde
707  * matrix defined by the coefficients we chose for the various parity columns
708  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
709  * computation as well as linear separability.
710  *
711  *      __               __               __     __
712  *      |   1   ..  1 1 1 |               |  p_0  |
713  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
714  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
715  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
716  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
717  *      |   :       : : : |   |   :   |   |  d_2  |
718  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
719  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
720  *      |   0   ..  0 0 1 |               | d_n-1 |
721  *      ~~               ~~               ~~     ~~
722  *
723  * Note that I, V, d, and p are known. To compute D, we must invert the
724  * matrix and use the known data and parity values to reconstruct the unknown
725  * data values. We begin by removing the rows in V|I and d|p that correspond
726  * to failed or missing columns; we then make V|I square (n x n) and d|p
727  * sized n by removing rows corresponding to unused parity from the bottom up
728  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
729  * using Gauss-Jordan elimination. In the example below we use m=3 parity
730  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
731  *           __                               __
732  *           |  1   1   1   1   1   1   1   1  |
733  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
734  *           |  19 205 116  29  64  16  4   1  |      / /
735  *           |  1   0   0   0   0   0   0   0  |     / /
736  *           |  0   1   0   0   0   0   0   0  | <--' /
737  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
738  *           |  0   0   0   1   0   0   0   0  |
739  *           |  0   0   0   0   1   0   0   0  |
740  *           |  0   0   0   0   0   1   0   0  |
741  *           |  0   0   0   0   0   0   1   0  |
742  *           |  0   0   0   0   0   0   0   1  |
743  *           ~~                               ~~
744  *           __                               __
745  *           |  1   1   1   1   1   1   1   1  |
747  *           |  19 205 116  29  64  16  4   1  |
748  *           |  1   0   0   0   0   0   0   0  |
751  *  (V|I)' = |  0   0   0   1   0   0   0   0  |
752  *           |  0   0   0   0   1   0   0   0  |
753  *           |  0   0   0   0   0   1   0   0  |
754  *           |  0   0   0   0   0   0   1   0  |
755  *           |  0   0   0   0   0   0   0   1  |
756  *           ~~                               ~~
757  *
758  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
759  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
760  * matrix is not singular.
761  * __                                                                 __
762  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
763  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
764  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
765  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
766  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
767  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
768  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
769  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
770  * ~~                                                                 ~~
771  * __                                                                 __
772  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
773  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
774  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
775  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
776  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
777  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
778  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
779  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
780  * ~~                                                                 ~~
781  * __                                                                 __
782  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
783  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
784  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
785  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
786  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
787  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
788  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
789  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
790  * ~~                                                                 ~~
791  * __                                                                 __
792  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
793  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
794  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
795  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
796  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
797  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
798  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
799  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
800  * ~~                                                                 ~~
801  * __                                                                 __
802  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
803  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
804  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
805  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
806  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
807  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
808  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
809  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
810  * ~~                                                                 ~~
811  * __                                                                 __
812  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
813  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
814  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
815  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
816  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
817  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
818  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
819  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
820  * ~~                                                                 ~~
821  *                   __                               __
822  *                   |  0   0   1   0   0   0   0   0  |
823  *                   | 167 100  5   41 159 169 217 208 |
824  *                   | 166 100  4   40 158 168 216 209 |
825  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
826  *                   |  0   0   0   0   1   0   0   0  |
827  *                   |  0   0   0   0   0   1   0   0  |
828  *                   |  0   0   0   0   0   0   1   0  |
829  *                   |  0   0   0   0   0   0   0   1  |
830  *                   ~~                               ~~
831  *
832  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
833  * of the missing data.
834  *
835  * As is apparent from the example above, the only non-trivial rows in the
836  * inverse matrix correspond to the data disks that we're trying to
837  * reconstruct. Indeed, those are the only rows we need as the others would
838  * only be useful for reconstructing data known or assumed to be valid. For
839  * that reason, we only build the coefficients in the rows that correspond to
840  * targeted columns.
841  */
842 /* END CSTYLED */
843 
844 static void
845 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
846     uint8_t **rows)
847 {
848 	int i, j;
849 	int pow;
850 
851 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
852 
853 	/*
854 	 * Fill in the missing rows of interest.
855 	 */
856 	for (i = 0; i < nmap; i++) {
857 		ASSERT3S(0, <=, map[i]);
858 		ASSERT3S(map[i], <=, 2);
859 
860 		pow = map[i] * n;
861 		if (pow > 255)
862 			pow -= 255;
863 		ASSERT(pow <= 255);
864 
865 		for (j = 0; j < n; j++) {
866 			pow -= map[i];
867 			if (pow < 0)
868 				pow += 255;
869 			rows[i][j] = vdev_raidz_pow2[pow];
870 		}
871 	}
872 }
873 
874 static void
875 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
876     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
877 {
878 	int i, j, ii, jj;
879 	uint8_t log;
880 
881 	/*
882 	 * Assert that the first nmissing entries from the array of used
883 	 * columns correspond to parity columns and that subsequent entries
884 	 * correspond to data columns.
885 	 */
886 	for (i = 0; i < nmissing; i++) {
887 		ASSERT3S(used[i], <, rm->rm_firstdatacol);
888 	}
889 	for (; i < n; i++) {
890 		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
891 	}
892 
893 	/*
894 	 * First initialize the storage where we'll compute the inverse rows.
895 	 */
896 	for (i = 0; i < nmissing; i++) {
897 		for (j = 0; j < n; j++) {
898 			invrows[i][j] = (i == j) ? 1 : 0;
899 		}
900 	}
901 
902 	/*
903 	 * Subtract all trivial rows from the rows of consequence.
904 	 */
905 	for (i = 0; i < nmissing; i++) {
906 		for (j = nmissing; j < n; j++) {
907 			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
908 			jj = used[j] - rm->rm_firstdatacol;
909 			ASSERT3S(jj, <, n);
910 			invrows[i][j] = rows[i][jj];
911 			rows[i][jj] = 0;
912 		}
913 	}
914 
915 	/*
916 	 * For each of the rows of interest, we must normalize it and subtract
917 	 * a multiple of it from the other rows.
918 	 */
919 	for (i = 0; i < nmissing; i++) {
920 		for (j = 0; j < missing[i]; j++) {
921 			ASSERT3U(rows[i][j], ==, 0);
922 		}
923 		ASSERT3U(rows[i][missing[i]], !=, 0);
924 
925 		/*
926 		 * Compute the inverse of the first element and multiply each
927 		 * element in the row by that value.
928 		 */
929 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
930 
931 		for (j = 0; j < n; j++) {
932 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
933 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
934 		}
935 
936 		for (ii = 0; ii < nmissing; ii++) {
937 			if (i == ii)
938 				continue;
939 
940 			ASSERT3U(rows[ii][missing[i]], !=, 0);
941 
942 			log = vdev_raidz_log2[rows[ii][missing[i]]];
943 
944 			for (j = 0; j < n; j++) {
945 				rows[ii][j] ^=
946 				    vdev_raidz_exp2(rows[i][j], log);
947 				invrows[ii][j] ^=
948 				    vdev_raidz_exp2(invrows[i][j], log);
949 			}
950 		}
951 	}
952 
953 	/*
954 	 * Verify that the data that is left in the rows are properly part of
955 	 * an identity matrix.
956 	 */
957 	for (i = 0; i < nmissing; i++) {
958 		for (j = 0; j < n; j++) {
959 			if (j == missing[i]) {
960 				ASSERT3U(rows[i][j], ==, 1);
961 			} else {
962 				ASSERT3U(rows[i][j], ==, 0);
963 			}
964 		}
965 	}
966 }
967 
968 static void
969 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
970     int *missing, uint8_t **invrows, const uint8_t *used)
971 {
972 	int i, j, x, cc, c;
973 	uint8_t *src;
974 	uint64_t ccount;
975 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
976 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
977 	uint8_t log, val;
978 	int ll;
979 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
980 	uint8_t *p, *pp;
981 	size_t psize;
982 
983 	log = 0;	/* gcc */
984 	psize = sizeof (invlog[0][0]) * n * nmissing;
985 	p = malloc(psize);
986 	if (p == NULL) {
987 		printf("Out of memory\n");
988 		return;
989 	}
990 
991 	for (pp = p, i = 0; i < nmissing; i++) {
992 		invlog[i] = pp;
993 		pp += n;
994 	}
995 
996 	for (i = 0; i < nmissing; i++) {
997 		for (j = 0; j < n; j++) {
998 			ASSERT3U(invrows[i][j], !=, 0);
999 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1000 		}
1001 	}
1002 
1003 	for (i = 0; i < n; i++) {
1004 		c = used[i];
1005 		ASSERT3U(c, <, rm->rm_cols);
1006 
1007 		src = rm->rm_col[c].rc_data;
1008 		ccount = rm->rm_col[c].rc_size;
1009 		for (j = 0; j < nmissing; j++) {
1010 			cc = missing[j] + rm->rm_firstdatacol;
1011 			ASSERT3U(cc, >=, rm->rm_firstdatacol);
1012 			ASSERT3U(cc, <, rm->rm_cols);
1013 			ASSERT3U(cc, !=, c);
1014 
1015 			dst[j] = rm->rm_col[cc].rc_data;
1016 			dcount[j] = rm->rm_col[cc].rc_size;
1017 		}
1018 
1019 		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1020 
1021 		for (x = 0; x < ccount; x++, src++) {
1022 			if (*src != 0)
1023 				log = vdev_raidz_log2[*src];
1024 
1025 			for (cc = 0; cc < nmissing; cc++) {
1026 				if (x >= dcount[cc])
1027 					continue;
1028 
1029 				if (*src == 0) {
1030 					val = 0;
1031 				} else {
1032 					if ((ll = log + invlog[cc][i]) >= 255)
1033 						ll -= 255;
1034 					val = vdev_raidz_pow2[ll];
1035 				}
1036 
1037 				if (i == 0)
1038 					dst[cc][x] = val;
1039 				else
1040 					dst[cc][x] ^= val;
1041 			}
1042 		}
1043 	}
1044 
1045 	free(p);
1046 }
1047 
1048 static int
1049 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1050 {
1051 	int n, i, c, t, tt;
1052 	int nmissing_rows;
1053 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1054 	int parity_map[VDEV_RAIDZ_MAXPARITY];
1055 
1056 	uint8_t *p, *pp;
1057 	size_t psize;
1058 
1059 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1060 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1061 	uint8_t *used;
1062 
1063 	int code = 0;
1064 
1065 
1066 	n = rm->rm_cols - rm->rm_firstdatacol;
1067 
1068 	/*
1069 	 * Figure out which data columns are missing.
1070 	 */
1071 	nmissing_rows = 0;
1072 	for (t = 0; t < ntgts; t++) {
1073 		if (tgts[t] >= rm->rm_firstdatacol) {
1074 			missing_rows[nmissing_rows++] =
1075 			    tgts[t] - rm->rm_firstdatacol;
1076 		}
1077 	}
1078 
1079 	/*
1080 	 * Figure out which parity columns to use to help generate the missing
1081 	 * data columns.
1082 	 */
1083 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1084 		ASSERT(tt < ntgts);
1085 		ASSERT(c < rm->rm_firstdatacol);
1086 
1087 		/*
1088 		 * Skip any targeted parity columns.
1089 		 */
1090 		if (c == tgts[tt]) {
1091 			tt++;
1092 			continue;
1093 		}
1094 
1095 		code |= 1 << c;
1096 
1097 		parity_map[i] = c;
1098 		i++;
1099 	}
1100 
1101 	ASSERT(code != 0);
1102 	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1103 
1104 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1105 	    nmissing_rows * n + sizeof (used[0]) * n;
1106 	p = malloc(psize);
1107 	if (p == NULL) {
1108 		printf("Out of memory\n");
1109 		return (code);
1110 	}
1111 
1112 	for (pp = p, i = 0; i < nmissing_rows; i++) {
1113 		rows[i] = pp;
1114 		pp += n;
1115 		invrows[i] = pp;
1116 		pp += n;
1117 	}
1118 	used = pp;
1119 
1120 	for (i = 0; i < nmissing_rows; i++) {
1121 		used[i] = parity_map[i];
1122 	}
1123 
1124 	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1125 		if (tt < nmissing_rows &&
1126 		    c == missing_rows[tt] + rm->rm_firstdatacol) {
1127 			tt++;
1128 			continue;
1129 		}
1130 
1131 		ASSERT3S(i, <, n);
1132 		used[i] = c;
1133 		i++;
1134 	}
1135 
1136 	/*
1137 	 * Initialize the interesting rows of the matrix.
1138 	 */
1139 	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1140 
1141 	/*
1142 	 * Invert the matrix.
1143 	 */
1144 	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1145 	    invrows, used);
1146 
1147 	/*
1148 	 * Reconstruct the missing data using the generated matrix.
1149 	 */
1150 	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1151 	    invrows, used);
1152 
1153 	free(p);
1154 
1155 	return (code);
1156 }
1157 
1158 static int
1159 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1160 {
1161 	int tgts[VDEV_RAIDZ_MAXPARITY];
1162 	int ntgts;
1163 	int i, c;
1164 	int code;
1165 	int nbadparity, nbaddata;
1166 
1167 	/*
1168 	 * The tgts list must already be sorted.
1169 	 */
1170 	for (i = 1; i < nt; i++) {
1171 		ASSERT(t[i] > t[i - 1]);
1172 	}
1173 
1174 	nbadparity = rm->rm_firstdatacol;
1175 	nbaddata = rm->rm_cols - nbadparity;
1176 	ntgts = 0;
1177 	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1178 		if (i < nt && c == t[i]) {
1179 			tgts[ntgts++] = c;
1180 			i++;
1181 		} else if (rm->rm_col[c].rc_error != 0) {
1182 			tgts[ntgts++] = c;
1183 		} else if (c >= rm->rm_firstdatacol) {
1184 			nbaddata--;
1185 		} else {
1186 			nbadparity--;
1187 		}
1188 	}
1189 
1190 	ASSERT(ntgts >= nt);
1191 	ASSERT(nbaddata >= 0);
1192 	ASSERT(nbaddata + nbadparity == ntgts);
1193 
1194 	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1195 	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1196 	ASSERT(code > 0);
1197 	return (code);
1198 }
1199 
1200 static raidz_map_t *
1201 vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
1202     uint64_t dcols, uint64_t nparity)
1203 {
1204 	raidz_map_t *rm;
1205 	uint64_t b = offset >> unit_shift;
1206 	uint64_t s = size >> unit_shift;
1207 	uint64_t f = b % dcols;
1208 	uint64_t o = (b / dcols) << unit_shift;
1209 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
1210 
1211 	q = s / (dcols - nparity);
1212 	r = s - q * (dcols - nparity);
1213 	bc = (r == 0 ? 0 : r + nparity);
1214 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
1215 
1216 	if (q == 0) {
1217 		acols = bc;
1218 		scols = MIN(dcols, roundup(bc, nparity + 1));
1219 	} else {
1220 		acols = dcols;
1221 		scols = dcols;
1222 	}
1223 
1224 	ASSERT3U(acols, <=, scols);
1225 
1226 	rm = malloc(offsetof(raidz_map_t, rm_col[scols]));
1227 	if (rm == NULL)
1228 		return (rm);
1229 
1230 	rm->rm_cols = acols;
1231 	rm->rm_scols = scols;
1232 	rm->rm_bigcols = bc;
1233 	rm->rm_skipstart = bc;
1234 	rm->rm_missingdata = 0;
1235 	rm->rm_missingparity = 0;
1236 	rm->rm_firstdatacol = nparity;
1237 	rm->rm_reports = 0;
1238 	rm->rm_freed = 0;
1239 	rm->rm_ecksuminjected = 0;
1240 
1241 	asize = 0;
1242 
1243 	for (c = 0; c < scols; c++) {
1244 		col = f + c;
1245 		coff = o;
1246 		if (col >= dcols) {
1247 			col -= dcols;
1248 			coff += 1ULL << unit_shift;
1249 		}
1250 		rm->rm_col[c].rc_devidx = col;
1251 		rm->rm_col[c].rc_offset = coff;
1252 		rm->rm_col[c].rc_data = NULL;
1253 		rm->rm_col[c].rc_error = 0;
1254 		rm->rm_col[c].rc_tried = 0;
1255 		rm->rm_col[c].rc_skipped = 0;
1256 
1257 		if (c >= acols)
1258 			rm->rm_col[c].rc_size = 0;
1259 		else if (c < bc)
1260 			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
1261 		else
1262 			rm->rm_col[c].rc_size = q << unit_shift;
1263 
1264 		asize += rm->rm_col[c].rc_size;
1265 	}
1266 
1267 	ASSERT3U(asize, ==, tot << unit_shift);
1268 	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
1269 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
1270 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
1271 	ASSERT3U(rm->rm_nskip, <=, nparity);
1272 
1273 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1274 		rm->rm_col[c].rc_data = malloc(rm->rm_col[c].rc_size);
1275 		if (rm->rm_col[c].rc_data == NULL) {
1276 			c++;
1277 			while (c != 0)
1278 				free(rm->rm_col[--c].rc_data);
1279 			free(rm);
1280 			return (NULL);
1281 		}
1282 	}
1283 
1284 	rm->rm_col[c].rc_data = data;
1285 
1286 	for (c = c + 1; c < acols; c++)
1287 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
1288 		    rm->rm_col[c - 1].rc_size;
1289 
1290 	/*
1291 	 * If all data stored spans all columns, there's a danger that parity
1292 	 * will always be on the same device and, since parity isn't read
1293 	 * during normal operation, that that device's I/O bandwidth won't be
1294 	 * used effectively. We therefore switch the parity every 1MB.
1295 	 *
1296 	 * ... at least that was, ostensibly, the theory. As a practical
1297 	 * matter unless we juggle the parity between all devices evenly, we
1298 	 * won't see any benefit. Further, occasional writes that aren't a
1299 	 * multiple of the LCM of the number of children and the minimum
1300 	 * stripe width are sufficient to avoid pessimal behavior.
1301 	 * Unfortunately, this decision created an implicit on-disk format
1302 	 * requirement that we need to support for all eternity, but only
1303 	 * for single-parity RAID-Z.
1304 	 *
1305 	 * If we intend to skip a sector in the zeroth column for padding
1306 	 * we must make sure to note this swap. We will never intend to
1307 	 * skip the first column since at least one data and one parity
1308 	 * column must appear in each row.
1309 	 */
1310 	ASSERT(rm->rm_cols >= 2);
1311 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
1312 
1313 	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
1314 		devidx = rm->rm_col[0].rc_devidx;
1315 		o = rm->rm_col[0].rc_offset;
1316 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
1317 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
1318 		rm->rm_col[1].rc_devidx = devidx;
1319 		rm->rm_col[1].rc_offset = o;
1320 
1321 		if (rm->rm_skipstart == 0)
1322 			rm->rm_skipstart = 1;
1323 	}
1324 
1325 	return (rm);
1326 }
1327 
1328 static void
1329 vdev_raidz_map_free(raidz_map_t *rm)
1330 {
1331 	int c;
1332 
1333 	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
1334 		free(rm->rm_col[c].rc_data);
1335 
1336 	free(rm);
1337 }
1338 
1339 static vdev_t *
1340 vdev_child(vdev_t *pvd, uint64_t devidx)
1341 {
1342 	vdev_t *cvd;
1343 
1344 	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
1345 		if (cvd->v_id == devidx)
1346 			break;
1347 	}
1348 
1349 	return (cvd);
1350 }
1351 
1352 /*
1353  * We keep track of whether or not there were any injected errors, so that
1354  * any ereports we generate can note it.
1355  */
1356 static int
1357 raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
1358     uint64_t size)
1359 {
1360 	return (zio_checksum_verify(spa, bp, data));
1361 }
1362 
1363 /*
1364  * Generate the parity from the data columns. If we tried and were able to
1365  * read the parity without error, verify that the generated parity matches the
1366  * data we read. If it doesn't, we fire off a checksum error. Return the
1367  * number such failures.
1368  */
1369 static int
1370 raidz_parity_verify(raidz_map_t *rm)
1371 {
1372 	void *orig[VDEV_RAIDZ_MAXPARITY];
1373 	int c, ret = 0;
1374 	raidz_col_t *rc;
1375 
1376 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1377 		rc = &rm->rm_col[c];
1378 		if (!rc->rc_tried || rc->rc_error != 0)
1379 			continue;
1380 		orig[c] = malloc(rc->rc_size);
1381 		if (orig[c] != NULL) {
1382 			bcopy(rc->rc_data, orig[c], rc->rc_size);
1383 		} else {
1384 			printf("Out of memory\n");
1385 		}
1386 	}
1387 
1388 	vdev_raidz_generate_parity(rm);
1389 
1390 	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
1391 		rc = &rm->rm_col[c];
1392 		if (!rc->rc_tried || rc->rc_error != 0)
1393 			continue;
1394 		if (orig[c] == NULL ||
1395 		    bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1396 			rc->rc_error = ECKSUM;
1397 			ret++;
1398 		}
1399 		free(orig[c]);
1400 	}
1401 
1402 	return (ret);
1403 }
1404 
1405 /*
1406  * Iterate over all combinations of bad data and attempt a reconstruction.
1407  * Note that the algorithm below is non-optimal because it doesn't take into
1408  * account how reconstruction is actually performed. For example, with
1409  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1410  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1411  * cases we'd only use parity information in column 0.
1412  */
1413 static int
1414 vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
1415     void *data, off_t offset, uint64_t bytes, int total_errors, int data_errors)
1416 {
1417 	raidz_col_t *rc;
1418 	void *orig[VDEV_RAIDZ_MAXPARITY];
1419 	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1420 	int *tgts = &tstore[1];
1421 	int current, next, i, c, n;
1422 	int code, ret = 0;
1423 
1424 	ASSERT(total_errors < rm->rm_firstdatacol);
1425 
1426 	/*
1427 	 * This simplifies one edge condition.
1428 	 */
1429 	tgts[-1] = -1;
1430 
1431 	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1432 		/*
1433 		 * Initialize the targets array by finding the first n columns
1434 		 * that contain no error.
1435 		 *
1436 		 * If there were no data errors, we need to ensure that we're
1437 		 * always explicitly attempting to reconstruct at least one
1438 		 * data column. To do this, we simply push the highest target
1439 		 * up into the data columns.
1440 		 */
1441 		for (c = 0, i = 0; i < n; i++) {
1442 			if (i == n - 1 && data_errors == 0 &&
1443 			    c < rm->rm_firstdatacol) {
1444 				c = rm->rm_firstdatacol;
1445 			}
1446 
1447 			while (rm->rm_col[c].rc_error != 0) {
1448 				c++;
1449 				ASSERT3S(c, <, rm->rm_cols);
1450 			}
1451 
1452 			tgts[i] = c++;
1453 		}
1454 
1455 		/*
1456 		 * Setting tgts[n] simplifies the other edge condition.
1457 		 */
1458 		tgts[n] = rm->rm_cols;
1459 
1460 		/*
1461 		 * These buffers were allocated in previous iterations.
1462 		 */
1463 		for (i = 0; i < n - 1; i++) {
1464 			ASSERT(orig[i] != NULL);
1465 		}
1466 
1467 		orig[n - 1] = malloc(rm->rm_col[0].rc_size);
1468 		if (orig[n - 1] == NULL) {
1469 			ret = ENOMEM;
1470 			goto done;
1471 		}
1472 
1473 		current = 0;
1474 		next = tgts[current];
1475 
1476 		while (current != n) {
1477 			tgts[current] = next;
1478 			current = 0;
1479 
1480 			/*
1481 			 * Save off the original data that we're going to
1482 			 * attempt to reconstruct.
1483 			 */
1484 			for (i = 0; i < n; i++) {
1485 				ASSERT(orig[i] != NULL);
1486 				c = tgts[i];
1487 				ASSERT3S(c, >=, 0);
1488 				ASSERT3S(c, <, rm->rm_cols);
1489 				rc = &rm->rm_col[c];
1490 				bcopy(rc->rc_data, orig[i], rc->rc_size);
1491 			}
1492 
1493 			/*
1494 			 * Attempt a reconstruction and exit the outer loop on
1495 			 * success.
1496 			 */
1497 			code = vdev_raidz_reconstruct(rm, tgts, n);
1498 			if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
1499 				for (i = 0; i < n; i++) {
1500 					c = tgts[i];
1501 					rc = &rm->rm_col[c];
1502 					ASSERT(rc->rc_error == 0);
1503 					rc->rc_error = ECKSUM;
1504 				}
1505 
1506 				ret = code;
1507 				goto done;
1508 			}
1509 
1510 			/*
1511 			 * Restore the original data.
1512 			 */
1513 			for (i = 0; i < n; i++) {
1514 				c = tgts[i];
1515 				rc = &rm->rm_col[c];
1516 				bcopy(orig[i], rc->rc_data, rc->rc_size);
1517 			}
1518 
1519 			do {
1520 				/*
1521 				 * Find the next valid column after the current
1522 				 * position..
1523 				 */
1524 				for (next = tgts[current] + 1;
1525 				    next < rm->rm_cols &&
1526 				    rm->rm_col[next].rc_error != 0; next++)
1527 					continue;
1528 
1529 				ASSERT(next <= tgts[current + 1]);
1530 
1531 				/*
1532 				 * If that spot is available, we're done here.
1533 				 */
1534 				if (next != tgts[current + 1])
1535 					break;
1536 
1537 				/*
1538 				 * Otherwise, find the next valid column after
1539 				 * the previous position.
1540 				 */
1541 				for (c = tgts[current - 1] + 1;
1542 				    rm->rm_col[c].rc_error != 0; c++)
1543 					continue;
1544 
1545 				tgts[current] = c;
1546 				current++;
1547 
1548 			} while (current != n);
1549 		}
1550 	}
1551 	n--;
1552 done:
1553 	for (i = n - 1; i >= 0; i--) {
1554 		free(orig[i]);
1555 	}
1556 
1557 	return (ret);
1558 }
1559 
1560 static int
1561 vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
1562     off_t offset, size_t bytes)
1563 {
1564 	vdev_t *tvd = vd->v_top;
1565 	vdev_t *cvd;
1566 	raidz_map_t *rm;
1567 	raidz_col_t *rc;
1568 	int c, error;
1569 	int unexpected_errors;
1570 	int parity_errors;
1571 	int parity_untried;
1572 	int data_errors;
1573 	int total_errors;
1574 	int n;
1575 	int tgts[VDEV_RAIDZ_MAXPARITY];
1576 	int code;
1577 
1578 	rc = NULL;	/* gcc */
1579 	error = 0;
1580 
1581 	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
1582 	    vd->v_nchildren, vd->v_nparity);
1583 	if (rm == NULL)
1584 		return (ENOMEM);
1585 
1586 	/*
1587 	 * Iterate over the columns in reverse order so that we hit the parity
1588 	 * last -- any errors along the way will force us to read the parity.
1589 	 */
1590 	for (c = rm->rm_cols - 1; c >= 0; c--) {
1591 		rc = &rm->rm_col[c];
1592 		cvd = vdev_child(vd, rc->rc_devidx);
1593 		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
1594 			if (c >= rm->rm_firstdatacol)
1595 				rm->rm_missingdata++;
1596 			else
1597 				rm->rm_missingparity++;
1598 			rc->rc_error = ENXIO;
1599 			rc->rc_tried = 1;	/* don't even try */
1600 			rc->rc_skipped = 1;
1601 			continue;
1602 		}
1603 #if 0		/* XXX: Too hard for the boot code. */
1604 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1605 			if (c >= rm->rm_firstdatacol)
1606 				rm->rm_missingdata++;
1607 			else
1608 				rm->rm_missingparity++;
1609 			rc->rc_error = ESTALE;
1610 			rc->rc_skipped = 1;
1611 			continue;
1612 		}
1613 #endif
1614 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
1615 			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
1616 			    rc->rc_offset, rc->rc_size);
1617 			rc->rc_tried = 1;
1618 			rc->rc_skipped = 0;
1619 		}
1620 	}
1621 
1622 reconstruct:
1623 	unexpected_errors = 0;
1624 	parity_errors = 0;
1625 	parity_untried = 0;
1626 	data_errors = 0;
1627 	total_errors = 0;
1628 
1629 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1630 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1631 
1632 	for (c = 0; c < rm->rm_cols; c++) {
1633 		rc = &rm->rm_col[c];
1634 
1635 		if (rc->rc_error) {
1636 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
1637 
1638 			if (c < rm->rm_firstdatacol)
1639 				parity_errors++;
1640 			else
1641 				data_errors++;
1642 
1643 			if (!rc->rc_skipped)
1644 				unexpected_errors++;
1645 
1646 			total_errors++;
1647 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1648 			parity_untried++;
1649 		}
1650 	}
1651 
1652 	/*
1653 	 * There are three potential phases for a read:
1654 	 *	1. produce valid data from the columns read
1655 	 *	2. read all disks and try again
1656 	 *	3. perform combinatorial reconstruction
1657 	 *
1658 	 * Each phase is progressively both more expensive and less likely to
1659 	 * occur. If we encounter more errors than we can repair or all phases
1660 	 * fail, we have no choice but to return an error.
1661 	 */
1662 
1663 	/*
1664 	 * If the number of errors we saw was correctable -- less than or equal
1665 	 * to the number of parity disks read -- attempt to produce data that
1666 	 * has a valid checksum. Naturally, this case applies in the absence of
1667 	 * any errors.
1668 	 */
1669 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1670 		int rv;
1671 
1672 		if (data_errors == 0) {
1673 			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
1674 			if (rv == 0) {
1675 				/*
1676 				 * If we read parity information (unnecessarily
1677 				 * as it happens since no reconstruction was
1678 				 * needed) regenerate and verify the parity.
1679 				 * We also regenerate parity when resilvering
1680 				 * so we can write it out to the failed device
1681 				 * later.
1682 				 */
1683 				if (parity_errors + parity_untried <
1684 				    rm->rm_firstdatacol) {
1685 					n = raidz_parity_verify(rm);
1686 					unexpected_errors += n;
1687 					ASSERT(parity_errors + n <=
1688 					    rm->rm_firstdatacol);
1689 				}
1690 				goto done;
1691 			}
1692 		} else {
1693 			/*
1694 			 * We either attempt to read all the parity columns or
1695 			 * none of them. If we didn't try to read parity, we
1696 			 * wouldn't be here in the correctable case. There must
1697 			 * also have been fewer parity errors than parity
1698 			 * columns or, again, we wouldn't be in this code path.
1699 			 */
1700 			ASSERT(parity_untried == 0);
1701 			ASSERT(parity_errors < rm->rm_firstdatacol);
1702 
1703 			/*
1704 			 * Identify the data columns that reported an error.
1705 			 */
1706 			n = 0;
1707 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1708 				rc = &rm->rm_col[c];
1709 				if (rc->rc_error != 0) {
1710 					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1711 					tgts[n++] = c;
1712 				}
1713 			}
1714 
1715 			ASSERT(rm->rm_firstdatacol >= n);
1716 
1717 			code = vdev_raidz_reconstruct(rm, tgts, n);
1718 
1719 			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
1720 			if (rv == 0) {
1721 				/*
1722 				 * If we read more parity disks than were used
1723 				 * for reconstruction, confirm that the other
1724 				 * parity disks produced correct data. This
1725 				 * routine is suboptimal in that it regenerates
1726 				 * the parity that we already used in addition
1727 				 * to the parity that we're attempting to
1728 				 * verify, but this should be a relatively
1729 				 * uncommon case, and can be optimized if it
1730 				 * becomes a problem. Note that we regenerate
1731 				 * parity when resilvering so we can write it
1732 				 * out to failed devices later.
1733 				 */
1734 				if (parity_errors < rm->rm_firstdatacol - n) {
1735 					n = raidz_parity_verify(rm);
1736 					unexpected_errors += n;
1737 					ASSERT(parity_errors + n <=
1738 					    rm->rm_firstdatacol);
1739 				}
1740 
1741 				goto done;
1742 			}
1743 		}
1744 	}
1745 
1746 	/*
1747 	 * This isn't a typical situation -- either we got a read
1748 	 * error or a child silently returned bad data. Read every
1749 	 * block so we can try again with as much data and parity as
1750 	 * we can track down. If we've already been through once
1751 	 * before, all children will be marked as tried so we'll
1752 	 * proceed to combinatorial reconstruction.
1753 	 */
1754 	unexpected_errors = 1;
1755 	rm->rm_missingdata = 0;
1756 	rm->rm_missingparity = 0;
1757 
1758 	n = 0;
1759 	for (c = 0; c < rm->rm_cols; c++) {
1760 		rc = &rm->rm_col[c];
1761 
1762 		if (rc->rc_tried)
1763 			continue;
1764 
1765 		cvd = vdev_child(vd, rc->rc_devidx);
1766 		ASSERT(cvd != NULL);
1767 		rc->rc_error = cvd->v_read(cvd, NULL,
1768 		    rc->rc_data, rc->rc_offset, rc->rc_size);
1769 		if (rc->rc_error == 0)
1770 			n++;
1771 		rc->rc_tried = 1;
1772 		rc->rc_skipped = 0;
1773 	}
1774 	/*
1775 	 * If we managed to read anything more, retry the
1776 	 * reconstruction.
1777 	 */
1778 	if (n > 0)
1779 		goto reconstruct;
1780 
1781 	/*
1782 	 * At this point we've attempted to reconstruct the data given the
1783 	 * errors we detected, and we've attempted to read all columns. There
1784 	 * must, therefore, be one or more additional problems -- silent errors
1785 	 * resulting in invalid data rather than explicit I/O errors resulting
1786 	 * in absent data. We check if there is enough additional data to
1787 	 * possibly reconstruct the data and then perform combinatorial
1788 	 * reconstruction over all possible combinations. If that fails,
1789 	 * we're cooked.
1790 	 */
1791 	if (total_errors > rm->rm_firstdatacol) {
1792 		error = EIO;
1793 	} else if (total_errors < rm->rm_firstdatacol &&
1794 	    (code = vdev_raidz_combrec(vd->v_spa, rm, bp, data, offset, bytes,
1795 	     total_errors, data_errors)) != 0) {
1796 		/*
1797 		 * If we didn't use all the available parity for the
1798 		 * combinatorial reconstruction, verify that the remaining
1799 		 * parity is correct.
1800 		 */
1801 		if (code != (1 << rm->rm_firstdatacol) - 1)
1802 			(void) raidz_parity_verify(rm);
1803 	} else {
1804 		/*
1805 		 * We're here because either:
1806 		 *
1807 		 *	total_errors == rm_first_datacol, or
1808 		 *	vdev_raidz_combrec() failed
1809 		 *
1810 		 * In either case, there is enough bad data to prevent
1811 		 * reconstruction.
1812 		 *
1813 		 * Start checksum ereports for all children which haven't
1814 		 * failed, and the IO wasn't speculative.
1815 		 */
1816 		error = ECKSUM;
1817 	}
1818 
1819 done:
1820 	vdev_raidz_map_free(rm);
1821 
1822 	return (error);
1823 }
1824