xref: /freebsd/sys/cddl/boot/zfs/zfssubr.c (revision 5bb3134a8c21cb87b30e135ef168483f0333dabb)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/cdefs.h>
27 __FBSDID("$FreeBSD$");
28 
29 #include <lz4.h>
30 
31 static uint64_t zfs_crc64_table[256];
32 
33 #define	ASSERT3S(x, y, z)	((void)0)
34 #define	ASSERT3U(x, y, z)	((void)0)
35 #define	ASSERT3P(x, y, z)	((void)0)
36 #define	ASSERT0(x)		((void)0)
37 #define	ASSERT(x)		((void)0)
38 
39 #define	panic(...)	do {						\
40 	printf(__VA_ARGS__);						\
41 	for (;;) ;							\
42 } while (0)
43 
44 static void
45 zfs_init_crc(void)
46 {
47 	int i, j;
48 	uint64_t *ct;
49 
50 	/*
51 	 * Calculate the crc64 table (used for the zap hash
52 	 * function).
53 	 */
54 	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
55 		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
56 		for (i = 0; i < 256; i++)
57 			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
58 				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
59 	}
60 }
61 
62 static void
63 zio_checksum_off(const void *buf, uint64_t size,
64     const void *ctx_template, zio_cksum_t *zcp)
65 {
66 	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
67 }
68 
69 /*
70  * Signature for checksum functions.
71  */
72 typedef void zio_checksum_t(const void *data, uint64_t size,
73     const void *ctx_template, zio_cksum_t *zcp);
74 typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
75 typedef void zio_checksum_tmpl_free_t(void *ctx_template);
76 
77 typedef enum zio_checksum_flags {
78 	/* Strong enough for metadata? */
79 	ZCHECKSUM_FLAG_METADATA = (1 << 1),
80 	/* ZIO embedded checksum */
81 	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
82 	/* Strong enough for dedup (without verification)? */
83 	ZCHECKSUM_FLAG_DEDUP = (1 << 3),
84 	/* Uses salt value */
85 	ZCHECKSUM_FLAG_SALTED = (1 << 4),
86 	/* Strong enough for nopwrite? */
87 	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
88 } zio_checksum_flags_t;
89 
90 /*
91  * Information about each checksum function.
92  */
93 typedef struct zio_checksum_info {
94 	/* checksum function for each byteorder */
95 	zio_checksum_t			*ci_func[2];
96 	zio_checksum_tmpl_init_t	*ci_tmpl_init;
97 	zio_checksum_tmpl_free_t	*ci_tmpl_free;
98 	zio_checksum_flags_t		ci_flags;
99 	const char			*ci_name;	/* descriptive name */
100 } zio_checksum_info_t;
101 
102 #include "blkptr.c"
103 
104 #include "fletcher.c"
105 #include "sha256.c"
106 #include "skein_zfs.c"
107 
108 #ifdef HAS_ZSTD_ZFS
109 extern int zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len,
110     size_t d_len, int n);
111 #endif
112 
113 static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
114 	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
115 	{{NULL, NULL}, NULL, NULL, 0, "on"},
116 	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL, 0, "off"},
117 	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
118 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
119 	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
120 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
121 	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
122 	    ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
123 	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
124 	    0, "fletcher2"},
125 	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
126 	    ZCHECKSUM_FLAG_METADATA, "fletcher4"},
127 	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
128 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
129 	    ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
130 	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
131 	    ZCHECKSUM_FLAG_EMBEDDED, "zillog2"},
132 	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL,
133 	    0, "noparity"},
134 	{{zio_checksum_SHA512_native,	zio_checksum_SHA512_byteswap},
135 	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
136 	    ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
137 	{{zio_checksum_skein_native, zio_checksum_skein_byteswap},
138 	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
139 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
140 	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
141 	/* no edonr for now */
142 	{{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA |
143 	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"}
144 };
145 
146 /*
147  * Common signature for all zio compress/decompress functions.
148  */
149 typedef size_t zio_compress_func_t(void *src, void *dst,
150     size_t s_len, size_t d_len, int);
151 typedef int zio_decompress_func_t(void *src, void *dst,
152     size_t s_len, size_t d_len, int);
153 
154 /*
155  * Information about each compression function.
156  */
157 typedef struct zio_compress_info {
158 	zio_compress_func_t	*ci_compress;	/* compression function */
159 	zio_decompress_func_t	*ci_decompress;	/* decompression function */
160 	int			ci_level;	/* level parameter */
161 	const char		*ci_name;	/* algorithm name */
162 } zio_compress_info_t;
163 
164 #include "lzjb.c"
165 #include "zle.c"
166 
167 /*
168  * Compression vectors.
169  */
170 static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
171 	{NULL,			NULL,			0,	"inherit"},
172 	{NULL,			NULL,			0,	"on"},
173 	{NULL,			NULL,			0,	"uncompressed"},
174 	{NULL,			lzjb_decompress,	0,	"lzjb"},
175 	{NULL,			NULL,			0,	"empty"},
176 	{NULL,			NULL,			1,	"gzip-1"},
177 	{NULL,			NULL,			2,	"gzip-2"},
178 	{NULL,			NULL,			3,	"gzip-3"},
179 	{NULL,			NULL,			4,	"gzip-4"},
180 	{NULL,			NULL,			5,	"gzip-5"},
181 	{NULL,			NULL,			6,	"gzip-6"},
182 	{NULL,			NULL,			7,	"gzip-7"},
183 	{NULL,			NULL,			8,	"gzip-8"},
184 	{NULL,			NULL,			9,	"gzip-9"},
185 	{NULL,			zle_decompress,		64,	"zle"},
186 	{NULL,			lz4_decompress,		0,	"lz4"},
187 #ifdef HAS_ZSTD_ZFS
188 	{NULL,			zfs_zstd_decompress, ZIO_ZSTD_LEVEL_DEFAULT, "zstd"}
189 #endif
190 };
191 
192 static void
193 byteswap_uint64_array(void *vbuf, size_t size)
194 {
195 	uint64_t *buf = vbuf;
196 	size_t count = size >> 3;
197 	int i;
198 
199 	ASSERT((size & 7) == 0);
200 
201 	for (i = 0; i < count; i++)
202 		buf[i] = BSWAP_64(buf[i]);
203 }
204 
205 /*
206  * Set the external verifier for a gang block based on <vdev, offset, txg>,
207  * a tuple which is guaranteed to be unique for the life of the pool.
208  */
209 static void
210 zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
211 {
212 	const dva_t *dva = BP_IDENTITY(bp);
213 	uint64_t txg = BP_PHYSICAL_BIRTH(bp);
214 
215 	ASSERT(BP_IS_GANG(bp));
216 
217 	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
218 }
219 
220 /*
221  * Set the external verifier for a label block based on its offset.
222  * The vdev is implicit, and the txg is unknowable at pool open time --
223  * hence the logic in vdev_uberblock_load() to find the most recent copy.
224  */
225 static void
226 zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
227 {
228 	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
229 }
230 
231 /*
232  * Calls the template init function of a checksum which supports context
233  * templates and installs the template into the spa_t.
234  */
235 static void
236 zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
237 {
238 	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
239 
240 	if (ci->ci_tmpl_init == NULL)
241 		return;
242 
243 	if (spa->spa_cksum_tmpls[checksum] != NULL)
244 		return;
245 
246 	if (spa->spa_cksum_tmpls[checksum] == NULL) {
247 		spa->spa_cksum_tmpls[checksum] =
248 		    ci->ci_tmpl_init(&spa->spa_cksum_salt);
249 	}
250 }
251 
252 /*
253  * Called by a spa_t that's about to be deallocated. This steps through
254  * all of the checksum context templates and deallocates any that were
255  * initialized using the algorithm-specific template init function.
256  */
257 static void __unused
258 zio_checksum_templates_free(spa_t *spa)
259 {
260 	for (enum zio_checksum checksum = 0;
261 	    checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
262 		if (spa->spa_cksum_tmpls[checksum] != NULL) {
263 			zio_checksum_info_t *ci = &zio_checksum_table[checksum];
264 
265 			ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
266 			spa->spa_cksum_tmpls[checksum] = NULL;
267 		}
268 	}
269 }
270 
271 static int
272 zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
273 {
274 	uint64_t size;
275 	unsigned int checksum;
276 	zio_checksum_info_t *ci;
277 	void *ctx = NULL;
278 	zio_cksum_t actual_cksum, expected_cksum, verifier;
279 	int byteswap;
280 
281 	checksum = BP_GET_CHECKSUM(bp);
282 	size = BP_GET_PSIZE(bp);
283 
284 	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
285 		return (EINVAL);
286 	ci = &zio_checksum_table[checksum];
287 	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
288 		return (EINVAL);
289 
290 	if (spa != NULL) {
291 		zio_checksum_template_init(checksum, __DECONST(spa_t *,spa));
292 		ctx = spa->spa_cksum_tmpls[checksum];
293 	}
294 
295 	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
296 		zio_eck_t *eck;
297 
298 		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
299 		    checksum == ZIO_CHECKSUM_LABEL);
300 
301 		eck = (zio_eck_t *)((char *)data + size) - 1;
302 
303 		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
304 			zio_checksum_gang_verifier(&verifier, bp);
305 		else if (checksum == ZIO_CHECKSUM_LABEL)
306 			zio_checksum_label_verifier(&verifier,
307 			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
308 		else
309 			verifier = bp->blk_cksum;
310 
311 		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
312 
313 		if (byteswap)
314 			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
315 
316 		expected_cksum = eck->zec_cksum;
317 		eck->zec_cksum = verifier;
318 		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
319 		eck->zec_cksum = expected_cksum;
320 
321 		if (byteswap)
322 			byteswap_uint64_array(&expected_cksum,
323 			    sizeof (zio_cksum_t));
324 	} else {
325 		byteswap = BP_SHOULD_BYTESWAP(bp);
326 		expected_cksum = bp->blk_cksum;
327 		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
328 	}
329 
330 	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
331 		/*printf("ZFS: read checksum %s failed\n", ci->ci_name);*/
332 		return (EIO);
333 	}
334 
335 	return (0);
336 }
337 
338 static int
339 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
340 	void *dest, uint64_t destsize)
341 {
342 	zio_compress_info_t *ci;
343 
344 	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
345 		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
346 		return (EIO);
347 	}
348 
349 	ci = &zio_compress_table[cpfunc];
350 	if (!ci->ci_decompress) {
351 		printf("ZFS: unsupported compression algorithm %s\n",
352 		    ci->ci_name);
353 		return (EIO);
354 	}
355 
356 	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
357 }
358 
359 static uint64_t
360 zap_hash(uint64_t salt, const char *name)
361 {
362 	const uint8_t *cp;
363 	uint8_t c;
364 	uint64_t crc = salt;
365 
366 	ASSERT(crc != 0);
367 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
368 	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
369 		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
370 
371 	/*
372 	 * Only use 28 bits, since we need 4 bits in the cookie for the
373 	 * collision differentiator.  We MUST use the high bits, since
374 	 * those are the onces that we first pay attention to when
375 	 * chosing the bucket.
376 	 */
377 	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
378 
379 	return (crc);
380 }
381 
382 typedef struct raidz_col {
383 	uint64_t rc_devidx;		/* child device index for I/O */
384 	uint64_t rc_offset;		/* device offset */
385 	uint64_t rc_size;		/* I/O size */
386 	void *rc_data;			/* I/O data */
387 	int rc_error;			/* I/O error for this device */
388 	uint8_t rc_tried;		/* Did we attempt this I/O column? */
389 	uint8_t rc_skipped;		/* Did we skip this I/O column? */
390 } raidz_col_t;
391 
392 typedef struct raidz_map {
393 	uint64_t rm_cols;		/* Regular column count */
394 	uint64_t rm_scols;		/* Count including skipped columns */
395 	uint64_t rm_bigcols;		/* Number of oversized columns */
396 	uint64_t rm_asize;		/* Actual total I/O size */
397 	uint64_t rm_missingdata;	/* Count of missing data devices */
398 	uint64_t rm_missingparity;	/* Count of missing parity devices */
399 	uint64_t rm_firstdatacol;	/* First data column/parity count */
400 	uint64_t rm_nskip;		/* Skipped sectors for padding */
401 	uint64_t rm_skipstart;		/* Column index of padding start */
402 	uintptr_t rm_reports;		/* # of referencing checksum reports */
403 	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
404 	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
405 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
406 } raidz_map_t;
407 
408 #define	VDEV_RAIDZ_P		0
409 #define	VDEV_RAIDZ_Q		1
410 #define	VDEV_RAIDZ_R		2
411 
412 #define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
413 #define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
414 
415 /*
416  * We provide a mechanism to perform the field multiplication operation on a
417  * 64-bit value all at once rather than a byte at a time. This works by
418  * creating a mask from the top bit in each byte and using that to
419  * conditionally apply the XOR of 0x1d.
420  */
421 #define	VDEV_RAIDZ_64MUL_2(x, mask) \
422 { \
423 	(mask) = (x) & 0x8080808080808080ULL; \
424 	(mask) = ((mask) << 1) - ((mask) >> 7); \
425 	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
426 	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
427 }
428 
429 #define	VDEV_RAIDZ_64MUL_4(x, mask) \
430 { \
431 	VDEV_RAIDZ_64MUL_2((x), mask); \
432 	VDEV_RAIDZ_64MUL_2((x), mask); \
433 }
434 
435 /*
436  * These two tables represent powers and logs of 2 in the Galois field defined
437  * above. These values were computed by repeatedly multiplying by 2 as above.
438  */
439 static const uint8_t vdev_raidz_pow2[256] = {
440 	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
441 	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
442 	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
443 	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
444 	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
445 	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
446 	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
447 	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
448 	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
449 	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
450 	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
451 	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
452 	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
453 	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
454 	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
455 	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
456 	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
457 	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
458 	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
459 	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
460 	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
461 	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
462 	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
463 	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
464 	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
465 	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
466 	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
467 	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
468 	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
469 	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
470 	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
471 	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
472 };
473 static const uint8_t vdev_raidz_log2[256] = {
474 	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
475 	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
476 	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
477 	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
478 	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
479 	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
480 	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
481 	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
482 	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
483 	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
484 	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
485 	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
486 	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
487 	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
488 	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
489 	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
490 	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
491 	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
492 	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
493 	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
494 	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
495 	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
496 	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
497 	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
498 	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
499 	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
500 	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
501 	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
502 	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
503 	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
504 	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
505 	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
506 };
507 
508 /*
509  * Multiply a given number by 2 raised to the given power.
510  */
511 static uint8_t
512 vdev_raidz_exp2(uint8_t a, int exp)
513 {
514 	if (a == 0)
515 		return (0);
516 
517 	ASSERT(exp >= 0);
518 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
519 
520 	exp += vdev_raidz_log2[a];
521 	if (exp > 255)
522 		exp -= 255;
523 
524 	return (vdev_raidz_pow2[exp]);
525 }
526 
527 static void
528 vdev_raidz_generate_parity_p(raidz_map_t *rm)
529 {
530 	uint64_t *p, *src, pcount, ccount, i;
531 	int c;
532 
533 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
534 
535 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
536 		src = rm->rm_col[c].rc_data;
537 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
538 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
539 
540 		if (c == rm->rm_firstdatacol) {
541 			ASSERT(ccount == pcount);
542 			for (i = 0; i < ccount; i++, src++, p++) {
543 				*p = *src;
544 			}
545 		} else {
546 			ASSERT(ccount <= pcount);
547 			for (i = 0; i < ccount; i++, src++, p++) {
548 				*p ^= *src;
549 			}
550 		}
551 	}
552 }
553 
554 static void
555 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
556 {
557 	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
558 	int c;
559 
560 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
561 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
562 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
563 
564 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
565 		src = rm->rm_col[c].rc_data;
566 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
567 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
568 
569 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
570 
571 		if (c == rm->rm_firstdatacol) {
572 			ASSERT(ccnt == pcnt || ccnt == 0);
573 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
574 				*p = *src;
575 				*q = *src;
576 			}
577 			for (; i < pcnt; i++, src++, p++, q++) {
578 				*p = 0;
579 				*q = 0;
580 			}
581 		} else {
582 			ASSERT(ccnt <= pcnt);
583 
584 			/*
585 			 * Apply the algorithm described above by multiplying
586 			 * the previous result and adding in the new value.
587 			 */
588 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
589 				*p ^= *src;
590 
591 				VDEV_RAIDZ_64MUL_2(*q, mask);
592 				*q ^= *src;
593 			}
594 
595 			/*
596 			 * Treat short columns as though they are full of 0s.
597 			 * Note that there's therefore nothing needed for P.
598 			 */
599 			for (; i < pcnt; i++, q++) {
600 				VDEV_RAIDZ_64MUL_2(*q, mask);
601 			}
602 		}
603 	}
604 }
605 
606 static void
607 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
608 {
609 	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
610 	int c;
611 
612 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
613 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
614 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
615 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
616 	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
617 
618 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
619 		src = rm->rm_col[c].rc_data;
620 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
621 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
622 		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
623 
624 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
625 
626 		if (c == rm->rm_firstdatacol) {
627 			ASSERT(ccnt == pcnt || ccnt == 0);
628 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
629 				*p = *src;
630 				*q = *src;
631 				*r = *src;
632 			}
633 			for (; i < pcnt; i++, src++, p++, q++, r++) {
634 				*p = 0;
635 				*q = 0;
636 				*r = 0;
637 			}
638 		} else {
639 			ASSERT(ccnt <= pcnt);
640 
641 			/*
642 			 * Apply the algorithm described above by multiplying
643 			 * the previous result and adding in the new value.
644 			 */
645 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
646 				*p ^= *src;
647 
648 				VDEV_RAIDZ_64MUL_2(*q, mask);
649 				*q ^= *src;
650 
651 				VDEV_RAIDZ_64MUL_4(*r, mask);
652 				*r ^= *src;
653 			}
654 
655 			/*
656 			 * Treat short columns as though they are full of 0s.
657 			 * Note that there's therefore nothing needed for P.
658 			 */
659 			for (; i < pcnt; i++, q++, r++) {
660 				VDEV_RAIDZ_64MUL_2(*q, mask);
661 				VDEV_RAIDZ_64MUL_4(*r, mask);
662 			}
663 		}
664 	}
665 }
666 
667 /*
668  * Generate RAID parity in the first virtual columns according to the number of
669  * parity columns available.
670  */
671 static void
672 vdev_raidz_generate_parity(raidz_map_t *rm)
673 {
674 	switch (rm->rm_firstdatacol) {
675 	case 1:
676 		vdev_raidz_generate_parity_p(rm);
677 		break;
678 	case 2:
679 		vdev_raidz_generate_parity_pq(rm);
680 		break;
681 	case 3:
682 		vdev_raidz_generate_parity_pqr(rm);
683 		break;
684 	default:
685 		panic("invalid RAID-Z configuration");
686 	}
687 }
688 
689 /* BEGIN CSTYLED */
690 /*
691  * In the general case of reconstruction, we must solve the system of linear
692  * equations defined by the coeffecients used to generate parity as well as
693  * the contents of the data and parity disks. This can be expressed with
694  * vectors for the original data (D) and the actual data (d) and parity (p)
695  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
696  *
697  *            __   __                     __     __
698  *            |     |         __     __   |  p_0  |
699  *            |  V  |         |  D_0  |   | p_m-1 |
700  *            |     |    x    |   :   | = |  d_0  |
701  *            |  I  |         | D_n-1 |   |   :   |
702  *            |     |         ~~     ~~   | d_n-1 |
703  *            ~~   ~~                     ~~     ~~
704  *
705  * I is simply a square identity matrix of size n, and V is a vandermonde
706  * matrix defined by the coeffecients we chose for the various parity columns
707  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
708  * computation as well as linear separability.
709  *
710  *      __               __               __     __
711  *      |   1   ..  1 1 1 |               |  p_0  |
712  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
713  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
714  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
715  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
716  *      |   :       : : : |   |   :   |   |  d_2  |
717  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
718  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
719  *      |   0   ..  0 0 1 |               | d_n-1 |
720  *      ~~               ~~               ~~     ~~
721  *
722  * Note that I, V, d, and p are known. To compute D, we must invert the
723  * matrix and use the known data and parity values to reconstruct the unknown
724  * data values. We begin by removing the rows in V|I and d|p that correspond
725  * to failed or missing columns; we then make V|I square (n x n) and d|p
726  * sized n by removing rows corresponding to unused parity from the bottom up
727  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
728  * using Gauss-Jordan elimination. In the example below we use m=3 parity
729  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
730  *           __                               __
731  *           |  1   1   1   1   1   1   1   1  |
732  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
733  *           |  19 205 116  29  64  16  4   1  |      / /
734  *           |  1   0   0   0   0   0   0   0  |     / /
735  *           |  0   1   0   0   0   0   0   0  | <--' /
736  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
737  *           |  0   0   0   1   0   0   0   0  |
738  *           |  0   0   0   0   1   0   0   0  |
739  *           |  0   0   0   0   0   1   0   0  |
740  *           |  0   0   0   0   0   0   1   0  |
741  *           |  0   0   0   0   0   0   0   1  |
742  *           ~~                               ~~
743  *           __                               __
744  *           |  1   1   1   1   1   1   1   1  |
745  *           | 128  64  32  16  8   4   2   1  |
746  *           |  19 205 116  29  64  16  4   1  |
747  *           |  1   0   0   0   0   0   0   0  |
748  *           |  0   1   0   0   0   0   0   0  |
749  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
750  *           |  0   0   0   1   0   0   0   0  |
751  *           |  0   0   0   0   1   0   0   0  |
752  *           |  0   0   0   0   0   1   0   0  |
753  *           |  0   0   0   0   0   0   1   0  |
754  *           |  0   0   0   0   0   0   0   1  |
755  *           ~~                               ~~
756  *
757  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
758  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
759  * matrix is not singular.
760  * __                                                                 __
761  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
762  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
763  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
764  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
765  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
766  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
767  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
768  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
769  * ~~                                                                 ~~
770  * __                                                                 __
771  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
772  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
773  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
774  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
775  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
776  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
777  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
778  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
779  * ~~                                                                 ~~
780  * __                                                                 __
781  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
782  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
783  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
784  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
785  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
786  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
787  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
788  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
789  * ~~                                                                 ~~
790  * __                                                                 __
791  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
792  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
793  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
794  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
795  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
796  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
797  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
798  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
799  * ~~                                                                 ~~
800  * __                                                                 __
801  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
802  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
803  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
804  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
805  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
806  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
807  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
808  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
809  * ~~                                                                 ~~
810  * __                                                                 __
811  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
812  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
813  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
814  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
815  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
816  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
817  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
818  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
819  * ~~                                                                 ~~
820  *                   __                               __
821  *                   |  0   0   1   0   0   0   0   0  |
822  *                   | 167 100  5   41 159 169 217 208 |
823  *                   | 166 100  4   40 158 168 216 209 |
824  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
825  *                   |  0   0   0   0   1   0   0   0  |
826  *                   |  0   0   0   0   0   1   0   0  |
827  *                   |  0   0   0   0   0   0   1   0  |
828  *                   |  0   0   0   0   0   0   0   1  |
829  *                   ~~                               ~~
830  *
831  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
832  * of the missing data.
833  *
834  * As is apparent from the example above, the only non-trivial rows in the
835  * inverse matrix correspond to the data disks that we're trying to
836  * reconstruct. Indeed, those are the only rows we need as the others would
837  * only be useful for reconstructing data known or assumed to be valid. For
838  * that reason, we only build the coefficients in the rows that correspond to
839  * targeted columns.
840  */
841 /* END CSTYLED */
842 
843 static void
844 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
845     uint8_t **rows)
846 {
847 	int i, j;
848 	int pow;
849 
850 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
851 
852 	/*
853 	 * Fill in the missing rows of interest.
854 	 */
855 	for (i = 0; i < nmap; i++) {
856 		ASSERT3S(0, <=, map[i]);
857 		ASSERT3S(map[i], <=, 2);
858 
859 		pow = map[i] * n;
860 		if (pow > 255)
861 			pow -= 255;
862 		ASSERT(pow <= 255);
863 
864 		for (j = 0; j < n; j++) {
865 			pow -= map[i];
866 			if (pow < 0)
867 				pow += 255;
868 			rows[i][j] = vdev_raidz_pow2[pow];
869 		}
870 	}
871 }
872 
873 static void
874 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
875     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
876 {
877 	int i, j, ii, jj;
878 	uint8_t log;
879 
880 	/*
881 	 * Assert that the first nmissing entries from the array of used
882 	 * columns correspond to parity columns and that subsequent entries
883 	 * correspond to data columns.
884 	 */
885 	for (i = 0; i < nmissing; i++) {
886 		ASSERT3S(used[i], <, rm->rm_firstdatacol);
887 	}
888 	for (; i < n; i++) {
889 		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
890 	}
891 
892 	/*
893 	 * First initialize the storage where we'll compute the inverse rows.
894 	 */
895 	for (i = 0; i < nmissing; i++) {
896 		for (j = 0; j < n; j++) {
897 			invrows[i][j] = (i == j) ? 1 : 0;
898 		}
899 	}
900 
901 	/*
902 	 * Subtract all trivial rows from the rows of consequence.
903 	 */
904 	for (i = 0; i < nmissing; i++) {
905 		for (j = nmissing; j < n; j++) {
906 			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
907 			jj = used[j] - rm->rm_firstdatacol;
908 			ASSERT3S(jj, <, n);
909 			invrows[i][j] = rows[i][jj];
910 			rows[i][jj] = 0;
911 		}
912 	}
913 
914 	/*
915 	 * For each of the rows of interest, we must normalize it and subtract
916 	 * a multiple of it from the other rows.
917 	 */
918 	for (i = 0; i < nmissing; i++) {
919 		for (j = 0; j < missing[i]; j++) {
920 			ASSERT3U(rows[i][j], ==, 0);
921 		}
922 		ASSERT3U(rows[i][missing[i]], !=, 0);
923 
924 		/*
925 		 * Compute the inverse of the first element and multiply each
926 		 * element in the row by that value.
927 		 */
928 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
929 
930 		for (j = 0; j < n; j++) {
931 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
932 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
933 		}
934 
935 		for (ii = 0; ii < nmissing; ii++) {
936 			if (i == ii)
937 				continue;
938 
939 			ASSERT3U(rows[ii][missing[i]], !=, 0);
940 
941 			log = vdev_raidz_log2[rows[ii][missing[i]]];
942 
943 			for (j = 0; j < n; j++) {
944 				rows[ii][j] ^=
945 				    vdev_raidz_exp2(rows[i][j], log);
946 				invrows[ii][j] ^=
947 				    vdev_raidz_exp2(invrows[i][j], log);
948 			}
949 		}
950 	}
951 
952 	/*
953 	 * Verify that the data that is left in the rows are properly part of
954 	 * an identity matrix.
955 	 */
956 	for (i = 0; i < nmissing; i++) {
957 		for (j = 0; j < n; j++) {
958 			if (j == missing[i]) {
959 				ASSERT3U(rows[i][j], ==, 1);
960 			} else {
961 				ASSERT3U(rows[i][j], ==, 0);
962 			}
963 		}
964 	}
965 }
966 
967 static void
968 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
969     int *missing, uint8_t **invrows, const uint8_t *used)
970 {
971 	int i, j, x, cc, c;
972 	uint8_t *src;
973 	uint64_t ccount;
974 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
975 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
976 	uint8_t log, val;
977 	int ll;
978 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
979 	uint8_t *p, *pp;
980 	size_t psize;
981 
982 	log = 0;	/* gcc */
983 	psize = sizeof (invlog[0][0]) * n * nmissing;
984 	p = malloc(psize);
985 	if (p == NULL) {
986 		printf("Out of memory\n");
987 		return;
988 	}
989 
990 	for (pp = p, i = 0; i < nmissing; i++) {
991 		invlog[i] = pp;
992 		pp += n;
993 	}
994 
995 	for (i = 0; i < nmissing; i++) {
996 		for (j = 0; j < n; j++) {
997 			ASSERT3U(invrows[i][j], !=, 0);
998 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
999 		}
1000 	}
1001 
1002 	for (i = 0; i < n; i++) {
1003 		c = used[i];
1004 		ASSERT3U(c, <, rm->rm_cols);
1005 
1006 		src = rm->rm_col[c].rc_data;
1007 		ccount = rm->rm_col[c].rc_size;
1008 		for (j = 0; j < nmissing; j++) {
1009 			cc = missing[j] + rm->rm_firstdatacol;
1010 			ASSERT3U(cc, >=, rm->rm_firstdatacol);
1011 			ASSERT3U(cc, <, rm->rm_cols);
1012 			ASSERT3U(cc, !=, c);
1013 
1014 			dst[j] = rm->rm_col[cc].rc_data;
1015 			dcount[j] = rm->rm_col[cc].rc_size;
1016 		}
1017 
1018 		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1019 
1020 		for (x = 0; x < ccount; x++, src++) {
1021 			if (*src != 0)
1022 				log = vdev_raidz_log2[*src];
1023 
1024 			for (cc = 0; cc < nmissing; cc++) {
1025 				if (x >= dcount[cc])
1026 					continue;
1027 
1028 				if (*src == 0) {
1029 					val = 0;
1030 				} else {
1031 					if ((ll = log + invlog[cc][i]) >= 255)
1032 						ll -= 255;
1033 					val = vdev_raidz_pow2[ll];
1034 				}
1035 
1036 				if (i == 0)
1037 					dst[cc][x] = val;
1038 				else
1039 					dst[cc][x] ^= val;
1040 			}
1041 		}
1042 	}
1043 
1044 	free(p);
1045 }
1046 
1047 static int
1048 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1049 {
1050 	int n, i, c, t, tt;
1051 	int nmissing_rows;
1052 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1053 	int parity_map[VDEV_RAIDZ_MAXPARITY];
1054 
1055 	uint8_t *p, *pp;
1056 	size_t psize;
1057 
1058 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1059 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1060 	uint8_t *used;
1061 
1062 	int code = 0;
1063 
1064 
1065 	n = rm->rm_cols - rm->rm_firstdatacol;
1066 
1067 	/*
1068 	 * Figure out which data columns are missing.
1069 	 */
1070 	nmissing_rows = 0;
1071 	for (t = 0; t < ntgts; t++) {
1072 		if (tgts[t] >= rm->rm_firstdatacol) {
1073 			missing_rows[nmissing_rows++] =
1074 			    tgts[t] - rm->rm_firstdatacol;
1075 		}
1076 	}
1077 
1078 	/*
1079 	 * Figure out which parity columns to use to help generate the missing
1080 	 * data columns.
1081 	 */
1082 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1083 		ASSERT(tt < ntgts);
1084 		ASSERT(c < rm->rm_firstdatacol);
1085 
1086 		/*
1087 		 * Skip any targeted parity columns.
1088 		 */
1089 		if (c == tgts[tt]) {
1090 			tt++;
1091 			continue;
1092 		}
1093 
1094 		code |= 1 << c;
1095 
1096 		parity_map[i] = c;
1097 		i++;
1098 	}
1099 
1100 	ASSERT(code != 0);
1101 	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1102 
1103 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1104 	    nmissing_rows * n + sizeof (used[0]) * n;
1105 	p = malloc(psize);
1106 	if (p == NULL) {
1107 		printf("Out of memory\n");
1108 		return (code);
1109 	}
1110 
1111 	for (pp = p, i = 0; i < nmissing_rows; i++) {
1112 		rows[i] = pp;
1113 		pp += n;
1114 		invrows[i] = pp;
1115 		pp += n;
1116 	}
1117 	used = pp;
1118 
1119 	for (i = 0; i < nmissing_rows; i++) {
1120 		used[i] = parity_map[i];
1121 	}
1122 
1123 	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1124 		if (tt < nmissing_rows &&
1125 		    c == missing_rows[tt] + rm->rm_firstdatacol) {
1126 			tt++;
1127 			continue;
1128 		}
1129 
1130 		ASSERT3S(i, <, n);
1131 		used[i] = c;
1132 		i++;
1133 	}
1134 
1135 	/*
1136 	 * Initialize the interesting rows of the matrix.
1137 	 */
1138 	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1139 
1140 	/*
1141 	 * Invert the matrix.
1142 	 */
1143 	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1144 	    invrows, used);
1145 
1146 	/*
1147 	 * Reconstruct the missing data using the generated matrix.
1148 	 */
1149 	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1150 	    invrows, used);
1151 
1152 	free(p);
1153 
1154 	return (code);
1155 }
1156 
1157 static int
1158 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1159 {
1160 	int tgts[VDEV_RAIDZ_MAXPARITY];
1161 	int ntgts;
1162 	int i, c;
1163 	int code;
1164 	int nbadparity, nbaddata;
1165 
1166 	/*
1167 	 * The tgts list must already be sorted.
1168 	 */
1169 	for (i = 1; i < nt; i++) {
1170 		ASSERT(t[i] > t[i - 1]);
1171 	}
1172 
1173 	nbadparity = rm->rm_firstdatacol;
1174 	nbaddata = rm->rm_cols - nbadparity;
1175 	ntgts = 0;
1176 	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1177 		if (i < nt && c == t[i]) {
1178 			tgts[ntgts++] = c;
1179 			i++;
1180 		} else if (rm->rm_col[c].rc_error != 0) {
1181 			tgts[ntgts++] = c;
1182 		} else if (c >= rm->rm_firstdatacol) {
1183 			nbaddata--;
1184 		} else {
1185 			nbadparity--;
1186 		}
1187 	}
1188 
1189 	ASSERT(ntgts >= nt);
1190 	ASSERT(nbaddata >= 0);
1191 	ASSERT(nbaddata + nbadparity == ntgts);
1192 
1193 	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1194 	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1195 	ASSERT(code > 0);
1196 	return (code);
1197 }
1198 
1199 static raidz_map_t *
1200 vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
1201     uint64_t dcols, uint64_t nparity)
1202 {
1203 	raidz_map_t *rm;
1204 	uint64_t b = offset >> unit_shift;
1205 	uint64_t s = size >> unit_shift;
1206 	uint64_t f = b % dcols;
1207 	uint64_t o = (b / dcols) << unit_shift;
1208 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
1209 
1210 	q = s / (dcols - nparity);
1211 	r = s - q * (dcols - nparity);
1212 	bc = (r == 0 ? 0 : r + nparity);
1213 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
1214 
1215 	if (q == 0) {
1216 		acols = bc;
1217 		scols = MIN(dcols, roundup(bc, nparity + 1));
1218 	} else {
1219 		acols = dcols;
1220 		scols = dcols;
1221 	}
1222 
1223 	ASSERT3U(acols, <=, scols);
1224 
1225 	rm = malloc(offsetof(raidz_map_t, rm_col[scols]));
1226 	if (rm == NULL)
1227 		return (rm);
1228 
1229 	rm->rm_cols = acols;
1230 	rm->rm_scols = scols;
1231 	rm->rm_bigcols = bc;
1232 	rm->rm_skipstart = bc;
1233 	rm->rm_missingdata = 0;
1234 	rm->rm_missingparity = 0;
1235 	rm->rm_firstdatacol = nparity;
1236 	rm->rm_reports = 0;
1237 	rm->rm_freed = 0;
1238 	rm->rm_ecksuminjected = 0;
1239 
1240 	asize = 0;
1241 
1242 	for (c = 0; c < scols; c++) {
1243 		col = f + c;
1244 		coff = o;
1245 		if (col >= dcols) {
1246 			col -= dcols;
1247 			coff += 1ULL << unit_shift;
1248 		}
1249 		rm->rm_col[c].rc_devidx = col;
1250 		rm->rm_col[c].rc_offset = coff;
1251 		rm->rm_col[c].rc_data = NULL;
1252 		rm->rm_col[c].rc_error = 0;
1253 		rm->rm_col[c].rc_tried = 0;
1254 		rm->rm_col[c].rc_skipped = 0;
1255 
1256 		if (c >= acols)
1257 			rm->rm_col[c].rc_size = 0;
1258 		else if (c < bc)
1259 			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
1260 		else
1261 			rm->rm_col[c].rc_size = q << unit_shift;
1262 
1263 		asize += rm->rm_col[c].rc_size;
1264 	}
1265 
1266 	ASSERT3U(asize, ==, tot << unit_shift);
1267 	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
1268 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
1269 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
1270 	ASSERT3U(rm->rm_nskip, <=, nparity);
1271 
1272 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1273 		rm->rm_col[c].rc_data = malloc(rm->rm_col[c].rc_size);
1274 		if (rm->rm_col[c].rc_data == NULL) {
1275 			c++;
1276 			while (c != 0)
1277 				free(rm->rm_col[--c].rc_data);
1278 			free(rm);
1279 			return (NULL);
1280 		}
1281 	}
1282 
1283 	rm->rm_col[c].rc_data = data;
1284 
1285 	for (c = c + 1; c < acols; c++)
1286 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
1287 		    rm->rm_col[c - 1].rc_size;
1288 
1289 	/*
1290 	 * If all data stored spans all columns, there's a danger that parity
1291 	 * will always be on the same device and, since parity isn't read
1292 	 * during normal operation, that that device's I/O bandwidth won't be
1293 	 * used effectively. We therefore switch the parity every 1MB.
1294 	 *
1295 	 * ... at least that was, ostensibly, the theory. As a practical
1296 	 * matter unless we juggle the parity between all devices evenly, we
1297 	 * won't see any benefit. Further, occasional writes that aren't a
1298 	 * multiple of the LCM of the number of children and the minimum
1299 	 * stripe width are sufficient to avoid pessimal behavior.
1300 	 * Unfortunately, this decision created an implicit on-disk format
1301 	 * requirement that we need to support for all eternity, but only
1302 	 * for single-parity RAID-Z.
1303 	 *
1304 	 * If we intend to skip a sector in the zeroth column for padding
1305 	 * we must make sure to note this swap. We will never intend to
1306 	 * skip the first column since at least one data and one parity
1307 	 * column must appear in each row.
1308 	 */
1309 	ASSERT(rm->rm_cols >= 2);
1310 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
1311 
1312 	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
1313 		devidx = rm->rm_col[0].rc_devidx;
1314 		o = rm->rm_col[0].rc_offset;
1315 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
1316 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
1317 		rm->rm_col[1].rc_devidx = devidx;
1318 		rm->rm_col[1].rc_offset = o;
1319 
1320 		if (rm->rm_skipstart == 0)
1321 			rm->rm_skipstart = 1;
1322 	}
1323 
1324 	return (rm);
1325 }
1326 
1327 static void
1328 vdev_raidz_map_free(raidz_map_t *rm)
1329 {
1330 	int c;
1331 
1332 	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
1333 		free(rm->rm_col[c].rc_data);
1334 
1335 	free(rm);
1336 }
1337 
1338 static vdev_t *
1339 vdev_child(vdev_t *pvd, uint64_t devidx)
1340 {
1341 	vdev_t *cvd;
1342 
1343 	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
1344 		if (cvd->v_id == devidx)
1345 			break;
1346 	}
1347 
1348 	return (cvd);
1349 }
1350 
1351 /*
1352  * We keep track of whether or not there were any injected errors, so that
1353  * any ereports we generate can note it.
1354  */
1355 static int
1356 raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
1357     uint64_t size)
1358 {
1359 	return (zio_checksum_verify(spa, bp, data));
1360 }
1361 
1362 /*
1363  * Generate the parity from the data columns. If we tried and were able to
1364  * read the parity without error, verify that the generated parity matches the
1365  * data we read. If it doesn't, we fire off a checksum error. Return the
1366  * number such failures.
1367  */
1368 static int
1369 raidz_parity_verify(raidz_map_t *rm)
1370 {
1371 	void *orig[VDEV_RAIDZ_MAXPARITY];
1372 	int c, ret = 0;
1373 	raidz_col_t *rc;
1374 
1375 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1376 		rc = &rm->rm_col[c];
1377 		if (!rc->rc_tried || rc->rc_error != 0)
1378 			continue;
1379 		orig[c] = malloc(rc->rc_size);
1380 		if (orig[c] != NULL) {
1381 			bcopy(rc->rc_data, orig[c], rc->rc_size);
1382 		} else {
1383 			printf("Out of memory\n");
1384 		}
1385 	}
1386 
1387 	vdev_raidz_generate_parity(rm);
1388 
1389 	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
1390 		rc = &rm->rm_col[c];
1391 		if (!rc->rc_tried || rc->rc_error != 0)
1392 			continue;
1393 		if (orig[c] == NULL ||
1394 		    bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1395 			rc->rc_error = ECKSUM;
1396 			ret++;
1397 		}
1398 		free(orig[c]);
1399 	}
1400 
1401 	return (ret);
1402 }
1403 
1404 /*
1405  * Iterate over all combinations of bad data and attempt a reconstruction.
1406  * Note that the algorithm below is non-optimal because it doesn't take into
1407  * account how reconstruction is actually performed. For example, with
1408  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1409  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1410  * cases we'd only use parity information in column 0.
1411  */
1412 static int
1413 vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
1414     void *data, off_t offset, uint64_t bytes, int total_errors, int data_errors)
1415 {
1416 	raidz_col_t *rc;
1417 	void *orig[VDEV_RAIDZ_MAXPARITY];
1418 	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1419 	int *tgts = &tstore[1];
1420 	int current, next, i, c, n;
1421 	int code, ret = 0;
1422 
1423 	ASSERT(total_errors < rm->rm_firstdatacol);
1424 
1425 	/*
1426 	 * This simplifies one edge condition.
1427 	 */
1428 	tgts[-1] = -1;
1429 
1430 	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1431 		/*
1432 		 * Initialize the targets array by finding the first n columns
1433 		 * that contain no error.
1434 		 *
1435 		 * If there were no data errors, we need to ensure that we're
1436 		 * always explicitly attempting to reconstruct at least one
1437 		 * data column. To do this, we simply push the highest target
1438 		 * up into the data columns.
1439 		 */
1440 		for (c = 0, i = 0; i < n; i++) {
1441 			if (i == n - 1 && data_errors == 0 &&
1442 			    c < rm->rm_firstdatacol) {
1443 				c = rm->rm_firstdatacol;
1444 			}
1445 
1446 			while (rm->rm_col[c].rc_error != 0) {
1447 				c++;
1448 				ASSERT3S(c, <, rm->rm_cols);
1449 			}
1450 
1451 			tgts[i] = c++;
1452 		}
1453 
1454 		/*
1455 		 * Setting tgts[n] simplifies the other edge condition.
1456 		 */
1457 		tgts[n] = rm->rm_cols;
1458 
1459 		/*
1460 		 * These buffers were allocated in previous iterations.
1461 		 */
1462 		for (i = 0; i < n - 1; i++) {
1463 			ASSERT(orig[i] != NULL);
1464 		}
1465 
1466 		orig[n - 1] = malloc(rm->rm_col[0].rc_size);
1467 		if (orig[n - 1] == NULL) {
1468 			ret = ENOMEM;
1469 			goto done;
1470 		}
1471 
1472 		current = 0;
1473 		next = tgts[current];
1474 
1475 		while (current != n) {
1476 			tgts[current] = next;
1477 			current = 0;
1478 
1479 			/*
1480 			 * Save off the original data that we're going to
1481 			 * attempt to reconstruct.
1482 			 */
1483 			for (i = 0; i < n; i++) {
1484 				ASSERT(orig[i] != NULL);
1485 				c = tgts[i];
1486 				ASSERT3S(c, >=, 0);
1487 				ASSERT3S(c, <, rm->rm_cols);
1488 				rc = &rm->rm_col[c];
1489 				bcopy(rc->rc_data, orig[i], rc->rc_size);
1490 			}
1491 
1492 			/*
1493 			 * Attempt a reconstruction and exit the outer loop on
1494 			 * success.
1495 			 */
1496 			code = vdev_raidz_reconstruct(rm, tgts, n);
1497 			if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
1498 				for (i = 0; i < n; i++) {
1499 					c = tgts[i];
1500 					rc = &rm->rm_col[c];
1501 					ASSERT(rc->rc_error == 0);
1502 					rc->rc_error = ECKSUM;
1503 				}
1504 
1505 				ret = code;
1506 				goto done;
1507 			}
1508 
1509 			/*
1510 			 * Restore the original data.
1511 			 */
1512 			for (i = 0; i < n; i++) {
1513 				c = tgts[i];
1514 				rc = &rm->rm_col[c];
1515 				bcopy(orig[i], rc->rc_data, rc->rc_size);
1516 			}
1517 
1518 			do {
1519 				/*
1520 				 * Find the next valid column after the current
1521 				 * position..
1522 				 */
1523 				for (next = tgts[current] + 1;
1524 				    next < rm->rm_cols &&
1525 				    rm->rm_col[next].rc_error != 0; next++)
1526 					continue;
1527 
1528 				ASSERT(next <= tgts[current + 1]);
1529 
1530 				/*
1531 				 * If that spot is available, we're done here.
1532 				 */
1533 				if (next != tgts[current + 1])
1534 					break;
1535 
1536 				/*
1537 				 * Otherwise, find the next valid column after
1538 				 * the previous position.
1539 				 */
1540 				for (c = tgts[current - 1] + 1;
1541 				    rm->rm_col[c].rc_error != 0; c++)
1542 					continue;
1543 
1544 				tgts[current] = c;
1545 				current++;
1546 
1547 			} while (current != n);
1548 		}
1549 	}
1550 	n--;
1551 done:
1552 	for (i = n - 1; i >= 0; i--) {
1553 		free(orig[i]);
1554 	}
1555 
1556 	return (ret);
1557 }
1558 
1559 static int
1560 vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
1561     off_t offset, size_t bytes)
1562 {
1563 	vdev_t *tvd = vd->v_top;
1564 	vdev_t *cvd;
1565 	raidz_map_t *rm;
1566 	raidz_col_t *rc;
1567 	int c, error;
1568 	int unexpected_errors;
1569 	int parity_errors;
1570 	int parity_untried;
1571 	int data_errors;
1572 	int total_errors;
1573 	int n;
1574 	int tgts[VDEV_RAIDZ_MAXPARITY];
1575 	int code;
1576 
1577 	rc = NULL;	/* gcc */
1578 	error = 0;
1579 
1580 	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
1581 	    vd->v_nchildren, vd->v_nparity);
1582 	if (rm == NULL)
1583 		return (ENOMEM);
1584 
1585 	/*
1586 	 * Iterate over the columns in reverse order so that we hit the parity
1587 	 * last -- any errors along the way will force us to read the parity.
1588 	 */
1589 	for (c = rm->rm_cols - 1; c >= 0; c--) {
1590 		rc = &rm->rm_col[c];
1591 		cvd = vdev_child(vd, rc->rc_devidx);
1592 		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
1593 			if (c >= rm->rm_firstdatacol)
1594 				rm->rm_missingdata++;
1595 			else
1596 				rm->rm_missingparity++;
1597 			rc->rc_error = ENXIO;
1598 			rc->rc_tried = 1;	/* don't even try */
1599 			rc->rc_skipped = 1;
1600 			continue;
1601 		}
1602 #if 0		/* XXX: Too hard for the boot code. */
1603 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1604 			if (c >= rm->rm_firstdatacol)
1605 				rm->rm_missingdata++;
1606 			else
1607 				rm->rm_missingparity++;
1608 			rc->rc_error = ESTALE;
1609 			rc->rc_skipped = 1;
1610 			continue;
1611 		}
1612 #endif
1613 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
1614 			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
1615 			    rc->rc_offset, rc->rc_size);
1616 			rc->rc_tried = 1;
1617 			rc->rc_skipped = 0;
1618 		}
1619 	}
1620 
1621 reconstruct:
1622 	unexpected_errors = 0;
1623 	parity_errors = 0;
1624 	parity_untried = 0;
1625 	data_errors = 0;
1626 	total_errors = 0;
1627 
1628 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1629 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1630 
1631 	for (c = 0; c < rm->rm_cols; c++) {
1632 		rc = &rm->rm_col[c];
1633 
1634 		if (rc->rc_error) {
1635 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
1636 
1637 			if (c < rm->rm_firstdatacol)
1638 				parity_errors++;
1639 			else
1640 				data_errors++;
1641 
1642 			if (!rc->rc_skipped)
1643 				unexpected_errors++;
1644 
1645 			total_errors++;
1646 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1647 			parity_untried++;
1648 		}
1649 	}
1650 
1651 	/*
1652 	 * There are three potential phases for a read:
1653 	 *	1. produce valid data from the columns read
1654 	 *	2. read all disks and try again
1655 	 *	3. perform combinatorial reconstruction
1656 	 *
1657 	 * Each phase is progressively both more expensive and less likely to
1658 	 * occur. If we encounter more errors than we can repair or all phases
1659 	 * fail, we have no choice but to return an error.
1660 	 */
1661 
1662 	/*
1663 	 * If the number of errors we saw was correctable -- less than or equal
1664 	 * to the number of parity disks read -- attempt to produce data that
1665 	 * has a valid checksum. Naturally, this case applies in the absence of
1666 	 * any errors.
1667 	 */
1668 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1669 		int rv;
1670 
1671 		if (data_errors == 0) {
1672 			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
1673 			if (rv == 0) {
1674 				/*
1675 				 * If we read parity information (unnecessarily
1676 				 * as it happens since no reconstruction was
1677 				 * needed) regenerate and verify the parity.
1678 				 * We also regenerate parity when resilvering
1679 				 * so we can write it out to the failed device
1680 				 * later.
1681 				 */
1682 				if (parity_errors + parity_untried <
1683 				    rm->rm_firstdatacol) {
1684 					n = raidz_parity_verify(rm);
1685 					unexpected_errors += n;
1686 					ASSERT(parity_errors + n <=
1687 					    rm->rm_firstdatacol);
1688 				}
1689 				goto done;
1690 			}
1691 		} else {
1692 			/*
1693 			 * We either attempt to read all the parity columns or
1694 			 * none of them. If we didn't try to read parity, we
1695 			 * wouldn't be here in the correctable case. There must
1696 			 * also have been fewer parity errors than parity
1697 			 * columns or, again, we wouldn't be in this code path.
1698 			 */
1699 			ASSERT(parity_untried == 0);
1700 			ASSERT(parity_errors < rm->rm_firstdatacol);
1701 
1702 			/*
1703 			 * Identify the data columns that reported an error.
1704 			 */
1705 			n = 0;
1706 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1707 				rc = &rm->rm_col[c];
1708 				if (rc->rc_error != 0) {
1709 					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1710 					tgts[n++] = c;
1711 				}
1712 			}
1713 
1714 			ASSERT(rm->rm_firstdatacol >= n);
1715 
1716 			code = vdev_raidz_reconstruct(rm, tgts, n);
1717 
1718 			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
1719 			if (rv == 0) {
1720 				/*
1721 				 * If we read more parity disks than were used
1722 				 * for reconstruction, confirm that the other
1723 				 * parity disks produced correct data. This
1724 				 * routine is suboptimal in that it regenerates
1725 				 * the parity that we already used in addition
1726 				 * to the parity that we're attempting to
1727 				 * verify, but this should be a relatively
1728 				 * uncommon case, and can be optimized if it
1729 				 * becomes a problem. Note that we regenerate
1730 				 * parity when resilvering so we can write it
1731 				 * out to failed devices later.
1732 				 */
1733 				if (parity_errors < rm->rm_firstdatacol - n) {
1734 					n = raidz_parity_verify(rm);
1735 					unexpected_errors += n;
1736 					ASSERT(parity_errors + n <=
1737 					    rm->rm_firstdatacol);
1738 				}
1739 
1740 				goto done;
1741 			}
1742 		}
1743 	}
1744 
1745 	/*
1746 	 * This isn't a typical situation -- either we got a read
1747 	 * error or a child silently returned bad data. Read every
1748 	 * block so we can try again with as much data and parity as
1749 	 * we can track down. If we've already been through once
1750 	 * before, all children will be marked as tried so we'll
1751 	 * proceed to combinatorial reconstruction.
1752 	 */
1753 	unexpected_errors = 1;
1754 	rm->rm_missingdata = 0;
1755 	rm->rm_missingparity = 0;
1756 
1757 	n = 0;
1758 	for (c = 0; c < rm->rm_cols; c++) {
1759 		rc = &rm->rm_col[c];
1760 
1761 		if (rc->rc_tried)
1762 			continue;
1763 
1764 		cvd = vdev_child(vd, rc->rc_devidx);
1765 		ASSERT(cvd != NULL);
1766 		rc->rc_error = cvd->v_read(cvd, NULL,
1767 		    rc->rc_data, rc->rc_offset, rc->rc_size);
1768 		if (rc->rc_error == 0)
1769 			n++;
1770 		rc->rc_tried = 1;
1771 		rc->rc_skipped = 0;
1772 	}
1773 	/*
1774 	 * If we managed to read anything more, retry the
1775 	 * reconstruction.
1776 	 */
1777 	if (n > 0)
1778 		goto reconstruct;
1779 
1780 	/*
1781 	 * At this point we've attempted to reconstruct the data given the
1782 	 * errors we detected, and we've attempted to read all columns. There
1783 	 * must, therefore, be one or more additional problems -- silent errors
1784 	 * resulting in invalid data rather than explicit I/O errors resulting
1785 	 * in absent data. We check if there is enough additional data to
1786 	 * possibly reconstruct the data and then perform combinatorial
1787 	 * reconstruction over all possible combinations. If that fails,
1788 	 * we're cooked.
1789 	 */
1790 	if (total_errors > rm->rm_firstdatacol) {
1791 		error = EIO;
1792 	} else if (total_errors < rm->rm_firstdatacol &&
1793 	    (code = vdev_raidz_combrec(vd->v_spa, rm, bp, data, offset, bytes,
1794 	     total_errors, data_errors)) != 0) {
1795 		/*
1796 		 * If we didn't use all the available parity for the
1797 		 * combinatorial reconstruction, verify that the remaining
1798 		 * parity is correct.
1799 		 */
1800 		if (code != (1 << rm->rm_firstdatacol) - 1)
1801 			(void) raidz_parity_verify(rm);
1802 	} else {
1803 		/*
1804 		 * We're here because either:
1805 		 *
1806 		 *	total_errors == rm_first_datacol, or
1807 		 *	vdev_raidz_combrec() failed
1808 		 *
1809 		 * In either case, there is enough bad data to prevent
1810 		 * reconstruction.
1811 		 *
1812 		 * Start checksum ereports for all children which haven't
1813 		 * failed, and the IO wasn't speculative.
1814 		 */
1815 		error = ECKSUM;
1816 	}
1817 
1818 done:
1819 	vdev_raidz_map_free(rm);
1820 
1821 	return (error);
1822 }
1823