xref: /freebsd/sys/cddl/boot/zfs/zfssubr.c (revision 99282790b7d01ec3c4072621d46a0d7302517ad4)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/cdefs.h>
27 __FBSDID("$FreeBSD$");
28 
29 #include <lz4.h>
30 
/*
 * CRC-64 lookup table.  It is filled in by zfs_init_crc() and indexed by
 * zap_hash(); entry 128 doubles as the "table is initialized" marker.
 */
static uint64_t zfs_crc64_table[256];

/*
 * NOTE(review): presumably the error number reported for checksum
 * failures; no user is visible in this part of the file -- confirm.
 */
#define	ECKSUM	666

/*
 * Assertions are compiled out in this file: every ASSERT* form expands to
 * a no-op expression, so the arguments are never evaluated.
 */
#define	ASSERT3S(x, y, z)	((void)0)
#define	ASSERT3U(x, y, z)	((void)0)
#define	ASSERT3P(x, y, z)	((void)0)
#define	ASSERT0(x)		((void)0)
#define	ASSERT(x)		((void)0)
40 
/*
 * Print a printf-style message and then spin forever; control never
 * returns to the caller.
 */
#define	panic(...)	do {						\
	printf(__VA_ARGS__);						\
	while (1)							\
		continue;						\
} while (0)
45 
46 static void
47 zfs_init_crc(void)
48 {
49 	int i, j;
50 	uint64_t *ct;
51 
52 	/*
53 	 * Calculate the crc64 table (used for the zap hash
54 	 * function).
55 	 */
56 	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
57 		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
58 		for (i = 0; i < 256; i++)
59 			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
60 				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
61 	}
62 }
63 
/*
 * Checksum function used by the "off" and "noparity" entries of
 * zio_checksum_table.  It does not look at the data: buf, size and
 * ctx_template are unused and *zcp is always set to all zeroes.
 */
static void
zio_checksum_off(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}
70 
/*
 * Signatures for checksum functions.
 *
 * zio_checksum_t computes the checksum of `size' bytes at `data' into
 * *zcp; ctx_template is the per-pool context (may be NULL) produced by
 * the matching zio_checksum_tmpl_init_t.
 *
 * zio_checksum_tmpl_init_t builds a context template from the pool's
 * checksum salt; zio_checksum_tmpl_free_t releases such a template.
 */
typedef void zio_checksum_t(const void *data, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp);
typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
typedef void zio_checksum_tmpl_free_t(void *ctx_template);
78 
/*
 * Property bits describing a checksum algorithm; they are OR-ed together
 * in zio_checksum_info_t.ci_flags.  Bit 0 is not used.
 */
enum zio_checksum_flags {
	ZCHECKSUM_FLAG_METADATA = (1 << 1),	/* strong enough for metadata */
	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),	/* ZIO embedded checksum */
	ZCHECKSUM_FLAG_DEDUP = (1 << 3),	/* dedup-safe w/o verification */
	ZCHECKSUM_FLAG_SALTED = (1 << 4),	/* uses the salt value */
	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)	/* strong enough for nopwrite */
};
typedef enum zio_checksum_flags zio_checksum_flags_t;
91 
/*
 * Information about each checksum function.  One of these exists per
 * checksum algorithm in zio_checksum_table.
 */
typedef struct zio_checksum_info {
	/*
	 * Checksum function for each byteorder; zio_checksum_verify()
	 * indexes this with its byteswap flag (0 or 1).
	 */
	zio_checksum_t			*ci_func[2];
	/* Optional context-template constructor; NULL when not needed. */
	zio_checksum_tmpl_init_t	*ci_tmpl_init;
	/* Destructor paired with ci_tmpl_init. */
	zio_checksum_tmpl_free_t	*ci_tmpl_free;
	/* OR of ZCHECKSUM_FLAG_* bits. */
	zio_checksum_flags_t		ci_flags;
	const char			*ci_name;	/* descriptive name */
} zio_checksum_info_t;
103 
104 #include "blkptr.c"
105 
106 #include "fletcher.c"
107 #include "sha256.c"
108 #include "skein_zfs.c"
109 
110 static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
111 	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
112 	{{NULL, NULL}, NULL, NULL, 0, "on"},
113 	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL, 0, "off"},
114 	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
115 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
116 	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
117 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
118 	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
119 	    ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
120 	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
121 	    0, "fletcher2"},
122 	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
123 	    ZCHECKSUM_FLAG_METADATA, "fletcher4"},
124 	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
125 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
126 	    ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
127 	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
128 	    ZCHECKSUM_FLAG_EMBEDDED, "zillog2"},
129 	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL,
130 	    0, "noparity"},
131 	{{zio_checksum_SHA512_native,	zio_checksum_SHA512_byteswap},
132 	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
133 	    ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
134 	{{zio_checksum_skein_native, zio_checksum_skein_byteswap},
135 	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
136 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
137 	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
138 	/* no edonr for now */
139 	{{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA |
140 	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"}
141 };
142 
/*
 * Common signatures for all zio compress/decompress functions.  The
 * trailing int is the algorithm's level parameter (zio_decompress_data()
 * passes ci_level there).
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int level);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int level);

/*
 * Information about each compression function.
 */
typedef struct zio_compress_info zio_compress_info_t;

struct zio_compress_info {
	/* compression function */
	zio_compress_func_t	*ci_compress;
	/* decompression function; NULL means "not supported here" */
	zio_decompress_func_t	*ci_decompress;
	/* level parameter handed to the functions above */
	int			ci_level;
	/* algorithm name */
	const char		*ci_name;
};
160 
161 #include "lzjb.c"
162 #include "zle.c"
163 
/*
 * Compression vectors, indexed by compression function id.
 * Each row is { compress, decompress, level, name }.
 *
 * No compress functions are provided.  Rows with a NULL decompress
 * function (including every gzip level) are rejected by
 * zio_decompress_data() with EIO.
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
	{NULL,			NULL,			0,	"inherit"},
	{NULL,			NULL,			0,	"on"},
	{NULL,			NULL,			0,	"uncompressed"},
	{NULL,			lzjb_decompress,	0,	"lzjb"},
	{NULL,			NULL,			0,	"empty"},
	{NULL,			NULL,			1,	"gzip-1"},
	{NULL,			NULL,			2,	"gzip-2"},
	{NULL,			NULL,			3,	"gzip-3"},
	{NULL,			NULL,			4,	"gzip-4"},
	{NULL,			NULL,			5,	"gzip-5"},
	{NULL,			NULL,			6,	"gzip-6"},
	{NULL,			NULL,			7,	"gzip-7"},
	{NULL,			NULL,			8,	"gzip-8"},
	{NULL,			NULL,			9,	"gzip-9"},
	{NULL,			zle_decompress,		64,	"zle"},
	{NULL,			lz4_decompress,		0,	"lz4"},
};
185 
/*
 * Byte-swap, in place, every 64-bit word of a buffer.
 *
 * vbuf points at the buffer and size is its length in bytes; size is
 * expected to be a multiple of 8 (any trailing partial word is left
 * untouched because the word count is size / 8).
 */
static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	size_t i;	/* same type as count: no signed/unsigned mixing */

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}
198 
199 /*
200  * Set the external verifier for a gang block based on <vdev, offset, txg>,
201  * a tuple which is guaranteed to be unique for the life of the pool.
202  */
203 static void
204 zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
205 {
206 	const dva_t *dva = BP_IDENTITY(bp);
207 	uint64_t txg = BP_PHYSICAL_BIRTH(bp);
208 
209 	ASSERT(BP_IS_GANG(bp));
210 
211 	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
212 }
213 
/*
 * Set the external verifier for a label block based on its offset.
 * The vdev is implicit, and the txg is unknowable at pool open time --
 * hence the logic in vdev_uberblock_load() to find the most recent copy.
 *
 * Only the first checksum word carries information (the offset); the
 * remaining three words are set to zero.
 */
static void
zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
{
	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
}
224 
225 /*
226  * Calls the template init function of a checksum which supports context
227  * templates and installs the template into the spa_t.
228  */
229 static void
230 zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
231 {
232 	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
233 
234 	if (ci->ci_tmpl_init == NULL)
235 		return;
236 
237 	if (spa->spa_cksum_tmpls[checksum] != NULL)
238 		return;
239 
240 	if (spa->spa_cksum_tmpls[checksum] == NULL) {
241 		spa->spa_cksum_tmpls[checksum] =
242 		    ci->ci_tmpl_init(&spa->spa_cksum_salt);
243 	}
244 }
245 
246 /*
247  * Called by a spa_t that's about to be deallocated. This steps through
248  * all of the checksum context templates and deallocates any that were
249  * initialized using the algorithm-specific template init function.
250  */
251 static void __unused
252 zio_checksum_templates_free(spa_t *spa)
253 {
254 	for (enum zio_checksum checksum = 0;
255 	    checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
256 		if (spa->spa_cksum_tmpls[checksum] != NULL) {
257 			zio_checksum_info_t *ci = &zio_checksum_table[checksum];
258 
259 			ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
260 			spa->spa_cksum_tmpls[checksum] = NULL;
261 		}
262 	}
263 }
264 
265 static int
266 zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
267 {
268 	uint64_t size;
269 	unsigned int checksum;
270 	zio_checksum_info_t *ci;
271 	void *ctx = NULL;
272 	zio_cksum_t actual_cksum, expected_cksum, verifier;
273 	int byteswap;
274 
275 	checksum = BP_GET_CHECKSUM(bp);
276 	size = BP_GET_PSIZE(bp);
277 
278 	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
279 		return (EINVAL);
280 	ci = &zio_checksum_table[checksum];
281 	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
282 		return (EINVAL);
283 
284 	if (spa != NULL) {
285 		zio_checksum_template_init(checksum, __DECONST(spa_t *,spa));
286 		ctx = spa->spa_cksum_tmpls[checksum];
287 	}
288 
289 	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
290 		zio_eck_t *eck;
291 
292 		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
293 		    checksum == ZIO_CHECKSUM_LABEL);
294 
295 		eck = (zio_eck_t *)((char *)data + size) - 1;
296 
297 		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
298 			zio_checksum_gang_verifier(&verifier, bp);
299 		else if (checksum == ZIO_CHECKSUM_LABEL)
300 			zio_checksum_label_verifier(&verifier,
301 			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
302 		else
303 			verifier = bp->blk_cksum;
304 
305 		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
306 
307 		if (byteswap)
308 			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
309 
310 		expected_cksum = eck->zec_cksum;
311 		eck->zec_cksum = verifier;
312 		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
313 		eck->zec_cksum = expected_cksum;
314 
315 		if (byteswap)
316 			byteswap_uint64_array(&expected_cksum,
317 			    sizeof (zio_cksum_t));
318 	} else {
319 		byteswap = BP_SHOULD_BYTESWAP(bp);
320 		expected_cksum = bp->blk_cksum;
321 		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
322 	}
323 
324 	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
325 		/*printf("ZFS: read checksum %s failed\n", ci->ci_name);*/
326 		return (EIO);
327 	}
328 
329 	return (0);
330 }
331 
332 static int
333 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
334 	void *dest, uint64_t destsize)
335 {
336 	zio_compress_info_t *ci;
337 
338 	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
339 		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
340 		return (EIO);
341 	}
342 
343 	ci = &zio_compress_table[cpfunc];
344 	if (!ci->ci_decompress) {
345 		printf("ZFS: unsupported compression algorithm %s\n",
346 		    ci->ci_name);
347 		return (EIO);
348 	}
349 
350 	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
351 }
352 
353 static uint64_t
354 zap_hash(uint64_t salt, const char *name)
355 {
356 	const uint8_t *cp;
357 	uint8_t c;
358 	uint64_t crc = salt;
359 
360 	ASSERT(crc != 0);
361 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
362 	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
363 		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
364 
365 	/*
366 	 * Only use 28 bits, since we need 4 bits in the cookie for the
367 	 * collision differentiator.  We MUST use the high bits, since
368 	 * those are the onces that we first pay attention to when
369 	 * chosing the bucket.
370 	 */
371 	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
372 
373 	return (crc);
374 }
375 
typedef struct raidz_col raidz_col_t;
typedef struct raidz_map raidz_map_t;

/*
 * One column of a RAID-Z I/O.
 */
struct raidz_col {
	/* child device index for I/O */
	uint64_t rc_devidx;
	/* device offset */
	uint64_t rc_offset;
	/* I/O size */
	uint64_t rc_size;
	/* I/O data */
	void *rc_data;
	/* I/O error for this device */
	int rc_error;
	/* Did we attempt this I/O column? */
	uint8_t rc_tried;
	/* Did we skip this I/O column? */
	uint8_t rc_skipped;
};

/*
 * Layout of a whole RAID-Z I/O: bookkeeping followed by the column array.
 * The parity columns come first; rm_firstdatacol is both the number of
 * parity columns and the index of the first data column.
 */
struct raidz_map {
	/* Regular column count */
	uint64_t rm_cols;
	/* Count including skipped columns */
	uint64_t rm_scols;
	/* Number of oversized columns */
	uint64_t rm_bigcols;
	/* Actual total I/O size */
	uint64_t rm_asize;
	/* Count of missing data devices */
	uint64_t rm_missingdata;
	/* Count of missing parity devices */
	uint64_t rm_missingparity;
	/* First data column/parity count */
	uint64_t rm_firstdatacol;
	/* Skipped sectors for padding */
	uint64_t rm_nskip;
	/* Column index of padding start */
	uint64_t rm_skipstart;
	/* # of referencing checksum reports */
	uintptr_t rm_reports;
	/* map no longer has referencing ZIO */
	uint8_t	rm_freed;
	/* checksum error was injected */
	uint8_t	rm_ecksuminjected;
	/* Flexible array of I/O columns (declared with one element) */
	raidz_col_t rm_col[1];
};
401 
/* Indices of the parity columns within raidz_map_t.rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Multiply a single byte value by 2 (or 4) in the Galois field: shift
 * left and, if the top bit was set, fold it back in by XORing with 0x1d.
 * The shift is not masked, so bits above the low byte may be set in the
 * result; truncate to 8 bits when a byte value is needed.
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 *
 * Both macros update x in place and use mask as scratch storage; x and
 * mask must be modifiable 64-bit lvalues and are evaluated more than
 * once.  They are wrapped in do { } while (0) so that they behave as a
 * single statement, e.g. in an unbraced if/else.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
do { \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
} while (0)

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
do { \
	VDEV_RAIDZ_64MUL_2((x), (mask)); \
	VDEV_RAIDZ_64MUL_2((x), (mask)); \
} while (0)
428 
/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 *
 * vdev_raidz_pow2[i] is 2 raised to the power i.  The sequence repeats
 * after 255 steps, so entry 255 equals entry 0 (0x01).
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/*
 * Logs of 2: vdev_raidz_log2[v] is the exponent e for which
 * vdev_raidz_pow2[e] == v.  Zero has no logarithm; entry 0 is a
 * placeholder, and vdev_raidz_exp2() returns early for a == 0 rather
 * than consulting it.
 */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
501 
502 /*
503  * Multiply a given number by 2 raised to the given power.
504  */
505 static uint8_t
506 vdev_raidz_exp2(uint8_t a, int exp)
507 {
508 	if (a == 0)
509 		return (0);
510 
511 	ASSERT(exp >= 0);
512 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
513 
514 	exp += vdev_raidz_log2[a];
515 	if (exp > 255)
516 		exp -= 255;
517 
518 	return (vdev_raidz_pow2[exp]);
519 }
520 
521 static void
522 vdev_raidz_generate_parity_p(raidz_map_t *rm)
523 {
524 	uint64_t *p, *src, pcount, ccount, i;
525 	int c;
526 
527 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
528 
529 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
530 		src = rm->rm_col[c].rc_data;
531 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
532 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
533 
534 		if (c == rm->rm_firstdatacol) {
535 			ASSERT(ccount == pcount);
536 			for (i = 0; i < ccount; i++, src++, p++) {
537 				*p = *src;
538 			}
539 		} else {
540 			ASSERT(ccount <= pcount);
541 			for (i = 0; i < ccount; i++, src++, p++) {
542 				*p ^= *src;
543 			}
544 		}
545 	}
546 }
547 
548 static void
549 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
550 {
551 	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
552 	int c;
553 
554 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
555 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
556 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
557 
558 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
559 		src = rm->rm_col[c].rc_data;
560 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
561 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
562 
563 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
564 
565 		if (c == rm->rm_firstdatacol) {
566 			ASSERT(ccnt == pcnt || ccnt == 0);
567 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
568 				*p = *src;
569 				*q = *src;
570 			}
571 			for (; i < pcnt; i++, src++, p++, q++) {
572 				*p = 0;
573 				*q = 0;
574 			}
575 		} else {
576 			ASSERT(ccnt <= pcnt);
577 
578 			/*
579 			 * Apply the algorithm described above by multiplying
580 			 * the previous result and adding in the new value.
581 			 */
582 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
583 				*p ^= *src;
584 
585 				VDEV_RAIDZ_64MUL_2(*q, mask);
586 				*q ^= *src;
587 			}
588 
589 			/*
590 			 * Treat short columns as though they are full of 0s.
591 			 * Note that there's therefore nothing needed for P.
592 			 */
593 			for (; i < pcnt; i++, q++) {
594 				VDEV_RAIDZ_64MUL_2(*q, mask);
595 			}
596 		}
597 	}
598 }
599 
600 static void
601 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
602 {
603 	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
604 	int c;
605 
606 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
607 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
608 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
609 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
610 	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
611 
612 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
613 		src = rm->rm_col[c].rc_data;
614 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
615 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
616 		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
617 
618 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
619 
620 		if (c == rm->rm_firstdatacol) {
621 			ASSERT(ccnt == pcnt || ccnt == 0);
622 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
623 				*p = *src;
624 				*q = *src;
625 				*r = *src;
626 			}
627 			for (; i < pcnt; i++, src++, p++, q++, r++) {
628 				*p = 0;
629 				*q = 0;
630 				*r = 0;
631 			}
632 		} else {
633 			ASSERT(ccnt <= pcnt);
634 
635 			/*
636 			 * Apply the algorithm described above by multiplying
637 			 * the previous result and adding in the new value.
638 			 */
639 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
640 				*p ^= *src;
641 
642 				VDEV_RAIDZ_64MUL_2(*q, mask);
643 				*q ^= *src;
644 
645 				VDEV_RAIDZ_64MUL_4(*r, mask);
646 				*r ^= *src;
647 			}
648 
649 			/*
650 			 * Treat short columns as though they are full of 0s.
651 			 * Note that there's therefore nothing needed for P.
652 			 */
653 			for (; i < pcnt; i++, q++, r++) {
654 				VDEV_RAIDZ_64MUL_2(*q, mask);
655 				VDEV_RAIDZ_64MUL_4(*r, mask);
656 			}
657 		}
658 	}
659 }
660 
661 /*
662  * Generate RAID parity in the first virtual columns according to the number of
663  * parity columns available.
664  */
665 static void
666 vdev_raidz_generate_parity(raidz_map_t *rm)
667 {
668 	switch (rm->rm_firstdatacol) {
669 	case 1:
670 		vdev_raidz_generate_parity_p(rm);
671 		break;
672 	case 2:
673 		vdev_raidz_generate_parity_pq(rm);
674 		break;
675 	case 3:
676 		vdev_raidz_generate_parity_pqr(rm);
677 		break;
678 	default:
679 		panic("invalid RAID-Z configuration");
680 	}
681 }
682 
683 /* BEGIN CSTYLED */
684 /*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
688  * vectors for the original data (D) and the actual data (d) and parity (p)
689  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
690  *
691  *            __   __                     __     __
692  *            |     |         __     __   |  p_0  |
693  *            |  V  |         |  D_0  |   | p_m-1 |
694  *            |     |    x    |   :   | = |  d_0  |
695  *            |  I  |         | D_n-1 |   |   :   |
696  *            |     |         ~~     ~~   | d_n-1 |
697  *            ~~   ~~                     ~~     ~~
698  *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity, speedy
 * computation, and linear separability.
703  *
704  *      __               __               __     __
705  *      |   1   ..  1 1 1 |               |  p_0  |
706  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
707  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
708  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
709  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
710  *      |   :       : : : |   |   :   |   |  d_2  |
711  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
712  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
713  *      |   0   ..  0 0 1 |               | d_n-1 |
714  *      ~~               ~~               ~~     ~~
715  *
716  * Note that I, V, d, and p are known. To compute D, we must invert the
717  * matrix and use the known data and parity values to reconstruct the unknown
718  * data values. We begin by removing the rows in V|I and d|p that correspond
719  * to failed or missing columns; we then make V|I square (n x n) and d|p
720  * sized n by removing rows corresponding to unused parity from the bottom up
721  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
722  * using Gauss-Jordan elimination. In the example below we use m=3 parity
723  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
724  *           __                               __
725  *           |  1   1   1   1   1   1   1   1  |
726  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
727  *           |  19 205 116  29  64  16  4   1  |      / /
728  *           |  1   0   0   0   0   0   0   0  |     / /
729  *           |  0   1   0   0   0   0   0   0  | <--' /
730  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
731  *           |  0   0   0   1   0   0   0   0  |
732  *           |  0   0   0   0   1   0   0   0  |
733  *           |  0   0   0   0   0   1   0   0  |
734  *           |  0   0   0   0   0   0   1   0  |
735  *           |  0   0   0   0   0   0   0   1  |
736  *           ~~                               ~~
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           |  19 205 116  29  64  16  4   1  |
 *           |  1   0   0   0   0   0   0   0  |
 *  (V|I)' = |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
750  *
751  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
752  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
753  * matrix is not singular.
754  * __                                                                 __
755  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
756  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
757  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
758  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
759  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
760  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
761  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
762  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
763  * ~~                                                                 ~~
764  * __                                                                 __
765  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
766  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
767  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
768  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
769  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
770  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
771  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
772  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
773  * ~~                                                                 ~~
774  * __                                                                 __
775  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
776  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
777  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
778  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
779  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
780  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
781  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
782  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
783  * ~~                                                                 ~~
784  * __                                                                 __
785  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
786  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
787  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
788  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
789  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
790  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
791  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
792  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
793  * ~~                                                                 ~~
794  * __                                                                 __
795  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
796  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
797  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
798  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
799  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
800  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
801  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
802  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
803  * ~~                                                                 ~~
804  * __                                                                 __
805  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
806  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
807  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
808  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
809  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
810  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
811  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
812  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
813  * ~~                                                                 ~~
814  *                   __                               __
815  *                   |  0   0   1   0   0   0   0   0  |
816  *                   | 167 100  5   41 159 169 217 208 |
817  *                   | 166 100  4   40 158 168 216 209 |
818  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
819  *                   |  0   0   0   0   1   0   0   0  |
820  *                   |  0   0   0   0   0   1   0   0  |
821  *                   |  0   0   0   0   0   0   1   0  |
822  *                   |  0   0   0   0   0   0   0   1  |
823  *                   ~~                               ~~
824  *
825  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
826  * of the missing data.
827  *
828  * As is apparent from the example above, the only non-trivial rows in the
829  * inverse matrix correspond to the data disks that we're trying to
830  * reconstruct. Indeed, those are the only rows we need as the others would
831  * only be useful for reconstructing data known or assumed to be valid. For
832  * that reason, we only build the coefficients in the rows that correspond to
833  * targeted columns.
834  */
835 /* END CSTYLED */
836 
837 static void
838 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
839     uint8_t **rows)
840 {
841 	int i, j;
842 	int pow;
843 
844 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
845 
846 	/*
847 	 * Fill in the missing rows of interest.
848 	 */
849 	for (i = 0; i < nmap; i++) {
850 		ASSERT3S(0, <=, map[i]);
851 		ASSERT3S(map[i], <=, 2);
852 
853 		pow = map[i] * n;
854 		if (pow > 255)
855 			pow -= 255;
856 		ASSERT(pow <= 255);
857 
858 		for (j = 0; j < n; j++) {
859 			pow -= map[i];
860 			if (pow < 0)
861 				pow += 255;
862 			rows[i][j] = vdev_raidz_pow2[pow];
863 		}
864 	}
865 }
866 
867 static void
868 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
869     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
870 {
871 	int i, j, ii, jj;
872 	uint8_t log;
873 
874 	/*
875 	 * Assert that the first nmissing entries from the array of used
876 	 * columns correspond to parity columns and that subsequent entries
877 	 * correspond to data columns.
878 	 */
879 	for (i = 0; i < nmissing; i++) {
880 		ASSERT3S(used[i], <, rm->rm_firstdatacol);
881 	}
882 	for (; i < n; i++) {
883 		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
884 	}
885 
886 	/*
887 	 * First initialize the storage where we'll compute the inverse rows.
888 	 */
889 	for (i = 0; i < nmissing; i++) {
890 		for (j = 0; j < n; j++) {
891 			invrows[i][j] = (i == j) ? 1 : 0;
892 		}
893 	}
894 
895 	/*
896 	 * Subtract all trivial rows from the rows of consequence.
897 	 */
898 	for (i = 0; i < nmissing; i++) {
899 		for (j = nmissing; j < n; j++) {
900 			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
901 			jj = used[j] - rm->rm_firstdatacol;
902 			ASSERT3S(jj, <, n);
903 			invrows[i][j] = rows[i][jj];
904 			rows[i][jj] = 0;
905 		}
906 	}
907 
908 	/*
909 	 * For each of the rows of interest, we must normalize it and subtract
910 	 * a multiple of it from the other rows.
911 	 */
912 	for (i = 0; i < nmissing; i++) {
913 		for (j = 0; j < missing[i]; j++) {
914 			ASSERT3U(rows[i][j], ==, 0);
915 		}
916 		ASSERT3U(rows[i][missing[i]], !=, 0);
917 
918 		/*
919 		 * Compute the inverse of the first element and multiply each
920 		 * element in the row by that value.
921 		 */
922 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
923 
924 		for (j = 0; j < n; j++) {
925 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
926 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
927 		}
928 
929 		for (ii = 0; ii < nmissing; ii++) {
930 			if (i == ii)
931 				continue;
932 
933 			ASSERT3U(rows[ii][missing[i]], !=, 0);
934 
935 			log = vdev_raidz_log2[rows[ii][missing[i]]];
936 
937 			for (j = 0; j < n; j++) {
938 				rows[ii][j] ^=
939 				    vdev_raidz_exp2(rows[i][j], log);
940 				invrows[ii][j] ^=
941 				    vdev_raidz_exp2(invrows[i][j], log);
942 			}
943 		}
944 	}
945 
946 	/*
947 	 * Verify that the data that is left in the rows are properly part of
948 	 * an identity matrix.
949 	 */
950 	for (i = 0; i < nmissing; i++) {
951 		for (j = 0; j < n; j++) {
952 			if (j == missing[i]) {
953 				ASSERT3U(rows[i][j], ==, 1);
954 			} else {
955 				ASSERT3U(rows[i][j], ==, 0);
956 			}
957 		}
958 	}
959 }
960 
961 static void
962 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
963     int *missing, uint8_t **invrows, const uint8_t *used)
964 {
965 	int i, j, x, cc, c;
966 	uint8_t *src;
967 	uint64_t ccount;
968 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
969 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
970 	uint8_t log, val;
971 	int ll;
972 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
973 	uint8_t *p, *pp;
974 	size_t psize;
975 
976 	log = 0;	/* gcc */
977 	psize = sizeof (invlog[0][0]) * n * nmissing;
978 	p = malloc(psize);
979 	if (p == NULL) {
980 		printf("Out of memory\n");
981 		return;
982 	}
983 
984 	for (pp = p, i = 0; i < nmissing; i++) {
985 		invlog[i] = pp;
986 		pp += n;
987 	}
988 
989 	for (i = 0; i < nmissing; i++) {
990 		for (j = 0; j < n; j++) {
991 			ASSERT3U(invrows[i][j], !=, 0);
992 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
993 		}
994 	}
995 
996 	for (i = 0; i < n; i++) {
997 		c = used[i];
998 		ASSERT3U(c, <, rm->rm_cols);
999 
1000 		src = rm->rm_col[c].rc_data;
1001 		ccount = rm->rm_col[c].rc_size;
1002 		for (j = 0; j < nmissing; j++) {
1003 			cc = missing[j] + rm->rm_firstdatacol;
1004 			ASSERT3U(cc, >=, rm->rm_firstdatacol);
1005 			ASSERT3U(cc, <, rm->rm_cols);
1006 			ASSERT3U(cc, !=, c);
1007 
1008 			dst[j] = rm->rm_col[cc].rc_data;
1009 			dcount[j] = rm->rm_col[cc].rc_size;
1010 		}
1011 
1012 		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1013 
1014 		for (x = 0; x < ccount; x++, src++) {
1015 			if (*src != 0)
1016 				log = vdev_raidz_log2[*src];
1017 
1018 			for (cc = 0; cc < nmissing; cc++) {
1019 				if (x >= dcount[cc])
1020 					continue;
1021 
1022 				if (*src == 0) {
1023 					val = 0;
1024 				} else {
1025 					if ((ll = log + invlog[cc][i]) >= 255)
1026 						ll -= 255;
1027 					val = vdev_raidz_pow2[ll];
1028 				}
1029 
1030 				if (i == 0)
1031 					dst[cc][x] = val;
1032 				else
1033 					dst[cc][x] ^= val;
1034 			}
1035 		}
1036 	}
1037 
1038 	free(p);
1039 }
1040 
1041 static int
1042 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1043 {
1044 	int n, i, c, t, tt;
1045 	int nmissing_rows;
1046 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1047 	int parity_map[VDEV_RAIDZ_MAXPARITY];
1048 
1049 	uint8_t *p, *pp;
1050 	size_t psize;
1051 
1052 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1053 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1054 	uint8_t *used;
1055 
1056 	int code = 0;
1057 
1058 
1059 	n = rm->rm_cols - rm->rm_firstdatacol;
1060 
1061 	/*
1062 	 * Figure out which data columns are missing.
1063 	 */
1064 	nmissing_rows = 0;
1065 	for (t = 0; t < ntgts; t++) {
1066 		if (tgts[t] >= rm->rm_firstdatacol) {
1067 			missing_rows[nmissing_rows++] =
1068 			    tgts[t] - rm->rm_firstdatacol;
1069 		}
1070 	}
1071 
1072 	/*
1073 	 * Figure out which parity columns to use to help generate the missing
1074 	 * data columns.
1075 	 */
1076 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1077 		ASSERT(tt < ntgts);
1078 		ASSERT(c < rm->rm_firstdatacol);
1079 
1080 		/*
1081 		 * Skip any targeted parity columns.
1082 		 */
1083 		if (c == tgts[tt]) {
1084 			tt++;
1085 			continue;
1086 		}
1087 
1088 		code |= 1 << c;
1089 
1090 		parity_map[i] = c;
1091 		i++;
1092 	}
1093 
1094 	ASSERT(code != 0);
1095 	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1096 
1097 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1098 	    nmissing_rows * n + sizeof (used[0]) * n;
1099 	p = malloc(psize);
1100 	if (p == NULL) {
1101 		printf("Out of memory\n");
1102 		return (code);
1103 	}
1104 
1105 	for (pp = p, i = 0; i < nmissing_rows; i++) {
1106 		rows[i] = pp;
1107 		pp += n;
1108 		invrows[i] = pp;
1109 		pp += n;
1110 	}
1111 	used = pp;
1112 
1113 	for (i = 0; i < nmissing_rows; i++) {
1114 		used[i] = parity_map[i];
1115 	}
1116 
1117 	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1118 		if (tt < nmissing_rows &&
1119 		    c == missing_rows[tt] + rm->rm_firstdatacol) {
1120 			tt++;
1121 			continue;
1122 		}
1123 
1124 		ASSERT3S(i, <, n);
1125 		used[i] = c;
1126 		i++;
1127 	}
1128 
1129 	/*
1130 	 * Initialize the interesting rows of the matrix.
1131 	 */
1132 	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1133 
1134 	/*
1135 	 * Invert the matrix.
1136 	 */
1137 	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1138 	    invrows, used);
1139 
1140 	/*
1141 	 * Reconstruct the missing data using the generated matrix.
1142 	 */
1143 	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1144 	    invrows, used);
1145 
1146 	free(p);
1147 
1148 	return (code);
1149 }
1150 
1151 static int
1152 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1153 {
1154 	int tgts[VDEV_RAIDZ_MAXPARITY];
1155 	int ntgts;
1156 	int i, c;
1157 	int code;
1158 	int nbadparity, nbaddata;
1159 
1160 	/*
1161 	 * The tgts list must already be sorted.
1162 	 */
1163 	for (i = 1; i < nt; i++) {
1164 		ASSERT(t[i] > t[i - 1]);
1165 	}
1166 
1167 	nbadparity = rm->rm_firstdatacol;
1168 	nbaddata = rm->rm_cols - nbadparity;
1169 	ntgts = 0;
1170 	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1171 		if (i < nt && c == t[i]) {
1172 			tgts[ntgts++] = c;
1173 			i++;
1174 		} else if (rm->rm_col[c].rc_error != 0) {
1175 			tgts[ntgts++] = c;
1176 		} else if (c >= rm->rm_firstdatacol) {
1177 			nbaddata--;
1178 		} else {
1179 			nbadparity--;
1180 		}
1181 	}
1182 
1183 	ASSERT(ntgts >= nt);
1184 	ASSERT(nbaddata >= 0);
1185 	ASSERT(nbaddata + nbadparity == ntgts);
1186 
1187 	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1188 	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1189 	ASSERT(code > 0);
1190 	return (code);
1191 }
1192 
1193 static raidz_map_t *
1194 vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
1195     uint64_t dcols, uint64_t nparity)
1196 {
1197 	raidz_map_t *rm;
1198 	uint64_t b = offset >> unit_shift;
1199 	uint64_t s = size >> unit_shift;
1200 	uint64_t f = b % dcols;
1201 	uint64_t o = (b / dcols) << unit_shift;
1202 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
1203 
1204 	q = s / (dcols - nparity);
1205 	r = s - q * (dcols - nparity);
1206 	bc = (r == 0 ? 0 : r + nparity);
1207 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
1208 
1209 	if (q == 0) {
1210 		acols = bc;
1211 		scols = MIN(dcols, roundup(bc, nparity + 1));
1212 	} else {
1213 		acols = dcols;
1214 		scols = dcols;
1215 	}
1216 
1217 	ASSERT3U(acols, <=, scols);
1218 
1219 	rm = malloc(offsetof(raidz_map_t, rm_col[scols]));
1220 	if (rm == NULL)
1221 		return (rm);
1222 
1223 	rm->rm_cols = acols;
1224 	rm->rm_scols = scols;
1225 	rm->rm_bigcols = bc;
1226 	rm->rm_skipstart = bc;
1227 	rm->rm_missingdata = 0;
1228 	rm->rm_missingparity = 0;
1229 	rm->rm_firstdatacol = nparity;
1230 	rm->rm_reports = 0;
1231 	rm->rm_freed = 0;
1232 	rm->rm_ecksuminjected = 0;
1233 
1234 	asize = 0;
1235 
1236 	for (c = 0; c < scols; c++) {
1237 		col = f + c;
1238 		coff = o;
1239 		if (col >= dcols) {
1240 			col -= dcols;
1241 			coff += 1ULL << unit_shift;
1242 		}
1243 		rm->rm_col[c].rc_devidx = col;
1244 		rm->rm_col[c].rc_offset = coff;
1245 		rm->rm_col[c].rc_data = NULL;
1246 		rm->rm_col[c].rc_error = 0;
1247 		rm->rm_col[c].rc_tried = 0;
1248 		rm->rm_col[c].rc_skipped = 0;
1249 
1250 		if (c >= acols)
1251 			rm->rm_col[c].rc_size = 0;
1252 		else if (c < bc)
1253 			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
1254 		else
1255 			rm->rm_col[c].rc_size = q << unit_shift;
1256 
1257 		asize += rm->rm_col[c].rc_size;
1258 	}
1259 
1260 	ASSERT3U(asize, ==, tot << unit_shift);
1261 	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
1262 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
1263 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
1264 	ASSERT3U(rm->rm_nskip, <=, nparity);
1265 
1266 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1267 		rm->rm_col[c].rc_data = malloc(rm->rm_col[c].rc_size);
1268 		if (rm->rm_col[c].rc_data == NULL) {
1269 			c++;
1270 			while (c != 0)
1271 				free(rm->rm_col[--c].rc_data);
1272 			free(rm);
1273 			return (NULL);
1274 		}
1275 	}
1276 
1277 	rm->rm_col[c].rc_data = data;
1278 
1279 	for (c = c + 1; c < acols; c++)
1280 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
1281 		    rm->rm_col[c - 1].rc_size;
1282 
1283 	/*
1284 	 * If all data stored spans all columns, there's a danger that parity
1285 	 * will always be on the same device and, since parity isn't read
1286 	 * during normal operation, that that device's I/O bandwidth won't be
1287 	 * used effectively. We therefore switch the parity every 1MB.
1288 	 *
1289 	 * ... at least that was, ostensibly, the theory. As a practical
1290 	 * matter unless we juggle the parity between all devices evenly, we
1291 	 * won't see any benefit. Further, occasional writes that aren't a
1292 	 * multiple of the LCM of the number of children and the minimum
1293 	 * stripe width are sufficient to avoid pessimal behavior.
1294 	 * Unfortunately, this decision created an implicit on-disk format
1295 	 * requirement that we need to support for all eternity, but only
1296 	 * for single-parity RAID-Z.
1297 	 *
1298 	 * If we intend to skip a sector in the zeroth column for padding
1299 	 * we must make sure to note this swap. We will never intend to
1300 	 * skip the first column since at least one data and one parity
1301 	 * column must appear in each row.
1302 	 */
1303 	ASSERT(rm->rm_cols >= 2);
1304 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
1305 
1306 	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
1307 		devidx = rm->rm_col[0].rc_devidx;
1308 		o = rm->rm_col[0].rc_offset;
1309 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
1310 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
1311 		rm->rm_col[1].rc_devidx = devidx;
1312 		rm->rm_col[1].rc_offset = o;
1313 
1314 		if (rm->rm_skipstart == 0)
1315 			rm->rm_skipstart = 1;
1316 	}
1317 
1318 	return (rm);
1319 }
1320 
1321 static void
1322 vdev_raidz_map_free(raidz_map_t *rm)
1323 {
1324 	int c;
1325 
1326 	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
1327 		free(rm->rm_col[c].rc_data);
1328 
1329 	free(rm);
1330 }
1331 
1332 static vdev_t *
1333 vdev_child(vdev_t *pvd, uint64_t devidx)
1334 {
1335 	vdev_t *cvd;
1336 
1337 	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
1338 		if (cvd->v_id == devidx)
1339 			break;
1340 	}
1341 
1342 	return (cvd);
1343 }
1344 
1345 /*
1346  * We keep track of whether or not there were any injected errors, so that
1347  * any ereports we generate can note it.
1348  */
1349 static int
1350 raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
1351     uint64_t size)
1352 {
1353 	return (zio_checksum_verify(spa, bp, data));
1354 }
1355 
1356 /*
1357  * Generate the parity from the data columns. If we tried and were able to
1358  * read the parity without error, verify that the generated parity matches the
1359  * data we read. If it doesn't, we fire off a checksum error. Return the
1360  * number such failures.
1361  */
1362 static int
1363 raidz_parity_verify(raidz_map_t *rm)
1364 {
1365 	void *orig[VDEV_RAIDZ_MAXPARITY];
1366 	int c, ret = 0;
1367 	raidz_col_t *rc;
1368 
1369 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1370 		rc = &rm->rm_col[c];
1371 		if (!rc->rc_tried || rc->rc_error != 0)
1372 			continue;
1373 		orig[c] = malloc(rc->rc_size);
1374 		if (orig[c] != NULL) {
1375 			bcopy(rc->rc_data, orig[c], rc->rc_size);
1376 		} else {
1377 			printf("Out of memory\n");
1378 		}
1379 	}
1380 
1381 	vdev_raidz_generate_parity(rm);
1382 
1383 	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
1384 		rc = &rm->rm_col[c];
1385 		if (!rc->rc_tried || rc->rc_error != 0)
1386 			continue;
1387 		if (orig[c] == NULL ||
1388 		    bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1389 			rc->rc_error = ECKSUM;
1390 			ret++;
1391 		}
1392 		free(orig[c]);
1393 	}
1394 
1395 	return (ret);
1396 }
1397 
1398 /*
1399  * Iterate over all combinations of bad data and attempt a reconstruction.
1400  * Note that the algorithm below is non-optimal because it doesn't take into
1401  * account how reconstruction is actually performed. For example, with
1402  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1403  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1404  * cases we'd only use parity information in column 0.
1405  */
1406 static int
1407 vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
1408     void *data, off_t offset, uint64_t bytes, int total_errors, int data_errors)
1409 {
1410 	raidz_col_t *rc;
1411 	void *orig[VDEV_RAIDZ_MAXPARITY];
1412 	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1413 	int *tgts = &tstore[1];
1414 	int current, next, i, c, n;
1415 	int code, ret = 0;
1416 
1417 	ASSERT(total_errors < rm->rm_firstdatacol);
1418 
1419 	/*
1420 	 * This simplifies one edge condition.
1421 	 */
1422 	tgts[-1] = -1;
1423 
1424 	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1425 		/*
1426 		 * Initialize the targets array by finding the first n columns
1427 		 * that contain no error.
1428 		 *
1429 		 * If there were no data errors, we need to ensure that we're
1430 		 * always explicitly attempting to reconstruct at least one
1431 		 * data column. To do this, we simply push the highest target
1432 		 * up into the data columns.
1433 		 */
1434 		for (c = 0, i = 0; i < n; i++) {
1435 			if (i == n - 1 && data_errors == 0 &&
1436 			    c < rm->rm_firstdatacol) {
1437 				c = rm->rm_firstdatacol;
1438 			}
1439 
1440 			while (rm->rm_col[c].rc_error != 0) {
1441 				c++;
1442 				ASSERT3S(c, <, rm->rm_cols);
1443 			}
1444 
1445 			tgts[i] = c++;
1446 		}
1447 
1448 		/*
1449 		 * Setting tgts[n] simplifies the other edge condition.
1450 		 */
1451 		tgts[n] = rm->rm_cols;
1452 
1453 		/*
1454 		 * These buffers were allocated in previous iterations.
1455 		 */
1456 		for (i = 0; i < n - 1; i++) {
1457 			ASSERT(orig[i] != NULL);
1458 		}
1459 
1460 		orig[n - 1] = malloc(rm->rm_col[0].rc_size);
1461 		if (orig[n - 1] == NULL) {
1462 			ret = ENOMEM;
1463 			goto done;
1464 		}
1465 
1466 		current = 0;
1467 		next = tgts[current];
1468 
1469 		while (current != n) {
1470 			tgts[current] = next;
1471 			current = 0;
1472 
1473 			/*
1474 			 * Save off the original data that we're going to
1475 			 * attempt to reconstruct.
1476 			 */
1477 			for (i = 0; i < n; i++) {
1478 				ASSERT(orig[i] != NULL);
1479 				c = tgts[i];
1480 				ASSERT3S(c, >=, 0);
1481 				ASSERT3S(c, <, rm->rm_cols);
1482 				rc = &rm->rm_col[c];
1483 				bcopy(rc->rc_data, orig[i], rc->rc_size);
1484 			}
1485 
1486 			/*
1487 			 * Attempt a reconstruction and exit the outer loop on
1488 			 * success.
1489 			 */
1490 			code = vdev_raidz_reconstruct(rm, tgts, n);
1491 			if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
1492 				for (i = 0; i < n; i++) {
1493 					c = tgts[i];
1494 					rc = &rm->rm_col[c];
1495 					ASSERT(rc->rc_error == 0);
1496 					rc->rc_error = ECKSUM;
1497 				}
1498 
1499 				ret = code;
1500 				goto done;
1501 			}
1502 
1503 			/*
1504 			 * Restore the original data.
1505 			 */
1506 			for (i = 0; i < n; i++) {
1507 				c = tgts[i];
1508 				rc = &rm->rm_col[c];
1509 				bcopy(orig[i], rc->rc_data, rc->rc_size);
1510 			}
1511 
1512 			do {
1513 				/*
1514 				 * Find the next valid column after the current
1515 				 * position..
1516 				 */
1517 				for (next = tgts[current] + 1;
1518 				    next < rm->rm_cols &&
1519 				    rm->rm_col[next].rc_error != 0; next++)
1520 					continue;
1521 
1522 				ASSERT(next <= tgts[current + 1]);
1523 
1524 				/*
1525 				 * If that spot is available, we're done here.
1526 				 */
1527 				if (next != tgts[current + 1])
1528 					break;
1529 
1530 				/*
1531 				 * Otherwise, find the next valid column after
1532 				 * the previous position.
1533 				 */
1534 				for (c = tgts[current - 1] + 1;
1535 				    rm->rm_col[c].rc_error != 0; c++)
1536 					continue;
1537 
1538 				tgts[current] = c;
1539 				current++;
1540 
1541 			} while (current != n);
1542 		}
1543 	}
1544 	n--;
1545 done:
1546 	for (i = n - 1; i >= 0; i--) {
1547 		free(orig[i]);
1548 	}
1549 
1550 	return (ret);
1551 }
1552 
1553 static int
1554 vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
1555     off_t offset, size_t bytes)
1556 {
1557 	vdev_t *tvd = vd->v_top;
1558 	vdev_t *cvd;
1559 	raidz_map_t *rm;
1560 	raidz_col_t *rc;
1561 	int c, error;
1562 	int unexpected_errors;
1563 	int parity_errors;
1564 	int parity_untried;
1565 	int data_errors;
1566 	int total_errors;
1567 	int n;
1568 	int tgts[VDEV_RAIDZ_MAXPARITY];
1569 	int code;
1570 
1571 	rc = NULL;	/* gcc */
1572 	error = 0;
1573 
1574 	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
1575 	    vd->v_nchildren, vd->v_nparity);
1576 	if (rm == NULL)
1577 		return (ENOMEM);
1578 
1579 	/*
1580 	 * Iterate over the columns in reverse order so that we hit the parity
1581 	 * last -- any errors along the way will force us to read the parity.
1582 	 */
1583 	for (c = rm->rm_cols - 1; c >= 0; c--) {
1584 		rc = &rm->rm_col[c];
1585 		cvd = vdev_child(vd, rc->rc_devidx);
1586 		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
1587 			if (c >= rm->rm_firstdatacol)
1588 				rm->rm_missingdata++;
1589 			else
1590 				rm->rm_missingparity++;
1591 			rc->rc_error = ENXIO;
1592 			rc->rc_tried = 1;	/* don't even try */
1593 			rc->rc_skipped = 1;
1594 			continue;
1595 		}
1596 #if 0		/* XXX: Too hard for the boot code. */
1597 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1598 			if (c >= rm->rm_firstdatacol)
1599 				rm->rm_missingdata++;
1600 			else
1601 				rm->rm_missingparity++;
1602 			rc->rc_error = ESTALE;
1603 			rc->rc_skipped = 1;
1604 			continue;
1605 		}
1606 #endif
1607 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
1608 			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
1609 			    rc->rc_offset, rc->rc_size);
1610 			rc->rc_tried = 1;
1611 			rc->rc_skipped = 0;
1612 		}
1613 	}
1614 
1615 reconstruct:
1616 	unexpected_errors = 0;
1617 	parity_errors = 0;
1618 	parity_untried = 0;
1619 	data_errors = 0;
1620 	total_errors = 0;
1621 
1622 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1623 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1624 
1625 	for (c = 0; c < rm->rm_cols; c++) {
1626 		rc = &rm->rm_col[c];
1627 
1628 		if (rc->rc_error) {
1629 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
1630 
1631 			if (c < rm->rm_firstdatacol)
1632 				parity_errors++;
1633 			else
1634 				data_errors++;
1635 
1636 			if (!rc->rc_skipped)
1637 				unexpected_errors++;
1638 
1639 			total_errors++;
1640 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1641 			parity_untried++;
1642 		}
1643 	}
1644 
1645 	/*
1646 	 * There are three potential phases for a read:
1647 	 *	1. produce valid data from the columns read
1648 	 *	2. read all disks and try again
1649 	 *	3. perform combinatorial reconstruction
1650 	 *
1651 	 * Each phase is progressively both more expensive and less likely to
1652 	 * occur. If we encounter more errors than we can repair or all phases
1653 	 * fail, we have no choice but to return an error.
1654 	 */
1655 
1656 	/*
1657 	 * If the number of errors we saw was correctable -- less than or equal
1658 	 * to the number of parity disks read -- attempt to produce data that
1659 	 * has a valid checksum. Naturally, this case applies in the absence of
1660 	 * any errors.
1661 	 */
1662 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1663 		int rv;
1664 
1665 		if (data_errors == 0) {
1666 			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
1667 			if (rv == 0) {
1668 				/*
1669 				 * If we read parity information (unnecessarily
1670 				 * as it happens since no reconstruction was
1671 				 * needed) regenerate and verify the parity.
1672 				 * We also regenerate parity when resilvering
1673 				 * so we can write it out to the failed device
1674 				 * later.
1675 				 */
1676 				if (parity_errors + parity_untried <
1677 				    rm->rm_firstdatacol) {
1678 					n = raidz_parity_verify(rm);
1679 					unexpected_errors += n;
1680 					ASSERT(parity_errors + n <=
1681 					    rm->rm_firstdatacol);
1682 				}
1683 				goto done;
1684 			}
1685 		} else {
1686 			/*
1687 			 * We either attempt to read all the parity columns or
1688 			 * none of them. If we didn't try to read parity, we
1689 			 * wouldn't be here in the correctable case. There must
1690 			 * also have been fewer parity errors than parity
1691 			 * columns or, again, we wouldn't be in this code path.
1692 			 */
1693 			ASSERT(parity_untried == 0);
1694 			ASSERT(parity_errors < rm->rm_firstdatacol);
1695 
1696 			/*
1697 			 * Identify the data columns that reported an error.
1698 			 */
1699 			n = 0;
1700 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1701 				rc = &rm->rm_col[c];
1702 				if (rc->rc_error != 0) {
1703 					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1704 					tgts[n++] = c;
1705 				}
1706 			}
1707 
1708 			ASSERT(rm->rm_firstdatacol >= n);
1709 
1710 			code = vdev_raidz_reconstruct(rm, tgts, n);
1711 
1712 			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
1713 			if (rv == 0) {
1714 				/*
1715 				 * If we read more parity disks than were used
1716 				 * for reconstruction, confirm that the other
1717 				 * parity disks produced correct data. This
1718 				 * routine is suboptimal in that it regenerates
1719 				 * the parity that we already used in addition
1720 				 * to the parity that we're attempting to
1721 				 * verify, but this should be a relatively
1722 				 * uncommon case, and can be optimized if it
1723 				 * becomes a problem. Note that we regenerate
1724 				 * parity when resilvering so we can write it
1725 				 * out to failed devices later.
1726 				 */
1727 				if (parity_errors < rm->rm_firstdatacol - n) {
1728 					n = raidz_parity_verify(rm);
1729 					unexpected_errors += n;
1730 					ASSERT(parity_errors + n <=
1731 					    rm->rm_firstdatacol);
1732 				}
1733 
1734 				goto done;
1735 			}
1736 		}
1737 	}
1738 
1739 	/*
1740 	 * This isn't a typical situation -- either we got a read
1741 	 * error or a child silently returned bad data. Read every
1742 	 * block so we can try again with as much data and parity as
1743 	 * we can track down. If we've already been through once
1744 	 * before, all children will be marked as tried so we'll
1745 	 * proceed to combinatorial reconstruction.
1746 	 */
1747 	unexpected_errors = 1;
1748 	rm->rm_missingdata = 0;
1749 	rm->rm_missingparity = 0;
1750 
1751 	n = 0;
1752 	for (c = 0; c < rm->rm_cols; c++) {
1753 		rc = &rm->rm_col[c];
1754 
1755 		if (rc->rc_tried)
1756 			continue;
1757 
1758 		cvd = vdev_child(vd, rc->rc_devidx);
1759 		ASSERT(cvd != NULL);
1760 		rc->rc_error = cvd->v_read(cvd, NULL,
1761 		    rc->rc_data, rc->rc_offset, rc->rc_size);
1762 		if (rc->rc_error == 0)
1763 			n++;
1764 		rc->rc_tried = 1;
1765 		rc->rc_skipped = 0;
1766 	}
1767 	/*
1768 	 * If we managed to read anything more, retry the
1769 	 * reconstruction.
1770 	 */
1771 	if (n > 0)
1772 		goto reconstruct;
1773 
1774 	/*
1775 	 * At this point we've attempted to reconstruct the data given the
1776 	 * errors we detected, and we've attempted to read all columns. There
1777 	 * must, therefore, be one or more additional problems -- silent errors
1778 	 * resulting in invalid data rather than explicit I/O errors resulting
1779 	 * in absent data. We check if there is enough additional data to
1780 	 * possibly reconstruct the data and then perform combinatorial
1781 	 * reconstruction over all possible combinations. If that fails,
1782 	 * we're cooked.
1783 	 */
1784 	if (total_errors > rm->rm_firstdatacol) {
1785 		error = EIO;
1786 	} else if (total_errors < rm->rm_firstdatacol &&
1787 	    (code = vdev_raidz_combrec(vd->v_spa, rm, bp, data, offset, bytes,
1788 	     total_errors, data_errors)) != 0) {
1789 		/*
1790 		 * If we didn't use all the available parity for the
1791 		 * combinatorial reconstruction, verify that the remaining
1792 		 * parity is correct.
1793 		 */
1794 		if (code != (1 << rm->rm_firstdatacol) - 1)
1795 			(void) raidz_parity_verify(rm);
1796 	} else {
1797 		/*
1798 		 * We're here because either:
1799 		 *
1800 		 *	total_errors == rm_first_datacol, or
1801 		 *	vdev_raidz_combrec() failed
1802 		 *
1803 		 * In either case, there is enough bad data to prevent
1804 		 * reconstruction.
1805 		 *
1806 		 * Start checksum ereports for all children which haven't
1807 		 * failed, and the IO wasn't speculative.
1808 		 */
1809 		error = ECKSUM;
1810 	}
1811 
1812 done:
1813 	vdev_raidz_map_free(rm);
1814 
1815 	return (error);
1816 }
1817