xref: /freebsd/sys/cddl/boot/zfs/zfssubr.c (revision 8b238f4126d32df3e70056bc32536b7248ebffa0)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <lz4.h>

static uint64_t zfs_crc64_table[256];

#define	ECKSUM	666

#define	ASSERT3S(x, y, z)	((void)0)
#define	ASSERT3U(x, y, z)	((void)0)
#define	ASSERT3P(x, y, z)	((void)0)
#define	ASSERT0(x)		((void)0)
#define	ASSERT(x)		((void)0)

#define	panic(...)	do {						\
	printf(__VA_ARGS__);						\
	for (;;) ;							\
} while (0)

#define	kmem_alloc(size, flag)	zfs_alloc((size))
#define	kmem_free(ptr, size)	zfs_free((ptr), (size))

static void
zfs_init_crc(void)
{
	int i, j;
	uint64_t *ct;

	/*
	 * Calculate the crc64 table (used for the zap hash
	 * function).
	 */
	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
		for (i = 0; i < 256; i++)
			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
	}
}

static void
zio_checksum_off(const void *buf, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}

/*
 * Signature for checksum functions.
 */
typedef void zio_checksum_t(const void *data, uint64_t size,
    const void *ctx_template, zio_cksum_t *zcp);
typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
typedef void zio_checksum_tmpl_free_t(void *ctx_template);

typedef enum zio_checksum_flags {
	/* Strong enough for metadata? */
	ZCHECKSUM_FLAG_METADATA = (1 << 1),
	/* ZIO embedded checksum */
	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
	/* Strong enough for dedup (without verification)? */
	ZCHECKSUM_FLAG_DEDUP = (1 << 3),
	/* Uses salt value */
	ZCHECKSUM_FLAG_SALTED = (1 << 4),
	/* Strong enough for nopwrite? */
	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
} zio_checksum_flags_t;

/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
	/* checksum function for each byteorder */
	zio_checksum_t			*ci_func[2];
	zio_checksum_tmpl_init_t	*ci_tmpl_init;
	zio_checksum_tmpl_free_t	*ci_tmpl_free;
	zio_checksum_flags_t		ci_flags;
	const char			*ci_name;	/* descriptive name */
} zio_checksum_info_t;

#include "blkptr.c"

#include "fletcher.c"
#include "sha256.c"
#include "skein_zfs.c"

static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
	{{NULL, NULL}, NULL, NULL, 0, "on"},
	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL, 0, "off"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
	    0, "fletcher2"},
	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA, "fletcher4"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
	    ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL,
	    0, "noparity"},
	{{zio_checksum_SHA512_native,	zio_checksum_SHA512_byteswap},
	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
	{{zio_checksum_skein_native, zio_checksum_skein_byteswap},
	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
	/* no edonr for now */
	{{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA |
	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"}
};
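
/*
 * Illustrative sketch (added commentary, not in the original source): the
 * table above is indexed by the on-disk checksum enum, so a caller wanting
 * the native-byteorder function for a block pointer might do:
 *
 *	zio_checksum_info_t *ci = &zio_checksum_table[BP_GET_CHECKSUM(bp)];
 *	if (ci->ci_func[0] != NULL)
 *		ci->ci_func[0](data, size, NULL, &cksum);
 */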

/*
 * Common signature for all zio compress/decompress functions.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
	zio_compress_func_t	*ci_compress;	/* compression function */
	zio_decompress_func_t	*ci_decompress;	/* decompression function */
	int			ci_level;	/* level parameter */
	const char		*ci_name;	/* algorithm name */
} zio_compress_info_t;

#include "lzjb.c"
#include "zle.c"

/*
 * Compression vectors.
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
	{NULL,			NULL,			0,	"inherit"},
	{NULL,			NULL,			0,	"on"},
	{NULL,			NULL,			0,	"uncompressed"},
	{NULL,			lzjb_decompress,	0,	"lzjb"},
	{NULL,			NULL,			0,	"empty"},
	{NULL,			NULL,			1,	"gzip-1"},
	{NULL,			NULL,			2,	"gzip-2"},
	{NULL,			NULL,			3,	"gzip-3"},
	{NULL,			NULL,			4,	"gzip-4"},
	{NULL,			NULL,			5,	"gzip-5"},
	{NULL,			NULL,			6,	"gzip-6"},
	{NULL,			NULL,			7,	"gzip-7"},
	{NULL,			NULL,			8,	"gzip-8"},
	{NULL,			NULL,			9,	"gzip-9"},
	{NULL,			zle_decompress,		64,	"zle"},
	{NULL,			lz4_decompress,		0,	"lz4"},
};

static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	int i;

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}

/*
 * Set the external verifier for a gang block based on <vdev, offset, txg>,
 * a tuple which is guaranteed to be unique for the life of the pool.
 */
static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
{
	const dva_t *dva = BP_IDENTITY(bp);
	uint64_t txg = BP_PHYSICAL_BIRTH(bp);

	ASSERT(BP_IS_GANG(bp));

	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
}

/*
 * Set the external verifier for a label block based on its offset.
 * The vdev is implicit, and the txg is unknowable at pool open time --
 * hence the logic in vdev_uberblock_load() to find the most recent copy.
 */
static void
zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
{
	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
}

/*
 * Calls the template init function of a checksum which supports context
 * templates and installs the template into the spa_t.
 */
static void
zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
{
	zio_checksum_info_t *ci = &zio_checksum_table[checksum];

	if (ci->ci_tmpl_init == NULL)
		return;

	if (spa->spa_cksum_tmpls[checksum] == NULL) {
		spa->spa_cksum_tmpls[checksum] =
		    ci->ci_tmpl_init(&spa->spa_cksum_salt);
	}
}

/*
 * Called by a spa_t that's about to be deallocated. This steps through
 * all of the checksum context templates and deallocates any that were
 * initialized using the algorithm-specific template init function.
 */
static void __unused
zio_checksum_templates_free(spa_t *spa)
{
	for (enum zio_checksum checksum = 0;
	    checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
		if (spa->spa_cksum_tmpls[checksum] != NULL) {
			zio_checksum_info_t *ci = &zio_checksum_table[checksum];

			ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
			spa->spa_cksum_tmpls[checksum] = NULL;
		}
	}
}

static int
zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
{
	uint64_t size;
	unsigned int checksum;
	zio_checksum_info_t *ci;
	void *ctx = NULL;
	zio_cksum_t actual_cksum, expected_cksum, verifier;
	int byteswap;

	checksum = BP_GET_CHECKSUM(bp);
	size = BP_GET_PSIZE(bp);

	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
		return (EINVAL);
	ci = &zio_checksum_table[checksum];
	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
		return (EINVAL);

	if (spa != NULL) {
		zio_checksum_template_init(checksum, __DECONST(spa_t *, spa));
		ctx = spa->spa_cksum_tmpls[checksum];
	}

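	/*
	 * Added commentary: for embedded checksums the on-disk checksum
	 * lives in a zio_eck_t trailer at the end of the block itself, so
	 * the block cannot simply be checksummed as-is.  Instead we save
	 * the stored checksum, substitute the externally-derived verifier
	 * in its place, checksum the whole block, and then put the stored
	 * value back.
	 */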
	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
		zio_eck_t *eck;

		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
		    checksum == ZIO_CHECKSUM_LABEL);

		eck = (zio_eck_t *)((char *)data + size) - 1;

		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
			zio_checksum_gang_verifier(&verifier, bp);
		else if (checksum == ZIO_CHECKSUM_LABEL)
			zio_checksum_label_verifier(&verifier,
			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
		else
			verifier = bp->blk_cksum;

		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

		if (byteswap)
			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

		expected_cksum = eck->zec_cksum;
		eck->zec_cksum = verifier;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
		eck->zec_cksum = expected_cksum;

		if (byteswap)
			byteswap_uint64_array(&expected_cksum,
			    sizeof (zio_cksum_t));
	} else {
		byteswap = BP_SHOULD_BYTESWAP(bp);
		expected_cksum = bp->blk_cksum;
		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
	}

	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
		/*printf("ZFS: read checksum %s failed\n", ci->ci_name);*/
		return (EIO);
	}

	return (0);
}

static int
zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
    void *dest, uint64_t destsize)
{
	zio_compress_info_t *ci;

	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
		return (EIO);
	}

	ci = &zio_compress_table[cpfunc];
	if (!ci->ci_decompress) {
		printf("ZFS: unsupported compression algorithm %s\n",
		    ci->ci_name);
		return (EIO);
	}

	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
}
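
/*
 * Illustrative sketch (added commentary, not in the original source): a
 * caller that has read a physically compressed block would typically
 * decompress it into a logical-size buffer like so:
 *
 *	if (zio_decompress_data(BP_GET_COMPRESS(bp), pbuf,
 *	    BP_GET_PSIZE(bp), lbuf, BP_GET_LSIZE(bp)) != 0)
 *		return (EIO);
 */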

static uint64_t
zap_hash(uint64_t salt, const char *name)
{
	const uint8_t *cp;
	uint8_t c;
	uint64_t crc = salt;

	ASSERT(crc != 0);
	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];

	/*
	 * Only use 28 bits, since we need 4 bits in the cookie for the
	 * collision differentiator.  We MUST use the high bits, since
	 * those are the ones that we first pay attention to when
	 * choosing the bucket.
	 */
	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);

	return (crc);
}
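
/*
 * Illustrative sketch (added commentary; the salt variable name is
 * hypothetical): zfs_init_crc() must have populated zfs_crc64_table before
 * zap_hash() is called, e.g. when looking up a ZAP entry by name:
 *
 *	zfs_init_crc();
 *	hash = zap_hash(zap_salt, "org.example:prop");
 *
 * Only the top ZAP_HASHBITS bits of the result are significant; the low
 * bits are reserved for the collision differentiator.
 */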

static void *zfs_alloc(size_t size);
static void zfs_free(void *ptr, size_t size);

typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
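
/*
 * Illustrative sketch (added commentary, not in the original source): the
 * 64-bit macro is equivalent to applying VDEV_RAIDZ_MUL_2 to each of the
 * eight bytes independently:
 *
 *	uint64_t x = ..., y = x, mask;
 *	uint8_t *b = (uint8_t *)&y;
 *	for (int i = 0; i < 8; i++)
 *		b[i] = VDEV_RAIDZ_MUL_2(b[i]);
 *	VDEV_RAIDZ_64MUL_2(x, mask);
 *	// x == y now holds
 *
 * The ((mask) << 1) - ((mask) >> 7) step turns each 0x80 byte of the mask
 * into 0xff: the carry shifted into the neighboring byte is exactly
 * cancelled by the borrow from the subtraction.
 */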

/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint8_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}
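
/*
 * Illustrative sketch (added, not part of the original boot code): general
 * GF(2^8) multiplication built from the same log/exp tables, shown only to
 * document how the tables compose; the boot code itself needs no more than
 * vdev_raidz_exp2().
 */
static uint8_t __unused
vdev_raidz_mul_sketch(uint8_t a, uint8_t b)
{
	int exp;

	if (a == 0 || b == 0)
		return (0);

	/* log(a * b) = log(a) + log(b), reduced modulo 255 */
	exp = vdev_raidz_log2[a] + vdev_raidz_log2[b];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}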

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p, *src, pcount, ccount, i;
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccount == pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p = *src;
			}
		} else {
			ASSERT(ccount <= pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p ^= *src;
			}
		}
	}
}
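
/*
 * Added commentary: the PQ and PQR generators below evaluate the standard
 * RAID-Z parity polynomials by Horner's rule over the Galois field
 * generated by x^8 + x^4 + x^3 + x^2 + 1 (hence the XOR with 0x1d in
 * VDEV_RAIDZ_MUL_2 above).  For data words d_0 .. d_{n-1}:
 *
 *	P = d_0 ^ d_1 ^ ... ^ d_{n-1}
 *	Q = 2^{n-1} * d_0 ^ 2^{n-2} * d_1 ^ ... ^ 2^0 * d_{n-1}
 *	R = 4^{n-1} * d_0 ^ 4^{n-2} * d_1 ^ ... ^ 4^0 * d_{n-1}
 *
 * where * is field multiplication; this is the algorithm the loop comments
 * below refer to.
 */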

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		panic("invalid RAID-Z configuration");
	}
}

/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity, for speed of
 * computation, and for linear separability.
 *
 *      __               __               __     __
 *      |   1   ..  1 1 1 |               |  p_0  |
 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
 *      |   :       : : : |   |   :   |   |  d_2  |
 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
 *      |   0   ..  0 0 1 |               | d_n-1 |
 *      ~~               ~~               ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
 *           |  19 205 116  29  64  16  4   1  |      / /
 *           |  1   0   0   0   0   0   0   0  |     / /
 *           |  0   1   0   0   0   0   0   0  | <--' /
 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
 *           |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           | 128  64  32  16  8   4   2   1  |
 *           |  19 205 116  29  64  16  4   1  |
 *           |  1   0   0   0   0   0   0   0  |
 *           |  0   1   0   0   0   0   0   0  |
 *  (V|I)' = |  0   0   1   0   0   0   0   0  |
 *           |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 * __                                                                 __
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 *                   __                               __
 *                   |  0   0   1   0   0   0   0   0  |
 *                   | 167 100  5   41 159 169 217 208 |
 *                   | 166 100  4   40 158 168 216 209 |
 *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
 *                   |  0   0   0   0   1   0   0   0  |
 *                   |  0   0   0   0   0   1   0   0  |
 *                   |  0   0   0   0   0   0   1   0  |
 *                   |  0   0   0   0   0   0   0   1  |
 *                   ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */

static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
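
/*
 * Worked example (added commentary): for the Q parity row (map[i] == 1)
 * with n == 4 data columns, the loop above starts at pow = 4 and steps
 * down by 1, producing the Vandermonde row
 * { 2^3, 2^2, 2^1, 2^0 } == { 8, 4, 2, 1 }, which matches the Q row of
 * the dispersal matrix pictured above.
 */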

static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows is properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}

static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	log = 0;	/* gcc */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = zfs_alloc(psize);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	zfs_free(p, psize);
}

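/*
 * Added commentary: the return value of vdev_raidz_reconstruct_general()
 * (and of vdev_raidz_reconstruct() below) is a bitmask with bit c set for
 * each parity column c used in the reconstruction (P = bit 0, Q = bit 1,
 * R = bit 2).  Callers compare it against the full parity mask to decide
 * whether any unused parity still needs to be verified.
 */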
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;

	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}

static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}

static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	uint64_t b = offset >> unit_shift;
	uint64_t s = size >> unit_shift;
	uint64_t f = b % dcols;
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	bc = (r == 0 ? 0 : r + nparity);
	tot = s + nparity * (q + (r == 0 ? 0 : 1));
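
	/*
	 * Worked example (added commentary): with dcols = 5 children,
	 * nparity = 1 and a write of s = 6 sectors, q = 1 and r = 2, so
	 * the first bc = 3 columns carry an extra sector and the stripe
	 * occupies tot = 6 + 1 * (1 + 1) = 8 sectors on disk including
	 * parity.
	 */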

	if (q == 0) {
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);

	rm->rm_col[c].rc_data = data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}

static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
		zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);

	zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

static vdev_t *
vdev_child(vdev_t *pvd, uint64_t devidx)
{
	vdev_t *cvd;

	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
		if (cvd->v_id == devidx)
			break;
	}

	return (cvd);
}

/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
    uint64_t size)
{
	return (zio_checksum_verify(spa, bp, data));
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zfs_alloc(rc->rc_size);
		bcopy(rc->rc_data, orig[c], rc->rc_size);
	}

	vdev_raidz_generate_parity(rm);

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			rc->rc_error = ECKSUM;
			ret++;
		}
		zfs_free(orig[c], rc->rc_size);
	}

	return (ret);
}

/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 */
static int
vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
    void *data, off_t offset, uint64_t bytes, int total_errors, int data_errors)
{
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	n--;
done:
	for (i = n - 1; i >= 0; i--) {
		zfs_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}

static int
vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
    off_t offset, size_t bytes)
{
	vdev_t *tvd = vd->v_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, error;
	int unexpected_errors;
	int parity_errors;
	int parity_untried;
	int data_errors;
	int total_errors;
	int n;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	rc = NULL;	/* gcc */
	error = 0;

	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
	    vd->v_nchildren, vd->v_nparity);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vdev_child(vd, rc->rc_devidx);
		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
#if 0		/* XXX: Too hard for the boot code. */
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
#endif
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
			    rc->rc_offset, rc->rc_size);
			rc->rc_tried = 1;
			rc->rc_skipped = 0;
		}
	}

reconstruct:
	unexpected_errors = 0;
	parity_errors = 0;
	parity_untried = 0;
	data_errors = 0;
	total_errors = 0;

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(vd->spa, bp, data, bytes) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(vd->spa, bp, data, bytes) == 0) {
				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read
	 * error or a child silently returned bad data. Read every
	 * block so we can try again with as much data and parity as
	 * we can track down. If we've already been through once
	 * before, all children will be marked as tried so we'll
	 * proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	n = 0;
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_tried)
			continue;

		cvd = vdev_child(vd, rc->rc_devidx);
		ASSERT(cvd != NULL);
		rc->rc_error = cvd->v_read(cvd, NULL,
		    rc->rc_data, rc->rc_offset, rc->rc_size);
		if (rc->rc_error == 0)
			n++;
		rc->rc_tried = 1;
		rc->rc_skipped = 0;
	}
	/*
	 * If we managed to read anything more, retry the
	 * reconstruction.
	 */
	if (n > 0)
		goto reconstruct;

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		error = EIO;
	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(vd->spa, rm, bp, data, offset, bytes,
	     total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		error = ECKSUM;
	}

done:
	vdev_raidz_map_free(rm);

	return (error);
}
1790