xref: /freebsd/sys/cddl/boot/zfs/zfssubr.c (revision 8d20be1e22095c27faf8fe8b2f0d089739cc742e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/cdefs.h>
27 __FBSDID("$FreeBSD$");
28 
29 static uint64_t zfs_crc64_table[256];
30 
31 #define	ECKSUM	666
32 
33 #define	ASSERT(...)	do { } while (0)
34 #define	ASSERT3U(...)	do { } while (0)
35 #define	ASSERT3S(...)	do { } while (0)
36 
37 #define	panic(...)	do {						\
38 	printf(__VA_ARGS__);						\
39 	for (;;) ;							\
40 } while (0)
41 
42 #define	kmem_alloc(size, flag)	zfs_alloc((size))
43 #define	kmem_free(ptr, size)	zfs_free((ptr), (size))
44 
45 static void
46 zfs_init_crc(void)
47 {
48 	int i, j;
49 	uint64_t *ct;
50 
51 	/*
52 	 * Calculate the crc64 table (used for the zap hash
53 	 * function).
54 	 */
55 	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
56 		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
57 		for (i = 0; i < 256; i++)
58 			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
59 				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
60 	}
61 }
62 
63 static void
64 zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
65 {
66 	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
67 }
68 
69 /*
70  * Signature for checksum functions.
71  */
72 typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);
73 
74 /*
75  * Information about each checksum function.
76  */
77 typedef struct zio_checksum_info {
78 	zio_checksum_t	*ci_func[2]; /* checksum function for each byteorder */
79 	int		ci_correctable;	/* number of correctable bits	*/
80 	int		ci_eck;		/* uses zio embedded checksum? */
81 	int		ci_dedup;	/* strong enough for dedup? */
82 	const char	*ci_name;	/* descriptive name */
83 } zio_checksum_info_t;
84 
85 #include "fletcher.c"
86 #include "sha256.c"
87 
88 static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
89 	{{NULL,			NULL},			0, 0, 0, "inherit"},
90 	{{NULL,			NULL},			0, 0, 0, "on"},
91 	{{zio_checksum_off,	zio_checksum_off},	0, 0, 0, "off"},
92 	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 1, 0, "label"},
93 	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 1, 0, "gang_header"},
94 	{{fletcher_2_native,	fletcher_2_byteswap},	0, 1, 0, "zilog"},
95 	{{fletcher_2_native,	fletcher_2_byteswap},	0, 0, 0, "fletcher2"},
96 	{{fletcher_4_native,	fletcher_4_byteswap},	1, 0, 0, "fletcher4"},
97 	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 0, 1, "SHA256"},
98 	{{fletcher_4_native,	fletcher_4_byteswap},	0, 1, 0, "zillog2"},
99 };
100 
101 
102 /*
103  * Common signature for all zio compress/decompress functions.
104  */
105 typedef size_t zio_compress_func_t(void *src, void *dst,
106     size_t s_len, size_t d_len, int);
107 typedef int zio_decompress_func_t(void *src, void *dst,
108     size_t s_len, size_t d_len, int);
109 
110 /*
111  * Information about each compression function.
112  */
113 typedef struct zio_compress_info {
114 	zio_compress_func_t	*ci_compress;	/* compression function */
115 	zio_decompress_func_t	*ci_decompress;	/* decompression function */
116 	int			ci_level;	/* level parameter */
117 	const char		*ci_name;	/* algorithm name */
118 } zio_compress_info_t;
119 
120 #include "lzjb.c"
121 #include "zle.c"
122 #include "lz4.c"
123 
124 /*
125  * Compression vectors.
126  */
127 static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
128 	{NULL,			NULL,			0,	"inherit"},
129 	{NULL,			NULL,			0,	"on"},
130 	{NULL,			NULL,			0,	"uncompressed"},
131 	{NULL,			lzjb_decompress,	0,	"lzjb"},
132 	{NULL,			NULL,			0,	"empty"},
133 	{NULL,			NULL,			1,	"gzip-1"},
134 	{NULL,			NULL,			2,	"gzip-2"},
135 	{NULL,			NULL,			3,	"gzip-3"},
136 	{NULL,			NULL,			4,	"gzip-4"},
137 	{NULL,			NULL,			5,	"gzip-5"},
138 	{NULL,			NULL,			6,	"gzip-6"},
139 	{NULL,			NULL,			7,	"gzip-7"},
140 	{NULL,			NULL,			8,	"gzip-8"},
141 	{NULL,			NULL,			9,	"gzip-9"},
142 	{NULL,			zle_decompress,		64,	"zle"},
143 	{NULL,			lz4_decompress,		0,	"lz4"},
144 };
145 
146 static void
147 byteswap_uint64_array(void *vbuf, size_t size)
148 {
149 	uint64_t *buf = vbuf;
150 	size_t count = size >> 3;
151 	int i;
152 
153 	ASSERT((size & 7) == 0);
154 
155 	for (i = 0; i < count; i++)
156 		buf[i] = BSWAP_64(buf[i]);
157 }
158 
159 /*
160  * Set the external verifier for a gang block based on <vdev, offset, txg>,
161  * a tuple which is guaranteed to be unique for the life of the pool.
162  */
163 static void
164 zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
165 {
166 	const dva_t *dva = BP_IDENTITY(bp);
167 	uint64_t txg = BP_PHYSICAL_BIRTH(bp);
168 
169 	ASSERT(BP_IS_GANG(bp));
170 
171 	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
172 }
173 
174 /*
175  * Set the external verifier for a label block based on its offset.
176  * The vdev is implicit, and the txg is unknowable at pool open time --
177  * hence the logic in vdev_uberblock_load() to find the most recent copy.
178  */
179 static void
180 zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
181 {
182 	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
183 }
184 
185 static int
186 zio_checksum_verify(const blkptr_t *bp, void *data)
187 {
188 	uint64_t size;
189 	unsigned int checksum;
190 	zio_checksum_info_t *ci;
191 	zio_cksum_t actual_cksum, expected_cksum, verifier;
192 	int byteswap;
193 
194 	checksum = BP_GET_CHECKSUM(bp);
195 	size = BP_GET_PSIZE(bp);
196 
197 	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
198 		return (EINVAL);
199 	ci = &zio_checksum_table[checksum];
200 	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
201 		return (EINVAL);
202 
203 	if (ci->ci_eck) {
204 		zio_eck_t *eck;
205 
206 		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
207 		    checksum == ZIO_CHECKSUM_LABEL);
208 
209 		eck = (zio_eck_t *)((char *)data + size) - 1;
210 
211 		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
212 			zio_checksum_gang_verifier(&verifier, bp);
213 		else if (checksum == ZIO_CHECKSUM_LABEL)
214 			zio_checksum_label_verifier(&verifier,
215 			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
216 		else
217 			verifier = bp->blk_cksum;
218 
219 		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
220 
221 		if (byteswap)
222 			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
223 
224 		expected_cksum = eck->zec_cksum;
225 		eck->zec_cksum = verifier;
226 		ci->ci_func[byteswap](data, size, &actual_cksum);
227 		eck->zec_cksum = expected_cksum;
228 
229 		if (byteswap)
230 			byteswap_uint64_array(&expected_cksum,
231 			    sizeof (zio_cksum_t));
232 	} else {
233 		expected_cksum = bp->blk_cksum;
234 		ci->ci_func[0](data, size, &actual_cksum);
235 	}
236 
237 	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
238 		/*printf("ZFS: read checksum failed\n");*/
239 		return (EIO);
240 	}
241 
242 	return (0);
243 }
244 
245 static int
246 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
247 	void *dest, uint64_t destsize)
248 {
249 	zio_compress_info_t *ci;
250 
251 	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
252 		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
253 		return (EIO);
254 	}
255 
256 	ci = &zio_compress_table[cpfunc];
257 	if (!ci->ci_decompress) {
258 		printf("ZFS: unsupported compression algorithm %s\n",
259 		    ci->ci_name);
260 		return (EIO);
261 	}
262 
263 	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
264 }
265 
266 static uint64_t
267 zap_hash(uint64_t salt, const char *name)
268 {
269 	const uint8_t *cp;
270 	uint8_t c;
271 	uint64_t crc = salt;
272 
273 	ASSERT(crc != 0);
274 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
275 	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
276 		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
277 
278 	/*
279 	 * Only use 28 bits, since we need 4 bits in the cookie for the
280 	 * collision differentiator.  We MUST use the high bits, since
281 	 * those are the onces that we first pay attention to when
282 	 * chosing the bucket.
283 	 */
284 	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
285 
286 	return (crc);
287 }
288 
289 static void *zfs_alloc(size_t size);
290 static void zfs_free(void *ptr, size_t size);
291 
292 typedef struct raidz_col {
293 	uint64_t rc_devidx;		/* child device index for I/O */
294 	uint64_t rc_offset;		/* device offset */
295 	uint64_t rc_size;		/* I/O size */
296 	void *rc_data;			/* I/O data */
297 	int rc_error;			/* I/O error for this device */
298 	uint8_t rc_tried;		/* Did we attempt this I/O column? */
299 	uint8_t rc_skipped;		/* Did we skip this I/O column? */
300 } raidz_col_t;
301 
302 typedef struct raidz_map {
303 	uint64_t rm_cols;		/* Regular column count */
304 	uint64_t rm_scols;		/* Count including skipped columns */
305 	uint64_t rm_bigcols;		/* Number of oversized columns */
306 	uint64_t rm_asize;		/* Actual total I/O size */
307 	uint64_t rm_missingdata;	/* Count of missing data devices */
308 	uint64_t rm_missingparity;	/* Count of missing parity devices */
309 	uint64_t rm_firstdatacol;	/* First data column/parity count */
310 	uint64_t rm_nskip;		/* Skipped sectors for padding */
311 	uint64_t rm_skipstart;		/* Column index of padding start */
312 	uintptr_t rm_reports;		/* # of referencing checksum reports */
313 	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
314 	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
315 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
316 } raidz_map_t;
317 
318 #define	VDEV_RAIDZ_P		0
319 #define	VDEV_RAIDZ_Q		1
320 #define	VDEV_RAIDZ_R		2
321 
322 #define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
323 #define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
324 
325 /*
326  * We provide a mechanism to perform the field multiplication operation on a
327  * 64-bit value all at once rather than a byte at a time. This works by
328  * creating a mask from the top bit in each byte and using that to
329  * conditionally apply the XOR of 0x1d.
330  */
331 #define	VDEV_RAIDZ_64MUL_2(x, mask) \
332 { \
333 	(mask) = (x) & 0x8080808080808080ULL; \
334 	(mask) = ((mask) << 1) - ((mask) >> 7); \
335 	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
336 	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
337 }
338 
339 #define	VDEV_RAIDZ_64MUL_4(x, mask) \
340 { \
341 	VDEV_RAIDZ_64MUL_2((x), mask); \
342 	VDEV_RAIDZ_64MUL_2((x), mask); \
343 }
344 
345 /*
346  * These two tables represent powers and logs of 2 in the Galois field defined
347  * above. These values were computed by repeatedly multiplying by 2 as above.
348  */
349 static const uint8_t vdev_raidz_pow2[256] = {
350 	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
351 	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
352 	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
353 	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
354 	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
355 	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
356 	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
357 	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
358 	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
359 	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
360 	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
361 	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
362 	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
363 	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
364 	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
365 	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
366 	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
367 	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
368 	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
369 	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
370 	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
371 	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
372 	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
373 	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
374 	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
375 	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
376 	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
377 	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
378 	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
379 	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
380 	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
381 	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
382 };
383 static const uint8_t vdev_raidz_log2[256] = {
384 	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
385 	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
386 	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
387 	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
388 	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
389 	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
390 	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
391 	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
392 	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
393 	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
394 	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
395 	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
396 	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
397 	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
398 	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
399 	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
400 	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
401 	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
402 	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
403 	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
404 	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
405 	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
406 	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
407 	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
408 	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
409 	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
410 	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
411 	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
412 	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
413 	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
414 	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
415 	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
416 };
417 
418 /*
419  * Multiply a given number by 2 raised to the given power.
420  */
421 static uint8_t
422 vdev_raidz_exp2(uint8_t a, int exp)
423 {
424 	if (a == 0)
425 		return (0);
426 
427 	ASSERT(exp >= 0);
428 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
429 
430 	exp += vdev_raidz_log2[a];
431 	if (exp > 255)
432 		exp -= 255;
433 
434 	return (vdev_raidz_pow2[exp]);
435 }
436 
437 static void
438 vdev_raidz_generate_parity_p(raidz_map_t *rm)
439 {
440 	uint64_t *p, *src, pcount, ccount, i;
441 	int c;
442 
443 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
444 
445 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
446 		src = rm->rm_col[c].rc_data;
447 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
448 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
449 
450 		if (c == rm->rm_firstdatacol) {
451 			ASSERT(ccount == pcount);
452 			for (i = 0; i < ccount; i++, src++, p++) {
453 				*p = *src;
454 			}
455 		} else {
456 			ASSERT(ccount <= pcount);
457 			for (i = 0; i < ccount; i++, src++, p++) {
458 				*p ^= *src;
459 			}
460 		}
461 	}
462 }
463 
464 static void
465 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
466 {
467 	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
468 	int c;
469 
470 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
471 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
472 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
473 
474 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
475 		src = rm->rm_col[c].rc_data;
476 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
477 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
478 
479 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
480 
481 		if (c == rm->rm_firstdatacol) {
482 			ASSERT(ccnt == pcnt || ccnt == 0);
483 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
484 				*p = *src;
485 				*q = *src;
486 			}
487 			for (; i < pcnt; i++, src++, p++, q++) {
488 				*p = 0;
489 				*q = 0;
490 			}
491 		} else {
492 			ASSERT(ccnt <= pcnt);
493 
494 			/*
495 			 * Apply the algorithm described above by multiplying
496 			 * the previous result and adding in the new value.
497 			 */
498 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
499 				*p ^= *src;
500 
501 				VDEV_RAIDZ_64MUL_2(*q, mask);
502 				*q ^= *src;
503 			}
504 
505 			/*
506 			 * Treat short columns as though they are full of 0s.
507 			 * Note that there's therefore nothing needed for P.
508 			 */
509 			for (; i < pcnt; i++, q++) {
510 				VDEV_RAIDZ_64MUL_2(*q, mask);
511 			}
512 		}
513 	}
514 }
515 
516 static void
517 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
518 {
519 	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
520 	int c;
521 
522 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
523 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
524 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
525 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
526 	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
527 
528 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
529 		src = rm->rm_col[c].rc_data;
530 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
531 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
532 		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
533 
534 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
535 
536 		if (c == rm->rm_firstdatacol) {
537 			ASSERT(ccnt == pcnt || ccnt == 0);
538 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
539 				*p = *src;
540 				*q = *src;
541 				*r = *src;
542 			}
543 			for (; i < pcnt; i++, src++, p++, q++, r++) {
544 				*p = 0;
545 				*q = 0;
546 				*r = 0;
547 			}
548 		} else {
549 			ASSERT(ccnt <= pcnt);
550 
551 			/*
552 			 * Apply the algorithm described above by multiplying
553 			 * the previous result and adding in the new value.
554 			 */
555 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
556 				*p ^= *src;
557 
558 				VDEV_RAIDZ_64MUL_2(*q, mask);
559 				*q ^= *src;
560 
561 				VDEV_RAIDZ_64MUL_4(*r, mask);
562 				*r ^= *src;
563 			}
564 
565 			/*
566 			 * Treat short columns as though they are full of 0s.
567 			 * Note that there's therefore nothing needed for P.
568 			 */
569 			for (; i < pcnt; i++, q++, r++) {
570 				VDEV_RAIDZ_64MUL_2(*q, mask);
571 				VDEV_RAIDZ_64MUL_4(*r, mask);
572 			}
573 		}
574 	}
575 }
576 
577 /*
578  * Generate RAID parity in the first virtual columns according to the number of
579  * parity columns available.
580  */
581 static void
582 vdev_raidz_generate_parity(raidz_map_t *rm)
583 {
584 	switch (rm->rm_firstdatacol) {
585 	case 1:
586 		vdev_raidz_generate_parity_p(rm);
587 		break;
588 	case 2:
589 		vdev_raidz_generate_parity_pq(rm);
590 		break;
591 	case 3:
592 		vdev_raidz_generate_parity_pqr(rm);
593 		break;
594 	default:
595 		panic("invalid RAID-Z configuration");
596 	}
597 }
598 
599 /* BEGIN CSTYLED */
600 /*
601  * In the general case of reconstruction, we must solve the system of linear
602  * equations defined by the coeffecients used to generate parity as well as
603  * the contents of the data and parity disks. This can be expressed with
604  * vectors for the original data (D) and the actual data (d) and parity (p)
605  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
606  *
607  *            __   __                     __     __
608  *            |     |         __     __   |  p_0  |
609  *            |  V  |         |  D_0  |   | p_m-1 |
610  *            |     |    x    |   :   | = |  d_0  |
611  *            |  I  |         | D_n-1 |   |   :   |
612  *            |     |         ~~     ~~   | d_n-1 |
613  *            ~~   ~~                     ~~     ~~
614  *
615  * I is simply a square identity matrix of size n, and V is a vandermonde
616  * matrix defined by the coeffecients we chose for the various parity columns
617  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
618  * computation as well as linear separability.
619  *
620  *      __               __               __     __
621  *      |   1   ..  1 1 1 |               |  p_0  |
622  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
623  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
624  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
625  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
626  *      |   :       : : : |   |   :   |   |  d_2  |
627  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
628  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
629  *      |   0   ..  0 0 1 |               | d_n-1 |
630  *      ~~               ~~               ~~     ~~
631  *
632  * Note that I, V, d, and p are known. To compute D, we must invert the
633  * matrix and use the known data and parity values to reconstruct the unknown
634  * data values. We begin by removing the rows in V|I and d|p that correspond
635  * to failed or missing columns; we then make V|I square (n x n) and d|p
636  * sized n by removing rows corresponding to unused parity from the bottom up
637  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
638  * using Gauss-Jordan elimination. In the example below we use m=3 parity
639  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
640  *           __                               __
641  *           |  1   1   1   1   1   1   1   1  |
642  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
643  *           |  19 205 116  29  64  16  4   1  |      / /
644  *           |  1   0   0   0   0   0   0   0  |     / /
645  *           |  0   1   0   0   0   0   0   0  | <--' /
646  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
647  *           |  0   0   0   1   0   0   0   0  |
648  *           |  0   0   0   0   1   0   0   0  |
649  *           |  0   0   0   0   0   1   0   0  |
650  *           |  0   0   0   0   0   0   1   0  |
651  *           |  0   0   0   0   0   0   0   1  |
652  *           ~~                               ~~
653  *           __                               __
654  *           |  1   1   1   1   1   1   1   1  |
655  *           | 128  64  32  16  8   4   2   1  |
656  *           |  19 205 116  29  64  16  4   1  |
657  *           |  1   0   0   0   0   0   0   0  |
658  *           |  0   1   0   0   0   0   0   0  |
659  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
660  *           |  0   0   0   1   0   0   0   0  |
661  *           |  0   0   0   0   1   0   0   0  |
662  *           |  0   0   0   0   0   1   0   0  |
663  *           |  0   0   0   0   0   0   1   0  |
664  *           |  0   0   0   0   0   0   0   1  |
665  *           ~~                               ~~
666  *
667  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
668  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
669  * matrix is not singular.
670  * __                                                                 __
671  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
672  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
673  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
674  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
675  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
676  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
677  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
678  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
679  * ~~                                                                 ~~
680  * __                                                                 __
681  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
682  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
683  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
684  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
685  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
686  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
687  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
688  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
689  * ~~                                                                 ~~
690  * __                                                                 __
691  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
692  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
693  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
694  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
695  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
696  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
697  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
698  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
699  * ~~                                                                 ~~
700  * __                                                                 __
701  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
702  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
703  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
704  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
705  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
706  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
707  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
708  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
709  * ~~                                                                 ~~
710  * __                                                                 __
711  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
712  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
713  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
714  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
715  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
716  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
717  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
718  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
719  * ~~                                                                 ~~
720  * __                                                                 __
721  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
722  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
723  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
724  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
725  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
726  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
727  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
728  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
729  * ~~                                                                 ~~
730  *                   __                               __
731  *                   |  0   0   1   0   0   0   0   0  |
732  *                   | 167 100  5   41 159 169 217 208 |
733  *                   | 166 100  4   40 158 168 216 209 |
734  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
735  *                   |  0   0   0   0   1   0   0   0  |
736  *                   |  0   0   0   0   0   1   0   0  |
737  *                   |  0   0   0   0   0   0   1   0  |
738  *                   |  0   0   0   0   0   0   0   1  |
739  *                   ~~                               ~~
740  *
741  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
742  * of the missing data.
743  *
744  * As is apparent from the example above, the only non-trivial rows in the
745  * inverse matrix correspond to the data disks that we're trying to
746  * reconstruct. Indeed, those are the only rows we need as the others would
747  * only be useful for reconstructing data known or assumed to be valid. For
748  * that reason, we only build the coefficients in the rows that correspond to
749  * targeted columns.
750  */
751 /* END CSTYLED */
752 
753 static void
754 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
755     uint8_t **rows)
756 {
757 	int i, j;
758 	int pow;
759 
760 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
761 
762 	/*
763 	 * Fill in the missing rows of interest.
764 	 */
765 	for (i = 0; i < nmap; i++) {
766 		ASSERT3S(0, <=, map[i]);
767 		ASSERT3S(map[i], <=, 2);
768 
769 		pow = map[i] * n;
770 		if (pow > 255)
771 			pow -= 255;
772 		ASSERT(pow <= 255);
773 
774 		for (j = 0; j < n; j++) {
775 			pow -= map[i];
776 			if (pow < 0)
777 				pow += 255;
778 			rows[i][j] = vdev_raidz_pow2[pow];
779 		}
780 	}
781 }
782 
783 static void
784 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
785     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
786 {
787 	int i, j, ii, jj;
788 	uint8_t log;
789 
790 	/*
791 	 * Assert that the first nmissing entries from the array of used
792 	 * columns correspond to parity columns and that subsequent entries
793 	 * correspond to data columns.
794 	 */
795 	for (i = 0; i < nmissing; i++) {
796 		ASSERT3S(used[i], <, rm->rm_firstdatacol);
797 	}
798 	for (; i < n; i++) {
799 		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
800 	}
801 
802 	/*
803 	 * First initialize the storage where we'll compute the inverse rows.
804 	 */
805 	for (i = 0; i < nmissing; i++) {
806 		for (j = 0; j < n; j++) {
807 			invrows[i][j] = (i == j) ? 1 : 0;
808 		}
809 	}
810 
811 	/*
812 	 * Subtract all trivial rows from the rows of consequence.
813 	 */
814 	for (i = 0; i < nmissing; i++) {
815 		for (j = nmissing; j < n; j++) {
816 			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
817 			jj = used[j] - rm->rm_firstdatacol;
818 			ASSERT3S(jj, <, n);
819 			invrows[i][j] = rows[i][jj];
820 			rows[i][jj] = 0;
821 		}
822 	}
823 
824 	/*
825 	 * For each of the rows of interest, we must normalize it and subtract
826 	 * a multiple of it from the other rows.
827 	 */
828 	for (i = 0; i < nmissing; i++) {
829 		for (j = 0; j < missing[i]; j++) {
830 			ASSERT3U(rows[i][j], ==, 0);
831 		}
832 		ASSERT3U(rows[i][missing[i]], !=, 0);
833 
834 		/*
835 		 * Compute the inverse of the first element and multiply each
836 		 * element in the row by that value.
837 		 */
838 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
839 
840 		for (j = 0; j < n; j++) {
841 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
842 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
843 		}
844 
845 		for (ii = 0; ii < nmissing; ii++) {
846 			if (i == ii)
847 				continue;
848 
849 			ASSERT3U(rows[ii][missing[i]], !=, 0);
850 
851 			log = vdev_raidz_log2[rows[ii][missing[i]]];
852 
853 			for (j = 0; j < n; j++) {
854 				rows[ii][j] ^=
855 				    vdev_raidz_exp2(rows[i][j], log);
856 				invrows[ii][j] ^=
857 				    vdev_raidz_exp2(invrows[i][j], log);
858 			}
859 		}
860 	}
861 
862 	/*
863 	 * Verify that the data that is left in the rows are properly part of
864 	 * an identity matrix.
865 	 */
866 	for (i = 0; i < nmissing; i++) {
867 		for (j = 0; j < n; j++) {
868 			if (j == missing[i]) {
869 				ASSERT3U(rows[i][j], ==, 1);
870 			} else {
871 				ASSERT3U(rows[i][j], ==, 0);
872 			}
873 		}
874 	}
875 }
876 
877 static void
878 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
879     int *missing, uint8_t **invrows, const uint8_t *used)
880 {
881 	int i, j, x, cc, c;
882 	uint8_t *src;
883 	uint64_t ccount;
884 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
885 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
886 	uint8_t log, val;
887 	int ll;
888 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
889 	uint8_t *p, *pp;
890 	size_t psize;
891 
892 	log = 0;	/* gcc */
893 	psize = sizeof (invlog[0][0]) * n * nmissing;
894 	p = zfs_alloc(psize);
895 
896 	for (pp = p, i = 0; i < nmissing; i++) {
897 		invlog[i] = pp;
898 		pp += n;
899 	}
900 
901 	for (i = 0; i < nmissing; i++) {
902 		for (j = 0; j < n; j++) {
903 			ASSERT3U(invrows[i][j], !=, 0);
904 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
905 		}
906 	}
907 
908 	for (i = 0; i < n; i++) {
909 		c = used[i];
910 		ASSERT3U(c, <, rm->rm_cols);
911 
912 		src = rm->rm_col[c].rc_data;
913 		ccount = rm->rm_col[c].rc_size;
914 		for (j = 0; j < nmissing; j++) {
915 			cc = missing[j] + rm->rm_firstdatacol;
916 			ASSERT3U(cc, >=, rm->rm_firstdatacol);
917 			ASSERT3U(cc, <, rm->rm_cols);
918 			ASSERT3U(cc, !=, c);
919 
920 			dst[j] = rm->rm_col[cc].rc_data;
921 			dcount[j] = rm->rm_col[cc].rc_size;
922 		}
923 
924 		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
925 
926 		for (x = 0; x < ccount; x++, src++) {
927 			if (*src != 0)
928 				log = vdev_raidz_log2[*src];
929 
930 			for (cc = 0; cc < nmissing; cc++) {
931 				if (x >= dcount[cc])
932 					continue;
933 
934 				if (*src == 0) {
935 					val = 0;
936 				} else {
937 					if ((ll = log + invlog[cc][i]) >= 255)
938 						ll -= 255;
939 					val = vdev_raidz_pow2[ll];
940 				}
941 
942 				if (i == 0)
943 					dst[cc][x] = val;
944 				else
945 					dst[cc][x] ^= val;
946 			}
947 		}
948 	}
949 
950 	zfs_free(p, psize);
951 }
952 
953 static int
954 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
955 {
956 	int n, i, c, t, tt;
957 	int nmissing_rows;
958 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
959 	int parity_map[VDEV_RAIDZ_MAXPARITY];
960 
961 	uint8_t *p, *pp;
962 	size_t psize;
963 
964 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
965 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
966 	uint8_t *used;
967 
968 	int code = 0;
969 
970 
971 	n = rm->rm_cols - rm->rm_firstdatacol;
972 
973 	/*
974 	 * Figure out which data columns are missing.
975 	 */
976 	nmissing_rows = 0;
977 	for (t = 0; t < ntgts; t++) {
978 		if (tgts[t] >= rm->rm_firstdatacol) {
979 			missing_rows[nmissing_rows++] =
980 			    tgts[t] - rm->rm_firstdatacol;
981 		}
982 	}
983 
984 	/*
985 	 * Figure out which parity columns to use to help generate the missing
986 	 * data columns.
987 	 */
988 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
989 		ASSERT(tt < ntgts);
990 		ASSERT(c < rm->rm_firstdatacol);
991 
992 		/*
993 		 * Skip any targeted parity columns.
994 		 */
995 		if (c == tgts[tt]) {
996 			tt++;
997 			continue;
998 		}
999 
1000 		code |= 1 << c;
1001 
1002 		parity_map[i] = c;
1003 		i++;
1004 	}
1005 
1006 	ASSERT(code != 0);
1007 	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1008 
1009 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1010 	    nmissing_rows * n + sizeof (used[0]) * n;
1011 	p = kmem_alloc(psize, KM_SLEEP);
1012 
1013 	for (pp = p, i = 0; i < nmissing_rows; i++) {
1014 		rows[i] = pp;
1015 		pp += n;
1016 		invrows[i] = pp;
1017 		pp += n;
1018 	}
1019 	used = pp;
1020 
1021 	for (i = 0; i < nmissing_rows; i++) {
1022 		used[i] = parity_map[i];
1023 	}
1024 
1025 	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1026 		if (tt < nmissing_rows &&
1027 		    c == missing_rows[tt] + rm->rm_firstdatacol) {
1028 			tt++;
1029 			continue;
1030 		}
1031 
1032 		ASSERT3S(i, <, n);
1033 		used[i] = c;
1034 		i++;
1035 	}
1036 
1037 	/*
1038 	 * Initialize the interesting rows of the matrix.
1039 	 */
1040 	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1041 
1042 	/*
1043 	 * Invert the matrix.
1044 	 */
1045 	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1046 	    invrows, used);
1047 
1048 	/*
1049 	 * Reconstruct the missing data using the generated matrix.
1050 	 */
1051 	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1052 	    invrows, used);
1053 
1054 	kmem_free(p, psize);
1055 
1056 	return (code);
1057 }
1058 
1059 static int
1060 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1061 {
1062 	int tgts[VDEV_RAIDZ_MAXPARITY];
1063 	int ntgts;
1064 	int i, c;
1065 	int code;
1066 	int nbadparity, nbaddata;
1067 
1068 	/*
1069 	 * The tgts list must already be sorted.
1070 	 */
1071 	for (i = 1; i < nt; i++) {
1072 		ASSERT(t[i] > t[i - 1]);
1073 	}
1074 
1075 	nbadparity = rm->rm_firstdatacol;
1076 	nbaddata = rm->rm_cols - nbadparity;
1077 	ntgts = 0;
1078 	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1079 		if (i < nt && c == t[i]) {
1080 			tgts[ntgts++] = c;
1081 			i++;
1082 		} else if (rm->rm_col[c].rc_error != 0) {
1083 			tgts[ntgts++] = c;
1084 		} else if (c >= rm->rm_firstdatacol) {
1085 			nbaddata--;
1086 		} else {
1087 			nbadparity--;
1088 		}
1089 	}
1090 
1091 	ASSERT(ntgts >= nt);
1092 	ASSERT(nbaddata >= 0);
1093 	ASSERT(nbaddata + nbadparity == ntgts);
1094 
1095 	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1096 	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1097 	ASSERT(code > 0);
1098 	return (code);
1099 }
1100 
1101 static raidz_map_t *
1102 vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
1103     uint64_t dcols, uint64_t nparity)
1104 {
1105 	raidz_map_t *rm;
1106 	uint64_t b = offset >> unit_shift;
1107 	uint64_t s = size >> unit_shift;
1108 	uint64_t f = b % dcols;
1109 	uint64_t o = (b / dcols) << unit_shift;
1110 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
1111 
1112 	q = s / (dcols - nparity);
1113 	r = s - q * (dcols - nparity);
1114 	bc = (r == 0 ? 0 : r + nparity);
1115 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
1116 
1117 	if (q == 0) {
1118 		acols = bc;
1119 		scols = MIN(dcols, roundup(bc, nparity + 1));
1120 	} else {
1121 		acols = dcols;
1122 		scols = dcols;
1123 	}
1124 
1125 	ASSERT3U(acols, <=, scols);
1126 
1127 	rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));
1128 
1129 	rm->rm_cols = acols;
1130 	rm->rm_scols = scols;
1131 	rm->rm_bigcols = bc;
1132 	rm->rm_skipstart = bc;
1133 	rm->rm_missingdata = 0;
1134 	rm->rm_missingparity = 0;
1135 	rm->rm_firstdatacol = nparity;
1136 	rm->rm_reports = 0;
1137 	rm->rm_freed = 0;
1138 	rm->rm_ecksuminjected = 0;
1139 
1140 	asize = 0;
1141 
1142 	for (c = 0; c < scols; c++) {
1143 		col = f + c;
1144 		coff = o;
1145 		if (col >= dcols) {
1146 			col -= dcols;
1147 			coff += 1ULL << unit_shift;
1148 		}
1149 		rm->rm_col[c].rc_devidx = col;
1150 		rm->rm_col[c].rc_offset = coff;
1151 		rm->rm_col[c].rc_data = NULL;
1152 		rm->rm_col[c].rc_error = 0;
1153 		rm->rm_col[c].rc_tried = 0;
1154 		rm->rm_col[c].rc_skipped = 0;
1155 
1156 		if (c >= acols)
1157 			rm->rm_col[c].rc_size = 0;
1158 		else if (c < bc)
1159 			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
1160 		else
1161 			rm->rm_col[c].rc_size = q << unit_shift;
1162 
1163 		asize += rm->rm_col[c].rc_size;
1164 	}
1165 
1166 	ASSERT3U(asize, ==, tot << unit_shift);
1167 	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
1168 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
1169 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
1170 	ASSERT3U(rm->rm_nskip, <=, nparity);
1171 
1172 	for (c = 0; c < rm->rm_firstdatacol; c++)
1173 		rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);
1174 
1175 	rm->rm_col[c].rc_data = data;
1176 
1177 	for (c = c + 1; c < acols; c++)
1178 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
1179 		    rm->rm_col[c - 1].rc_size;
1180 
1181 	/*
1182 	 * If all data stored spans all columns, there's a danger that parity
1183 	 * will always be on the same device and, since parity isn't read
1184 	 * during normal operation, that that device's I/O bandwidth won't be
1185 	 * used effectively. We therefore switch the parity every 1MB.
1186 	 *
1187 	 * ... at least that was, ostensibly, the theory. As a practical
1188 	 * matter unless we juggle the parity between all devices evenly, we
1189 	 * won't see any benefit. Further, occasional writes that aren't a
1190 	 * multiple of the LCM of the number of children and the minimum
1191 	 * stripe width are sufficient to avoid pessimal behavior.
1192 	 * Unfortunately, this decision created an implicit on-disk format
1193 	 * requirement that we need to support for all eternity, but only
1194 	 * for single-parity RAID-Z.
1195 	 *
1196 	 * If we intend to skip a sector in the zeroth column for padding
1197 	 * we must make sure to note this swap. We will never intend to
1198 	 * skip the first column since at least one data and one parity
1199 	 * column must appear in each row.
1200 	 */
1201 	ASSERT(rm->rm_cols >= 2);
1202 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
1203 
1204 	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
1205 		devidx = rm->rm_col[0].rc_devidx;
1206 		o = rm->rm_col[0].rc_offset;
1207 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
1208 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
1209 		rm->rm_col[1].rc_devidx = devidx;
1210 		rm->rm_col[1].rc_offset = o;
1211 
1212 		if (rm->rm_skipstart == 0)
1213 			rm->rm_skipstart = 1;
1214 	}
1215 
1216 	return (rm);
1217 }
1218 
1219 static void
1220 vdev_raidz_map_free(raidz_map_t *rm)
1221 {
1222 	int c;
1223 
1224 	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
1225 		zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
1226 
1227 	zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
1228 }
1229 
1230 static vdev_t *
1231 vdev_child(vdev_t *pvd, uint64_t devidx)
1232 {
1233 	vdev_t *cvd;
1234 
1235 	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
1236 		if (cvd->v_id == devidx)
1237 			break;
1238 	}
1239 
1240 	return (cvd);
1241 }
1242 
1243 /*
1244  * We keep track of whether or not there were any injected errors, so that
1245  * any ereports we generate can note it.
1246  */
1247 static int
1248 raidz_checksum_verify(const blkptr_t *bp, void *data, uint64_t size)
1249 {
1250 
1251 	return (zio_checksum_verify(bp, data));
1252 }
1253 
1254 /*
1255  * Generate the parity from the data columns. If we tried and were able to
1256  * read the parity without error, verify that the generated parity matches the
1257  * data we read. If it doesn't, we fire off a checksum error. Return the
1258  * number such failures.
1259  */
1260 static int
1261 raidz_parity_verify(raidz_map_t *rm)
1262 {
1263 	void *orig[VDEV_RAIDZ_MAXPARITY];
1264 	int c, ret = 0;
1265 	raidz_col_t *rc;
1266 
1267 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1268 		rc = &rm->rm_col[c];
1269 		if (!rc->rc_tried || rc->rc_error != 0)
1270 			continue;
1271 		orig[c] = zfs_alloc(rc->rc_size);
1272 		bcopy(rc->rc_data, orig[c], rc->rc_size);
1273 	}
1274 
1275 	vdev_raidz_generate_parity(rm);
1276 
1277 	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
1278 		rc = &rm->rm_col[c];
1279 		if (!rc->rc_tried || rc->rc_error != 0)
1280 			continue;
1281 		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1282 			rc->rc_error = ECKSUM;
1283 			ret++;
1284 		}
1285 		zfs_free(orig[c], rc->rc_size);
1286 	}
1287 
1288 	return (ret);
1289 }
1290 
1291 /*
1292  * Iterate over all combinations of bad data and attempt a reconstruction.
1293  * Note that the algorithm below is non-optimal because it doesn't take into
1294  * account how reconstruction is actually performed. For example, with
1295  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1296  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1297  * cases we'd only use parity information in column 0.
1298  */
1299 static int
1300 vdev_raidz_combrec(raidz_map_t *rm, const blkptr_t *bp, void *data,
1301     off_t offset, uint64_t bytes, int total_errors, int data_errors)
1302 {
1303 	raidz_col_t *rc;
1304 	void *orig[VDEV_RAIDZ_MAXPARITY];
1305 	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1306 	int *tgts = &tstore[1];
1307 	int current, next, i, c, n;
1308 	int code, ret = 0;
1309 
1310 	ASSERT(total_errors < rm->rm_firstdatacol);
1311 
1312 	/*
1313 	 * This simplifies one edge condition.
1314 	 */
1315 	tgts[-1] = -1;
1316 
1317 	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1318 		/*
1319 		 * Initialize the targets array by finding the first n columns
1320 		 * that contain no error.
1321 		 *
1322 		 * If there were no data errors, we need to ensure that we're
1323 		 * always explicitly attempting to reconstruct at least one
1324 		 * data column. To do this, we simply push the highest target
1325 		 * up into the data columns.
1326 		 */
1327 		for (c = 0, i = 0; i < n; i++) {
1328 			if (i == n - 1 && data_errors == 0 &&
1329 			    c < rm->rm_firstdatacol) {
1330 				c = rm->rm_firstdatacol;
1331 			}
1332 
1333 			while (rm->rm_col[c].rc_error != 0) {
1334 				c++;
1335 				ASSERT3S(c, <, rm->rm_cols);
1336 			}
1337 
1338 			tgts[i] = c++;
1339 		}
1340 
1341 		/*
1342 		 * Setting tgts[n] simplifies the other edge condition.
1343 		 */
1344 		tgts[n] = rm->rm_cols;
1345 
1346 		/*
1347 		 * These buffers were allocated in previous iterations.
1348 		 */
1349 		for (i = 0; i < n - 1; i++) {
1350 			ASSERT(orig[i] != NULL);
1351 		}
1352 
1353 		orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);
1354 
1355 		current = 0;
1356 		next = tgts[current];
1357 
1358 		while (current != n) {
1359 			tgts[current] = next;
1360 			current = 0;
1361 
1362 			/*
1363 			 * Save off the original data that we're going to
1364 			 * attempt to reconstruct.
1365 			 */
1366 			for (i = 0; i < n; i++) {
1367 				ASSERT(orig[i] != NULL);
1368 				c = tgts[i];
1369 				ASSERT3S(c, >=, 0);
1370 				ASSERT3S(c, <, rm->rm_cols);
1371 				rc = &rm->rm_col[c];
1372 				bcopy(rc->rc_data, orig[i], rc->rc_size);
1373 			}
1374 
1375 			/*
1376 			 * Attempt a reconstruction and exit the outer loop on
1377 			 * success.
1378 			 */
1379 			code = vdev_raidz_reconstruct(rm, tgts, n);
1380 			if (raidz_checksum_verify(bp, data, bytes) == 0) {
1381 				for (i = 0; i < n; i++) {
1382 					c = tgts[i];
1383 					rc = &rm->rm_col[c];
1384 					ASSERT(rc->rc_error == 0);
1385 					rc->rc_error = ECKSUM;
1386 				}
1387 
1388 				ret = code;
1389 				goto done;
1390 			}
1391 
1392 			/*
1393 			 * Restore the original data.
1394 			 */
1395 			for (i = 0; i < n; i++) {
1396 				c = tgts[i];
1397 				rc = &rm->rm_col[c];
1398 				bcopy(orig[i], rc->rc_data, rc->rc_size);
1399 			}
1400 
1401 			do {
1402 				/*
1403 				 * Find the next valid column after the current
1404 				 * position..
1405 				 */
1406 				for (next = tgts[current] + 1;
1407 				    next < rm->rm_cols &&
1408 				    rm->rm_col[next].rc_error != 0; next++)
1409 					continue;
1410 
1411 				ASSERT(next <= tgts[current + 1]);
1412 
1413 				/*
1414 				 * If that spot is available, we're done here.
1415 				 */
1416 				if (next != tgts[current + 1])
1417 					break;
1418 
1419 				/*
1420 				 * Otherwise, find the next valid column after
1421 				 * the previous position.
1422 				 */
1423 				for (c = tgts[current - 1] + 1;
1424 				    rm->rm_col[c].rc_error != 0; c++)
1425 					continue;
1426 
1427 				tgts[current] = c;
1428 				current++;
1429 
1430 			} while (current != n);
1431 		}
1432 	}
1433 	n--;
1434 done:
1435 	for (i = n - 1; i >= 0; i--) {
1436 		zfs_free(orig[i], rm->rm_col[0].rc_size);
1437 	}
1438 
1439 	return (ret);
1440 }
1441 
1442 static int
1443 vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
1444     off_t offset, size_t bytes)
1445 {
1446 	vdev_t *tvd = vd->v_top;
1447 	vdev_t *cvd;
1448 	raidz_map_t *rm;
1449 	raidz_col_t *rc;
1450 	int c, error;
1451 	int unexpected_errors;
1452 	int parity_errors;
1453 	int parity_untried;
1454 	int data_errors;
1455 	int total_errors;
1456 	int n;
1457 	int tgts[VDEV_RAIDZ_MAXPARITY];
1458 	int code;
1459 
1460 	rc = NULL;	/* gcc */
1461 	error = 0;
1462 
1463 	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
1464 	    vd->v_nchildren, vd->v_nparity);
1465 
1466 	/*
1467 	 * Iterate over the columns in reverse order so that we hit the parity
1468 	 * last -- any errors along the way will force us to read the parity.
1469 	 */
1470 	for (c = rm->rm_cols - 1; c >= 0; c--) {
1471 		rc = &rm->rm_col[c];
1472 		cvd = vdev_child(vd, rc->rc_devidx);
1473 		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
1474 			if (c >= rm->rm_firstdatacol)
1475 				rm->rm_missingdata++;
1476 			else
1477 				rm->rm_missingparity++;
1478 			rc->rc_error = ENXIO;
1479 			rc->rc_tried = 1;	/* don't even try */
1480 			rc->rc_skipped = 1;
1481 			continue;
1482 		}
1483 #if 0		/* XXX: Too hard for the boot code. */
1484 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1485 			if (c >= rm->rm_firstdatacol)
1486 				rm->rm_missingdata++;
1487 			else
1488 				rm->rm_missingparity++;
1489 			rc->rc_error = ESTALE;
1490 			rc->rc_skipped = 1;
1491 			continue;
1492 		}
1493 #endif
1494 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
1495 			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
1496 			    rc->rc_offset, rc->rc_size);
1497 			rc->rc_tried = 1;
1498 			rc->rc_skipped = 0;
1499 		}
1500 	}
1501 
1502 reconstruct:
1503 	unexpected_errors = 0;
1504 	parity_errors = 0;
1505 	parity_untried = 0;
1506 	data_errors = 0;
1507 	total_errors = 0;
1508 
1509 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1510 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1511 
1512 	for (c = 0; c < rm->rm_cols; c++) {
1513 		rc = &rm->rm_col[c];
1514 
1515 		if (rc->rc_error) {
1516 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
1517 
1518 			if (c < rm->rm_firstdatacol)
1519 				parity_errors++;
1520 			else
1521 				data_errors++;
1522 
1523 			if (!rc->rc_skipped)
1524 				unexpected_errors++;
1525 
1526 			total_errors++;
1527 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1528 			parity_untried++;
1529 		}
1530 	}
1531 
1532 	/*
1533 	 * There are three potential phases for a read:
1534 	 *	1. produce valid data from the columns read
1535 	 *	2. read all disks and try again
1536 	 *	3. perform combinatorial reconstruction
1537 	 *
1538 	 * Each phase is progressively both more expensive and less likely to
1539 	 * occur. If we encounter more errors than we can repair or all phases
1540 	 * fail, we have no choice but to return an error.
1541 	 */
1542 
1543 	/*
1544 	 * If the number of errors we saw was correctable -- less than or equal
1545 	 * to the number of parity disks read -- attempt to produce data that
1546 	 * has a valid checksum. Naturally, this case applies in the absence of
1547 	 * any errors.
1548 	 */
1549 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1550 		if (data_errors == 0) {
1551 			if (raidz_checksum_verify(bp, data, bytes) == 0) {
1552 				/*
1553 				 * If we read parity information (unnecessarily
1554 				 * as it happens since no reconstruction was
1555 				 * needed) regenerate and verify the parity.
1556 				 * We also regenerate parity when resilvering
1557 				 * so we can write it out to the failed device
1558 				 * later.
1559 				 */
1560 				if (parity_errors + parity_untried <
1561 				    rm->rm_firstdatacol) {
1562 					n = raidz_parity_verify(rm);
1563 					unexpected_errors += n;
1564 					ASSERT(parity_errors + n <=
1565 					    rm->rm_firstdatacol);
1566 				}
1567 				goto done;
1568 			}
1569 		} else {
1570 			/*
1571 			 * We either attempt to read all the parity columns or
1572 			 * none of them. If we didn't try to read parity, we
1573 			 * wouldn't be here in the correctable case. There must
1574 			 * also have been fewer parity errors than parity
1575 			 * columns or, again, we wouldn't be in this code path.
1576 			 */
1577 			ASSERT(parity_untried == 0);
1578 			ASSERT(parity_errors < rm->rm_firstdatacol);
1579 
1580 			/*
1581 			 * Identify the data columns that reported an error.
1582 			 */
1583 			n = 0;
1584 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1585 				rc = &rm->rm_col[c];
1586 				if (rc->rc_error != 0) {
1587 					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1588 					tgts[n++] = c;
1589 				}
1590 			}
1591 
1592 			ASSERT(rm->rm_firstdatacol >= n);
1593 
1594 			code = vdev_raidz_reconstruct(rm, tgts, n);
1595 
1596 			if (raidz_checksum_verify(bp, data, bytes) == 0) {
1597 				/*
1598 				 * If we read more parity disks than were used
1599 				 * for reconstruction, confirm that the other
1600 				 * parity disks produced correct data. This
1601 				 * routine is suboptimal in that it regenerates
1602 				 * the parity that we already used in addition
1603 				 * to the parity that we're attempting to
1604 				 * verify, but this should be a relatively
1605 				 * uncommon case, and can be optimized if it
1606 				 * becomes a problem. Note that we regenerate
1607 				 * parity when resilvering so we can write it
1608 				 * out to failed devices later.
1609 				 */
1610 				if (parity_errors < rm->rm_firstdatacol - n) {
1611 					n = raidz_parity_verify(rm);
1612 					unexpected_errors += n;
1613 					ASSERT(parity_errors + n <=
1614 					    rm->rm_firstdatacol);
1615 				}
1616 
1617 				goto done;
1618 			}
1619 		}
1620 	}
1621 
1622 	/*
1623 	 * This isn't a typical situation -- either we got a read
1624 	 * error or a child silently returned bad data. Read every
1625 	 * block so we can try again with as much data and parity as
1626 	 * we can track down. If we've already been through once
1627 	 * before, all children will be marked as tried so we'll
1628 	 * proceed to combinatorial reconstruction.
1629 	 */
1630 	unexpected_errors = 1;
1631 	rm->rm_missingdata = 0;
1632 	rm->rm_missingparity = 0;
1633 
1634 	n = 0;
1635 	for (c = 0; c < rm->rm_cols; c++) {
1636 		rc = &rm->rm_col[c];
1637 
1638 		if (rc->rc_tried)
1639 			continue;
1640 
1641 		cvd = vdev_child(vd, rc->rc_devidx);
1642 		ASSERT(cvd != NULL);
1643 		rc->rc_error = cvd->v_read(cvd, NULL,
1644 		    rc->rc_data, rc->rc_offset, rc->rc_size);
1645 		if (rc->rc_error == 0)
1646 			n++;
1647 		rc->rc_tried = 1;
1648 		rc->rc_skipped = 0;
1649 	}
1650 	/*
1651 	 * If we managed to read anything more, retry the
1652 	 * reconstruction.
1653 	 */
1654 	if (n > 0)
1655 		goto reconstruct;
1656 
1657 	/*
1658 	 * At this point we've attempted to reconstruct the data given the
1659 	 * errors we detected, and we've attempted to read all columns. There
1660 	 * must, therefore, be one or more additional problems -- silent errors
1661 	 * resulting in invalid data rather than explicit I/O errors resulting
1662 	 * in absent data. We check if there is enough additional data to
1663 	 * possibly reconstruct the data and then perform combinatorial
1664 	 * reconstruction over all possible combinations. If that fails,
1665 	 * we're cooked.
1666 	 */
1667 	if (total_errors > rm->rm_firstdatacol) {
1668 		error = EIO;
1669 	} else if (total_errors < rm->rm_firstdatacol &&
1670 	    (code = vdev_raidz_combrec(rm, bp, data, offset, bytes,
1671 	     total_errors, data_errors)) != 0) {
1672 		/*
1673 		 * If we didn't use all the available parity for the
1674 		 * combinatorial reconstruction, verify that the remaining
1675 		 * parity is correct.
1676 		 */
1677 		if (code != (1 << rm->rm_firstdatacol) - 1)
1678 			(void) raidz_parity_verify(rm);
1679 	} else {
1680 		/*
1681 		 * We're here because either:
1682 		 *
1683 		 *	total_errors == rm_first_datacol, or
1684 		 *	vdev_raidz_combrec() failed
1685 		 *
1686 		 * In either case, there is enough bad data to prevent
1687 		 * reconstruction.
1688 		 *
1689 		 * Start checksum ereports for all children which haven't
1690 		 * failed, and the IO wasn't speculative.
1691 		 */
1692 		error = ECKSUM;
1693 	}
1694 
1695 done:
1696 	vdev_raidz_map_free(rm);
1697 
1698 	return (error);
1699 }
1700