xref: /freebsd/sys/cddl/boot/zfs/zfssubr.c (revision 884a2a699669ec61e2366e3e358342dbc94be24a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/cdefs.h>
27 __FBSDID("$FreeBSD$");
28 
29 static uint64_t zfs_crc64_table[256];
30 
31 #define	ECKSUM	666
32 
33 #define	ASSERT(...)	do { } while (0)
34 #define	ASSERT3U(...)	do { } while (0)
35 #define	ASSERT3S(...)	do { } while (0)
36 
37 #define	panic(...)	do {						\
38 	printf(__VA_ARGS__);						\
39 	for (;;) ;							\
40 } while (0)
41 
42 #define	kmem_alloc(size, flag)	zfs_alloc((size))
43 #define	kmem_free(ptr, size)	zfs_free((ptr), (size))
44 
45 static void
46 zfs_init_crc(void)
47 {
48 	int i, j;
49 	uint64_t *ct;
50 
51 	/*
52 	 * Calculate the crc64 table (used for the zap hash
53 	 * function).
54 	 */
55 	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
56 		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
57 		for (i = 0; i < 256; i++)
58 			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
59 				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
60 	}
61 }
62 
/*
 * "off" checksum vector: stores an all-zero checksum regardless of the
 * data.  buf and size are ignored.
 */
static void
zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}
68 
/*
 * Signature for checksum functions.
 */
typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);

/*
 * Information about each checksum function.
 * ci_func[0] is the native-byteorder implementation, ci_func[1] the
 * byteswapped one (indexed by the byteswap flag in zio_checksum_error()).
 */
typedef struct zio_checksum_info {
	zio_checksum_t	*ci_func[2]; /* checksum function for each byteorder */
	int		ci_correctable;	/* number of correctable bits	*/
	int		ci_eck;		/* uses zio embedded checksum? */
	int		ci_dedup;	/* strong enough for dedup? */
	const char	*ci_name;	/* descriptive name */
} zio_checksum_info_t;
84 
85 #include "fletcher.c"
86 #include "sha256.c"
87 
/*
 * Checksum vectors, indexed by the ZIO_CHECKSUM_* value stored in the
 * block pointer.  The index is on-disk significant, so entries may not
 * be reordered.
 */
static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL,			NULL},			0, 0, 0, "inherit"},
	{{NULL,			NULL},			0, 0, 0, "on"},
	{{zio_checksum_off,	zio_checksum_off},	0, 0, 0, "off"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 1, 0, "label"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 1, 0, "gang_header"},
	{{fletcher_2_native,	fletcher_2_byteswap},	0, 1, 0, "zilog"},
	{{fletcher_2_native,	fletcher_2_byteswap},	0, 0, 0, "fletcher2"},
	{{fletcher_4_native,	fletcher_4_byteswap},	1, 0, 0, "fletcher4"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 0, 1, "SHA256"},
	/* NOTE(review): upstream spells this name "zilog2" -- confirm */
	{{fletcher_4_native,	fletcher_4_byteswap},	0, 1, 0, "zillog2"},
};
100 
101 
/*
 * Common signature for all zio compress/decompress functions.
 * The trailing int is the per-algorithm level parameter (ci_level).
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
	zio_compress_func_t	*ci_compress;	/* compression function */
	zio_decompress_func_t	*ci_decompress;	/* decompression function */
	int			ci_level;	/* level parameter */
	const char		*ci_name;	/* algorithm name */
} zio_compress_info_t;
119 
120 #include "lzjb.c"
121 #include "zle.c"
122 
/*
 * Compression vectors, indexed by the ZIO_COMPRESS_* value stored in
 * the block pointer; entries may not be reordered.  The boot code
 * never compresses (every ci_compress is NULL) and can decompress only
 * lzjb and zle; gzip-* entries are present for index alignment but are
 * unsupported here.
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
	{NULL,			NULL,			0,	"inherit"},
	{NULL,			NULL,			0,	"on"},
	{NULL,			NULL,			0,	"uncompressed"},
	{NULL,			lzjb_decompress,	0,	"lzjb"},
	{NULL,			NULL,			0,	"empty"},
	{NULL,			NULL,			1,	"gzip-1"},
	{NULL,			NULL,			2,	"gzip-2"},
	{NULL,			NULL,			3,	"gzip-3"},
	{NULL,			NULL,			4,	"gzip-4"},
	{NULL,			NULL,			5,	"gzip-5"},
	{NULL,			NULL,			6,	"gzip-6"},
	{NULL,			NULL,			7,	"gzip-7"},
	{NULL,			NULL,			8,	"gzip-8"},
	{NULL,			NULL,			9,	"gzip-9"},
	{NULL,			zle_decompress,		64,	"zle"},
};
143 
/*
 * Byte-swap every 64-bit word of the buffer in place.  size is in
 * bytes and must be a multiple of 8.
 */
static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	size_t i;	/* size_t, not int: avoid sign/width mismatch vs count */

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}
156 
/*
 * Set the external verifier for a gang block based on <vdev, offset, txg>,
 * a tuple which is guaranteed to be unique for the life of the pool.
 */
static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
{
	const dva_t *dva = BP_IDENTITY(bp);
	uint64_t txg = BP_PHYSICAL_BIRTH(bp);

	ASSERT(BP_IS_GANG(bp));

	/* Fourth word is unused and left zero. */
	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
}
171 
/*
 * Set the external verifier for a label block based on its offset.
 * The vdev is implicit, and the txg is unknowable at pool open time --
 * hence the logic in vdev_uberblock_load() to find the most recent copy.
 */
static void
zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
{
	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
}
182 
/*
 * Verify the checksum of the data backing block pointer bp.  offset is
 * the device offset, used only to derive the verifier for label
 * blocks.  Returns 0 on success, EINVAL for an unknown or unsupported
 * checksum algorithm, and EIO on checksum mismatch.
 */
static int
zio_checksum_error(const blkptr_t *bp, void *data, uint64_t offset)
{
	/* Gang headers are always checksummed with the gang-header vector. */
	unsigned int checksum = BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp);
	uint64_t size = BP_GET_PSIZE(bp);
	zio_checksum_info_t *ci;
	zio_cksum_t actual_cksum, expected_cksum, verifier;
	int byteswap;

	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
		return (EINVAL);
	ci = &zio_checksum_table[checksum];
	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
		return (EINVAL);

	if (ci->ci_eck) {
		/*
		 * Embedded checksum: the stored checksum lives in a
		 * zio_eck_t trailer at the end of the data, and the data
		 * was checksummed with an external "verifier" value in
		 * its place.
		 */
		zio_eck_t *eck;

		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
		    checksum == ZIO_CHECKSUM_LABEL);

		eck = (zio_eck_t *)((char *)data + size) - 1;

		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
			zio_checksum_gang_verifier(&verifier, bp);
		else if (checksum == ZIO_CHECKSUM_LABEL)
			zio_checksum_label_verifier(&verifier, offset);
		else
			verifier = bp->blk_cksum;

		/*
		 * A byteswapped trailer magic means the block was written
		 * by a machine of the opposite endianness.
		 */
		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

		if (byteswap)
			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

		/*
		 * Temporarily substitute the verifier for the stored
		 * checksum while recomputing, then restore it.
		 */
		expected_cksum = eck->zec_cksum;
		eck->zec_cksum = verifier;
		ci->ci_func[byteswap](data, size, &actual_cksum);
		eck->zec_cksum = expected_cksum;

		if (byteswap)
			byteswap_uint64_array(&expected_cksum,
			    sizeof (zio_cksum_t));
	} else {
		ASSERT(!BP_IS_GANG(bp));
		expected_cksum = bp->blk_cksum;
		ci->ci_func[0](data, size, &actual_cksum);
	}

	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
		/*printf("ZFS: read checksum failed\n");*/
		return (EIO);
	}

	return (0);
}
239 
240 static int
241 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
242 	void *dest, uint64_t destsize)
243 {
244 	zio_compress_info_t *ci;
245 
246 	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
247 		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
248 		return (EIO);
249 	}
250 
251 	ci = &zio_compress_table[cpfunc];
252 	if (!ci->ci_decompress) {
253 		printf("ZFS: unsupported compression algorithm %s\n",
254 		    ci->ci_name);
255 		return (EIO);
256 	}
257 
258 	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
259 }
260 
261 static uint64_t
262 zap_hash(uint64_t salt, const char *name)
263 {
264 	const uint8_t *cp;
265 	uint8_t c;
266 	uint64_t crc = salt;
267 
268 	ASSERT(crc != 0);
269 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
270 	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
271 		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
272 
273 	/*
274 	 * Only use 28 bits, since we need 4 bits in the cookie for the
275 	 * collision differentiator.  We MUST use the high bits, since
276 	 * those are the onces that we first pay attention to when
277 	 * chosing the bucket.
278 	 */
279 	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
280 
281 	return (crc);
282 }
283 
284 static void *zfs_alloc(size_t size);
285 static void zfs_free(void *ptr, size_t size);
286 
/*
 * State for one column (one child device's slice) of a RAID-Z I/O.
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;
296 
/*
 * Layout of one logical I/O across the columns of a RAID-Z vdev.
 * Columns [0, rm_firstdatacol) are parity; the rest are data.
 */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;
312 
313 #define	VDEV_RAIDZ_P		0
314 #define	VDEV_RAIDZ_Q		1
315 #define	VDEV_RAIDZ_R		2
316 
317 #define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
318 #define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
319 
320 /*
321  * We provide a mechanism to perform the field multiplication operation on a
322  * 64-bit value all at once rather than a byte at a time. This works by
323  * creating a mask from the top bit in each byte and using that to
324  * conditionally apply the XOR of 0x1d.
325  */
326 #define	VDEV_RAIDZ_64MUL_2(x, mask) \
327 { \
328 	(mask) = (x) & 0x8080808080808080ULL; \
329 	(mask) = ((mask) << 1) - ((mask) >> 7); \
330 	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
331 	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
332 }
333 
334 #define	VDEV_RAIDZ_64MUL_4(x, mask) \
335 { \
336 	VDEV_RAIDZ_64MUL_2((x), mask); \
337 	VDEV_RAIDZ_64MUL_2((x), mask); \
338 }
339 
340 /*
341  * These two tables represent powers and logs of 2 in the Galois field defined
342  * above. These values were computed by repeatedly multiplying by 2 as above.
343  */
/* Powers of 2 in GF(2^8): vdev_raidz_pow2[i] == 2^i (period 255). */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/* Discrete logs base 2 in GF(2^8); inverse of vdev_raidz_pow2 (0 has no log). */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
412 
413 /*
414  * Multiply a given number by 2 raised to the given power.
415  */
416 static uint8_t
417 vdev_raidz_exp2(uint8_t a, int exp)
418 {
419 	if (a == 0)
420 		return (0);
421 
422 	ASSERT(exp >= 0);
423 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
424 
425 	exp += vdev_raidz_log2[a];
426 	if (exp > 255)
427 		exp -= 255;
428 
429 	return (vdev_raidz_pow2[exp]);
430 }
431 
432 static void
433 vdev_raidz_generate_parity_p(raidz_map_t *rm)
434 {
435 	uint64_t *p, *src, pcount, ccount, i;
436 	int c;
437 
438 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
439 
440 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
441 		src = rm->rm_col[c].rc_data;
442 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
443 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
444 
445 		if (c == rm->rm_firstdatacol) {
446 			ASSERT(ccount == pcount);
447 			for (i = 0; i < ccount; i++, src++, p++) {
448 				*p = *src;
449 			}
450 		} else {
451 			ASSERT(ccount <= pcount);
452 			for (i = 0; i < ccount; i++, src++, p++) {
453 				*p ^= *src;
454 			}
455 		}
456 	}
457 }
458 
/*
 * Generate P (plain XOR) and Q parity.  Q accumulates each column via
 * a GF(2^8) multiply-by-2 of the running value followed by an XOR of
 * the new data, performed 8 bytes at a time with VDEV_RAIDZ_64MUL_2.
 */
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			/* Seed both parities from the first data column. */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			/* Zero-fill parity beyond a short first column. */
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}
510 
/*
 * Generate P, Q, and R parity.  As in the PQ case, Q multiplies its
 * running value by 2 per column; R multiplies by 4 (VDEV_RAIDZ_64MUL_4)
 * before XORing in the new data, 8 bytes at a time.
 */
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			/* Seed all three parities from the first column. */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			/* Zero-fill parity beyond a short first column. */
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}
571 
572 /*
573  * Generate RAID parity in the first virtual columns according to the number of
574  * parity columns available.
575  */
576 static void
577 vdev_raidz_generate_parity(raidz_map_t *rm)
578 {
579 	switch (rm->rm_firstdatacol) {
580 	case 1:
581 		vdev_raidz_generate_parity_p(rm);
582 		break;
583 	case 2:
584 		vdev_raidz_generate_parity_pq(rm);
585 		break;
586 	case 3:
587 		vdev_raidz_generate_parity_pqr(rm);
588 		break;
589 	default:
590 		panic("invalid RAID-Z configuration");
591 	}
592 }
593 
594 /* BEGIN CSTYLED */
595 /*
596  * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
598  * the contents of the data and parity disks. This can be expressed with
599  * vectors for the original data (D) and the actual data (d) and parity (p)
600  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
601  *
602  *            __   __                     __     __
603  *            |     |         __     __   |  p_0  |
604  *            |  V  |         |  D_0  |   | p_m-1 |
605  *            |     |    x    |   :   | = |  d_0  |
606  *            |  I  |         | D_n-1 |   |   :   |
607  *            |     |         ~~     ~~   | d_n-1 |
608  *            ~~   ~~                     ~~     ~~
609  *
610  * I is simply a square identity matrix of size n, and V is a vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
612  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
613  * computation as well as linear separability.
614  *
615  *      __               __               __     __
616  *      |   1   ..  1 1 1 |               |  p_0  |
617  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
618  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
619  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
620  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
621  *      |   :       : : : |   |   :   |   |  d_2  |
622  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
623  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
624  *      |   0   ..  0 0 1 |               | d_n-1 |
625  *      ~~               ~~               ~~     ~~
626  *
627  * Note that I, V, d, and p are known. To compute D, we must invert the
628  * matrix and use the known data and parity values to reconstruct the unknown
629  * data values. We begin by removing the rows in V|I and d|p that correspond
630  * to failed or missing columns; we then make V|I square (n x n) and d|p
631  * sized n by removing rows corresponding to unused parity from the bottom up
632  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
633  * using Gauss-Jordan elimination. In the example below we use m=3 parity
634  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
635  *           __                               __
636  *           |  1   1   1   1   1   1   1   1  |
637  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
638  *           |  19 205 116  29  64  16  4   1  |      / /
639  *           |  1   0   0   0   0   0   0   0  |     / /
640  *           |  0   1   0   0   0   0   0   0  | <--' /
641  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
642  *           |  0   0   0   1   0   0   0   0  |
643  *           |  0   0   0   0   1   0   0   0  |
644  *           |  0   0   0   0   0   1   0   0  |
645  *           |  0   0   0   0   0   0   1   0  |
646  *           |  0   0   0   0   0   0   0   1  |
647  *           ~~                               ~~
648  *           __                               __
649  *           |  1   1   1   1   1   1   1   1  |
650  *           | 128  64  32  16  8   4   2   1  |
651  *           |  19 205 116  29  64  16  4   1  |
652  *           |  1   0   0   0   0   0   0   0  |
653  *           |  0   1   0   0   0   0   0   0  |
654  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
655  *           |  0   0   0   1   0   0   0   0  |
656  *           |  0   0   0   0   1   0   0   0  |
657  *           |  0   0   0   0   0   1   0   0  |
658  *           |  0   0   0   0   0   0   1   0  |
659  *           |  0   0   0   0   0   0   0   1  |
660  *           ~~                               ~~
661  *
662  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
663  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
664  * matrix is not singular.
665  * __                                                                 __
666  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
667  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
668  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
669  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
670  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
671  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
672  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
673  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
674  * ~~                                                                 ~~
675  * __                                                                 __
676  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
677  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
678  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
679  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
680  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
681  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
682  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
683  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
684  * ~~                                                                 ~~
685  * __                                                                 __
686  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
687  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
688  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
689  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
690  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
691  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
692  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
693  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
694  * ~~                                                                 ~~
695  * __                                                                 __
696  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
697  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
698  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
699  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
700  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
701  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
702  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
703  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
704  * ~~                                                                 ~~
705  * __                                                                 __
706  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
707  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
708  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
709  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
710  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
711  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
712  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
713  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
714  * ~~                                                                 ~~
715  * __                                                                 __
716  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
717  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
718  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
719  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
720  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
721  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
722  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
723  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
724  * ~~                                                                 ~~
725  *                   __                               __
726  *                   |  0   0   1   0   0   0   0   0  |
727  *                   | 167 100  5   41 159 169 217 208 |
728  *                   | 166 100  4   40 158 168 216 209 |
729  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
730  *                   |  0   0   0   0   1   0   0   0  |
731  *                   |  0   0   0   0   0   1   0   0  |
732  *                   |  0   0   0   0   0   0   1   0  |
733  *                   |  0   0   0   0   0   0   0   1  |
734  *                   ~~                               ~~
735  *
736  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
737  * of the missing data.
738  *
739  * As is apparent from the example above, the only non-trivial rows in the
740  * inverse matrix correspond to the data disks that we're trying to
741  * reconstruct. Indeed, those are the only rows we need as the others would
742  * only be useful for reconstructing data known or assumed to be valid. For
743  * that reason, we only build the coefficients in the rows that correspond to
744  * targeted columns.
745  */
746 /* END CSTYLED */
747 
/*
 * Fill in the coefficient rows of the reconstruction matrix for the
 * parity rows listed in map[] (0 = P, 1 = Q, 2 = R).  Row i is built
 * so that rows[i][j] = 2^(map[i] * (n - 1 - j)) in GF(2^8), computed
 * incrementally by stepping the exponent down by map[i] per column.
 */
static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		/* Start at exponent map[i] * n, brought into table range. */
		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			/* Decrease the exponent by map[i], wrapping mod 255. */
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
777 
/*
 * Invert the nmissing rows of interest of the (V|I)' matrix via
 * Gauss-Jordan elimination in GF(2^8), leaving the result in invrows.
 * rows[] holds the coefficient rows from vdev_raidz_matrix_init();
 * used[] lists the n surviving columns (parity first, then data);
 * missing[] gives the data-column index each row reconstructs.
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.  In GF(2^8) the inverse
		 * of x is 2^(255 - log2(x)).
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		/* Eliminate this pivot column from every other row. */
		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}
871 
/*
 * Recompute the missing data columns by multiplying each surviving column
 * (listed in used[]) by the appropriate entries of the inverse matrix
 * computed in vdev_raidz_matrix_invert() and accumulating the products.
 *
 * GF(2^8) multiplication is done via the log/pow tables:
 * log(a*b) = (log(a) + log(b)) mod 255.  The logs of the inverse-matrix
 * entries are precomputed into invlog[] so the inner loop only does one
 * table lookup per product.
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	log = 0;	/* gcc */
	/* One n-byte log row per missing column, carved from one allocation. */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = zfs_alloc(psize);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	/* Precompute the logs of all inverse-matrix coefficients. */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	/* Accumulate each surviving column into the missing columns. */
	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				/* Destination columns may be shorter. */
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					/* 0 times anything is 0. */
					val = 0;
				} else {
					/* Multiply via log addition mod 255. */
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/*
				 * The first used column initializes the
				 * accumulator; subsequent ones XOR into it.
				 */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	zfs_free(p, psize);
}
947 
/*
 * General RAID-Z reconstruction: rebuild the data columns listed in
 * tgts[] (ntgts entries, sorted) using matrix inversion over GF(2^8).
 *
 * Returns a bitmask ("code") with bit c set for each parity column c that
 * was used in the reconstruction; the caller uses this to decide which
 * remaining parity still needs verification.
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;


	/* Matrix dimension: the number of data columns. */
	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.  One healthy parity column is needed per missing
	 * data column.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/*
	 * Carve rows[], invrows[] (interleaved, n bytes each per missing
	 * row) and the trailing used[] array out of a single allocation.
	 */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	/* used[] lists the selected parity columns first ... */
	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	/*
	 * ... followed by the surviving (non-missing) data columns.  Note
	 * that `i` deliberately continues from the loop above.
	 */
	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}
1053 
1054 static int
1055 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1056 {
1057 	int tgts[VDEV_RAIDZ_MAXPARITY];
1058 	int ntgts;
1059 	int i, c;
1060 	int code;
1061 	int nbadparity, nbaddata;
1062 
1063 	/*
1064 	 * The tgts list must already be sorted.
1065 	 */
1066 	for (i = 1; i < nt; i++) {
1067 		ASSERT(t[i] > t[i - 1]);
1068 	}
1069 
1070 	nbadparity = rm->rm_firstdatacol;
1071 	nbaddata = rm->rm_cols - nbadparity;
1072 	ntgts = 0;
1073 	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1074 		if (i < nt && c == t[i]) {
1075 			tgts[ntgts++] = c;
1076 			i++;
1077 		} else if (rm->rm_col[c].rc_error != 0) {
1078 			tgts[ntgts++] = c;
1079 		} else if (c >= rm->rm_firstdatacol) {
1080 			nbaddata--;
1081 		} else {
1082 			nbadparity--;
1083 		}
1084 	}
1085 
1086 	ASSERT(ntgts >= nt);
1087 	ASSERT(nbaddata >= 0);
1088 	ASSERT(nbaddata + nbadparity == ntgts);
1089 
1090 	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1091 	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1092 	ASSERT(code > 0);
1093 	return (code);
1094 }
1095 
/*
 * Lay out a RAID-Z map for a logical I/O of `size` bytes at `offset`
 * across `dcols` child devices with `nparity` parity columns.  The map
 * records, per column, the child device index, device offset, size, and
 * data pointer.  Parity columns get their own buffers; data columns alias
 * consecutive slices of the caller's `data` buffer.
 */
static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/* q = complete stripes spanning all data columns. */
	q = s / (dcols - nparity);
	/* r = data sectors left over in the final, partial stripe. */
	r = s - q * (dcols - nparity);
	/* bc = number of "big" columns that carry one extra sector. */
	bc = (r == 0 ? 0 : r + nparity);
	/* tot = total sectors including parity. */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	if (q == 0) {
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	/* Assign each column its device, offset, and size. */
	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		/* Wrap around to the first device, one sector further in. */
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	/* Parity columns own their buffers; data columns alias `data`. */
	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);

	rm->rm_col[c].rc_data = data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
1213 
1214 static void
1215 vdev_raidz_map_free(raidz_map_t *rm)
1216 {
1217 	int c;
1218 	size_t size;
1219 
1220 	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
1221 		zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
1222 
1223 	size = 0;
1224 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
1225 		size += rm->rm_col[c].rc_size;
1226 
1227 	zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
1228 }
1229 
1230 static vdev_t *
1231 vdev_child(vdev_t *pvd, uint64_t devidx)
1232 {
1233 	vdev_t *cvd;
1234 
1235 	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
1236 		if (cvd->v_id == devidx)
1237 			break;
1238 	}
1239 
1240 	return (cvd);
1241 }
1242 
1243 /*
1244  * We keep track of whether or not there were any injected errors, so that
1245  * any ereports we generate can note it.
1246  */
1247 static int
1248 raidz_checksum_verify(const blkptr_t *bp, void *data)
1249 {
1250 
1251 	return (zio_checksum_error(bp, data, 0));
1252 }
1253 
1254 /*
1255  * Generate the parity from the data columns. If we tried and were able to
1256  * read the parity without error, verify that the generated parity matches the
1257  * data we read. If it doesn't, we fire off a checksum error. Return the
1258  * number such failures.
1259  */
1260 static int
1261 raidz_parity_verify(raidz_map_t *rm)
1262 {
1263 	void *orig[VDEV_RAIDZ_MAXPARITY];
1264 	int c, ret = 0;
1265 	raidz_col_t *rc;
1266 
1267 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1268 		rc = &rm->rm_col[c];
1269 		if (!rc->rc_tried || rc->rc_error != 0)
1270 			continue;
1271 		orig[c] = zfs_alloc(rc->rc_size);
1272 		bcopy(rc->rc_data, orig[c], rc->rc_size);
1273 	}
1274 
1275 	vdev_raidz_generate_parity(rm);
1276 
1277 	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
1278 		rc = &rm->rm_col[c];
1279 		if (!rc->rc_tried || rc->rc_error != 0)
1280 			continue;
1281 		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1282 			rc->rc_error = ECKSUM;
1283 			ret++;
1284 		}
1285 		zfs_free(orig[c], rc->rc_size);
1286 	}
1287 
1288 	return (ret);
1289 }
1290 
1291 /*
1292  * Iterate over all combinations of bad data and attempt a reconstruction.
1293  * Note that the algorithm below is non-optimal because it doesn't take into
1294  * account how reconstruction is actually performed. For example, with
1295  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1296  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1297  * cases we'd only use parity information in column 0.
1298  */
1299 static int
1300 vdev_raidz_combrec(raidz_map_t *rm, const blkptr_t *bp, void *data,
1301     off_t offset, int total_errors, int data_errors)
1302 {
1303 	raidz_col_t *rc;
1304 	void *orig[VDEV_RAIDZ_MAXPARITY];
1305 	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1306 	int *tgts = &tstore[1];
1307 	int current, next, i, c, n;
1308 	int code, ret = 0;
1309 
1310 	ASSERT(total_errors < rm->rm_firstdatacol);
1311 
1312 	/*
1313 	 * This simplifies one edge condition.
1314 	 */
1315 	tgts[-1] = -1;
1316 
1317 	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1318 		/*
1319 		 * Initialize the targets array by finding the first n columns
1320 		 * that contain no error.
1321 		 *
1322 		 * If there were no data errors, we need to ensure that we're
1323 		 * always explicitly attempting to reconstruct at least one
1324 		 * data column. To do this, we simply push the highest target
1325 		 * up into the data columns.
1326 		 */
1327 		for (c = 0, i = 0; i < n; i++) {
1328 			if (i == n - 1 && data_errors == 0 &&
1329 			    c < rm->rm_firstdatacol) {
1330 				c = rm->rm_firstdatacol;
1331 			}
1332 
1333 			while (rm->rm_col[c].rc_error != 0) {
1334 				c++;
1335 				ASSERT3S(c, <, rm->rm_cols);
1336 			}
1337 
1338 			tgts[i] = c++;
1339 		}
1340 
1341 		/*
1342 		 * Setting tgts[n] simplifies the other edge condition.
1343 		 */
1344 		tgts[n] = rm->rm_cols;
1345 
1346 		/*
1347 		 * These buffers were allocated in previous iterations.
1348 		 */
1349 		for (i = 0; i < n - 1; i++) {
1350 			ASSERT(orig[i] != NULL);
1351 		}
1352 
1353 		orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);
1354 
1355 		current = 0;
1356 		next = tgts[current];
1357 
1358 		while (current != n) {
1359 			tgts[current] = next;
1360 			current = 0;
1361 
1362 			/*
1363 			 * Save off the original data that we're going to
1364 			 * attempt to reconstruct.
1365 			 */
1366 			for (i = 0; i < n; i++) {
1367 				ASSERT(orig[i] != NULL);
1368 				c = tgts[i];
1369 				ASSERT3S(c, >=, 0);
1370 				ASSERT3S(c, <, rm->rm_cols);
1371 				rc = &rm->rm_col[c];
1372 				bcopy(rc->rc_data, orig[i], rc->rc_size);
1373 			}
1374 
1375 			/*
1376 			 * Attempt a reconstruction and exit the outer loop on
1377 			 * success.
1378 			 */
1379 			code = vdev_raidz_reconstruct(rm, tgts, n);
1380 			if (raidz_checksum_verify(bp, data) == 0) {
1381 				for (i = 0; i < n; i++) {
1382 					c = tgts[i];
1383 					rc = &rm->rm_col[c];
1384 					ASSERT(rc->rc_error == 0);
1385 					rc->rc_error = ECKSUM;
1386 				}
1387 
1388 				ret = code;
1389 				goto done;
1390 			}
1391 
1392 			/*
1393 			 * Restore the original data.
1394 			 */
1395 			for (i = 0; i < n; i++) {
1396 				c = tgts[i];
1397 				rc = &rm->rm_col[c];
1398 				bcopy(orig[i], rc->rc_data, rc->rc_size);
1399 			}
1400 
1401 			do {
1402 				/*
1403 				 * Find the next valid column after the current
1404 				 * position..
1405 				 */
1406 				for (next = tgts[current] + 1;
1407 				    next < rm->rm_cols &&
1408 				    rm->rm_col[next].rc_error != 0; next++)
1409 					continue;
1410 
1411 				ASSERT(next <= tgts[current + 1]);
1412 
1413 				/*
1414 				 * If that spot is available, we're done here.
1415 				 */
1416 				if (next != tgts[current + 1])
1417 					break;
1418 
1419 				/*
1420 				 * Otherwise, find the next valid column after
1421 				 * the previous position.
1422 				 */
1423 				for (c = tgts[current - 1] + 1;
1424 				    rm->rm_col[c].rc_error != 0; c++)
1425 					continue;
1426 
1427 				tgts[current] = c;
1428 				current++;
1429 
1430 			} while (current != n);
1431 		}
1432 	}
1433 	n--;
1434 done:
1435 	for (i = n - 1; i >= 0; i--) {
1436 		zfs_free(orig[i], rm->rm_col[0].rc_size);
1437 	}
1438 
1439 	return (ret);
1440 }
1441 
1442 static int
1443 vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
1444     off_t offset, size_t bytes)
1445 {
1446 	vdev_t *tvd = vd->v_top;
1447 	vdev_t *cvd;
1448 	raidz_map_t *rm;
1449 	raidz_col_t *rc;
1450 	int c, error;
1451 	int unexpected_errors;
1452 	int parity_errors;
1453 	int parity_untried;
1454 	int data_errors;
1455 	int total_errors;
1456 	int n;
1457 	int tgts[VDEV_RAIDZ_MAXPARITY];
1458 	int code;
1459 
1460 	rc = NULL;	/* gcc */
1461 	error = 0;
1462 
1463 	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
1464 	    vd->v_nchildren, vd->v_nparity);
1465 
1466 	/*
1467 	 * Iterate over the columns in reverse order so that we hit the parity
1468 	 * last -- any errors along the way will force us to read the parity.
1469 	 */
1470 	for (c = rm->rm_cols - 1; c >= 0; c--) {
1471 		rc = &rm->rm_col[c];
1472 		cvd = vdev_child(vd, rc->rc_devidx);
1473 		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
1474 			if (c >= rm->rm_firstdatacol)
1475 				rm->rm_missingdata++;
1476 			else
1477 				rm->rm_missingparity++;
1478 			rc->rc_error = ENXIO;
1479 			rc->rc_tried = 1;	/* don't even try */
1480 			rc->rc_skipped = 1;
1481 			continue;
1482 		}
1483 #if 0		/* XXX: Too hard for the boot code. */
1484 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1485 			if (c >= rm->rm_firstdatacol)
1486 				rm->rm_missingdata++;
1487 			else
1488 				rm->rm_missingparity++;
1489 			rc->rc_error = ESTALE;
1490 			rc->rc_skipped = 1;
1491 			continue;
1492 		}
1493 #endif
1494 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
1495 			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
1496 			    rc->rc_offset, rc->rc_size);
1497 			rc->rc_tried = 1;
1498 			rc->rc_skipped = 0;
1499 		}
1500 	}
1501 
1502 reconstruct:
1503 	unexpected_errors = 0;
1504 	parity_errors = 0;
1505 	parity_untried = 0;
1506 	data_errors = 0;
1507 	total_errors = 0;
1508 
1509 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1510 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1511 
1512 	for (c = 0; c < rm->rm_cols; c++) {
1513 		rc = &rm->rm_col[c];
1514 
1515 		if (rc->rc_error) {
1516 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
1517 
1518 			if (c < rm->rm_firstdatacol)
1519 				parity_errors++;
1520 			else
1521 				data_errors++;
1522 
1523 			if (!rc->rc_skipped)
1524 				unexpected_errors++;
1525 
1526 			total_errors++;
1527 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1528 			parity_untried++;
1529 		}
1530 	}
1531 
1532 	/*
1533 	 * There are three potential phases for a read:
1534 	 *	1. produce valid data from the columns read
1535 	 *	2. read all disks and try again
1536 	 *	3. perform combinatorial reconstruction
1537 	 *
1538 	 * Each phase is progressively both more expensive and less likely to
1539 	 * occur. If we encounter more errors than we can repair or all phases
1540 	 * fail, we have no choice but to return an error.
1541 	 */
1542 
1543 	/*
1544 	 * If the number of errors we saw was correctable -- less than or equal
1545 	 * to the number of parity disks read -- attempt to produce data that
1546 	 * has a valid checksum. Naturally, this case applies in the absence of
1547 	 * any errors.
1548 	 */
1549 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1550 		if (data_errors == 0) {
1551 			if (raidz_checksum_verify(bp, data) == 0) {
1552 				/*
1553 				 * If we read parity information (unnecessarily
1554 				 * as it happens since no reconstruction was
1555 				 * needed) regenerate and verify the parity.
1556 				 * We also regenerate parity when resilvering
1557 				 * so we can write it out to the failed device
1558 				 * later.
1559 				 */
1560 				if (parity_errors + parity_untried <
1561 				    rm->rm_firstdatacol) {
1562 					n = raidz_parity_verify(rm);
1563 					unexpected_errors += n;
1564 					ASSERT(parity_errors + n <=
1565 					    rm->rm_firstdatacol);
1566 				}
1567 				goto done;
1568 			}
1569 		} else {
1570 			/*
1571 			 * We either attempt to read all the parity columns or
1572 			 * none of them. If we didn't try to read parity, we
1573 			 * wouldn't be here in the correctable case. There must
1574 			 * also have been fewer parity errors than parity
1575 			 * columns or, again, we wouldn't be in this code path.
1576 			 */
1577 			ASSERT(parity_untried == 0);
1578 			ASSERT(parity_errors < rm->rm_firstdatacol);
1579 
1580 			/*
1581 			 * Identify the data columns that reported an error.
1582 			 */
1583 			n = 0;
1584 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1585 				rc = &rm->rm_col[c];
1586 				if (rc->rc_error != 0) {
1587 					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1588 					tgts[n++] = c;
1589 				}
1590 			}
1591 
1592 			ASSERT(rm->rm_firstdatacol >= n);
1593 
1594 			code = vdev_raidz_reconstruct(rm, tgts, n);
1595 
1596 			if (raidz_checksum_verify(bp, data) == 0) {
1597 				/*
1598 				 * If we read more parity disks than were used
1599 				 * for reconstruction, confirm that the other
1600 				 * parity disks produced correct data. This
1601 				 * routine is suboptimal in that it regenerates
1602 				 * the parity that we already used in addition
1603 				 * to the parity that we're attempting to
1604 				 * verify, but this should be a relatively
1605 				 * uncommon case, and can be optimized if it
1606 				 * becomes a problem. Note that we regenerate
1607 				 * parity when resilvering so we can write it
1608 				 * out to failed devices later.
1609 				 */
1610 				if (parity_errors < rm->rm_firstdatacol - n) {
1611 					n = raidz_parity_verify(rm);
1612 					unexpected_errors += n;
1613 					ASSERT(parity_errors + n <=
1614 					    rm->rm_firstdatacol);
1615 				}
1616 
1617 				goto done;
1618 			}
1619 		}
1620 	}
1621 
1622 	/*
1623 	 * This isn't a typical situation -- either we got a read
1624 	 * error or a child silently returned bad data. Read every
1625 	 * block so we can try again with as much data and parity as
1626 	 * we can track down. If we've already been through once
1627 	 * before, all children will be marked as tried so we'll
1628 	 * proceed to combinatorial reconstruction.
1629 	 */
1630 	unexpected_errors = 1;
1631 	rm->rm_missingdata = 0;
1632 	rm->rm_missingparity = 0;
1633 
1634 	n = 0;
1635 	for (c = 0; c < rm->rm_cols; c++) {
1636 		if (rm->rm_col[c].rc_tried)
1637 			continue;
1638 
1639 		cvd = vdev_child(vd, rc->rc_devidx);
1640 		ASSERT(cvd != NULL);
1641 		rc->rc_error = cvd->v_read(cvd, NULL,
1642 		    rc->rc_data, rc->rc_offset, rc->rc_size);
1643 		if (rc->rc_error == 0)
1644 			n++;
1645 		rc->rc_tried = 1;
1646 		rc->rc_skipped = 0;
1647 	}
1648 	/*
1649 	 * If we managed to read anything more, retry the
1650 	 * reconstruction.
1651 	 */
1652 	if (n > 0)
1653 		goto reconstruct;
1654 
1655 	/*
1656 	 * At this point we've attempted to reconstruct the data given the
1657 	 * errors we detected, and we've attempted to read all columns. There
1658 	 * must, therefore, be one or more additional problems -- silent errors
1659 	 * resulting in invalid data rather than explicit I/O errors resulting
1660 	 * in absent data. We check if there is enough additional data to
1661 	 * possibly reconstruct the data and then perform combinatorial
1662 	 * reconstruction over all possible combinations. If that fails,
1663 	 * we're cooked.
1664 	 */
1665 	if (total_errors > rm->rm_firstdatacol) {
1666 		error = EIO;
1667 	} else if (total_errors < rm->rm_firstdatacol &&
1668 	    (code = vdev_raidz_combrec(rm, bp, data, offset, total_errors,
1669 	     data_errors)) != 0) {
1670 		/*
1671 		 * If we didn't use all the available parity for the
1672 		 * combinatorial reconstruction, verify that the remaining
1673 		 * parity is correct.
1674 		 */
1675 		if (code != (1 << rm->rm_firstdatacol) - 1)
1676 			(void) raidz_parity_verify(rm);
1677 	} else {
1678 		/*
1679 		 * We're here because either:
1680 		 *
1681 		 *	total_errors == rm_first_datacol, or
1682 		 *	vdev_raidz_combrec() failed
1683 		 *
1684 		 * In either case, there is enough bad data to prevent
1685 		 * reconstruction.
1686 		 *
1687 		 * Start checksum ereports for all children which haven't
1688 		 * failed, and the IO wasn't speculative.
1689 		 */
1690 		error = ECKSUM;
1691 	}
1692 
1693 done:
1694 	vdev_raidz_map_free(rm);
1695 
1696 	return (error);
1697 }
1698