xref: /freebsd/sys/cddl/boot/zfs/zfssubr.c (revision 7750ad47a9a7dbc83f87158464170c8640723293)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/cdefs.h>
27 __FBSDID("$FreeBSD$");
28 
29 static uint64_t zfs_crc64_table[256];
30 
31 #define	ECKSUM	666
32 
33 #define	ASSERT(...)	do { } while (0)
34 #define	ASSERT3U(...)	do { } while (0)
35 #define	ASSERT3S(...)	do { } while (0)
36 
37 #define	panic(...)	do {						\
38 	printf(__VA_ARGS__);						\
39 	for (;;) ;							\
40 } while (0)
41 
42 #define	kmem_alloc(size, flag)	zfs_alloc((size))
43 #define	kmem_free(ptr, size)	zfs_free((ptr), (size))
44 
45 static void
46 zfs_init_crc(void)
47 {
48 	int i, j;
49 	uint64_t *ct;
50 
51 	/*
52 	 * Calculate the crc64 table (used for the zap hash
53 	 * function).
54 	 */
55 	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
56 		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
57 		for (i = 0; i < 256; i++)
58 			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
59 				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
60 	}
61 }
62 
63 static void
64 zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
65 {
66 	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
67 }
68 
69 /*
70  * Signature for checksum functions.
71  */
72 typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);
73 
74 /*
75  * Information about each checksum function.
76  */
77 typedef struct zio_checksum_info {
78 	zio_checksum_t	*ci_func[2]; /* checksum function for each byteorder */
79 	int		ci_correctable;	/* number of correctable bits	*/
80 	int		ci_eck;		/* uses zio embedded checksum? */
81 	int		ci_dedup;	/* strong enough for dedup? */
82 	const char	*ci_name;	/* descriptive name */
83 } zio_checksum_info_t;
84 
85 #include "fletcher.c"
86 #include "sha256.c"
87 
88 static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
89 	{{NULL,			NULL},			0, 0, 0, "inherit"},
90 	{{NULL,			NULL},			0, 0, 0, "on"},
91 	{{zio_checksum_off,	zio_checksum_off},	0, 0, 0, "off"},
92 	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 1, 0, "label"},
93 	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 1, 0, "gang_header"},
94 	{{fletcher_2_native,	fletcher_2_byteswap},	0, 1, 0, "zilog"},
95 	{{fletcher_2_native,	fletcher_2_byteswap},	0, 0, 0, "fletcher2"},
96 	{{fletcher_4_native,	fletcher_4_byteswap},	1, 0, 0, "fletcher4"},
97 	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 0, 1, "SHA256"},
98 	{{fletcher_4_native,	fletcher_4_byteswap},	0, 1, 0, "zillog2"},
99 };
100 
101 
102 /*
103  * Common signature for all zio compress/decompress functions.
104  */
105 typedef size_t zio_compress_func_t(void *src, void *dst,
106     size_t s_len, size_t d_len, int);
107 typedef int zio_decompress_func_t(void *src, void *dst,
108     size_t s_len, size_t d_len, int);
109 
110 /*
111  * Information about each compression function.
112  */
113 typedef struct zio_compress_info {
114 	zio_compress_func_t	*ci_compress;	/* compression function */
115 	zio_decompress_func_t	*ci_decompress;	/* decompression function */
116 	int			ci_level;	/* level parameter */
117 	const char		*ci_name;	/* algorithm name */
118 } zio_compress_info_t;
119 
120 #include "lzjb.c"
121 #include "zle.c"
122 
123 /*
124  * Compression vectors.
125  */
126 static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
127 	{NULL,			NULL,			0,	"inherit"},
128 	{NULL,			NULL,			0,	"on"},
129 	{NULL,			NULL,			0,	"uncompressed"},
130 	{NULL,			lzjb_decompress,	0,	"lzjb"},
131 	{NULL,			NULL,			0,	"empty"},
132 	{NULL,			NULL,			1,	"gzip-1"},
133 	{NULL,			NULL,			2,	"gzip-2"},
134 	{NULL,			NULL,			3,	"gzip-3"},
135 	{NULL,			NULL,			4,	"gzip-4"},
136 	{NULL,			NULL,			5,	"gzip-5"},
137 	{NULL,			NULL,			6,	"gzip-6"},
138 	{NULL,			NULL,			7,	"gzip-7"},
139 	{NULL,			NULL,			8,	"gzip-8"},
140 	{NULL,			NULL,			9,	"gzip-9"},
141 	{NULL,			zle_decompress,		64,	"zle"},
142 };
143 
144 static void
145 byteswap_uint64_array(void *vbuf, size_t size)
146 {
147 	uint64_t *buf = vbuf;
148 	size_t count = size >> 3;
149 	int i;
150 
151 	ASSERT((size & 7) == 0);
152 
153 	for (i = 0; i < count; i++)
154 		buf[i] = BSWAP_64(buf[i]);
155 }
156 
157 /*
158  * Set the external verifier for a gang block based on <vdev, offset, txg>,
159  * a tuple which is guaranteed to be unique for the life of the pool.
160  */
161 static void
162 zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
163 {
164 	const dva_t *dva = BP_IDENTITY(bp);
165 	uint64_t txg = BP_PHYSICAL_BIRTH(bp);
166 
167 	ASSERT(BP_IS_GANG(bp));
168 
169 	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
170 }
171 
172 /*
173  * Set the external verifier for a label block based on its offset.
174  * The vdev is implicit, and the txg is unknowable at pool open time --
175  * hence the logic in vdev_uberblock_load() to find the most recent copy.
176  */
177 static void
178 zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
179 {
180 	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
181 }
182 
183 static int
184 zio_checksum_verify(const blkptr_t *bp, void *data)
185 {
186 	uint64_t size;
187 	unsigned int checksum;
188 	zio_checksum_info_t *ci;
189 	zio_cksum_t actual_cksum, expected_cksum, verifier;
190 	int byteswap;
191 
192 	checksum = BP_GET_CHECKSUM(bp);
193 	size = BP_GET_PSIZE(bp);
194 
195 	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
196 		return (EINVAL);
197 	ci = &zio_checksum_table[checksum];
198 	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
199 		return (EINVAL);
200 
201 	if (ci->ci_eck) {
202 		zio_eck_t *eck;
203 
204 		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
205 		    checksum == ZIO_CHECKSUM_LABEL);
206 
207 		eck = (zio_eck_t *)((char *)data + size) - 1;
208 
209 		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
210 			zio_checksum_gang_verifier(&verifier, bp);
211 		else if (checksum == ZIO_CHECKSUM_LABEL)
212 			zio_checksum_label_verifier(&verifier,
213 			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
214 		else
215 			verifier = bp->blk_cksum;
216 
217 		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
218 
219 		if (byteswap)
220 			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
221 
222 		expected_cksum = eck->zec_cksum;
223 		eck->zec_cksum = verifier;
224 		ci->ci_func[byteswap](data, size, &actual_cksum);
225 		eck->zec_cksum = expected_cksum;
226 
227 		if (byteswap)
228 			byteswap_uint64_array(&expected_cksum,
229 			    sizeof (zio_cksum_t));
230 	} else {
231 		expected_cksum = bp->blk_cksum;
232 		ci->ci_func[0](data, size, &actual_cksum);
233 	}
234 
235 	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
236 		/*printf("ZFS: read checksum failed\n");*/
237 		return (EIO);
238 	}
239 
240 	return (0);
241 }
242 
243 static int
244 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
245 	void *dest, uint64_t destsize)
246 {
247 	zio_compress_info_t *ci;
248 
249 	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
250 		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
251 		return (EIO);
252 	}
253 
254 	ci = &zio_compress_table[cpfunc];
255 	if (!ci->ci_decompress) {
256 		printf("ZFS: unsupported compression algorithm %s\n",
257 		    ci->ci_name);
258 		return (EIO);
259 	}
260 
261 	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
262 }
263 
264 static uint64_t
265 zap_hash(uint64_t salt, const char *name)
266 {
267 	const uint8_t *cp;
268 	uint8_t c;
269 	uint64_t crc = salt;
270 
271 	ASSERT(crc != 0);
272 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
273 	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
274 		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
275 
276 	/*
277 	 * Only use 28 bits, since we need 4 bits in the cookie for the
278 	 * collision differentiator.  We MUST use the high bits, since
279 	 * those are the onces that we first pay attention to when
280 	 * chosing the bucket.
281 	 */
282 	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
283 
284 	return (crc);
285 }
286 
287 static void *zfs_alloc(size_t size);
288 static void zfs_free(void *ptr, size_t size);
289 
290 typedef struct raidz_col {
291 	uint64_t rc_devidx;		/* child device index for I/O */
292 	uint64_t rc_offset;		/* device offset */
293 	uint64_t rc_size;		/* I/O size */
294 	void *rc_data;			/* I/O data */
295 	int rc_error;			/* I/O error for this device */
296 	uint8_t rc_tried;		/* Did we attempt this I/O column? */
297 	uint8_t rc_skipped;		/* Did we skip this I/O column? */
298 } raidz_col_t;
299 
300 typedef struct raidz_map {
301 	uint64_t rm_cols;		/* Regular column count */
302 	uint64_t rm_scols;		/* Count including skipped columns */
303 	uint64_t rm_bigcols;		/* Number of oversized columns */
304 	uint64_t rm_asize;		/* Actual total I/O size */
305 	uint64_t rm_missingdata;	/* Count of missing data devices */
306 	uint64_t rm_missingparity;	/* Count of missing parity devices */
307 	uint64_t rm_firstdatacol;	/* First data column/parity count */
308 	uint64_t rm_nskip;		/* Skipped sectors for padding */
309 	uint64_t rm_skipstart;		/* Column index of padding start */
310 	uintptr_t rm_reports;		/* # of referencing checksum reports */
311 	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
312 	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
313 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
314 } raidz_map_t;
315 
316 #define	VDEV_RAIDZ_P		0
317 #define	VDEV_RAIDZ_Q		1
318 #define	VDEV_RAIDZ_R		2
319 
320 #define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
321 #define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
322 
323 /*
324  * We provide a mechanism to perform the field multiplication operation on a
325  * 64-bit value all at once rather than a byte at a time. This works by
326  * creating a mask from the top bit in each byte and using that to
327  * conditionally apply the XOR of 0x1d.
328  */
329 #define	VDEV_RAIDZ_64MUL_2(x, mask) \
330 { \
331 	(mask) = (x) & 0x8080808080808080ULL; \
332 	(mask) = ((mask) << 1) - ((mask) >> 7); \
333 	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
334 	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
335 }
336 
337 #define	VDEV_RAIDZ_64MUL_4(x, mask) \
338 { \
339 	VDEV_RAIDZ_64MUL_2((x), mask); \
340 	VDEV_RAIDZ_64MUL_2((x), mask); \
341 }
342 
343 /*
344  * These two tables represent powers and logs of 2 in the Galois field defined
345  * above. These values were computed by repeatedly multiplying by 2 as above.
346  */
347 static const uint8_t vdev_raidz_pow2[256] = {
348 	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
349 	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
350 	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
351 	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
352 	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
353 	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
354 	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
355 	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
356 	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
357 	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
358 	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
359 	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
360 	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
361 	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
362 	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
363 	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
364 	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
365 	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
366 	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
367 	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
368 	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
369 	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
370 	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
371 	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
372 	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
373 	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
374 	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
375 	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
376 	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
377 	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
378 	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
379 	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
380 };
381 static const uint8_t vdev_raidz_log2[256] = {
382 	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
383 	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
384 	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
385 	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
386 	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
387 	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
388 	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
389 	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
390 	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
391 	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
392 	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
393 	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
394 	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
395 	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
396 	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
397 	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
398 	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
399 	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
400 	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
401 	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
402 	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
403 	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
404 	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
405 	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
406 	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
407 	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
408 	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
409 	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
410 	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
411 	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
412 	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
413 	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
414 };
415 
416 /*
417  * Multiply a given number by 2 raised to the given power.
418  */
419 static uint8_t
420 vdev_raidz_exp2(uint8_t a, int exp)
421 {
422 	if (a == 0)
423 		return (0);
424 
425 	ASSERT(exp >= 0);
426 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
427 
428 	exp += vdev_raidz_log2[a];
429 	if (exp > 255)
430 		exp -= 255;
431 
432 	return (vdev_raidz_pow2[exp]);
433 }
434 
435 static void
436 vdev_raidz_generate_parity_p(raidz_map_t *rm)
437 {
438 	uint64_t *p, *src, pcount, ccount, i;
439 	int c;
440 
441 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
442 
443 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
444 		src = rm->rm_col[c].rc_data;
445 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
446 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
447 
448 		if (c == rm->rm_firstdatacol) {
449 			ASSERT(ccount == pcount);
450 			for (i = 0; i < ccount; i++, src++, p++) {
451 				*p = *src;
452 			}
453 		} else {
454 			ASSERT(ccount <= pcount);
455 			for (i = 0; i < ccount; i++, src++, p++) {
456 				*p ^= *src;
457 			}
458 		}
459 	}
460 }
461 
462 static void
463 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
464 {
465 	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
466 	int c;
467 
468 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
469 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
470 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
471 
472 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
473 		src = rm->rm_col[c].rc_data;
474 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
475 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
476 
477 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
478 
479 		if (c == rm->rm_firstdatacol) {
480 			ASSERT(ccnt == pcnt || ccnt == 0);
481 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
482 				*p = *src;
483 				*q = *src;
484 			}
485 			for (; i < pcnt; i++, src++, p++, q++) {
486 				*p = 0;
487 				*q = 0;
488 			}
489 		} else {
490 			ASSERT(ccnt <= pcnt);
491 
492 			/*
493 			 * Apply the algorithm described above by multiplying
494 			 * the previous result and adding in the new value.
495 			 */
496 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
497 				*p ^= *src;
498 
499 				VDEV_RAIDZ_64MUL_2(*q, mask);
500 				*q ^= *src;
501 			}
502 
503 			/*
504 			 * Treat short columns as though they are full of 0s.
505 			 * Note that there's therefore nothing needed for P.
506 			 */
507 			for (; i < pcnt; i++, q++) {
508 				VDEV_RAIDZ_64MUL_2(*q, mask);
509 			}
510 		}
511 	}
512 }
513 
514 static void
515 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
516 {
517 	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
518 	int c;
519 
520 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
521 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
522 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
523 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
524 	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
525 
526 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
527 		src = rm->rm_col[c].rc_data;
528 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
529 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
530 		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
531 
532 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
533 
534 		if (c == rm->rm_firstdatacol) {
535 			ASSERT(ccnt == pcnt || ccnt == 0);
536 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
537 				*p = *src;
538 				*q = *src;
539 				*r = *src;
540 			}
541 			for (; i < pcnt; i++, src++, p++, q++, r++) {
542 				*p = 0;
543 				*q = 0;
544 				*r = 0;
545 			}
546 		} else {
547 			ASSERT(ccnt <= pcnt);
548 
549 			/*
550 			 * Apply the algorithm described above by multiplying
551 			 * the previous result and adding in the new value.
552 			 */
553 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
554 				*p ^= *src;
555 
556 				VDEV_RAIDZ_64MUL_2(*q, mask);
557 				*q ^= *src;
558 
559 				VDEV_RAIDZ_64MUL_4(*r, mask);
560 				*r ^= *src;
561 			}
562 
563 			/*
564 			 * Treat short columns as though they are full of 0s.
565 			 * Note that there's therefore nothing needed for P.
566 			 */
567 			for (; i < pcnt; i++, q++, r++) {
568 				VDEV_RAIDZ_64MUL_2(*q, mask);
569 				VDEV_RAIDZ_64MUL_4(*r, mask);
570 			}
571 		}
572 	}
573 }
574 
575 /*
576  * Generate RAID parity in the first virtual columns according to the number of
577  * parity columns available.
578  */
579 static void
580 vdev_raidz_generate_parity(raidz_map_t *rm)
581 {
582 	switch (rm->rm_firstdatacol) {
583 	case 1:
584 		vdev_raidz_generate_parity_p(rm);
585 		break;
586 	case 2:
587 		vdev_raidz_generate_parity_pq(rm);
588 		break;
589 	case 3:
590 		vdev_raidz_generate_parity_pqr(rm);
591 		break;
592 	default:
593 		panic("invalid RAID-Z configuration");
594 	}
595 }
596 
597 /* BEGIN CSTYLED */
598 /*
599  * In the general case of reconstruction, we must solve the system of linear
600  * equations defined by the coeffecients used to generate parity as well as
601  * the contents of the data and parity disks. This can be expressed with
602  * vectors for the original data (D) and the actual data (d) and parity (p)
603  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
604  *
605  *            __   __                     __     __
606  *            |     |         __     __   |  p_0  |
607  *            |  V  |         |  D_0  |   | p_m-1 |
608  *            |     |    x    |   :   | = |  d_0  |
609  *            |  I  |         | D_n-1 |   |   :   |
610  *            |     |         ~~     ~~   | d_n-1 |
611  *            ~~   ~~                     ~~     ~~
612  *
613  * I is simply a square identity matrix of size n, and V is a vandermonde
614  * matrix defined by the coeffecients we chose for the various parity columns
615  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
616  * computation as well as linear separability.
617  *
618  *      __               __               __     __
619  *      |   1   ..  1 1 1 |               |  p_0  |
620  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
621  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
622  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
623  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
624  *      |   :       : : : |   |   :   |   |  d_2  |
625  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
626  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
627  *      |   0   ..  0 0 1 |               | d_n-1 |
628  *      ~~               ~~               ~~     ~~
629  *
630  * Note that I, V, d, and p are known. To compute D, we must invert the
631  * matrix and use the known data and parity values to reconstruct the unknown
632  * data values. We begin by removing the rows in V|I and d|p that correspond
633  * to failed or missing columns; we then make V|I square (n x n) and d|p
634  * sized n by removing rows corresponding to unused parity from the bottom up
635  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
636  * using Gauss-Jordan elimination. In the example below we use m=3 parity
637  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
638  *           __                               __
639  *           |  1   1   1   1   1   1   1   1  |
640  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
641  *           |  19 205 116  29  64  16  4   1  |      / /
642  *           |  1   0   0   0   0   0   0   0  |     / /
643  *           |  0   1   0   0   0   0   0   0  | <--' /
644  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
645  *           |  0   0   0   1   0   0   0   0  |
646  *           |  0   0   0   0   1   0   0   0  |
647  *           |  0   0   0   0   0   1   0   0  |
648  *           |  0   0   0   0   0   0   1   0  |
649  *           |  0   0   0   0   0   0   0   1  |
650  *           ~~                               ~~
651  *           __                               __
652  *           |  1   1   1   1   1   1   1   1  |
653  *           | 128  64  32  16  8   4   2   1  |
654  *           |  19 205 116  29  64  16  4   1  |
655  *           |  1   0   0   0   0   0   0   0  |
656  *           |  0   1   0   0   0   0   0   0  |
657  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
658  *           |  0   0   0   1   0   0   0   0  |
659  *           |  0   0   0   0   1   0   0   0  |
660  *           |  0   0   0   0   0   1   0   0  |
661  *           |  0   0   0   0   0   0   1   0  |
662  *           |  0   0   0   0   0   0   0   1  |
663  *           ~~                               ~~
664  *
665  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
666  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
667  * matrix is not singular.
668  * __                                                                 __
669  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
670  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
671  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
672  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
673  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
674  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
675  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
676  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
677  * ~~                                                                 ~~
678  * __                                                                 __
679  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
680  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
681  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
682  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
683  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
684  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
685  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
686  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
687  * ~~                                                                 ~~
688  * __                                                                 __
689  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
690  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
691  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
692  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
693  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
694  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
695  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
696  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
697  * ~~                                                                 ~~
698  * __                                                                 __
699  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
700  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
701  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
702  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
703  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
704  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
705  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
706  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
707  * ~~                                                                 ~~
708  * __                                                                 __
709  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
710  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
711  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
712  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
713  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
714  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
715  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
716  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
717  * ~~                                                                 ~~
718  * __                                                                 __
719  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
720  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
721  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
722  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
723  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
724  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
725  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
726  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
727  * ~~                                                                 ~~
728  *                   __                               __
729  *                   |  0   0   1   0   0   0   0   0  |
730  *                   | 167 100  5   41 159 169 217 208 |
731  *                   | 166 100  4   40 158 168 216 209 |
732  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
733  *                   |  0   0   0   0   1   0   0   0  |
734  *                   |  0   0   0   0   0   1   0   0  |
735  *                   |  0   0   0   0   0   0   1   0  |
736  *                   |  0   0   0   0   0   0   0   1  |
737  *                   ~~                               ~~
738  *
739  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
740  * of the missing data.
741  *
742  * As is apparent from the example above, the only non-trivial rows in the
743  * inverse matrix correspond to the data disks that we're trying to
744  * reconstruct. Indeed, those are the only rows we need as the others would
745  * only be useful for reconstructing data known or assumed to be valid. For
746  * that reason, we only build the coefficients in the rows that correspond to
747  * targeted columns.
748  */
749 /* END CSTYLED */
750 
751 static void
752 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
753     uint8_t **rows)
754 {
755 	int i, j;
756 	int pow;
757 
758 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
759 
760 	/*
761 	 * Fill in the missing rows of interest.
762 	 */
763 	for (i = 0; i < nmap; i++) {
764 		ASSERT3S(0, <=, map[i]);
765 		ASSERT3S(map[i], <=, 2);
766 
767 		pow = map[i] * n;
768 		if (pow > 255)
769 			pow -= 255;
770 		ASSERT(pow <= 255);
771 
772 		for (j = 0; j < n; j++) {
773 			pow -= map[i];
774 			if (pow < 0)
775 				pow += 255;
776 			rows[i][j] = vdev_raidz_pow2[pow];
777 		}
778 	}
779 }
780 
781 static void
782 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
783     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
784 {
785 	int i, j, ii, jj;
786 	uint8_t log;
787 
788 	/*
789 	 * Assert that the first nmissing entries from the array of used
790 	 * columns correspond to parity columns and that subsequent entries
791 	 * correspond to data columns.
792 	 */
793 	for (i = 0; i < nmissing; i++) {
794 		ASSERT3S(used[i], <, rm->rm_firstdatacol);
795 	}
796 	for (; i < n; i++) {
797 		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
798 	}
799 
800 	/*
801 	 * First initialize the storage where we'll compute the inverse rows.
802 	 */
803 	for (i = 0; i < nmissing; i++) {
804 		for (j = 0; j < n; j++) {
805 			invrows[i][j] = (i == j) ? 1 : 0;
806 		}
807 	}
808 
809 	/*
810 	 * Subtract all trivial rows from the rows of consequence.
811 	 */
812 	for (i = 0; i < nmissing; i++) {
813 		for (j = nmissing; j < n; j++) {
814 			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
815 			jj = used[j] - rm->rm_firstdatacol;
816 			ASSERT3S(jj, <, n);
817 			invrows[i][j] = rows[i][jj];
818 			rows[i][jj] = 0;
819 		}
820 	}
821 
822 	/*
823 	 * For each of the rows of interest, we must normalize it and subtract
824 	 * a multiple of it from the other rows.
825 	 */
826 	for (i = 0; i < nmissing; i++) {
827 		for (j = 0; j < missing[i]; j++) {
828 			ASSERT3U(rows[i][j], ==, 0);
829 		}
830 		ASSERT3U(rows[i][missing[i]], !=, 0);
831 
832 		/*
833 		 * Compute the inverse of the first element and multiply each
834 		 * element in the row by that value.
835 		 */
836 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
837 
838 		for (j = 0; j < n; j++) {
839 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
840 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
841 		}
842 
843 		for (ii = 0; ii < nmissing; ii++) {
844 			if (i == ii)
845 				continue;
846 
847 			ASSERT3U(rows[ii][missing[i]], !=, 0);
848 
849 			log = vdev_raidz_log2[rows[ii][missing[i]]];
850 
851 			for (j = 0; j < n; j++) {
852 				rows[ii][j] ^=
853 				    vdev_raidz_exp2(rows[i][j], log);
854 				invrows[ii][j] ^=
855 				    vdev_raidz_exp2(invrows[i][j], log);
856 			}
857 		}
858 	}
859 
860 	/*
861 	 * Verify that the data that is left in the rows are properly part of
862 	 * an identity matrix.
863 	 */
864 	for (i = 0; i < nmissing; i++) {
865 		for (j = 0; j < n; j++) {
866 			if (j == missing[i]) {
867 				ASSERT3U(rows[i][j], ==, 1);
868 			} else {
869 				ASSERT3U(rows[i][j], ==, 0);
870 			}
871 		}
872 	}
873 }
874 
875 static void
876 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
877     int *missing, uint8_t **invrows, const uint8_t *used)
878 {
879 	int i, j, x, cc, c;
880 	uint8_t *src;
881 	uint64_t ccount;
882 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
883 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
884 	uint8_t log, val;
885 	int ll;
886 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
887 	uint8_t *p, *pp;
888 	size_t psize;
889 
890 	log = 0;	/* gcc */
891 	psize = sizeof (invlog[0][0]) * n * nmissing;
892 	p = zfs_alloc(psize);
893 
894 	for (pp = p, i = 0; i < nmissing; i++) {
895 		invlog[i] = pp;
896 		pp += n;
897 	}
898 
899 	for (i = 0; i < nmissing; i++) {
900 		for (j = 0; j < n; j++) {
901 			ASSERT3U(invrows[i][j], !=, 0);
902 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
903 		}
904 	}
905 
906 	for (i = 0; i < n; i++) {
907 		c = used[i];
908 		ASSERT3U(c, <, rm->rm_cols);
909 
910 		src = rm->rm_col[c].rc_data;
911 		ccount = rm->rm_col[c].rc_size;
912 		for (j = 0; j < nmissing; j++) {
913 			cc = missing[j] + rm->rm_firstdatacol;
914 			ASSERT3U(cc, >=, rm->rm_firstdatacol);
915 			ASSERT3U(cc, <, rm->rm_cols);
916 			ASSERT3U(cc, !=, c);
917 
918 			dst[j] = rm->rm_col[cc].rc_data;
919 			dcount[j] = rm->rm_col[cc].rc_size;
920 		}
921 
922 		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
923 
924 		for (x = 0; x < ccount; x++, src++) {
925 			if (*src != 0)
926 				log = vdev_raidz_log2[*src];
927 
928 			for (cc = 0; cc < nmissing; cc++) {
929 				if (x >= dcount[cc])
930 					continue;
931 
932 				if (*src == 0) {
933 					val = 0;
934 				} else {
935 					if ((ll = log + invlog[cc][i]) >= 255)
936 						ll -= 255;
937 					val = vdev_raidz_pow2[ll];
938 				}
939 
940 				if (i == 0)
941 					dst[cc][x] = val;
942 				else
943 					dst[cc][x] ^= val;
944 			}
945 		}
946 	}
947 
948 	zfs_free(p, psize);
949 }
950 
951 static int
952 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
953 {
954 	int n, i, c, t, tt;
955 	int nmissing_rows;
956 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
957 	int parity_map[VDEV_RAIDZ_MAXPARITY];
958 
959 	uint8_t *p, *pp;
960 	size_t psize;
961 
962 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
963 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
964 	uint8_t *used;
965 
966 	int code = 0;
967 
968 
969 	n = rm->rm_cols - rm->rm_firstdatacol;
970 
971 	/*
972 	 * Figure out which data columns are missing.
973 	 */
974 	nmissing_rows = 0;
975 	for (t = 0; t < ntgts; t++) {
976 		if (tgts[t] >= rm->rm_firstdatacol) {
977 			missing_rows[nmissing_rows++] =
978 			    tgts[t] - rm->rm_firstdatacol;
979 		}
980 	}
981 
982 	/*
983 	 * Figure out which parity columns to use to help generate the missing
984 	 * data columns.
985 	 */
986 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
987 		ASSERT(tt < ntgts);
988 		ASSERT(c < rm->rm_firstdatacol);
989 
990 		/*
991 		 * Skip any targeted parity columns.
992 		 */
993 		if (c == tgts[tt]) {
994 			tt++;
995 			continue;
996 		}
997 
998 		code |= 1 << c;
999 
1000 		parity_map[i] = c;
1001 		i++;
1002 	}
1003 
1004 	ASSERT(code != 0);
1005 	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1006 
1007 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1008 	    nmissing_rows * n + sizeof (used[0]) * n;
1009 	p = kmem_alloc(psize, KM_SLEEP);
1010 
1011 	for (pp = p, i = 0; i < nmissing_rows; i++) {
1012 		rows[i] = pp;
1013 		pp += n;
1014 		invrows[i] = pp;
1015 		pp += n;
1016 	}
1017 	used = pp;
1018 
1019 	for (i = 0; i < nmissing_rows; i++) {
1020 		used[i] = parity_map[i];
1021 	}
1022 
1023 	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1024 		if (tt < nmissing_rows &&
1025 		    c == missing_rows[tt] + rm->rm_firstdatacol) {
1026 			tt++;
1027 			continue;
1028 		}
1029 
1030 		ASSERT3S(i, <, n);
1031 		used[i] = c;
1032 		i++;
1033 	}
1034 
1035 	/*
1036 	 * Initialize the interesting rows of the matrix.
1037 	 */
1038 	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1039 
1040 	/*
1041 	 * Invert the matrix.
1042 	 */
1043 	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1044 	    invrows, used);
1045 
1046 	/*
1047 	 * Reconstruct the missing data using the generated matrix.
1048 	 */
1049 	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1050 	    invrows, used);
1051 
1052 	kmem_free(p, psize);
1053 
1054 	return (code);
1055 }
1056 
1057 static int
1058 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1059 {
1060 	int tgts[VDEV_RAIDZ_MAXPARITY];
1061 	int ntgts;
1062 	int i, c;
1063 	int code;
1064 	int nbadparity, nbaddata;
1065 
1066 	/*
1067 	 * The tgts list must already be sorted.
1068 	 */
1069 	for (i = 1; i < nt; i++) {
1070 		ASSERT(t[i] > t[i - 1]);
1071 	}
1072 
1073 	nbadparity = rm->rm_firstdatacol;
1074 	nbaddata = rm->rm_cols - nbadparity;
1075 	ntgts = 0;
1076 	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1077 		if (i < nt && c == t[i]) {
1078 			tgts[ntgts++] = c;
1079 			i++;
1080 		} else if (rm->rm_col[c].rc_error != 0) {
1081 			tgts[ntgts++] = c;
1082 		} else if (c >= rm->rm_firstdatacol) {
1083 			nbaddata--;
1084 		} else {
1085 			nbadparity--;
1086 		}
1087 	}
1088 
1089 	ASSERT(ntgts >= nt);
1090 	ASSERT(nbaddata >= 0);
1091 	ASSERT(nbaddata + nbadparity == ntgts);
1092 
1093 	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1094 	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1095 	ASSERT(code > 0);
1096 	return (code);
1097 }
1098 
1099 static raidz_map_t *
1100 vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
1101     uint64_t dcols, uint64_t nparity)
1102 {
1103 	raidz_map_t *rm;
1104 	uint64_t b = offset >> unit_shift;
1105 	uint64_t s = size >> unit_shift;
1106 	uint64_t f = b % dcols;
1107 	uint64_t o = (b / dcols) << unit_shift;
1108 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
1109 
1110 	q = s / (dcols - nparity);
1111 	r = s - q * (dcols - nparity);
1112 	bc = (r == 0 ? 0 : r + nparity);
1113 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
1114 
1115 	if (q == 0) {
1116 		acols = bc;
1117 		scols = MIN(dcols, roundup(bc, nparity + 1));
1118 	} else {
1119 		acols = dcols;
1120 		scols = dcols;
1121 	}
1122 
1123 	ASSERT3U(acols, <=, scols);
1124 
1125 	rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));
1126 
1127 	rm->rm_cols = acols;
1128 	rm->rm_scols = scols;
1129 	rm->rm_bigcols = bc;
1130 	rm->rm_skipstart = bc;
1131 	rm->rm_missingdata = 0;
1132 	rm->rm_missingparity = 0;
1133 	rm->rm_firstdatacol = nparity;
1134 	rm->rm_reports = 0;
1135 	rm->rm_freed = 0;
1136 	rm->rm_ecksuminjected = 0;
1137 
1138 	asize = 0;
1139 
1140 	for (c = 0; c < scols; c++) {
1141 		col = f + c;
1142 		coff = o;
1143 		if (col >= dcols) {
1144 			col -= dcols;
1145 			coff += 1ULL << unit_shift;
1146 		}
1147 		rm->rm_col[c].rc_devidx = col;
1148 		rm->rm_col[c].rc_offset = coff;
1149 		rm->rm_col[c].rc_data = NULL;
1150 		rm->rm_col[c].rc_error = 0;
1151 		rm->rm_col[c].rc_tried = 0;
1152 		rm->rm_col[c].rc_skipped = 0;
1153 
1154 		if (c >= acols)
1155 			rm->rm_col[c].rc_size = 0;
1156 		else if (c < bc)
1157 			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
1158 		else
1159 			rm->rm_col[c].rc_size = q << unit_shift;
1160 
1161 		asize += rm->rm_col[c].rc_size;
1162 	}
1163 
1164 	ASSERT3U(asize, ==, tot << unit_shift);
1165 	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
1166 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
1167 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
1168 	ASSERT3U(rm->rm_nskip, <=, nparity);
1169 
1170 	for (c = 0; c < rm->rm_firstdatacol; c++)
1171 		rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);
1172 
1173 	rm->rm_col[c].rc_data = data;
1174 
1175 	for (c = c + 1; c < acols; c++)
1176 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
1177 		    rm->rm_col[c - 1].rc_size;
1178 
1179 	/*
1180 	 * If all data stored spans all columns, there's a danger that parity
1181 	 * will always be on the same device and, since parity isn't read
1182 	 * during normal operation, that that device's I/O bandwidth won't be
1183 	 * used effectively. We therefore switch the parity every 1MB.
1184 	 *
1185 	 * ... at least that was, ostensibly, the theory. As a practical
1186 	 * matter unless we juggle the parity between all devices evenly, we
1187 	 * won't see any benefit. Further, occasional writes that aren't a
1188 	 * multiple of the LCM of the number of children and the minimum
1189 	 * stripe width are sufficient to avoid pessimal behavior.
1190 	 * Unfortunately, this decision created an implicit on-disk format
1191 	 * requirement that we need to support for all eternity, but only
1192 	 * for single-parity RAID-Z.
1193 	 *
1194 	 * If we intend to skip a sector in the zeroth column for padding
1195 	 * we must make sure to note this swap. We will never intend to
1196 	 * skip the first column since at least one data and one parity
1197 	 * column must appear in each row.
1198 	 */
1199 	ASSERT(rm->rm_cols >= 2);
1200 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
1201 
1202 	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
1203 		devidx = rm->rm_col[0].rc_devidx;
1204 		o = rm->rm_col[0].rc_offset;
1205 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
1206 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
1207 		rm->rm_col[1].rc_devidx = devidx;
1208 		rm->rm_col[1].rc_offset = o;
1209 
1210 		if (rm->rm_skipstart == 0)
1211 			rm->rm_skipstart = 1;
1212 	}
1213 
1214 	return (rm);
1215 }
1216 
1217 static void
1218 vdev_raidz_map_free(raidz_map_t *rm)
1219 {
1220 	int c;
1221 
1222 	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
1223 		zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
1224 
1225 	zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
1226 }
1227 
1228 static vdev_t *
1229 vdev_child(vdev_t *pvd, uint64_t devidx)
1230 {
1231 	vdev_t *cvd;
1232 
1233 	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
1234 		if (cvd->v_id == devidx)
1235 			break;
1236 	}
1237 
1238 	return (cvd);
1239 }
1240 
1241 /*
1242  * We keep track of whether or not there were any injected errors, so that
1243  * any ereports we generate can note it.
1244  */
1245 static int
1246 raidz_checksum_verify(const blkptr_t *bp, void *data, uint64_t size)
1247 {
1248 
1249 	return (zio_checksum_verify(bp, data));
1250 }
1251 
1252 /*
1253  * Generate the parity from the data columns. If we tried and were able to
1254  * read the parity without error, verify that the generated parity matches the
1255  * data we read. If it doesn't, we fire off a checksum error. Return the
1256  * number such failures.
1257  */
1258 static int
1259 raidz_parity_verify(raidz_map_t *rm)
1260 {
1261 	void *orig[VDEV_RAIDZ_MAXPARITY];
1262 	int c, ret = 0;
1263 	raidz_col_t *rc;
1264 
1265 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1266 		rc = &rm->rm_col[c];
1267 		if (!rc->rc_tried || rc->rc_error != 0)
1268 			continue;
1269 		orig[c] = zfs_alloc(rc->rc_size);
1270 		bcopy(rc->rc_data, orig[c], rc->rc_size);
1271 	}
1272 
1273 	vdev_raidz_generate_parity(rm);
1274 
1275 	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
1276 		rc = &rm->rm_col[c];
1277 		if (!rc->rc_tried || rc->rc_error != 0)
1278 			continue;
1279 		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1280 			rc->rc_error = ECKSUM;
1281 			ret++;
1282 		}
1283 		zfs_free(orig[c], rc->rc_size);
1284 	}
1285 
1286 	return (ret);
1287 }
1288 
1289 /*
1290  * Iterate over all combinations of bad data and attempt a reconstruction.
1291  * Note that the algorithm below is non-optimal because it doesn't take into
1292  * account how reconstruction is actually performed. For example, with
1293  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1294  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1295  * cases we'd only use parity information in column 0.
1296  */
1297 static int
1298 vdev_raidz_combrec(raidz_map_t *rm, const blkptr_t *bp, void *data,
1299     off_t offset, uint64_t bytes, int total_errors, int data_errors)
1300 {
1301 	raidz_col_t *rc;
1302 	void *orig[VDEV_RAIDZ_MAXPARITY];
1303 	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1304 	int *tgts = &tstore[1];
1305 	int current, next, i, c, n;
1306 	int code, ret = 0;
1307 
1308 	ASSERT(total_errors < rm->rm_firstdatacol);
1309 
1310 	/*
1311 	 * This simplifies one edge condition.
1312 	 */
1313 	tgts[-1] = -1;
1314 
1315 	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1316 		/*
1317 		 * Initialize the targets array by finding the first n columns
1318 		 * that contain no error.
1319 		 *
1320 		 * If there were no data errors, we need to ensure that we're
1321 		 * always explicitly attempting to reconstruct at least one
1322 		 * data column. To do this, we simply push the highest target
1323 		 * up into the data columns.
1324 		 */
1325 		for (c = 0, i = 0; i < n; i++) {
1326 			if (i == n - 1 && data_errors == 0 &&
1327 			    c < rm->rm_firstdatacol) {
1328 				c = rm->rm_firstdatacol;
1329 			}
1330 
1331 			while (rm->rm_col[c].rc_error != 0) {
1332 				c++;
1333 				ASSERT3S(c, <, rm->rm_cols);
1334 			}
1335 
1336 			tgts[i] = c++;
1337 		}
1338 
1339 		/*
1340 		 * Setting tgts[n] simplifies the other edge condition.
1341 		 */
1342 		tgts[n] = rm->rm_cols;
1343 
1344 		/*
1345 		 * These buffers were allocated in previous iterations.
1346 		 */
1347 		for (i = 0; i < n - 1; i++) {
1348 			ASSERT(orig[i] != NULL);
1349 		}
1350 
1351 		orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);
1352 
1353 		current = 0;
1354 		next = tgts[current];
1355 
1356 		while (current != n) {
1357 			tgts[current] = next;
1358 			current = 0;
1359 
1360 			/*
1361 			 * Save off the original data that we're going to
1362 			 * attempt to reconstruct.
1363 			 */
1364 			for (i = 0; i < n; i++) {
1365 				ASSERT(orig[i] != NULL);
1366 				c = tgts[i];
1367 				ASSERT3S(c, >=, 0);
1368 				ASSERT3S(c, <, rm->rm_cols);
1369 				rc = &rm->rm_col[c];
1370 				bcopy(rc->rc_data, orig[i], rc->rc_size);
1371 			}
1372 
1373 			/*
1374 			 * Attempt a reconstruction and exit the outer loop on
1375 			 * success.
1376 			 */
1377 			code = vdev_raidz_reconstruct(rm, tgts, n);
1378 			if (raidz_checksum_verify(bp, data, bytes) == 0) {
1379 				for (i = 0; i < n; i++) {
1380 					c = tgts[i];
1381 					rc = &rm->rm_col[c];
1382 					ASSERT(rc->rc_error == 0);
1383 					rc->rc_error = ECKSUM;
1384 				}
1385 
1386 				ret = code;
1387 				goto done;
1388 			}
1389 
1390 			/*
1391 			 * Restore the original data.
1392 			 */
1393 			for (i = 0; i < n; i++) {
1394 				c = tgts[i];
1395 				rc = &rm->rm_col[c];
1396 				bcopy(orig[i], rc->rc_data, rc->rc_size);
1397 			}
1398 
1399 			do {
1400 				/*
1401 				 * Find the next valid column after the current
1402 				 * position..
1403 				 */
1404 				for (next = tgts[current] + 1;
1405 				    next < rm->rm_cols &&
1406 				    rm->rm_col[next].rc_error != 0; next++)
1407 					continue;
1408 
1409 				ASSERT(next <= tgts[current + 1]);
1410 
1411 				/*
1412 				 * If that spot is available, we're done here.
1413 				 */
1414 				if (next != tgts[current + 1])
1415 					break;
1416 
1417 				/*
1418 				 * Otherwise, find the next valid column after
1419 				 * the previous position.
1420 				 */
1421 				for (c = tgts[current - 1] + 1;
1422 				    rm->rm_col[c].rc_error != 0; c++)
1423 					continue;
1424 
1425 				tgts[current] = c;
1426 				current++;
1427 
1428 			} while (current != n);
1429 		}
1430 	}
1431 	n--;
1432 done:
1433 	for (i = n - 1; i >= 0; i--) {
1434 		zfs_free(orig[i], rm->rm_col[0].rc_size);
1435 	}
1436 
1437 	return (ret);
1438 }
1439 
1440 static int
1441 vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
1442     off_t offset, size_t bytes)
1443 {
1444 	vdev_t *tvd = vd->v_top;
1445 	vdev_t *cvd;
1446 	raidz_map_t *rm;
1447 	raidz_col_t *rc;
1448 	int c, error;
1449 	int unexpected_errors;
1450 	int parity_errors;
1451 	int parity_untried;
1452 	int data_errors;
1453 	int total_errors;
1454 	int n;
1455 	int tgts[VDEV_RAIDZ_MAXPARITY];
1456 	int code;
1457 
1458 	rc = NULL;	/* gcc */
1459 	error = 0;
1460 
1461 	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
1462 	    vd->v_nchildren, vd->v_nparity);
1463 
1464 	/*
1465 	 * Iterate over the columns in reverse order so that we hit the parity
1466 	 * last -- any errors along the way will force us to read the parity.
1467 	 */
1468 	for (c = rm->rm_cols - 1; c >= 0; c--) {
1469 		rc = &rm->rm_col[c];
1470 		cvd = vdev_child(vd, rc->rc_devidx);
1471 		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
1472 			if (c >= rm->rm_firstdatacol)
1473 				rm->rm_missingdata++;
1474 			else
1475 				rm->rm_missingparity++;
1476 			rc->rc_error = ENXIO;
1477 			rc->rc_tried = 1;	/* don't even try */
1478 			rc->rc_skipped = 1;
1479 			continue;
1480 		}
1481 #if 0		/* XXX: Too hard for the boot code. */
1482 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1483 			if (c >= rm->rm_firstdatacol)
1484 				rm->rm_missingdata++;
1485 			else
1486 				rm->rm_missingparity++;
1487 			rc->rc_error = ESTALE;
1488 			rc->rc_skipped = 1;
1489 			continue;
1490 		}
1491 #endif
1492 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
1493 			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
1494 			    rc->rc_offset, rc->rc_size);
1495 			rc->rc_tried = 1;
1496 			rc->rc_skipped = 0;
1497 		}
1498 	}
1499 
1500 reconstruct:
1501 	unexpected_errors = 0;
1502 	parity_errors = 0;
1503 	parity_untried = 0;
1504 	data_errors = 0;
1505 	total_errors = 0;
1506 
1507 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1508 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1509 
1510 	for (c = 0; c < rm->rm_cols; c++) {
1511 		rc = &rm->rm_col[c];
1512 
1513 		if (rc->rc_error) {
1514 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
1515 
1516 			if (c < rm->rm_firstdatacol)
1517 				parity_errors++;
1518 			else
1519 				data_errors++;
1520 
1521 			if (!rc->rc_skipped)
1522 				unexpected_errors++;
1523 
1524 			total_errors++;
1525 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1526 			parity_untried++;
1527 		}
1528 	}
1529 
1530 	/*
1531 	 * There are three potential phases for a read:
1532 	 *	1. produce valid data from the columns read
1533 	 *	2. read all disks and try again
1534 	 *	3. perform combinatorial reconstruction
1535 	 *
1536 	 * Each phase is progressively both more expensive and less likely to
1537 	 * occur. If we encounter more errors than we can repair or all phases
1538 	 * fail, we have no choice but to return an error.
1539 	 */
1540 
1541 	/*
1542 	 * If the number of errors we saw was correctable -- less than or equal
1543 	 * to the number of parity disks read -- attempt to produce data that
1544 	 * has a valid checksum. Naturally, this case applies in the absence of
1545 	 * any errors.
1546 	 */
1547 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1548 		if (data_errors == 0) {
1549 			if (raidz_checksum_verify(bp, data, bytes) == 0) {
1550 				/*
1551 				 * If we read parity information (unnecessarily
1552 				 * as it happens since no reconstruction was
1553 				 * needed) regenerate and verify the parity.
1554 				 * We also regenerate parity when resilvering
1555 				 * so we can write it out to the failed device
1556 				 * later.
1557 				 */
1558 				if (parity_errors + parity_untried <
1559 				    rm->rm_firstdatacol) {
1560 					n = raidz_parity_verify(rm);
1561 					unexpected_errors += n;
1562 					ASSERT(parity_errors + n <=
1563 					    rm->rm_firstdatacol);
1564 				}
1565 				goto done;
1566 			}
1567 		} else {
1568 			/*
1569 			 * We either attempt to read all the parity columns or
1570 			 * none of them. If we didn't try to read parity, we
1571 			 * wouldn't be here in the correctable case. There must
1572 			 * also have been fewer parity errors than parity
1573 			 * columns or, again, we wouldn't be in this code path.
1574 			 */
1575 			ASSERT(parity_untried == 0);
1576 			ASSERT(parity_errors < rm->rm_firstdatacol);
1577 
1578 			/*
1579 			 * Identify the data columns that reported an error.
1580 			 */
1581 			n = 0;
1582 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1583 				rc = &rm->rm_col[c];
1584 				if (rc->rc_error != 0) {
1585 					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1586 					tgts[n++] = c;
1587 				}
1588 			}
1589 
1590 			ASSERT(rm->rm_firstdatacol >= n);
1591 
1592 			code = vdev_raidz_reconstruct(rm, tgts, n);
1593 
1594 			if (raidz_checksum_verify(bp, data, bytes) == 0) {
1595 				/*
1596 				 * If we read more parity disks than were used
1597 				 * for reconstruction, confirm that the other
1598 				 * parity disks produced correct data. This
1599 				 * routine is suboptimal in that it regenerates
1600 				 * the parity that we already used in addition
1601 				 * to the parity that we're attempting to
1602 				 * verify, but this should be a relatively
1603 				 * uncommon case, and can be optimized if it
1604 				 * becomes a problem. Note that we regenerate
1605 				 * parity when resilvering so we can write it
1606 				 * out to failed devices later.
1607 				 */
1608 				if (parity_errors < rm->rm_firstdatacol - n) {
1609 					n = raidz_parity_verify(rm);
1610 					unexpected_errors += n;
1611 					ASSERT(parity_errors + n <=
1612 					    rm->rm_firstdatacol);
1613 				}
1614 
1615 				goto done;
1616 			}
1617 		}
1618 	}
1619 
1620 	/*
1621 	 * This isn't a typical situation -- either we got a read
1622 	 * error or a child silently returned bad data. Read every
1623 	 * block so we can try again with as much data and parity as
1624 	 * we can track down. If we've already been through once
1625 	 * before, all children will be marked as tried so we'll
1626 	 * proceed to combinatorial reconstruction.
1627 	 */
1628 	unexpected_errors = 1;
1629 	rm->rm_missingdata = 0;
1630 	rm->rm_missingparity = 0;
1631 
1632 	n = 0;
1633 	for (c = 0; c < rm->rm_cols; c++) {
1634 		rc = &rm->rm_col[c];
1635 
1636 		if (rc->rc_tried)
1637 			continue;
1638 
1639 		cvd = vdev_child(vd, rc->rc_devidx);
1640 		ASSERT(cvd != NULL);
1641 		rc->rc_error = cvd->v_read(cvd, NULL,
1642 		    rc->rc_data, rc->rc_offset, rc->rc_size);
1643 		if (rc->rc_error == 0)
1644 			n++;
1645 		rc->rc_tried = 1;
1646 		rc->rc_skipped = 0;
1647 	}
1648 	/*
1649 	 * If we managed to read anything more, retry the
1650 	 * reconstruction.
1651 	 */
1652 	if (n > 0)
1653 		goto reconstruct;
1654 
1655 	/*
1656 	 * At this point we've attempted to reconstruct the data given the
1657 	 * errors we detected, and we've attempted to read all columns. There
1658 	 * must, therefore, be one or more additional problems -- silent errors
1659 	 * resulting in invalid data rather than explicit I/O errors resulting
1660 	 * in absent data. We check if there is enough additional data to
1661 	 * possibly reconstruct the data and then perform combinatorial
1662 	 * reconstruction over all possible combinations. If that fails,
1663 	 * we're cooked.
1664 	 */
1665 	if (total_errors > rm->rm_firstdatacol) {
1666 		error = EIO;
1667 	} else if (total_errors < rm->rm_firstdatacol &&
1668 	    (code = vdev_raidz_combrec(rm, bp, data, offset, bytes,
1669 	     total_errors, data_errors)) != 0) {
1670 		/*
1671 		 * If we didn't use all the available parity for the
1672 		 * combinatorial reconstruction, verify that the remaining
1673 		 * parity is correct.
1674 		 */
1675 		if (code != (1 << rm->rm_firstdatacol) - 1)
1676 			(void) raidz_parity_verify(rm);
1677 	} else {
1678 		/*
1679 		 * We're here because either:
1680 		 *
1681 		 *	total_errors == rm_first_datacol, or
1682 		 *	vdev_raidz_combrec() failed
1683 		 *
1684 		 * In either case, there is enough bad data to prevent
1685 		 * reconstruction.
1686 		 *
1687 		 * Start checksum ereports for all children which haven't
1688 		 * failed, and the IO wasn't speculative.
1689 		 */
1690 		error = ECKSUM;
1691 	}
1692 
1693 done:
1694 	vdev_raidz_map_free(rm);
1695 
1696 	return (error);
1697 }
1698