/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

static uint64_t zfs_crc64_table[256];

#define	ECKSUM	666

#define	ASSERT3S(x, y, z)	((void)0)
#define	ASSERT3U(x, y, z)	((void)0)
#define	ASSERT3P(x, y, z)	((void)0)
#define	ASSERT0(x)		((void)0)
#define	ASSERT(x)		((void)0)

#define	panic(...)	do {						\
	printf(__VA_ARGS__);						\
	for (;;) ;							\
} while (0)

#define	kmem_alloc(size, flag)	zfs_alloc((size))
#define	kmem_free(ptr, size)	zfs_free((ptr), (size))

static void
zfs_init_crc(void)
{
	int i, j;
	uint64_t *ct;

	/*
	 * Calculate the crc64 table (used for the zap hash
	 * function).
	 */
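	/*
	 * Note on the update step below: -(*ct & 1) is all ones when the
	 * low bit of *ct is set and zero otherwise, so the polynomial is
	 * XORed in exactly when a one bit shifts out.  This is the
	 * standard reflected (LSB-first) CRC construction.
	 */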
	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
		for (i = 0; i < 256; i++)
			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
	}
}

static void
zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}

/*
 * Signature for checksum functions.
 */
typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);

/*
 * Information about each checksum function.
 */
typedef struct zio_checksum_info {
	zio_checksum_t	*ci_func[2]; /* checksum function for each byteorder */
	int		ci_correctable;	/* number of correctable bits	*/
	int		ci_eck;		/* uses zio embedded checksum? */
	int		ci_dedup;	/* strong enough for dedup? */
	const char	*ci_name;	/* descriptive name */
} zio_checksum_info_t;

#include "blkptr.c"

#include "fletcher.c"
#include "sha256.c"

static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL,			NULL},			0, 0, 0, "inherit"},
	{{NULL,			NULL},			0, 0, 0, "on"},
	{{zio_checksum_off,	zio_checksum_off},	0, 0, 0, "off"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 1, 0, "label"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 1, 0, "gang_header"},
	{{fletcher_2_native,	fletcher_2_byteswap},	0, 1, 0, "zilog"},
	{{fletcher_2_native,	fletcher_2_byteswap},	0, 0, 0, "fletcher2"},
	{{fletcher_4_native,	fletcher_4_byteswap},	1, 0, 0, "fletcher4"},
	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 0, 1, "SHA256"},
	{{fletcher_4_native,	fletcher_4_byteswap},	0, 1, 0, "zilog2"},
};


/*
 * Common signature for all zio compress/decompress functions.
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
	zio_compress_func_t	*ci_compress;	/* compression function */
	zio_decompress_func_t	*ci_decompress;	/* decompression function */
	int			ci_level;	/* level parameter */
	const char		*ci_name;	/* algorithm name */
} zio_compress_info_t;

#include "lzjb.c"
#include "zle.c"
#include "lz4.c"

/*
 * Compression vectors.
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
	{NULL,			NULL,			0,	"inherit"},
	{NULL,			NULL,			0,	"on"},
	{NULL,			NULL,			0,	"uncompressed"},
	{NULL,			lzjb_decompress,	0,	"lzjb"},
	{NULL,			NULL,			0,	"empty"},
	{NULL,			NULL,			1,	"gzip-1"},
	{NULL,			NULL,			2,	"gzip-2"},
	{NULL,			NULL,			3,	"gzip-3"},
	{NULL,			NULL,			4,	"gzip-4"},
	{NULL,			NULL,			5,	"gzip-5"},
	{NULL,			NULL,			6,	"gzip-6"},
	{NULL,			NULL,			7,	"gzip-7"},
	{NULL,			NULL,			8,	"gzip-8"},
	{NULL,			NULL,			9,	"gzip-9"},
	{NULL,			zle_decompress,		64,	"zle"},
	{NULL,			lz4_decompress,		0,	"lz4"},
};

static void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	int i;

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}

/*
 * Set the external verifier for a gang block based on <vdev, offset, txg>,
 * a tuple which is guaranteed to be unique for the life of the pool.
 */
static void
zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
{
	const dva_t *dva = BP_IDENTITY(bp);
	uint64_t txg = BP_PHYSICAL_BIRTH(bp);

	ASSERT(BP_IS_GANG(bp));

	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
}

/*
 * Set the external verifier for a label block based on its offset.
 * The vdev is implicit, and the txg is unknowable at pool open time --
 * hence the logic in vdev_uberblock_load() to find the most recent copy.
 */
static void
zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
{
	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
}

static int
zio_checksum_verify(const blkptr_t *bp, void *data)
{
	uint64_t size;
	unsigned int checksum;
	zio_checksum_info_t *ci;
	zio_cksum_t actual_cksum, expected_cksum, verifier;
	int byteswap;

	checksum = BP_GET_CHECKSUM(bp);
	size = BP_GET_PSIZE(bp);

	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
		return (EINVAL);
	ci = &zio_checksum_table[checksum];
	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
		return (EINVAL);

	if (ci->ci_eck) {
		zio_eck_t *eck;

		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
		    checksum == ZIO_CHECKSUM_LABEL);

		eck = (zio_eck_t *)((char *)data + size) - 1;

		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
			zio_checksum_gang_verifier(&verifier, bp);
		else if (checksum == ZIO_CHECKSUM_LABEL)
			zio_checksum_label_verifier(&verifier,
			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
		else
			verifier = bp->blk_cksum;

		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));

		if (byteswap)
			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));

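		/*
		 * Temporarily replace the embedded checksum with the
		 * expected verifier so the buffer checksums exactly as it
		 * did when written, then restore the on-disk value before
		 * comparing.
		 */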
		expected_cksum = eck->zec_cksum;
		eck->zec_cksum = verifier;
		ci->ci_func[byteswap](data, size, &actual_cksum);
		eck->zec_cksum = expected_cksum;

		if (byteswap)
			byteswap_uint64_array(&expected_cksum,
			    sizeof (zio_cksum_t));
	} else {
		expected_cksum = bp->blk_cksum;
		ci->ci_func[0](data, size, &actual_cksum);
	}

	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
		/*printf("ZFS: read checksum failed\n");*/
		return (EIO);
	}

	return (0);
}

static int
zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
	void *dest, uint64_t destsize)
{
	zio_compress_info_t *ci;

	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
		return (EIO);
	}

	ci = &zio_compress_table[cpfunc];
	if (!ci->ci_decompress) {
		printf("ZFS: unsupported compression algorithm %s\n",
		    ci->ci_name);
		return (EIO);
	}

	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
}

static uint64_t
zap_hash(uint64_t salt, const char *name)
{
	const uint8_t *cp;
	uint8_t c;
	uint64_t crc = salt;

	ASSERT(crc != 0);
	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];

	/*
	 * Only use 28 bits, since we need 4 bits in the cookie for the
	 * collision differentiator.  We MUST use the high bits, since
	 * those are the ones that we first pay attention to when
	 * choosing the bucket.
	 */
	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);

	return (crc);
}

static void *zfs_alloc(size_t size);
static void zfs_free(void *ptr, size_t size);

typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
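
/*
 * Worked example of the byte-wide multiply: in GF(2^8) with the
 * reduction polynomial x^8 + x^4 + x^3 + x^2 + 1 (0x11d), doubling
 * 0x80 shifts the top bit out (0x80 << 1 truncates to 0x00) and folds
 * in 0x1d, giving 0x1d; doubling 0x41 has no carry-out, giving a
 * plain 0x82.
 */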

/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint8_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}
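
/*
 * The same tables give general multiplication in the field: for nonzero
 * a and b, a * b equals vdev_raidz_pow2[(vdev_raidz_log2[a] +
 * vdev_raidz_log2[b]) % 255], which is exactly what
 * vdev_raidz_exp2(a, vdev_raidz_log2[b]) computes.  The matrix routines
 * below lean on this identity.
 */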

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p, *src, pcount, ccount, i;
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccount == pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p = *src;
			}
		} else {
			ASSERT(ccount <= pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p ^= *src;
			}
		}
	}
}
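
/*
 * Since P is a plain XOR across the data columns, a single missing data
 * column can be rebuilt by XORing P with every surviving data column.
 * The Q and R parities below extend this to two and three failures
 * using the GF(2^8) arithmetic above.
 */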

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply Horner's scheme: multiply the running
			 * parity by the generator (2) and add in the
			 * new value, so Q = sum(d_i * 2^(n-1-i)).
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply Horner's scheme: multiply the running Q
			 * by 2 and the running R by 4 in GF(2^8), then
			 * add in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		panic("invalid RAID-Z configuration");
	}
}

/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen both for simplicity and
 * speed of computation, and for their linear separability.
 *
 *      __               __               __     __
 *      |   1   ..  1 1 1 |               |  p_0  |
 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
 *      |   :       : : : |   |   :   |   |  d_2  |
 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
 *      |   0   ..  0 0 1 |               | d_n-1 |
 *      ~~               ~~               ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
 *           |  19 205 116  29  64  16  4   1  |      / /
 *           |  1   0   0   0   0   0   0   0  |     / /
 *           |  0   1   0   0   0   0   0   0  | <--' /
 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
 *           |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           | 128  64  32  16  8   4   2   1  |
 *           |  19 205 116  29  64  16  4   1  |
 *           |  1   0   0   0   0   0   0   0  |
 *           |  0   1   0   0   0   0   0   0  |
 *  (V|I)' = |  0   0   1   0   0   0   0   0  |
 *           |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 * __                                                                 __
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 *                   __                               __
 *                   |  0   0   1   0   0   0   0   0  |
 *                   | 167 100  5   41 159 169 217 208 |
 *                   | 166 100  4   40 158 168 216 209 |
 *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
 *                   |  0   0   0   0   1   0   0   0  |
 *                   |  0   0   0   0   0   1   0   0  |
 *                   |  0   0   0   0   0   0   1   0  |
 *                   |  0   0   0   0   0   0   0   1  |
 *                   ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */

static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

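		/*
		 * Row i becomes [g^(n-1), ..., g^2, g, 1] with g =
		 * 2^map[i], i.e. a Vandermonde row: the exponent starts
		 * at map[i] * (n - 1) (reduced mod 255) and drops by
		 * map[i] at each successive column.
		 */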
		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}

static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data left in the rows is properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}

static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	log = 0;	/* gcc */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = zfs_alloc(psize);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	zfs_free(p, psize);
}

static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;


	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

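		/*
		 * Record this parity column in the return value: bit 0 is
		 * P, bit 1 is Q, bit 2 is R.  The caller uses the mask to
		 * tell which parity columns went unused and may still
		 * need verifying.
		 */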
		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}

static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}

static raidz_map_t *
vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	uint64_t b = offset >> unit_shift;
	uint64_t s = size >> unit_shift;
	uint64_t f = b % dcols;
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	bc = (r == 0 ? 0 : r + nparity);
	tot = s + nparity * (q + (r == 0 ? 0 : 1));
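
	/*
	 * Worked example with illustrative values: for dcols = 5,
	 * nparity = 1 and s = 9 sectors, q = 2, r = 1, bc = 2 and
	 * tot = 12.  The first bc columns then get q + 1 = 3 sectors and
	 * the rest get q = 2, for column sizes of 3+3+2+2+2 = 12 sectors,
	 * with column 0 holding the parity.
	 */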

	if (q == 0) {
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = zfs_alloc(offsetof(raidz_map_t, rm_col[scols]));

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zfs_alloc(rm->rm_col[c].rc_size);

	rm->rm_col[c].rc_data = data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter, unless we juggle the parity between all devices evenly,
	 * we won't see any benefit. Further, occasional writes that aren't
	 * a multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}

static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
		zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);

	zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

static vdev_t *
vdev_child(vdev_t *pvd, uint64_t devidx)
{
	vdev_t *cvd;

	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
		if (cvd->v_id == devidx)
			break;
	}

	return (cvd);
}

/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(const blkptr_t *bp, void *data, uint64_t size)
{

	return (zio_checksum_verify(bp, data));
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zfs_alloc(rc->rc_size);
		bcopy(rc->rc_data, orig[c], rc->rc_size);
	}

	vdev_raidz_generate_parity(rm);

	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			rc->rc_error = ECKSUM;
			ret++;
		}
		zfs_free(orig[c], rc->rc_size);
	}

	return (ret);
}

/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 */
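/*
 * The target enumeration below behaves like an odometer over strictly
 * increasing column tuples: picking n = 2 targets from columns 0..4, for
 * instance, visits (0,1), (0,2), ..., (0,4), (1,2), ..., (3,4), skipping
 * any column already known to be in error.
 */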
static int
vdev_raidz_combrec(raidz_map_t *rm, const blkptr_t *bp, void *data,
    off_t offset, uint64_t bytes, int total_errors, int data_errors)
{
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = zfs_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(bp, data, bytes) == 0) {
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					rc->rc_error = ECKSUM;
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the
				 * current position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	n--;
done:
	for (i = n - 1; i >= 0; i--) {
		zfs_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}

static int
vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
    off_t offset, size_t bytes)
{
	vdev_t *tvd = vd->v_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, error;
	int unexpected_errors;
	int parity_errors;
	int parity_untried;
	int data_errors;
	int total_errors;
	int n;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	rc = NULL;	/* gcc */
	error = 0;

	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
	    vd->v_nchildren, vd->v_nparity);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vdev_child(vd, rc->rc_devidx);
		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
#if 0		/* XXX: Too hard for the boot code. */
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
#endif
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
			    rc->rc_offset, rc->rc_size);
			rc->rc_tried = 1;
			rc->rc_skipped = 0;
		}
	}

reconstruct:
	unexpected_errors = 0;
	parity_errors = 0;
	parity_untried = 0;
	data_errors = 0;
	total_errors = 0;

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(bp, data, bytes) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(bp, data, bytes) == 0) {
				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n) {
					n = raidz_parity_verify(rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read
	 * error or a child silently returned bad data. Read every
	 * block so we can try again with as much data and parity as
	 * we can track down. If we've already been through once
	 * before, all children will be marked as tried so we'll
	 * proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	n = 0;
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_tried)
			continue;

		cvd = vdev_child(vd, rc->rc_devidx);
		ASSERT(cvd != NULL);
		rc->rc_error = cvd->v_read(cvd, NULL,
		    rc->rc_data, rc->rc_offset, rc->rc_size);
		if (rc->rc_error == 0)
			n++;
		rc->rc_tried = 1;
		rc->rc_skipped = 0;
	}
	/*
	 * If we managed to read anything more, retry the
	 * reconstruction.
	 */
	if (n > 0)
		goto reconstruct;

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		error = EIO;
	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(rm, bp, data, offset, bytes,
	     total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, provided the I/O wasn't speculative.
		 */
		error = ECKSUM;
	}

done:
	vdev_raidz_map_free(rm);

	return (error);
}
1704