xref: /freebsd/sys/cddl/boot/zfs/zfssubr.c (revision af23369a6deaaeb612ab266eb88b8bb8d560c322)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/cdefs.h>
27 __FBSDID("$FreeBSD$");
28 
29 #include <lz4.h>
30 
31 static uint64_t zfs_crc64_table[256];
32 
33 #ifndef ASSERT3S	/* Proxy for all the assert defines */
34 #define	ASSERT3S(x, y, z)	((void)0)
35 #define	ASSERT3U(x, y, z)	((void)0)
36 #define	ASSERT3P(x, y, z)	((void)0)
37 #define	ASSERT0(x)		((void)0)
38 #define	ASSERT(x)		((void)0)
39 #endif
40 
41 #define	panic(...)	do {						\
42 	printf(__VA_ARGS__);						\
43 	for (;;) ;							\
44 } while (0)
45 
46 static void
47 zfs_init_crc(void)
48 {
49 	int i, j;
50 	uint64_t *ct;
51 
52 	/*
53 	 * Calculate the crc64 table (used for the zap hash
54 	 * function).
55 	 */
56 	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
57 		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
58 		for (i = 0; i < 256; i++)
59 			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
60 				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
61 	}
62 }
63 
64 static void
65 zio_checksum_off(const void *buf, uint64_t size,
66     const void *ctx_template, zio_cksum_t *zcp)
67 {
68 	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
69 }
70 
71 /*
72  * Signature for checksum functions.
73  */
74 typedef void zio_checksum_t(const void *data, uint64_t size,
75     const void *ctx_template, zio_cksum_t *zcp);
76 typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
77 typedef void zio_checksum_tmpl_free_t(void *ctx_template);
78 
79 typedef enum zio_checksum_flags {
80 	/* Strong enough for metadata? */
81 	ZCHECKSUM_FLAG_METADATA = (1 << 1),
82 	/* ZIO embedded checksum */
83 	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
84 	/* Strong enough for dedup (without verification)? */
85 	ZCHECKSUM_FLAG_DEDUP = (1 << 3),
86 	/* Uses salt value */
87 	ZCHECKSUM_FLAG_SALTED = (1 << 4),
88 	/* Strong enough for nopwrite? */
89 	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
90 } zio_checksum_flags_t;
91 
92 /*
93  * Information about each checksum function.
94  */
95 typedef struct zio_checksum_info {
96 	/* checksum function for each byteorder */
97 	zio_checksum_t			*ci_func[2];
98 	zio_checksum_tmpl_init_t	*ci_tmpl_init;
99 	zio_checksum_tmpl_free_t	*ci_tmpl_free;
100 	zio_checksum_flags_t		ci_flags;
101 	const char			*ci_name;	/* descriptive name */
102 } zio_checksum_info_t;
103 
104 #include "blkptr.c"
105 
106 #include "fletcher.c"
107 #include "blake3_zfs.c"
108 #include "sha256.c"
109 #include "skein_zfs.c"
110 
111 #ifdef HAS_ZSTD_ZFS
112 extern int zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len,
113     size_t d_len, int n);
114 #endif
115 
116 static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
117 	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
118 	{{NULL, NULL}, NULL, NULL, 0, "on"},
119 	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL, 0, "off"},
120 	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
121 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
122 	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
123 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
124 	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
125 	    ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
126 	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
127 	    0, "fletcher2"},
128 	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
129 	    ZCHECKSUM_FLAG_METADATA, "fletcher4"},
130 	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
131 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
132 	    ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
133 	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
134 	    ZCHECKSUM_FLAG_EMBEDDED, "zillog2"},
135 	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL,
136 	    0, "noparity"},
137 	{{zio_checksum_SHA512_native,	zio_checksum_SHA512_byteswap},
138 	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
139 	    ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
140 	{{zio_checksum_skein_native, zio_checksum_skein_byteswap},
141 	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
142 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
143 	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
144 	/* no edonr for now */
145 	{{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA |
146 	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
147 	{{zio_checksum_blake3_native,	zio_checksum_blake3_byteswap},
148 	    zio_checksum_blake3_tmpl_init, zio_checksum_blake3_tmpl_free,
149 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
150 	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "blake3"}
151 };
152 
153 /*
154  * Common signature for all zio compress/decompress functions.
155  */
156 typedef size_t zio_compress_func_t(void *src, void *dst,
157     size_t s_len, size_t d_len, int);
158 typedef int zio_decompress_func_t(void *src, void *dst,
159     size_t s_len, size_t d_len, int);
160 
161 /*
162  * Information about each compression function.
163  */
164 typedef struct zio_compress_info {
165 	zio_compress_func_t	*ci_compress;	/* compression function */
166 	zio_decompress_func_t	*ci_decompress;	/* decompression function */
167 	int			ci_level;	/* level parameter */
168 	const char		*ci_name;	/* algorithm name */
169 } zio_compress_info_t;
170 
171 #include "lzjb.c"
172 #include "zle.c"
173 #include "gzip.c"
174 
175 /*
176  * Compression vectors.
177  */
178 static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
179 	{NULL,			NULL,			0,	"inherit"},
180 	{NULL,			NULL,			0,	"on"},
181 	{NULL,			NULL,			0,	"uncompressed"},
182 	{NULL,			lzjb_decompress,	0,	"lzjb"},
183 	{NULL,			NULL,			0,	"empty"},
184 	{NULL,			gzip_decompress,	1,	"gzip-1"},
185 	{NULL,			gzip_decompress,	2,	"gzip-2"},
186 	{NULL,			gzip_decompress,	3,	"gzip-3"},
187 	{NULL,			gzip_decompress,	4,	"gzip-4"},
188 	{NULL,			gzip_decompress,	5,	"gzip-5"},
189 	{NULL,			gzip_decompress,	6,	"gzip-6"},
190 	{NULL,			gzip_decompress,	7,	"gzip-7"},
191 	{NULL,			gzip_decompress,	8,	"gzip-8"},
192 	{NULL,			gzip_decompress,	9,	"gzip-9"},
193 	{NULL,			zle_decompress,		64,	"zle"},
194 	{NULL,			lz4_decompress,		0,	"lz4"},
195 #ifdef HAS_ZSTD_ZFS
196 	{NULL,			zfs_zstd_decompress, ZIO_ZSTD_LEVEL_DEFAULT, "zstd"}
197 #endif
198 };
199 
200 static void
201 byteswap_uint64_array(void *vbuf, size_t size)
202 {
203 	uint64_t *buf = vbuf;
204 	size_t count = size >> 3;
205 	int i;
206 
207 	ASSERT((size & 7) == 0);
208 
209 	for (i = 0; i < count; i++)
210 		buf[i] = BSWAP_64(buf[i]);
211 }
212 
213 /*
214  * Set the external verifier for a gang block based on <vdev, offset, txg>,
215  * a tuple which is guaranteed to be unique for the life of the pool.
216  */
217 static void
218 zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
219 {
220 	const dva_t *dva = BP_IDENTITY(bp);
221 	uint64_t txg = BP_PHYSICAL_BIRTH(bp);
222 
223 	ASSERT(BP_IS_GANG(bp));
224 
225 	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
226 }
227 
228 /*
229  * Set the external verifier for a label block based on its offset.
230  * The vdev is implicit, and the txg is unknowable at pool open time --
231  * hence the logic in vdev_uberblock_load() to find the most recent copy.
232  */
233 static void
234 zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
235 {
236 	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
237 }
238 
239 /*
240  * Calls the template init function of a checksum which supports context
241  * templates and installs the template into the spa_t.
242  */
243 static void
244 zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
245 {
246 	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
247 
248 	if (ci->ci_tmpl_init == NULL)
249 		return;
250 
251 	if (spa->spa_cksum_tmpls[checksum] != NULL)
252 		return;
253 
254 	if (spa->spa_cksum_tmpls[checksum] == NULL) {
255 		spa->spa_cksum_tmpls[checksum] =
256 		    ci->ci_tmpl_init(&spa->spa_cksum_salt);
257 	}
258 }
259 
260 /*
261  * Called by a spa_t that's about to be deallocated. This steps through
262  * all of the checksum context templates and deallocates any that were
263  * initialized using the algorithm-specific template init function.
264  */
265 static void __unused
266 zio_checksum_templates_free(spa_t *spa)
267 {
268 	for (enum zio_checksum checksum = 0;
269 	    checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
270 		if (spa->spa_cksum_tmpls[checksum] != NULL) {
271 			zio_checksum_info_t *ci = &zio_checksum_table[checksum];
272 
273 			ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
274 			spa->spa_cksum_tmpls[checksum] = NULL;
275 		}
276 	}
277 }
278 
279 static int
280 zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
281 {
282 	uint64_t size;
283 	unsigned int checksum;
284 	zio_checksum_info_t *ci;
285 	void *ctx = NULL;
286 	zio_cksum_t actual_cksum, expected_cksum, verifier;
287 	int byteswap;
288 
289 	checksum = BP_GET_CHECKSUM(bp);
290 	size = BP_GET_PSIZE(bp);
291 
292 	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
293 		return (EINVAL);
294 	ci = &zio_checksum_table[checksum];
295 	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
296 		return (EINVAL);
297 
298 	if (spa != NULL) {
299 		zio_checksum_template_init(checksum, __DECONST(spa_t *,spa));
300 		ctx = spa->spa_cksum_tmpls[checksum];
301 	}
302 
303 	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
304 		zio_eck_t *eck;
305 
306 		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
307 		    checksum == ZIO_CHECKSUM_LABEL);
308 
309 		eck = (zio_eck_t *)((char *)data + size) - 1;
310 
311 		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
312 			zio_checksum_gang_verifier(&verifier, bp);
313 		else if (checksum == ZIO_CHECKSUM_LABEL)
314 			zio_checksum_label_verifier(&verifier,
315 			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
316 		else
317 			verifier = bp->blk_cksum;
318 
319 		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
320 
321 		if (byteswap)
322 			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
323 
324 		expected_cksum = eck->zec_cksum;
325 		eck->zec_cksum = verifier;
326 		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
327 		eck->zec_cksum = expected_cksum;
328 
329 		if (byteswap)
330 			byteswap_uint64_array(&expected_cksum,
331 			    sizeof (zio_cksum_t));
332 	} else {
333 		byteswap = BP_SHOULD_BYTESWAP(bp);
334 		expected_cksum = bp->blk_cksum;
335 		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
336 	}
337 
338 	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
339 		/*printf("ZFS: read checksum %s failed\n", ci->ci_name);*/
340 		return (EIO);
341 	}
342 
343 	return (0);
344 }
345 
346 static int
347 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
348 	void *dest, uint64_t destsize)
349 {
350 	zio_compress_info_t *ci;
351 
352 	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
353 		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
354 		return (EIO);
355 	}
356 
357 	ci = &zio_compress_table[cpfunc];
358 	if (!ci->ci_decompress) {
359 		printf("ZFS: unsupported compression algorithm %s\n",
360 		    ci->ci_name);
361 		return (EIO);
362 	}
363 
364 	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
365 }
366 
367 static uint64_t
368 zap_hash(uint64_t salt, const char *name)
369 {
370 	const uint8_t *cp;
371 	uint8_t c;
372 	uint64_t crc = salt;
373 
374 	ASSERT(crc != 0);
375 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
376 	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
377 		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
378 
379 	/*
380 	 * Only use 28 bits, since we need 4 bits in the cookie for the
381 	 * collision differentiator.  We MUST use the high bits, since
382 	 * those are the onces that we first pay attention to when
383 	 * chosing the bucket.
384 	 */
385 	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
386 
387 	return (crc);
388 }
389 
390 typedef struct raidz_col {
391 	uint64_t rc_devidx;		/* child device index for I/O */
392 	uint64_t rc_offset;		/* device offset */
393 	uint64_t rc_size;		/* I/O size */
394 	void *rc_data;			/* I/O data */
395 	int rc_error;			/* I/O error for this device */
396 	uint8_t rc_tried;		/* Did we attempt this I/O column? */
397 	uint8_t rc_skipped;		/* Did we skip this I/O column? */
398 } raidz_col_t;
399 
400 typedef struct raidz_map {
401 	uint64_t rm_cols;		/* Regular column count */
402 	uint64_t rm_scols;		/* Count including skipped columns */
403 	uint64_t rm_bigcols;		/* Number of oversized columns */
404 	uint64_t rm_asize;		/* Actual total I/O size */
405 	uint64_t rm_missingdata;	/* Count of missing data devices */
406 	uint64_t rm_missingparity;	/* Count of missing parity devices */
407 	uint64_t rm_firstdatacol;	/* First data column/parity count */
408 	uint64_t rm_nskip;		/* Skipped sectors for padding */
409 	uint64_t rm_skipstart;		/* Column index of padding start */
410 	uintptr_t rm_reports;		/* # of referencing checksum reports */
411 	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
412 	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
413 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
414 } raidz_map_t;
415 
416 #define	VDEV_RAIDZ_P		0
417 #define	VDEV_RAIDZ_Q		1
418 #define	VDEV_RAIDZ_R		2
419 
420 #define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
421 #define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
422 
423 /*
424  * We provide a mechanism to perform the field multiplication operation on a
425  * 64-bit value all at once rather than a byte at a time. This works by
426  * creating a mask from the top bit in each byte and using that to
427  * conditionally apply the XOR of 0x1d.
428  */
429 #define	VDEV_RAIDZ_64MUL_2(x, mask) \
430 { \
431 	(mask) = (x) & 0x8080808080808080ULL; \
432 	(mask) = ((mask) << 1) - ((mask) >> 7); \
433 	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
434 	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
435 }
436 
437 #define	VDEV_RAIDZ_64MUL_4(x, mask) \
438 { \
439 	VDEV_RAIDZ_64MUL_2((x), mask); \
440 	VDEV_RAIDZ_64MUL_2((x), mask); \
441 }
442 
443 /*
444  * These two tables represent powers and logs of 2 in the Galois field defined
445  * above. These values were computed by repeatedly multiplying by 2 as above.
446  */
447 static const uint8_t vdev_raidz_pow2[256] = {
448 	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
449 	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
450 	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
451 	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
452 	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
453 	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
454 	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
455 	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
456 	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
457 	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
458 	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
459 	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
460 	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
461 	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
462 	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
463 	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
464 	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
465 	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
466 	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
467 	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
468 	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
469 	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
470 	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
471 	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
472 	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
473 	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
474 	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
475 	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
476 	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
477 	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
478 	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
479 	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
480 };
481 static const uint8_t vdev_raidz_log2[256] = {
482 	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
483 	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
484 	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
485 	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
486 	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
487 	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
488 	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
489 	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
490 	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
491 	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
492 	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
493 	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
494 	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
495 	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
496 	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
497 	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
498 	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
499 	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
500 	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
501 	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
502 	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
503 	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
504 	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
505 	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
506 	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
507 	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
508 	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
509 	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
510 	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
511 	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
512 	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
513 	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
514 };
515 
516 /*
517  * Multiply a given number by 2 raised to the given power.
518  */
519 static uint8_t
520 vdev_raidz_exp2(uint8_t a, int exp)
521 {
522 	if (a == 0)
523 		return (0);
524 
525 	ASSERT(exp >= 0);
526 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
527 
528 	exp += vdev_raidz_log2[a];
529 	if (exp > 255)
530 		exp -= 255;
531 
532 	return (vdev_raidz_pow2[exp]);
533 }
534 
535 static void
536 vdev_raidz_generate_parity_p(raidz_map_t *rm)
537 {
538 	uint64_t *p, *src, ccount, i;
539 	uint64_t pcount __unused;
540 	int c;
541 
542 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
543 
544 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
545 		src = rm->rm_col[c].rc_data;
546 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
547 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
548 
549 		if (c == rm->rm_firstdatacol) {
550 			ASSERT(ccount == pcount);
551 			for (i = 0; i < ccount; i++, src++, p++) {
552 				*p = *src;
553 			}
554 		} else {
555 			ASSERT(ccount <= pcount);
556 			for (i = 0; i < ccount; i++, src++, p++) {
557 				*p ^= *src;
558 			}
559 		}
560 	}
561 }
562 
563 static void
564 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
565 {
566 	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
567 	int c;
568 
569 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
570 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
571 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
572 
573 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
574 		src = rm->rm_col[c].rc_data;
575 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
576 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
577 
578 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
579 
580 		if (c == rm->rm_firstdatacol) {
581 			ASSERT(ccnt == pcnt || ccnt == 0);
582 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
583 				*p = *src;
584 				*q = *src;
585 			}
586 			for (; i < pcnt; i++, src++, p++, q++) {
587 				*p = 0;
588 				*q = 0;
589 			}
590 		} else {
591 			ASSERT(ccnt <= pcnt);
592 
593 			/*
594 			 * Apply the algorithm described above by multiplying
595 			 * the previous result and adding in the new value.
596 			 */
597 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
598 				*p ^= *src;
599 
600 				VDEV_RAIDZ_64MUL_2(*q, mask);
601 				*q ^= *src;
602 			}
603 
604 			/*
605 			 * Treat short columns as though they are full of 0s.
606 			 * Note that there's therefore nothing needed for P.
607 			 */
608 			for (; i < pcnt; i++, q++) {
609 				VDEV_RAIDZ_64MUL_2(*q, mask);
610 			}
611 		}
612 	}
613 }
614 
615 static void
616 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
617 {
618 	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
619 	int c;
620 
621 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
622 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
623 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
624 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
625 	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
626 
627 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
628 		src = rm->rm_col[c].rc_data;
629 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
630 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
631 		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
632 
633 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
634 
635 		if (c == rm->rm_firstdatacol) {
636 			ASSERT(ccnt == pcnt || ccnt == 0);
637 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
638 				*p = *src;
639 				*q = *src;
640 				*r = *src;
641 			}
642 			for (; i < pcnt; i++, src++, p++, q++, r++) {
643 				*p = 0;
644 				*q = 0;
645 				*r = 0;
646 			}
647 		} else {
648 			ASSERT(ccnt <= pcnt);
649 
650 			/*
651 			 * Apply the algorithm described above by multiplying
652 			 * the previous result and adding in the new value.
653 			 */
654 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
655 				*p ^= *src;
656 
657 				VDEV_RAIDZ_64MUL_2(*q, mask);
658 				*q ^= *src;
659 
660 				VDEV_RAIDZ_64MUL_4(*r, mask);
661 				*r ^= *src;
662 			}
663 
664 			/*
665 			 * Treat short columns as though they are full of 0s.
666 			 * Note that there's therefore nothing needed for P.
667 			 */
668 			for (; i < pcnt; i++, q++, r++) {
669 				VDEV_RAIDZ_64MUL_2(*q, mask);
670 				VDEV_RAIDZ_64MUL_4(*r, mask);
671 			}
672 		}
673 	}
674 }
675 
676 /*
677  * Generate RAID parity in the first virtual columns according to the number of
678  * parity columns available.
679  */
680 static void
681 vdev_raidz_generate_parity(raidz_map_t *rm)
682 {
683 	switch (rm->rm_firstdatacol) {
684 	case 1:
685 		vdev_raidz_generate_parity_p(rm);
686 		break;
687 	case 2:
688 		vdev_raidz_generate_parity_pq(rm);
689 		break;
690 	case 3:
691 		vdev_raidz_generate_parity_pqr(rm);
692 		break;
693 	default:
694 		panic("invalid RAID-Z configuration");
695 	}
696 }
697 
698 /* BEGIN CSTYLED */
699 /*
700  * In the general case of reconstruction, we must solve the system of linear
701  * equations defined by the coeffecients used to generate parity as well as
702  * the contents of the data and parity disks. This can be expressed with
703  * vectors for the original data (D) and the actual data (d) and parity (p)
704  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
705  *
706  *            __   __                     __     __
707  *            |     |         __     __   |  p_0  |
708  *            |  V  |         |  D_0  |   | p_m-1 |
709  *            |     |    x    |   :   | = |  d_0  |
710  *            |  I  |         | D_n-1 |   |   :   |
711  *            |     |         ~~     ~~   | d_n-1 |
712  *            ~~   ~~                     ~~     ~~
713  *
714  * I is simply a square identity matrix of size n, and V is a vandermonde
715  * matrix defined by the coeffecients we chose for the various parity columns
716  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
717  * computation as well as linear separability.
718  *
719  *      __               __               __     __
720  *      |   1   ..  1 1 1 |               |  p_0  |
721  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
722  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
723  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
724  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
725  *      |   :       : : : |   |   :   |   |  d_2  |
726  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
727  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
728  *      |   0   ..  0 0 1 |               | d_n-1 |
729  *      ~~               ~~               ~~     ~~
730  *
731  * Note that I, V, d, and p are known. To compute D, we must invert the
732  * matrix and use the known data and parity values to reconstruct the unknown
733  * data values. We begin by removing the rows in V|I and d|p that correspond
734  * to failed or missing columns; we then make V|I square (n x n) and d|p
735  * sized n by removing rows corresponding to unused parity from the bottom up
736  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
737  * using Gauss-Jordan elimination. In the example below we use m=3 parity
738  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
739  *           __                               __
740  *           |  1   1   1   1   1   1   1   1  |
741  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
742  *           |  19 205 116  29  64  16  4   1  |      / /
743  *           |  1   0   0   0   0   0   0   0  |     / /
744  *           |  0   1   0   0   0   0   0   0  | <--' /
745  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
746  *           |  0   0   0   1   0   0   0   0  |
747  *           |  0   0   0   0   1   0   0   0  |
748  *           |  0   0   0   0   0   1   0   0  |
749  *           |  0   0   0   0   0   0   1   0  |
750  *           |  0   0   0   0   0   0   0   1  |
751  *           ~~                               ~~
752  *           __                               __
753  *           |  1   1   1   1   1   1   1   1  |
754  *           | 128  64  32  16  8   4   2   1  |
755  *           |  19 205 116  29  64  16  4   1  |
756  *           |  1   0   0   0   0   0   0   0  |
757  *           |  0   1   0   0   0   0   0   0  |
758  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
759  *           |  0   0   0   1   0   0   0   0  |
760  *           |  0   0   0   0   1   0   0   0  |
761  *           |  0   0   0   0   0   1   0   0  |
762  *           |  0   0   0   0   0   0   1   0  |
763  *           |  0   0   0   0   0   0   0   1  |
764  *           ~~                               ~~
765  *
766  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
767  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
768  * matrix is not singular.
769  * __                                                                 __
770  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
771  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
772  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
773  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
774  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
775  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
776  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
777  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
778  * ~~                                                                 ~~
779  * __                                                                 __
780  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
781  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
782  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
783  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
784  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
785  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
786  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
787  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
788  * ~~                                                                 ~~
789  * __                                                                 __
790  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
791  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
792  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
793  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
794  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
795  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
796  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
797  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
798  * ~~                                                                 ~~
799  * __                                                                 __
800  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
801  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
802  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
803  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
804  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
805  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
806  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
807  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
808  * ~~                                                                 ~~
809  * __                                                                 __
810  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
811  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
812  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
813  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
814  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
815  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
816  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
817  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
818  * ~~                                                                 ~~
819  * __                                                                 __
820  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
821  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
822  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
823  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
824  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
825  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
826  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
827  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
828  * ~~                                                                 ~~
829  *                   __                               __
830  *                   |  0   0   1   0   0   0   0   0  |
831  *                   | 167 100  5   41 159 169 217 208 |
832  *                   | 166 100  4   40 158 168 216 209 |
833  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
834  *                   |  0   0   0   0   1   0   0   0  |
835  *                   |  0   0   0   0   0   1   0   0  |
836  *                   |  0   0   0   0   0   0   1   0  |
837  *                   |  0   0   0   0   0   0   0   1  |
838  *                   ~~                               ~~
839  *
840  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
841  * of the missing data.
842  *
843  * As is apparent from the example above, the only non-trivial rows in the
844  * inverse matrix correspond to the data disks that we're trying to
845  * reconstruct. Indeed, those are the only rows we need as the others would
846  * only be useful for reconstructing data known or assumed to be valid. For
847  * that reason, we only build the coefficients in the rows that correspond to
848  * targeted columns.
849  */
850 /* END CSTYLED */
851 
852 static void
853 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
854     uint8_t **rows)
855 {
856 	int i, j;
857 	int pow;
858 
859 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
860 
861 	/*
862 	 * Fill in the missing rows of interest.
863 	 */
864 	for (i = 0; i < nmap; i++) {
865 		ASSERT3S(0, <=, map[i]);
866 		ASSERT3S(map[i], <=, 2);
867 
868 		pow = map[i] * n;
869 		if (pow > 255)
870 			pow -= 255;
871 		ASSERT(pow <= 255);
872 
873 		for (j = 0; j < n; j++) {
874 			pow -= map[i];
875 			if (pow < 0)
876 				pow += 255;
877 			rows[i][j] = vdev_raidz_pow2[pow];
878 		}
879 	}
880 }
881 
882 static void
883 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
884     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
885 {
886 	int i, j, ii, jj;
887 	uint8_t log;
888 
889 	/*
890 	 * Assert that the first nmissing entries from the array of used
891 	 * columns correspond to parity columns and that subsequent entries
892 	 * correspond to data columns.
893 	 */
894 	for (i = 0; i < nmissing; i++) {
895 		ASSERT3S(used[i], <, rm->rm_firstdatacol);
896 	}
897 	for (; i < n; i++) {
898 		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
899 	}
900 
901 	/*
902 	 * First initialize the storage where we'll compute the inverse rows.
903 	 */
904 	for (i = 0; i < nmissing; i++) {
905 		for (j = 0; j < n; j++) {
906 			invrows[i][j] = (i == j) ? 1 : 0;
907 		}
908 	}
909 
910 	/*
911 	 * Subtract all trivial rows from the rows of consequence.
912 	 */
913 	for (i = 0; i < nmissing; i++) {
914 		for (j = nmissing; j < n; j++) {
915 			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
916 			jj = used[j] - rm->rm_firstdatacol;
917 			ASSERT3S(jj, <, n);
918 			invrows[i][j] = rows[i][jj];
919 			rows[i][jj] = 0;
920 		}
921 	}
922 
923 	/*
924 	 * For each of the rows of interest, we must normalize it and subtract
925 	 * a multiple of it from the other rows.
926 	 */
927 	for (i = 0; i < nmissing; i++) {
928 		for (j = 0; j < missing[i]; j++) {
929 			ASSERT3U(rows[i][j], ==, 0);
930 		}
931 		ASSERT3U(rows[i][missing[i]], !=, 0);
932 
933 		/*
934 		 * Compute the inverse of the first element and multiply each
935 		 * element in the row by that value.
936 		 */
937 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
938 
939 		for (j = 0; j < n; j++) {
940 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
941 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
942 		}
943 
944 		for (ii = 0; ii < nmissing; ii++) {
945 			if (i == ii)
946 				continue;
947 
948 			ASSERT3U(rows[ii][missing[i]], !=, 0);
949 
950 			log = vdev_raidz_log2[rows[ii][missing[i]]];
951 
952 			for (j = 0; j < n; j++) {
953 				rows[ii][j] ^=
954 				    vdev_raidz_exp2(rows[i][j], log);
955 				invrows[ii][j] ^=
956 				    vdev_raidz_exp2(invrows[i][j], log);
957 			}
958 		}
959 	}
960 
961 	/*
962 	 * Verify that the data that is left in the rows are properly part of
963 	 * an identity matrix.
964 	 */
965 	for (i = 0; i < nmissing; i++) {
966 		for (j = 0; j < n; j++) {
967 			if (j == missing[i]) {
968 				ASSERT3U(rows[i][j], ==, 1);
969 			} else {
970 				ASSERT3U(rows[i][j], ==, 0);
971 			}
972 		}
973 	}
974 }
975 
976 static void
977 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
978     int *missing, uint8_t **invrows, const uint8_t *used)
979 {
980 	int i, j, x, cc, c;
981 	uint8_t *src;
982 	uint64_t ccount;
983 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
984 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
985 	uint8_t log, val;
986 	int ll;
987 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
988 	uint8_t *p, *pp;
989 	size_t psize;
990 
991 	log = 0;	/* gcc */
992 	psize = sizeof (invlog[0][0]) * n * nmissing;
993 	p = malloc(psize);
994 	if (p == NULL) {
995 		printf("Out of memory\n");
996 		return;
997 	}
998 
999 	for (pp = p, i = 0; i < nmissing; i++) {
1000 		invlog[i] = pp;
1001 		pp += n;
1002 	}
1003 
1004 	for (i = 0; i < nmissing; i++) {
1005 		for (j = 0; j < n; j++) {
1006 			ASSERT3U(invrows[i][j], !=, 0);
1007 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1008 		}
1009 	}
1010 
1011 	for (i = 0; i < n; i++) {
1012 		c = used[i];
1013 		ASSERT3U(c, <, rm->rm_cols);
1014 
1015 		src = rm->rm_col[c].rc_data;
1016 		ccount = rm->rm_col[c].rc_size;
1017 		for (j = 0; j < nmissing; j++) {
1018 			cc = missing[j] + rm->rm_firstdatacol;
1019 			ASSERT3U(cc, >=, rm->rm_firstdatacol);
1020 			ASSERT3U(cc, <, rm->rm_cols);
1021 			ASSERT3U(cc, !=, c);
1022 
1023 			dst[j] = rm->rm_col[cc].rc_data;
1024 			dcount[j] = rm->rm_col[cc].rc_size;
1025 		}
1026 
1027 		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1028 
1029 		for (x = 0; x < ccount; x++, src++) {
1030 			if (*src != 0)
1031 				log = vdev_raidz_log2[*src];
1032 
1033 			for (cc = 0; cc < nmissing; cc++) {
1034 				if (x >= dcount[cc])
1035 					continue;
1036 
1037 				if (*src == 0) {
1038 					val = 0;
1039 				} else {
1040 					if ((ll = log + invlog[cc][i]) >= 255)
1041 						ll -= 255;
1042 					val = vdev_raidz_pow2[ll];
1043 				}
1044 
1045 				if (i == 0)
1046 					dst[cc][x] = val;
1047 				else
1048 					dst[cc][x] ^= val;
1049 			}
1050 		}
1051 	}
1052 
1053 	free(p);
1054 }
1055 
1056 static int
1057 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1058 {
1059 	int n, i, c, t, tt;
1060 	int nmissing_rows;
1061 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1062 	int parity_map[VDEV_RAIDZ_MAXPARITY];
1063 
1064 	uint8_t *p, *pp;
1065 	size_t psize;
1066 
1067 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1068 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1069 	uint8_t *used;
1070 
1071 	int code = 0;
1072 
1073 
1074 	n = rm->rm_cols - rm->rm_firstdatacol;
1075 
1076 	/*
1077 	 * Figure out which data columns are missing.
1078 	 */
1079 	nmissing_rows = 0;
1080 	for (t = 0; t < ntgts; t++) {
1081 		if (tgts[t] >= rm->rm_firstdatacol) {
1082 			missing_rows[nmissing_rows++] =
1083 			    tgts[t] - rm->rm_firstdatacol;
1084 		}
1085 	}
1086 
1087 	/*
1088 	 * Figure out which parity columns to use to help generate the missing
1089 	 * data columns.
1090 	 */
1091 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1092 		ASSERT(tt < ntgts);
1093 		ASSERT(c < rm->rm_firstdatacol);
1094 
1095 		/*
1096 		 * Skip any targeted parity columns.
1097 		 */
1098 		if (c == tgts[tt]) {
1099 			tt++;
1100 			continue;
1101 		}
1102 
1103 		code |= 1 << c;
1104 
1105 		parity_map[i] = c;
1106 		i++;
1107 	}
1108 
1109 	ASSERT(code != 0);
1110 	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1111 
1112 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1113 	    nmissing_rows * n + sizeof (used[0]) * n;
1114 	p = malloc(psize);
1115 	if (p == NULL) {
1116 		printf("Out of memory\n");
1117 		return (code);
1118 	}
1119 
1120 	for (pp = p, i = 0; i < nmissing_rows; i++) {
1121 		rows[i] = pp;
1122 		pp += n;
1123 		invrows[i] = pp;
1124 		pp += n;
1125 	}
1126 	used = pp;
1127 
1128 	for (i = 0; i < nmissing_rows; i++) {
1129 		used[i] = parity_map[i];
1130 	}
1131 
1132 	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1133 		if (tt < nmissing_rows &&
1134 		    c == missing_rows[tt] + rm->rm_firstdatacol) {
1135 			tt++;
1136 			continue;
1137 		}
1138 
1139 		ASSERT3S(i, <, n);
1140 		used[i] = c;
1141 		i++;
1142 	}
1143 
1144 	/*
1145 	 * Initialize the interesting rows of the matrix.
1146 	 */
1147 	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1148 
1149 	/*
1150 	 * Invert the matrix.
1151 	 */
1152 	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1153 	    invrows, used);
1154 
1155 	/*
1156 	 * Reconstruct the missing data using the generated matrix.
1157 	 */
1158 	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1159 	    invrows, used);
1160 
1161 	free(p);
1162 
1163 	return (code);
1164 }
1165 
1166 static int
1167 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1168 {
1169 	int tgts[VDEV_RAIDZ_MAXPARITY];
1170 	int ntgts;
1171 	int i, c;
1172 	int code;
1173 	int nbadparity, nbaddata;
1174 
1175 	/*
1176 	 * The tgts list must already be sorted.
1177 	 */
1178 	for (i = 1; i < nt; i++) {
1179 		ASSERT(t[i] > t[i - 1]);
1180 	}
1181 
1182 	nbadparity = rm->rm_firstdatacol;
1183 	nbaddata = rm->rm_cols - nbadparity;
1184 	ntgts = 0;
1185 	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1186 		if (i < nt && c == t[i]) {
1187 			tgts[ntgts++] = c;
1188 			i++;
1189 		} else if (rm->rm_col[c].rc_error != 0) {
1190 			tgts[ntgts++] = c;
1191 		} else if (c >= rm->rm_firstdatacol) {
1192 			nbaddata--;
1193 		} else {
1194 			nbadparity--;
1195 		}
1196 	}
1197 
1198 	ASSERT(ntgts >= nt);
1199 	ASSERT(nbaddata >= 0);
1200 	ASSERT(nbaddata + nbadparity == ntgts);
1201 
1202 	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1203 	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1204 	ASSERT(code > 0);
1205 	return (code);
1206 }
1207 
1208 static raidz_map_t *
1209 vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
1210     uint64_t dcols, uint64_t nparity)
1211 {
1212 	raidz_map_t *rm;
1213 	uint64_t b = offset >> unit_shift;
1214 	uint64_t s = size >> unit_shift;
1215 	uint64_t f = b % dcols;
1216 	uint64_t o = (b / dcols) << unit_shift;
1217 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
1218 
1219 	q = s / (dcols - nparity);
1220 	r = s - q * (dcols - nparity);
1221 	bc = (r == 0 ? 0 : r + nparity);
1222 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
1223 
1224 	if (q == 0) {
1225 		acols = bc;
1226 		scols = MIN(dcols, roundup(bc, nparity + 1));
1227 	} else {
1228 		acols = dcols;
1229 		scols = dcols;
1230 	}
1231 
1232 	ASSERT3U(acols, <=, scols);
1233 
1234 	rm = malloc(offsetof(raidz_map_t, rm_col[scols]));
1235 	if (rm == NULL)
1236 		return (rm);
1237 
1238 	rm->rm_cols = acols;
1239 	rm->rm_scols = scols;
1240 	rm->rm_bigcols = bc;
1241 	rm->rm_skipstart = bc;
1242 	rm->rm_missingdata = 0;
1243 	rm->rm_missingparity = 0;
1244 	rm->rm_firstdatacol = nparity;
1245 	rm->rm_reports = 0;
1246 	rm->rm_freed = 0;
1247 	rm->rm_ecksuminjected = 0;
1248 
1249 	asize = 0;
1250 
1251 	for (c = 0; c < scols; c++) {
1252 		col = f + c;
1253 		coff = o;
1254 		if (col >= dcols) {
1255 			col -= dcols;
1256 			coff += 1ULL << unit_shift;
1257 		}
1258 		rm->rm_col[c].rc_devidx = col;
1259 		rm->rm_col[c].rc_offset = coff;
1260 		rm->rm_col[c].rc_data = NULL;
1261 		rm->rm_col[c].rc_error = 0;
1262 		rm->rm_col[c].rc_tried = 0;
1263 		rm->rm_col[c].rc_skipped = 0;
1264 
1265 		if (c >= acols)
1266 			rm->rm_col[c].rc_size = 0;
1267 		else if (c < bc)
1268 			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
1269 		else
1270 			rm->rm_col[c].rc_size = q << unit_shift;
1271 
1272 		asize += rm->rm_col[c].rc_size;
1273 	}
1274 
1275 	ASSERT3U(asize, ==, tot << unit_shift);
1276 	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
1277 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
1278 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
1279 	ASSERT3U(rm->rm_nskip, <=, nparity);
1280 
1281 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1282 		rm->rm_col[c].rc_data = malloc(rm->rm_col[c].rc_size);
1283 		if (rm->rm_col[c].rc_data == NULL) {
1284 			c++;
1285 			while (c != 0)
1286 				free(rm->rm_col[--c].rc_data);
1287 			free(rm);
1288 			return (NULL);
1289 		}
1290 	}
1291 
1292 	rm->rm_col[c].rc_data = data;
1293 
1294 	for (c = c + 1; c < acols; c++)
1295 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
1296 		    rm->rm_col[c - 1].rc_size;
1297 
1298 	/*
1299 	 * If all data stored spans all columns, there's a danger that parity
1300 	 * will always be on the same device and, since parity isn't read
1301 	 * during normal operation, that that device's I/O bandwidth won't be
1302 	 * used effectively. We therefore switch the parity every 1MB.
1303 	 *
1304 	 * ... at least that was, ostensibly, the theory. As a practical
1305 	 * matter unless we juggle the parity between all devices evenly, we
1306 	 * won't see any benefit. Further, occasional writes that aren't a
1307 	 * multiple of the LCM of the number of children and the minimum
1308 	 * stripe width are sufficient to avoid pessimal behavior.
1309 	 * Unfortunately, this decision created an implicit on-disk format
1310 	 * requirement that we need to support for all eternity, but only
1311 	 * for single-parity RAID-Z.
1312 	 *
1313 	 * If we intend to skip a sector in the zeroth column for padding
1314 	 * we must make sure to note this swap. We will never intend to
1315 	 * skip the first column since at least one data and one parity
1316 	 * column must appear in each row.
1317 	 */
1318 	ASSERT(rm->rm_cols >= 2);
1319 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
1320 
1321 	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
1322 		devidx = rm->rm_col[0].rc_devidx;
1323 		o = rm->rm_col[0].rc_offset;
1324 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
1325 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
1326 		rm->rm_col[1].rc_devidx = devidx;
1327 		rm->rm_col[1].rc_offset = o;
1328 
1329 		if (rm->rm_skipstart == 0)
1330 			rm->rm_skipstart = 1;
1331 	}
1332 
1333 	return (rm);
1334 }
1335 
1336 static void
1337 vdev_raidz_map_free(raidz_map_t *rm)
1338 {
1339 	int c;
1340 
1341 	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
1342 		free(rm->rm_col[c].rc_data);
1343 
1344 	free(rm);
1345 }
1346 
1347 static vdev_t *
1348 vdev_child(vdev_t *pvd, uint64_t devidx)
1349 {
1350 	vdev_t *cvd;
1351 
1352 	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
1353 		if (cvd->v_id == devidx)
1354 			break;
1355 	}
1356 
1357 	return (cvd);
1358 }
1359 
1360 /*
1361  * We keep track of whether or not there were any injected errors, so that
1362  * any ereports we generate can note it.
1363  */
1364 static int
1365 raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
1366     uint64_t size)
1367 {
1368 	return (zio_checksum_verify(spa, bp, data));
1369 }
1370 
1371 /*
1372  * Generate the parity from the data columns. If we tried and were able to
1373  * read the parity without error, verify that the generated parity matches the
1374  * data we read. If it doesn't, we fire off a checksum error. Return the
1375  * number such failures.
1376  */
1377 static int
1378 raidz_parity_verify(raidz_map_t *rm)
1379 {
1380 	void *orig[VDEV_RAIDZ_MAXPARITY];
1381 	int c, ret = 0;
1382 	raidz_col_t *rc;
1383 
1384 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1385 		rc = &rm->rm_col[c];
1386 		if (!rc->rc_tried || rc->rc_error != 0)
1387 			continue;
1388 		orig[c] = malloc(rc->rc_size);
1389 		if (orig[c] != NULL) {
1390 			bcopy(rc->rc_data, orig[c], rc->rc_size);
1391 		} else {
1392 			printf("Out of memory\n");
1393 		}
1394 	}
1395 
1396 	vdev_raidz_generate_parity(rm);
1397 
1398 	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
1399 		rc = &rm->rm_col[c];
1400 		if (!rc->rc_tried || rc->rc_error != 0)
1401 			continue;
1402 		if (orig[c] == NULL ||
1403 		    bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1404 			rc->rc_error = ECKSUM;
1405 			ret++;
1406 		}
1407 		free(orig[c]);
1408 	}
1409 
1410 	return (ret);
1411 }
1412 
1413 /*
1414  * Iterate over all combinations of bad data and attempt a reconstruction.
1415  * Note that the algorithm below is non-optimal because it doesn't take into
1416  * account how reconstruction is actually performed. For example, with
1417  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1418  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1419  * cases we'd only use parity information in column 0.
1420  */
1421 static int
1422 vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
1423     void *data, off_t offset, uint64_t bytes, int total_errors, int data_errors)
1424 {
1425 	raidz_col_t *rc;
1426 	void *orig[VDEV_RAIDZ_MAXPARITY];
1427 	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1428 	int *tgts = &tstore[1];
1429 	int current, next, i, c, n;
1430 	int code, ret = 0;
1431 
1432 	ASSERT(total_errors < rm->rm_firstdatacol);
1433 
1434 	/*
1435 	 * This simplifies one edge condition.
1436 	 */
1437 	tgts[-1] = -1;
1438 
1439 	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1440 		/*
1441 		 * Initialize the targets array by finding the first n columns
1442 		 * that contain no error.
1443 		 *
1444 		 * If there were no data errors, we need to ensure that we're
1445 		 * always explicitly attempting to reconstruct at least one
1446 		 * data column. To do this, we simply push the highest target
1447 		 * up into the data columns.
1448 		 */
1449 		for (c = 0, i = 0; i < n; i++) {
1450 			if (i == n - 1 && data_errors == 0 &&
1451 			    c < rm->rm_firstdatacol) {
1452 				c = rm->rm_firstdatacol;
1453 			}
1454 
1455 			while (rm->rm_col[c].rc_error != 0) {
1456 				c++;
1457 				ASSERT3S(c, <, rm->rm_cols);
1458 			}
1459 
1460 			tgts[i] = c++;
1461 		}
1462 
1463 		/*
1464 		 * Setting tgts[n] simplifies the other edge condition.
1465 		 */
1466 		tgts[n] = rm->rm_cols;
1467 
1468 		/*
1469 		 * These buffers were allocated in previous iterations.
1470 		 */
1471 		for (i = 0; i < n - 1; i++) {
1472 			ASSERT(orig[i] != NULL);
1473 		}
1474 
1475 		orig[n - 1] = malloc(rm->rm_col[0].rc_size);
1476 		if (orig[n - 1] == NULL) {
1477 			ret = ENOMEM;
1478 			goto done;
1479 		}
1480 
1481 		current = 0;
1482 		next = tgts[current];
1483 
1484 		while (current != n) {
1485 			tgts[current] = next;
1486 			current = 0;
1487 
1488 			/*
1489 			 * Save off the original data that we're going to
1490 			 * attempt to reconstruct.
1491 			 */
1492 			for (i = 0; i < n; i++) {
1493 				ASSERT(orig[i] != NULL);
1494 				c = tgts[i];
1495 				ASSERT3S(c, >=, 0);
1496 				ASSERT3S(c, <, rm->rm_cols);
1497 				rc = &rm->rm_col[c];
1498 				bcopy(rc->rc_data, orig[i], rc->rc_size);
1499 			}
1500 
1501 			/*
1502 			 * Attempt a reconstruction and exit the outer loop on
1503 			 * success.
1504 			 */
1505 			code = vdev_raidz_reconstruct(rm, tgts, n);
1506 			if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
1507 				for (i = 0; i < n; i++) {
1508 					c = tgts[i];
1509 					rc = &rm->rm_col[c];
1510 					ASSERT(rc->rc_error == 0);
1511 					rc->rc_error = ECKSUM;
1512 				}
1513 
1514 				ret = code;
1515 				goto done;
1516 			}
1517 
1518 			/*
1519 			 * Restore the original data.
1520 			 */
1521 			for (i = 0; i < n; i++) {
1522 				c = tgts[i];
1523 				rc = &rm->rm_col[c];
1524 				bcopy(orig[i], rc->rc_data, rc->rc_size);
1525 			}
1526 
1527 			do {
1528 				/*
1529 				 * Find the next valid column after the current
1530 				 * position..
1531 				 */
1532 				for (next = tgts[current] + 1;
1533 				    next < rm->rm_cols &&
1534 				    rm->rm_col[next].rc_error != 0; next++)
1535 					continue;
1536 
1537 				ASSERT(next <= tgts[current + 1]);
1538 
1539 				/*
1540 				 * If that spot is available, we're done here.
1541 				 */
1542 				if (next != tgts[current + 1])
1543 					break;
1544 
1545 				/*
1546 				 * Otherwise, find the next valid column after
1547 				 * the previous position.
1548 				 */
1549 				for (c = tgts[current - 1] + 1;
1550 				    rm->rm_col[c].rc_error != 0; c++)
1551 					continue;
1552 
1553 				tgts[current] = c;
1554 				current++;
1555 
1556 			} while (current != n);
1557 		}
1558 	}
1559 	n--;
1560 done:
1561 	for (i = n - 1; i >= 0; i--) {
1562 		free(orig[i]);
1563 	}
1564 
1565 	return (ret);
1566 }
1567 
1568 static int
1569 vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
1570     off_t offset, size_t bytes)
1571 {
1572 	vdev_t *tvd = vd->v_top;
1573 	vdev_t *cvd;
1574 	raidz_map_t *rm;
1575 	raidz_col_t *rc;
1576 	int c, error;
1577 	int unexpected_errors;
1578 	int parity_errors;
1579 	int parity_untried;
1580 	int data_errors;
1581 	int total_errors;
1582 	int n;
1583 	int tgts[VDEV_RAIDZ_MAXPARITY];
1584 	int code;
1585 
1586 	rc = NULL;	/* gcc */
1587 	error = 0;
1588 
1589 	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
1590 	    vd->v_nchildren, vd->v_nparity);
1591 	if (rm == NULL)
1592 		return (ENOMEM);
1593 
1594 	/*
1595 	 * Iterate over the columns in reverse order so that we hit the parity
1596 	 * last -- any errors along the way will force us to read the parity.
1597 	 */
1598 	for (c = rm->rm_cols - 1; c >= 0; c--) {
1599 		rc = &rm->rm_col[c];
1600 		cvd = vdev_child(vd, rc->rc_devidx);
1601 		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
1602 			if (c >= rm->rm_firstdatacol)
1603 				rm->rm_missingdata++;
1604 			else
1605 				rm->rm_missingparity++;
1606 			rc->rc_error = ENXIO;
1607 			rc->rc_tried = 1;	/* don't even try */
1608 			rc->rc_skipped = 1;
1609 			continue;
1610 		}
1611 #if 0		/* XXX: Too hard for the boot code. */
1612 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1613 			if (c >= rm->rm_firstdatacol)
1614 				rm->rm_missingdata++;
1615 			else
1616 				rm->rm_missingparity++;
1617 			rc->rc_error = ESTALE;
1618 			rc->rc_skipped = 1;
1619 			continue;
1620 		}
1621 #endif
1622 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
1623 			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
1624 			    rc->rc_offset, rc->rc_size);
1625 			rc->rc_tried = 1;
1626 			rc->rc_skipped = 0;
1627 		}
1628 	}
1629 
1630 reconstruct:
1631 	unexpected_errors = 0;
1632 	parity_errors = 0;
1633 	parity_untried = 0;
1634 	data_errors = 0;
1635 	total_errors = 0;
1636 
1637 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1638 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1639 
1640 	for (c = 0; c < rm->rm_cols; c++) {
1641 		rc = &rm->rm_col[c];
1642 
1643 		if (rc->rc_error) {
1644 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
1645 
1646 			if (c < rm->rm_firstdatacol)
1647 				parity_errors++;
1648 			else
1649 				data_errors++;
1650 
1651 			if (!rc->rc_skipped)
1652 				unexpected_errors++;
1653 
1654 			total_errors++;
1655 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1656 			parity_untried++;
1657 		}
1658 	}
1659 
1660 	/*
1661 	 * There are three potential phases for a read:
1662 	 *	1. produce valid data from the columns read
1663 	 *	2. read all disks and try again
1664 	 *	3. perform combinatorial reconstruction
1665 	 *
1666 	 * Each phase is progressively both more expensive and less likely to
1667 	 * occur. If we encounter more errors than we can repair or all phases
1668 	 * fail, we have no choice but to return an error.
1669 	 */
1670 
1671 	/*
1672 	 * If the number of errors we saw was correctable -- less than or equal
1673 	 * to the number of parity disks read -- attempt to produce data that
1674 	 * has a valid checksum. Naturally, this case applies in the absence of
1675 	 * any errors.
1676 	 */
1677 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1678 		int rv;
1679 
1680 		if (data_errors == 0) {
1681 			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
1682 			if (rv == 0) {
1683 				/*
1684 				 * If we read parity information (unnecessarily
1685 				 * as it happens since no reconstruction was
1686 				 * needed) regenerate and verify the parity.
1687 				 * We also regenerate parity when resilvering
1688 				 * so we can write it out to the failed device
1689 				 * later.
1690 				 */
1691 				if (parity_errors + parity_untried <
1692 				    rm->rm_firstdatacol) {
1693 					n = raidz_parity_verify(rm);
1694 					unexpected_errors += n;
1695 					ASSERT(parity_errors + n <=
1696 					    rm->rm_firstdatacol);
1697 				}
1698 				goto done;
1699 			}
1700 		} else {
1701 			/*
1702 			 * We either attempt to read all the parity columns or
1703 			 * none of them. If we didn't try to read parity, we
1704 			 * wouldn't be here in the correctable case. There must
1705 			 * also have been fewer parity errors than parity
1706 			 * columns or, again, we wouldn't be in this code path.
1707 			 */
1708 			ASSERT(parity_untried == 0);
1709 			ASSERT(parity_errors < rm->rm_firstdatacol);
1710 
1711 			/*
1712 			 * Identify the data columns that reported an error.
1713 			 */
1714 			n = 0;
1715 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1716 				rc = &rm->rm_col[c];
1717 				if (rc->rc_error != 0) {
1718 					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1719 					tgts[n++] = c;
1720 				}
1721 			}
1722 
1723 			ASSERT(rm->rm_firstdatacol >= n);
1724 
1725 			code = vdev_raidz_reconstruct(rm, tgts, n);
1726 
1727 			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
1728 			if (rv == 0) {
1729 				/*
1730 				 * If we read more parity disks than were used
1731 				 * for reconstruction, confirm that the other
1732 				 * parity disks produced correct data. This
1733 				 * routine is suboptimal in that it regenerates
1734 				 * the parity that we already used in addition
1735 				 * to the parity that we're attempting to
1736 				 * verify, but this should be a relatively
1737 				 * uncommon case, and can be optimized if it
1738 				 * becomes a problem. Note that we regenerate
1739 				 * parity when resilvering so we can write it
1740 				 * out to failed devices later.
1741 				 */
1742 				if (parity_errors < rm->rm_firstdatacol - n) {
1743 					n = raidz_parity_verify(rm);
1744 					unexpected_errors += n;
1745 					ASSERT(parity_errors + n <=
1746 					    rm->rm_firstdatacol);
1747 				}
1748 
1749 				goto done;
1750 			}
1751 		}
1752 	}
1753 
1754 	/*
1755 	 * This isn't a typical situation -- either we got a read
1756 	 * error or a child silently returned bad data. Read every
1757 	 * block so we can try again with as much data and parity as
1758 	 * we can track down. If we've already been through once
1759 	 * before, all children will be marked as tried so we'll
1760 	 * proceed to combinatorial reconstruction.
1761 	 */
1762 	unexpected_errors = 1;
1763 	rm->rm_missingdata = 0;
1764 	rm->rm_missingparity = 0;
1765 
1766 	n = 0;
1767 	for (c = 0; c < rm->rm_cols; c++) {
1768 		rc = &rm->rm_col[c];
1769 
1770 		if (rc->rc_tried)
1771 			continue;
1772 
1773 		cvd = vdev_child(vd, rc->rc_devidx);
1774 		ASSERT(cvd != NULL);
1775 		rc->rc_error = cvd->v_read(cvd, NULL,
1776 		    rc->rc_data, rc->rc_offset, rc->rc_size);
1777 		if (rc->rc_error == 0)
1778 			n++;
1779 		rc->rc_tried = 1;
1780 		rc->rc_skipped = 0;
1781 	}
1782 	/*
1783 	 * If we managed to read anything more, retry the
1784 	 * reconstruction.
1785 	 */
1786 	if (n > 0)
1787 		goto reconstruct;
1788 
1789 	/*
1790 	 * At this point we've attempted to reconstruct the data given the
1791 	 * errors we detected, and we've attempted to read all columns. There
1792 	 * must, therefore, be one or more additional problems -- silent errors
1793 	 * resulting in invalid data rather than explicit I/O errors resulting
1794 	 * in absent data. We check if there is enough additional data to
1795 	 * possibly reconstruct the data and then perform combinatorial
1796 	 * reconstruction over all possible combinations. If that fails,
1797 	 * we're cooked.
1798 	 */
1799 	if (total_errors > rm->rm_firstdatacol) {
1800 		error = EIO;
1801 	} else if (total_errors < rm->rm_firstdatacol &&
1802 	    (code = vdev_raidz_combrec(vd->v_spa, rm, bp, data, offset, bytes,
1803 	     total_errors, data_errors)) != 0) {
1804 		/*
1805 		 * If we didn't use all the available parity for the
1806 		 * combinatorial reconstruction, verify that the remaining
1807 		 * parity is correct.
1808 		 */
1809 		if (code != (1 << rm->rm_firstdatacol) - 1)
1810 			(void) raidz_parity_verify(rm);
1811 	} else {
1812 		/*
1813 		 * We're here because either:
1814 		 *
1815 		 *	total_errors == rm_first_datacol, or
1816 		 *	vdev_raidz_combrec() failed
1817 		 *
1818 		 * In either case, there is enough bad data to prevent
1819 		 * reconstruction.
1820 		 *
1821 		 * Start checksum ereports for all children which haven't
1822 		 * failed, and the IO wasn't speculative.
1823 		 */
1824 		error = ECKSUM;
1825 	}
1826 
1827 done:
1828 	vdev_raidz_map_free(rm);
1829 
1830 	return (error);
1831 }
1832