xref: /freebsd/sys/cddl/boot/zfs/zfssubr.c (revision e2df9bb44109577475aeb186e7186ac040f9bde1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/cdefs.h>
27 #include <lz4.h>
28 
29 static uint64_t zfs_crc64_table[256];
30 
31 #ifndef ASSERT3S	/* Proxy for all the assert defines */
32 #define	ASSERT3S(x, y, z)	((void)0)
33 #define	ASSERT3U(x, y, z)	((void)0)
34 #define	ASSERT3P(x, y, z)	((void)0)
35 #define	ASSERT0(x)		((void)0)
36 #define	ASSERT(x)		((void)0)
37 #endif
38 
39 #define	panic(...)	do {						\
40 	printf(__VA_ARGS__);						\
41 	for (;;) ;							\
42 } while (0)
43 
44 static void
zfs_init_crc(void)45 zfs_init_crc(void)
46 {
47 	int i, j;
48 	uint64_t *ct;
49 
50 	/*
51 	 * Calculate the crc64 table (used for the zap hash
52 	 * function).
53 	 */
54 	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
55 		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
56 		for (i = 0; i < 256; i++)
57 			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
58 				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
59 	}
60 }
61 
62 static void
zio_checksum_off(const void * buf,uint64_t size,const void * ctx_template,zio_cksum_t * zcp)63 zio_checksum_off(const void *buf, uint64_t size,
64     const void *ctx_template, zio_cksum_t *zcp)
65 {
66 	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
67 }
68 
69 /*
70  * Signature for checksum functions.
71  */
72 typedef void zio_checksum_t(const void *data, uint64_t size,
73     const void *ctx_template, zio_cksum_t *zcp);
74 typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
75 typedef void zio_checksum_tmpl_free_t(void *ctx_template);
76 
77 typedef enum zio_checksum_flags {
78 	/* Strong enough for metadata? */
79 	ZCHECKSUM_FLAG_METADATA = (1 << 1),
80 	/* ZIO embedded checksum */
81 	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
82 	/* Strong enough for dedup (without verification)? */
83 	ZCHECKSUM_FLAG_DEDUP = (1 << 3),
84 	/* Uses salt value */
85 	ZCHECKSUM_FLAG_SALTED = (1 << 4),
86 	/* Strong enough for nopwrite? */
87 	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
88 } zio_checksum_flags_t;
89 
90 /*
91  * Information about each checksum function.
92  */
93 typedef struct zio_checksum_info {
94 	/* checksum function for each byteorder */
95 	zio_checksum_t			*ci_func[2];
96 	zio_checksum_tmpl_init_t	*ci_tmpl_init;
97 	zio_checksum_tmpl_free_t	*ci_tmpl_free;
98 	zio_checksum_flags_t		ci_flags;
99 	const char			*ci_name;	/* descriptive name */
100 } zio_checksum_info_t;
101 
102 #include "blkptr.c"
103 
104 #include "fletcher.c"
105 #include "blake3_zfs.c"
106 #include "sha256.c"
107 #include "skein_zfs.c"
108 
109 #ifdef HAS_ZSTD_ZFS
110 extern int zfs_zstd_decompress_buf(void *s_start, void *d_start, size_t s_len,
111     size_t d_len, int n);
112 #endif
113 
114 static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
115 	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
116 	{{NULL, NULL}, NULL, NULL, 0, "on"},
117 	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL, 0, "off"},
118 	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
119 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "label"},
120 	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
121 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, "gang_header"},
122 	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
123 	    ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
124 	{{fletcher_2_native,	fletcher_2_byteswap}, NULL, NULL,
125 	    0, "fletcher2"},
126 	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
127 	    ZCHECKSUM_FLAG_METADATA, "fletcher4"},
128 	{{zio_checksum_SHA256,	zio_checksum_SHA256}, NULL, NULL,
129 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
130 	    ZCHECKSUM_FLAG_NOPWRITE, "SHA256"},
131 	{{fletcher_4_native,	fletcher_4_byteswap}, NULL, NULL,
132 	    ZCHECKSUM_FLAG_EMBEDDED, "zillog2"},
133 	{{zio_checksum_off,	zio_checksum_off}, NULL, NULL,
134 	    0, "noparity"},
135 	{{zio_checksum_SHA512_native,	zio_checksum_SHA512_byteswap},
136 	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
137 	    ZCHECKSUM_FLAG_NOPWRITE, "SHA512"},
138 	{{zio_checksum_skein_native, zio_checksum_skein_byteswap},
139 	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
140 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
141 	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
142 	/* no edonr for now */
143 	{{NULL, NULL}, NULL, NULL, ZCHECKSUM_FLAG_METADATA |
144 	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
145 	{{zio_checksum_blake3_native,	zio_checksum_blake3_byteswap},
146 	    zio_checksum_blake3_tmpl_init, zio_checksum_blake3_tmpl_free,
147 	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
148 	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "blake3"}
149 };
150 
151 /*
152  * Common signature for all zio compress/decompress functions.
153  */
154 typedef size_t zio_compress_func_t(void *src, void *dst,
155     size_t s_len, size_t d_len, int);
156 typedef int zio_decompress_func_t(void *src, void *dst,
157     size_t s_len, size_t d_len, int);
158 
159 /*
160  * Information about each compression function.
161  */
162 typedef struct zio_compress_info {
163 	zio_compress_func_t	*ci_compress;	/* compression function */
164 	zio_decompress_func_t	*ci_decompress;	/* decompression function */
165 	int			ci_level;	/* level parameter */
166 	const char		*ci_name;	/* algorithm name */
167 } zio_compress_info_t;
168 
169 #include "lzjb.c"
170 #include "zle.c"
171 #include "gzip.c"
172 
173 /*
174  * Compression vectors.
175  */
176 static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
177 	{NULL,			NULL,			0,	"inherit"},
178 	{NULL,			NULL,			0,	"on"},
179 	{NULL,			NULL,			0,	"uncompressed"},
180 	{NULL,			lzjb_decompress,	0,	"lzjb"},
181 	{NULL,			NULL,			0,	"empty"},
182 	{NULL,			gzip_decompress,	1,	"gzip-1"},
183 	{NULL,			gzip_decompress,	2,	"gzip-2"},
184 	{NULL,			gzip_decompress,	3,	"gzip-3"},
185 	{NULL,			gzip_decompress,	4,	"gzip-4"},
186 	{NULL,			gzip_decompress,	5,	"gzip-5"},
187 	{NULL,			gzip_decompress,	6,	"gzip-6"},
188 	{NULL,			gzip_decompress,	7,	"gzip-7"},
189 	{NULL,			gzip_decompress,	8,	"gzip-8"},
190 	{NULL,			gzip_decompress,	9,	"gzip-9"},
191 	{NULL,			zle_decompress,		64,	"zle"},
192 	{NULL,			lz4_decompress,		0,	"lz4"},
193 #ifdef HAS_ZSTD_ZFS
194 	{NULL,			zfs_zstd_decompress_buf, ZIO_ZSTD_LEVEL_DEFAULT, "zstd"}
195 #endif
196 };
197 
198 static void
byteswap_uint64_array(void * vbuf,size_t size)199 byteswap_uint64_array(void *vbuf, size_t size)
200 {
201 	uint64_t *buf = vbuf;
202 	size_t count = size >> 3;
203 	int i;
204 
205 	ASSERT((size & 7) == 0);
206 
207 	for (i = 0; i < count; i++)
208 		buf[i] = BSWAP_64(buf[i]);
209 }
210 
211 /*
212  * Set the external verifier for a gang block based on <vdev, offset, txg>,
213  * a tuple which is guaranteed to be unique for the life of the pool.
214  */
215 static void
zio_checksum_gang_verifier(zio_cksum_t * zcp,const blkptr_t * bp)216 zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp)
217 {
218 	const dva_t *dva = BP_IDENTITY(bp);
219 	uint64_t txg = BP_PHYSICAL_BIRTH(bp);
220 
221 	ASSERT(BP_IS_GANG(bp));
222 
223 	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
224 }
225 
226 /*
227  * Set the external verifier for a label block based on its offset.
228  * The vdev is implicit, and the txg is unknowable at pool open time --
229  * hence the logic in vdev_uberblock_load() to find the most recent copy.
230  */
231 static void
zio_checksum_label_verifier(zio_cksum_t * zcp,uint64_t offset)232 zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
233 {
234 	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
235 }
236 
237 /*
238  * Calls the template init function of a checksum which supports context
239  * templates and installs the template into the spa_t.
240  */
241 static void
zio_checksum_template_init(enum zio_checksum checksum,spa_t * spa)242 zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
243 {
244 	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
245 
246 	if (ci->ci_tmpl_init == NULL)
247 		return;
248 
249 	if (spa->spa_cksum_tmpls[checksum] != NULL)
250 		return;
251 
252 	if (spa->spa_cksum_tmpls[checksum] == NULL) {
253 		spa->spa_cksum_tmpls[checksum] =
254 		    ci->ci_tmpl_init(&spa->spa_cksum_salt);
255 	}
256 }
257 
258 /*
259  * Called by a spa_t that's about to be deallocated. This steps through
260  * all of the checksum context templates and deallocates any that were
261  * initialized using the algorithm-specific template init function.
262  */
263 static void __unused
zio_checksum_templates_free(spa_t * spa)264 zio_checksum_templates_free(spa_t *spa)
265 {
266 	for (enum zio_checksum checksum = 0;
267 	    checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
268 		if (spa->spa_cksum_tmpls[checksum] != NULL) {
269 			zio_checksum_info_t *ci = &zio_checksum_table[checksum];
270 
271 			ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
272 			spa->spa_cksum_tmpls[checksum] = NULL;
273 		}
274 	}
275 }
276 
277 static int
zio_checksum_verify(const spa_t * spa,const blkptr_t * bp,void * data)278 zio_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data)
279 {
280 	uint64_t size;
281 	unsigned int checksum;
282 	zio_checksum_info_t *ci;
283 	void *ctx = NULL;
284 	zio_cksum_t actual_cksum, expected_cksum, verifier;
285 	int byteswap;
286 
287 	checksum = BP_GET_CHECKSUM(bp);
288 	size = BP_GET_PSIZE(bp);
289 
290 	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
291 		return (EINVAL);
292 	ci = &zio_checksum_table[checksum];
293 	if (ci->ci_func[0] == NULL || ci->ci_func[1] == NULL)
294 		return (EINVAL);
295 
296 	if (spa != NULL) {
297 		zio_checksum_template_init(checksum, __DECONST(spa_t *,spa));
298 		ctx = spa->spa_cksum_tmpls[checksum];
299 	}
300 
301 	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
302 		zio_eck_t *eck;
303 
304 		ASSERT(checksum == ZIO_CHECKSUM_GANG_HEADER ||
305 		    checksum == ZIO_CHECKSUM_LABEL);
306 
307 		eck = (zio_eck_t *)((char *)data + size) - 1;
308 
309 		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
310 			zio_checksum_gang_verifier(&verifier, bp);
311 		else if (checksum == ZIO_CHECKSUM_LABEL)
312 			zio_checksum_label_verifier(&verifier,
313 			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
314 		else
315 			verifier = bp->blk_cksum;
316 
317 		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
318 
319 		if (byteswap)
320 			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
321 
322 		expected_cksum = eck->zec_cksum;
323 		eck->zec_cksum = verifier;
324 		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
325 		eck->zec_cksum = expected_cksum;
326 
327 		if (byteswap)
328 			byteswap_uint64_array(&expected_cksum,
329 			    sizeof (zio_cksum_t));
330 	} else {
331 		byteswap = BP_SHOULD_BYTESWAP(bp);
332 		expected_cksum = bp->blk_cksum;
333 		ci->ci_func[byteswap](data, size, ctx, &actual_cksum);
334 	}
335 
336 	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) {
337 		/*printf("ZFS: read checksum %s failed\n", ci->ci_name);*/
338 		return (EIO);
339 	}
340 
341 	return (0);
342 }
343 
344 static int
zio_decompress_data(int cpfunc,void * src,uint64_t srcsize,void * dest,uint64_t destsize)345 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
346 	void *dest, uint64_t destsize)
347 {
348 	zio_compress_info_t *ci;
349 
350 	if (cpfunc >= ZIO_COMPRESS_FUNCTIONS) {
351 		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
352 		return (EIO);
353 	}
354 
355 	ci = &zio_compress_table[cpfunc];
356 	if (!ci->ci_decompress) {
357 		printf("ZFS: unsupported compression algorithm %s\n",
358 		    ci->ci_name);
359 		return (EIO);
360 	}
361 
362 	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
363 }
364 
365 static uint64_t
zap_hash(uint64_t salt,const char * name)366 zap_hash(uint64_t salt, const char *name)
367 {
368 	const uint8_t *cp;
369 	uint8_t c;
370 	uint64_t crc = salt;
371 
372 	ASSERT(crc != 0);
373 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
374 	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
375 		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
376 
377 	/*
378 	 * Only use 28 bits, since we need 4 bits in the cookie for the
379 	 * collision differentiator.  We MUST use the high bits, since
380 	 * those are the onces that we first pay attention to when
381 	 * chosing the bucket.
382 	 */
383 	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
384 
385 	return (crc);
386 }
387 
388 typedef struct raidz_col {
389 	uint64_t rc_devidx;		/* child device index for I/O */
390 	uint64_t rc_offset;		/* device offset */
391 	uint64_t rc_size;		/* I/O size */
392 	void *rc_data;			/* I/O data */
393 	int rc_error;			/* I/O error for this device */
394 	uint8_t rc_tried;		/* Did we attempt this I/O column? */
395 	uint8_t rc_skipped;		/* Did we skip this I/O column? */
396 } raidz_col_t;
397 
398 typedef struct raidz_map {
399 	uint64_t rm_cols;		/* Regular column count */
400 	uint64_t rm_scols;		/* Count including skipped columns */
401 	uint64_t rm_bigcols;		/* Number of oversized columns */
402 	uint64_t rm_asize;		/* Actual total I/O size */
403 	uint64_t rm_missingdata;	/* Count of missing data devices */
404 	uint64_t rm_missingparity;	/* Count of missing parity devices */
405 	uint64_t rm_firstdatacol;	/* First data column/parity count */
406 	uint64_t rm_nskip;		/* Skipped sectors for padding */
407 	uint64_t rm_skipstart;		/* Column index of padding start */
408 	uintptr_t rm_reports;		/* # of referencing checksum reports */
409 	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
410 	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
411 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
412 } raidz_map_t;
413 
414 #define	VDEV_RAIDZ_P		0
415 #define	VDEV_RAIDZ_Q		1
416 #define	VDEV_RAIDZ_R		2
417 
418 #define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
419 #define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
420 
421 /*
422  * We provide a mechanism to perform the field multiplication operation on a
423  * 64-bit value all at once rather than a byte at a time. This works by
424  * creating a mask from the top bit in each byte and using that to
425  * conditionally apply the XOR of 0x1d.
426  */
427 #define	VDEV_RAIDZ_64MUL_2(x, mask) \
428 { \
429 	(mask) = (x) & 0x8080808080808080ULL; \
430 	(mask) = ((mask) << 1) - ((mask) >> 7); \
431 	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
432 	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
433 }
434 
435 #define	VDEV_RAIDZ_64MUL_4(x, mask) \
436 { \
437 	VDEV_RAIDZ_64MUL_2((x), mask); \
438 	VDEV_RAIDZ_64MUL_2((x), mask); \
439 }
440 
441 /*
442  * These two tables represent powers and logs of 2 in the Galois field defined
443  * above. These values were computed by repeatedly multiplying by 2 as above.
444  */
445 static const uint8_t vdev_raidz_pow2[256] = {
446 	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
447 	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
448 	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
449 	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
450 	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
451 	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
452 	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
453 	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
454 	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
455 	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
456 	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
457 	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
458 	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
459 	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
460 	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
461 	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
462 	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
463 	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
464 	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
465 	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
466 	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
467 	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
468 	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
469 	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
470 	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
471 	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
472 	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
473 	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
474 	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
475 	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
476 	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
477 	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
478 };
479 static const uint8_t vdev_raidz_log2[256] = {
480 	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
481 	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
482 	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
483 	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
484 	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
485 	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
486 	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
487 	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
488 	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
489 	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
490 	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
491 	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
492 	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
493 	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
494 	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
495 	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
496 	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
497 	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
498 	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
499 	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
500 	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
501 	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
502 	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
503 	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
504 	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
505 	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
506 	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
507 	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
508 	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
509 	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
510 	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
511 	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
512 };
513 
514 /*
515  * Multiply a given number by 2 raised to the given power.
516  */
517 static uint8_t
vdev_raidz_exp2(uint8_t a,int exp)518 vdev_raidz_exp2(uint8_t a, int exp)
519 {
520 	if (a == 0)
521 		return (0);
522 
523 	ASSERT(exp >= 0);
524 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
525 
526 	exp += vdev_raidz_log2[a];
527 	if (exp > 255)
528 		exp -= 255;
529 
530 	return (vdev_raidz_pow2[exp]);
531 }
532 
533 static void
vdev_raidz_generate_parity_p(raidz_map_t * rm)534 vdev_raidz_generate_parity_p(raidz_map_t *rm)
535 {
536 	uint64_t *p, *src, ccount, i;
537 	uint64_t pcount __unused;
538 	int c;
539 
540 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
541 
542 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
543 		src = rm->rm_col[c].rc_data;
544 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
545 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
546 
547 		if (c == rm->rm_firstdatacol) {
548 			ASSERT(ccount == pcount);
549 			for (i = 0; i < ccount; i++, src++, p++) {
550 				*p = *src;
551 			}
552 		} else {
553 			ASSERT(ccount <= pcount);
554 			for (i = 0; i < ccount; i++, src++, p++) {
555 				*p ^= *src;
556 			}
557 		}
558 	}
559 }
560 
561 static void
vdev_raidz_generate_parity_pq(raidz_map_t * rm)562 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
563 {
564 	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
565 	int c;
566 
567 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
568 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
569 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
570 
571 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
572 		src = rm->rm_col[c].rc_data;
573 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
574 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
575 
576 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
577 
578 		if (c == rm->rm_firstdatacol) {
579 			ASSERT(ccnt == pcnt || ccnt == 0);
580 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
581 				*p = *src;
582 				*q = *src;
583 			}
584 			for (; i < pcnt; i++, src++, p++, q++) {
585 				*p = 0;
586 				*q = 0;
587 			}
588 		} else {
589 			ASSERT(ccnt <= pcnt);
590 
591 			/*
592 			 * Apply the algorithm described above by multiplying
593 			 * the previous result and adding in the new value.
594 			 */
595 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
596 				*p ^= *src;
597 
598 				VDEV_RAIDZ_64MUL_2(*q, mask);
599 				*q ^= *src;
600 			}
601 
602 			/*
603 			 * Treat short columns as though they are full of 0s.
604 			 * Note that there's therefore nothing needed for P.
605 			 */
606 			for (; i < pcnt; i++, q++) {
607 				VDEV_RAIDZ_64MUL_2(*q, mask);
608 			}
609 		}
610 	}
611 }
612 
613 static void
vdev_raidz_generate_parity_pqr(raidz_map_t * rm)614 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
615 {
616 	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
617 	int c;
618 
619 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
620 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
621 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
622 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
623 	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
624 
625 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
626 		src = rm->rm_col[c].rc_data;
627 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
628 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
629 		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
630 
631 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
632 
633 		if (c == rm->rm_firstdatacol) {
634 			ASSERT(ccnt == pcnt || ccnt == 0);
635 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
636 				*p = *src;
637 				*q = *src;
638 				*r = *src;
639 			}
640 			for (; i < pcnt; i++, src++, p++, q++, r++) {
641 				*p = 0;
642 				*q = 0;
643 				*r = 0;
644 			}
645 		} else {
646 			ASSERT(ccnt <= pcnt);
647 
648 			/*
649 			 * Apply the algorithm described above by multiplying
650 			 * the previous result and adding in the new value.
651 			 */
652 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
653 				*p ^= *src;
654 
655 				VDEV_RAIDZ_64MUL_2(*q, mask);
656 				*q ^= *src;
657 
658 				VDEV_RAIDZ_64MUL_4(*r, mask);
659 				*r ^= *src;
660 			}
661 
662 			/*
663 			 * Treat short columns as though they are full of 0s.
664 			 * Note that there's therefore nothing needed for P.
665 			 */
666 			for (; i < pcnt; i++, q++, r++) {
667 				VDEV_RAIDZ_64MUL_2(*q, mask);
668 				VDEV_RAIDZ_64MUL_4(*r, mask);
669 			}
670 		}
671 	}
672 }
673 
674 /*
675  * Generate RAID parity in the first virtual columns according to the number of
676  * parity columns available.
677  */
678 static void
vdev_raidz_generate_parity(raidz_map_t * rm)679 vdev_raidz_generate_parity(raidz_map_t *rm)
680 {
681 	switch (rm->rm_firstdatacol) {
682 	case 1:
683 		vdev_raidz_generate_parity_p(rm);
684 		break;
685 	case 2:
686 		vdev_raidz_generate_parity_pq(rm);
687 		break;
688 	case 3:
689 		vdev_raidz_generate_parity_pqr(rm);
690 		break;
691 	default:
692 		panic("invalid RAID-Z configuration");
693 	}
694 }
695 
696 /* BEGIN CSTYLED */
697 /*
698  * In the general case of reconstruction, we must solve the system of linear
699  * equations defined by the coeffecients used to generate parity as well as
700  * the contents of the data and parity disks. This can be expressed with
701  * vectors for the original data (D) and the actual data (d) and parity (p)
702  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
703  *
704  *            __   __                     __     __
705  *            |     |         __     __   |  p_0  |
706  *            |  V  |         |  D_0  |   | p_m-1 |
707  *            |     |    x    |   :   | = |  d_0  |
708  *            |  I  |         | D_n-1 |   |   :   |
709  *            |     |         ~~     ~~   | d_n-1 |
710  *            ~~   ~~                     ~~     ~~
711  *
712  * I is simply a square identity matrix of size n, and V is a vandermonde
713  * matrix defined by the coeffecients we chose for the various parity columns
714  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
715  * computation as well as linear separability.
716  *
717  *      __               __               __     __
718  *      |   1   ..  1 1 1 |               |  p_0  |
719  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
720  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
721  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
722  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
723  *      |   :       : : : |   |   :   |   |  d_2  |
724  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
725  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
726  *      |   0   ..  0 0 1 |               | d_n-1 |
727  *      ~~               ~~               ~~     ~~
728  *
729  * Note that I, V, d, and p are known. To compute D, we must invert the
730  * matrix and use the known data and parity values to reconstruct the unknown
731  * data values. We begin by removing the rows in V|I and d|p that correspond
732  * to failed or missing columns; we then make V|I square (n x n) and d|p
733  * sized n by removing rows corresponding to unused parity from the bottom up
734  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
735  * using Gauss-Jordan elimination. In the example below we use m=3 parity
736  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
737  *           __                               __
738  *           |  1   1   1   1   1   1   1   1  |
739  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
740  *           |  19 205 116  29  64  16  4   1  |      / /
741  *           |  1   0   0   0   0   0   0   0  |     / /
742  *           |  0   1   0   0   0   0   0   0  | <--' /
743  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
744  *           |  0   0   0   1   0   0   0   0  |
745  *           |  0   0   0   0   1   0   0   0  |
746  *           |  0   0   0   0   0   1   0   0  |
747  *           |  0   0   0   0   0   0   1   0  |
748  *           |  0   0   0   0   0   0   0   1  |
749  *           ~~                               ~~
750  *           __                               __
751  *           |  1   1   1   1   1   1   1   1  |
752  *           | 128  64  32  16  8   4   2   1  |
753  *           |  19 205 116  29  64  16  4   1  |
754  *           |  1   0   0   0   0   0   0   0  |
755  *           |  0   1   0   0   0   0   0   0  |
756  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
757  *           |  0   0   0   1   0   0   0   0  |
758  *           |  0   0   0   0   1   0   0   0  |
759  *           |  0   0   0   0   0   1   0   0  |
760  *           |  0   0   0   0   0   0   1   0  |
761  *           |  0   0   0   0   0   0   0   1  |
762  *           ~~                               ~~
763  *
764  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
765  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
766  * matrix is not singular.
767  * __                                                                 __
768  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
769  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
770  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
771  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
772  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
773  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
774  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
775  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
776  * ~~                                                                 ~~
777  * __                                                                 __
778  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
779  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
780  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
781  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
782  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
783  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
784  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
785  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
786  * ~~                                                                 ~~
787  * __                                                                 __
788  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
789  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
790  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
791  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
792  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
793  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
794  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
795  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
796  * ~~                                                                 ~~
797  * __                                                                 __
798  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
799  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
800  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
801  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
802  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
803  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
804  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
805  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
806  * ~~                                                                 ~~
807  * __                                                                 __
808  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
809  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
810  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
811  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
812  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
813  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
814  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
815  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
816  * ~~                                                                 ~~
817  * __                                                                 __
818  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
819  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
820  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
821  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
822  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
823  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
824  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
825  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
826  * ~~                                                                 ~~
827  *                   __                               __
828  *                   |  0   0   1   0   0   0   0   0  |
829  *                   | 167 100  5   41 159 169 217 208 |
830  *                   | 166 100  4   40 158 168 216 209 |
831  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
832  *                   |  0   0   0   0   1   0   0   0  |
833  *                   |  0   0   0   0   0   1   0   0  |
834  *                   |  0   0   0   0   0   0   1   0  |
835  *                   |  0   0   0   0   0   0   0   1  |
836  *                   ~~                               ~~
837  *
838  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
839  * of the missing data.
840  *
841  * As is apparent from the example above, the only non-trivial rows in the
842  * inverse matrix correspond to the data disks that we're trying to
843  * reconstruct. Indeed, those are the only rows we need as the others would
844  * only be useful for reconstructing data known or assumed to be valid. For
845  * that reason, we only build the coefficients in the rows that correspond to
846  * targeted columns.
847  */
848 /* END CSTYLED */
849 
850 static void
vdev_raidz_matrix_init(raidz_map_t * rm,int n,int nmap,int * map,uint8_t ** rows)851 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
852     uint8_t **rows)
853 {
854 	int i, j;
855 	int pow;
856 
857 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
858 
859 	/*
860 	 * Fill in the missing rows of interest.
861 	 */
862 	for (i = 0; i < nmap; i++) {
863 		ASSERT3S(0, <=, map[i]);
864 		ASSERT3S(map[i], <=, 2);
865 
866 		pow = map[i] * n;
867 		if (pow > 255)
868 			pow -= 255;
869 		ASSERT(pow <= 255);
870 
871 		for (j = 0; j < n; j++) {
872 			pow -= map[i];
873 			if (pow < 0)
874 				pow += 255;
875 			rows[i][j] = vdev_raidz_pow2[pow];
876 		}
877 	}
878 }
879 
880 static void
vdev_raidz_matrix_invert(raidz_map_t * rm,int n,int nmissing,int * missing,uint8_t ** rows,uint8_t ** invrows,const uint8_t * used)881 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
882     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
883 {
884 	int i, j, ii, jj;
885 	uint8_t log;
886 
887 	/*
888 	 * Assert that the first nmissing entries from the array of used
889 	 * columns correspond to parity columns and that subsequent entries
890 	 * correspond to data columns.
891 	 */
892 	for (i = 0; i < nmissing; i++) {
893 		ASSERT3S(used[i], <, rm->rm_firstdatacol);
894 	}
895 	for (; i < n; i++) {
896 		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
897 	}
898 
899 	/*
900 	 * First initialize the storage where we'll compute the inverse rows.
901 	 */
902 	for (i = 0; i < nmissing; i++) {
903 		for (j = 0; j < n; j++) {
904 			invrows[i][j] = (i == j) ? 1 : 0;
905 		}
906 	}
907 
908 	/*
909 	 * Subtract all trivial rows from the rows of consequence.
910 	 */
911 	for (i = 0; i < nmissing; i++) {
912 		for (j = nmissing; j < n; j++) {
913 			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
914 			jj = used[j] - rm->rm_firstdatacol;
915 			ASSERT3S(jj, <, n);
916 			invrows[i][j] = rows[i][jj];
917 			rows[i][jj] = 0;
918 		}
919 	}
920 
921 	/*
922 	 * For each of the rows of interest, we must normalize it and subtract
923 	 * a multiple of it from the other rows.
924 	 */
925 	for (i = 0; i < nmissing; i++) {
926 		for (j = 0; j < missing[i]; j++) {
927 			ASSERT3U(rows[i][j], ==, 0);
928 		}
929 		ASSERT3U(rows[i][missing[i]], !=, 0);
930 
931 		/*
932 		 * Compute the inverse of the first element and multiply each
933 		 * element in the row by that value.
934 		 */
935 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
936 
937 		for (j = 0; j < n; j++) {
938 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
939 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
940 		}
941 
942 		for (ii = 0; ii < nmissing; ii++) {
943 			if (i == ii)
944 				continue;
945 
946 			ASSERT3U(rows[ii][missing[i]], !=, 0);
947 
948 			log = vdev_raidz_log2[rows[ii][missing[i]]];
949 
950 			for (j = 0; j < n; j++) {
951 				rows[ii][j] ^=
952 				    vdev_raidz_exp2(rows[i][j], log);
953 				invrows[ii][j] ^=
954 				    vdev_raidz_exp2(invrows[i][j], log);
955 			}
956 		}
957 	}
958 
959 	/*
960 	 * Verify that the data that is left in the rows are properly part of
961 	 * an identity matrix.
962 	 */
963 	for (i = 0; i < nmissing; i++) {
964 		for (j = 0; j < n; j++) {
965 			if (j == missing[i]) {
966 				ASSERT3U(rows[i][j], ==, 1);
967 			} else {
968 				ASSERT3U(rows[i][j], ==, 0);
969 			}
970 		}
971 	}
972 }
973 
974 static void
vdev_raidz_matrix_reconstruct(raidz_map_t * rm,int n,int nmissing,int * missing,uint8_t ** invrows,const uint8_t * used)975 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
976     int *missing, uint8_t **invrows, const uint8_t *used)
977 {
978 	int i, j, x, cc, c;
979 	uint8_t *src;
980 	uint64_t ccount;
981 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
982 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
983 	uint8_t log, val;
984 	int ll;
985 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
986 	uint8_t *p, *pp;
987 	size_t psize;
988 
989 	log = 0;	/* gcc */
990 	psize = sizeof (invlog[0][0]) * n * nmissing;
991 	p = malloc(psize);
992 	if (p == NULL) {
993 		printf("Out of memory\n");
994 		return;
995 	}
996 
997 	for (pp = p, i = 0; i < nmissing; i++) {
998 		invlog[i] = pp;
999 		pp += n;
1000 	}
1001 
1002 	for (i = 0; i < nmissing; i++) {
1003 		for (j = 0; j < n; j++) {
1004 			ASSERT3U(invrows[i][j], !=, 0);
1005 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1006 		}
1007 	}
1008 
1009 	for (i = 0; i < n; i++) {
1010 		c = used[i];
1011 		ASSERT3U(c, <, rm->rm_cols);
1012 
1013 		src = rm->rm_col[c].rc_data;
1014 		ccount = rm->rm_col[c].rc_size;
1015 		for (j = 0; j < nmissing; j++) {
1016 			cc = missing[j] + rm->rm_firstdatacol;
1017 			ASSERT3U(cc, >=, rm->rm_firstdatacol);
1018 			ASSERT3U(cc, <, rm->rm_cols);
1019 			ASSERT3U(cc, !=, c);
1020 
1021 			dst[j] = rm->rm_col[cc].rc_data;
1022 			dcount[j] = rm->rm_col[cc].rc_size;
1023 		}
1024 
1025 		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1026 
1027 		for (x = 0; x < ccount; x++, src++) {
1028 			if (*src != 0)
1029 				log = vdev_raidz_log2[*src];
1030 
1031 			for (cc = 0; cc < nmissing; cc++) {
1032 				if (x >= dcount[cc])
1033 					continue;
1034 
1035 				if (*src == 0) {
1036 					val = 0;
1037 				} else {
1038 					if ((ll = log + invlog[cc][i]) >= 255)
1039 						ll -= 255;
1040 					val = vdev_raidz_pow2[ll];
1041 				}
1042 
1043 				if (i == 0)
1044 					dst[cc][x] = val;
1045 				else
1046 					dst[cc][x] ^= val;
1047 			}
1048 		}
1049 	}
1050 
1051 	free(p);
1052 }
1053 
1054 static int
vdev_raidz_reconstruct_general(raidz_map_t * rm,int * tgts,int ntgts)1055 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1056 {
1057 	int n, i, c, t, tt;
1058 	int nmissing_rows;
1059 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1060 	int parity_map[VDEV_RAIDZ_MAXPARITY];
1061 
1062 	uint8_t *p, *pp;
1063 	size_t psize;
1064 
1065 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1066 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1067 	uint8_t *used;
1068 
1069 	int code = 0;
1070 
1071 
1072 	n = rm->rm_cols - rm->rm_firstdatacol;
1073 
1074 	/*
1075 	 * Figure out which data columns are missing.
1076 	 */
1077 	nmissing_rows = 0;
1078 	for (t = 0; t < ntgts; t++) {
1079 		if (tgts[t] >= rm->rm_firstdatacol) {
1080 			missing_rows[nmissing_rows++] =
1081 			    tgts[t] - rm->rm_firstdatacol;
1082 		}
1083 	}
1084 
1085 	/*
1086 	 * Figure out which parity columns to use to help generate the missing
1087 	 * data columns.
1088 	 */
1089 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1090 		ASSERT(tt < ntgts);
1091 		ASSERT(c < rm->rm_firstdatacol);
1092 
1093 		/*
1094 		 * Skip any targeted parity columns.
1095 		 */
1096 		if (c == tgts[tt]) {
1097 			tt++;
1098 			continue;
1099 		}
1100 
1101 		code |= 1 << c;
1102 
1103 		parity_map[i] = c;
1104 		i++;
1105 	}
1106 
1107 	ASSERT(code != 0);
1108 	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1109 
1110 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1111 	    nmissing_rows * n + sizeof (used[0]) * n;
1112 	p = malloc(psize);
1113 	if (p == NULL) {
1114 		printf("Out of memory\n");
1115 		return (code);
1116 	}
1117 
1118 	for (pp = p, i = 0; i < nmissing_rows; i++) {
1119 		rows[i] = pp;
1120 		pp += n;
1121 		invrows[i] = pp;
1122 		pp += n;
1123 	}
1124 	used = pp;
1125 
1126 	for (i = 0; i < nmissing_rows; i++) {
1127 		used[i] = parity_map[i];
1128 	}
1129 
1130 	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1131 		if (tt < nmissing_rows &&
1132 		    c == missing_rows[tt] + rm->rm_firstdatacol) {
1133 			tt++;
1134 			continue;
1135 		}
1136 
1137 		ASSERT3S(i, <, n);
1138 		used[i] = c;
1139 		i++;
1140 	}
1141 
1142 	/*
1143 	 * Initialize the interesting rows of the matrix.
1144 	 */
1145 	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1146 
1147 	/*
1148 	 * Invert the matrix.
1149 	 */
1150 	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1151 	    invrows, used);
1152 
1153 	/*
1154 	 * Reconstruct the missing data using the generated matrix.
1155 	 */
1156 	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1157 	    invrows, used);
1158 
1159 	free(p);
1160 
1161 	return (code);
1162 }
1163 
1164 static int
vdev_raidz_reconstruct(raidz_map_t * rm,int * t,int nt)1165 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1166 {
1167 	int tgts[VDEV_RAIDZ_MAXPARITY];
1168 	int ntgts;
1169 	int i, c;
1170 	int code;
1171 	int nbadparity, nbaddata;
1172 
1173 	/*
1174 	 * The tgts list must already be sorted.
1175 	 */
1176 	for (i = 1; i < nt; i++) {
1177 		ASSERT(t[i] > t[i - 1]);
1178 	}
1179 
1180 	nbadparity = rm->rm_firstdatacol;
1181 	nbaddata = rm->rm_cols - nbadparity;
1182 	ntgts = 0;
1183 	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1184 		if (i < nt && c == t[i]) {
1185 			tgts[ntgts++] = c;
1186 			i++;
1187 		} else if (rm->rm_col[c].rc_error != 0) {
1188 			tgts[ntgts++] = c;
1189 		} else if (c >= rm->rm_firstdatacol) {
1190 			nbaddata--;
1191 		} else {
1192 			nbadparity--;
1193 		}
1194 	}
1195 
1196 	ASSERT(ntgts >= nt);
1197 	ASSERT(nbaddata >= 0);
1198 	ASSERT(nbaddata + nbadparity == ntgts);
1199 
1200 	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1201 	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1202 	ASSERT(code > 0);
1203 	return (code);
1204 }
1205 
1206 static raidz_map_t *
vdev_raidz_map_alloc(void * data,off_t offset,size_t size,uint64_t unit_shift,uint64_t dcols,uint64_t nparity)1207 vdev_raidz_map_alloc(void *data, off_t offset, size_t size, uint64_t unit_shift,
1208     uint64_t dcols, uint64_t nparity)
1209 {
1210 	raidz_map_t *rm;
1211 	uint64_t b = offset >> unit_shift;
1212 	uint64_t s = size >> unit_shift;
1213 	uint64_t f = b % dcols;
1214 	uint64_t o = (b / dcols) << unit_shift;
1215 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
1216 
1217 	q = s / (dcols - nparity);
1218 	r = s - q * (dcols - nparity);
1219 	bc = (r == 0 ? 0 : r + nparity);
1220 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
1221 
1222 	if (q == 0) {
1223 		acols = bc;
1224 		scols = MIN(dcols, roundup(bc, nparity + 1));
1225 	} else {
1226 		acols = dcols;
1227 		scols = dcols;
1228 	}
1229 
1230 	ASSERT3U(acols, <=, scols);
1231 
1232 	rm = malloc(offsetof(raidz_map_t, rm_col[scols]));
1233 	if (rm == NULL)
1234 		return (rm);
1235 
1236 	rm->rm_cols = acols;
1237 	rm->rm_scols = scols;
1238 	rm->rm_bigcols = bc;
1239 	rm->rm_skipstart = bc;
1240 	rm->rm_missingdata = 0;
1241 	rm->rm_missingparity = 0;
1242 	rm->rm_firstdatacol = nparity;
1243 	rm->rm_reports = 0;
1244 	rm->rm_freed = 0;
1245 	rm->rm_ecksuminjected = 0;
1246 
1247 	asize = 0;
1248 
1249 	for (c = 0; c < scols; c++) {
1250 		col = f + c;
1251 		coff = o;
1252 		if (col >= dcols) {
1253 			col -= dcols;
1254 			coff += 1ULL << unit_shift;
1255 		}
1256 		rm->rm_col[c].rc_devidx = col;
1257 		rm->rm_col[c].rc_offset = coff;
1258 		rm->rm_col[c].rc_data = NULL;
1259 		rm->rm_col[c].rc_error = 0;
1260 		rm->rm_col[c].rc_tried = 0;
1261 		rm->rm_col[c].rc_skipped = 0;
1262 
1263 		if (c >= acols)
1264 			rm->rm_col[c].rc_size = 0;
1265 		else if (c < bc)
1266 			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
1267 		else
1268 			rm->rm_col[c].rc_size = q << unit_shift;
1269 
1270 		asize += rm->rm_col[c].rc_size;
1271 	}
1272 
1273 	ASSERT3U(asize, ==, tot << unit_shift);
1274 	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
1275 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
1276 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
1277 	ASSERT3U(rm->rm_nskip, <=, nparity);
1278 
1279 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1280 		rm->rm_col[c].rc_data = malloc(rm->rm_col[c].rc_size);
1281 		if (rm->rm_col[c].rc_data == NULL) {
1282 			c++;
1283 			while (c != 0)
1284 				free(rm->rm_col[--c].rc_data);
1285 			free(rm);
1286 			return (NULL);
1287 		}
1288 	}
1289 
1290 	rm->rm_col[c].rc_data = data;
1291 
1292 	for (c = c + 1; c < acols; c++)
1293 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
1294 		    rm->rm_col[c - 1].rc_size;
1295 
1296 	/*
1297 	 * If all data stored spans all columns, there's a danger that parity
1298 	 * will always be on the same device and, since parity isn't read
1299 	 * during normal operation, that that device's I/O bandwidth won't be
1300 	 * used effectively. We therefore switch the parity every 1MB.
1301 	 *
1302 	 * ... at least that was, ostensibly, the theory. As a practical
1303 	 * matter unless we juggle the parity between all devices evenly, we
1304 	 * won't see any benefit. Further, occasional writes that aren't a
1305 	 * multiple of the LCM of the number of children and the minimum
1306 	 * stripe width are sufficient to avoid pessimal behavior.
1307 	 * Unfortunately, this decision created an implicit on-disk format
1308 	 * requirement that we need to support for all eternity, but only
1309 	 * for single-parity RAID-Z.
1310 	 *
1311 	 * If we intend to skip a sector in the zeroth column for padding
1312 	 * we must make sure to note this swap. We will never intend to
1313 	 * skip the first column since at least one data and one parity
1314 	 * column must appear in each row.
1315 	 */
1316 	ASSERT(rm->rm_cols >= 2);
1317 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
1318 
1319 	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
1320 		devidx = rm->rm_col[0].rc_devidx;
1321 		o = rm->rm_col[0].rc_offset;
1322 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
1323 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
1324 		rm->rm_col[1].rc_devidx = devidx;
1325 		rm->rm_col[1].rc_offset = o;
1326 
1327 		if (rm->rm_skipstart == 0)
1328 			rm->rm_skipstart = 1;
1329 	}
1330 
1331 	return (rm);
1332 }
1333 
1334 static void
vdev_raidz_map_free(raidz_map_t * rm)1335 vdev_raidz_map_free(raidz_map_t *rm)
1336 {
1337 	int c;
1338 
1339 	for (c = rm->rm_firstdatacol - 1; c >= 0; c--)
1340 		free(rm->rm_col[c].rc_data);
1341 
1342 	free(rm);
1343 }
1344 
1345 static vdev_t *
vdev_child(vdev_t * pvd,uint64_t devidx)1346 vdev_child(vdev_t *pvd, uint64_t devidx)
1347 {
1348 	vdev_t *cvd;
1349 
1350 	STAILQ_FOREACH(cvd, &pvd->v_children, v_childlink) {
1351 		if (cvd->v_id == devidx)
1352 			break;
1353 	}
1354 
1355 	return (cvd);
1356 }
1357 
1358 /*
1359  * We keep track of whether or not there were any injected errors, so that
1360  * any ereports we generate can note it.
1361  */
1362 static int
raidz_checksum_verify(const spa_t * spa,const blkptr_t * bp,void * data,uint64_t size)1363 raidz_checksum_verify(const spa_t *spa, const blkptr_t *bp, void *data,
1364     uint64_t size)
1365 {
1366 	return (zio_checksum_verify(spa, bp, data));
1367 }
1368 
1369 /*
1370  * Generate the parity from the data columns. If we tried and were able to
1371  * read the parity without error, verify that the generated parity matches the
1372  * data we read. If it doesn't, we fire off a checksum error. Return the
1373  * number such failures.
1374  */
1375 static int
raidz_parity_verify(raidz_map_t * rm)1376 raidz_parity_verify(raidz_map_t *rm)
1377 {
1378 	void *orig[VDEV_RAIDZ_MAXPARITY];
1379 	int c, ret = 0;
1380 	raidz_col_t *rc;
1381 
1382 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1383 		rc = &rm->rm_col[c];
1384 		if (!rc->rc_tried || rc->rc_error != 0)
1385 			continue;
1386 		orig[c] = malloc(rc->rc_size);
1387 		if (orig[c] != NULL) {
1388 			bcopy(rc->rc_data, orig[c], rc->rc_size);
1389 		} else {
1390 			printf("Out of memory\n");
1391 		}
1392 	}
1393 
1394 	vdev_raidz_generate_parity(rm);
1395 
1396 	for (c = rm->rm_firstdatacol - 1; c >= 0; c--) {
1397 		rc = &rm->rm_col[c];
1398 		if (!rc->rc_tried || rc->rc_error != 0)
1399 			continue;
1400 		if (orig[c] == NULL ||
1401 		    bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1402 			rc->rc_error = ECKSUM;
1403 			ret++;
1404 		}
1405 		free(orig[c]);
1406 	}
1407 
1408 	return (ret);
1409 }
1410 
1411 /*
1412  * Iterate over all combinations of bad data and attempt a reconstruction.
1413  * Note that the algorithm below is non-optimal because it doesn't take into
1414  * account how reconstruction is actually performed. For example, with
1415  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1416  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1417  * cases we'd only use parity information in column 0.
1418  */
1419 static int
vdev_raidz_combrec(const spa_t * spa,raidz_map_t * rm,const blkptr_t * bp,void * data,off_t offset,uint64_t bytes,int total_errors,int data_errors)1420 vdev_raidz_combrec(const spa_t *spa, raidz_map_t *rm, const blkptr_t *bp,
1421     void *data, off_t offset, uint64_t bytes, int total_errors, int data_errors)
1422 {
1423 	raidz_col_t *rc;
1424 	void *orig[VDEV_RAIDZ_MAXPARITY];
1425 	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1426 	int *tgts = &tstore[1];
1427 	int current, next, i, c, n;
1428 	int code, ret = 0;
1429 
1430 	ASSERT(total_errors < rm->rm_firstdatacol);
1431 
1432 	/*
1433 	 * This simplifies one edge condition.
1434 	 */
1435 	tgts[-1] = -1;
1436 
1437 	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1438 		/*
1439 		 * Initialize the targets array by finding the first n columns
1440 		 * that contain no error.
1441 		 *
1442 		 * If there were no data errors, we need to ensure that we're
1443 		 * always explicitly attempting to reconstruct at least one
1444 		 * data column. To do this, we simply push the highest target
1445 		 * up into the data columns.
1446 		 */
1447 		for (c = 0, i = 0; i < n; i++) {
1448 			if (i == n - 1 && data_errors == 0 &&
1449 			    c < rm->rm_firstdatacol) {
1450 				c = rm->rm_firstdatacol;
1451 			}
1452 
1453 			while (rm->rm_col[c].rc_error != 0) {
1454 				c++;
1455 				ASSERT3S(c, <, rm->rm_cols);
1456 			}
1457 
1458 			tgts[i] = c++;
1459 		}
1460 
1461 		/*
1462 		 * Setting tgts[n] simplifies the other edge condition.
1463 		 */
1464 		tgts[n] = rm->rm_cols;
1465 
1466 		/*
1467 		 * These buffers were allocated in previous iterations.
1468 		 */
1469 		for (i = 0; i < n - 1; i++) {
1470 			ASSERT(orig[i] != NULL);
1471 		}
1472 
1473 		orig[n - 1] = malloc(rm->rm_col[0].rc_size);
1474 		if (orig[n - 1] == NULL) {
1475 			ret = ENOMEM;
1476 			goto done;
1477 		}
1478 
1479 		current = 0;
1480 		next = tgts[current];
1481 
1482 		while (current != n) {
1483 			tgts[current] = next;
1484 			current = 0;
1485 
1486 			/*
1487 			 * Save off the original data that we're going to
1488 			 * attempt to reconstruct.
1489 			 */
1490 			for (i = 0; i < n; i++) {
1491 				ASSERT(orig[i] != NULL);
1492 				c = tgts[i];
1493 				ASSERT3S(c, >=, 0);
1494 				ASSERT3S(c, <, rm->rm_cols);
1495 				rc = &rm->rm_col[c];
1496 				bcopy(rc->rc_data, orig[i], rc->rc_size);
1497 			}
1498 
1499 			/*
1500 			 * Attempt a reconstruction and exit the outer loop on
1501 			 * success.
1502 			 */
1503 			code = vdev_raidz_reconstruct(rm, tgts, n);
1504 			if (raidz_checksum_verify(spa, bp, data, bytes) == 0) {
1505 				for (i = 0; i < n; i++) {
1506 					c = tgts[i];
1507 					rc = &rm->rm_col[c];
1508 					ASSERT(rc->rc_error == 0);
1509 					rc->rc_error = ECKSUM;
1510 				}
1511 
1512 				ret = code;
1513 				goto done;
1514 			}
1515 
1516 			/*
1517 			 * Restore the original data.
1518 			 */
1519 			for (i = 0; i < n; i++) {
1520 				c = tgts[i];
1521 				rc = &rm->rm_col[c];
1522 				bcopy(orig[i], rc->rc_data, rc->rc_size);
1523 			}
1524 
1525 			do {
1526 				/*
1527 				 * Find the next valid column after the current
1528 				 * position..
1529 				 */
1530 				for (next = tgts[current] + 1;
1531 				    next < rm->rm_cols &&
1532 				    rm->rm_col[next].rc_error != 0; next++)
1533 					continue;
1534 
1535 				ASSERT(next <= tgts[current + 1]);
1536 
1537 				/*
1538 				 * If that spot is available, we're done here.
1539 				 */
1540 				if (next != tgts[current + 1])
1541 					break;
1542 
1543 				/*
1544 				 * Otherwise, find the next valid column after
1545 				 * the previous position.
1546 				 */
1547 				for (c = tgts[current - 1] + 1;
1548 				    rm->rm_col[c].rc_error != 0; c++)
1549 					continue;
1550 
1551 				tgts[current] = c;
1552 				current++;
1553 
1554 			} while (current != n);
1555 		}
1556 	}
1557 	n--;
1558 done:
1559 	for (i = n - 1; i >= 0; i--) {
1560 		free(orig[i]);
1561 	}
1562 
1563 	return (ret);
1564 }
1565 
1566 static int
vdev_raidz_read(vdev_t * vd,const blkptr_t * bp,void * data,off_t offset,size_t bytes)1567 vdev_raidz_read(vdev_t *vd, const blkptr_t *bp, void *data,
1568     off_t offset, size_t bytes)
1569 {
1570 	vdev_t *tvd = vd->v_top;
1571 	vdev_t *cvd;
1572 	raidz_map_t *rm;
1573 	raidz_col_t *rc;
1574 	int c, error;
1575 	int unexpected_errors __unused;
1576 	int parity_errors;
1577 	int parity_untried;
1578 	int data_errors;
1579 	int total_errors;
1580 	int n;
1581 	int tgts[VDEV_RAIDZ_MAXPARITY];
1582 	int code;
1583 
1584 	rc = NULL;	/* gcc */
1585 	error = 0;
1586 
1587 	rm = vdev_raidz_map_alloc(data, offset, bytes, tvd->v_ashift,
1588 	    vd->v_nchildren, vd->v_nparity);
1589 	if (rm == NULL)
1590 		return (ENOMEM);
1591 
1592 	/*
1593 	 * Iterate over the columns in reverse order so that we hit the parity
1594 	 * last -- any errors along the way will force us to read the parity.
1595 	 */
1596 	for (c = rm->rm_cols - 1; c >= 0; c--) {
1597 		rc = &rm->rm_col[c];
1598 		cvd = vdev_child(vd, rc->rc_devidx);
1599 		if (cvd == NULL || cvd->v_state != VDEV_STATE_HEALTHY) {
1600 			if (c >= rm->rm_firstdatacol)
1601 				rm->rm_missingdata++;
1602 			else
1603 				rm->rm_missingparity++;
1604 			rc->rc_error = ENXIO;
1605 			rc->rc_tried = 1;	/* don't even try */
1606 			rc->rc_skipped = 1;
1607 			continue;
1608 		}
1609 #if 0		/* XXX: Too hard for the boot code. */
1610 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1611 			if (c >= rm->rm_firstdatacol)
1612 				rm->rm_missingdata++;
1613 			else
1614 				rm->rm_missingparity++;
1615 			rc->rc_error = ESTALE;
1616 			rc->rc_skipped = 1;
1617 			continue;
1618 		}
1619 #endif
1620 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0) {
1621 			rc->rc_error = cvd->v_read(cvd, NULL, rc->rc_data,
1622 			    rc->rc_offset, rc->rc_size);
1623 			rc->rc_tried = 1;
1624 			rc->rc_skipped = 0;
1625 		}
1626 	}
1627 
1628 reconstruct:
1629 	unexpected_errors = 0;
1630 	parity_errors = 0;
1631 	parity_untried = 0;
1632 	data_errors = 0;
1633 	total_errors = 0;
1634 
1635 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1636 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1637 
1638 	for (c = 0; c < rm->rm_cols; c++) {
1639 		rc = &rm->rm_col[c];
1640 
1641 		if (rc->rc_error) {
1642 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
1643 
1644 			if (c < rm->rm_firstdatacol)
1645 				parity_errors++;
1646 			else
1647 				data_errors++;
1648 
1649 			if (!rc->rc_skipped)
1650 				unexpected_errors++;
1651 
1652 			total_errors++;
1653 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1654 			parity_untried++;
1655 		}
1656 	}
1657 
1658 	/*
1659 	 * There are three potential phases for a read:
1660 	 *	1. produce valid data from the columns read
1661 	 *	2. read all disks and try again
1662 	 *	3. perform combinatorial reconstruction
1663 	 *
1664 	 * Each phase is progressively both more expensive and less likely to
1665 	 * occur. If we encounter more errors than we can repair or all phases
1666 	 * fail, we have no choice but to return an error.
1667 	 */
1668 
1669 	/*
1670 	 * If the number of errors we saw was correctable -- less than or equal
1671 	 * to the number of parity disks read -- attempt to produce data that
1672 	 * has a valid checksum. Naturally, this case applies in the absence of
1673 	 * any errors.
1674 	 */
1675 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1676 		int rv;
1677 
1678 		if (data_errors == 0) {
1679 			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
1680 			if (rv == 0) {
1681 				/*
1682 				 * If we read parity information (unnecessarily
1683 				 * as it happens since no reconstruction was
1684 				 * needed) regenerate and verify the parity.
1685 				 * We also regenerate parity when resilvering
1686 				 * so we can write it out to the failed device
1687 				 * later.
1688 				 */
1689 				if (parity_errors + parity_untried <
1690 				    rm->rm_firstdatacol) {
1691 					n = raidz_parity_verify(rm);
1692 					unexpected_errors += n;
1693 					ASSERT(parity_errors + n <=
1694 					    rm->rm_firstdatacol);
1695 				}
1696 				goto done;
1697 			}
1698 		} else {
1699 			/*
1700 			 * We either attempt to read all the parity columns or
1701 			 * none of them. If we didn't try to read parity, we
1702 			 * wouldn't be here in the correctable case. There must
1703 			 * also have been fewer parity errors than parity
1704 			 * columns or, again, we wouldn't be in this code path.
1705 			 */
1706 			ASSERT(parity_untried == 0);
1707 			ASSERT(parity_errors < rm->rm_firstdatacol);
1708 
1709 			/*
1710 			 * Identify the data columns that reported an error.
1711 			 */
1712 			n = 0;
1713 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1714 				rc = &rm->rm_col[c];
1715 				if (rc->rc_error != 0) {
1716 					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1717 					tgts[n++] = c;
1718 				}
1719 			}
1720 
1721 			ASSERT(rm->rm_firstdatacol >= n);
1722 
1723 			code = vdev_raidz_reconstruct(rm, tgts, n);
1724 
1725 			rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes);
1726 			if (rv == 0) {
1727 				/*
1728 				 * If we read more parity disks than were used
1729 				 * for reconstruction, confirm that the other
1730 				 * parity disks produced correct data. This
1731 				 * routine is suboptimal in that it regenerates
1732 				 * the parity that we already used in addition
1733 				 * to the parity that we're attempting to
1734 				 * verify, but this should be a relatively
1735 				 * uncommon case, and can be optimized if it
1736 				 * becomes a problem. Note that we regenerate
1737 				 * parity when resilvering so we can write it
1738 				 * out to failed devices later.
1739 				 */
1740 				if (parity_errors < rm->rm_firstdatacol - n) {
1741 					n = raidz_parity_verify(rm);
1742 					unexpected_errors += n;
1743 					ASSERT(parity_errors + n <=
1744 					    rm->rm_firstdatacol);
1745 				}
1746 
1747 				goto done;
1748 			}
1749 		}
1750 	}
1751 
1752 	/*
1753 	 * This isn't a typical situation -- either we got a read
1754 	 * error or a child silently returned bad data. Read every
1755 	 * block so we can try again with as much data and parity as
1756 	 * we can track down. If we've already been through once
1757 	 * before, all children will be marked as tried so we'll
1758 	 * proceed to combinatorial reconstruction.
1759 	 */
1760 	unexpected_errors = 1;
1761 	rm->rm_missingdata = 0;
1762 	rm->rm_missingparity = 0;
1763 
1764 	n = 0;
1765 	for (c = 0; c < rm->rm_cols; c++) {
1766 		rc = &rm->rm_col[c];
1767 
1768 		if (rc->rc_tried)
1769 			continue;
1770 
1771 		cvd = vdev_child(vd, rc->rc_devidx);
1772 		ASSERT(cvd != NULL);
1773 		rc->rc_error = cvd->v_read(cvd, NULL,
1774 		    rc->rc_data, rc->rc_offset, rc->rc_size);
1775 		if (rc->rc_error == 0)
1776 			n++;
1777 		rc->rc_tried = 1;
1778 		rc->rc_skipped = 0;
1779 	}
1780 	/*
1781 	 * If we managed to read anything more, retry the
1782 	 * reconstruction.
1783 	 */
1784 	if (n > 0)
1785 		goto reconstruct;
1786 
1787 	/*
1788 	 * At this point we've attempted to reconstruct the data given the
1789 	 * errors we detected, and we've attempted to read all columns. There
1790 	 * must, therefore, be one or more additional problems -- silent errors
1791 	 * resulting in invalid data rather than explicit I/O errors resulting
1792 	 * in absent data. We check if there is enough additional data to
1793 	 * possibly reconstruct the data and then perform combinatorial
1794 	 * reconstruction over all possible combinations. If that fails,
1795 	 * we're cooked.
1796 	 */
1797 	if (total_errors > rm->rm_firstdatacol) {
1798 		error = EIO;
1799 	} else if (total_errors < rm->rm_firstdatacol &&
1800 	    (code = vdev_raidz_combrec(vd->v_spa, rm, bp, data, offset, bytes,
1801 	     total_errors, data_errors)) != 0) {
1802 		/*
1803 		 * If we didn't use all the available parity for the
1804 		 * combinatorial reconstruction, verify that the remaining
1805 		 * parity is correct.
1806 		 */
1807 		if (code != (1 << rm->rm_firstdatacol) - 1)
1808 			(void) raidz_parity_verify(rm);
1809 	} else {
1810 		/*
1811 		 * We're here because either:
1812 		 *
1813 		 *	total_errors == rm_first_datacol, or
1814 		 *	vdev_raidz_combrec() failed
1815 		 *
1816 		 * In either case, there is enough bad data to prevent
1817 		 * reconstruction.
1818 		 *
1819 		 * Start checksum ereports for all children which haven't
1820 		 * failed, and the IO wasn't speculative.
1821 		 */
1822 		error = ECKSUM;
1823 	}
1824 
1825 done:
1826 	vdev_raidz_map_free(rm);
1827 
1828 	return (error);
1829 }
1830