xref: /freebsd/sys/cddl/boot/zfs/zfssubr.c (revision eb6d21b4ca6d668cf89afd99eef7baeafa712197)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/cdefs.h>
27 __FBSDID("$FreeBSD$");
28 
29 static uint64_t zfs_crc64_table[256];
30 
31 static void
32 zfs_init_crc(void)
33 {
34 	int i, j;
35 	uint64_t *ct;
36 
37 	/*
38 	 * Calculate the crc64 table (used for the zap hash
39 	 * function).
40 	 */
41 	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
42 		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
43 		for (i = 0; i < 256; i++)
44 			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
45 				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
46 	}
47 }
48 
/*
 * Checksum vector for the "off" algorithm: stores an all-zero
 * checksum regardless of the data, so verification always succeeds.
 */
static void
zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}
54 
/*
 * Signature for checksum functions: checksum `size' bytes at `data'
 * into *zcp.
 */
typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);

/*
 * Information about each checksum function.  Only the native-endian
 * entry (ci_func[0]) is invoked by this boot code; a NULL there marks
 * the algorithm as unsupported.
 */
typedef struct zio_checksum_info {
	zio_checksum_t	*ci_func[2]; /* checksum function for each byteorder */
	int		ci_correctable;	/* number of correctable bits	*/
	int		ci_zbt;		/* uses zio block tail?	*/
	const char	*ci_name;	/* descriptive name */
} zio_checksum_info_t;
69 
70 #include "fletcher.c"
71 #include "sha256.c"
72 
/*
 * Checksum vectors, indexed by the on-disk ZIO_CHECKSUM_* value --
 * the entry order must match that enum.  Entries whose ci_func[0]
 * is NULL are rejected by zio_checksum_error().
 */
static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL,			NULL},			0, 0,	"inherit"},
	{{NULL,			NULL},			0, 0,	"on"},
	{{zio_checksum_off,	zio_checksum_off},	0, 0,	"off"},
	{{zio_checksum_SHA256,	NULL},			1, 1,	"label"},
	{{zio_checksum_SHA256,	NULL},			1, 1,	"gang_header"},
	{{fletcher_2_native,	NULL},			0, 1,	"zilog"},
	{{fletcher_2_native,	NULL},			0, 0,	"fletcher2"},
	{{fletcher_4_native,	NULL},			1, 0,	"fletcher4"},
	{{zio_checksum_SHA256,	NULL},			1, 0,	"SHA256"},
};
84 
/*
 * Common signature for all zio compress/decompress functions.
 * Both take source/destination buffers with their lengths plus a
 * final level argument; decompressors return an error code
 * (compressors presumably return the compressed size -- unused here).
 */
typedef size_t zio_compress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);
typedef int zio_decompress_func_t(void *src, void *dst,
    size_t s_len, size_t d_len, int);

/*
 * Information about each compression function.
 */
typedef struct zio_compress_info {
	zio_compress_func_t	*ci_compress;	/* compression function */
	zio_decompress_func_t	*ci_decompress;	/* decompression function */
	int			ci_level;	/* level parameter */
	const char		*ci_name;	/* algorithm name */
} zio_compress_info_t;
102 
103 #include "lzjb.c"
104 
/*
 * Compression vectors, indexed by the on-disk ZIO_COMPRESS_* value --
 * the entry order must match that enum.  Only lzjb decompression is
 * implemented by this boot code; entries with a NULL ci_decompress
 * are rejected by zio_decompress_data().  ci_level is only meaningful
 * for the (unimplemented) gzip entries.
 */
static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
	{NULL,			NULL,			0,	"inherit"},
	{NULL,			NULL,			0,	"on"},
	{NULL,			NULL,			0,	"uncompressed"},
	{NULL,			lzjb_decompress,	0,	"lzjb"},
	{NULL,			NULL,			0,	"empty"},
	{NULL,			NULL,			1,	"gzip-1"},
	{NULL,			NULL,			2,	"gzip-2"},
	{NULL,			NULL,			3,	"gzip-3"},
	{NULL,			NULL,			4,	"gzip-4"},
	{NULL,			NULL,			5,	"gzip-5"},
	{NULL,			NULL,			6,	"gzip-6"},
	{NULL,			NULL,			7,	"gzip-7"},
	{NULL,			NULL,			8,	"gzip-8"},
	{NULL,			NULL,			9,	"gzip-9"},
};
124 
125 static int
126 zio_checksum_error(const blkptr_t *bp, void *data)
127 {
128 	zio_cksum_t zc = bp->blk_cksum;
129 	unsigned int checksum = BP_GET_CHECKSUM(bp);
130 	uint64_t size = BP_GET_PSIZE(bp);
131 	zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1;
132 	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
133 	zio_cksum_t actual_cksum, expected_cksum;
134 
135 	if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
136 		return (EINVAL);
137 
138 	if (ci->ci_zbt) {
139 		expected_cksum = zbt->zbt_cksum;
140 		zbt->zbt_cksum = zc;
141 		ci->ci_func[0](data, size, &actual_cksum);
142 		zbt->zbt_cksum = expected_cksum;
143 		zc = expected_cksum;
144 	} else {
145 		/* ASSERT(!BP_IS_GANG(bp)); */
146 		ci->ci_func[0](data, size, &actual_cksum);
147 	}
148 
149 	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, zc)) {
150 		/*printf("ZFS: read checksum failed\n");*/
151 		return (EIO);
152 	}
153 
154 	return (0);
155 }
156 
157 static int
158 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
159 	void *dest, uint64_t destsize)
160 {
161 	zio_compress_info_t *ci = &zio_compress_table[cpfunc];
162 
163 	/* ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS); */
164 	if (!ci->ci_decompress) {
165 		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
166 		return (EIO);
167 	}
168 
169 	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
170 }
171 
172 static uint64_t
173 zap_hash(uint64_t salt, const char *name)
174 {
175 	const uint8_t *cp;
176 	uint8_t c;
177 	uint64_t crc = salt;
178 
179 	/*ASSERT(crc != 0);*/
180 	/*ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);*/
181 	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
182 		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
183 
184 	/*
185 	 * Only use 28 bits, since we need 4 bits in the cookie for the
186 	 * collision differentiator.  We MUST use the high bits, since
187 	 * those are the onces that we first pay attention to when
188 	 * chosing the bucket.
189 	 */
190 	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
191 
192 	return (crc);
193 }
194 
195 static char *zfs_alloc_temp(size_t sz);
196 
/*
 * Per-column state for one RAID-Z stripe I/O: one entry per child
 * device touched by the read, with the parity columns first.
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;
206 
207 #define	VDEV_RAIDZ_P		0
208 #define	VDEV_RAIDZ_Q		1
209 
210 static void
211 vdev_raidz_reconstruct_p(raidz_col_t *cols, int nparity, int acols, int x)
212 {
213 	uint64_t *dst, *src, xcount, ccount, count, i;
214 	int c;
215 
216 	xcount = cols[x].rc_size / sizeof (src[0]);
217 	//ASSERT(xcount <= cols[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
218 	//ASSERT(xcount > 0);
219 
220 	src = cols[VDEV_RAIDZ_P].rc_data;
221 	dst = cols[x].rc_data;
222 	for (i = 0; i < xcount; i++, dst++, src++) {
223 		*dst = *src;
224 	}
225 
226 	for (c = nparity; c < acols; c++) {
227 		src = cols[c].rc_data;
228 		dst = cols[x].rc_data;
229 
230 		if (c == x)
231 			continue;
232 
233 		ccount = cols[c].rc_size / sizeof (src[0]);
234 		count = MIN(ccount, xcount);
235 
236 		for (i = 0; i < count; i++, dst++, src++) {
237 			*dst ^= *src;
238 		}
239 	}
240 }
241 
/*
 * These two tables represent powers and logs of 2 in the Galois field
 * GF(2^8) used by RAID-Z (reduction constant 0x1d, as seen in the
 * parity loops below).  These values were computed by repeatedly
 * multiplying by 2 as above; do not edit by hand.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/* Base-2 logarithms in GF(2^8); inverse of vdev_raidz_pow2 (entry 0 is unused). */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
314 
315 /*
316  * Multiply a given number by 2 raised to the given power.
317  */
318 static uint8_t
319 vdev_raidz_exp2(uint8_t a, int exp)
320 {
321 	if (a == 0)
322 		return (0);
323 
324 	//ASSERT(exp >= 0);
325 	//ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
326 
327 	exp += vdev_raidz_log2[a];
328 	if (exp > 255)
329 		exp -= 255;
330 
331 	return (vdev_raidz_pow2[exp]);
332 }
333 
/*
 * Compute both the P (plain XOR) and Q (GF(2^8) multiply-accumulate)
 * parity columns over the data columns cols[nparity..acols-1].
 * P and Q live in cols[VDEV_RAIDZ_P]/cols[VDEV_RAIDZ_Q] and are
 * assumed to be the same size (pcount 64-bit words); shorter data
 * columns are treated as zero-padded.
 */
static void
vdev_raidz_generate_parity_pq(raidz_col_t *cols, int nparity, int acols)
{
	uint64_t *q, *p, *src, pcount, ccount, mask, i;
	int c;

	pcount = cols[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	//ASSERT(cols[VDEV_RAIDZ_P].rc_size == cols[VDEV_RAIDZ_Q].rc_size);

	for (c = nparity; c < acols; c++) {
		src = cols[c].rc_data;
		p = cols[VDEV_RAIDZ_P].rc_data;
		q = cols[VDEV_RAIDZ_Q].rc_data;
		ccount = cols[c].rc_size / sizeof (src[0]);

		if (c == nparity) {
			/* First data column: initialize P and Q with it. */
			//ASSERT(ccount == pcount || ccount == 0);
			for (i = 0; i < ccount; i++, p++, q++, src++) {
				*q = *src;
				*p = *src;
			}
			for (; i < pcount; i++, p++, q++, src++) {
				*q = 0;
				*p = 0;
			}
		} else {
			//ASSERT(ccount <= pcount);

			/*
			 * Q accumulates src after multiplying the running
			 * value by 2 in GF(2^8).  Rather than multiplying
			 * each byte individually (as described above), we
			 * are able to handle 8 at once by generating a
			 * mask based on the high bit in each byte and
			 * using that to conditionally XOR in 0x1d.
			 */
			for (i = 0; i < ccount; i++, p++, q++, src++) {
				mask = *q & 0x8080808080808080ULL;
				mask = (mask << 1) - (mask >> 7);
				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
				    (mask & 0x1d1d1d1d1d1d1d1dULL);
				*q ^= *src;
				*p ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * (Q must still be multiplied by 2; P is unchanged
			 * by XOR with zero.)
			 */
			for (; i < pcount; i++, q++) {
				mask = *q & 0x8080808080808080ULL;
				mask = (mask << 1) - (mask >> 7);
				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
				    (mask & 0x1d1d1d1d1d1d1d1dULL);
			}
		}
	}
}
390 
/*
 * Rebuild missing data column x from the Q parity alone: re-run the
 * Q accumulation with column x treated as all zeros, XOR in the
 * stored Q, and divide out column x's GF(2^8) coefficient
 * (a multiplication by the inverse power of 2).
 */
static void
vdev_raidz_reconstruct_q(raidz_col_t *cols, int nparity, int acols, int x)
{
	uint64_t *dst, *src, xcount, ccount, count, mask, i;
	uint8_t *b;
	int c, j, exp;

	xcount = cols[x].rc_size / sizeof (src[0]);
	//ASSERT(xcount <= cols[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

	/* Accumulate Q' over all data columns, with column x as zeros. */
	for (c = nparity; c < acols; c++) {
		src = cols[c].rc_data;
		dst = cols[x].rc_data;

		if (c == x)
			ccount = 0;
		else
			ccount = cols[c].rc_size / sizeof (src[0]);

		count = MIN(ccount, xcount);

		if (c == nparity) {
			/* First column initializes the accumulator. */
			for (i = 0; i < count; i++, dst++, src++) {
				*dst = *src;
			}
			for (; i < xcount; i++, dst++) {
				*dst = 0;
			}

		} else {
			/*
			 * For an explanation of this, see the comment in
			 * vdev_raidz_generate_parity_pq() above.
			 */
			for (i = 0; i < count; i++, dst++, src++) {
				mask = *dst & 0x8080808080808080ULL;
				mask = (mask << 1) - (mask >> 7);
				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
				    (mask & 0x1d1d1d1d1d1d1d1dULL);
				*dst ^= *src;
			}

			for (; i < xcount; i++, dst++) {
				mask = *dst & 0x8080808080808080ULL;
				mask = (mask << 1) - (mask >> 7);
				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
				    (mask & 0x1d1d1d1d1d1d1d1dULL);
			}
		}
	}

	/*
	 * XOR with the stored Q to isolate column x's contribution, then
	 * multiply each byte by 2^-(acols - 1 - x), i.e. 2^(255 - ...) since
	 * exponents work modulo 255 in this field.
	 */
	src = cols[VDEV_RAIDZ_Q].rc_data;
	dst = cols[x].rc_data;
	exp = 255 - (acols - 1 - x);

	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst ^= *src;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, exp);
		}
	}
}
453 
454 
/*
 * Rebuild two missing data columns x and y (x < y) using both the P
 * and Q parity columns, by solving a two-unknown linear system in
 * GF(2^8) (see the algebra spelled out below).
 *
 * NOTE(review): the temporary parity buffers come from zfs_alloc_temp()
 * and are never released here -- presumably the boot-time temp
 * allocator reclaims them wholesale; confirm against its definition.
 */
static void
vdev_raidz_reconstruct_pq(raidz_col_t *cols, int nparity, int acols,
    int x, int y)
{
	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
	void *pdata, *qdata;
	uint64_t xsize, ysize, i;

	//ASSERT(x < y);
	//ASSERT(x >= nparity);
	//ASSERT(y < acols);

	//ASSERT(cols[x].rc_size >= cols[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = cols[VDEV_RAIDZ_P].rc_data;
	qdata = cols[VDEV_RAIDZ_Q].rc_data;
	xsize = cols[x].rc_size;
	ysize = cols[y].rc_size;

	cols[VDEV_RAIDZ_P].rc_data =
		zfs_alloc_temp(cols[VDEV_RAIDZ_P].rc_size);
	cols[VDEV_RAIDZ_Q].rc_data =
		zfs_alloc_temp(cols[VDEV_RAIDZ_Q].rc_size);
	cols[x].rc_size = 0;
	cols[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(cols, nparity, acols);

	/* Column sizes must be restored before the solve loop below. */
	cols[x].rc_size = xsize;
	cols[y].rc_size = ysize;

	p = pdata;
	q = qdata;
	pxy = cols[VDEV_RAIDZ_P].rc_data;
	qxy = cols[VDEV_RAIDZ_Q].rc_data;
	xd = cols[x].rc_data;
	yd = cols[y].rc_data;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	/* Coefficients computed in the log domain (exponents mod 255). */
	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (acols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	/* Solve byte-by-byte; y may be shorter than x (zero-padded). */
	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
		    vdev_raidz_exp2(*q ^ *qxy, bexp);

		if (i < ysize)
			*yd = *p ^ *pxy ^ *xd;
	}

	/*
	 * Restore the saved parity data.
	 */
	cols[VDEV_RAIDZ_P].rc_data = pdata;
	cols[VDEV_RAIDZ_Q].rc_data = qdata;
}
536 
/*
 * Read (and, if needed, reconstruct) the block described by bp from a
 * RAID-Z vdev: map the byte offset onto per-child columns, read the
 * data columns, and escalate through parity reconstruction and finally
 * combinatorial reconstruction until the block checksum verifies.
 *
 * Returns 0 on success, or EIO when the data cannot be recovered.
 *
 * NOTE(review): cols[] is a fixed static array of 16 entries with no
 * bounds check against acols, and being static it also makes this
 * function non-reentrant -- confirm callers bound v_nchildren and are
 * single-threaded (true for boot code, presumably).
 */
static int
vdev_raidz_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
    off_t offset, size_t bytes)
{
	size_t psize = BP_GET_PSIZE(bp);
	vdev_t *kid;
	int unit_shift = vdev->v_ashift;
	int dcols = vdev->v_nchildren;
	int nparity = vdev->v_nparity;
	int missingdata, missingparity;
	int parity_errors, data_errors, unexpected_errors, total_errors;
	int parity_untried;
	uint64_t b = offset >> unit_shift;	/* starting sector */
	uint64_t s = psize >> unit_shift;	/* sectors to read */
	uint64_t f = b % dcols;			/* first column of the stripe */
	uint64_t o = (b / dcols) << unit_shift;	/* byte offset on each child */
	uint64_t q, r, coff;
	int c, c1, bc, col, acols, devidx, asize, n;
	static raidz_col_t cols[16];
	raidz_col_t *rc, *rc1;

	/*
	 * q/r split the data sectors across the data columns; the first
	 * bc columns (parity plus remainder) carry one extra sector.
	 */
	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	bc = (r == 0 ? 0 : r + nparity);

	acols = (q == 0 ? bc : dcols);
	asize = 0;

	/* Map each column to a child device, offset, and size. */
	for (c = 0; c < acols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			/* Wrapped past the last child: next rotation. */
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		cols[c].rc_devidx = col;
		cols[c].rc_offset = coff;
		cols[c].rc_size = (q + (c < bc)) << unit_shift;
		cols[c].rc_data = NULL;
		cols[c].rc_error = 0;
		cols[c].rc_tried = 0;
		cols[c].rc_skipped = 0;
		asize += cols[c].rc_size;
	}

	asize = roundup(asize, (nparity + 1) << unit_shift);

	/*
	 * Parity columns get temp buffers; the data columns share the
	 * caller's buffer back-to-back.
	 */
	for (c = 0; c < nparity; c++) {
		cols[c].rc_data = zfs_alloc_temp(cols[c].rc_size);
	}

	cols[c].rc_data = buf;

	for (c = c + 1; c < acols; c++)
		cols[c].rc_data = (char *)cols[c - 1].rc_data +
		    cols[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that
	 * parity will always be on the same device and, since parity
	 * isn't read during normal operation, that that device's I/O
	 * bandwidth won't be used effectively. We therefore switch
	 * the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a
	 * practical matter unless we juggle the parity between all
	 * devices evenly, we won't see any benefit. Further,
	 * occasional writes that aren't a multiple of the LCM of the
	 * number of children and the minimum stripe width are
	 * sufficient to avoid pessimal behavior.  Unfortunately, this
	 * decision created an implicit on-disk format requirement
	 * that we need to support for all eternity, but only for
	 * single-parity RAID-Z.
	 */
	//ASSERT(acols >= 2);
	//ASSERT(cols[0].rc_size == cols[1].rc_size);

	if (nparity == 1 && (offset & (1ULL << 20))) {
		devidx = cols[0].rc_devidx;
		o = cols[0].rc_offset;
		cols[0].rc_devidx = cols[1].rc_devidx;
		cols[0].rc_offset = cols[1].rc_offset;
		cols[1].rc_devidx = devidx;
		cols[1].rc_offset = o;
	}

	/*
	 * Iterate over the columns in reverse order so that we hit
	 * the parity last -- any errors along the way will force us
	 * to read the parity data.
	 */
	missingdata = 0;
	missingparity = 0;
	for (c = acols - 1; c >= 0; c--) {
		rc = &cols[c];
		devidx = rc->rc_devidx;
		STAILQ_FOREACH(kid, &vdev->v_children, v_childlink)
			if (kid->v_id == devidx)
				break;
		if (kid == NULL || kid->v_state != VDEV_STATE_HEALTHY) {
			if (c >= nparity)
				missingdata++;
			else
				missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
#if 0
		/*
		 * Too hard for the bootcode
		 */
		if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
			if (c >= nparity)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
#endif
		/* Read data columns always; parity only if data is missing. */
		if (c >= nparity || missingdata > 0) {
			if (rc->rc_data)
				rc->rc_error = kid->v_read(kid, NULL,
				    rc->rc_data, rc->rc_offset, rc->rc_size);
			else
				rc->rc_error = ENXIO;
			rc->rc_tried = 1;
			rc->rc_skipped = 0;
		}
	}

reconstruct:
	/* Tally the error state across all columns read so far. */
	parity_errors = 0;
	data_errors = 0;
	unexpected_errors = 0;
	total_errors = 0;
	parity_untried = 0;
	for (c = 0; c < acols; c++) {
		rc = &cols[c];

		if (rc->rc_error) {
			if (c < nparity)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < nparity && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less
	 * likely to occur. If we encounter more errors than we can
	 * repair or all phases fail, we have no choice but to return
	 * an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than
	 * or equal to the number of parity disks read -- attempt to
	 * produce data that has a valid checksum. Naturally, this
	 * case applies in the absence of any errors.
	 */
	if (total_errors <= nparity - parity_untried) {
		switch (data_errors) {
		case 0:
			if (zio_checksum_error(bp, buf) == 0)
				return (0);
			break;

		case 1:
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			//ASSERT(parity_untried == 0);
			//ASSERT(parity_errors < nparity);

			/*
			 * Find the column that reported the error.
			 */
			for (c = nparity; c < acols; c++) {
				rc = &cols[c];
				if (rc->rc_error != 0)
					break;
			}
			//ASSERT(c != acols);
			//ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE);

			if (cols[VDEV_RAIDZ_P].rc_error == 0) {
				vdev_raidz_reconstruct_p(cols, nparity,
				    acols, c);
			} else {
				//ASSERT(nparity > 1);
				vdev_raidz_reconstruct_q(cols, nparity,
				    acols, c);
			}

			if (zio_checksum_error(bp, buf) == 0)
				return (0);
			break;

		case 2:
			/*
			 * Two data column errors require double parity.
			 */
			//ASSERT(nparity == 2);

			/*
			 * Find the two columns that reported errors.
			 */
			for (c = nparity; c < acols; c++) {
				rc = &cols[c];
				if (rc->rc_error != 0)
					break;
			}
			//ASSERT(c != acols);
			//ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE);

			for (c1 = c++; c < acols; c++) {
				rc = &cols[c];
				if (rc->rc_error != 0)
					break;
			}
			//ASSERT(c != acols);
			//ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE);

			vdev_raidz_reconstruct_pq(cols, nparity, acols,
			    c1, c);

			if (zio_checksum_error(bp, buf) == 0)
				return (0);
			break;

		default:
			break;
			//ASSERT(nparity <= 2);
			//ASSERT(0);
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read
	 * error or a child silently returned bad data. Read every
	 * block so we can try again with as much data and parity as
	 * we can track down. If we've already been through once
	 * before, all children will be marked as tried so we'll
	 * proceed to combinatorial reconstruction.
	 */
	n = 0;
	for (c = 0; c < acols; c++) {
		rc = &cols[c];
		if (rc->rc_tried)
			continue;

		devidx = rc->rc_devidx;
		STAILQ_FOREACH(kid, &vdev->v_children, v_childlink)
			if (kid->v_id == devidx)
				break;
		if (kid == NULL || kid->v_state != VDEV_STATE_HEALTHY) {
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (rc->rc_data)
			rc->rc_error = kid->v_read(kid, NULL,
			    rc->rc_data, rc->rc_offset, rc->rc_size);
		else
			rc->rc_error = ENXIO;
		if (rc->rc_error == 0)
			n++;
		rc->rc_tried = 1;
		rc->rc_skipped = 0;
	}

	/*
	 * If we managed to read anything more, retry the
	 * reconstruction.
	 */
	if (n)
		goto reconstruct;

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. Before we attempt combinatorial reconstruction make
	 * sure we have a chance of coming up with the right answer.
	 */
	if (total_errors >= nparity) {
		return (EIO);
	}

	/* asize becomes the largest column size, for the scratch buffers. */
	asize = 0;
	for (c = 0; c < acols; c++) {
		rc = &cols[c];
		if (rc->rc_size > asize)
			asize = rc->rc_size;
	}
	if (cols[VDEV_RAIDZ_P].rc_error == 0) {
		/*
		 * Attempt to reconstruct the data from parity P:
		 * try every data column in turn, restoring the
		 * original bytes when the checksum still fails.
		 */
		void *orig;
		orig = zfs_alloc_temp(asize);
		for (c = nparity; c < acols; c++) {
			rc = &cols[c];

			memcpy(orig, rc->rc_data, rc->rc_size);
			vdev_raidz_reconstruct_p(cols, nparity, acols, c);

			if (zio_checksum_error(bp, buf) == 0)
				return (0);

			memcpy(rc->rc_data, orig, rc->rc_size);
		}
	}

	if (nparity > 1 && cols[VDEV_RAIDZ_Q].rc_error == 0) {
		/*
		 * Attempt to reconstruct the data from parity Q, one
		 * candidate column at a time, as above.
		 */
		void *orig;
		orig = zfs_alloc_temp(asize);
		for (c = nparity; c < acols; c++) {
			rc = &cols[c];

			memcpy(orig, rc->rc_data, rc->rc_size);
			vdev_raidz_reconstruct_q(cols, nparity, acols, c);

			if (zio_checksum_error(bp, buf) == 0)
				return (0);

			memcpy(rc->rc_data, orig, rc->rc_size);
		}
	}

	if (nparity > 1 &&
	    cols[VDEV_RAIDZ_P].rc_error == 0 &&
	    cols[VDEV_RAIDZ_Q].rc_error == 0) {
		/*
		 * Attempt to reconstruct the data from both P and Q,
		 * trying every pair of data columns.
		 */
		void *orig, *orig1;
		orig = zfs_alloc_temp(asize);
		orig1 = zfs_alloc_temp(asize);
		for (c = nparity; c < acols - 1; c++) {
			rc = &cols[c];

			memcpy(orig, rc->rc_data, rc->rc_size);

			for (c1 = c + 1; c1 < acols; c1++) {
				rc1 = &cols[c1];

				memcpy(orig1, rc1->rc_data, rc1->rc_size);

				vdev_raidz_reconstruct_pq(cols, nparity,
				    acols, c, c1);

				if (zio_checksum_error(bp, buf) == 0)
					return (0);

				memcpy(rc1->rc_data, orig1, rc1->rc_size);
			}

			memcpy(rc->rc_data, orig, rc->rc_size);
		}
	}

	return (EIO);
}
926 
927