xref: /freebsd/sys/cddl/boot/zfs/zfssubr.c (revision 830940567b49bb0c08dfaed40418999e76616909)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/cdefs.h>
27 __FBSDID("$FreeBSD$");
28 
29 static uint64_t zfs_crc64_table[256];
30 
31 static void
32 zfs_init_crc(void)
33 {
34 	int i, j;
35 	uint64_t *ct;
36 
37 	/*
38 	 * Calculate the crc64 table (used for the zap hash
39 	 * function).
40 	 */
41 	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
42 		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
43 		for (i = 0; i < 256; i++)
44 			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
45 				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
46 	}
47 }
48 
/*
 * Checksum vector for the "off" algorithm: unconditionally produces an
 * all-zero checksum, ignoring the buffer contents.
 */
static void
zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}
54 
55 /*
56  * Signature for checksum functions.
57  */
58 typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);
59 
60 /*
61  * Information about each checksum function.
62  */
63 typedef struct zio_checksum_info {
64 	zio_checksum_t	*ci_func[2]; /* checksum function for each byteorder */
65 	int		ci_correctable;	/* number of correctable bits	*/
66 	int		ci_zbt;		/* uses zio block tail?	*/
67 	const char	*ci_name;	/* descriptive name */
68 } zio_checksum_info_t;
69 
70 #include "fletcher.c"
71 #include "sha256.c"
72 
/*
 * Checksum vectors, indexed by ZIO_CHECKSUM_* value.  Only the
 * native-byteorder implementations (ci_func[0]) are provided in the
 * boot code; the byteswapped slots are all NULL.
 */
static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL,			NULL},			0, 0,	"inherit"},
	{{NULL,			NULL},			0, 0,	"on"},
	{{zio_checksum_off,	zio_checksum_off},	0, 0,	"off"},
	{{zio_checksum_SHA256,	NULL},			1, 1,	"label"},
	{{zio_checksum_SHA256,	NULL},			1, 1,	"gang_header"},
	{{fletcher_2_native,	NULL},			0, 1,	"zilog"},
	{{fletcher_2_native,	NULL},			0, 0,	"fletcher2"},
	{{fletcher_4_native,	NULL},			1, 0,	"fletcher4"},
	{{zio_checksum_SHA256,	NULL},			1, 0,	"SHA256"},
};
84 
85 /*
86  * Common signature for all zio compress/decompress functions.
87  */
88 typedef size_t zio_compress_func_t(void *src, void *dst,
89     size_t s_len, size_t d_len, int);
90 typedef int zio_decompress_func_t(void *src, void *dst,
91     size_t s_len, size_t d_len, int);
92 
93 /*
94  * Information about each compression function.
95  */
96 typedef struct zio_compress_info {
97 	zio_compress_func_t	*ci_compress;	/* compression function */
98 	zio_decompress_func_t	*ci_decompress;	/* decompression function */
99 	int			ci_level;	/* level parameter */
100 	const char		*ci_name;	/* algorithm name */
101 } zio_compress_info_t;
102 
103 #include "lzjb.c"
104 
105 /*
106  * Compression vectors.
107  */
108 static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
109 	{NULL,			NULL,			0,	"inherit"},
110 	{NULL,			NULL,			0,	"on"},
111 	{NULL,			NULL,			0,	"uncompressed"},
112 	{NULL,			lzjb_decompress,	0,	"lzjb"},
113 	{NULL,			NULL,			0,	"empty"},
114 	{NULL,			NULL,			1,	"gzip-1"},
115 	{NULL,			NULL,			2,	"gzip-2"},
116 	{NULL,			NULL,			3,	"gzip-3"},
117 	{NULL,			NULL,			4,	"gzip-4"},
118 	{NULL,			NULL,			5,	"gzip-5"},
119 	{NULL,			NULL,			6,	"gzip-6"},
120 	{NULL,			NULL,			7,	"gzip-7"},
121 	{NULL,			NULL,			8,	"gzip-8"},
122 	{NULL,			NULL,			9,	"gzip-9"},
123 };
124 
125 static int
126 zio_checksum_error(const blkptr_t *bp, void *data)
127 {
128 	zio_cksum_t zc = bp->blk_cksum;
129 	unsigned int checksum = BP_GET_CHECKSUM(bp);
130 	uint64_t size = BP_GET_PSIZE(bp);
131 	zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1;
132 	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
133 	zio_cksum_t actual_cksum, expected_cksum;
134 
135 	if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
136 		return (EINVAL);
137 
138 	if (ci->ci_zbt) {
139 		expected_cksum = zbt->zbt_cksum;
140 		zbt->zbt_cksum = zc;
141 		ci->ci_func[0](data, size, &actual_cksum);
142 		zbt->zbt_cksum = expected_cksum;
143 		zc = expected_cksum;
144 	} else {
145 		/* ASSERT(!BP_IS_GANG(bp)); */
146 		ci->ci_func[0](data, size, &actual_cksum);
147 	}
148 
149 	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, zc)) {
150 		/*printf("ZFS: read checksum failed\n");*/
151 		return (EIO);
152 	}
153 
154 	return (0);
155 }
156 
157 static int
158 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
159 	void *dest, uint64_t destsize)
160 {
161 	zio_compress_info_t *ci = &zio_compress_table[cpfunc];
162 
163 	/* ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS); */
164 	if (!ci->ci_decompress) {
165 		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
166 		return (EIO);
167 	}
168 
169 	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
170 }
171 
/*
 * Hash a NUL-terminated attribute name for the ZAP: a table-driven
 * CRC-64 seeded with the per-object salt, then truncated to the high
 * ZAP_HASHBITS bits.  Requires zfs_init_crc() to have run.
 */
static uint64_t
zap_hash(uint64_t salt, const char *name)
{
	const uint8_t *cp;
	uint8_t c;
	uint64_t crc = salt;

	/*ASSERT(crc != 0);*/
	/*ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);*/
	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];

	/*
	 * Only use 28 bits, since we need 4 bits in the cookie for the
	 * collision differentiator.  We MUST use the high bits, since
	 * those are the ones that we first pay attention to when
	 * choosing the bucket.
	 */
	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);

	return (crc);
}
194 
195 static char *zfs_alloc_temp(size_t sz);
196 
/*
 * Per-column state for one RAID-Z I/O: which child device the column
 * lives on, where, how big it is, and the outcome of reading it.
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;
206 
207 #define	VDEV_RAIDZ_P		0
208 #define	VDEV_RAIDZ_Q		1
209 
210 static void
211 vdev_raidz_reconstruct_p(raidz_col_t *cols, int nparity, int acols, int x)
212 {
213 	uint64_t *dst, *src, xcount, ccount, count, i;
214 	int c;
215 
216 	xcount = cols[x].rc_size / sizeof (src[0]);
217 	//ASSERT(xcount <= cols[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
218 	//ASSERT(xcount > 0);
219 
220 	src = cols[VDEV_RAIDZ_P].rc_data;
221 	dst = cols[x].rc_data;
222 	for (i = 0; i < xcount; i++, dst++, src++) {
223 		*dst = *src;
224 	}
225 
226 	for (c = nparity; c < acols; c++) {
227 		src = cols[c].rc_data;
228 		dst = cols[x].rc_data;
229 
230 		if (c == x)
231 			continue;
232 
233 		ccount = cols[c].rc_size / sizeof (src[0]);
234 		count = MIN(ccount, xcount);
235 
236 		for (i = 0; i < count; i++, dst++, src++) {
237 			*dst ^= *src;
238 		}
239 	}
240 }
241 
242 /*
243  * These two tables represent powers and logs of 2 in the Galois field defined
244  * above. These values were computed by repeatedly multiplying by 2 as above.
245  */
246 static const uint8_t vdev_raidz_pow2[256] = {
247 	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
248 	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
249 	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
250 	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
251 	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
252 	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
253 	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
254 	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
255 	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
256 	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
257 	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
258 	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
259 	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
260 	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
261 	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
262 	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
263 	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
264 	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
265 	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
266 	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
267 	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
268 	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
269 	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
270 	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
271 	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
272 	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
273 	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
274 	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
275 	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
276 	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
277 	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
278 	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
279 };
/*
 * Base-2 discrete logarithms in the same field: for a != 0,
 * vdev_raidz_pow2[vdev_raidz_log2[a]] == a.  Entry 0 is a placeholder
 * (the logarithm of zero is undefined); callers must special-case 0,
 * as vdev_raidz_exp2() does.
 */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
314 
315 /*
316  * Multiply a given number by 2 raised to the given power.
317  */
318 static uint8_t
319 vdev_raidz_exp2(uint8_t a, int exp)
320 {
321 	if (a == 0)
322 		return (0);
323 
324 	//ASSERT(exp >= 0);
325 	//ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
326 
327 	exp += vdev_raidz_log2[a];
328 	if (exp > 255)
329 		exp -= 255;
330 
331 	return (vdev_raidz_pow2[exp]);
332 }
333 
/*
 * Compute the P (plain XOR) and Q (GF(2^8) weighted XOR) parity
 * columns over the data columns cols[nparity..acols-1], writing into
 * cols[VDEV_RAIDZ_P] and cols[VDEV_RAIDZ_Q].  Between data columns,
 * the running Q value is multiplied by 2 in the field; data columns
 * shorter than the parity column are treated as zero-padded.
 */
static void
vdev_raidz_generate_parity_pq(raidz_col_t *cols, int nparity, int acols)
{
	uint64_t *q, *p, *src, pcount, ccount, mask, i;
	int c;

	pcount = cols[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	//ASSERT(cols[VDEV_RAIDZ_P].rc_size == cols[VDEV_RAIDZ_Q].rc_size);

	for (c = nparity; c < acols; c++) {
		src = cols[c].rc_data;
		p = cols[VDEV_RAIDZ_P].rc_data;
		q = cols[VDEV_RAIDZ_Q].rc_data;
		ccount = cols[c].rc_size / sizeof (src[0]);

		if (c == nparity) {
			/* First data column: initialize P and Q. */
			//ASSERT(ccount == pcount || ccount == 0);
			for (i = 0; i < ccount; i++, p++, q++, src++) {
				*q = *src;
				*p = *src;
			}
			for (; i < pcount; i++, p++, q++, src++) {
				*q = 0;
				*p = 0;
			}
		} else {
			//ASSERT(ccount <= pcount);

			/*
			 * Rather than multiplying each byte
			 * individually (as described above), we are
			 * able to handle 8 at once by generating a
			 * mask based on the high bit in each byte and
			 * using that to conditionally XOR in 0x1d.
			 */
			for (i = 0; i < ccount; i++, p++, q++, src++) {
				mask = *q & 0x8080808080808080ULL;
				mask = (mask << 1) - (mask >> 7);
				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
				    (mask & 0x1d1d1d1d1d1d1d1dULL);
				*q ^= *src;
				*p ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 */
			for (; i < pcount; i++, q++) {
				mask = *q & 0x8080808080808080ULL;
				mask = (mask << 1) - (mask >> 7);
				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
				    (mask & 0x1d1d1d1d1d1d1d1dULL);
			}
		}
	}
}
390 
/*
 * Rebuild missing data column x from the Q parity.  First accumulate
 * the Q contribution of every surviving data column directly into the
 * target (multiplying the running value by 2 in GF(2^8) between
 * columns, eight bytes at a time as in vdev_raidz_generate_parity_pq()),
 * then XOR with the stored Q and undo the column's weighting by
 * multiplying each byte by 2^(255 - (acols - 1 - x)) -- the field
 * inverse of the 2^(acols - 1 - x) coefficient used at write time.
 */
static void
vdev_raidz_reconstruct_q(raidz_col_t *cols, int nparity, int acols, int x)
{
	uint64_t *dst, *src, xcount, ccount, count, mask, i;
	uint8_t *b;
	int c, j, exp;

	xcount = cols[x].rc_size / sizeof (src[0]);
	//ASSERT(xcount <= cols[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

	for (c = nparity; c < acols; c++) {
		src = cols[c].rc_data;
		dst = cols[x].rc_data;

		/* The missing column itself contributes zeros. */
		if (c == x)
			ccount = 0;
		else
			ccount = cols[c].rc_size / sizeof (src[0]);

		count = MIN(ccount, xcount);

		if (c == nparity) {
			/* First data column: initialize the target. */
			for (i = 0; i < count; i++, dst++, src++) {
				*dst = *src;
			}
			for (; i < xcount; i++, dst++) {
				*dst = 0;
			}

		} else {
			/*
			 * For an explanation of this, see the comment in
			 * vdev_raidz_generate_parity_pq() above.
			 */
			for (i = 0; i < count; i++, dst++, src++) {
				mask = *dst & 0x8080808080808080ULL;
				mask = (mask << 1) - (mask >> 7);
				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
				    (mask & 0x1d1d1d1d1d1d1d1dULL);
				*dst ^= *src;
			}

			for (; i < xcount; i++, dst++) {
				mask = *dst & 0x8080808080808080ULL;
				mask = (mask << 1) - (mask >> 7);
				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
				    (mask & 0x1d1d1d1d1d1d1d1dULL);
			}
		}
	}

	/* XOR with stored Q, then divide out the column coefficient. */
	src = cols[VDEV_RAIDZ_Q].rc_data;
	dst = cols[x].rc_data;
	exp = 255 - (acols - 1 - x);

	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst ^= *src;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, exp);
		}
	}
}
453 
454 
/*
 * Rebuild two missing data columns x and y (x < y) using both the P
 * and Q parity.  Compute Pxy and Qxy -- the parities of the surviving
 * columns only -- then solve the resulting pair of GF(2^8) linear
 * equations for D_x and D_y (the derivation is in the comment below).
 */
static void
vdev_raidz_reconstruct_pq(raidz_col_t *cols, int nparity, int acols,
    int x, int y)
{
	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
	void *pdata, *qdata;
	uint64_t xsize, ysize, i;

	//ASSERT(x < y);
	//ASSERT(x >= nparity);
	//ASSERT(y < acols);

	//ASSERT(cols[x].rc_size >= cols[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = cols[VDEV_RAIDZ_P].rc_data;
	qdata = cols[VDEV_RAIDZ_Q].rc_data;
	xsize = cols[x].rc_size;
	ysize = cols[y].rc_size;

	cols[VDEV_RAIDZ_P].rc_data =
		zfs_alloc_temp(cols[VDEV_RAIDZ_P].rc_size);
	cols[VDEV_RAIDZ_Q].rc_data =
		zfs_alloc_temp(cols[VDEV_RAIDZ_Q].rc_size);
	cols[x].rc_size = 0;
	cols[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(cols, nparity, acols);

	cols[x].rc_size = xsize;
	cols[y].rc_size = ysize;

	p = pdata;
	q = qdata;
	pxy = cols[VDEV_RAIDZ_P].rc_data;
	qxy = cols[VDEV_RAIDZ_Q].rc_data;
	xd = cols[x].rc_data;
	yd = cols[y].rc_data;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	/* Compute the coefficients A and B above; aexp/bexp are their logs. */
	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (acols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
		    vdev_raidz_exp2(*q ^ *qxy, bexp);

		if (i < ysize)
			*yd = *p ^ *pxy ^ *xd;
	}

	/*
	 * Restore the saved parity data.
	 */
	cols[VDEV_RAIDZ_P].rc_data = pdata;
	cols[VDEV_RAIDZ_Q].rc_data = qdata;
}
536 
/*
 * Read a block from a RAID-Z vdev into buf and verify it against the
 * block pointer's checksum, reconstructing missing or silently corrupt
 * columns from parity when necessary.
 *
 * Recovery escalates through three phases (detailed in the comment at
 * the "reconstruct" label): use the columns already read, re-read all
 * remaining columns and retry, and finally combinatorial
 * reconstruction.  Returns 0 on success, EIO when the data cannot be
 * recovered.
 */
static int
vdev_raidz_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
    off_t offset, size_t bytes)
{
	size_t psize = BP_GET_PSIZE(bp);
	vdev_t *kid;
	int unit_shift = vdev->v_ashift;
	int dcols = vdev->v_nchildren;
	int nparity = vdev->v_nparity;
	int missingdata, missingparity;
	int parity_errors, data_errors, unexpected_errors, total_errors;
	int parity_untried;
	/* Convert the byte offset/size to sector units. */
	uint64_t b = offset >> unit_shift;
	uint64_t s = psize >> unit_shift;
	/* First column of this stripe and the per-device row offset. */
	uint64_t f = b % dcols;
	uint64_t o = (b / dcols) << unit_shift;
	int q, r, c, c1, bc, col, acols, coff, devidx, asize, n;
	/* NOTE(review): fixed-size column map -- assumes acols <= 16. */
	static raidz_col_t cols[16];
	raidz_col_t *rc, *rc1;

	/*
	 * q is the number of full rows per column, r the sectors left
	 * over, and bc the number of columns (parity included) that
	 * receive one extra sector.
	 */
	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	bc = (r == 0 ? 0 : r + nparity);

	acols = (q == 0 ? bc : dcols);
	asize = 0;

	/* Lay out each column: child device index, offset and size. */
	for (c = 0; c < acols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		cols[c].rc_devidx = col;
		cols[c].rc_offset = coff;
		cols[c].rc_size = (q + (c < bc)) << unit_shift;
		cols[c].rc_data = NULL;
		cols[c].rc_error = 0;
		cols[c].rc_tried = 0;
		cols[c].rc_skipped = 0;
		asize += cols[c].rc_size;
	}

	asize = roundup(asize, (nparity + 1) << unit_shift);

	/*
	 * Parity columns read into temporary storage; the data columns
	 * land directly in consecutive slices of the caller's buffer.
	 */
	for (c = 0; c < nparity; c++) {
		cols[c].rc_data = zfs_alloc_temp(cols[c].rc_size);
	}

	cols[c].rc_data = buf;

	for (c = c + 1; c < acols; c++)
		cols[c].rc_data = (char *)cols[c - 1].rc_data +
		    cols[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that
	 * parity will always be on the same device and, since parity
	 * isn't read during normal operation, that that device's I/O
	 * bandwidth won't be used effectively. We therefore switch
	 * the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a
	 * practical matter unless we juggle the parity between all
	 * devices evenly, we won't see any benefit. Further,
	 * occasional writes that aren't a multiple of the LCM of the
	 * number of children and the minimum stripe width are
	 * sufficient to avoid pessimal behavior.  Unfortunately, this
	 * decision created an implicit on-disk format requirement
	 * that we need to support for all eternity, but only for
	 * single-parity RAID-Z.
	 */
	//ASSERT(acols >= 2);
	//ASSERT(cols[0].rc_size == cols[1].rc_size);

	if (nparity == 1 && (offset & (1ULL << 20))) {
		devidx = cols[0].rc_devidx;
		o = cols[0].rc_offset;
		cols[0].rc_devidx = cols[1].rc_devidx;
		cols[0].rc_offset = cols[1].rc_offset;
		cols[1].rc_devidx = devidx;
		cols[1].rc_offset = o;
	}

	/*
	 * Iterate over the columns in reverse order so that we hit
	 * the parity last -- any errors along the way will force us
	 * to read the parity data.
	 */
	missingdata = 0;
	missingparity = 0;
	for (c = acols - 1; c >= 0; c--) {
		rc = &cols[c];
		devidx = rc->rc_devidx;
		/* Look up the child vdev that holds this column. */
		STAILQ_FOREACH(kid, &vdev->v_children, v_childlink)
			if (kid->v_id == devidx)
				break;
		if (kid == NULL || kid->v_state != VDEV_STATE_HEALTHY) {
			if (c >= nparity)
				missingdata++;
			else
				missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
#if 0
		/*
		 * Too hard for the bootcode
		 */
		if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
			if (c >= nparity)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
#endif
		/*
		 * Always read the data columns; read parity only once
		 * a data column has already been found missing.
		 */
		if (c >= nparity || missingdata > 0) {
			if (rc->rc_data)
				rc->rc_error = kid->v_read(kid, NULL,
				    rc->rc_data, rc->rc_offset, rc->rc_size);
			else
				rc->rc_error = ENXIO;
			rc->rc_tried = 1;
			rc->rc_skipped = 0;
		}
	}

reconstruct:
	/* Tally the outcome of every column attempted so far. */
	parity_errors = 0;
	data_errors = 0;
	unexpected_errors = 0;
	total_errors = 0;
	parity_untried = 0;
	for (c = 0; c < acols; c++) {
		rc = &cols[c];

		if (rc->rc_error) {
			if (c < nparity)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < nparity && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less
	 * likely to occur. If we encounter more errors than we can
	 * repair or all phases fail, we have no choice but to return
	 * an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than
	 * or equal to the number of parity disks read -- attempt to
	 * produce data that has a valid checksum. Naturally, this
	 * case applies in the absence of any errors.
	 */
	if (total_errors <= nparity - parity_untried) {
		switch (data_errors) {
		case 0:
			if (zio_checksum_error(bp, buf) == 0)
				return (0);
			break;

		case 1:
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			//ASSERT(parity_untried == 0);
			//ASSERT(parity_errors < nparity);

			/*
			 * Find the column that reported the error.
			 */
			for (c = nparity; c < acols; c++) {
				rc = &cols[c];
				if (rc->rc_error != 0)
					break;
			}
			//ASSERT(c != acols);
			//ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE);

			if (cols[VDEV_RAIDZ_P].rc_error == 0) {
				vdev_raidz_reconstruct_p(cols, nparity,
				    acols, c);
			} else {
				//ASSERT(nparity > 1);
				vdev_raidz_reconstruct_q(cols, nparity,
				    acols, c);
			}

			if (zio_checksum_error(bp, buf) == 0)
				return (0);
			break;

		case 2:
			/*
			 * Two data column errors require double parity.
			 */
			//ASSERT(nparity == 2);

			/*
			 * Find the two columns that reported errors.
			 */
			for (c = nparity; c < acols; c++) {
				rc = &cols[c];
				if (rc->rc_error != 0)
					break;
			}
			//ASSERT(c != acols);
			//ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE);

			for (c1 = c++; c < acols; c++) {
				rc = &cols[c];
				if (rc->rc_error != 0)
					break;
			}
			//ASSERT(c != acols);
			//ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE);

			vdev_raidz_reconstruct_pq(cols, nparity, acols,
			    c1, c);

			if (zio_checksum_error(bp, buf) == 0)
				return (0);
			break;

		default:
			break;
			//ASSERT(nparity <= 2);
			//ASSERT(0);
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read
	 * error or a child silently returned bad data. Read every
	 * block so we can try again with as much data and parity as
	 * we can track down. If we've already been through once
	 * before, all children will be marked as tried so we'll
	 * proceed to combinatorial reconstruction.
	 */
	n = 0;
	for (c = 0; c < acols; c++) {
		rc = &cols[c];
		if (rc->rc_tried)
			continue;

		devidx = rc->rc_devidx;
		STAILQ_FOREACH(kid, &vdev->v_children, v_childlink)
			if (kid->v_id == devidx)
				break;
		if (kid == NULL || kid->v_state != VDEV_STATE_HEALTHY) {
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (rc->rc_data)
			rc->rc_error = kid->v_read(kid, NULL,
			    rc->rc_data, rc->rc_offset, rc->rc_size);
		else
			rc->rc_error = ENXIO;
		if (rc->rc_error == 0)
			n++;
		rc->rc_tried = 1;
		rc->rc_skipped = 0;
	}

	/*
	 * If we managed to read anything more, retry the
	 * reconstruction.
	 */
	if (n)
		goto reconstruct;

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. Before we attempt combinatorial reconstruction make
	 * sure we have a chance of coming up with the right answer.
	 */
	if (total_errors >= nparity) {
		return (EIO);
	}

	/* Size the scratch save-buffer to the largest column. */
	asize = 0;
	for (c = 0; c < acols; c++) {
		rc = &cols[c];
		if (rc->rc_size > asize)
			asize = rc->rc_size;
	}
	if (cols[VDEV_RAIDZ_P].rc_error == 0) {
		/*
		 * Attempt to reconstruct the data from parity P:
		 * assume each data column in turn is the silently bad
		 * one, restoring it from the saved copy whenever the
		 * checksum still fails.
		 */
		void *orig;
		orig = zfs_alloc_temp(asize);
		for (c = nparity; c < acols; c++) {
			rc = &cols[c];

			memcpy(orig, rc->rc_data, rc->rc_size);
			vdev_raidz_reconstruct_p(cols, nparity, acols, c);

			if (zio_checksum_error(bp, buf) == 0)
				return (0);

			memcpy(rc->rc_data, orig, rc->rc_size);
		}
	}

	if (nparity > 1 && cols[VDEV_RAIDZ_Q].rc_error == 0) {
		/*
		 * Attempt to reconstruct the data from parity Q, again
		 * trying each data column in turn.
		 */
		void *orig;
		orig = zfs_alloc_temp(asize);
		for (c = nparity; c < acols; c++) {
			rc = &cols[c];

			memcpy(orig, rc->rc_data, rc->rc_size);
			vdev_raidz_reconstruct_q(cols, nparity, acols, c);

			if (zio_checksum_error(bp, buf) == 0)
				return (0);

			memcpy(rc->rc_data, orig, rc->rc_size);
		}
	}

	if (nparity > 1 &&
	    cols[VDEV_RAIDZ_P].rc_error == 0 &&
	    cols[VDEV_RAIDZ_Q].rc_error == 0) {
		/*
		 * Attempt to reconstruct the data from both P and Q,
		 * trying every pair of data columns.
		 */
		void *orig, *orig1;
		orig = zfs_alloc_temp(asize);
		orig1 = zfs_alloc_temp(asize);
		for (c = nparity; c < acols - 1; c++) {
			rc = &cols[c];

			memcpy(orig, rc->rc_data, rc->rc_size);

			for (c1 = c + 1; c1 < acols; c1++) {
				rc1 = &cols[c1];

				memcpy(orig1, rc1->rc_data, rc1->rc_size);

				vdev_raidz_reconstruct_pq(cols, nparity,
				    acols, c, c1);

				if (zio_checksum_error(bp, buf) == 0)
					return (0);

				memcpy(rc1->rc_data, orig1, rc1->rc_size);
			}

			memcpy(rc->rc_data, orig, rc->rc_size);
		}
	}

	return (EIO);
}
925 
926