xref: /freebsd/sys/cddl/boot/zfs/zfssubr.c (revision aa64588d28258aef88cc33b8043112e8856948d0)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/cdefs.h>
27 __FBSDID("$FreeBSD$");
28 
29 static uint64_t zfs_crc64_table[256];
30 
31 static void
32 zfs_init_crc(void)
33 {
34 	int i, j;
35 	uint64_t *ct;
36 
37 	/*
38 	 * Calculate the crc64 table (used for the zap hash
39 	 * function).
40 	 */
41 	if (zfs_crc64_table[128] != ZFS_CRC64_POLY) {
42 		memset(zfs_crc64_table, 0, sizeof(zfs_crc64_table));
43 		for (i = 0; i < 256; i++)
44 			for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
45 				*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
46 	}
47 }
48 
49 static void
50 zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
51 {
52 	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
53 }
54 
55 /*
56  * Signature for checksum functions.
57  */
58 typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);
59 
60 /*
61  * Information about each checksum function.
62  */
63 typedef struct zio_checksum_info {
64 	zio_checksum_t	*ci_func[2]; /* checksum function for each byteorder */
65 	int		ci_correctable;	/* number of correctable bits	*/
66 	int		ci_zbt;		/* uses zio block tail?	*/
67 	const char	*ci_name;	/* descriptive name */
68 } zio_checksum_info_t;
69 
70 #include "fletcher.c"
71 #include "sha256.c"
72 
/*
 * Checksum vectors, indexed by ZIO_CHECKSUM_* value from the block
 * pointer.  Only the native-endian slot (ci_func[0]) is populated and
 * used by this boot code; zio_checksum_error() returns EINVAL for
 * entries whose function is NULL.
 */
static zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
	{{NULL,			NULL},			0, 0,	"inherit"},
	{{NULL,			NULL},			0, 0,	"on"},
	{{zio_checksum_off,	zio_checksum_off},	0, 0,	"off"},
	{{zio_checksum_SHA256,	NULL},			1, 1,	"label"},
	{{zio_checksum_SHA256,	NULL},			1, 1,	"gang_header"},
	{{fletcher_2_native,	NULL},			0, 1,	"zilog"},
	{{fletcher_2_native,	NULL},			0, 0,	"fletcher2"},
	{{fletcher_4_native,	NULL},			1, 0,	"fletcher4"},
	{{zio_checksum_SHA256,	NULL},			1, 0,	"SHA256"},
};
84 
85 /*
86  * Common signature for all zio compress/decompress functions.
87  */
88 typedef size_t zio_compress_func_t(void *src, void *dst,
89     size_t s_len, size_t d_len, int);
90 typedef int zio_decompress_func_t(void *src, void *dst,
91     size_t s_len, size_t d_len, int);
92 
93 /*
94  * Information about each compression function.
95  */
96 typedef struct zio_compress_info {
97 	zio_compress_func_t	*ci_compress;	/* compression function */
98 	zio_decompress_func_t	*ci_decompress;	/* decompression function */
99 	int			ci_level;	/* level parameter */
100 	const char		*ci_name;	/* algorithm name */
101 } zio_compress_info_t;
102 
103 #include "lzjb.c"
104 
105 /*
106  * Compression vectors.
107  */
108 static zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
109 	{NULL,			NULL,			0,	"inherit"},
110 	{NULL,			NULL,			0,	"on"},
111 	{NULL,			NULL,			0,	"uncompressed"},
112 	{NULL,			lzjb_decompress,	0,	"lzjb"},
113 	{NULL,			NULL,			0,	"empty"},
114 	{NULL,			NULL,			1,	"gzip-1"},
115 	{NULL,			NULL,			2,	"gzip-2"},
116 	{NULL,			NULL,			3,	"gzip-3"},
117 	{NULL,			NULL,			4,	"gzip-4"},
118 	{NULL,			NULL,			5,	"gzip-5"},
119 	{NULL,			NULL,			6,	"gzip-6"},
120 	{NULL,			NULL,			7,	"gzip-7"},
121 	{NULL,			NULL,			8,	"gzip-8"},
122 	{NULL,			NULL,			9,	"gzip-9"},
123 };
124 
125 static int
126 zio_checksum_error(const blkptr_t *bp, void *data)
127 {
128 	zio_cksum_t zc = bp->blk_cksum;
129 	unsigned int checksum = BP_GET_CHECKSUM(bp);
130 	uint64_t size = BP_GET_PSIZE(bp);
131 	zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1;
132 	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
133 	zio_cksum_t actual_cksum, expected_cksum;
134 
135 	if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
136 		return (EINVAL);
137 
138 	if (ci->ci_zbt) {
139 		expected_cksum = zbt->zbt_cksum;
140 		zbt->zbt_cksum = zc;
141 		ci->ci_func[0](data, size, &actual_cksum);
142 		zbt->zbt_cksum = expected_cksum;
143 		zc = expected_cksum;
144 	} else {
145 		/* ASSERT(!BP_IS_GANG(bp)); */
146 		ci->ci_func[0](data, size, &actual_cksum);
147 	}
148 
149 	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, zc)) {
150 		/*printf("ZFS: read checksum failed\n");*/
151 		return (EIO);
152 	}
153 
154 	return (0);
155 }
156 
157 static int
158 zio_decompress_data(int cpfunc, void *src, uint64_t srcsize,
159 	void *dest, uint64_t destsize)
160 {
161 	zio_compress_info_t *ci = &zio_compress_table[cpfunc];
162 
163 	/* ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS); */
164 	if (!ci->ci_decompress) {
165 		printf("ZFS: unsupported compression algorithm %u\n", cpfunc);
166 		return (EIO);
167 	}
168 
169 	return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level));
170 }
171 
172 static uint64_t
173 zap_hash(uint64_t salt, const char *name)
174 {
175 	const uint8_t *cp;
176 	uint8_t c;
177 	uint64_t crc = salt;
178 
179 	/*ASSERT(crc != 0);*/
180 	/*ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);*/
181 	for (cp = (const uint8_t *)name; (c = *cp) != '\0'; cp++)
182 		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
183 
184 	/*
185 	 * Only use 28 bits, since we need 4 bits in the cookie for the
186 	 * collision differentiator.  We MUST use the high bits, since
187 	 * those are the onces that we first pay attention to when
188 	 * chosing the bucket.
189 	 */
190 	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
191 
192 	return (crc);
193 }
194 
195 static char *zfs_alloc_temp(size_t sz);
196 
/*
 * Per-column state for one RAID-Z I/O.  Columns 0..nparity-1 are the
 * parity columns (P first, then Q); the remaining columns hold data.
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

#define	VDEV_RAIDZ_P		0	/* column index of P (XOR) parity */
#define	VDEV_RAIDZ_Q		1	/* column index of Q parity */
209 
210 static void
211 vdev_raidz_reconstruct_p(raidz_col_t *cols, int nparity, int acols, int x)
212 {
213 	uint64_t *dst, *src, xcount, ccount, count, i;
214 	int c;
215 
216 	xcount = cols[x].rc_size / sizeof (src[0]);
217 	//ASSERT(xcount <= cols[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
218 	//ASSERT(xcount > 0);
219 
220 	src = cols[VDEV_RAIDZ_P].rc_data;
221 	dst = cols[x].rc_data;
222 	for (i = 0; i < xcount; i++, dst++, src++) {
223 		*dst = *src;
224 	}
225 
226 	for (c = nparity; c < acols; c++) {
227 		src = cols[c].rc_data;
228 		dst = cols[x].rc_data;
229 
230 		if (c == x)
231 			continue;
232 
233 		ccount = cols[c].rc_size / sizeof (src[0]);
234 		count = MIN(ccount, xcount);
235 
236 		for (i = 0; i < count; i++, dst++, src++) {
237 			*dst ^= *src;
238 		}
239 	}
240 }
241 
242 /*
243  * These two tables represent powers and logs of 2 in the Galois field defined
244  * above. These values were computed by repeatedly multiplying by 2 as above.
245  */
246 static const uint8_t vdev_raidz_pow2[256] = {
247 	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
248 	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
249 	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
250 	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
251 	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
252 	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
253 	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
254 	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
255 	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
256 	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
257 	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
258 	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
259 	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
260 	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
261 	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
262 	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
263 	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
264 	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
265 	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
266 	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
267 	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
268 	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
269 	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
270 	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
271 	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
272 	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
273 	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
274 	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
275 	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
276 	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
277 	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
278 	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
279 };
280 static const uint8_t vdev_raidz_log2[256] = {
281 	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
282 	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
283 	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
284 	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
285 	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
286 	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
287 	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
288 	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
289 	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
290 	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
291 	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
292 	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
293 	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
294 	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
295 	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
296 	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
297 	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
298 	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
299 	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
300 	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
301 	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
302 	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
303 	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
304 	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
305 	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
306 	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
307 	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
308 	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
309 	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
310 	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
311 	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
312 	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
313 };
314 
315 /*
316  * Multiply a given number by 2 raised to the given power.
317  */
318 static uint8_t
319 vdev_raidz_exp2(uint8_t a, int exp)
320 {
321 	if (a == 0)
322 		return (0);
323 
324 	//ASSERT(exp >= 0);
325 	//ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
326 
327 	exp += vdev_raidz_log2[a];
328 	if (exp > 255)
329 		exp -= 255;
330 
331 	return (vdev_raidz_pow2[exp]);
332 }
333 
/*
 * Compute both P (plain XOR) and Q (GF(2^8) multiply-accumulate) parity
 * over the data columns cols[nparity..acols-1], storing the results in
 * cols[VDEV_RAIDZ_P] and cols[VDEV_RAIDZ_Q].  Data columns shorter than
 * the parity columns are treated as zero-padded.
 */
static void
vdev_raidz_generate_parity_pq(raidz_col_t *cols, int nparity, int acols)
{
	uint64_t *q, *p, *src, pcount, ccount, mask, i;
	int c;

	pcount = cols[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	//ASSERT(cols[VDEV_RAIDZ_P].rc_size == cols[VDEV_RAIDZ_Q].rc_size);

	for (c = nparity; c < acols; c++) {
		src = cols[c].rc_data;
		p = cols[VDEV_RAIDZ_P].rc_data;
		q = cols[VDEV_RAIDZ_Q].rc_data;
		ccount = cols[c].rc_size / sizeof (src[0]);

		if (c == nparity) {
			/* First data column: initialize P and Q to it. */
			//ASSERT(ccount == pcount || ccount == 0);
			for (i = 0; i < ccount; i++, p++, q++, src++) {
				*q = *src;
				*p = *src;
			}
			for (; i < pcount; i++, p++, q++, src++) {
				*q = 0;
				*p = 0;
			}
		} else {
			//ASSERT(ccount <= pcount);

			/*
			 * Rather than multiplying each byte
			 * individually (as described above), we are
			 * able to handle 8 at once by generating a
			 * mask based on the high bit in each byte and
			 * using that to conditionally XOR in 0x1d.
			 */
			for (i = 0; i < ccount; i++, p++, q++, src++) {
				mask = *q & 0x8080808080808080ULL;
				mask = (mask << 1) - (mask >> 7);
				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
				    (mask & 0x1d1d1d1d1d1d1d1dULL);
				*q ^= *src;
				*p ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Q must still be multiplied by 2 for each column.
			 */
			for (; i < pcount; i++, q++) {
				mask = *q & 0x8080808080808080ULL;
				mask = (mask << 1) - (mask >> 7);
				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
				    (mask & 0x1d1d1d1d1d1d1d1dULL);
			}
		}
	}
}
390 
/*
 * Rebuild missing data column "x" from the Q parity alone: recompute a
 * partial Q over the surviving data columns (multiplying by 2 per column,
 * exactly as in parity generation), XOR it with the stored Q, then undo
 * the remaining power-of-2 factor byte by byte.
 */
static void
vdev_raidz_reconstruct_q(raidz_col_t *cols, int nparity, int acols, int x)
{
	uint64_t *dst, *src, xcount, ccount, count, mask, i;
	uint8_t *b;
	int c, j, exp;

	xcount = cols[x].rc_size / sizeof (src[0]);
	//ASSERT(xcount <= cols[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

	for (c = nparity; c < acols; c++) {
		src = cols[c].rc_data;
		dst = cols[x].rc_data;

		/* The missing column contributes zeros. */
		if (c == x)
			ccount = 0;
		else
			ccount = cols[c].rc_size / sizeof (src[0]);

		count = MIN(ccount, xcount);

		if (c == nparity) {
			/* First data column: initialize the accumulator. */
			for (i = 0; i < count; i++, dst++, src++) {
				*dst = *src;
			}
			for (; i < xcount; i++, dst++) {
				*dst = 0;
			}

		} else {
			/*
			 * For an explanation of this, see the comment in
			 * vdev_raidz_generate_parity_pq() above.
			 */
			for (i = 0; i < count; i++, dst++, src++) {
				mask = *dst & 0x8080808080808080ULL;
				mask = (mask << 1) - (mask >> 7);
				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
				    (mask & 0x1d1d1d1d1d1d1d1dULL);
				*dst ^= *src;
			}

			for (; i < xcount; i++, dst++) {
				mask = *dst & 0x8080808080808080ULL;
				mask = (mask << 1) - (mask >> 7);
				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
				    (mask & 0x1d1d1d1d1d1d1d1dULL);
			}
		}
	}

	/*
	 * XOR with the stored Q, then multiply each byte by the inverse of
	 * 2^(acols - 1 - x), i.e. by 2^(255 - (acols - 1 - x)), to recover
	 * the original data bytes.
	 */
	src = cols[VDEV_RAIDZ_Q].rc_data;
	dst = cols[x].rc_data;
	exp = 255 - (acols - 1 - x);

	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst ^= *src;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, exp);
		}
	}
}
453 
454 
/*
 * Rebuild two missing data columns "x" and "y" (x < y) from the P and Q
 * parity together.  temp_p and temp_q are caller-supplied scratch
 * buffers, each at least as large as a parity column.  The algebra is
 * laid out in the comment mid-function.
 */
static void
vdev_raidz_reconstruct_pq(raidz_col_t *cols, int nparity, int acols,
    int x, int y, void *temp_p, void *temp_q)
{
	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
	void *pdata, *qdata;
	uint64_t xsize, ysize, i;

	//ASSERT(x < y);
	//ASSERT(x >= nparity);
	//ASSERT(y < acols);

	//ASSERT(cols[x].rc_size >= cols[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = cols[VDEV_RAIDZ_P].rc_data;
	qdata = cols[VDEV_RAIDZ_Q].rc_data;
	xsize = cols[x].rc_size;
	ysize = cols[y].rc_size;

	cols[VDEV_RAIDZ_P].rc_data = temp_p;
	cols[VDEV_RAIDZ_Q].rc_data = temp_q;
	cols[x].rc_size = 0;
	cols[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(cols, nparity, acols);

	cols[x].rc_size = xsize;
	cols[y].rc_size = ysize;

	p = pdata;
	q = qdata;
	pxy = cols[VDEV_RAIDZ_P].rc_data;
	qxy = cols[VDEV_RAIDZ_Q].rc_data;
	xd = cols[x].rc_data;
	yd = cols[y].rc_data;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	/* All arithmetic below is in GF(2^8) via the log/pow tables. */
	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (acols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
		    vdev_raidz_exp2(*q ^ *qxy, bexp);

		/* y may be shorter than x; only fill its actual length. */
		if (i < ysize)
			*yd = *p ^ *pxy ^ *xd;
	}

	/*
	 * Restore the saved parity data.
	 */
	cols[VDEV_RAIDZ_P].rc_data = pdata;
	cols[VDEV_RAIDZ_Q].rc_data = qdata;
}
534 
535 static int
536 vdev_raidz_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
537     off_t offset, size_t bytes)
538 {
539 	size_t psize = BP_GET_PSIZE(bp);
540 	vdev_t *kid;
541 	int unit_shift = vdev->v_ashift;
542 	int dcols = vdev->v_nchildren;
543 	int nparity = vdev->v_nparity;
544 	int missingdata, missingparity;
545 	int parity_errors, data_errors, unexpected_errors, total_errors;
546 	int parity_untried;
547 	uint64_t b = offset >> unit_shift;
548 	uint64_t s = psize >> unit_shift;
549 	uint64_t f = b % dcols;
550 	uint64_t o = (b / dcols) << unit_shift;
551 	uint64_t q, r, coff;
552 	int c, c1, bc, col, acols, devidx, asize, n, max_rc_size;
553 	static raidz_col_t cols[16];
554 	raidz_col_t *rc, *rc1;
555 	void *orig, *orig1, *temp_p, *temp_q;
556 
557 	orig = orig1 = temp_p = temp_q = NULL;
558 
559 	q = s / (dcols - nparity);
560 	r = s - q * (dcols - nparity);
561 	bc = (r == 0 ? 0 : r + nparity);
562 
563 	acols = (q == 0 ? bc : dcols);
564 	asize = 0;
565 	max_rc_size = 0;
566 
567 	for (c = 0; c < acols; c++) {
568 		col = f + c;
569 		coff = o;
570 		if (col >= dcols) {
571 			col -= dcols;
572 			coff += 1ULL << unit_shift;
573 		}
574 		cols[c].rc_devidx = col;
575 		cols[c].rc_offset = coff;
576 		cols[c].rc_size = (q + (c < bc)) << unit_shift;
577 		cols[c].rc_data = NULL;
578 		cols[c].rc_error = 0;
579 		cols[c].rc_tried = 0;
580 		cols[c].rc_skipped = 0;
581 		asize += cols[c].rc_size;
582 		if (cols[c].rc_size > max_rc_size)
583 			max_rc_size = cols[c].rc_size;
584 	}
585 
586 	asize = roundup(asize, (nparity + 1) << unit_shift);
587 
588 	for (c = 0; c < nparity; c++) {
589 		cols[c].rc_data = zfs_alloc_temp(cols[c].rc_size);
590 	}
591 
592 	cols[c].rc_data = buf;
593 
594 	for (c = c + 1; c < acols; c++)
595 		cols[c].rc_data = (char *)cols[c - 1].rc_data +
596 		    cols[c - 1].rc_size;
597 
598 	/*
599 	 * If all data stored spans all columns, there's a danger that
600 	 * parity will always be on the same device and, since parity
601 	 * isn't read during normal operation, that that device's I/O
602 	 * bandwidth won't be used effectively. We therefore switch
603 	 * the parity every 1MB.
604 	 *
605 	 * ... at least that was, ostensibly, the theory. As a
606 	 * practical matter unless we juggle the parity between all
607 	 * devices evenly, we won't see any benefit. Further,
608 	 * occasional writes that aren't a multiple of the LCM of the
609 	 * number of children and the minimum stripe width are
610 	 * sufficient to avoid pessimal behavior.  Unfortunately, this
611 	 * decision created an implicit on-disk format requirement
612 	 * that we need to support for all eternity, but only for
613 	 * single-parity RAID-Z.
614 	 */
615 	//ASSERT(acols >= 2);
616 	//ASSERT(cols[0].rc_size == cols[1].rc_size);
617 
618 	if (nparity == 1 && (offset & (1ULL << 20))) {
619 		devidx = cols[0].rc_devidx;
620 		o = cols[0].rc_offset;
621 		cols[0].rc_devidx = cols[1].rc_devidx;
622 		cols[0].rc_offset = cols[1].rc_offset;
623 		cols[1].rc_devidx = devidx;
624 		cols[1].rc_offset = o;
625 	}
626 
627 	/*
628 	 * Iterate over the columns in reverse order so that we hit
629 	 * the parity last -- any errors along the way will force us
630 	 * to read the parity data.
631 	 */
632 	missingdata = 0;
633 	missingparity = 0;
634 	for (c = acols - 1; c >= 0; c--) {
635 		rc = &cols[c];
636 		devidx = rc->rc_devidx;
637 		STAILQ_FOREACH(kid, &vdev->v_children, v_childlink)
638 			if (kid->v_id == devidx)
639 				break;
640 		if (kid == NULL || kid->v_state != VDEV_STATE_HEALTHY) {
641 			if (c >= nparity)
642 				missingdata++;
643 			else
644 				missingparity++;
645 			rc->rc_error = ENXIO;
646 			rc->rc_tried = 1;	/* don't even try */
647 			rc->rc_skipped = 1;
648 			continue;
649 		}
650 #if 0
651 		/*
652 		 * Too hard for the bootcode
653 		 */
654 		if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
655 			if (c >= nparity)
656 				rm->rm_missingdata++;
657 			else
658 				rm->rm_missingparity++;
659 			rc->rc_error = ESTALE;
660 			rc->rc_skipped = 1;
661 			continue;
662 		}
663 #endif
664 		if (c >= nparity || missingdata > 0) {
665 			if (rc->rc_data)
666 				rc->rc_error = kid->v_read(kid, NULL,
667 				    rc->rc_data, rc->rc_offset, rc->rc_size);
668 			else
669 				rc->rc_error = ENXIO;
670 			rc->rc_tried = 1;
671 			rc->rc_skipped = 0;
672 		}
673 	}
674 
675 reconstruct:
676 	parity_errors = 0;
677 	data_errors = 0;
678 	unexpected_errors = 0;
679 	total_errors = 0;
680 	parity_untried = 0;
681 	for (c = 0; c < acols; c++) {
682 		rc = &cols[c];
683 
684 		if (rc->rc_error) {
685 			if (c < nparity)
686 				parity_errors++;
687 			else
688 				data_errors++;
689 
690 			if (!rc->rc_skipped)
691 				unexpected_errors++;
692 
693 			total_errors++;
694 		} else if (c < nparity && !rc->rc_tried) {
695 			parity_untried++;
696 		}
697 	}
698 
699 	/*
700 	 * There are three potential phases for a read:
701 	 *	1. produce valid data from the columns read
702 	 *	2. read all disks and try again
703 	 *	3. perform combinatorial reconstruction
704 	 *
705 	 * Each phase is progressively both more expensive and less
706 	 * likely to occur. If we encounter more errors than we can
707 	 * repair or all phases fail, we have no choice but to return
708 	 * an error.
709 	 */
710 
711 	/*
712 	 * If the number of errors we saw was correctable -- less than
713 	 * or equal to the number of parity disks read -- attempt to
714 	 * produce data that has a valid checksum. Naturally, this
715 	 * case applies in the absence of any errors.
716 	 */
717 	if (total_errors <= nparity - parity_untried) {
718 		switch (data_errors) {
719 		case 0:
720 			if (zio_checksum_error(bp, buf) == 0)
721 				return (0);
722 			break;
723 
724 		case 1:
725 			/*
726 			 * We either attempt to read all the parity columns or
727 			 * none of them. If we didn't try to read parity, we
728 			 * wouldn't be here in the correctable case. There must
729 			 * also have been fewer parity errors than parity
730 			 * columns or, again, we wouldn't be in this code path.
731 			 */
732 			//ASSERT(parity_untried == 0);
733 			//ASSERT(parity_errors < nparity);
734 
735 			/*
736 			 * Find the column that reported the error.
737 			 */
738 			for (c = nparity; c < acols; c++) {
739 				rc = &cols[c];
740 				if (rc->rc_error != 0)
741 					break;
742 			}
743 			//ASSERT(c != acols);
744 			//ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE);
745 
746 			if (cols[VDEV_RAIDZ_P].rc_error == 0) {
747 				vdev_raidz_reconstruct_p(cols, nparity,
748 				    acols, c);
749 			} else {
750 				//ASSERT(nparity > 1);
751 				vdev_raidz_reconstruct_q(cols, nparity,
752 				    acols, c);
753 			}
754 
755 			if (zio_checksum_error(bp, buf) == 0)
756 				return (0);
757 			break;
758 
759 		case 2:
760 			/*
761 			 * Two data column errors require double parity.
762 			 */
763 			//ASSERT(nparity == 2);
764 
765 			/*
766 			 * Find the two columns that reported errors.
767 			 */
768 			for (c = nparity; c < acols; c++) {
769 				rc = &cols[c];
770 				if (rc->rc_error != 0)
771 					break;
772 			}
773 			//ASSERT(c != acols);
774 			//ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE);
775 
776 			for (c1 = c++; c < acols; c++) {
777 				rc = &cols[c];
778 				if (rc->rc_error != 0)
779 					break;
780 			}
781 			//ASSERT(c != acols);
782 			//ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || rc->rc_error == ESTALE);
783 
784 			if (temp_p == NULL)
785 				temp_p = zfs_alloc_temp(max_rc_size);
786 			if (temp_q == NULL)
787 				temp_q = zfs_alloc_temp(max_rc_size);
788 
789 			vdev_raidz_reconstruct_pq(cols, nparity, acols,
790 			    c1, c, temp_p, temp_q);
791 
792 			if (zio_checksum_error(bp, buf) == 0)
793 				return (0);
794 			break;
795 
796 		default:
797 			break;
798 			//ASSERT(nparity <= 2);
799 			//ASSERT(0);
800 		}
801 	}
802 
803 	/*
804 	 * This isn't a typical situation -- either we got a read
805 	 * error or a child silently returned bad data. Read every
806 	 * block so we can try again with as much data and parity as
807 	 * we can track down. If we've already been through once
808 	 * before, all children will be marked as tried so we'll
809 	 * proceed to combinatorial reconstruction.
810 	 */
811 	n = 0;
812 	for (c = 0; c < acols; c++) {
813 		rc = &cols[c];
814 		if (rc->rc_tried)
815 			continue;
816 
817 		devidx = rc->rc_devidx;
818 		STAILQ_FOREACH(kid, &vdev->v_children, v_childlink)
819 			if (kid->v_id == devidx)
820 				break;
821 		if (kid == NULL || kid->v_state != VDEV_STATE_HEALTHY) {
822 			rc->rc_error = ENXIO;
823 			rc->rc_tried = 1;	/* don't even try */
824 			rc->rc_skipped = 1;
825 			continue;
826 		}
827 		if (rc->rc_data)
828 			rc->rc_error = kid->v_read(kid, NULL,
829 			    rc->rc_data, rc->rc_offset, rc->rc_size);
830 		else
831 			rc->rc_error = ENXIO;
832 		if (rc->rc_error == 0)
833 			n++;
834 		rc->rc_tried = 1;
835 		rc->rc_skipped = 0;
836 	}
837 
838 	/*
839 	 * If we managed to read anything more, retry the
840 	 * reconstruction.
841 	 */
842 	if (n)
843 		goto reconstruct;
844 
845 	/*
846 	 * At this point we've attempted to reconstruct the data given the
847 	 * errors we detected, and we've attempted to read all columns. There
848 	 * must, therefore, be one or more additional problems -- silent errors
849 	 * resulting in invalid data rather than explicit I/O errors resulting
850 	 * in absent data. Before we attempt combinatorial reconstruction make
851 	 * sure we have a chance of coming up with the right answer.
852 	 */
853 	if (total_errors >= nparity) {
854 		return (EIO);
855 	}
856 
857 	if (cols[VDEV_RAIDZ_P].rc_error == 0) {
858 		/*
859 		 * Attempt to reconstruct the data from parity P.
860 		 */
861 		if (orig == NULL)
862 			orig = zfs_alloc_temp(max_rc_size);
863 		for (c = nparity; c < acols; c++) {
864 			rc = &cols[c];
865 
866 			memcpy(orig, rc->rc_data, rc->rc_size);
867 			vdev_raidz_reconstruct_p(cols, nparity, acols, c);
868 
869 			if (zio_checksum_error(bp, buf) == 0)
870 				return (0);
871 
872 			memcpy(rc->rc_data, orig, rc->rc_size);
873 		}
874 	}
875 
876 	if (nparity > 1 && cols[VDEV_RAIDZ_Q].rc_error == 0) {
877 		/*
878 		 * Attempt to reconstruct the data from parity Q.
879 		 */
880 		if (orig == NULL)
881 			orig = zfs_alloc_temp(max_rc_size);
882 		for (c = nparity; c < acols; c++) {
883 			rc = &cols[c];
884 
885 			memcpy(orig, rc->rc_data, rc->rc_size);
886 			vdev_raidz_reconstruct_q(cols, nparity, acols, c);
887 
888 			if (zio_checksum_error(bp, buf) == 0)
889 				return (0);
890 
891 			memcpy(rc->rc_data, orig, rc->rc_size);
892 		}
893 	}
894 
895 	if (nparity > 1 &&
896 	    cols[VDEV_RAIDZ_P].rc_error == 0 &&
897 	    cols[VDEV_RAIDZ_Q].rc_error == 0) {
898 		/*
899 		 * Attempt to reconstruct the data from both P and Q.
900 		 */
901 		if (orig == NULL)
902 			orig = zfs_alloc_temp(max_rc_size);
903 		if (orig1 == NULL)
904 			orig1 = zfs_alloc_temp(max_rc_size);
905 		if (temp_p == NULL)
906 			temp_p = zfs_alloc_temp(max_rc_size);
907 		if (temp_q == NULL)
908 			temp_q = zfs_alloc_temp(max_rc_size);
909 		for (c = nparity; c < acols - 1; c++) {
910 			rc = &cols[c];
911 
912 			memcpy(orig, rc->rc_data, rc->rc_size);
913 
914 			for (c1 = c + 1; c1 < acols; c1++) {
915 				rc1 = &cols[c1];
916 
917 				memcpy(orig1, rc1->rc_data, rc1->rc_size);
918 
919 				vdev_raidz_reconstruct_pq(cols, nparity,
920 				    acols, c, c1, temp_p, temp_q);
921 
922 				if (zio_checksum_error(bp, buf) == 0)
923 					return (0);
924 
925 				memcpy(rc1->rc_data, orig1, rc1->rc_size);
926 			}
927 
928 			memcpy(rc->rc_data, orig, rc->rc_size);
929 		}
930 	}
931 
932 	return (EIO);
933 }
934 
935