xref: /titanic_50/usr/src/uts/common/fs/zfs/vdev_raidz.c (revision 4a8d0ea71c9a4e51c6a916a083ced6b499eb207f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/zfs_context.h>
28 #include <sys/spa.h>
29 #include <sys/vdev_impl.h>
30 #include <sys/zio.h>
31 #include <sys/zio_checksum.h>
32 #include <sys/fs/zfs.h>
33 #include <sys/fm/fs/zfs.h>
34 
35 /*
36  * Virtual device vector for RAID-Z.
37  *
38  * This vdev supports both single and double parity. For single parity, we
39  * use a simple XOR of all the data columns. For double parity, we use both
40  * the simple XOR as well as a technique described in "The mathematics of
41  * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8),
42  * over the integers expressible in a single byte. Briefly, the operations on
43  * the field are defined as follows:
44  *
45  *   o addition (+) is represented by a bitwise XOR
46  *   o subtraction (-) is therefore identical to addition: A + B = A - B
47  *   o multiplication of A by 2 is defined by the following bitwise expression:
48  *	(A * 2)_7 = A_6
49  *	(A * 2)_6 = A_5
50  *	(A * 2)_5 = A_4
51  *	(A * 2)_4 = A_3 + A_7
52  *	(A * 2)_3 = A_2 + A_7
53  *	(A * 2)_2 = A_1 + A_7
54  *	(A * 2)_1 = A_0
55  *	(A * 2)_0 = A_7
56  *
57  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
58  *
59  * Observe that any number in the field (except for 0) can be expressed as a
60  * power of 2 -- a generator for the field. We store a table of the powers of
61  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
62  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
63  * than field addition). The inverse of a field element A (A^-1) is A^254.
64  *
65  * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1,
66  * can be expressed by field operations:
67  *
68  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
69  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
70  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
71  *
72  * See the reconstruction code below for how P and Q can be used individually or
73  * in concert to recover missing data columns.
74  */
75 
/*
 * Per-column state for one RAID-Z I/O: which child device the column maps
 * to, where on that device it lives, the buffer used for it, and the outcome
 * of the child I/O.
 */
typedef struct raidz_col {
	/* child device index for I/O */
	uint64_t rc_devidx;
	/* device offset */
	uint64_t rc_offset;
	/* I/O size */
	uint64_t rc_size;
	/* I/O data */
	void *rc_data;
	/* I/O error for this device */
	int rc_error;
	/* Did we attempt this I/O column? */
	uint8_t rc_tried;
	/* Did we skip this I/O column? */
	uint8_t rc_skipped;
} raidz_col_t;
85 
86 typedef struct raidz_map {
87 	uint64_t rm_cols;		/* Column count */
88 	uint64_t rm_bigcols;		/* Number of oversized columns */
89 	uint64_t rm_asize;		/* Actual total I/O size */
90 	uint64_t rm_missingdata;	/* Count of missing data devices */
91 	uint64_t rm_missingparity;	/* Count of missing parity devices */
92 	uint64_t rm_firstdatacol;	/* First data column/parity count */
93 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
94 } raidz_map_t;
95 
/* Indices of the P and Q parity columns within rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1

/* Largest parity count this implementation handles. */
#define	VDEV_RAIDZ_MAXPARITY	2

/*
 * Multiply a single byte value by 2 in the field: shift left and fold in
 * 0x1d when the high bit was set. The result is not truncated to 8 bits;
 * callers store it into a byte-sized object.
 */
#define	VDEV_RAIDZ_MUL_2(a)	((((a) & 0x80) ? 0x1d : 0) ^ ((a) << 1))
102 
103 /*
104  * These two tables represent powers and logs of 2 in the Galois field defined
105  * above. These values were computed by repeatedly multiplying by 2 as above.
106  */
/*
 * vdev_raidz_pow2[i] is 2 raised to the power i in the field. The leading
 * comment on each row gives the index of the row's first entry. Note that
 * entry 255 wraps back around to 1.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	/* 0x00 */ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	/* 0x08 */ 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	/* 0x10 */ 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	/* 0x18 */ 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	/* 0x20 */ 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	/* 0x28 */ 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	/* 0x30 */ 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	/* 0x38 */ 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	/* 0x40 */ 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	/* 0x48 */ 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	/* 0x50 */ 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	/* 0x58 */ 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	/* 0x60 */ 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	/* 0x68 */ 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	/* 0x70 */ 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	/* 0x78 */ 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	/* 0x80 */ 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	/* 0x88 */ 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	/* 0x90 */ 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	/* 0x98 */ 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	/* 0xa0 */ 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	/* 0xa8 */ 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	/* 0xb0 */ 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	/* 0xb8 */ 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	/* 0xc0 */ 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	/* 0xc8 */ 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	/* 0xd0 */ 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	/* 0xd8 */ 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	/* 0xe0 */ 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	/* 0xe8 */ 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	/* 0xf0 */ 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	/* 0xf8 */ 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/*
 * vdev_raidz_log2[a] is the exponent i for which 2^i equals a in the field.
 * The leading comment on each row gives the index of the row's first entry.
 * Entry 0 is a placeholder since 0 is not a power of 2.
 */
static const uint8_t vdev_raidz_log2[256] = {
	/* 0x00 */ 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	/* 0x08 */ 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	/* 0x10 */ 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	/* 0x18 */ 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	/* 0x20 */ 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	/* 0x28 */ 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	/* 0x30 */ 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	/* 0x38 */ 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	/* 0x40 */ 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	/* 0x48 */ 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	/* 0x50 */ 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	/* 0x58 */ 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	/* 0x60 */ 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	/* 0x68 */ 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	/* 0x70 */ 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	/* 0x78 */ 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	/* 0x80 */ 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	/* 0x88 */ 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	/* 0x90 */ 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	/* 0x98 */ 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	/* 0xa0 */ 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	/* 0xa8 */ 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	/* 0xb0 */ 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	/* 0xb8 */ 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	/* 0xc0 */ 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	/* 0xc8 */ 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	/* 0xd0 */ 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	/* 0xd8 */ 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	/* 0xe0 */ 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	/* 0xe8 */ 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	/* 0xf0 */ 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	/* 0xf8 */ 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
175 
176 /*
177  * Multiply a given number by 2 raised to the given power.
178  */
179 static uint8_t
180 vdev_raidz_exp2(uint_t a, int exp)
181 {
182 	if (a == 0)
183 		return (0);
184 
185 	ASSERT(exp >= 0);
186 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
187 
188 	exp += vdev_raidz_log2[a];
189 	if (exp > 255)
190 		exp -= 255;
191 
192 	return (vdev_raidz_pow2[exp]);
193 }
194 
195 static void
196 vdev_raidz_map_free(zio_t *zio)
197 {
198 	raidz_map_t *rm = zio->io_vsd;
199 	int c;
200 
201 	for (c = 0; c < rm->rm_firstdatacol; c++)
202 		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
203 
204 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
205 }
206 
207 static raidz_map_t *
208 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
209     uint64_t nparity)
210 {
211 	raidz_map_t *rm;
212 	uint64_t b = zio->io_offset >> unit_shift;
213 	uint64_t s = zio->io_size >> unit_shift;
214 	uint64_t f = b % dcols;
215 	uint64_t o = (b / dcols) << unit_shift;
216 	uint64_t q, r, c, bc, col, acols, coff, devidx;
217 
218 	q = s / (dcols - nparity);
219 	r = s - q * (dcols - nparity);
220 	bc = (r == 0 ? 0 : r + nparity);
221 
222 	acols = (q == 0 ? bc : dcols);
223 
224 	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);
225 
226 	rm->rm_cols = acols;
227 	rm->rm_bigcols = bc;
228 	rm->rm_asize = 0;
229 	rm->rm_missingdata = 0;
230 	rm->rm_missingparity = 0;
231 	rm->rm_firstdatacol = nparity;
232 
233 	for (c = 0; c < acols; c++) {
234 		col = f + c;
235 		coff = o;
236 		if (col >= dcols) {
237 			col -= dcols;
238 			coff += 1ULL << unit_shift;
239 		}
240 		rm->rm_col[c].rc_devidx = col;
241 		rm->rm_col[c].rc_offset = coff;
242 		rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
243 		rm->rm_col[c].rc_data = NULL;
244 		rm->rm_col[c].rc_error = 0;
245 		rm->rm_col[c].rc_tried = 0;
246 		rm->rm_col[c].rc_skipped = 0;
247 		rm->rm_asize += rm->rm_col[c].rc_size;
248 	}
249 
250 	rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift);
251 
252 	for (c = 0; c < rm->rm_firstdatacol; c++)
253 		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
254 
255 	rm->rm_col[c].rc_data = zio->io_data;
256 
257 	for (c = c + 1; c < acols; c++)
258 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
259 		    rm->rm_col[c - 1].rc_size;
260 
261 	/*
262 	 * If all data stored spans all columns, there's a danger that parity
263 	 * will always be on the same device and, since parity isn't read
264 	 * during normal operation, that that device's I/O bandwidth won't be
265 	 * used effectively. We therefore switch the parity every 1MB.
266 	 *
267 	 * ... at least that was, ostensibly, the theory. As a practical
268 	 * matter unless we juggle the parity between all devices evenly, we
269 	 * won't see any benefit. Further, occasional writes that aren't a
270 	 * multiple of the LCM of the number of children and the minimum
271 	 * stripe width are sufficient to avoid pessimal behavior.
272 	 * Unfortunately, this decision created an implicit on-disk format
273 	 * requirement that we need to support for all eternity, but only
274 	 * for single-parity RAID-Z.
275 	 */
276 	ASSERT(rm->rm_cols >= 2);
277 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
278 
279 	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
280 		devidx = rm->rm_col[0].rc_devidx;
281 		o = rm->rm_col[0].rc_offset;
282 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
283 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
284 		rm->rm_col[1].rc_devidx = devidx;
285 		rm->rm_col[1].rc_offset = o;
286 	}
287 
288 	zio->io_vsd = rm;
289 	zio->io_vsd_free = vdev_raidz_map_free;
290 	return (rm);
291 }
292 
293 static void
294 vdev_raidz_generate_parity_p(raidz_map_t *rm)
295 {
296 	uint64_t *p, *src, pcount, ccount, i;
297 	int c;
298 
299 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
300 
301 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
302 		src = rm->rm_col[c].rc_data;
303 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
304 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
305 
306 		if (c == rm->rm_firstdatacol) {
307 			ASSERT(ccount == pcount);
308 			for (i = 0; i < ccount; i++, p++, src++) {
309 				*p = *src;
310 			}
311 		} else {
312 			ASSERT(ccount <= pcount);
313 			for (i = 0; i < ccount; i++, p++, src++) {
314 				*p ^= *src;
315 			}
316 		}
317 	}
318 }
319 
320 static void
321 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
322 {
323 	uint64_t *q, *p, *src, pcount, ccount, mask, i;
324 	int c;
325 
326 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
327 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
328 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
329 
330 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
331 		src = rm->rm_col[c].rc_data;
332 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
333 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
334 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
335 
336 		if (c == rm->rm_firstdatacol) {
337 			ASSERT(ccount == pcount || ccount == 0);
338 			for (i = 0; i < ccount; i++, p++, q++, src++) {
339 				*q = *src;
340 				*p = *src;
341 			}
342 			for (; i < pcount; i++, p++, q++, src++) {
343 				*q = 0;
344 				*p = 0;
345 			}
346 		} else {
347 			ASSERT(ccount <= pcount);
348 
349 			/*
350 			 * Rather than multiplying each byte individually (as
351 			 * described above), we are able to handle 8 at once
352 			 * by generating a mask based on the high bit in each
353 			 * byte and using that to conditionally XOR in 0x1d.
354 			 */
355 			for (i = 0; i < ccount; i++, p++, q++, src++) {
356 				mask = *q & 0x8080808080808080ULL;
357 				mask = (mask << 1) - (mask >> 7);
358 				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
359 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
360 				*q ^= *src;
361 				*p ^= *src;
362 			}
363 
364 			/*
365 			 * Treat short columns as though they are full of 0s.
366 			 */
367 			for (; i < pcount; i++, q++) {
368 				mask = *q & 0x8080808080808080ULL;
369 				mask = (mask << 1) - (mask >> 7);
370 				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
371 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
372 			}
373 		}
374 	}
375 }
376 
377 static void
378 vdev_raidz_reconstruct_p(raidz_map_t *rm, int x)
379 {
380 	uint64_t *dst, *src, xcount, ccount, count, i;
381 	int c;
382 
383 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
384 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
385 	ASSERT(xcount > 0);
386 
387 	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
388 	dst = rm->rm_col[x].rc_data;
389 	for (i = 0; i < xcount; i++, dst++, src++) {
390 		*dst = *src;
391 	}
392 
393 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
394 		src = rm->rm_col[c].rc_data;
395 		dst = rm->rm_col[x].rc_data;
396 
397 		if (c == x)
398 			continue;
399 
400 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
401 		count = MIN(ccount, xcount);
402 
403 		for (i = 0; i < count; i++, dst++, src++) {
404 			*dst ^= *src;
405 		}
406 	}
407 }
408 
409 static void
410 vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
411 {
412 	uint64_t *dst, *src, xcount, ccount, count, mask, i;
413 	uint8_t *b;
414 	int c, j, exp;
415 
416 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
417 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
418 
419 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
420 		src = rm->rm_col[c].rc_data;
421 		dst = rm->rm_col[x].rc_data;
422 
423 		if (c == x)
424 			ccount = 0;
425 		else
426 			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
427 
428 		count = MIN(ccount, xcount);
429 
430 		if (c == rm->rm_firstdatacol) {
431 			for (i = 0; i < count; i++, dst++, src++) {
432 				*dst = *src;
433 			}
434 			for (; i < xcount; i++, dst++) {
435 				*dst = 0;
436 			}
437 
438 		} else {
439 			/*
440 			 * For an explanation of this, see the comment in
441 			 * vdev_raidz_generate_parity_pq() above.
442 			 */
443 			for (i = 0; i < count; i++, dst++, src++) {
444 				mask = *dst & 0x8080808080808080ULL;
445 				mask = (mask << 1) - (mask >> 7);
446 				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
447 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
448 				*dst ^= *src;
449 			}
450 
451 			for (; i < xcount; i++, dst++) {
452 				mask = *dst & 0x8080808080808080ULL;
453 				mask = (mask << 1) - (mask >> 7);
454 				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
455 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
456 			}
457 		}
458 	}
459 
460 	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
461 	dst = rm->rm_col[x].rc_data;
462 	exp = 255 - (rm->rm_cols - 1 - x);
463 
464 	for (i = 0; i < xcount; i++, dst++, src++) {
465 		*dst ^= *src;
466 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
467 			*b = vdev_raidz_exp2(*b, exp);
468 		}
469 	}
470 }
471 
472 static void
473 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y)
474 {
475 	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
476 	void *pdata, *qdata;
477 	uint64_t xsize, ysize, i;
478 
479 	ASSERT(x < y);
480 	ASSERT(x >= rm->rm_firstdatacol);
481 	ASSERT(y < rm->rm_cols);
482 
483 	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
484 
485 	/*
486 	 * Move the parity data aside -- we're going to compute parity as
487 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
488 	 * reuse the parity generation mechanism without trashing the actual
489 	 * parity so we make those columns appear to be full of zeros by
490 	 * setting their lengths to zero.
491 	 */
492 	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
493 	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
494 	xsize = rm->rm_col[x].rc_size;
495 	ysize = rm->rm_col[y].rc_size;
496 
497 	rm->rm_col[VDEV_RAIDZ_P].rc_data =
498 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
499 	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
500 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
501 	rm->rm_col[x].rc_size = 0;
502 	rm->rm_col[y].rc_size = 0;
503 
504 	vdev_raidz_generate_parity_pq(rm);
505 
506 	rm->rm_col[x].rc_size = xsize;
507 	rm->rm_col[y].rc_size = ysize;
508 
509 	p = pdata;
510 	q = qdata;
511 	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
512 	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
513 	xd = rm->rm_col[x].rc_data;
514 	yd = rm->rm_col[y].rc_data;
515 
516 	/*
517 	 * We now have:
518 	 *	Pxy = P + D_x + D_y
519 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
520 	 *
521 	 * We can then solve for D_x:
522 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
523 	 * where
524 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
525 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
526 	 *
527 	 * With D_x in hand, we can easily solve for D_y:
528 	 *	D_y = P + Pxy + D_x
529 	 */
530 
531 	a = vdev_raidz_pow2[255 + x - y];
532 	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
533 	tmp = 255 - vdev_raidz_log2[a ^ 1];
534 
535 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
536 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
537 
538 	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
539 		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
540 		    vdev_raidz_exp2(*q ^ *qxy, bexp);
541 
542 		if (i < ysize)
543 			*yd = *p ^ *pxy ^ *xd;
544 	}
545 
546 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
547 	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
548 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
549 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
550 
551 	/*
552 	 * Restore the saved parity data.
553 	 */
554 	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
555 	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
556 }
557 
558 
559 static int
560 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
561 {
562 	uint64_t nparity = vd->vdev_nparity;
563 	int lasterror = 0;
564 	int numerrors = 0;
565 
566 	ASSERT(nparity > 0);
567 
568 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
569 	    vd->vdev_children < nparity + 1) {
570 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
571 		return (EINVAL);
572 	}
573 
574 	vdev_open_children(vd);
575 
576 	for (int c = 0; c < vd->vdev_children; c++) {
577 		vdev_t *cvd = vd->vdev_child[c];
578 
579 		if (cvd->vdev_open_error) {
580 			lasterror = cvd->vdev_open_error;
581 			numerrors++;
582 			continue;
583 		}
584 
585 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
586 		*ashift = MAX(*ashift, cvd->vdev_ashift);
587 	}
588 
589 	*asize *= vd->vdev_children;
590 
591 	if (numerrors > nparity) {
592 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
593 		return (lasterror);
594 	}
595 
596 	return (0);
597 }
598 
599 static void
600 vdev_raidz_close(vdev_t *vd)
601 {
602 	for (int c = 0; c < vd->vdev_children; c++)
603 		vdev_close(vd->vdev_child[c]);
604 }
605 
606 static uint64_t
607 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
608 {
609 	uint64_t asize;
610 	uint64_t ashift = vd->vdev_top->vdev_ashift;
611 	uint64_t cols = vd->vdev_children;
612 	uint64_t nparity = vd->vdev_nparity;
613 
614 	asize = ((psize - 1) >> ashift) + 1;
615 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
616 	asize = roundup(asize, nparity + 1) << ashift;
617 
618 	return (asize);
619 }
620 
621 static void
622 vdev_raidz_child_done(zio_t *zio)
623 {
624 	raidz_col_t *rc = zio->io_private;
625 
626 	rc->rc_error = zio->io_error;
627 	rc->rc_tried = 1;
628 	rc->rc_skipped = 0;
629 }
630 
631 static int
632 vdev_raidz_io_start(zio_t *zio)
633 {
634 	vdev_t *vd = zio->io_vd;
635 	vdev_t *tvd = vd->vdev_top;
636 	vdev_t *cvd;
637 	blkptr_t *bp = zio->io_bp;
638 	raidz_map_t *rm;
639 	raidz_col_t *rc;
640 	int c;
641 
642 	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
643 	    vd->vdev_nparity);
644 
645 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
646 
647 	if (zio->io_type == ZIO_TYPE_WRITE) {
648 		/*
649 		 * Generate RAID parity in the first virtual columns.
650 		 */
651 		if (rm->rm_firstdatacol == 1)
652 			vdev_raidz_generate_parity_p(rm);
653 		else
654 			vdev_raidz_generate_parity_pq(rm);
655 
656 		for (c = 0; c < rm->rm_cols; c++) {
657 			rc = &rm->rm_col[c];
658 			cvd = vd->vdev_child[rc->rc_devidx];
659 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
660 			    rc->rc_offset, rc->rc_data, rc->rc_size,
661 			    zio->io_type, zio->io_priority, 0,
662 			    vdev_raidz_child_done, rc));
663 		}
664 
665 		return (ZIO_PIPELINE_CONTINUE);
666 	}
667 
668 	ASSERT(zio->io_type == ZIO_TYPE_READ);
669 
670 	/*
671 	 * Iterate over the columns in reverse order so that we hit the parity
672 	 * last -- any errors along the way will force us to read the parity
673 	 * data.
674 	 */
675 	for (c = rm->rm_cols - 1; c >= 0; c--) {
676 		rc = &rm->rm_col[c];
677 		cvd = vd->vdev_child[rc->rc_devidx];
678 		if (!vdev_readable(cvd)) {
679 			if (c >= rm->rm_firstdatacol)
680 				rm->rm_missingdata++;
681 			else
682 				rm->rm_missingparity++;
683 			rc->rc_error = ENXIO;
684 			rc->rc_tried = 1;	/* don't even try */
685 			rc->rc_skipped = 1;
686 			continue;
687 		}
688 		if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) {
689 			if (c >= rm->rm_firstdatacol)
690 				rm->rm_missingdata++;
691 			else
692 				rm->rm_missingparity++;
693 			rc->rc_error = ESTALE;
694 			rc->rc_skipped = 1;
695 			continue;
696 		}
697 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
698 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
699 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
700 			    rc->rc_offset, rc->rc_data, rc->rc_size,
701 			    zio->io_type, zio->io_priority, 0,
702 			    vdev_raidz_child_done, rc));
703 		}
704 	}
705 
706 	return (ZIO_PIPELINE_CONTINUE);
707 }
708 
709 /*
710  * Report a checksum error for a child of a RAID-Z device.
711  */
712 static void
713 raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
714 {
715 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
716 
717 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
718 		mutex_enter(&vd->vdev_stat_lock);
719 		vd->vdev_stat.vs_checksum_errors++;
720 		mutex_exit(&vd->vdev_stat_lock);
721 	}
722 
723 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
724 		zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
725 		    zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
726 }
727 
728 /*
729  * Generate the parity from the data columns. If we tried and were able to
730  * read the parity without error, verify that the generated parity matches the
731  * data we read. If it doesn't, we fire off a checksum error. Return the
732  * number such failures.
733  */
734 static int
735 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
736 {
737 	void *orig[VDEV_RAIDZ_MAXPARITY];
738 	int c, ret = 0;
739 	raidz_col_t *rc;
740 
741 	for (c = 0; c < rm->rm_firstdatacol; c++) {
742 		rc = &rm->rm_col[c];
743 		if (!rc->rc_tried || rc->rc_error != 0)
744 			continue;
745 		orig[c] = zio_buf_alloc(rc->rc_size);
746 		bcopy(rc->rc_data, orig[c], rc->rc_size);
747 	}
748 
749 	if (rm->rm_firstdatacol == 1)
750 		vdev_raidz_generate_parity_p(rm);
751 	else
752 		vdev_raidz_generate_parity_pq(rm);
753 
754 	for (c = 0; c < rm->rm_firstdatacol; c++) {
755 		rc = &rm->rm_col[c];
756 		if (!rc->rc_tried || rc->rc_error != 0)
757 			continue;
758 		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
759 			raidz_checksum_error(zio, rc);
760 			rc->rc_error = ECKSUM;
761 			ret++;
762 		}
763 		zio_buf_free(orig[c], rc->rc_size);
764 	}
765 
766 	return (ret);
767 }
768 
/*
 * Counts of reads that were successfully repaired using P alone, Q alone,
 * and P and Q together. They are bumped with atomic_inc_64().
 */
static uint64_t raidz_corrected_p = 0;
static uint64_t raidz_corrected_q = 0;
static uint64_t raidz_corrected_pq = 0;
772 
773 static int
774 vdev_raidz_worst_error(raidz_map_t *rm)
775 {
776 	int error = 0;
777 
778 	for (int c = 0; c < rm->rm_cols; c++)
779 		error = zio_worst_error(error, rm->rm_col[c].rc_error);
780 
781 	return (error);
782 }
783 
784 static void
785 vdev_raidz_io_done(zio_t *zio)
786 {
787 	vdev_t *vd = zio->io_vd;
788 	vdev_t *cvd;
789 	raidz_map_t *rm = zio->io_vsd;
790 	raidz_col_t *rc, *rc1;
791 	int unexpected_errors = 0;
792 	int parity_errors = 0;
793 	int parity_untried = 0;
794 	int data_errors = 0;
795 	int total_errors = 0;
796 	int n, c, c1;
797 
798 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
799 
800 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
801 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
802 
803 	for (c = 0; c < rm->rm_cols; c++) {
804 		rc = &rm->rm_col[c];
805 
806 		if (rc->rc_error) {
807 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
808 
809 			if (c < rm->rm_firstdatacol)
810 				parity_errors++;
811 			else
812 				data_errors++;
813 
814 			if (!rc->rc_skipped)
815 				unexpected_errors++;
816 
817 			total_errors++;
818 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
819 			parity_untried++;
820 		}
821 	}
822 
823 	if (zio->io_type == ZIO_TYPE_WRITE) {
824 		/*
825 		 * XXX -- for now, treat partial writes as a success.
826 		 * (If we couldn't write enough columns to reconstruct
827 		 * the data, the I/O failed.  Otherwise, good enough.)
828 		 *
829 		 * Now that we support write reallocation, it would be better
830 		 * to treat partial failure as real failure unless there are
831 		 * no non-degraded top-level vdevs left, and not update DTLs
832 		 * if we intend to reallocate.
833 		 */
834 		/* XXPOLICY */
835 		if (total_errors > rm->rm_firstdatacol)
836 			zio->io_error = vdev_raidz_worst_error(rm);
837 
838 		return;
839 	}
840 
841 	ASSERT(zio->io_type == ZIO_TYPE_READ);
842 	/*
843 	 * There are three potential phases for a read:
844 	 *	1. produce valid data from the columns read
845 	 *	2. read all disks and try again
846 	 *	3. perform combinatorial reconstruction
847 	 *
848 	 * Each phase is progressively both more expensive and less likely to
849 	 * occur. If we encounter more errors than we can repair or all phases
850 	 * fail, we have no choice but to return an error.
851 	 */
852 
853 	/*
854 	 * If the number of errors we saw was correctable -- less than or equal
855 	 * to the number of parity disks read -- attempt to produce data that
856 	 * has a valid checksum. Naturally, this case applies in the absence of
857 	 * any errors.
858 	 */
859 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
860 		switch (data_errors) {
861 		case 0:
862 			if (zio_checksum_error(zio) == 0) {
863 				/*
864 				 * If we read parity information (unnecessarily
865 				 * as it happens since no reconstruction was
866 				 * needed) regenerate and verify the parity.
867 				 * We also regenerate parity when resilvering
868 				 * so we can write it out to the failed device
869 				 * later.
870 				 */
871 				if (parity_errors + parity_untried <
872 				    rm->rm_firstdatacol ||
873 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
874 					n = raidz_parity_verify(zio, rm);
875 					unexpected_errors += n;
876 					ASSERT(parity_errors + n <=
877 					    rm->rm_firstdatacol);
878 				}
879 				goto done;
880 			}
881 			break;
882 
883 		case 1:
884 			/*
885 			 * We either attempt to read all the parity columns or
886 			 * none of them. If we didn't try to read parity, we
887 			 * wouldn't be here in the correctable case. There must
888 			 * also have been fewer parity errors than parity
889 			 * columns or, again, we wouldn't be in this code path.
890 			 */
891 			ASSERT(parity_untried == 0);
892 			ASSERT(parity_errors < rm->rm_firstdatacol);
893 
894 			/*
895 			 * Find the column that reported the error.
896 			 */
897 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
898 				rc = &rm->rm_col[c];
899 				if (rc->rc_error != 0)
900 					break;
901 			}
902 			ASSERT(c != rm->rm_cols);
903 			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
904 			    rc->rc_error == ESTALE);
905 
906 			if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
907 				vdev_raidz_reconstruct_p(rm, c);
908 			} else {
909 				ASSERT(rm->rm_firstdatacol > 1);
910 				vdev_raidz_reconstruct_q(rm, c);
911 			}
912 
913 			if (zio_checksum_error(zio) == 0) {
914 				if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0)
915 					atomic_inc_64(&raidz_corrected_p);
916 				else
917 					atomic_inc_64(&raidz_corrected_q);
918 
919 				/*
920 				 * If there's more than one parity disk that
921 				 * was successfully read, confirm that the
922 				 * other parity disk produced the correct data.
923 				 * This routine is suboptimal in that it
924 				 * regenerates both the parity we wish to test
925 				 * as well as the parity we just used to
926 				 * perform the reconstruction, but this should
927 				 * be a relatively uncommon case, and can be
928 				 * optimized if it becomes a problem.
929 				 * We also regenerate parity when resilvering
930 				 * so we can write it out to the failed device
931 				 * later.
932 				 */
933 				if (parity_errors < rm->rm_firstdatacol - 1 ||
934 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
935 					n = raidz_parity_verify(zio, rm);
936 					unexpected_errors += n;
937 					ASSERT(parity_errors + n <=
938 					    rm->rm_firstdatacol);
939 				}
940 
941 				goto done;
942 			}
943 			break;
944 
945 		case 2:
946 			/*
947 			 * Two data column errors require double parity.
948 			 */
949 			ASSERT(rm->rm_firstdatacol == 2);
950 
951 			/*
952 			 * Find the two columns that reported errors.
953 			 */
954 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
955 				rc = &rm->rm_col[c];
956 				if (rc->rc_error != 0)
957 					break;
958 			}
959 			ASSERT(c != rm->rm_cols);
960 			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
961 			    rc->rc_error == ESTALE);
962 
963 			for (c1 = c++; c < rm->rm_cols; c++) {
964 				rc = &rm->rm_col[c];
965 				if (rc->rc_error != 0)
966 					break;
967 			}
968 			ASSERT(c != rm->rm_cols);
969 			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
970 			    rc->rc_error == ESTALE);
971 
972 			vdev_raidz_reconstruct_pq(rm, c1, c);
973 
974 			if (zio_checksum_error(zio) == 0) {
975 				atomic_inc_64(&raidz_corrected_pq);
976 				goto done;
977 			}
978 			break;
979 
980 		default:
981 			ASSERT(rm->rm_firstdatacol <= 2);
982 			ASSERT(0);
983 		}
984 	}
985 
986 	/*
987 	 * This isn't a typical situation -- either we got a read error or
988 	 * a child silently returned bad data. Read every block so we can
989 	 * try again with as much data and parity as we can track down. If
990 	 * we've already been through once before, all children will be marked
991 	 * as tried so we'll proceed to combinatorial reconstruction.
992 	 */
993 	unexpected_errors = 1;
994 	rm->rm_missingdata = 0;
995 	rm->rm_missingparity = 0;
996 
997 	for (c = 0; c < rm->rm_cols; c++) {
998 		if (rm->rm_col[c].rc_tried)
999 			continue;
1000 
1001 		zio_vdev_io_redone(zio);
1002 		do {
1003 			rc = &rm->rm_col[c];
1004 			if (rc->rc_tried)
1005 				continue;
1006 			zio_nowait(zio_vdev_child_io(zio, NULL,
1007 			    vd->vdev_child[rc->rc_devidx],
1008 			    rc->rc_offset, rc->rc_data, rc->rc_size,
1009 			    zio->io_type, zio->io_priority, 0,
1010 			    vdev_raidz_child_done, rc));
1011 		} while (++c < rm->rm_cols);
1012 
1013 		return;
1014 	}
1015 
1016 	/*
1017 	 * At this point we've attempted to reconstruct the data given the
1018 	 * errors we detected, and we've attempted to read all columns. There
1019 	 * must, therefore, be one or more additional problems -- silent errors
1020 	 * resulting in invalid data rather than explicit I/O errors resulting
1021 	 * in absent data. Before we attempt combinatorial reconstruction make
1022 	 * sure we have a chance of coming up with the right answer.
1023 	 */
1024 	if (total_errors >= rm->rm_firstdatacol) {
1025 		zio->io_error = vdev_raidz_worst_error(rm);
1026 		/*
1027 		 * If there were exactly as many device errors as parity
1028 		 * columns, yet we couldn't reconstruct the data, then at
1029 		 * least one device must have returned bad data silently.
1030 		 */
1031 		if (total_errors == rm->rm_firstdatacol)
1032 			zio->io_error = zio_worst_error(zio->io_error, ECKSUM);
1033 		goto done;
1034 	}
1035 
1036 	if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
1037 		/*
1038 		 * Attempt to reconstruct the data from parity P.
1039 		 */
1040 		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1041 			void *orig;
1042 			rc = &rm->rm_col[c];
1043 
1044 			orig = zio_buf_alloc(rc->rc_size);
1045 			bcopy(rc->rc_data, orig, rc->rc_size);
1046 			vdev_raidz_reconstruct_p(rm, c);
1047 
1048 			if (zio_checksum_error(zio) == 0) {
1049 				zio_buf_free(orig, rc->rc_size);
1050 				atomic_inc_64(&raidz_corrected_p);
1051 
1052 				/*
1053 				 * If this child didn't know that it returned
1054 				 * bad data, inform it.
1055 				 */
1056 				if (rc->rc_tried && rc->rc_error == 0)
1057 					raidz_checksum_error(zio, rc);
1058 				rc->rc_error = ECKSUM;
1059 				goto done;
1060 			}
1061 
1062 			bcopy(orig, rc->rc_data, rc->rc_size);
1063 			zio_buf_free(orig, rc->rc_size);
1064 		}
1065 	}
1066 
1067 	if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
1068 		/*
1069 		 * Attempt to reconstruct the data from parity Q.
1070 		 */
1071 		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1072 			void *orig;
1073 			rc = &rm->rm_col[c];
1074 
1075 			orig = zio_buf_alloc(rc->rc_size);
1076 			bcopy(rc->rc_data, orig, rc->rc_size);
1077 			vdev_raidz_reconstruct_q(rm, c);
1078 
1079 			if (zio_checksum_error(zio) == 0) {
1080 				zio_buf_free(orig, rc->rc_size);
1081 				atomic_inc_64(&raidz_corrected_q);
1082 
1083 				/*
1084 				 * If this child didn't know that it returned
1085 				 * bad data, inform it.
1086 				 */
1087 				if (rc->rc_tried && rc->rc_error == 0)
1088 					raidz_checksum_error(zio, rc);
1089 				rc->rc_error = ECKSUM;
1090 				goto done;
1091 			}
1092 
1093 			bcopy(orig, rc->rc_data, rc->rc_size);
1094 			zio_buf_free(orig, rc->rc_size);
1095 		}
1096 	}
1097 
1098 	if (rm->rm_firstdatacol > 1 &&
1099 	    rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 &&
1100 	    rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
1101 		/*
1102 		 * Attempt to reconstruct the data from both P and Q.
1103 		 */
1104 		for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) {
1105 			void *orig, *orig1;
1106 			rc = &rm->rm_col[c];
1107 
1108 			orig = zio_buf_alloc(rc->rc_size);
1109 			bcopy(rc->rc_data, orig, rc->rc_size);
1110 
1111 			for (c1 = c + 1; c1 < rm->rm_cols; c1++) {
1112 				rc1 = &rm->rm_col[c1];
1113 
1114 				orig1 = zio_buf_alloc(rc1->rc_size);
1115 				bcopy(rc1->rc_data, orig1, rc1->rc_size);
1116 
1117 				vdev_raidz_reconstruct_pq(rm, c, c1);
1118 
1119 				if (zio_checksum_error(zio) == 0) {
1120 					zio_buf_free(orig, rc->rc_size);
1121 					zio_buf_free(orig1, rc1->rc_size);
1122 					atomic_inc_64(&raidz_corrected_pq);
1123 
1124 					/*
1125 					 * If these children didn't know they
1126 					 * returned bad data, inform them.
1127 					 */
1128 					if (rc->rc_tried && rc->rc_error == 0)
1129 						raidz_checksum_error(zio, rc);
1130 					if (rc1->rc_tried && rc1->rc_error == 0)
1131 						raidz_checksum_error(zio, rc1);
1132 
1133 					rc->rc_error = ECKSUM;
1134 					rc1->rc_error = ECKSUM;
1135 
1136 					goto done;
1137 				}
1138 
1139 				bcopy(orig1, rc1->rc_data, rc1->rc_size);
1140 				zio_buf_free(orig1, rc1->rc_size);
1141 			}
1142 
1143 			bcopy(orig, rc->rc_data, rc->rc_size);
1144 			zio_buf_free(orig, rc->rc_size);
1145 		}
1146 	}
1147 
1148 	/*
1149 	 * All combinations failed to checksum. Generate checksum ereports for
1150 	 * all children.
1151 	 */
1152 	zio->io_error = ECKSUM;
1153 
1154 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1155 		for (c = 0; c < rm->rm_cols; c++) {
1156 			rc = &rm->rm_col[c];
1157 			zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
1158 			    zio->io_spa, vd->vdev_child[rc->rc_devidx], zio,
1159 			    rc->rc_offset, rc->rc_size);
1160 		}
1161 	}
1162 
1163 done:
1164 	zio_checksum_verified(zio);
1165 
1166 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
1167 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
1168 		/*
1169 		 * Use the good data we have in hand to repair damaged children.
1170 		 */
1171 		for (c = 0; c < rm->rm_cols; c++) {
1172 			rc = &rm->rm_col[c];
1173 			cvd = vd->vdev_child[rc->rc_devidx];
1174 
1175 			if (rc->rc_error == 0)
1176 				continue;
1177 
1178 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1179 			    rc->rc_offset, rc->rc_data, rc->rc_size,
1180 			    ZIO_TYPE_WRITE, zio->io_priority,
1181 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
1182 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
1183 		}
1184 	}
1185 }
1186 
1187 static void
1188 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
1189 {
1190 	if (faulted > vd->vdev_nparity)
1191 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1192 		    VDEV_AUX_NO_REPLICAS);
1193 	else if (degraded + faulted != 0)
1194 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
1195 	else
1196 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
1197 }
1198 
/*
 * Operations vector for the RAID-Z vdev type.
 *
 * The initializer is positional: each entry fills the vdev_ops_t member at
 * the same position, so the order below must follow the member order in the
 * declaration of vdev_ops_t (not visible in this file; presumably
 * <sys/vdev_impl.h> -- confirm before reordering or adding entries).
 */
1199 vdev_ops_t vdev_raidz_ops = {
1200 	vdev_raidz_open,
1201 	vdev_raidz_close,
1202 	vdev_raidz_asize,
1203 	vdev_raidz_io_start,
1204 	vdev_raidz_io_done,
1205 	vdev_raidz_state_change,
1206 	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
1207 	B_FALSE			/* not a leaf vdev */
1208 };
1209