xref: /illumos-gate/usr/src/uts/common/fs/zfs/vdev_raidz.c (revision 354507029a42e4bcb1ea64fc4685f2bfd4792db8)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/zfs_context.h>
28 #include <sys/spa.h>
29 #include <sys/vdev_impl.h>
30 #include <sys/zio.h>
31 #include <sys/zio_checksum.h>
32 #include <sys/fs/zfs.h>
33 #include <sys/fm/fs/zfs.h>
34 
35 /*
36  * Virtual device vector for RAID-Z.
37  *
38  * This vdev supports both single and double parity. For single parity, we
39  * use a simple XOR of all the data columns. For double parity, we use both
40  * the simple XOR as well as a technique described in "The mathematics of
41  * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8),
42  * over the integers expressible in a single byte. Briefly, the operations on
43  * the field are defined as follows:
44  *
45  *   o addition (+) is represented by a bitwise XOR
46  *   o subtraction (-) is therefore identical to addition: A + B = A - B
47  *   o multiplication of A by 2 is defined by the following bitwise expression:
48  *	(A * 2)_7 = A_6
49  *	(A * 2)_6 = A_5
50  *	(A * 2)_5 = A_4
51  *	(A * 2)_4 = A_3 + A_7
52  *	(A * 2)_3 = A_2 + A_7
53  *	(A * 2)_2 = A_1 + A_7
54  *	(A * 2)_1 = A_0
55  *	(A * 2)_0 = A_7
56  *
57  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
58  *
59  * Observe that any number in the field (except for 0) can be expressed as a
60  * power of 2 -- a generator for the field. We store a table of the powers of
61  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
62  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
63  * than field addition). The inverse of a field element A (A^-1) is A^254.
64  *
65  * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1,
66  * can be expressed by field operations:
67  *
68  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
69  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
70  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
71  *
72  * See the reconstruction code below for how P and Q can be used individually or
73  * in concert to recover missing data columns.
74  */
75 
/*
 * State for one column of a RAID-Z I/O: which child it targets, where on
 * that child the column lives, the buffer it uses, and how the child I/O
 * turned out.
 */
typedef struct raidz_col {
	/* Index into the parent's vdev_child[] array for this column. */
	uint64_t	rc_devidx;
	/* Offset on the child device at which the column starts. */
	uint64_t	rc_offset;
	/* Size of the column's I/O, in bytes. */
	uint64_t	rc_size;
	/* Buffer read into or written from. */
	void		*rc_data;
	/* Error recorded for this column (0 if none). */
	int		rc_error;
	/* Non-zero once an I/O for this column has been attempted. */
	uint8_t		rc_tried;
	/* Non-zero if the column was deliberately not issued. */
	uint8_t		rc_skipped;
} raidz_col_t;
85 
86 typedef struct raidz_map {
87 	uint64_t rm_cols;		/* Column count */
88 	uint64_t rm_bigcols;		/* Number of oversized columns */
89 	uint64_t rm_asize;		/* Actual total I/O size */
90 	uint64_t rm_missingdata;	/* Count of missing data devices */
91 	uint64_t rm_missingparity;	/* Count of missing parity devices */
92 	uint64_t rm_firstdatacol;	/* First data column/parity count */
93 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
94 } raidz_map_t;
95 
/* Positions of the P and Q parity columns within raidz_map_t.rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1

/* Largest parity count this vdev accepts (checked in vdev_raidz_open()). */
#define	VDEV_RAIDZ_MAXPARITY	2

/*
 * Multiply the single-byte field element `a' by 2 in GF(2^8): shift left by
 * one bit and, if the high bit was set, fold it back in by XORing 0x1d.
 *
 * The result is masked to eight bits.  Without the mask, integer promotion
 * lets the shifted-out high bit survive as bit 8 (0x80 would yield 0x11d
 * rather than 0x1d), which is only harmless if every caller truncates the
 * result to a uint8_t itself.
 *
 * Note that `a' is evaluated twice; do not pass an expression with side
 * effects.
 */
#define	VDEV_RAIDZ_MUL_2(a)	\
	((((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0)) & 0xff)
102 
103 /*
104  * These two tables represent powers and logs of 2 in the Galois field defined
105  * above. These values were computed by repeatedly multiplying by 2 as above.
106  */
/*
 * vdev_raidz_pow2[i] is 2 raised to the power i in the field.  Each entry is
 * the previous one multiplied by 2; the sequence returns to 1 at index 255.
 * The comment at the start of each row gives the index of its first entry.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	/* 0x00 */ 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	/* 0x08 */ 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	/* 0x10 */ 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	/* 0x18 */ 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	/* 0x20 */ 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	/* 0x28 */ 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	/* 0x30 */ 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	/* 0x38 */ 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	/* 0x40 */ 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	/* 0x48 */ 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	/* 0x50 */ 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	/* 0x58 */ 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	/* 0x60 */ 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	/* 0x68 */ 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	/* 0x70 */ 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	/* 0x78 */ 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	/* 0x80 */ 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	/* 0x88 */ 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	/* 0x90 */ 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	/* 0x98 */ 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	/* 0xa0 */ 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	/* 0xa8 */ 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	/* 0xb0 */ 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	/* 0xb8 */ 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	/* 0xc0 */ 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	/* 0xc8 */ 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	/* 0xd0 */ 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	/* 0xd8 */ 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	/* 0xe0 */ 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	/* 0xe8 */ 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	/* 0xf0 */ 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	/* 0xf8 */ 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/*
 * vdev_raidz_log2[a] is the exponent i (0..254) for which 2^i equals `a' in
 * the field.  Zero has no logarithm; entry 0 is a placeholder and must not
 * be used.  The comment at the start of each row gives the index of its
 * first entry.
 */
static const uint8_t vdev_raidz_log2[256] = {
	/* 0x00 */ 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	/* 0x08 */ 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	/* 0x10 */ 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	/* 0x18 */ 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	/* 0x20 */ 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	/* 0x28 */ 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	/* 0x30 */ 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	/* 0x38 */ 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	/* 0x40 */ 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	/* 0x48 */ 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	/* 0x50 */ 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	/* 0x58 */ 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	/* 0x60 */ 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	/* 0x68 */ 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	/* 0x70 */ 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	/* 0x78 */ 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	/* 0x80 */ 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	/* 0x88 */ 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	/* 0x90 */ 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	/* 0x98 */ 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	/* 0xa0 */ 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	/* 0xa8 */ 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	/* 0xb0 */ 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	/* 0xb8 */ 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	/* 0xc0 */ 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	/* 0xc8 */ 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	/* 0xd0 */ 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	/* 0xd8 */ 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	/* 0xe0 */ 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	/* 0xe8 */ 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	/* 0xf0 */ 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	/* 0xf8 */ 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
175 
176 /*
177  * Multiply a given number by 2 raised to the given power.
178  */
179 static uint8_t
180 vdev_raidz_exp2(uint_t a, int exp)
181 {
182 	if (a == 0)
183 		return (0);
184 
185 	ASSERT(exp >= 0);
186 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
187 
188 	exp += vdev_raidz_log2[a];
189 	if (exp > 255)
190 		exp -= 255;
191 
192 	return (vdev_raidz_pow2[exp]);
193 }
194 
195 static void
196 vdev_raidz_map_free(zio_t *zio)
197 {
198 	raidz_map_t *rm = zio->io_vsd;
199 	int c;
200 
201 	for (c = 0; c < rm->rm_firstdatacol; c++)
202 		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
203 
204 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
205 }
206 
207 static raidz_map_t *
208 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
209     uint64_t nparity)
210 {
211 	raidz_map_t *rm;
212 	uint64_t b = zio->io_offset >> unit_shift;
213 	uint64_t s = zio->io_size >> unit_shift;
214 	uint64_t f = b % dcols;
215 	uint64_t o = (b / dcols) << unit_shift;
216 	uint64_t q, r, c, bc, col, acols, coff, devidx;
217 
218 	q = s / (dcols - nparity);
219 	r = s - q * (dcols - nparity);
220 	bc = (r == 0 ? 0 : r + nparity);
221 
222 	acols = (q == 0 ? bc : dcols);
223 
224 	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);
225 
226 	rm->rm_cols = acols;
227 	rm->rm_bigcols = bc;
228 	rm->rm_asize = 0;
229 	rm->rm_missingdata = 0;
230 	rm->rm_missingparity = 0;
231 	rm->rm_firstdatacol = nparity;
232 
233 	for (c = 0; c < acols; c++) {
234 		col = f + c;
235 		coff = o;
236 		if (col >= dcols) {
237 			col -= dcols;
238 			coff += 1ULL << unit_shift;
239 		}
240 		rm->rm_col[c].rc_devidx = col;
241 		rm->rm_col[c].rc_offset = coff;
242 		rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
243 		rm->rm_col[c].rc_data = NULL;
244 		rm->rm_col[c].rc_error = 0;
245 		rm->rm_col[c].rc_tried = 0;
246 		rm->rm_col[c].rc_skipped = 0;
247 		rm->rm_asize += rm->rm_col[c].rc_size;
248 	}
249 
250 	rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift);
251 
252 	for (c = 0; c < rm->rm_firstdatacol; c++)
253 		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
254 
255 	rm->rm_col[c].rc_data = zio->io_data;
256 
257 	for (c = c + 1; c < acols; c++)
258 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
259 		    rm->rm_col[c - 1].rc_size;
260 
261 	/*
262 	 * If all data stored spans all columns, there's a danger that parity
263 	 * will always be on the same device and, since parity isn't read
264 	 * during normal operation, that that device's I/O bandwidth won't be
265 	 * used effectively. We therefore switch the parity every 1MB.
266 	 *
267 	 * ... at least that was, ostensibly, the theory. As a practical
268 	 * matter unless we juggle the parity between all devices evenly, we
269 	 * won't see any benefit. Further, occasional writes that aren't a
270 	 * multiple of the LCM of the number of children and the minimum
271 	 * stripe width are sufficient to avoid pessimal behavior.
272 	 * Unfortunately, this decision created an implicit on-disk format
273 	 * requirement that we need to support for all eternity, but only
274 	 * for single-parity RAID-Z.
275 	 */
276 	ASSERT(rm->rm_cols >= 2);
277 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
278 
279 	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
280 		devidx = rm->rm_col[0].rc_devidx;
281 		o = rm->rm_col[0].rc_offset;
282 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
283 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
284 		rm->rm_col[1].rc_devidx = devidx;
285 		rm->rm_col[1].rc_offset = o;
286 	}
287 
288 	zio->io_vsd = rm;
289 	zio->io_vsd_free = vdev_raidz_map_free;
290 	return (rm);
291 }
292 
293 static void
294 vdev_raidz_generate_parity_p(raidz_map_t *rm)
295 {
296 	uint64_t *p, *src, pcount, ccount, i;
297 	int c;
298 
299 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
300 
301 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
302 		src = rm->rm_col[c].rc_data;
303 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
304 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
305 
306 		if (c == rm->rm_firstdatacol) {
307 			ASSERT(ccount == pcount);
308 			for (i = 0; i < ccount; i++, p++, src++) {
309 				*p = *src;
310 			}
311 		} else {
312 			ASSERT(ccount <= pcount);
313 			for (i = 0; i < ccount; i++, p++, src++) {
314 				*p ^= *src;
315 			}
316 		}
317 	}
318 }
319 
320 static void
321 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
322 {
323 	uint64_t *q, *p, *src, pcount, ccount, mask, i;
324 	int c;
325 
326 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
327 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
328 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
329 
330 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
331 		src = rm->rm_col[c].rc_data;
332 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
333 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
334 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
335 
336 		if (c == rm->rm_firstdatacol) {
337 			ASSERT(ccount == pcount || ccount == 0);
338 			for (i = 0; i < ccount; i++, p++, q++, src++) {
339 				*q = *src;
340 				*p = *src;
341 			}
342 			for (; i < pcount; i++, p++, q++, src++) {
343 				*q = 0;
344 				*p = 0;
345 			}
346 		} else {
347 			ASSERT(ccount <= pcount);
348 
349 			/*
350 			 * Rather than multiplying each byte individually (as
351 			 * described above), we are able to handle 8 at once
352 			 * by generating a mask based on the high bit in each
353 			 * byte and using that to conditionally XOR in 0x1d.
354 			 */
355 			for (i = 0; i < ccount; i++, p++, q++, src++) {
356 				mask = *q & 0x8080808080808080ULL;
357 				mask = (mask << 1) - (mask >> 7);
358 				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
359 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
360 				*q ^= *src;
361 				*p ^= *src;
362 			}
363 
364 			/*
365 			 * Treat short columns as though they are full of 0s.
366 			 */
367 			for (; i < pcount; i++, q++) {
368 				mask = *q & 0x8080808080808080ULL;
369 				mask = (mask << 1) - (mask >> 7);
370 				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
371 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
372 			}
373 		}
374 	}
375 }
376 
377 static void
378 vdev_raidz_reconstruct_p(raidz_map_t *rm, int x)
379 {
380 	uint64_t *dst, *src, xcount, ccount, count, i;
381 	int c;
382 
383 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
384 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
385 	ASSERT(xcount > 0);
386 
387 	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
388 	dst = rm->rm_col[x].rc_data;
389 	for (i = 0; i < xcount; i++, dst++, src++) {
390 		*dst = *src;
391 	}
392 
393 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
394 		src = rm->rm_col[c].rc_data;
395 		dst = rm->rm_col[x].rc_data;
396 
397 		if (c == x)
398 			continue;
399 
400 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
401 		count = MIN(ccount, xcount);
402 
403 		for (i = 0; i < count; i++, dst++, src++) {
404 			*dst ^= *src;
405 		}
406 	}
407 }
408 
409 static void
410 vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
411 {
412 	uint64_t *dst, *src, xcount, ccount, count, mask, i;
413 	uint8_t *b;
414 	int c, j, exp;
415 
416 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
417 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
418 
419 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
420 		src = rm->rm_col[c].rc_data;
421 		dst = rm->rm_col[x].rc_data;
422 
423 		if (c == x)
424 			ccount = 0;
425 		else
426 			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
427 
428 		count = MIN(ccount, xcount);
429 
430 		if (c == rm->rm_firstdatacol) {
431 			for (i = 0; i < count; i++, dst++, src++) {
432 				*dst = *src;
433 			}
434 			for (; i < xcount; i++, dst++) {
435 				*dst = 0;
436 			}
437 
438 		} else {
439 			/*
440 			 * For an explanation of this, see the comment in
441 			 * vdev_raidz_generate_parity_pq() above.
442 			 */
443 			for (i = 0; i < count; i++, dst++, src++) {
444 				mask = *dst & 0x8080808080808080ULL;
445 				mask = (mask << 1) - (mask >> 7);
446 				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
447 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
448 				*dst ^= *src;
449 			}
450 
451 			for (; i < xcount; i++, dst++) {
452 				mask = *dst & 0x8080808080808080ULL;
453 				mask = (mask << 1) - (mask >> 7);
454 				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
455 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
456 			}
457 		}
458 	}
459 
460 	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
461 	dst = rm->rm_col[x].rc_data;
462 	exp = 255 - (rm->rm_cols - 1 - x);
463 
464 	for (i = 0; i < xcount; i++, dst++, src++) {
465 		*dst ^= *src;
466 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
467 			*b = vdev_raidz_exp2(*b, exp);
468 		}
469 	}
470 }
471 
472 static void
473 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y)
474 {
475 	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
476 	void *pdata, *qdata;
477 	uint64_t xsize, ysize, i;
478 
479 	ASSERT(x < y);
480 	ASSERT(x >= rm->rm_firstdatacol);
481 	ASSERT(y < rm->rm_cols);
482 
483 	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
484 
485 	/*
486 	 * Move the parity data aside -- we're going to compute parity as
487 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
488 	 * reuse the parity generation mechanism without trashing the actual
489 	 * parity so we make those columns appear to be full of zeros by
490 	 * setting their lengths to zero.
491 	 */
492 	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
493 	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
494 	xsize = rm->rm_col[x].rc_size;
495 	ysize = rm->rm_col[y].rc_size;
496 
497 	rm->rm_col[VDEV_RAIDZ_P].rc_data =
498 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
499 	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
500 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
501 	rm->rm_col[x].rc_size = 0;
502 	rm->rm_col[y].rc_size = 0;
503 
504 	vdev_raidz_generate_parity_pq(rm);
505 
506 	rm->rm_col[x].rc_size = xsize;
507 	rm->rm_col[y].rc_size = ysize;
508 
509 	p = pdata;
510 	q = qdata;
511 	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
512 	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
513 	xd = rm->rm_col[x].rc_data;
514 	yd = rm->rm_col[y].rc_data;
515 
516 	/*
517 	 * We now have:
518 	 *	Pxy = P + D_x + D_y
519 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
520 	 *
521 	 * We can then solve for D_x:
522 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
523 	 * where
524 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
525 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
526 	 *
527 	 * With D_x in hand, we can easily solve for D_y:
528 	 *	D_y = P + Pxy + D_x
529 	 */
530 
531 	a = vdev_raidz_pow2[255 + x - y];
532 	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
533 	tmp = 255 - vdev_raidz_log2[a ^ 1];
534 
535 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
536 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
537 
538 	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
539 		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
540 		    vdev_raidz_exp2(*q ^ *qxy, bexp);
541 
542 		if (i < ysize)
543 			*yd = *p ^ *pxy ^ *xd;
544 	}
545 
546 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
547 	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
548 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
549 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
550 
551 	/*
552 	 * Restore the saved parity data.
553 	 */
554 	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
555 	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
556 }
557 
558 
559 static int
560 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
561 {
562 	vdev_t *cvd;
563 	uint64_t nparity = vd->vdev_nparity;
564 	int c, error;
565 	int lasterror = 0;
566 	int numerrors = 0;
567 
568 	ASSERT(nparity > 0);
569 
570 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
571 	    vd->vdev_children < nparity + 1) {
572 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
573 		return (EINVAL);
574 	}
575 
576 	for (c = 0; c < vd->vdev_children; c++) {
577 		cvd = vd->vdev_child[c];
578 
579 		if ((error = vdev_open(cvd)) != 0) {
580 			lasterror = error;
581 			numerrors++;
582 			continue;
583 		}
584 
585 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
586 		*ashift = MAX(*ashift, cvd->vdev_ashift);
587 	}
588 
589 	*asize *= vd->vdev_children;
590 
591 	if (numerrors > nparity) {
592 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
593 		return (lasterror);
594 	}
595 
596 	return (0);
597 }
598 
599 static void
600 vdev_raidz_close(vdev_t *vd)
601 {
602 	int c;
603 
604 	for (c = 0; c < vd->vdev_children; c++)
605 		vdev_close(vd->vdev_child[c]);
606 }
607 
608 static uint64_t
609 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
610 {
611 	uint64_t asize;
612 	uint64_t ashift = vd->vdev_top->vdev_ashift;
613 	uint64_t cols = vd->vdev_children;
614 	uint64_t nparity = vd->vdev_nparity;
615 
616 	asize = ((psize - 1) >> ashift) + 1;
617 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
618 	asize = roundup(asize, nparity + 1) << ashift;
619 
620 	return (asize);
621 }
622 
623 static void
624 vdev_raidz_child_done(zio_t *zio)
625 {
626 	raidz_col_t *rc = zio->io_private;
627 
628 	rc->rc_error = zio->io_error;
629 	rc->rc_tried = 1;
630 	rc->rc_skipped = 0;
631 }
632 
633 static int
634 vdev_raidz_io_start(zio_t *zio)
635 {
636 	vdev_t *vd = zio->io_vd;
637 	vdev_t *tvd = vd->vdev_top;
638 	vdev_t *cvd;
639 	blkptr_t *bp = zio->io_bp;
640 	raidz_map_t *rm;
641 	raidz_col_t *rc;
642 	int c;
643 
644 	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
645 	    vd->vdev_nparity);
646 
647 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
648 
649 	if (zio->io_type == ZIO_TYPE_WRITE) {
650 		/*
651 		 * Generate RAID parity in the first virtual columns.
652 		 */
653 		if (rm->rm_firstdatacol == 1)
654 			vdev_raidz_generate_parity_p(rm);
655 		else
656 			vdev_raidz_generate_parity_pq(rm);
657 
658 		for (c = 0; c < rm->rm_cols; c++) {
659 			rc = &rm->rm_col[c];
660 			cvd = vd->vdev_child[rc->rc_devidx];
661 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
662 			    rc->rc_offset, rc->rc_data, rc->rc_size,
663 			    zio->io_type, zio->io_priority, 0,
664 			    vdev_raidz_child_done, rc));
665 		}
666 
667 		return (ZIO_PIPELINE_CONTINUE);
668 	}
669 
670 	ASSERT(zio->io_type == ZIO_TYPE_READ);
671 
672 	/*
673 	 * Iterate over the columns in reverse order so that we hit the parity
674 	 * last -- any errors along the way will force us to read the parity
675 	 * data.
676 	 */
677 	for (c = rm->rm_cols - 1; c >= 0; c--) {
678 		rc = &rm->rm_col[c];
679 		cvd = vd->vdev_child[rc->rc_devidx];
680 		if (!vdev_readable(cvd)) {
681 			if (c >= rm->rm_firstdatacol)
682 				rm->rm_missingdata++;
683 			else
684 				rm->rm_missingparity++;
685 			rc->rc_error = ENXIO;
686 			rc->rc_tried = 1;	/* don't even try */
687 			rc->rc_skipped = 1;
688 			continue;
689 		}
690 		if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
691 			if (c >= rm->rm_firstdatacol)
692 				rm->rm_missingdata++;
693 			else
694 				rm->rm_missingparity++;
695 			rc->rc_error = ESTALE;
696 			rc->rc_skipped = 1;
697 			continue;
698 		}
699 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
700 		    (zio->io_flags & ZIO_FLAG_SCRUB)) {
701 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
702 			    rc->rc_offset, rc->rc_data, rc->rc_size,
703 			    zio->io_type, zio->io_priority, 0,
704 			    vdev_raidz_child_done, rc));
705 		}
706 	}
707 
708 	return (ZIO_PIPELINE_CONTINUE);
709 }
710 
711 /*
712  * Report a checksum error for a child of a RAID-Z device.
713  */
714 static void
715 raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
716 {
717 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
718 
719 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
720 		mutex_enter(&vd->vdev_stat_lock);
721 		vd->vdev_stat.vs_checksum_errors++;
722 		mutex_exit(&vd->vdev_stat_lock);
723 	}
724 
725 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
726 		zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
727 		    zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
728 }
729 
730 /*
731  * Generate the parity from the data columns. If we tried and were able to
732  * read the parity without error, verify that the generated parity matches the
733  * data we read. If it doesn't, we fire off a checksum error. Return the
734  * number such failures.
735  */
736 static int
737 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
738 {
739 	void *orig[VDEV_RAIDZ_MAXPARITY];
740 	int c, ret = 0;
741 	raidz_col_t *rc;
742 
743 	for (c = 0; c < rm->rm_firstdatacol; c++) {
744 		rc = &rm->rm_col[c];
745 		if (!rc->rc_tried || rc->rc_error != 0)
746 			continue;
747 		orig[c] = zio_buf_alloc(rc->rc_size);
748 		bcopy(rc->rc_data, orig[c], rc->rc_size);
749 	}
750 
751 	if (rm->rm_firstdatacol == 1)
752 		vdev_raidz_generate_parity_p(rm);
753 	else
754 		vdev_raidz_generate_parity_pq(rm);
755 
756 	for (c = 0; c < rm->rm_firstdatacol; c++) {
757 		rc = &rm->rm_col[c];
758 		if (!rc->rc_tried || rc->rc_error != 0)
759 			continue;
760 		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
761 			raidz_checksum_error(zio, rc);
762 			rc->rc_error = ECKSUM;
763 			ret++;
764 		}
765 		zio_buf_free(orig[c], rc->rc_size);
766 	}
767 
768 	return (ret);
769 }
770 
/*
 * Running counts of reads repaired by reconstruction, split by which parity
 * was used: P alone, Q alone, or P and Q together.
 */
static uint64_t raidz_corrected_p;	/* repaired using P */
static uint64_t raidz_corrected_q;	/* repaired using Q */
static uint64_t raidz_corrected_pq;	/* repaired using both P and Q */
774 
775 static int
776 vdev_raidz_worst_error(raidz_map_t *rm)
777 {
778 	int error = 0;
779 
780 	for (int c = 0; c < rm->rm_cols; c++)
781 		error = zio_worst_error(error, rm->rm_col[c].rc_error);
782 
783 	return (error);
784 }
785 
786 static void
787 vdev_raidz_io_done(zio_t *zio)
788 {
789 	vdev_t *vd = zio->io_vd;
790 	vdev_t *cvd;
791 	raidz_map_t *rm = zio->io_vsd;
792 	raidz_col_t *rc, *rc1;
793 	int unexpected_errors = 0;
794 	int parity_errors = 0;
795 	int parity_untried = 0;
796 	int data_errors = 0;
797 	int total_errors = 0;
798 	int n, c, c1;
799 
800 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
801 
802 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
803 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
804 
805 	for (c = 0; c < rm->rm_cols; c++) {
806 		rc = &rm->rm_col[c];
807 
808 		if (rc->rc_error) {
809 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
810 
811 			if (c < rm->rm_firstdatacol)
812 				parity_errors++;
813 			else
814 				data_errors++;
815 
816 			if (!rc->rc_skipped)
817 				unexpected_errors++;
818 
819 			total_errors++;
820 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
821 			parity_untried++;
822 		}
823 	}
824 
825 	if (zio->io_type == ZIO_TYPE_WRITE) {
826 		/*
827 		 * XXX -- for now, treat partial writes as a success.
828 		 * (If we couldn't write enough columns to reconstruct
829 		 * the data, the I/O failed.  Otherwise, good enough.)
830 		 *
831 		 * Now that we support write reallocation, it would be better
832 		 * to treat partial failure as real failure unless there are
833 		 * no non-degraded top-level vdevs left, and not update DTLs
834 		 * if we intend to reallocate.
835 		 */
836 		/* XXPOLICY */
837 		if (total_errors > rm->rm_firstdatacol)
838 			zio->io_error = vdev_raidz_worst_error(rm);
839 
840 		return;
841 	}
842 
843 	ASSERT(zio->io_type == ZIO_TYPE_READ);
844 	/*
845 	 * There are three potential phases for a read:
846 	 *	1. produce valid data from the columns read
847 	 *	2. read all disks and try again
848 	 *	3. perform combinatorial reconstruction
849 	 *
850 	 * Each phase is progressively both more expensive and less likely to
851 	 * occur. If we encounter more errors than we can repair or all phases
852 	 * fail, we have no choice but to return an error.
853 	 */
854 
855 	/*
856 	 * If the number of errors we saw was correctable -- less than or equal
857 	 * to the number of parity disks read -- attempt to produce data that
858 	 * has a valid checksum. Naturally, this case applies in the absence of
859 	 * any errors.
860 	 */
861 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
862 		switch (data_errors) {
863 		case 0:
864 			if (zio_checksum_error(zio) == 0) {
865 				/*
866 				 * If we read parity information (unnecessarily
867 				 * as it happens since no reconstruction was
868 				 * needed) regenerate and verify the parity.
869 				 * We also regenerate parity when resilvering
870 				 * so we can write it out to the failed device
871 				 * later.
872 				 */
873 				if (parity_errors + parity_untried <
874 				    rm->rm_firstdatacol ||
875 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
876 					n = raidz_parity_verify(zio, rm);
877 					unexpected_errors += n;
878 					ASSERT(parity_errors + n <=
879 					    rm->rm_firstdatacol);
880 				}
881 				goto done;
882 			}
883 			break;
884 
885 		case 1:
886 			/*
887 			 * We either attempt to read all the parity columns or
888 			 * none of them. If we didn't try to read parity, we
889 			 * wouldn't be here in the correctable case. There must
890 			 * also have been fewer parity errors than parity
891 			 * columns or, again, we wouldn't be in this code path.
892 			 */
893 			ASSERT(parity_untried == 0);
894 			ASSERT(parity_errors < rm->rm_firstdatacol);
895 
896 			/*
897 			 * Find the column that reported the error.
898 			 */
899 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
900 				rc = &rm->rm_col[c];
901 				if (rc->rc_error != 0)
902 					break;
903 			}
904 			ASSERT(c != rm->rm_cols);
905 			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
906 			    rc->rc_error == ESTALE);
907 
908 			if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
909 				vdev_raidz_reconstruct_p(rm, c);
910 			} else {
911 				ASSERT(rm->rm_firstdatacol > 1);
912 				vdev_raidz_reconstruct_q(rm, c);
913 			}
914 
915 			if (zio_checksum_error(zio) == 0) {
916 				if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0)
917 					atomic_inc_64(&raidz_corrected_p);
918 				else
919 					atomic_inc_64(&raidz_corrected_q);
920 
921 				/*
922 				 * If there's more than one parity disk that
923 				 * was successfully read, confirm that the
924 				 * other parity disk produced the correct data.
925 				 * This routine is suboptimal in that it
926 				 * regenerates both the parity we wish to test
927 				 * as well as the parity we just used to
928 				 * perform the reconstruction, but this should
929 				 * be a relatively uncommon case, and can be
930 				 * optimized if it becomes a problem.
931 				 * We also regenerate parity when resilvering
932 				 * so we can write it out to the failed device
933 				 * later.
934 				 */
935 				if (parity_errors < rm->rm_firstdatacol - 1 ||
936 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
937 					n = raidz_parity_verify(zio, rm);
938 					unexpected_errors += n;
939 					ASSERT(parity_errors + n <=
940 					    rm->rm_firstdatacol);
941 				}
942 
943 				goto done;
944 			}
945 			break;
946 
947 		case 2:
948 			/*
949 			 * Two data column errors require double parity.
950 			 */
951 			ASSERT(rm->rm_firstdatacol == 2);
952 
953 			/*
954 			 * Find the two columns that reported errors.
955 			 */
956 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
957 				rc = &rm->rm_col[c];
958 				if (rc->rc_error != 0)
959 					break;
960 			}
961 			ASSERT(c != rm->rm_cols);
962 			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
963 			    rc->rc_error == ESTALE);
964 
965 			for (c1 = c++; c < rm->rm_cols; c++) {
966 				rc = &rm->rm_col[c];
967 				if (rc->rc_error != 0)
968 					break;
969 			}
970 			ASSERT(c != rm->rm_cols);
971 			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
972 			    rc->rc_error == ESTALE);
973 
974 			vdev_raidz_reconstruct_pq(rm, c1, c);
975 
976 			if (zio_checksum_error(zio) == 0) {
977 				atomic_inc_64(&raidz_corrected_pq);
978 				goto done;
979 			}
980 			break;
981 
982 		default:
983 			ASSERT(rm->rm_firstdatacol <= 2);
984 			ASSERT(0);
985 		}
986 	}
987 
988 	/*
989 	 * This isn't a typical situation -- either we got a read error or
990 	 * a child silently returned bad data. Read every block so we can
991 	 * try again with as much data and parity as we can track down. If
992 	 * we've already been through once before, all children will be marked
993 	 * as tried so we'll proceed to combinatorial reconstruction.
994 	 */
995 	unexpected_errors = 1;
996 	rm->rm_missingdata = 0;
997 	rm->rm_missingparity = 0;
998 
999 	for (c = 0; c < rm->rm_cols; c++) {
1000 		if (rm->rm_col[c].rc_tried)
1001 			continue;
1002 
1003 		zio_vdev_io_redone(zio);
1004 		do {
1005 			rc = &rm->rm_col[c];
1006 			if (rc->rc_tried)
1007 				continue;
1008 			zio_nowait(zio_vdev_child_io(zio, NULL,
1009 			    vd->vdev_child[rc->rc_devidx],
1010 			    rc->rc_offset, rc->rc_data, rc->rc_size,
1011 			    zio->io_type, zio->io_priority, 0,
1012 			    vdev_raidz_child_done, rc));
1013 		} while (++c < rm->rm_cols);
1014 
1015 		return;
1016 	}
1017 
1018 	/*
1019 	 * At this point we've attempted to reconstruct the data given the
1020 	 * errors we detected, and we've attempted to read all columns. There
1021 	 * must, therefore, be one or more additional problems -- silent errors
1022 	 * resulting in invalid data rather than explicit I/O errors resulting
1023 	 * in absent data. Before we attempt combinatorial reconstruction make
1024 	 * sure we have a chance of coming up with the right answer.
1025 	 */
1026 	if (total_errors >= rm->rm_firstdatacol) {
1027 		zio->io_error = vdev_raidz_worst_error(rm);
1028 		/*
1029 		 * If there were exactly as many device errors as parity
1030 		 * columns, yet we couldn't reconstruct the data, then at
1031 		 * least one device must have returned bad data silently.
1032 		 */
1033 		if (total_errors == rm->rm_firstdatacol)
1034 			zio->io_error = zio_worst_error(zio->io_error, ECKSUM);
1035 		goto done;
1036 	}
1037 
1038 	if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
1039 		/*
1040 		 * Attempt to reconstruct the data from parity P.
1041 		 */
1042 		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1043 			void *orig;
1044 			rc = &rm->rm_col[c];
1045 
1046 			orig = zio_buf_alloc(rc->rc_size);
1047 			bcopy(rc->rc_data, orig, rc->rc_size);
1048 			vdev_raidz_reconstruct_p(rm, c);
1049 
1050 			if (zio_checksum_error(zio) == 0) {
1051 				zio_buf_free(orig, rc->rc_size);
1052 				atomic_inc_64(&raidz_corrected_p);
1053 
1054 				/*
1055 				 * If this child didn't know that it returned
1056 				 * bad data, inform it.
1057 				 */
1058 				if (rc->rc_tried && rc->rc_error == 0)
1059 					raidz_checksum_error(zio, rc);
1060 				rc->rc_error = ECKSUM;
1061 				goto done;
1062 			}
1063 
1064 			bcopy(orig, rc->rc_data, rc->rc_size);
1065 			zio_buf_free(orig, rc->rc_size);
1066 		}
1067 	}
1068 
1069 	if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
1070 		/*
1071 		 * Attempt to reconstruct the data from parity Q.
1072 		 */
1073 		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1074 			void *orig;
1075 			rc = &rm->rm_col[c];
1076 
1077 			orig = zio_buf_alloc(rc->rc_size);
1078 			bcopy(rc->rc_data, orig, rc->rc_size);
1079 			vdev_raidz_reconstruct_q(rm, c);
1080 
1081 			if (zio_checksum_error(zio) == 0) {
1082 				zio_buf_free(orig, rc->rc_size);
1083 				atomic_inc_64(&raidz_corrected_q);
1084 
1085 				/*
1086 				 * If this child didn't know that it returned
1087 				 * bad data, inform it.
1088 				 */
1089 				if (rc->rc_tried && rc->rc_error == 0)
1090 					raidz_checksum_error(zio, rc);
1091 				rc->rc_error = ECKSUM;
1092 				goto done;
1093 			}
1094 
1095 			bcopy(orig, rc->rc_data, rc->rc_size);
1096 			zio_buf_free(orig, rc->rc_size);
1097 		}
1098 	}
1099 
1100 	if (rm->rm_firstdatacol > 1 &&
1101 	    rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 &&
1102 	    rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
1103 		/*
1104 		 * Attempt to reconstruct the data from both P and Q.
1105 		 */
1106 		for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) {
1107 			void *orig, *orig1;
1108 			rc = &rm->rm_col[c];
1109 
1110 			orig = zio_buf_alloc(rc->rc_size);
1111 			bcopy(rc->rc_data, orig, rc->rc_size);
1112 
1113 			for (c1 = c + 1; c1 < rm->rm_cols; c1++) {
1114 				rc1 = &rm->rm_col[c1];
1115 
1116 				orig1 = zio_buf_alloc(rc1->rc_size);
1117 				bcopy(rc1->rc_data, orig1, rc1->rc_size);
1118 
1119 				vdev_raidz_reconstruct_pq(rm, c, c1);
1120 
1121 				if (zio_checksum_error(zio) == 0) {
1122 					zio_buf_free(orig, rc->rc_size);
1123 					zio_buf_free(orig1, rc1->rc_size);
1124 					atomic_inc_64(&raidz_corrected_pq);
1125 
1126 					/*
1127 					 * If these children didn't know they
1128 					 * returned bad data, inform them.
1129 					 */
1130 					if (rc->rc_tried && rc->rc_error == 0)
1131 						raidz_checksum_error(zio, rc);
1132 					if (rc1->rc_tried && rc1->rc_error == 0)
1133 						raidz_checksum_error(zio, rc1);
1134 
1135 					rc->rc_error = ECKSUM;
1136 					rc1->rc_error = ECKSUM;
1137 
1138 					goto done;
1139 				}
1140 
1141 				bcopy(orig1, rc1->rc_data, rc1->rc_size);
1142 				zio_buf_free(orig1, rc1->rc_size);
1143 			}
1144 
1145 			bcopy(orig, rc->rc_data, rc->rc_size);
1146 			zio_buf_free(orig, rc->rc_size);
1147 		}
1148 	}
1149 
1150 	/*
1151 	 * All combinations failed to checksum. Generate checksum ereports for
1152 	 * all children.
1153 	 */
1154 	zio->io_error = ECKSUM;
1155 
1156 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1157 		for (c = 0; c < rm->rm_cols; c++) {
1158 			rc = &rm->rm_col[c];
1159 			zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
1160 			    zio->io_spa, vd->vdev_child[rc->rc_devidx], zio,
1161 			    rc->rc_offset, rc->rc_size);
1162 		}
1163 	}
1164 
1165 done:
1166 	zio_checksum_verified(zio);
1167 
1168 	if (zio->io_error == 0 && (spa_mode & FWRITE) &&
1169 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
1170 		/*
1171 		 * Use the good data we have in hand to repair damaged children.
1172 		 */
1173 		for (c = 0; c < rm->rm_cols; c++) {
1174 			rc = &rm->rm_col[c];
1175 			cvd = vd->vdev_child[rc->rc_devidx];
1176 
1177 			if (rc->rc_error == 0)
1178 				continue;
1179 
1180 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1181 			    rc->rc_offset, rc->rc_data, rc->rc_size,
1182 			    ZIO_TYPE_WRITE, zio->io_priority,
1183 			    ZIO_FLAG_IO_REPAIR, NULL, NULL));
1184 		}
1185 	}
1186 }
1187 
1188 static void
1189 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
1190 {
1191 	if (faulted > vd->vdev_nparity)
1192 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1193 		    VDEV_AUX_NO_REPLICAS);
1194 	else if (degraded + faulted != 0)
1195 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
1196 	else
1197 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
1198 }
1199 
/*
 * Operations vector for the RAID-Z vdev type.
 *
 * The object is not declared static, so it is visible to other translation
 * units.  The initializer is positional: entries must stay in the same
 * order as the members of vdev_ops_t, which is declared outside this file.
 * NOTE(review): the per-entry roles noted below are inferred from the
 * function names -- confirm against the vdev_ops_t declaration.
 */
1200 vdev_ops_t vdev_raidz_ops = {
1201 	vdev_raidz_open,	/* presumably the open handler */
1202 	vdev_raidz_close,	/* presumably the close handler */
1203 	vdev_raidz_asize,	/* presumably the allocated-size handler */
1204 	vdev_raidz_io_start,	/* presumably the I/O start handler */
1205 	vdev_raidz_io_done,	/* presumably the I/O completion handler */
1206 	vdev_raidz_state_change, /* sets vdev state from faulted/degraded */
1207 	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
1208 	B_FALSE			/* not a leaf vdev */
1209 };
1210