xref: /titanic_50/usr/src/uts/common/fs/zfs/vdev_raidz.c (revision 8b2e16e76f55405c78218b9f08c6aefaf13c9e24)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/zfs_context.h>
30 #include <sys/spa.h>
31 #include <sys/vdev_impl.h>
32 #include <sys/zio.h>
33 #include <sys/zio_checksum.h>
34 #include <sys/fs/zfs.h>
35 #include <sys/fm/fs/zfs.h>
36 
37 /*
38  * Virtual device vector for RAID-Z.
39  *
40  * This vdev supports both single and double parity. For single parity, we
41  * use a simple XOR of all the data columns. For double parity, we use both
42  * the simple XOR as well as a technique described in "The mathematics of
43  * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8),
44  * over the integers expressable in a single byte. Briefly, the operations on
45  * the field are defined as follows:
46  *
47  *   o addition (+) is represented by a bitwise XOR
48  *   o subtraction (-) is therefore identical to addition: A + B = A - B
49  *   o multiplication of A by 2 is defined by the following bitwise expression:
50  *	(A * 2)_7 = A_6
51  *	(A * 2)_6 = A_5
52  *	(A * 2)_5 = A_4
53  *	(A * 2)_4 = A_3 + A_7
54  *	(A * 2)_3 = A_2 + A_7
55  *	(A * 2)_2 = A_1 + A_7
56  *	(A * 2)_1 = A_0
57  *	(A * 2)_0 = A_7
58  *
59  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
60  *
61  * Observe that any number in the field (except for 0) can be expressed as a
62  * power of 2 -- a generator for the field. We store a table of the powers of
63  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
64  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
65  * than field addition). The inverse of a field element A (A^-1) is A^254.
66  *
67  * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1,
68  * can be expressed by field operations:
69  *
70  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
71  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
72  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
73  *
74  * See the reconstruction code below for how P and Q can used individually or
75  * in concert to recover missing data columns.
76  */
77 
78 typedef struct raidz_col {
79 	uint64_t rc_devidx;		/* child device index for I/O */
80 	uint64_t rc_offset;		/* device offset */
81 	uint64_t rc_size;		/* I/O size */
82 	void *rc_data;			/* I/O data */
83 	int rc_error;			/* I/O error for this device */
84 	uint8_t rc_tried;		/* Did we attempt this I/O column? */
85 	uint8_t rc_skipped;		/* Did we skip this I/O column? */
86 } raidz_col_t;
87 
88 typedef struct raidz_map {
89 	uint64_t rm_cols;		/* Column count */
90 	uint64_t rm_bigcols;		/* Number of oversized columns */
91 	uint64_t rm_asize;		/* Actual total I/O size */
92 	uint64_t rm_missingdata;	/* Count of missing data devices */
93 	uint64_t rm_missingparity;	/* Count of missing parity devices */
94 	uint64_t rm_firstdatacol;	/* First data column/parity count */
95 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
96 } raidz_map_t;
97 
98 #define	VDEV_RAIDZ_P		0
99 #define	VDEV_RAIDZ_Q		1
100 
101 #define	VDEV_RAIDZ_MAXPARITY	2
102 
103 #define	VDEV_RAIDZ_MUL_2(a)	(((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0))
104 
105 /*
106  * These two tables represent powers and logs of 2 in the Galois field defined
107  * above. These values were computed by repeatedly multiplying by 2 as above.
108  */
109 static const uint8_t vdev_raidz_pow2[256] = {
110 	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
111 	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
112 	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
113 	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
114 	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
115 	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
116 	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
117 	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
118 	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
119 	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
120 	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
121 	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
122 	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
123 	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
124 	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
125 	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
126 	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
127 	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
128 	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
129 	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
130 	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
131 	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
132 	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
133 	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
134 	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
135 	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
136 	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
137 	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
138 	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
139 	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
140 	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
141 	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
142 };
143 static const uint8_t vdev_raidz_log2[256] = {
144 	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
145 	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
146 	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
147 	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
148 	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
149 	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
150 	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
151 	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
152 	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
153 	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
154 	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
155 	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
156 	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
157 	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
158 	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
159 	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
160 	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
161 	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
162 	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
163 	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
164 	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
165 	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
166 	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
167 	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
168 	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
169 	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
170 	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
171 	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
172 	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
173 	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
174 	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
175 	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
176 };
177 
178 /*
179  * Multiply a given number by 2 raised to the given power.
180  */
181 static uint8_t
182 vdev_raidz_exp2(uint_t a, int exp)
183 {
184 	if (a == 0)
185 		return (0);
186 
187 	ASSERT(exp >= 0);
188 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
189 
190 	exp += vdev_raidz_log2[a];
191 	if (exp > 255)
192 		exp -= 255;
193 
194 	return (vdev_raidz_pow2[exp]);
195 }
196 
197 static raidz_map_t *
198 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
199     uint64_t nparity)
200 {
201 	raidz_map_t *rm;
202 	uint64_t b = zio->io_offset >> unit_shift;
203 	uint64_t s = zio->io_size >> unit_shift;
204 	uint64_t f = b % dcols;
205 	uint64_t o = (b / dcols) << unit_shift;
206 	uint64_t q, r, c, bc, col, acols, coff, devidx;
207 
208 	q = s / (dcols - nparity);
209 	r = s - q * (dcols - nparity);
210 	bc = (r == 0 ? 0 : r + nparity);
211 
212 	acols = (q == 0 ? bc : dcols);
213 
214 	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);
215 
216 	rm->rm_cols = acols;
217 	rm->rm_bigcols = bc;
218 	rm->rm_asize = 0;
219 	rm->rm_missingdata = 0;
220 	rm->rm_missingparity = 0;
221 	rm->rm_firstdatacol = nparity;
222 
223 	for (c = 0; c < acols; c++) {
224 		col = f + c;
225 		coff = o;
226 		if (col >= dcols) {
227 			col -= dcols;
228 			coff += 1ULL << unit_shift;
229 		}
230 		rm->rm_col[c].rc_devidx = col;
231 		rm->rm_col[c].rc_offset = coff;
232 		rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
233 		rm->rm_col[c].rc_data = NULL;
234 		rm->rm_col[c].rc_error = 0;
235 		rm->rm_col[c].rc_tried = 0;
236 		rm->rm_col[c].rc_skipped = 0;
237 		rm->rm_asize += rm->rm_col[c].rc_size;
238 	}
239 
240 	rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift);
241 
242 	for (c = 0; c < rm->rm_firstdatacol; c++)
243 		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
244 
245 	rm->rm_col[c].rc_data = zio->io_data;
246 
247 	for (c = c + 1; c < acols; c++)
248 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
249 		    rm->rm_col[c - 1].rc_size;
250 
251 	/*
252 	 * If all data stored spans all columns, there's a danger that parity
253 	 * will always be on the same device and, since parity isn't read
254 	 * during normal operation, that that device's I/O bandwidth won't be
255 	 * used effectively. We therefore switch the parity every 1MB.
256 	 *
257 	 * ... at least that was, ostensibly, the theory. As a practical
258 	 * matter unless we juggle the parity between all devices evenly, we
259 	 * won't see any benefit. Further, occasional writes that aren't a
260 	 * multiple of the LCM of the number of children and the minimum
261 	 * stripe width are sufficient to avoid pessimal behavior.
262 	 * Unfortunately, this decision created an implicit on-disk format
263 	 * requirement that we need to support for all eternity, but only
264 	 * for single-parity RAID-Z.
265 	 */
266 	ASSERT(rm->rm_cols >= 2);
267 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
268 
269 	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
270 		devidx = rm->rm_col[0].rc_devidx;
271 		o = rm->rm_col[0].rc_offset;
272 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
273 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
274 		rm->rm_col[1].rc_devidx = devidx;
275 		rm->rm_col[1].rc_offset = o;
276 	}
277 
278 	zio->io_vsd = rm;
279 	return (rm);
280 }
281 
282 static void
283 vdev_raidz_map_free(zio_t *zio)
284 {
285 	raidz_map_t *rm = zio->io_vsd;
286 	int c;
287 
288 	for (c = 0; c < rm->rm_firstdatacol; c++)
289 		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
290 
291 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
292 	zio->io_vsd = NULL;
293 }
294 
295 static void
296 vdev_raidz_generate_parity_p(raidz_map_t *rm)
297 {
298 	uint64_t *p, *src, pcount, ccount, i;
299 	int c;
300 
301 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
302 
303 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
304 		src = rm->rm_col[c].rc_data;
305 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
306 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
307 
308 		if (c == rm->rm_firstdatacol) {
309 			ASSERT(ccount == pcount);
310 			for (i = 0; i < ccount; i++, p++, src++) {
311 				*p = *src;
312 			}
313 		} else {
314 			ASSERT(ccount <= pcount);
315 			for (i = 0; i < ccount; i++, p++, src++) {
316 				*p ^= *src;
317 			}
318 		}
319 	}
320 }
321 
322 static void
323 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
324 {
325 	uint64_t *q, *p, *src, pcount, ccount, mask, i;
326 	int c;
327 
328 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
329 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
330 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
331 
332 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
333 		src = rm->rm_col[c].rc_data;
334 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
335 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
336 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
337 
338 		if (c == rm->rm_firstdatacol) {
339 			ASSERT(ccount == pcount || ccount == 0);
340 			for (i = 0; i < ccount; i++, p++, q++, src++) {
341 				*q = *src;
342 				*p = *src;
343 			}
344 			for (; i < pcount; i++, p++, q++, src++) {
345 				*q = 0;
346 				*p = 0;
347 			}
348 		} else {
349 			ASSERT(ccount <= pcount);
350 
351 			/*
352 			 * Rather than multiplying each byte individually (as
353 			 * described above), we are able to handle 8 at once
354 			 * by generating a mask based on the high bit in each
355 			 * byte and using that to conditionally XOR in 0x1d.
356 			 */
357 			for (i = 0; i < ccount; i++, p++, q++, src++) {
358 				mask = *q & 0x8080808080808080ULL;
359 				mask = (mask << 1) - (mask >> 7);
360 				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
361 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
362 				*q ^= *src;
363 				*p ^= *src;
364 			}
365 
366 			/*
367 			 * Treat short columns as though they are full of 0s.
368 			 */
369 			for (; i < pcount; i++, q++) {
370 				mask = *q & 0x8080808080808080ULL;
371 				mask = (mask << 1) - (mask >> 7);
372 				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
373 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
374 			}
375 		}
376 	}
377 }
378 
379 static void
380 vdev_raidz_reconstruct_p(raidz_map_t *rm, int x)
381 {
382 	uint64_t *dst, *src, xcount, ccount, count, i;
383 	int c;
384 
385 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
386 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
387 	ASSERT(xcount > 0);
388 
389 	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
390 	dst = rm->rm_col[x].rc_data;
391 	for (i = 0; i < xcount; i++, dst++, src++) {
392 		*dst = *src;
393 	}
394 
395 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
396 		src = rm->rm_col[c].rc_data;
397 		dst = rm->rm_col[x].rc_data;
398 
399 		if (c == x)
400 			continue;
401 
402 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
403 		count = MIN(ccount, xcount);
404 
405 		for (i = 0; i < count; i++, dst++, src++) {
406 			*dst ^= *src;
407 		}
408 	}
409 }
410 
411 static void
412 vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
413 {
414 	uint64_t *dst, *src, xcount, ccount, count, mask, i;
415 	uint8_t *b;
416 	int c, j, exp;
417 
418 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
419 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
420 
421 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
422 		src = rm->rm_col[c].rc_data;
423 		dst = rm->rm_col[x].rc_data;
424 
425 		if (c == x)
426 			ccount = 0;
427 		else
428 			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
429 
430 		count = MIN(ccount, xcount);
431 
432 		if (c == rm->rm_firstdatacol) {
433 			for (i = 0; i < count; i++, dst++, src++) {
434 				*dst = *src;
435 			}
436 			for (; i < xcount; i++, dst++) {
437 				*dst = 0;
438 			}
439 
440 		} else {
441 			/*
442 			 * For an explanation of this, see the comment in
443 			 * vdev_raidz_generate_parity_pq() above.
444 			 */
445 			for (i = 0; i < count; i++, dst++, src++) {
446 				mask = *dst & 0x8080808080808080ULL;
447 				mask = (mask << 1) - (mask >> 7);
448 				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
449 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
450 				*dst ^= *src;
451 			}
452 
453 			for (; i < xcount; i++, dst++) {
454 				mask = *dst & 0x8080808080808080ULL;
455 				mask = (mask << 1) - (mask >> 7);
456 				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
457 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
458 			}
459 		}
460 	}
461 
462 	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
463 	dst = rm->rm_col[x].rc_data;
464 	exp = 255 - (rm->rm_cols - 1 - x);
465 
466 	for (i = 0; i < xcount; i++, dst++, src++) {
467 		*dst ^= *src;
468 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
469 			*b = vdev_raidz_exp2(*b, exp);
470 		}
471 	}
472 }
473 
474 static void
475 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y)
476 {
477 	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
478 	void *pdata, *qdata;
479 	uint64_t xsize, ysize, i;
480 
481 	ASSERT(x < y);
482 	ASSERT(x >= rm->rm_firstdatacol);
483 	ASSERT(y < rm->rm_cols);
484 
485 	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
486 
487 	/*
488 	 * Move the parity data aside -- we're going to compute parity as
489 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
490 	 * reuse the parity generation mechanism without trashing the actual
491 	 * parity so we make those columns appear to be full of zeros by
492 	 * setting their lengths to zero.
493 	 */
494 	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
495 	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
496 	xsize = rm->rm_col[x].rc_size;
497 	ysize = rm->rm_col[y].rc_size;
498 
499 	rm->rm_col[VDEV_RAIDZ_P].rc_data =
500 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
501 	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
502 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
503 	rm->rm_col[x].rc_size = 0;
504 	rm->rm_col[y].rc_size = 0;
505 
506 	vdev_raidz_generate_parity_pq(rm);
507 
508 	rm->rm_col[x].rc_size = xsize;
509 	rm->rm_col[y].rc_size = ysize;
510 
511 	p = pdata;
512 	q = qdata;
513 	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
514 	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
515 	xd = rm->rm_col[x].rc_data;
516 	yd = rm->rm_col[y].rc_data;
517 
518 	/*
519 	 * We now have:
520 	 *	Pxy = P + D_x + D_y
521 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
522 	 *
523 	 * We can then solve for D_x:
524 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
525 	 * where
526 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
527 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
528 	 *
529 	 * With D_x in hand, we can easily solve for D_y:
530 	 *	D_y = P + Pxy + D_x
531 	 */
532 
533 	a = vdev_raidz_pow2[255 + x - y];
534 	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
535 	tmp = 255 - vdev_raidz_log2[a ^ 1];
536 
537 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
538 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
539 
540 	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
541 		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
542 		    vdev_raidz_exp2(*q ^ *qxy, bexp);
543 
544 		if (i < ysize)
545 			*yd = *p ^ *pxy ^ *xd;
546 	}
547 
548 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
549 	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
550 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
551 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
552 
553 	/*
554 	 * Restore the saved parity data.
555 	 */
556 	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
557 	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
558 }
559 
560 
561 static int
562 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
563 {
564 	vdev_t *cvd;
565 	uint64_t nparity = vd->vdev_nparity;
566 	int c, error;
567 	int lasterror = 0;
568 	int numerrors = 0;
569 
570 	ASSERT(nparity > 0);
571 
572 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
573 	    vd->vdev_children < nparity + 1) {
574 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
575 		return (EINVAL);
576 	}
577 
578 	for (c = 0; c < vd->vdev_children; c++) {
579 		cvd = vd->vdev_child[c];
580 
581 		if ((error = vdev_open(cvd)) != 0) {
582 			lasterror = error;
583 			numerrors++;
584 			continue;
585 		}
586 
587 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
588 		*ashift = MAX(*ashift, cvd->vdev_ashift);
589 	}
590 
591 	*asize *= vd->vdev_children;
592 
593 	if (numerrors > nparity) {
594 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
595 		return (lasterror);
596 	}
597 
598 	return (0);
599 }
600 
601 static void
602 vdev_raidz_close(vdev_t *vd)
603 {
604 	int c;
605 
606 	for (c = 0; c < vd->vdev_children; c++)
607 		vdev_close(vd->vdev_child[c]);
608 }
609 
610 static uint64_t
611 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
612 {
613 	uint64_t asize;
614 	uint64_t ashift = vd->vdev_top->vdev_ashift;
615 	uint64_t cols = vd->vdev_children;
616 	uint64_t nparity = vd->vdev_nparity;
617 
618 	asize = ((psize - 1) >> ashift) + 1;
619 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
620 	asize = roundup(asize, nparity + 1) << ashift;
621 
622 	return (asize);
623 }
624 
625 static void
626 vdev_raidz_child_done(zio_t *zio)
627 {
628 	raidz_col_t *rc = zio->io_private;
629 
630 	rc->rc_error = zio->io_error;
631 	rc->rc_tried = 1;
632 	rc->rc_skipped = 0;
633 }
634 
635 static void
636 vdev_raidz_repair_done(zio_t *zio)
637 {
638 	ASSERT(zio->io_private == zio->io_parent);
639 	vdev_raidz_map_free(zio->io_private);
640 }
641 
642 static int
643 vdev_raidz_io_start(zio_t *zio)
644 {
645 	vdev_t *vd = zio->io_vd;
646 	vdev_t *tvd = vd->vdev_top;
647 	vdev_t *cvd;
648 	blkptr_t *bp = zio->io_bp;
649 	raidz_map_t *rm;
650 	raidz_col_t *rc;
651 	int c;
652 
653 	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
654 	    vd->vdev_nparity);
655 
656 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
657 
658 	if (zio->io_type == ZIO_TYPE_WRITE) {
659 		/*
660 		 * Generate RAID parity in the first virtual columns.
661 		 */
662 		if (rm->rm_firstdatacol == 1)
663 			vdev_raidz_generate_parity_p(rm);
664 		else
665 			vdev_raidz_generate_parity_pq(rm);
666 
667 		for (c = 0; c < rm->rm_cols; c++) {
668 			rc = &rm->rm_col[c];
669 			cvd = vd->vdev_child[rc->rc_devidx];
670 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
671 			    rc->rc_offset, rc->rc_data, rc->rc_size,
672 			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
673 			    vdev_raidz_child_done, rc));
674 		}
675 
676 		return (zio_wait_for_children_done(zio));
677 	}
678 
679 	ASSERT(zio->io_type == ZIO_TYPE_READ);
680 
681 	/*
682 	 * Iterate over the columns in reverse order so that we hit the parity
683 	 * last -- any errors along the way will force us to read the parity
684 	 * data.
685 	 */
686 	for (c = rm->rm_cols - 1; c >= 0; c--) {
687 		rc = &rm->rm_col[c];
688 		cvd = vd->vdev_child[rc->rc_devidx];
689 		if (!vdev_readable(cvd)) {
690 			if (c >= rm->rm_firstdatacol)
691 				rm->rm_missingdata++;
692 			else
693 				rm->rm_missingparity++;
694 			rc->rc_error = ENXIO;
695 			rc->rc_tried = 1;	/* don't even try */
696 			rc->rc_skipped = 1;
697 			continue;
698 		}
699 		if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
700 			if (c >= rm->rm_firstdatacol)
701 				rm->rm_missingdata++;
702 			else
703 				rm->rm_missingparity++;
704 			rc->rc_error = ESTALE;
705 			rc->rc_skipped = 1;
706 			continue;
707 		}
708 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
709 		    (zio->io_flags & ZIO_FLAG_SCRUB)) {
710 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
711 			    rc->rc_offset, rc->rc_data, rc->rc_size,
712 			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
713 			    vdev_raidz_child_done, rc));
714 		}
715 	}
716 
717 	return (zio_wait_for_children_done(zio));
718 }
719 
720 /*
721  * Report a checksum error for a child of a RAID-Z device.
722  */
723 static void
724 raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
725 {
726 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
727 	dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
728 	    vdev_description(vd));
729 
730 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
731 		mutex_enter(&vd->vdev_stat_lock);
732 		vd->vdev_stat.vs_checksum_errors++;
733 		mutex_exit(&vd->vdev_stat_lock);
734 	}
735 
736 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
737 		zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
738 		    zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
739 }
740 
741 /*
742  * Generate the parity from the data columns. If we tried and were able to
743  * read the parity without error, verify that the generated parity matches the
744  * data we read. If it doesn't, we fire off a checksum error. Return the
745  * number such failures.
746  */
747 static int
748 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
749 {
750 	void *orig[VDEV_RAIDZ_MAXPARITY];
751 	int c, ret = 0;
752 	raidz_col_t *rc;
753 
754 	for (c = 0; c < rm->rm_firstdatacol; c++) {
755 		rc = &rm->rm_col[c];
756 		if (!rc->rc_tried || rc->rc_error != 0)
757 			continue;
758 		orig[c] = zio_buf_alloc(rc->rc_size);
759 		bcopy(rc->rc_data, orig[c], rc->rc_size);
760 	}
761 
762 	if (rm->rm_firstdatacol == 1)
763 		vdev_raidz_generate_parity_p(rm);
764 	else
765 		vdev_raidz_generate_parity_pq(rm);
766 
767 	for (c = 0; c < rm->rm_firstdatacol; c++) {
768 		rc = &rm->rm_col[c];
769 		if (!rc->rc_tried || rc->rc_error != 0)
770 			continue;
771 		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
772 			raidz_checksum_error(zio, rc);
773 			rc->rc_error = ECKSUM;
774 			ret++;
775 		}
776 		zio_buf_free(orig[c], rc->rc_size);
777 	}
778 
779 	return (ret);
780 }
781 
782 static uint64_t raidz_corrected_p;
783 static uint64_t raidz_corrected_q;
784 static uint64_t raidz_corrected_pq;
785 
786 static int
787 vdev_raidz_io_done(zio_t *zio)
788 {
789 	vdev_t *vd = zio->io_vd;
790 	vdev_t *cvd;
791 	raidz_map_t *rm = zio->io_vsd;
792 	raidz_col_t *rc, *rc1;
793 	int unexpected_errors = 0;
794 	int parity_errors = 0;
795 	int parity_untried = 0;
796 	int data_errors = 0;
797 	int n, c, c1;
798 
799 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
800 
801 	zio->io_error = 0;
802 	zio->io_numerrors = 0;
803 
804 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
805 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
806 
807 	for (c = 0; c < rm->rm_cols; c++) {
808 		rc = &rm->rm_col[c];
809 
810 		/*
811 		 * We preserve any EIOs because those may be worth retrying;
812 		 * whereas ECKSUM and ENXIO are more likely to be persistent.
813 		 */
814 		if (rc->rc_error) {
815 			if (zio->io_error != EIO)
816 				zio->io_error = rc->rc_error;
817 
818 			if (c < rm->rm_firstdatacol)
819 				parity_errors++;
820 			else
821 				data_errors++;
822 
823 			if (!rc->rc_skipped)
824 				unexpected_errors++;
825 
826 			zio->io_numerrors++;
827 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
828 			parity_untried++;
829 		}
830 	}
831 
832 	if (zio->io_type == ZIO_TYPE_WRITE) {
833 		/*
834 		 * If this is not a failfast write, and we were able to
835 		 * write enough columns to reconstruct the data, good enough.
836 		 */
837 		/* XXPOLICY */
838 		if (zio->io_numerrors <= rm->rm_firstdatacol &&
839 		    !(zio->io_flags & ZIO_FLAG_FAILFAST))
840 			zio->io_error = 0;
841 
842 		vdev_raidz_map_free(zio);
843 
844 		return (ZIO_PIPELINE_CONTINUE);
845 	}
846 
847 	ASSERT(zio->io_type == ZIO_TYPE_READ);
848 	/*
849 	 * There are three potential phases for a read:
850 	 *	1. produce valid data from the columns read
851 	 *	2. read all disks and try again
852 	 *	3. perform combinatorial reconstruction
853 	 *
854 	 * Each phase is progressively both more expensive and less likely to
855 	 * occur. If we encounter more errors than we can repair or all phases
856 	 * fail, we have no choice but to return an error.
857 	 */
858 
859 	/*
860 	 * If the number of errors we saw was correctable -- less than or equal
861 	 * to the number of parity disks read -- attempt to produce data that
862 	 * has a valid checksum. Naturally, this case applies in the absence of
863 	 * any errors.
864 	 */
865 	if (zio->io_numerrors <= rm->rm_firstdatacol - parity_untried) {
866 		switch (data_errors) {
867 		case 0:
868 			if (zio_checksum_error(zio) == 0) {
869 				zio->io_error = 0;
870 
871 				/*
872 				 * If we read parity information (unnecessarily
873 				 * as it happens since no reconstruction was
874 				 * needed) regenerate and verify the parity.
875 				 * We also regenerate parity when resilvering
876 				 * so we can write it out to the failed device
877 				 * later.
878 				 */
879 				if (parity_errors + parity_untried <
880 				    rm->rm_firstdatacol ||
881 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
882 					n = raidz_parity_verify(zio, rm);
883 					unexpected_errors += n;
884 					ASSERT(parity_errors + n <=
885 					    rm->rm_firstdatacol);
886 				}
887 				goto done;
888 			}
889 			break;
890 
891 		case 1:
892 			/*
893 			 * We either attempt to read all the parity columns or
894 			 * none of them. If we didn't try to read parity, we
895 			 * wouldn't be here in the correctable case. There must
896 			 * also have been fewer parity errors than parity
897 			 * columns or, again, we wouldn't be in this code path.
898 			 */
899 			ASSERT(parity_untried == 0);
900 			ASSERT(parity_errors < rm->rm_firstdatacol);
901 
902 			/*
903 			 * Find the column that reported the error.
904 			 */
905 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
906 				rc = &rm->rm_col[c];
907 				if (rc->rc_error != 0)
908 					break;
909 			}
910 			ASSERT(c != rm->rm_cols);
911 			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
912 			    rc->rc_error == ESTALE);
913 
914 			if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
915 				vdev_raidz_reconstruct_p(rm, c);
916 			} else {
917 				ASSERT(rm->rm_firstdatacol > 1);
918 				vdev_raidz_reconstruct_q(rm, c);
919 			}
920 
921 			if (zio_checksum_error(zio) == 0) {
922 				zio->io_error = 0;
923 				if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0)
924 					atomic_inc_64(&raidz_corrected_p);
925 				else
926 					atomic_inc_64(&raidz_corrected_q);
927 
928 				/*
929 				 * If there's more than one parity disk that
930 				 * was successfully read, confirm that the
931 				 * other parity disk produced the correct data.
932 				 * This routine is suboptimal in that it
933 				 * regenerates both the parity we wish to test
934 				 * as well as the parity we just used to
935 				 * perform the reconstruction, but this should
936 				 * be a relatively uncommon case, and can be
937 				 * optimized if it becomes a problem.
938 				 * We also regenerate parity when resilvering
939 				 * so we can write it out to the failed device
940 				 * later.
941 				 */
942 				if (parity_errors < rm->rm_firstdatacol - 1 ||
943 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
944 					n = raidz_parity_verify(zio, rm);
945 					unexpected_errors += n;
946 					ASSERT(parity_errors + n <=
947 					    rm->rm_firstdatacol);
948 				}
949 
950 				goto done;
951 			}
952 			break;
953 
954 		case 2:
955 			/*
956 			 * Two data column errors require double parity.
957 			 */
958 			ASSERT(rm->rm_firstdatacol == 2);
959 
960 			/*
961 			 * Find the two columns that reported errors.
962 			 */
963 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
964 				rc = &rm->rm_col[c];
965 				if (rc->rc_error != 0)
966 					break;
967 			}
968 			ASSERT(c != rm->rm_cols);
969 			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
970 			    rc->rc_error == ESTALE);
971 
972 			for (c1 = c++; c < rm->rm_cols; c++) {
973 				rc = &rm->rm_col[c];
974 				if (rc->rc_error != 0)
975 					break;
976 			}
977 			ASSERT(c != rm->rm_cols);
978 			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
979 			    rc->rc_error == ESTALE);
980 
981 			vdev_raidz_reconstruct_pq(rm, c1, c);
982 
983 			if (zio_checksum_error(zio) == 0) {
984 				zio->io_error = 0;
985 				atomic_inc_64(&raidz_corrected_pq);
986 
987 				goto done;
988 			}
989 			break;
990 
991 		default:
992 			ASSERT(rm->rm_firstdatacol <= 2);
993 			ASSERT(0);
994 		}
995 	}
996 
997 	/*
998 	 * This isn't a typical situation -- either we got a read error or
999 	 * a child silently returned bad data. Read every block so we can
1000 	 * try again with as much data and parity as we can track down. If
1001 	 * we've already been through once before, all children will be marked
1002 	 * as tried so we'll proceed to combinatorial reconstruction.
1003 	 */
1004 	unexpected_errors = 1;
1005 	rm->rm_missingdata = 0;
1006 	rm->rm_missingparity = 0;
1007 
1008 	for (c = 0; c < rm->rm_cols; c++) {
1009 		if (rm->rm_col[c].rc_tried)
1010 			continue;
1011 
1012 		zio->io_error = 0;
1013 		zio_vdev_io_redone(zio);
1014 		do {
1015 			rc = &rm->rm_col[c];
1016 			if (rc->rc_tried)
1017 				continue;
1018 			zio_nowait(zio_vdev_child_io(zio, NULL,
1019 			    vd->vdev_child[rc->rc_devidx],
1020 			    rc->rc_offset, rc->rc_data, rc->rc_size,
1021 			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
1022 			    vdev_raidz_child_done, rc));
1023 		} while (++c < rm->rm_cols);
1024 		dprintf("rereading\n");
1025 
1026 		return (zio_wait_for_children_done(zio));
1027 	}
1028 
1029 	/*
1030 	 * At this point we've attempted to reconstruct the data given the
1031 	 * errors we detected, and we've attempted to read all columns. There
1032 	 * must, therefore, be one or more additional problems -- silent errors
1033 	 * resulting in invalid data rather than explicit I/O errors resulting
1034 	 * in absent data. Before we attempt combinatorial reconstruction make
1035 	 * sure we have a chance of coming up with the right answer.
1036 	 */
1037 	if (zio->io_numerrors >= rm->rm_firstdatacol) {
1038 		ASSERT(zio->io_error != 0);
1039 		goto done;
1040 	}
1041 
1042 	if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
1043 		/*
1044 		 * Attempt to reconstruct the data from parity P.
1045 		 */
1046 		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1047 			void *orig;
1048 			rc = &rm->rm_col[c];
1049 
1050 			orig = zio_buf_alloc(rc->rc_size);
1051 			bcopy(rc->rc_data, orig, rc->rc_size);
1052 			vdev_raidz_reconstruct_p(rm, c);
1053 
1054 			if (zio_checksum_error(zio) == 0) {
1055 				zio_buf_free(orig, rc->rc_size);
1056 				zio->io_error = 0;
1057 				atomic_inc_64(&raidz_corrected_p);
1058 
1059 				/*
1060 				 * If this child didn't know that it returned
1061 				 * bad data, inform it.
1062 				 */
1063 				if (rc->rc_tried && rc->rc_error == 0)
1064 					raidz_checksum_error(zio, rc);
1065 				rc->rc_error = ECKSUM;
1066 				goto done;
1067 			}
1068 
1069 			bcopy(orig, rc->rc_data, rc->rc_size);
1070 			zio_buf_free(orig, rc->rc_size);
1071 		}
1072 	}
1073 
1074 	if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
1075 		/*
1076 		 * Attempt to reconstruct the data from parity Q.
1077 		 */
1078 		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1079 			void *orig;
1080 			rc = &rm->rm_col[c];
1081 
1082 			orig = zio_buf_alloc(rc->rc_size);
1083 			bcopy(rc->rc_data, orig, rc->rc_size);
1084 			vdev_raidz_reconstruct_q(rm, c);
1085 
1086 			if (zio_checksum_error(zio) == 0) {
1087 				zio_buf_free(orig, rc->rc_size);
1088 				zio->io_error = 0;
1089 				atomic_inc_64(&raidz_corrected_q);
1090 
1091 				/*
1092 				 * If this child didn't know that it returned
1093 				 * bad data, inform it.
1094 				 */
1095 				if (rc->rc_tried && rc->rc_error == 0)
1096 					raidz_checksum_error(zio, rc);
1097 				rc->rc_error = ECKSUM;
1098 				goto done;
1099 			}
1100 
1101 			bcopy(orig, rc->rc_data, rc->rc_size);
1102 			zio_buf_free(orig, rc->rc_size);
1103 		}
1104 	}
1105 
1106 	if (rm->rm_firstdatacol > 1 &&
1107 	    rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 &&
1108 	    rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
1109 		/*
1110 		 * Attempt to reconstruct the data from both P and Q.
1111 		 */
1112 		for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) {
1113 			void *orig, *orig1;
1114 			rc = &rm->rm_col[c];
1115 
1116 			orig = zio_buf_alloc(rc->rc_size);
1117 			bcopy(rc->rc_data, orig, rc->rc_size);
1118 
1119 			for (c1 = c + 1; c1 < rm->rm_cols; c1++) {
1120 				rc1 = &rm->rm_col[c1];
1121 
1122 				orig1 = zio_buf_alloc(rc1->rc_size);
1123 				bcopy(rc1->rc_data, orig1, rc1->rc_size);
1124 
1125 				vdev_raidz_reconstruct_pq(rm, c, c1);
1126 
1127 				if (zio_checksum_error(zio) == 0) {
1128 					zio_buf_free(orig, rc->rc_size);
1129 					zio_buf_free(orig1, rc1->rc_size);
1130 					zio->io_error = 0;
1131 					atomic_inc_64(&raidz_corrected_pq);
1132 
1133 					/*
1134 					 * If these children didn't know they
1135 					 * returned bad data, inform them.
1136 					 */
1137 					if (rc->rc_tried && rc->rc_error == 0)
1138 						raidz_checksum_error(zio, rc);
1139 					if (rc1->rc_tried && rc1->rc_error == 0)
1140 						raidz_checksum_error(zio, rc1);
1141 
1142 					rc->rc_error = ECKSUM;
1143 					rc1->rc_error = ECKSUM;
1144 
1145 					goto done;
1146 				}
1147 
1148 				bcopy(orig1, rc1->rc_data, rc1->rc_size);
1149 				zio_buf_free(orig1, rc1->rc_size);
1150 			}
1151 
1152 			bcopy(orig, rc->rc_data, rc->rc_size);
1153 			zio_buf_free(orig, rc->rc_size);
1154 		}
1155 	}
1156 
1157 	/*
1158 	 * All combinations failed to checksum. Generate checksum ereports for
1159 	 * all children.
1160 	 */
1161 	zio->io_error = ECKSUM;
1162 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1163 		for (c = 0; c < rm->rm_cols; c++) {
1164 			rc = &rm->rm_col[c];
1165 			zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
1166 			    zio->io_spa, vd->vdev_child[rc->rc_devidx], zio,
1167 			    rc->rc_offset, rc->rc_size);
1168 		}
1169 	}
1170 
1171 done:
1172 	zio_checksum_verified(zio);
1173 
1174 	if (zio->io_error == 0 && (spa_mode & FWRITE) &&
1175 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
1176 		zio_t *rio;
1177 
1178 		/*
1179 		 * Use the good data we have in hand to repair damaged children.
1180 		 *
1181 		 * We issue all repair I/Os as children of 'rio' to arrange
1182 		 * that vdev_raidz_map_free(zio) will be invoked after all
1183 		 * repairs complete, but before we advance to the next stage.
1184 		 */
1185 		rio = zio_null(zio, zio->io_spa,
1186 		    vdev_raidz_repair_done, zio, ZIO_FLAG_CANFAIL);
1187 
1188 		for (c = 0; c < rm->rm_cols; c++) {
1189 			rc = &rm->rm_col[c];
1190 			cvd = vd->vdev_child[rc->rc_devidx];
1191 
1192 			if (rc->rc_error == 0)
1193 				continue;
1194 
1195 			dprintf("%s resilvered %s @ 0x%llx error %d\n",
1196 			    vdev_description(vd),
1197 			    vdev_description(cvd),
1198 			    zio->io_offset, rc->rc_error);
1199 
1200 			zio_nowait(zio_vdev_child_io(rio, NULL, cvd,
1201 			    rc->rc_offset, rc->rc_data, rc->rc_size,
1202 			    ZIO_TYPE_WRITE, zio->io_priority,
1203 			    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_DONT_PROPAGATE |
1204 			    ZIO_FLAG_CANFAIL, NULL, NULL));
1205 		}
1206 
1207 		zio_nowait(rio);
1208 
1209 		return (zio_wait_for_children_done(zio));
1210 	}
1211 
1212 	vdev_raidz_map_free(zio);
1213 
1214 	return (ZIO_PIPELINE_CONTINUE);
1215 }
1216 
1217 static void
1218 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
1219 {
1220 	if (faulted > vd->vdev_nparity)
1221 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1222 		    VDEV_AUX_NO_REPLICAS);
1223 	else if (degraded + faulted != 0)
1224 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
1225 	else
1226 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
1227 }
1228 
1229 vdev_ops_t vdev_raidz_ops = {
1230 	vdev_raidz_open,
1231 	vdev_raidz_close,
1232 	NULL,
1233 	vdev_raidz_asize,
1234 	vdev_raidz_io_start,
1235 	vdev_raidz_io_done,
1236 	vdev_raidz_state_change,
1237 	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
1238 	B_FALSE			/* not a leaf vdev */
1239 };
1240