xref: /illumos-gate/usr/src/uts/common/fs/zfs/vdev_raidz.c (revision 150d2c5288c645a1c1a7d2bee61199a3729406c7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/zfs_context.h>
30 #include <sys/spa.h>
31 #include <sys/vdev_impl.h>
32 #include <sys/zio.h>
33 #include <sys/zio_checksum.h>
34 #include <sys/fs/zfs.h>
35 #include <sys/fm/fs/zfs.h>
36 
37 /*
38  * Virtual device vector for RAID-Z.
39  *
40  * This vdev supports both single and double parity. For single parity, we
41  * use a simple XOR of all the data columns. For double parity, we use both
42  * the simple XOR as well as a technique described in "The mathematics of
43  * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8),
44  * over the integers expressable in a single byte. Briefly, the operations on
45  * the field are defined as follows:
46  *
47  *   o addition (+) is represented by a bitwise XOR
48  *   o subtraction (-) is therefore identical to addition: A + B = A - B
49  *   o multiplication of A by 2 is defined by the following bitwise expression:
50  *	(A * 2)_7 = A_6
51  *	(A * 2)_6 = A_5
52  *	(A * 2)_5 = A_4
53  *	(A * 2)_4 = A_3 + A_7
54  *	(A * 2)_3 = A_2 + A_7
55  *	(A * 2)_2 = A_1 + A_7
56  *	(A * 2)_1 = A_0
57  *	(A * 2)_0 = A_7
58  *
59  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
60  *
61  * Observe that any number in the field (except for 0) can be expressed as a
62  * power of 2 -- a generator for the field. We store a table of the powers of
63  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
64  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
65  * than field addition). The inverse of a field element A (A^-1) is A^254.
66  *
67  * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1,
68  * can be expressed by field operations:
69  *
70  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
71  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
72  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
73  *
74  * See the reconstruction code below for how P and Q can used individually or
75  * in concert to recover missing data columns.
76  */
77 
78 typedef struct raidz_col {
79 	uint64_t rc_devidx;		/* child device index for I/O */
80 	uint64_t rc_offset;		/* device offset */
81 	uint64_t rc_size;		/* I/O size */
82 	void *rc_data;			/* I/O data */
83 	int rc_error;			/* I/O error for this device */
84 	uint8_t rc_tried;		/* Did we attempt this I/O column? */
85 	uint8_t rc_skipped;		/* Did we skip this I/O column? */
86 } raidz_col_t;
87 
88 typedef struct raidz_map {
89 	uint64_t rm_cols;		/* Column count */
90 	uint64_t rm_bigcols;		/* Number of oversized columns */
91 	uint64_t rm_asize;		/* Actual total I/O size */
92 	uint64_t rm_missingdata;	/* Count of missing data devices */
93 	uint64_t rm_missingparity;	/* Count of missing parity devices */
94 	uint64_t rm_firstdatacol;	/* First data column/parity count */
95 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
96 } raidz_map_t;
97 
98 #define	VDEV_RAIDZ_P		0
99 #define	VDEV_RAIDZ_Q		1
100 
101 #define	VDEV_RAIDZ_MAXPARITY	2
102 
103 #define	VDEV_RAIDZ_MUL_2(a)	(((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0))
104 
105 /*
106  * These two tables represent powers and logs of 2 in the Galois field defined
107  * above. These values were computed by repeatedly multiplying by 2 as above.
108  */
109 static const uint8_t vdev_raidz_pow2[256] = {
110 	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
111 	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
112 	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
113 	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
114 	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
115 	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
116 	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
117 	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
118 	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
119 	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
120 	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
121 	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
122 	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
123 	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
124 	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
125 	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
126 	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
127 	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
128 	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
129 	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
130 	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
131 	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
132 	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
133 	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
134 	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
135 	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
136 	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
137 	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
138 	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
139 	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
140 	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
141 	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
142 };
143 static const uint8_t vdev_raidz_log2[256] = {
144 	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
145 	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
146 	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
147 	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
148 	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
149 	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
150 	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
151 	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
152 	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
153 	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
154 	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
155 	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
156 	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
157 	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
158 	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
159 	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
160 	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
161 	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
162 	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
163 	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
164 	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
165 	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
166 	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
167 	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
168 	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
169 	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
170 	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
171 	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
172 	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
173 	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
174 	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
175 	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
176 };
177 
178 /*
179  * Multiply a given number by 2 raised to the given power.
180  */
181 static uint8_t
182 vdev_raidz_exp2(uint_t a, int exp)
183 {
184 	if (a == 0)
185 		return (0);
186 
187 	ASSERT(exp >= 0);
188 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
189 
190 	exp += vdev_raidz_log2[a];
191 	if (exp > 255)
192 		exp -= 255;
193 
194 	return (vdev_raidz_pow2[exp]);
195 }
196 
197 static raidz_map_t *
198 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
199     uint64_t nparity)
200 {
201 	raidz_map_t *rm;
202 	uint64_t b = zio->io_offset >> unit_shift;
203 	uint64_t s = zio->io_size >> unit_shift;
204 	uint64_t f = b % dcols;
205 	uint64_t o = (b / dcols) << unit_shift;
206 	uint64_t q, r, c, bc, col, acols, coff, devidx;
207 
208 	q = s / (dcols - nparity);
209 	r = s - q * (dcols - nparity);
210 	bc = (r == 0 ? 0 : r + nparity);
211 
212 	acols = (q == 0 ? bc : dcols);
213 
214 	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);
215 
216 	rm->rm_cols = acols;
217 	rm->rm_bigcols = bc;
218 	rm->rm_asize = 0;
219 	rm->rm_missingdata = 0;
220 	rm->rm_missingparity = 0;
221 	rm->rm_firstdatacol = nparity;
222 
223 	for (c = 0; c < acols; c++) {
224 		col = f + c;
225 		coff = o;
226 		if (col >= dcols) {
227 			col -= dcols;
228 			coff += 1ULL << unit_shift;
229 		}
230 		rm->rm_col[c].rc_devidx = col;
231 		rm->rm_col[c].rc_offset = coff;
232 		rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
233 		rm->rm_col[c].rc_data = NULL;
234 		rm->rm_col[c].rc_error = 0;
235 		rm->rm_col[c].rc_tried = 0;
236 		rm->rm_col[c].rc_skipped = 0;
237 		rm->rm_asize += rm->rm_col[c].rc_size;
238 	}
239 
240 	rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift);
241 
242 	for (c = 0; c < rm->rm_firstdatacol; c++)
243 		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
244 
245 	rm->rm_col[c].rc_data = zio->io_data;
246 
247 	for (c = c + 1; c < acols; c++)
248 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
249 		    rm->rm_col[c - 1].rc_size;
250 
251 	/*
252 	 * If all data stored spans all columns, there's a danger that parity
253 	 * will always be on the same device and, since parity isn't read
254 	 * during normal operation, that that device's I/O bandwidth won't be
255 	 * used effectively. We therefore switch the parity every 1MB.
256 	 *
257 	 * ... at least that was, ostensibly, the theory. As a practical
258 	 * matter unless we juggle the parity between all devices evenly, we
259 	 * won't see any benefit. Further, occasional writes that aren't a
260 	 * multiple of the LCM of the number of children and the minimum
261 	 * stripe width are sufficient to avoid pessimal behavior.
262 	 * Unfortunately, this decision created an implicit on-disk format
263 	 * requirement that we need to support for all eternity, but only
264 	 * for single-parity RAID-Z.
265 	 */
266 	ASSERT(rm->rm_cols >= 2);
267 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
268 
269 	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
270 		devidx = rm->rm_col[0].rc_devidx;
271 		o = rm->rm_col[0].rc_offset;
272 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
273 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
274 		rm->rm_col[1].rc_devidx = devidx;
275 		rm->rm_col[1].rc_offset = o;
276 	}
277 
278 	zio->io_vsd = rm;
279 	return (rm);
280 }
281 
282 static void
283 vdev_raidz_map_free(zio_t *zio)
284 {
285 	raidz_map_t *rm = zio->io_vsd;
286 	int c;
287 
288 	for (c = 0; c < rm->rm_firstdatacol; c++)
289 		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
290 
291 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
292 	zio->io_vsd = NULL;
293 }
294 
295 static void
296 vdev_raidz_generate_parity_p(raidz_map_t *rm)
297 {
298 	uint64_t *p, *src, pcount, ccount, i;
299 	int c;
300 
301 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
302 
303 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
304 		src = rm->rm_col[c].rc_data;
305 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
306 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
307 
308 		if (c == rm->rm_firstdatacol) {
309 			ASSERT(ccount == pcount);
310 			for (i = 0; i < ccount; i++, p++, src++) {
311 				*p = *src;
312 			}
313 		} else {
314 			ASSERT(ccount <= pcount);
315 			for (i = 0; i < ccount; i++, p++, src++) {
316 				*p ^= *src;
317 			}
318 		}
319 	}
320 }
321 
322 static void
323 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
324 {
325 	uint64_t *q, *p, *src, pcount, ccount, mask, i;
326 	int c;
327 
328 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
329 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
330 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
331 
332 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
333 		src = rm->rm_col[c].rc_data;
334 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
335 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
336 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
337 
338 		if (c == rm->rm_firstdatacol) {
339 			ASSERT(ccount == pcount || ccount == 0);
340 			for (i = 0; i < ccount; i++, p++, q++, src++) {
341 				*q = *src;
342 				*p = *src;
343 			}
344 			for (; i < pcount; i++, p++, q++, src++) {
345 				*q = 0;
346 				*p = 0;
347 			}
348 		} else {
349 			ASSERT(ccount <= pcount);
350 
351 			/*
352 			 * Rather than multiplying each byte individually (as
353 			 * described above), we are able to handle 8 at once
354 			 * by generating a mask based on the high bit in each
355 			 * byte and using that to conditionally XOR in 0x1d.
356 			 */
357 			for (i = 0; i < ccount; i++, p++, q++, src++) {
358 				mask = *q & 0x8080808080808080ULL;
359 				mask = (mask << 1) - (mask >> 7);
360 				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
361 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
362 				*q ^= *src;
363 				*p ^= *src;
364 			}
365 
366 			/*
367 			 * Treat short columns as though they are full of 0s.
368 			 */
369 			for (; i < pcount; i++, q++) {
370 				mask = *q & 0x8080808080808080ULL;
371 				mask = (mask << 1) - (mask >> 7);
372 				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
373 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
374 			}
375 		}
376 	}
377 }
378 
379 static void
380 vdev_raidz_reconstruct_p(raidz_map_t *rm, int x)
381 {
382 	uint64_t *dst, *src, xcount, ccount, count, i;
383 	int c;
384 
385 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
386 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
387 	ASSERT(xcount > 0);
388 
389 	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
390 	dst = rm->rm_col[x].rc_data;
391 	for (i = 0; i < xcount; i++, dst++, src++) {
392 		*dst = *src;
393 	}
394 
395 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
396 		src = rm->rm_col[c].rc_data;
397 		dst = rm->rm_col[x].rc_data;
398 
399 		if (c == x)
400 			continue;
401 
402 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
403 		count = MIN(ccount, xcount);
404 
405 		for (i = 0; i < count; i++, dst++, src++) {
406 			*dst ^= *src;
407 		}
408 	}
409 }
410 
411 static void
412 vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
413 {
414 	uint64_t *dst, *src, xcount, ccount, count, mask, i;
415 	uint8_t *b;
416 	int c, j, exp;
417 
418 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
419 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
420 
421 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
422 		src = rm->rm_col[c].rc_data;
423 		dst = rm->rm_col[x].rc_data;
424 
425 		if (c == x)
426 			ccount = 0;
427 		else
428 			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
429 
430 		count = MIN(ccount, xcount);
431 
432 		if (c == rm->rm_firstdatacol) {
433 			for (i = 0; i < count; i++, dst++, src++) {
434 				*dst = *src;
435 			}
436 			for (; i < xcount; i++, dst++) {
437 				*dst = 0;
438 			}
439 
440 		} else {
441 			/*
442 			 * For an explanation of this, see the comment in
443 			 * vdev_raidz_generate_parity_pq() above.
444 			 */
445 			for (i = 0; i < count; i++, dst++, src++) {
446 				mask = *dst & 0x8080808080808080ULL;
447 				mask = (mask << 1) - (mask >> 7);
448 				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
449 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
450 				*dst ^= *src;
451 			}
452 
453 			for (; i < xcount; i++, dst++) {
454 				mask = *dst & 0x8080808080808080ULL;
455 				mask = (mask << 1) - (mask >> 7);
456 				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
457 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
458 			}
459 		}
460 	}
461 
462 	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
463 	dst = rm->rm_col[x].rc_data;
464 	exp = 255 - (rm->rm_cols - 1 - x);
465 
466 	for (i = 0; i < xcount; i++, dst++, src++) {
467 		*dst ^= *src;
468 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
469 			*b = vdev_raidz_exp2(*b, exp);
470 		}
471 	}
472 }
473 
474 static void
475 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y)
476 {
477 	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
478 	void *pdata, *qdata;
479 	uint64_t xsize, ysize, i;
480 
481 	ASSERT(x < y);
482 	ASSERT(x >= rm->rm_firstdatacol);
483 	ASSERT(y < rm->rm_cols);
484 
485 	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
486 
487 	/*
488 	 * Move the parity data aside -- we're going to compute parity as
489 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
490 	 * reuse the parity generation mechanism without trashing the actual
491 	 * parity so we make those columns appear to be full of zeros by
492 	 * setting their lengths to zero.
493 	 */
494 	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
495 	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
496 	xsize = rm->rm_col[x].rc_size;
497 	ysize = rm->rm_col[y].rc_size;
498 
499 	rm->rm_col[VDEV_RAIDZ_P].rc_data =
500 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
501 	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
502 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
503 	rm->rm_col[x].rc_size = 0;
504 	rm->rm_col[y].rc_size = 0;
505 
506 	vdev_raidz_generate_parity_pq(rm);
507 
508 	rm->rm_col[x].rc_size = xsize;
509 	rm->rm_col[y].rc_size = ysize;
510 
511 	p = pdata;
512 	q = qdata;
513 	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
514 	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
515 	xd = rm->rm_col[x].rc_data;
516 	yd = rm->rm_col[y].rc_data;
517 
518 	/*
519 	 * We now have:
520 	 *	Pxy = P + D_x + D_y
521 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
522 	 *
523 	 * We can then solve for D_x:
524 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
525 	 * where
526 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
527 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
528 	 *
529 	 * With D_x in hand, we can easily solve for D_y:
530 	 *	D_y = P + Pxy + D_x
531 	 */
532 
533 	a = vdev_raidz_pow2[255 + x - y];
534 	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
535 	tmp = 255 - vdev_raidz_log2[a ^ 1];
536 
537 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
538 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
539 
540 	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
541 		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
542 		    vdev_raidz_exp2(*q ^ *qxy, bexp);
543 
544 		if (i < ysize)
545 			*yd = *p ^ *pxy ^ *xd;
546 	}
547 
548 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
549 	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
550 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
551 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
552 
553 	/*
554 	 * Restore the saved parity data.
555 	 */
556 	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
557 	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
558 }
559 
560 
561 static int
562 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
563 {
564 	vdev_t *cvd;
565 	uint64_t nparity = vd->vdev_nparity;
566 	int c, error;
567 	int lasterror = 0;
568 	int numerrors = 0;
569 
570 	ASSERT(nparity > 0);
571 
572 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
573 	    vd->vdev_children < nparity + 1) {
574 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
575 		return (EINVAL);
576 	}
577 
578 	for (c = 0; c < vd->vdev_children; c++) {
579 		cvd = vd->vdev_child[c];
580 
581 		if ((error = vdev_open(cvd)) != 0) {
582 			lasterror = error;
583 			numerrors++;
584 			continue;
585 		}
586 
587 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
588 		*ashift = MAX(*ashift, cvd->vdev_ashift);
589 	}
590 
591 	*asize *= vd->vdev_children;
592 
593 	if (numerrors > nparity) {
594 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
595 		return (lasterror);
596 	}
597 
598 	return (0);
599 }
600 
601 static void
602 vdev_raidz_close(vdev_t *vd)
603 {
604 	int c;
605 
606 	for (c = 0; c < vd->vdev_children; c++)
607 		vdev_close(vd->vdev_child[c]);
608 }
609 
610 static uint64_t
611 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
612 {
613 	uint64_t asize;
614 	uint64_t ashift = vd->vdev_top->vdev_ashift;
615 	uint64_t cols = vd->vdev_children;
616 	uint64_t nparity = vd->vdev_nparity;
617 
618 	asize = ((psize - 1) >> ashift) + 1;
619 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
620 	asize = roundup(asize, nparity + 1) << ashift;
621 
622 	return (asize);
623 }
624 
625 static void
626 vdev_raidz_child_done(zio_t *zio)
627 {
628 	raidz_col_t *rc = zio->io_private;
629 
630 	rc->rc_error = zio->io_error;
631 	rc->rc_tried = 1;
632 	rc->rc_skipped = 0;
633 }
634 
635 static void
636 vdev_raidz_repair_done(zio_t *zio)
637 {
638 	ASSERT(zio->io_private == zio->io_parent);
639 	vdev_raidz_map_free(zio->io_private);
640 }
641 
642 static void
643 vdev_raidz_io_start(zio_t *zio)
644 {
645 	vdev_t *vd = zio->io_vd;
646 	vdev_t *tvd = vd->vdev_top;
647 	vdev_t *cvd;
648 	blkptr_t *bp = zio->io_bp;
649 	raidz_map_t *rm;
650 	raidz_col_t *rc;
651 	int c;
652 
653 	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
654 	    vd->vdev_nparity);
655 
656 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
657 
658 	if (zio->io_type == ZIO_TYPE_WRITE) {
659 		/*
660 		 * Generate RAID parity in the first virtual columns.
661 		 */
662 		if (rm->rm_firstdatacol == 1)
663 			vdev_raidz_generate_parity_p(rm);
664 		else
665 			vdev_raidz_generate_parity_pq(rm);
666 
667 		for (c = 0; c < rm->rm_cols; c++) {
668 			rc = &rm->rm_col[c];
669 			cvd = vd->vdev_child[rc->rc_devidx];
670 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
671 			    rc->rc_offset, rc->rc_data, rc->rc_size,
672 			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
673 			    vdev_raidz_child_done, rc));
674 		}
675 		zio_wait_children_done(zio);
676 		return;
677 	}
678 
679 	ASSERT(zio->io_type == ZIO_TYPE_READ);
680 
681 	/*
682 	 * Iterate over the columns in reverse order so that we hit the parity
683 	 * last -- any errors along the way will force us to read the parity
684 	 * data.
685 	 */
686 	for (c = rm->rm_cols - 1; c >= 0; c--) {
687 		rc = &rm->rm_col[c];
688 		cvd = vd->vdev_child[rc->rc_devidx];
689 		if (vdev_is_dead(cvd)) {
690 			if (c >= rm->rm_firstdatacol)
691 				rm->rm_missingdata++;
692 			else
693 				rm->rm_missingparity++;
694 			rc->rc_error = ENXIO;
695 			rc->rc_tried = 1;	/* don't even try */
696 			rc->rc_skipped = 1;
697 			continue;
698 		}
699 		if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
700 			if (c >= rm->rm_firstdatacol)
701 				rm->rm_missingdata++;
702 			else
703 				rm->rm_missingparity++;
704 			rc->rc_error = ESTALE;
705 			rc->rc_skipped = 1;
706 			continue;
707 		}
708 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
709 		    (zio->io_flags & ZIO_FLAG_SCRUB)) {
710 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
711 			    rc->rc_offset, rc->rc_data, rc->rc_size,
712 			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
713 			    vdev_raidz_child_done, rc));
714 		}
715 	}
716 
717 	zio_wait_children_done(zio);
718 }
719 
720 /*
721  * Report a checksum error for a child of a RAID-Z device.
722  */
723 static void
724 raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
725 {
726 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
727 	dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
728 	    vdev_description(vd));
729 
730 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
731 		mutex_enter(&vd->vdev_stat_lock);
732 		vd->vdev_stat.vs_checksum_errors++;
733 		mutex_exit(&vd->vdev_stat_lock);
734 	}
735 
736 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
737 		zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
738 		    zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
739 }
740 
741 /*
742  * Generate the parity from the data columns. If we tried and were able to
743  * read the parity without error, verify that the generated parity matches the
744  * data we read. If it doesn't, we fire off a checksum error. Return the
745  * number such failures.
746  */
747 static int
748 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
749 {
750 	void *orig[VDEV_RAIDZ_MAXPARITY];
751 	int c, ret = 0;
752 	raidz_col_t *rc;
753 
754 	for (c = 0; c < rm->rm_firstdatacol; c++) {
755 		rc = &rm->rm_col[c];
756 		if (!rc->rc_tried || rc->rc_error != 0)
757 			continue;
758 		orig[c] = zio_buf_alloc(rc->rc_size);
759 		bcopy(rc->rc_data, orig[c], rc->rc_size);
760 	}
761 
762 	if (rm->rm_firstdatacol == 1)
763 		vdev_raidz_generate_parity_p(rm);
764 	else
765 		vdev_raidz_generate_parity_pq(rm);
766 
767 	for (c = 0; c < rm->rm_firstdatacol; c++) {
768 		rc = &rm->rm_col[c];
769 		if (!rc->rc_tried || rc->rc_error != 0)
770 			continue;
771 		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
772 			raidz_checksum_error(zio, rc);
773 			rc->rc_error = ECKSUM;
774 			ret++;
775 		}
776 		zio_buf_free(orig[c], rc->rc_size);
777 	}
778 
779 	return (ret);
780 }
781 
782 static uint64_t raidz_corrected_p;
783 static uint64_t raidz_corrected_q;
784 static uint64_t raidz_corrected_pq;
785 
786 static void
787 vdev_raidz_io_done(zio_t *zio)
788 {
789 	vdev_t *vd = zio->io_vd;
790 	vdev_t *cvd;
791 	raidz_map_t *rm = zio->io_vsd;
792 	raidz_col_t *rc, *rc1;
793 	int unexpected_errors = 0;
794 	int parity_errors = 0;
795 	int parity_untried = 0;
796 	int data_errors = 0;
797 	int n, c, c1;
798 
799 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
800 
801 	zio->io_error = 0;
802 	zio->io_numerrors = 0;
803 
804 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
805 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
806 
807 	for (c = 0; c < rm->rm_cols; c++) {
808 		rc = &rm->rm_col[c];
809 
810 		/*
811 		 * We preserve any EIOs because those may be worth retrying;
812 		 * whereas ECKSUM and ENXIO are more likely to be persistent.
813 		 */
814 		if (rc->rc_error) {
815 			if (zio->io_error != EIO)
816 				zio->io_error = rc->rc_error;
817 
818 			if (c < rm->rm_firstdatacol)
819 				parity_errors++;
820 			else
821 				data_errors++;
822 
823 			if (!rc->rc_skipped)
824 				unexpected_errors++;
825 
826 			zio->io_numerrors++;
827 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
828 			parity_untried++;
829 		}
830 	}
831 
832 	if (zio->io_type == ZIO_TYPE_WRITE) {
833 		/*
834 		 * If this is not a failfast write, and we were able to
835 		 * write enough columns to reconstruct the data, good enough.
836 		 */
837 		/* XXPOLICY */
838 		if (zio->io_numerrors <= rm->rm_firstdatacol &&
839 		    !(zio->io_flags & ZIO_FLAG_FAILFAST))
840 			zio->io_error = 0;
841 
842 		vdev_raidz_map_free(zio);
843 		zio_next_stage(zio);
844 		return;
845 	}
846 
847 	ASSERT(zio->io_type == ZIO_TYPE_READ);
848 	/*
849 	 * There are three potential phases for a read:
850 	 *	1. produce valid data from the columns read
851 	 *	2. read all disks and try again
852 	 *	3. perform combinatorial reconstruction
853 	 *
854 	 * Each phase is progressively both more expensive and less likely to
855 	 * occur. If we encounter more errors than we can repair or all phases
856 	 * fail, we have no choice but to return an error.
857 	 */
858 
859 	/*
860 	 * If the number of errors we saw was correctable -- less than or equal
861 	 * to the number of parity disks read -- attempt to produce data that
862 	 * has a valid checksum. Naturally, this case applies in the absence of
863 	 * any errors.
864 	 */
865 	if (zio->io_numerrors <= rm->rm_firstdatacol - parity_untried) {
866 		switch (data_errors) {
867 		case 0:
868 			if (zio_checksum_error(zio) == 0) {
869 				zio->io_error = 0;
870 				if (parity_errors + parity_untried <
871 				    rm->rm_firstdatacol) {
872 					n = raidz_parity_verify(zio, rm);
873 					unexpected_errors += n;
874 					ASSERT(parity_errors + n <=
875 					    rm->rm_firstdatacol);
876 				}
877 				goto done;
878 			}
879 			break;
880 
881 		case 1:
882 			/*
883 			 * We either attempt to read all the parity columns or
884 			 * none of them. If we didn't try to read parity, we
885 			 * wouldn't be here in the correctable case. There must
886 			 * also have been fewer parity errors than parity
887 			 * columns or, again, we wouldn't be in this code path.
888 			 */
889 			ASSERT(parity_untried == 0);
890 			ASSERT(parity_errors < rm->rm_firstdatacol);
891 
892 			/*
893 			 * Find the column that reported the error.
894 			 */
895 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
896 				rc = &rm->rm_col[c];
897 				if (rc->rc_error != 0)
898 					break;
899 			}
900 			ASSERT(c != rm->rm_cols);
901 			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
902 			    rc->rc_error == ESTALE);
903 
904 			if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
905 				vdev_raidz_reconstruct_p(rm, c);
906 			} else {
907 				ASSERT(rm->rm_firstdatacol > 1);
908 				vdev_raidz_reconstruct_q(rm, c);
909 			}
910 
911 			if (zio_checksum_error(zio) == 0) {
912 				zio->io_error = 0;
913 				if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0)
914 					atomic_inc_64(&raidz_corrected_p);
915 				else
916 					atomic_inc_64(&raidz_corrected_q);
917 
918 				/*
919 				 * If there's more than one parity disk that
920 				 * was successfully read, confirm that the
921 				 * other parity disk produced the correct data.
922 				 * This routine is suboptimal in that it
923 				 * regenerates both the parity we wish to test
924 				 * as well as the parity we just used to
925 				 * perform the reconstruction, but this should
926 				 * be a relatively uncommon case, and can be
927 				 * optimized if it becomes a problem.
928 				 */
929 				if (parity_errors < rm->rm_firstdatacol - 1) {
930 					n = raidz_parity_verify(zio, rm);
931 					unexpected_errors += n;
932 					ASSERT(parity_errors + n <=
933 					    rm->rm_firstdatacol);
934 				}
935 
936 				goto done;
937 			}
938 			break;
939 
940 		case 2:
941 			/*
942 			 * Two data column errors require double parity.
943 			 */
944 			ASSERT(rm->rm_firstdatacol == 2);
945 
946 			/*
947 			 * Find the two columns that reported errors.
948 			 */
949 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
950 				rc = &rm->rm_col[c];
951 				if (rc->rc_error != 0)
952 					break;
953 			}
954 			ASSERT(c != rm->rm_cols);
955 			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
956 			    rc->rc_error == ESTALE);
957 
958 			for (c1 = c++; c < rm->rm_cols; c++) {
959 				rc = &rm->rm_col[c];
960 				if (rc->rc_error != 0)
961 					break;
962 			}
963 			ASSERT(c != rm->rm_cols);
964 			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
965 			    rc->rc_error == ESTALE);
966 
967 			vdev_raidz_reconstruct_pq(rm, c1, c);
968 
969 			if (zio_checksum_error(zio) == 0) {
970 				zio->io_error = 0;
971 				atomic_inc_64(&raidz_corrected_pq);
972 
973 				goto done;
974 			}
975 			break;
976 
977 		default:
978 			ASSERT(rm->rm_firstdatacol <= 2);
979 			ASSERT(0);
980 		}
981 	}
982 
983 	/*
984 	 * This isn't a typical situation -- either we got a read error or
985 	 * a child silently returned bad data. Read every block so we can
986 	 * try again with as much data and parity as we can track down. If
987 	 * we've already been through once before, all children will be marked
988 	 * as tried so we'll proceed to combinatorial reconstruction.
989 	 */
990 	unexpected_errors = 1;
991 	rm->rm_missingdata = 0;
992 	rm->rm_missingparity = 0;
993 
994 	for (c = 0; c < rm->rm_cols; c++) {
995 		if (rm->rm_col[c].rc_tried)
996 			continue;
997 
998 		zio->io_error = 0;
999 		zio_vdev_io_redone(zio);
1000 		do {
1001 			rc = &rm->rm_col[c];
1002 			if (rc->rc_tried)
1003 				continue;
1004 			zio_nowait(zio_vdev_child_io(zio, NULL,
1005 			    vd->vdev_child[rc->rc_devidx],
1006 			    rc->rc_offset, rc->rc_data, rc->rc_size,
1007 			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
1008 			    vdev_raidz_child_done, rc));
1009 		} while (++c < rm->rm_cols);
1010 		dprintf("rereading\n");
1011 		zio_wait_children_done(zio);
1012 		return;
1013 	}
1014 
1015 	/*
1016 	 * At this point we've attempted to reconstruct the data given the
1017 	 * errors we detected, and we've attempted to read all columns. There
1018 	 * must, therefore, be one or more additional problems -- silent errors
1019 	 * resulting in invalid data rather than explicit I/O errors resulting
1020 	 * in absent data. Before we attempt combinatorial reconstruction make
1021 	 * sure we have a chance of coming up with the right answer.
1022 	 */
1023 	if (zio->io_numerrors >= rm->rm_firstdatacol) {
1024 		ASSERT(zio->io_error != 0);
1025 		goto done;
1026 	}
1027 
1028 	if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
1029 		/*
1030 		 * Attempt to reconstruct the data from parity P.
1031 		 */
1032 		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1033 			void *orig;
1034 			rc = &rm->rm_col[c];
1035 
1036 			orig = zio_buf_alloc(rc->rc_size);
1037 			bcopy(rc->rc_data, orig, rc->rc_size);
1038 			vdev_raidz_reconstruct_p(rm, c);
1039 
1040 			if (zio_checksum_error(zio) == 0) {
1041 				zio_buf_free(orig, rc->rc_size);
1042 				zio->io_error = 0;
1043 				atomic_inc_64(&raidz_corrected_p);
1044 
1045 				/*
1046 				 * If this child didn't know that it returned
1047 				 * bad data, inform it.
1048 				 */
1049 				if (rc->rc_tried && rc->rc_error == 0)
1050 					raidz_checksum_error(zio, rc);
1051 				rc->rc_error = ECKSUM;
1052 				goto done;
1053 			}
1054 
1055 			bcopy(orig, rc->rc_data, rc->rc_size);
1056 			zio_buf_free(orig, rc->rc_size);
1057 		}
1058 	}
1059 
1060 	if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
1061 		/*
1062 		 * Attempt to reconstruct the data from parity Q.
1063 		 */
1064 		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1065 			void *orig;
1066 			rc = &rm->rm_col[c];
1067 
1068 			orig = zio_buf_alloc(rc->rc_size);
1069 			bcopy(rc->rc_data, orig, rc->rc_size);
1070 			vdev_raidz_reconstruct_q(rm, c);
1071 
1072 			if (zio_checksum_error(zio) == 0) {
1073 				zio_buf_free(orig, rc->rc_size);
1074 				zio->io_error = 0;
1075 				atomic_inc_64(&raidz_corrected_q);
1076 
1077 				/*
1078 				 * If this child didn't know that it returned
1079 				 * bad data, inform it.
1080 				 */
1081 				if (rc->rc_tried && rc->rc_error == 0)
1082 					raidz_checksum_error(zio, rc);
1083 				rc->rc_error = ECKSUM;
1084 				goto done;
1085 			}
1086 
1087 			bcopy(orig, rc->rc_data, rc->rc_size);
1088 			zio_buf_free(orig, rc->rc_size);
1089 		}
1090 	}
1091 
1092 	if (rm->rm_firstdatacol > 1 &&
1093 	    rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 &&
1094 	    rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
1095 		/*
1096 		 * Attempt to reconstruct the data from both P and Q.
1097 		 */
1098 		for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) {
1099 			void *orig, *orig1;
1100 			rc = &rm->rm_col[c];
1101 
1102 			orig = zio_buf_alloc(rc->rc_size);
1103 			bcopy(rc->rc_data, orig, rc->rc_size);
1104 
1105 			for (c1 = c + 1; c1 < rm->rm_cols; c1++) {
1106 				rc1 = &rm->rm_col[c1];
1107 
1108 				orig1 = zio_buf_alloc(rc1->rc_size);
1109 				bcopy(rc1->rc_data, orig1, rc1->rc_size);
1110 
1111 				vdev_raidz_reconstruct_pq(rm, c, c1);
1112 
1113 				if (zio_checksum_error(zio) == 0) {
1114 					zio_buf_free(orig, rc->rc_size);
1115 					zio_buf_free(orig1, rc1->rc_size);
1116 					zio->io_error = 0;
1117 					atomic_inc_64(&raidz_corrected_pq);
1118 
1119 					/*
1120 					 * If these children didn't know they
1121 					 * returned bad data, inform them.
1122 					 */
1123 					if (rc->rc_tried && rc->rc_error == 0)
1124 						raidz_checksum_error(zio, rc);
1125 					if (rc1->rc_tried && rc1->rc_error == 0)
1126 						raidz_checksum_error(zio, rc1);
1127 
1128 					rc->rc_error = ECKSUM;
1129 					rc1->rc_error = ECKSUM;
1130 
1131 					goto done;
1132 				}
1133 
1134 				bcopy(orig1, rc1->rc_data, rc1->rc_size);
1135 				zio_buf_free(orig1, rc1->rc_size);
1136 			}
1137 
1138 			bcopy(orig, rc->rc_data, rc->rc_size);
1139 			zio_buf_free(orig, rc->rc_size);
1140 		}
1141 	}
1142 
1143 	/*
1144 	 * All combinations failed to checksum. Generate checksum ereports for
1145 	 * all children.
1146 	 */
1147 	zio->io_error = ECKSUM;
1148 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1149 		for (c = 0; c < rm->rm_cols; c++) {
1150 			rc = &rm->rm_col[c];
1151 			zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
1152 			    zio->io_spa, vd->vdev_child[rc->rc_devidx], zio,
1153 			    rc->rc_offset, rc->rc_size);
1154 		}
1155 	}
1156 
1157 done:
1158 	zio_checksum_verified(zio);
1159 
1160 	if (zio->io_error == 0 && (spa_mode & FWRITE) &&
1161 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
1162 		zio_t *rio;
1163 
1164 		/*
1165 		 * Use the good data we have in hand to repair damaged children.
1166 		 *
1167 		 * We issue all repair I/Os as children of 'rio' to arrange
1168 		 * that vdev_raidz_map_free(zio) will be invoked after all
1169 		 * repairs complete, but before we advance to the next stage.
1170 		 */
1171 		rio = zio_null(zio, zio->io_spa,
1172 		    vdev_raidz_repair_done, zio, ZIO_FLAG_CANFAIL);
1173 
1174 		for (c = 0; c < rm->rm_cols; c++) {
1175 			rc = &rm->rm_col[c];
1176 			cvd = vd->vdev_child[rc->rc_devidx];
1177 
1178 			if (rc->rc_error == 0)
1179 				continue;
1180 
1181 			dprintf("%s resilvered %s @ 0x%llx error %d\n",
1182 			    vdev_description(vd),
1183 			    vdev_description(cvd),
1184 			    zio->io_offset, rc->rc_error);
1185 
1186 			zio_nowait(zio_vdev_child_io(rio, NULL, cvd,
1187 			    rc->rc_offset, rc->rc_data, rc->rc_size,
1188 			    ZIO_TYPE_WRITE, zio->io_priority,
1189 			    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_DONT_PROPAGATE |
1190 			    ZIO_FLAG_CANFAIL, NULL, NULL));
1191 		}
1192 
1193 		zio_nowait(rio);
1194 		zio_wait_children_done(zio);
1195 		return;
1196 	}
1197 
1198 	vdev_raidz_map_free(zio);
1199 	zio_next_stage(zio);
1200 }
1201 
1202 static void
1203 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
1204 {
1205 	if (faulted > vd->vdev_nparity)
1206 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1207 		    VDEV_AUX_NO_REPLICAS);
1208 	else if (degraded + faulted != 0)
1209 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
1210 	else
1211 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
1212 }
1213 
1214 vdev_ops_t vdev_raidz_ops = {
1215 	vdev_raidz_open,
1216 	vdev_raidz_close,
1217 	vdev_raidz_asize,
1218 	vdev_raidz_io_start,
1219 	vdev_raidz_io_done,
1220 	vdev_raidz_state_change,
1221 	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
1222 	B_FALSE			/* not a leaf vdev */
1223 };
1224