xref: /illumos-gate/usr/src/uts/common/fs/zfs/vdev_raidz.c (revision 1a220b56b93ff1dc80855691548503117af4cc10)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/zfs_context.h>
30 #include <sys/spa.h>
31 #include <sys/vdev_impl.h>
32 #include <sys/zio.h>
33 #include <sys/zio_checksum.h>
34 #include <sys/fs/zfs.h>
35 #include <sys/fm/fs/zfs.h>
36 
37 /*
38  * Virtual device vector for RAID-Z.
39  *
40  * This vdev supports both single and double parity. For single parity, we
41  * use a simple XOR of all the data columns. For double parity, we use both
42  * the simple XOR as well as a technique described in "The mathematics of
43  * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8),
44  * over the integers expressable in a single byte. Briefly, the operations on
45  * the field are defined as follows:
46  *
47  *   o addition (+) is represented by a bitwise XOR
48  *   o subtraction (-) is therefore identical to addition: A + B = A - B
49  *   o multiplication of A by 2 is defined by the following bitwise expression:
50  *	(A * 2)_7 = A_6
51  *	(A * 2)_6 = A_5
52  *	(A * 2)_5 = A_4
53  *	(A * 2)_4 = A_3 + A_7
54  *	(A * 2)_3 = A_2 + A_7
55  *	(A * 2)_2 = A_1 + A_7
56  *	(A * 2)_1 = A_0
57  *	(A * 2)_0 = A_7
58  *
59  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
60  *
61  * Observe that any number in the field (except for 0) can be expressed as a
62  * power of 2 -- a generator for the field. We store a table of the powers of
63  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
64  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
65  * than field addition). The inverse of a field element A (A^-1) is A^254.
66  *
67  * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1,
68  * can be expressed by field operations:
69  *
70  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
71  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
72  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
73  *
74  * See the reconstruction code below for how P and Q can used individually or
75  * in concert to recover missing data columns.
76  */
77 
78 typedef struct raidz_col {
79 	uint64_t rc_devidx;		/* child device index for I/O */
80 	uint64_t rc_offset;		/* device offset */
81 	uint64_t rc_size;		/* I/O size */
82 	void *rc_data;			/* I/O data */
83 	int rc_error;			/* I/O error for this device */
84 	uint8_t rc_tried;		/* Did we attempt this I/O column? */
85 	uint8_t rc_skipped;		/* Did we skip this I/O column? */
86 } raidz_col_t;
87 
88 typedef struct raidz_map {
89 	uint64_t rm_cols;		/* Column count */
90 	uint64_t rm_bigcols;		/* Number of oversized columns */
91 	uint64_t rm_asize;		/* Actual total I/O size */
92 	uint64_t rm_missingdata;	/* Count of missing data devices */
93 	uint64_t rm_missingparity;	/* Count of missing parity devices */
94 	uint64_t rm_firstdatacol;	/* First data column/parity count */
95 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
96 } raidz_map_t;
97 
98 #define	VDEV_RAIDZ_P		0
99 #define	VDEV_RAIDZ_Q		1
100 
101 #define	VDEV_RAIDZ_MAXPARITY	2
102 
103 #define	VDEV_RAIDZ_MUL_2(a)	(((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0))
104 
105 /*
106  * These two tables represent powers and logs of 2 in the Galois field defined
107  * above. These values were computed by repeatedly multiplying by 2 as above.
108  */
109 static const uint8_t vdev_raidz_pow2[256] = {
110 	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
111 	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
112 	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
113 	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
114 	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
115 	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
116 	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
117 	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
118 	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
119 	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
120 	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
121 	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
122 	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
123 	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
124 	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
125 	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
126 	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
127 	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
128 	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
129 	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
130 	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
131 	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
132 	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
133 	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
134 	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
135 	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
136 	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
137 	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
138 	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
139 	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
140 	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
141 	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
142 };
143 static const uint8_t vdev_raidz_log2[256] = {
144 	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
145 	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
146 	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
147 	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
148 	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
149 	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
150 	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
151 	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
152 	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
153 	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
154 	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
155 	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
156 	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
157 	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
158 	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
159 	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
160 	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
161 	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
162 	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
163 	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
164 	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
165 	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
166 	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
167 	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
168 	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
169 	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
170 	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
171 	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
172 	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
173 	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
174 	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
175 	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
176 };
177 
178 /*
179  * Multiply a given number by 2 raised to the given power.
180  */
181 static uint8_t
182 vdev_raidz_exp2(uint_t a, int exp)
183 {
184 	if (a == 0)
185 		return (0);
186 
187 	ASSERT(exp >= 0);
188 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
189 
190 	exp += vdev_raidz_log2[a];
191 	if (exp > 255)
192 		exp -= 255;
193 
194 	return (vdev_raidz_pow2[exp]);
195 }
196 
197 static raidz_map_t *
198 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
199     uint64_t nparity)
200 {
201 	raidz_map_t *rm;
202 	uint64_t b = zio->io_offset >> unit_shift;
203 	uint64_t s = zio->io_size >> unit_shift;
204 	uint64_t f = b % dcols;
205 	uint64_t o = (b / dcols) << unit_shift;
206 	uint64_t q, r, c, bc, col, acols, coff, devidx;
207 
208 	q = s / (dcols - nparity);
209 	r = s - q * (dcols - nparity);
210 	bc = (r == 0 ? 0 : r + nparity);
211 
212 	acols = (q == 0 ? bc : dcols);
213 
214 	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);
215 
216 	rm->rm_cols = acols;
217 	rm->rm_bigcols = bc;
218 	rm->rm_asize = 0;
219 	rm->rm_missingdata = 0;
220 	rm->rm_missingparity = 0;
221 	rm->rm_firstdatacol = nparity;
222 
223 	for (c = 0; c < acols; c++) {
224 		col = f + c;
225 		coff = o;
226 		if (col >= dcols) {
227 			col -= dcols;
228 			coff += 1ULL << unit_shift;
229 		}
230 		rm->rm_col[c].rc_devidx = col;
231 		rm->rm_col[c].rc_offset = coff;
232 		rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
233 		rm->rm_col[c].rc_data = NULL;
234 		rm->rm_col[c].rc_error = 0;
235 		rm->rm_col[c].rc_tried = 0;
236 		rm->rm_col[c].rc_skipped = 0;
237 		rm->rm_asize += rm->rm_col[c].rc_size;
238 	}
239 
240 	rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift);
241 
242 	for (c = 0; c < rm->rm_firstdatacol; c++)
243 		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
244 
245 	rm->rm_col[c].rc_data = zio->io_data;
246 
247 	for (c = c + 1; c < acols; c++)
248 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
249 		    rm->rm_col[c - 1].rc_size;
250 
251 	/*
252 	 * If all data stored spans all columns, there's a danger that parity
253 	 * will always be on the same device and, since parity isn't read
254 	 * during normal operation, that that device's I/O bandwidth won't be
255 	 * used effectively. We therefore switch the parity every 1MB.
256 	 *
257 	 * ... at least that was, ostensibly, the theory. As a practical
258 	 * matter unless we juggle the parity between all devices evenly, we
259 	 * won't see any benefit. Further, occasional writes that aren't a
260 	 * multiple of the LCM of the number of children and the minimum
261 	 * stripe width are sufficient to avoid pessimal behavior.
262 	 * Unfortunately, this decision created an implicit on-disk format
263 	 * requirement that we need to support for all eternity (but only for
264 	 * RAID-Z with one parity device).
265 	 */
266 	ASSERT(rm->rm_cols >= 2);
267 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
268 
269 	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
270 		devidx = rm->rm_col[0].rc_devidx;
271 		o = rm->rm_col[0].rc_offset;
272 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
273 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
274 		rm->rm_col[1].rc_devidx = devidx;
275 		rm->rm_col[1].rc_offset = o;
276 	}
277 
278 	zio->io_vsd = rm;
279 	return (rm);
280 }
281 
282 static void
283 vdev_raidz_map_free(zio_t *zio)
284 {
285 	raidz_map_t *rm = zio->io_vsd;
286 	int c;
287 
288 	for (c = 0; c < rm->rm_firstdatacol; c++)
289 		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
290 
291 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
292 	zio->io_vsd = NULL;
293 }
294 
295 static void
296 vdev_raidz_generate_parity_p(raidz_map_t *rm)
297 {
298 	uint64_t *p, *src, pcount, ccount, i;
299 	int c;
300 
301 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
302 
303 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
304 		src = rm->rm_col[c].rc_data;
305 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
306 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
307 
308 		if (c == rm->rm_firstdatacol) {
309 			ASSERT(ccount == pcount);
310 			for (i = 0; i < ccount; i++, p++, src++) {
311 				*p = *src;
312 			}
313 		} else {
314 			ASSERT(ccount <= pcount);
315 			for (i = 0; i < ccount; i++, p++, src++) {
316 				*p ^= *src;
317 			}
318 		}
319 	}
320 }
321 
322 static void
323 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
324 {
325 	uint64_t *q, *p, *src, pcount, ccount, mask, i;
326 	int c;
327 
328 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
329 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
330 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
331 
332 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
333 		src = rm->rm_col[c].rc_data;
334 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
335 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
336 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
337 
338 		if (c == rm->rm_firstdatacol) {
339 			ASSERT(ccount == pcount || ccount == 0);
340 			for (i = 0; i < ccount; i++, p++, q++, src++) {
341 				*q = *src;
342 				*p = *src;
343 			}
344 			for (; i < pcount; i++, p++, q++, src++) {
345 				*q = 0;
346 				*p = 0;
347 			}
348 		} else {
349 			ASSERT(ccount <= pcount);
350 
351 			/*
352 			 * Rather than multiplying each byte individually (as
353 			 * described above), we are able to handle 8 at once
354 			 * by generating a mask based on the high bit in each
355 			 * byte and using that to conditionally XOR in 0x1d.
356 			 */
357 			for (i = 0; i < ccount; i++, p++, q++, src++) {
358 				mask = *q & 0x8080808080808080ULL;
359 				mask = (mask << 1) - (mask >> 7);
360 				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
361 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
362 				*q ^= *src;
363 				*p ^= *src;
364 			}
365 
366 			/*
367 			 * Treat short columns as though they are full of 0s.
368 			 */
369 			for (; i < pcount; i++, q++) {
370 				mask = *q & 0x8080808080808080ULL;
371 				mask = (mask << 1) - (mask >> 7);
372 				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
373 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
374 			}
375 		}
376 	}
377 }
378 
379 static void
380 vdev_raidz_reconstruct_p(raidz_map_t *rm, int x)
381 {
382 	uint64_t *dst, *src, xcount, ccount, count, i;
383 	int c;
384 
385 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
386 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
387 	ASSERT(xcount > 0);
388 
389 	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
390 	dst = rm->rm_col[x].rc_data;
391 	for (i = 0; i < xcount; i++, dst++, src++) {
392 		*dst = *src;
393 	}
394 
395 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
396 		src = rm->rm_col[c].rc_data;
397 		dst = rm->rm_col[x].rc_data;
398 
399 		if (c == x)
400 			continue;
401 
402 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
403 		count = MIN(ccount, xcount);
404 
405 		for (i = 0; i < count; i++, dst++, src++) {
406 			*dst ^= *src;
407 		}
408 	}
409 }
410 
411 static void
412 vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
413 {
414 	uint64_t *dst, *src, xcount, ccount, count, mask, i;
415 	uint8_t *b;
416 	int c, j, exp;
417 
418 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
419 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
420 
421 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
422 		src = rm->rm_col[c].rc_data;
423 		dst = rm->rm_col[x].rc_data;
424 
425 		if (c == x)
426 			ccount = 0;
427 		else
428 			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
429 
430 		count = MIN(ccount, xcount);
431 
432 		if (c == rm->rm_firstdatacol) {
433 			for (i = 0; i < count; i++, dst++, src++) {
434 				*dst = *src;
435 			}
436 			for (; i < xcount; i++, dst++) {
437 				*dst = 0;
438 			}
439 
440 		} else {
441 			/*
442 			 * For an explanation of this, see the comment in
443 			 * vdev_raidz_generate_parity_pq() above.
444 			 */
445 			for (i = 0; i < count; i++, dst++, src++) {
446 				mask = *dst & 0x8080808080808080ULL;
447 				mask = (mask << 1) - (mask >> 7);
448 				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
449 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
450 				*dst ^= *src;
451 			}
452 
453 			for (; i < xcount; i++, dst++) {
454 				mask = *dst & 0x8080808080808080ULL;
455 				mask = (mask << 1) - (mask >> 7);
456 				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
457 				    (mask & 0x1d1d1d1d1d1d1d1dULL);
458 			}
459 		}
460 	}
461 
462 	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
463 	dst = rm->rm_col[x].rc_data;
464 	exp = 255 - (rm->rm_cols - 1 - x);
465 
466 	for (i = 0; i < xcount; i++, dst++, src++) {
467 		*dst ^= *src;
468 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
469 			*b = vdev_raidz_exp2(*b, exp);
470 		}
471 	}
472 }
473 
474 static void
475 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y)
476 {
477 	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
478 	void *pdata, *qdata;
479 	uint64_t xsize, ysize, i;
480 
481 	ASSERT(x < y);
482 	ASSERT(x >= rm->rm_firstdatacol);
483 	ASSERT(y < rm->rm_cols);
484 
485 	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
486 
487 	/*
488 	 * Move the parity data aside -- we're going to compute parity as
489 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
490 	 * reuse the parity generation mechanism without trashing the actual
491 	 * parity so we make those columns appear to be full of zeros by
492 	 * setting their lengths to zero.
493 	 */
494 	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
495 	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
496 	xsize = rm->rm_col[x].rc_size;
497 	ysize = rm->rm_col[y].rc_size;
498 
499 	rm->rm_col[VDEV_RAIDZ_P].rc_data =
500 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
501 	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
502 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
503 	rm->rm_col[x].rc_size = 0;
504 	rm->rm_col[y].rc_size = 0;
505 
506 	vdev_raidz_generate_parity_pq(rm);
507 
508 	rm->rm_col[x].rc_size = xsize;
509 	rm->rm_col[y].rc_size = ysize;
510 
511 	p = pdata;
512 	q = qdata;
513 	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
514 	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
515 	xd = rm->rm_col[x].rc_data;
516 	yd = rm->rm_col[y].rc_data;
517 
518 	/*
519 	 * We now have:
520 	 *	Pxy = P + D_x + D_y
521 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
522 	 *
523 	 * We can then solve for D_x:
524 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
525 	 * where
526 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
527 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
528 	 *
529 	 * With D_x in hand, we can easily solve for D_y:
530 	 *	D_y = P + Pxy + D_x
531 	 */
532 
533 	a = vdev_raidz_pow2[255 + x - y];
534 	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
535 	tmp = 255 - vdev_raidz_log2[a ^ 1];
536 
537 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
538 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
539 
540 	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
541 		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
542 		    vdev_raidz_exp2(*q ^ *qxy, bexp);
543 
544 		if (i < ysize)
545 			*yd = *p ^ *pxy ^ *xd;
546 	}
547 
548 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
549 	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
550 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
551 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
552 
553 	/*
554 	 * Restore the saved parity data.
555 	 */
556 	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
557 	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
558 }
559 
560 
561 static int
562 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
563 {
564 	vdev_t *cvd;
565 	uint64_t nparity = vd->vdev_nparity;
566 	int c, error;
567 	int lasterror = 0;
568 	int numerrors = 0;
569 
570 	ASSERT(nparity > 0);
571 
572 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
573 	    vd->vdev_children < nparity + 1) {
574 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
575 		return (EINVAL);
576 	}
577 
578 	for (c = 0; c < vd->vdev_children; c++) {
579 		cvd = vd->vdev_child[c];
580 
581 		if ((error = vdev_open(cvd)) != 0) {
582 			lasterror = error;
583 			numerrors++;
584 			continue;
585 		}
586 
587 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
588 		*ashift = MAX(*ashift, cvd->vdev_ashift);
589 	}
590 
591 	*asize *= vd->vdev_children;
592 
593 	if (numerrors > nparity) {
594 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
595 		return (lasterror);
596 	}
597 
598 	return (0);
599 }
600 
601 static void
602 vdev_raidz_close(vdev_t *vd)
603 {
604 	int c;
605 
606 	for (c = 0; c < vd->vdev_children; c++)
607 		vdev_close(vd->vdev_child[c]);
608 }
609 
610 static uint64_t
611 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
612 {
613 	uint64_t asize;
614 	uint64_t ashift = vd->vdev_top->vdev_ashift;
615 	uint64_t cols = vd->vdev_children;
616 	uint64_t nparity = vd->vdev_nparity;
617 
618 	asize = ((psize - 1) >> ashift) + 1;
619 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
620 	asize = roundup(asize, nparity + 1) << ashift;
621 
622 	return (asize);
623 }
624 
625 static void
626 vdev_raidz_child_done(zio_t *zio)
627 {
628 	raidz_col_t *rc = zio->io_private;
629 
630 	rc->rc_error = zio->io_error;
631 	rc->rc_tried = 1;
632 	rc->rc_skipped = 0;
633 }
634 
635 static void
636 vdev_raidz_repair_done(zio_t *zio)
637 {
638 	ASSERT(zio->io_private == zio->io_parent);
639 	vdev_raidz_map_free(zio->io_private);
640 }
641 
642 static void
643 vdev_raidz_io_start(zio_t *zio)
644 {
645 	vdev_t *vd = zio->io_vd;
646 	vdev_t *tvd = vd->vdev_top;
647 	vdev_t *cvd;
648 	blkptr_t *bp = zio->io_bp;
649 	raidz_map_t *rm;
650 	raidz_col_t *rc;
651 	int c;
652 
653 	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
654 	    vd->vdev_nparity);
655 
656 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
657 
658 	if (zio->io_type == ZIO_TYPE_WRITE) {
659 		/*
660 		 * Generate RAID parity in the first virtual columns.
661 		 */
662 		if (rm->rm_firstdatacol == 1)
663 			vdev_raidz_generate_parity_p(rm);
664 		else
665 			vdev_raidz_generate_parity_pq(rm);
666 
667 		for (c = 0; c < rm->rm_cols; c++) {
668 			rc = &rm->rm_col[c];
669 			cvd = vd->vdev_child[rc->rc_devidx];
670 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
671 			    rc->rc_offset, rc->rc_data, rc->rc_size,
672 			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
673 			    vdev_raidz_child_done, rc));
674 		}
675 		zio_wait_children_done(zio);
676 		return;
677 	}
678 
679 	ASSERT(zio->io_type == ZIO_TYPE_READ);
680 
681 	/*
682 	 * Iterate over the columns in reverse order so that we hit the parity
683 	 * last -- any errors along the way will force us to read the parity
684 	 * data.
685 	 */
686 	for (c = rm->rm_cols - 1; c >= 0; c--) {
687 		rc = &rm->rm_col[c];
688 		cvd = vd->vdev_child[rc->rc_devidx];
689 		if (vdev_is_dead(cvd)) {
690 			if (c >= rm->rm_firstdatacol)
691 				rm->rm_missingdata++;
692 			else
693 				rm->rm_missingparity++;
694 			rc->rc_error = ENXIO;
695 			rc->rc_tried = 1;	/* don't even try */
696 			rc->rc_skipped = 1;
697 			continue;
698 		}
699 		if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
700 			if (c >= rm->rm_firstdatacol)
701 				rm->rm_missingdata++;
702 			else
703 				rm->rm_missingparity++;
704 			rc->rc_error = ESTALE;
705 			rc->rc_skipped = 1;
706 			continue;
707 		}
708 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
709 		    (zio->io_flags & ZIO_FLAG_SCRUB)) {
710 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
711 			    rc->rc_offset, rc->rc_data, rc->rc_size,
712 			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
713 			    vdev_raidz_child_done, rc));
714 		}
715 	}
716 
717 	zio_wait_children_done(zio);
718 }
719 
720 /*
721  * Report a checksum error for a child of a RAID-Z device.
722  */
723 static void
724 raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
725 {
726 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
727 	dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
728 	    vdev_description(vd));
729 
730 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
731 		mutex_enter(&vd->vdev_stat_lock);
732 		vd->vdev_stat.vs_checksum_errors++;
733 		mutex_exit(&vd->vdev_stat_lock);
734 	}
735 
736 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
737 		zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
738 		    zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
739 }
740 
741 /*
742  * Generate the parity from the data columns. If we tried and were able to
743  * read the parity without error, verify that the generated parity matches the
744  * data we read. If it doesn't, we fire off a checksum error. Return the
745  * number such failures.
746  */
747 static int
748 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
749 {
750 	void *orig[VDEV_RAIDZ_MAXPARITY];
751 	int c, ret = 0;
752 	raidz_col_t *rc;
753 
754 	for (c = 0; c < rm->rm_firstdatacol; c++) {
755 		rc = &rm->rm_col[c];
756 		if (!rc->rc_tried || rc->rc_error != 0)
757 			continue;
758 		orig[c] = zio_buf_alloc(rc->rc_size);
759 		bcopy(rc->rc_data, orig[c], rc->rc_size);
760 	}
761 
762 	if (rm->rm_firstdatacol == 1)
763 		vdev_raidz_generate_parity_p(rm);
764 	else
765 		vdev_raidz_generate_parity_pq(rm);
766 
767 	for (c = 0; c < rm->rm_firstdatacol; c++) {
768 		rc = &rm->rm_col[c];
769 		if (!rc->rc_tried || rc->rc_error != 0)
770 			continue;
771 		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
772 			raidz_checksum_error(zio, rc);
773 			rc->rc_error = ECKSUM;
774 			ret++;
775 		}
776 		zio_buf_free(orig[c], rc->rc_size);
777 	}
778 
779 	return (ret);
780 }
781 
782 static uint64_t raidz_corrected_p;
783 static uint64_t raidz_corrected_q;
784 static uint64_t raidz_corrected_pq;
785 
786 static void
787 vdev_raidz_io_done(zio_t *zio)
788 {
789 	vdev_t *vd = zio->io_vd;
790 	vdev_t *cvd;
791 	raidz_map_t *rm = zio->io_vsd;
792 	raidz_col_t *rc, *rc1;
793 	int unexpected_errors = 0;
794 	int parity_errors = 0;
795 	int data_errors = 0;
796 	int n, c, c1;
797 
798 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
799 
800 	zio->io_error = 0;
801 	zio->io_numerrors = 0;
802 
803 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
804 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
805 
806 	for (c = 0; c < rm->rm_cols; c++) {
807 		rc = &rm->rm_col[c];
808 
809 		/*
810 		 * We preserve any EIOs because those may be worth retrying;
811 		 * whereas ECKSUM and ENXIO are more likely to be persistent.
812 		 */
813 		if (rc->rc_error) {
814 			if (zio->io_error != EIO)
815 				zio->io_error = rc->rc_error;
816 
817 			if (c < rm->rm_firstdatacol)
818 				parity_errors++;
819 			else
820 				data_errors++;
821 
822 			if (!rc->rc_skipped)
823 				unexpected_errors++;
824 
825 			zio->io_numerrors++;
826 		}
827 	}
828 
829 	if (zio->io_type == ZIO_TYPE_WRITE) {
830 		/*
831 		 * If this is not a failfast write, and we were able to
832 		 * write enough columns to reconstruct the data, good enough.
833 		 */
834 		/* XXPOLICY */
835 		if (zio->io_numerrors <= rm->rm_firstdatacol &&
836 		    !(zio->io_flags & ZIO_FLAG_FAILFAST))
837 			zio->io_error = 0;
838 
839 		vdev_raidz_map_free(zio);
840 		zio_next_stage(zio);
841 		return;
842 	}
843 
844 	ASSERT(zio->io_type == ZIO_TYPE_READ);
845 	/*
846 	 * There are three potential phases for a read:
847 	 *	1. produce valid data from the columns read
848 	 *	2. read all disks and try again
849 	 *	3. perform combinatorial reconstruction
850 	 *
851 	 * Each phase is progressively both more expensive and less likely to
852 	 * occur. If we encounter more errors than we can repair or all phases
853 	 * fail, we have no choice but to return an error.
854 	 */
855 
856 	/*
857 	 * If the number of errors we saw was correctable -- less than or equal
858 	 * to the number of parity disks -- attempt to produce data that has a
859 	 * valid checksum. Naturally, zero errors falls into this case.
860 	 */
861 	if (zio->io_numerrors <= rm->rm_firstdatacol) {
862 		switch (data_errors) {
863 		case 0:
864 			if (zio_checksum_error(zio) == 0) {
865 				zio->io_error = 0;
866 				n = raidz_parity_verify(zio, rm);
867 				unexpected_errors += n;
868 				ASSERT(parity_errors + n <=
869 				    rm->rm_firstdatacol);
870 				goto done;
871 			}
872 			break;
873 
874 		case 1:
875 			ASSERT(parity_errors < rm->rm_firstdatacol);
876 
877 			/*
878 			 * Find the column that reported the error.
879 			 */
880 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
881 				rc = &rm->rm_col[c];
882 				if (rc->rc_error != 0)
883 					break;
884 			}
885 			ASSERT(c != rm->rm_cols);
886 			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
887 			    rc->rc_error == ESTALE);
888 
889 			if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
890 				vdev_raidz_reconstruct_p(rm, c);
891 			} else {
892 				ASSERT(rm->rm_firstdatacol > 1);
893 				vdev_raidz_reconstruct_q(rm, c);
894 			}
895 
896 			if (zio_checksum_error(zio) == 0) {
897 				zio->io_error = 0;
898 				if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0)
899 					atomic_inc_64(&raidz_corrected_p);
900 				else
901 					atomic_inc_64(&raidz_corrected_q);
902 
903 				/*
904 				 * If there's more than one parity disk,
905 				 * confirm that the parity disk not used above
906 				 * has the correct data.
907 				 */
908 				if (rm->rm_firstdatacol > 1) {
909 					n = raidz_parity_verify(zio, rm);
910 					unexpected_errors += n;
911 					ASSERT(parity_errors + n <=
912 					    rm->rm_firstdatacol);
913 				}
914 
915 				goto done;
916 			}
917 			break;
918 
919 		case 2:
920 			/*
921 			 * Find the two columns that reported errors.
922 			 */
923 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
924 				rc = &rm->rm_col[c];
925 				if (rc->rc_error != 0)
926 					break;
927 			}
928 			ASSERT(c != rm->rm_cols);
929 			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
930 			    rc->rc_error == ESTALE);
931 
932 			for (c1 = c++; c < rm->rm_cols; c++) {
933 				rc = &rm->rm_col[c];
934 				if (rc->rc_error != 0)
935 					break;
936 			}
937 			ASSERT(c != rm->rm_cols);
938 			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
939 			    rc->rc_error == ESTALE);
940 
941 			vdev_raidz_reconstruct_pq(rm, c1, c);
942 
943 			if (zio_checksum_error(zio) == 0) {
944 				zio->io_error = 0;
945 				atomic_inc_64(&raidz_corrected_pq);
946 
947 				goto done;
948 			}
949 			break;
950 
951 		default:
952 			ASSERT(rm->rm_firstdatacol <= 2);
953 			ASSERT(0);
954 		}
955 	}
956 
957 	/*
958 	 * This isn't a typical situation -- either we got a read error or
959 	 * a child silently returned bad data. Read every block so we can
960 	 * try again with as much data and parity as we can track down. If
961 	 * we've already been through once before, all children will be marked
962 	 * as tried so we'll proceed to combinatorial reconstruction.
963 	 */
964 	unexpected_errors = 1;
965 	rm->rm_missingdata = 0;
966 	rm->rm_missingparity = 0;
967 
968 	for (c = 0; c < rm->rm_cols; c++) {
969 		if (rm->rm_col[c].rc_tried)
970 			continue;
971 
972 		zio->io_error = 0;
973 		zio_vdev_io_redone(zio);
974 		do {
975 			rc = &rm->rm_col[c];
976 			if (rc->rc_tried)
977 				continue;
978 			zio_nowait(zio_vdev_child_io(zio, NULL,
979 			    vd->vdev_child[rc->rc_devidx],
980 			    rc->rc_offset, rc->rc_data, rc->rc_size,
981 			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
982 			    vdev_raidz_child_done, rc));
983 		} while (++c < rm->rm_cols);
984 		dprintf("rereading\n");
985 		zio_wait_children_done(zio);
986 		return;
987 	}
988 
989 	/*
990 	 * At this point we've attempted to reconstruct the data given the
991 	 * errors we detected, and we've attempted to read all columns. There
992 	 * must, therefore, be one or more additional problems -- silent errors
993 	 * resulting in invalid data rather than explicit I/O errors resulting
994 	 * in absent data. Before we attempt combinatorial reconstruction make
995 	 * sure we have a chance of coming up with the right answer.
996 	 */
997 	if (zio->io_numerrors >= rm->rm_firstdatacol) {
998 		ASSERT(zio->io_error != 0);
999 		goto done;
1000 	}
1001 
1002 	if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
1003 		/*
1004 		 * Attempt to reconstruct the data from parity P.
1005 		 */
1006 		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1007 			void *orig;
1008 			rc = &rm->rm_col[c];
1009 
1010 			orig = zio_buf_alloc(rc->rc_size);
1011 			bcopy(rc->rc_data, orig, rc->rc_size);
1012 			vdev_raidz_reconstruct_p(rm, c);
1013 
1014 			if (zio_checksum_error(zio) == 0) {
1015 				zio_buf_free(orig, rc->rc_size);
1016 				zio->io_error = 0;
1017 				atomic_inc_64(&raidz_corrected_p);
1018 
1019 				/*
1020 				 * If this child didn't know that it returned
1021 				 * bad data, inform it.
1022 				 */
1023 				if (rc->rc_tried && rc->rc_error == 0)
1024 					raidz_checksum_error(zio, rc);
1025 				rc->rc_error = ECKSUM;
1026 				goto done;
1027 			}
1028 
1029 			bcopy(orig, rc->rc_data, rc->rc_size);
1030 			zio_buf_free(orig, rc->rc_size);
1031 		}
1032 	}
1033 
1034 	if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
1035 		/*
1036 		 * Attempt to reconstruct the data from parity Q.
1037 		 */
1038 		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1039 			void *orig;
1040 			rc = &rm->rm_col[c];
1041 
1042 			orig = zio_buf_alloc(rc->rc_size);
1043 			bcopy(rc->rc_data, orig, rc->rc_size);
1044 			vdev_raidz_reconstruct_q(rm, c);
1045 
1046 			if (zio_checksum_error(zio) == 0) {
1047 				zio_buf_free(orig, rc->rc_size);
1048 				zio->io_error = 0;
1049 				atomic_inc_64(&raidz_corrected_q);
1050 
1051 				/*
1052 				 * If this child didn't know that it returned
1053 				 * bad data, inform it.
1054 				 */
1055 				if (rc->rc_tried && rc->rc_error == 0)
1056 					raidz_checksum_error(zio, rc);
1057 				rc->rc_error = ECKSUM;
1058 				goto done;
1059 			}
1060 
1061 			bcopy(orig, rc->rc_data, rc->rc_size);
1062 			zio_buf_free(orig, rc->rc_size);
1063 		}
1064 	}
1065 
1066 	if (rm->rm_firstdatacol > 1 &&
1067 	    rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 &&
1068 	    rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
1069 		/*
1070 		 * Attempt to reconstruct the data from both P and Q.
1071 		 */
1072 		for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) {
1073 			void *orig, *orig1;
1074 			rc = &rm->rm_col[c];
1075 
1076 			orig = zio_buf_alloc(rc->rc_size);
1077 			bcopy(rc->rc_data, orig, rc->rc_size);
1078 
1079 			for (c1 = c + 1; c1 < rm->rm_cols; c1++) {
1080 				rc1 = &rm->rm_col[c1];
1081 
1082 				orig1 = zio_buf_alloc(rc1->rc_size);
1083 				bcopy(rc1->rc_data, orig1, rc1->rc_size);
1084 
1085 				vdev_raidz_reconstruct_pq(rm, c, c1);
1086 
1087 				if (zio_checksum_error(zio) == 0) {
1088 					zio_buf_free(orig, rc->rc_size);
1089 					zio_buf_free(orig1, rc1->rc_size);
1090 					zio->io_error = 0;
1091 					atomic_inc_64(&raidz_corrected_pq);
1092 
1093 					/*
1094 					 * If these children didn't know they
1095 					 * returned bad data, inform them.
1096 					 */
1097 					if (rc->rc_tried && rc->rc_error == 0)
1098 						raidz_checksum_error(zio, rc);
1099 					if (rc1->rc_tried && rc1->rc_error == 0)
1100 						raidz_checksum_error(zio, rc1);
1101 
1102 					rc->rc_error = ECKSUM;
1103 					rc1->rc_error = ECKSUM;
1104 
1105 					goto done;
1106 				}
1107 
1108 				bcopy(orig1, rc1->rc_data, rc1->rc_size);
1109 				zio_buf_free(orig1, rc1->rc_size);
1110 			}
1111 
1112 			bcopy(orig, rc->rc_data, rc->rc_size);
1113 			zio_buf_free(orig, rc->rc_size);
1114 		}
1115 	}
1116 
1117 	/*
1118 	 * All combinations failed to checksum. Generate checksum ereports for
1119 	 * all children.
1120 	 */
1121 	zio->io_error = ECKSUM;
1122 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1123 		for (c = 0; c < rm->rm_cols; c++) {
1124 			rc = &rm->rm_col[c];
1125 			zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
1126 			    zio->io_spa, vd->vdev_child[rc->rc_devidx], zio,
1127 			    rc->rc_offset, rc->rc_size);
1128 		}
1129 	}
1130 
1131 done:
1132 	zio_checksum_verified(zio);
1133 
1134 	if (zio->io_error == 0 && (spa_mode & FWRITE) &&
1135 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
1136 		zio_t *rio;
1137 
1138 		/*
1139 		 * Use the good data we have in hand to repair damaged children.
1140 		 *
1141 		 * We issue all repair I/Os as children of 'rio' to arrange
1142 		 * that vdev_raidz_map_free(zio) will be invoked after all
1143 		 * repairs complete, but before we advance to the next stage.
1144 		 */
1145 		rio = zio_null(zio, zio->io_spa,
1146 		    vdev_raidz_repair_done, zio, ZIO_FLAG_CANFAIL);
1147 
1148 		for (c = 0; c < rm->rm_cols; c++) {
1149 			rc = &rm->rm_col[c];
1150 			cvd = vd->vdev_child[rc->rc_devidx];
1151 
1152 			if (rc->rc_error == 0)
1153 				continue;
1154 
1155 			dprintf("%s resilvered %s @ 0x%llx error %d\n",
1156 			    vdev_description(vd),
1157 			    vdev_description(cvd),
1158 			    zio->io_offset, rc->rc_error);
1159 
1160 			zio_nowait(zio_vdev_child_io(rio, NULL, cvd,
1161 			    rc->rc_offset, rc->rc_data, rc->rc_size,
1162 			    ZIO_TYPE_WRITE, zio->io_priority,
1163 			    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_DONT_PROPAGATE |
1164 			    ZIO_FLAG_CANFAIL, NULL, NULL));
1165 		}
1166 
1167 		zio_nowait(rio);
1168 		zio_wait_children_done(zio);
1169 		return;
1170 	}
1171 
1172 	vdev_raidz_map_free(zio);
1173 	zio_next_stage(zio);
1174 }
1175 
1176 static void
1177 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
1178 {
1179 	if (faulted > vd->vdev_nparity)
1180 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1181 		    VDEV_AUX_NO_REPLICAS);
1182 	else if (degraded + faulted != 0)
1183 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
1184 	else
1185 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
1186 }
1187 
1188 vdev_ops_t vdev_raidz_ops = {
1189 	vdev_raidz_open,
1190 	vdev_raidz_close,
1191 	vdev_raidz_asize,
1192 	vdev_raidz_io_start,
1193 	vdev_raidz_io_done,
1194 	vdev_raidz_state_change,
1195 	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
1196 	B_FALSE			/* not a leaf vdev */
1197 };
1198