xref: /titanic_51/usr/src/uts/common/fs/zfs/vdev_raidz.c (revision ab8b4e5c888370036603c72719799620ea5c6b77)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/zfs_context.h>
28 #include <sys/spa.h>
29 #include <sys/vdev_impl.h>
30 #include <sys/zio.h>
31 #include <sys/zio_checksum.h>
32 #include <sys/fs/zfs.h>
33 #include <sys/fm/fs/zfs.h>
34 
35 /*
36  * Virtual device vector for RAID-Z.
37  *
38  * This vdev supports single, double, and triple parity. For single parity,
39  * we use a simple XOR of all the data columns. For double or triple parity,
40  * we use a special case of Reed-Solomon coding. This extends the
41  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
42  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
43  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
44  * former is also based. The latter is designed to provide higher performance
45  * for writes.
46  *
47  * Note that the Plank paper claimed to support arbitrary N+M, but was then
48  * amended six years later identifying a critical flaw that invalidates its
49  * claims. Nevertheless, the technique can be adapted to work for up to
50  * triple parity. For additional parity, the amendment "Note: Correction to
51  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
52  * is viable, but the additional complexity means that write performance will
53  * suffer.
54  *
55  * All of the methods above operate on a Galois field, defined over the
56  * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
57  * can be expressed with a single byte. Briefly, the operations on the
58  * field are defined as follows:
59  *
60  *   o addition (+) is represented by a bitwise XOR
61  *   o subtraction (-) is therefore identical to addition: A + B = A - B
62  *   o multiplication of A by 2 is defined by the following bitwise expression:
63  *	(A * 2)_7 = A_6
64  *	(A * 2)_6 = A_5
65  *	(A * 2)_5 = A_4
66  *	(A * 2)_4 = A_3 + A_7
67  *	(A * 2)_3 = A_2 + A_7
68  *	(A * 2)_2 = A_1 + A_7
69  *	(A * 2)_1 = A_0
70  *	(A * 2)_0 = A_7
71  *
72  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
73  * As an aside, this multiplication is derived from the error correcting
74  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
75  *
76  * Observe that any number in the field (except for 0) can be expressed as a
77  * power of 2 -- a generator for the field. We store a table of the powers of
78  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
79  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
80  * than field addition). The inverse of a field element A (A^-1) is therefore
81  * A ^ (255 - 1) = A^254.
82  *
83  * The up-to-three parity columns, P, Q, R over several data columns,
84  * D_0, ... D_n-1, can be expressed by field operations:
85  *
86  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
87  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
88  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
89  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
90  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
91  *
92  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival
93  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
94  * independent coefficients. (There are no additional coefficients that have
95  * this property which is why the uncorrected Plank method breaks down.)
96  *
97  * See the reconstruction code below for how P, Q and R can used individually
98  * or in concert to recover missing data columns.
99  */
100 
101 typedef struct raidz_col {
102 	uint64_t rc_devidx;		/* child device index for I/O */
103 	uint64_t rc_offset;		/* device offset */
104 	uint64_t rc_size;		/* I/O size */
105 	void *rc_data;			/* I/O data */
106 	int rc_error;			/* I/O error for this device */
107 	uint8_t rc_tried;		/* Did we attempt this I/O column? */
108 	uint8_t rc_skipped;		/* Did we skip this I/O column? */
109 } raidz_col_t;
110 
111 typedef struct raidz_map {
112 	uint64_t rm_cols;		/* Regular column count */
113 	uint64_t rm_scols;		/* Count including skipped columns */
114 	uint64_t rm_bigcols;		/* Number of oversized columns */
115 	uint64_t rm_asize;		/* Actual total I/O size */
116 	uint64_t rm_missingdata;	/* Count of missing data devices */
117 	uint64_t rm_missingparity;	/* Count of missing parity devices */
118 	uint64_t rm_firstdatacol;	/* First data column/parity count */
119 	uint64_t rm_skipped;		/* Skipped sectors for padding */
120 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
121 } raidz_map_t;
122 
123 #define	VDEV_RAIDZ_P		0
124 #define	VDEV_RAIDZ_Q		1
125 #define	VDEV_RAIDZ_R		2
126 #define	VDEV_RAIDZ_MAXPARITY	3
127 
128 #define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
129 #define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
130 
131 /*
132  * We provide a mechanism to perform the field multiplication operation on a
133  * 64-bit value all at once rather than a byte at a time. This works by
134  * creating a mask from the top bit in each byte and using that to
135  * conditionally apply the XOR of 0x1d.
136  */
137 #define	VDEV_RAIDZ_64MUL_2(x, mask) \
138 { \
139 	(mask) = (x) & 0x8080808080808080ULL; \
140 	(mask) = ((mask) << 1) - ((mask) >> 7); \
141 	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
142 	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
143 }
144 
145 #define	VDEV_RAIDZ_64MUL_4(x, mask) \
146 { \
147 	VDEV_RAIDZ_64MUL_2((x), mask); \
148 	VDEV_RAIDZ_64MUL_2((x), mask); \
149 }
150 
151 /*
152  * Force reconstruction to use the general purpose method.
153  */
154 int vdev_raidz_default_to_general;
155 
156 /*
157  * These two tables represent powers and logs of 2 in the Galois field defined
158  * above. These values were computed by repeatedly multiplying by 2 as above.
159  */
160 static const uint8_t vdev_raidz_pow2[256] = {
161 	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
162 	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
163 	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
164 	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
165 	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
166 	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
167 	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
168 	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
169 	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
170 	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
171 	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
172 	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
173 	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
174 	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
175 	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
176 	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
177 	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
178 	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
179 	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
180 	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
181 	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
182 	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
183 	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
184 	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
185 	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
186 	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
187 	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
188 	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
189 	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
190 	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
191 	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
192 	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
193 };
194 static const uint8_t vdev_raidz_log2[256] = {
195 	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
196 	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
197 	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
198 	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
199 	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
200 	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
201 	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
202 	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
203 	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
204 	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
205 	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
206 	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
207 	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
208 	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
209 	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
210 	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
211 	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
212 	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
213 	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
214 	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
215 	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
216 	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
217 	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
218 	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
219 	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
220 	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
221 	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
222 	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
223 	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
224 	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
225 	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
226 	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
227 };
228 
229 /*
230  * Multiply a given number by 2 raised to the given power.
231  */
232 static uint8_t
233 vdev_raidz_exp2(uint_t a, int exp)
234 {
235 	if (a == 0)
236 		return (0);
237 
238 	ASSERT(exp >= 0);
239 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
240 
241 	exp += vdev_raidz_log2[a];
242 	if (exp > 255)
243 		exp -= 255;
244 
245 	return (vdev_raidz_pow2[exp]);
246 }
247 
248 static void
249 vdev_raidz_map_free(zio_t *zio)
250 {
251 	raidz_map_t *rm = zio->io_vsd;
252 	int c;
253 
254 	for (c = 0; c < rm->rm_firstdatacol; c++)
255 		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
256 
257 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
258 }
259 
260 static raidz_map_t *
261 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
262     uint64_t nparity)
263 {
264 	raidz_map_t *rm;
265 	uint64_t b = zio->io_offset >> unit_shift;
266 	uint64_t s = zio->io_size >> unit_shift;
267 	uint64_t f = b % dcols;
268 	uint64_t o = (b / dcols) << unit_shift;
269 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
270 
271 	q = s / (dcols - nparity);
272 	r = s - q * (dcols - nparity);
273 	bc = (r == 0 ? 0 : r + nparity);
274 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
275 
276 	if (q == 0) {
277 		acols = bc;
278 		scols = MIN(dcols, roundup(bc, nparity + 1));
279 	} else {
280 		acols = dcols;
281 		scols = dcols;
282 	}
283 
284 	ASSERT3U(acols, <=, scols);
285 
286 	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
287 
288 	rm->rm_cols = acols;
289 	rm->rm_scols = scols;
290 	rm->rm_bigcols = bc;
291 	rm->rm_missingdata = 0;
292 	rm->rm_missingparity = 0;
293 	rm->rm_firstdatacol = nparity;
294 
295 	asize = 0;
296 
297 	for (c = 0; c < scols; c++) {
298 		col = f + c;
299 		coff = o;
300 		if (col >= dcols) {
301 			col -= dcols;
302 			coff += 1ULL << unit_shift;
303 		}
304 		rm->rm_col[c].rc_devidx = col;
305 		rm->rm_col[c].rc_offset = coff;
306 		rm->rm_col[c].rc_data = NULL;
307 		rm->rm_col[c].rc_error = 0;
308 		rm->rm_col[c].rc_tried = 0;
309 		rm->rm_col[c].rc_skipped = 0;
310 
311 		if (c >= acols)
312 			rm->rm_col[c].rc_size = 0;
313 		else if (c < bc)
314 			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
315 		else
316 			rm->rm_col[c].rc_size = q << unit_shift;
317 
318 		asize += rm->rm_col[c].rc_size;
319 	}
320 
321 	ASSERT3U(asize, ==, tot << unit_shift);
322 	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
323 	rm->rm_skipped = roundup(tot, nparity + 1) - tot;
324 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_skipped << unit_shift);
325 	ASSERT3U(rm->rm_skipped, <=, nparity);
326 
327 	for (c = 0; c < rm->rm_firstdatacol; c++)
328 		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
329 
330 	rm->rm_col[c].rc_data = zio->io_data;
331 
332 	for (c = c + 1; c < acols; c++)
333 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
334 		    rm->rm_col[c - 1].rc_size;
335 
336 	/*
337 	 * If all data stored spans all columns, there's a danger that parity
338 	 * will always be on the same device and, since parity isn't read
339 	 * during normal operation, that that device's I/O bandwidth won't be
340 	 * used effectively. We therefore switch the parity every 1MB.
341 	 *
342 	 * ... at least that was, ostensibly, the theory. As a practical
343 	 * matter unless we juggle the parity between all devices evenly, we
344 	 * won't see any benefit. Further, occasional writes that aren't a
345 	 * multiple of the LCM of the number of children and the minimum
346 	 * stripe width are sufficient to avoid pessimal behavior.
347 	 * Unfortunately, this decision created an implicit on-disk format
348 	 * requirement that we need to support for all eternity, but only
349 	 * for single-parity RAID-Z.
350 	 */
351 	ASSERT(rm->rm_cols >= 2);
352 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
353 
354 	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
355 		devidx = rm->rm_col[0].rc_devidx;
356 		o = rm->rm_col[0].rc_offset;
357 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
358 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
359 		rm->rm_col[1].rc_devidx = devidx;
360 		rm->rm_col[1].rc_offset = o;
361 	}
362 
363 	zio->io_vsd = rm;
364 	zio->io_vsd_free = vdev_raidz_map_free;
365 	return (rm);
366 }
367 
368 static void
369 vdev_raidz_generate_parity_p(raidz_map_t *rm)
370 {
371 	uint64_t *p, *src, pcount, ccount, i;
372 	int c;
373 
374 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
375 
376 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
377 		src = rm->rm_col[c].rc_data;
378 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
379 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
380 
381 		if (c == rm->rm_firstdatacol) {
382 			ASSERT(ccount == pcount);
383 			for (i = 0; i < ccount; i++, src++, p++) {
384 				*p = *src;
385 			}
386 		} else {
387 			ASSERT(ccount <= pcount);
388 			for (i = 0; i < ccount; i++, src++, p++) {
389 				*p ^= *src;
390 			}
391 		}
392 	}
393 }
394 
395 static void
396 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
397 {
398 	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
399 	int c;
400 
401 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
402 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
403 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
404 
405 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
406 		src = rm->rm_col[c].rc_data;
407 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
408 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
409 
410 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
411 
412 		if (c == rm->rm_firstdatacol) {
413 			ASSERT(ccnt == pcnt || ccnt == 0);
414 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
415 				*p = *src;
416 				*q = *src;
417 			}
418 			for (; i < pcnt; i++, src++, p++, q++) {
419 				*p = 0;
420 				*q = 0;
421 			}
422 		} else {
423 			ASSERT(ccnt <= pcnt);
424 
425 			/*
426 			 * Apply the algorithm described above by multiplying
427 			 * the previous result and adding in the new value.
428 			 */
429 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
430 				*p ^= *src;
431 
432 				VDEV_RAIDZ_64MUL_2(*q, mask);
433 				*q ^= *src;
434 			}
435 
436 			/*
437 			 * Treat short columns as though they are full of 0s.
438 			 * Note that there's therefore nothing needed for P.
439 			 */
440 			for (; i < pcnt; i++, q++) {
441 				VDEV_RAIDZ_64MUL_2(*q, mask);
442 			}
443 		}
444 	}
445 }
446 
447 static void
448 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
449 {
450 	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
451 	int c;
452 
453 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
454 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
455 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
456 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
457 	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
458 
459 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
460 		src = rm->rm_col[c].rc_data;
461 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
462 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
463 		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
464 
465 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
466 
467 		if (c == rm->rm_firstdatacol) {
468 			ASSERT(ccnt == pcnt || ccnt == 0);
469 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
470 				*p = *src;
471 				*q = *src;
472 				*r = *src;
473 			}
474 			for (; i < pcnt; i++, src++, p++, q++, r++) {
475 				*p = 0;
476 				*q = 0;
477 				*r = 0;
478 			}
479 		} else {
480 			ASSERT(ccnt <= pcnt);
481 
482 			/*
483 			 * Apply the algorithm described above by multiplying
484 			 * the previous result and adding in the new value.
485 			 */
486 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
487 				*p ^= *src;
488 
489 				VDEV_RAIDZ_64MUL_2(*q, mask);
490 				*q ^= *src;
491 
492 				VDEV_RAIDZ_64MUL_4(*r, mask);
493 				*r ^= *src;
494 			}
495 
496 			/*
497 			 * Treat short columns as though they are full of 0s.
498 			 * Note that there's therefore nothing needed for P.
499 			 */
500 			for (; i < pcnt; i++, q++, r++) {
501 				VDEV_RAIDZ_64MUL_2(*q, mask);
502 				VDEV_RAIDZ_64MUL_4(*r, mask);
503 			}
504 		}
505 	}
506 }
507 
508 /*
509  * Generate RAID parity in the first virtual columns according to the number of
510  * parity columns available.
511  */
512 static void
513 vdev_raidz_generate_parity(raidz_map_t *rm)
514 {
515 	switch (rm->rm_firstdatacol) {
516 	case 1:
517 		vdev_raidz_generate_parity_p(rm);
518 		break;
519 	case 2:
520 		vdev_raidz_generate_parity_pq(rm);
521 		break;
522 	case 3:
523 		vdev_raidz_generate_parity_pqr(rm);
524 		break;
525 	default:
526 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
527 	}
528 }
529 
530 static int
531 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
532 {
533 	uint64_t *dst, *src, xcount, ccount, count, i;
534 	int x = tgts[0];
535 	int c;
536 
537 	ASSERT(ntgts == 1);
538 	ASSERT(x >= rm->rm_firstdatacol);
539 	ASSERT(x < rm->rm_cols);
540 
541 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
542 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
543 	ASSERT(xcount > 0);
544 
545 	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
546 	dst = rm->rm_col[x].rc_data;
547 	for (i = 0; i < xcount; i++, dst++, src++) {
548 		*dst = *src;
549 	}
550 
551 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
552 		src = rm->rm_col[c].rc_data;
553 		dst = rm->rm_col[x].rc_data;
554 
555 		if (c == x)
556 			continue;
557 
558 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
559 		count = MIN(ccount, xcount);
560 
561 		for (i = 0; i < count; i++, dst++, src++) {
562 			*dst ^= *src;
563 		}
564 	}
565 
566 	return (1 << VDEV_RAIDZ_P);
567 }
568 
569 static int
570 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
571 {
572 	uint64_t *dst, *src, xcount, ccount, count, mask, i;
573 	uint8_t *b;
574 	int x = tgts[0];
575 	int c, j, exp;
576 
577 	ASSERT(ntgts == 1);
578 
579 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
580 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
581 
582 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
583 		src = rm->rm_col[c].rc_data;
584 		dst = rm->rm_col[x].rc_data;
585 
586 		if (c == x)
587 			ccount = 0;
588 		else
589 			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
590 
591 		count = MIN(ccount, xcount);
592 
593 		if (c == rm->rm_firstdatacol) {
594 			for (i = 0; i < count; i++, dst++, src++) {
595 				*dst = *src;
596 			}
597 			for (; i < xcount; i++, dst++) {
598 				*dst = 0;
599 			}
600 
601 		} else {
602 			for (i = 0; i < count; i++, dst++, src++) {
603 				VDEV_RAIDZ_64MUL_2(*dst, mask);
604 				*dst ^= *src;
605 			}
606 
607 			for (; i < xcount; i++, dst++) {
608 				VDEV_RAIDZ_64MUL_2(*dst, mask);
609 			}
610 		}
611 	}
612 
613 	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
614 	dst = rm->rm_col[x].rc_data;
615 	exp = 255 - (rm->rm_cols - 1 - x);
616 
617 	for (i = 0; i < xcount; i++, dst++, src++) {
618 		*dst ^= *src;
619 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
620 			*b = vdev_raidz_exp2(*b, exp);
621 		}
622 	}
623 
624 	return (1 << VDEV_RAIDZ_Q);
625 }
626 
627 static int
628 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
629 {
630 	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
631 	void *pdata, *qdata;
632 	uint64_t xsize, ysize, i;
633 	int x = tgts[0];
634 	int y = tgts[1];
635 
636 	ASSERT(ntgts == 2);
637 	ASSERT(x < y);
638 	ASSERT(x >= rm->rm_firstdatacol);
639 	ASSERT(y < rm->rm_cols);
640 
641 	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
642 
643 	/*
644 	 * Move the parity data aside -- we're going to compute parity as
645 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
646 	 * reuse the parity generation mechanism without trashing the actual
647 	 * parity so we make those columns appear to be full of zeros by
648 	 * setting their lengths to zero.
649 	 */
650 	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
651 	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
652 	xsize = rm->rm_col[x].rc_size;
653 	ysize = rm->rm_col[y].rc_size;
654 
655 	rm->rm_col[VDEV_RAIDZ_P].rc_data =
656 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
657 	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
658 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
659 	rm->rm_col[x].rc_size = 0;
660 	rm->rm_col[y].rc_size = 0;
661 
662 	vdev_raidz_generate_parity_pq(rm);
663 
664 	rm->rm_col[x].rc_size = xsize;
665 	rm->rm_col[y].rc_size = ysize;
666 
667 	p = pdata;
668 	q = qdata;
669 	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
670 	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
671 	xd = rm->rm_col[x].rc_data;
672 	yd = rm->rm_col[y].rc_data;
673 
674 	/*
675 	 * We now have:
676 	 *	Pxy = P + D_x + D_y
677 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
678 	 *
679 	 * We can then solve for D_x:
680 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
681 	 * where
682 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
683 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
684 	 *
685 	 * With D_x in hand, we can easily solve for D_y:
686 	 *	D_y = P + Pxy + D_x
687 	 */
688 
689 	a = vdev_raidz_pow2[255 + x - y];
690 	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
691 	tmp = 255 - vdev_raidz_log2[a ^ 1];
692 
693 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
694 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
695 
696 	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
697 		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
698 		    vdev_raidz_exp2(*q ^ *qxy, bexp);
699 
700 		if (i < ysize)
701 			*yd = *p ^ *pxy ^ *xd;
702 	}
703 
704 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
705 	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
706 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
707 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
708 
709 	/*
710 	 * Restore the saved parity data.
711 	 */
712 	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
713 	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
714 
715 	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
716 }
717 
718 /* BEGIN CSTYLED */
719 /*
720  * In the general case of reconstruction, we must solve the system of linear
721  * equations defined by the coeffecients used to generate parity as well as
722  * the contents of the data and parity disks. This can be expressed with
723  * vectors for the original data (D) and the actual data (d) and parity (p)
724  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
725  *
726  *            __   __                     __     __
727  *            |     |         __     __   |  p_0  |
728  *            |  V  |         |  D_0  |   | p_m-1 |
729  *            |     |    x    |   :   | = |  d_0  |
730  *            |  I  |         | D_n-1 |   |   :   |
731  *            |     |         ~~     ~~   | d_n-1 |
732  *            ~~   ~~                     ~~     ~~
733  *
734  * I is simply a square identity matrix of size n, and V is a vandermonde
735  * matrix defined by the coeffecients we chose for the various parity columns
736  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
737  * computation as well as linear separability.
738  *
739  *      __               __               __     __
740  *      |   1   ..  1 1 1 |               |  p_0  |
741  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
742  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
743  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
744  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
745  *      |   :       : : : |   |   :   |   |  d_2  |
746  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
747  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
748  *      |   0   ..  0 0 1 |               | d_n-1 |
749  *      ~~               ~~               ~~     ~~
750  *
751  * Note that I, V, d, and p are known. To compute D, we must invert the
752  * matrix and use the known data and parity values to reconstruct the unknown
753  * data values. We begin by removing the rows in V|I and d|p that correspond
754  * to failed or missing columns; we then make V|I square (n x n) and d|p
755  * sized n by removing rows corresponding to unused parity from the bottom up
756  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
757  * using Gauss-Jordan elimination. In the example below we use m=3 parity
758  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
759  *           __                               __
760  *           |  1   1   1   1   1   1   1   1  |
761  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
762  *           |  19 205 116  29  64  16  4   1  |      / /
763  *           |  1   0   0   0   0   0   0   0  |     / /
764  *           |  0   1   0   0   0   0   0   0  | <--' /
765  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
766  *           |  0   0   0   1   0   0   0   0  |
767  *           |  0   0   0   0   1   0   0   0  |
768  *           |  0   0   0   0   0   1   0   0  |
769  *           |  0   0   0   0   0   0   1   0  |
770  *           |  0   0   0   0   0   0   0   1  |
771  *           ~~                               ~~
772  *           __                               __
773  *           |  1   1   1   1   1   1   1   1  |
774  *           | 128  64  32  16  8   4   2   1  |
775  *           |  19 205 116  29  64  16  4   1  |
776  *           |  1   0   0   0   0   0   0   0  |
777  *           |  0   1   0   0   0   0   0   0  |
778  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
779  *           |  0   0   0   1   0   0   0   0  |
780  *           |  0   0   0   0   1   0   0   0  |
781  *           |  0   0   0   0   0   1   0   0  |
782  *           |  0   0   0   0   0   0   1   0  |
783  *           |  0   0   0   0   0   0   0   1  |
784  *           ~~                               ~~
785  *
786  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
787  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
788  * matrix is not singular.
789  * __                                                                 __
790  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
791  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
792  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
793  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
794  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
795  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
796  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
797  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
798  * ~~                                                                 ~~
799  * __                                                                 __
800  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
801  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
802  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
803  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
804  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
805  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
806  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
807  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
808  * ~~                                                                 ~~
809  * __                                                                 __
810  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
811  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
812  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
813  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
814  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
815  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
816  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
817  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
818  * ~~                                                                 ~~
819  * __                                                                 __
820  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
821  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
822  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
823  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
824  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
825  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
826  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
827  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
828  * ~~                                                                 ~~
829  * __                                                                 __
830  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
831  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
832  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
833  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
834  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
835  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
836  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
837  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
838  * ~~                                                                 ~~
839  * __                                                                 __
840  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
841  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
842  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
843  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
844  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
845  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
846  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
847  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
848  * ~~                                                                 ~~
849  *                   __                               __
850  *                   |  0   0   1   0   0   0   0   0  |
851  *                   | 167 100  5   41 159 169 217 208 |
852  *                   | 166 100  4   40 158 168 216 209 |
853  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
854  *                   |  0   0   0   0   1   0   0   0  |
855  *                   |  0   0   0   0   0   1   0   0  |
856  *                   |  0   0   0   0   0   0   1   0  |
857  *                   |  0   0   0   0   0   0   0   1  |
858  *                   ~~                               ~~
859  *
860  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
861  * of the missing data.
862  *
863  * As is apparent from the example above, the only non-trivial rows in the
864  * inverse matrix correspond to the data disks that we're trying to
865  * reconstruct. Indeed, those are the only rows we need as the others would
866  * only be useful for reconstructing data known or assumed to be valid. For
867  * that reason, we only build the coefficients in the rows that correspond to
868  * targeted columns.
869  */
870 /* END CSTYLED */
871 
872 static void
873 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
874     uint8_t **rows)
875 {
876 	int i, j;
877 	int pow;
878 
879 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
880 
881 	/*
882 	 * Fill in the missing rows of interest.
883 	 */
884 	for (i = 0; i < nmap; i++) {
885 		ASSERT3S(0, <=, map[i]);
886 		ASSERT3S(map[i], <=, 2);
887 
888 		pow = map[i] * n;
889 		if (pow > 255)
890 			pow -= 255;
891 		ASSERT(pow <= 255);
892 
893 		for (j = 0; j < n; j++) {
894 			pow -= map[i];
895 			if (pow < 0)
896 				pow += 255;
897 			rows[i][j] = vdev_raidz_pow2[pow];
898 		}
899 	}
900 }
901 
902 static void
903 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
904     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
905 {
906 	int i, j, ii, jj;
907 	uint8_t log;
908 
909 	/*
910 	 * Assert that the first nmissing entries from the array of used
911 	 * columns correspond to parity columns and that subsequent entries
912 	 * correspond to data columns.
913 	 */
914 	for (i = 0; i < nmissing; i++) {
915 		ASSERT3S(used[i], <, rm->rm_firstdatacol);
916 	}
917 	for (; i < n; i++) {
918 		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
919 	}
920 
921 	/*
922 	 * First initialize the storage where we'll compute the inverse rows.
923 	 */
924 	for (i = 0; i < nmissing; i++) {
925 		for (j = 0; j < n; j++) {
926 			invrows[i][j] = (i == j) ? 1 : 0;
927 		}
928 	}
929 
930 	/*
931 	 * Subtract all trivial rows from the rows of consequence.
932 	 */
933 	for (i = 0; i < nmissing; i++) {
934 		for (j = nmissing; j < n; j++) {
935 			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
936 			jj = used[j] - rm->rm_firstdatacol;
937 			ASSERT3S(jj, <, n);
938 			invrows[i][j] = rows[i][jj];
939 			rows[i][jj] = 0;
940 		}
941 	}
942 
943 	/*
944 	 * For each of the rows of interest, we must normalize it and subtract
945 	 * a multiple of it from the other rows.
946 	 */
947 	for (i = 0; i < nmissing; i++) {
948 		for (j = 0; j < missing[i]; j++) {
949 			ASSERT3U(rows[i][j], ==, 0);
950 		}
951 		ASSERT3U(rows[i][missing[i]], !=, 0);
952 
953 		/*
954 		 * Compute the inverse of the first element and multiply each
955 		 * element in the row by that value.
956 		 */
957 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
958 
959 		for (j = 0; j < n; j++) {
960 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
961 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
962 		}
963 
964 		for (ii = 0; ii < nmissing; ii++) {
965 			if (i == ii)
966 				continue;
967 
968 			ASSERT3U(rows[ii][missing[i]], !=, 0);
969 
970 			log = vdev_raidz_log2[rows[ii][missing[i]]];
971 
972 			for (j = 0; j < n; j++) {
973 				rows[ii][j] ^=
974 				    vdev_raidz_exp2(rows[i][j], log);
975 				invrows[ii][j] ^=
976 				    vdev_raidz_exp2(invrows[i][j], log);
977 			}
978 		}
979 	}
980 
981 	/*
982 	 * Verify that the data that is left in the rows are properly part of
983 	 * an identity matrix.
984 	 */
985 	for (i = 0; i < nmissing; i++) {
986 		for (j = 0; j < n; j++) {
987 			if (j == missing[i]) {
988 				ASSERT3U(rows[i][j], ==, 1);
989 			} else {
990 				ASSERT3U(rows[i][j], ==, 0);
991 			}
992 		}
993 	}
994 }
995 
996 static void
997 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
998     int *missing, uint8_t **invrows, const uint8_t *used)
999 {
1000 	int i, j, x, cc, c;
1001 	uint8_t *src;
1002 	uint64_t ccount;
1003 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1004 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1005 	uint8_t log, val;
1006 	int ll;
1007 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1008 	uint8_t *p, *pp;
1009 	size_t psize;
1010 
1011 	psize = sizeof (invlog[0][0]) * n * nmissing;
1012 	p = kmem_alloc(psize, KM_SLEEP);
1013 
1014 	for (pp = p, i = 0; i < nmissing; i++) {
1015 		invlog[i] = pp;
1016 		pp += n;
1017 	}
1018 
1019 	for (i = 0; i < nmissing; i++) {
1020 		for (j = 0; j < n; j++) {
1021 			ASSERT3U(invrows[i][j], !=, 0);
1022 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1023 		}
1024 	}
1025 
1026 	for (i = 0; i < n; i++) {
1027 		c = used[i];
1028 		ASSERT3U(c, <, rm->rm_cols);
1029 
1030 		src = rm->rm_col[c].rc_data;
1031 		ccount = rm->rm_col[c].rc_size;
1032 		for (j = 0; j < nmissing; j++) {
1033 			cc = missing[j] + rm->rm_firstdatacol;
1034 			ASSERT3U(cc, >=, rm->rm_firstdatacol);
1035 			ASSERT3U(cc, <, rm->rm_cols);
1036 			ASSERT3U(cc, !=, c);
1037 
1038 			dst[j] = rm->rm_col[cc].rc_data;
1039 			dcount[j] = rm->rm_col[cc].rc_size;
1040 		}
1041 
1042 		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1043 
1044 		for (x = 0; x < ccount; x++, src++) {
1045 			if (*src != 0)
1046 				log = vdev_raidz_log2[*src];
1047 
1048 			for (cc = 0; cc < nmissing; cc++) {
1049 				if (x >= dcount[cc])
1050 					continue;
1051 
1052 				if (*src == 0) {
1053 					val = 0;
1054 				} else {
1055 					if ((ll = log + invlog[cc][i]) >= 255)
1056 						ll -= 255;
1057 					val = vdev_raidz_pow2[ll];
1058 				}
1059 
1060 				if (i == 0)
1061 					dst[cc][x] = val;
1062 				else
1063 					dst[cc][x] ^= val;
1064 			}
1065 		}
1066 	}
1067 
1068 	kmem_free(p, psize);
1069 }
1070 
1071 static int
1072 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1073 {
1074 	int n, i, c, t, tt;
1075 	int nmissing_rows;
1076 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1077 	int parity_map[VDEV_RAIDZ_MAXPARITY];
1078 
1079 	uint8_t *p, *pp;
1080 	size_t psize;
1081 
1082 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1083 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1084 	uint8_t *used;
1085 
1086 	int code = 0;
1087 
1088 
1089 	n = rm->rm_cols - rm->rm_firstdatacol;
1090 
1091 	/*
1092 	 * Figure out which data columns are missing.
1093 	 */
1094 	nmissing_rows = 0;
1095 	for (t = 0; t < ntgts; t++) {
1096 		if (tgts[t] >= rm->rm_firstdatacol) {
1097 			missing_rows[nmissing_rows++] =
1098 			    tgts[t] - rm->rm_firstdatacol;
1099 		}
1100 	}
1101 
1102 	/*
1103 	 * Figure out which parity columns to use to help generate the missing
1104 	 * data columns.
1105 	 */
1106 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1107 		ASSERT(tt < ntgts);
1108 		ASSERT(c < rm->rm_firstdatacol);
1109 
1110 		/*
1111 		 * Skip any targeted parity columns.
1112 		 */
1113 		if (c == tgts[tt]) {
1114 			tt++;
1115 			continue;
1116 		}
1117 
1118 		code |= 1 << c;
1119 
1120 		parity_map[i] = c;
1121 		i++;
1122 	}
1123 
1124 	ASSERT(code != 0);
1125 	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1126 
1127 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1128 	    nmissing_rows * n + sizeof (used[0]) * n;
1129 	p = kmem_alloc(psize, KM_SLEEP);
1130 
1131 	for (pp = p, i = 0; i < nmissing_rows; i++) {
1132 		rows[i] = pp;
1133 		pp += n;
1134 		invrows[i] = pp;
1135 		pp += n;
1136 	}
1137 	used = pp;
1138 
1139 	for (i = 0; i < nmissing_rows; i++) {
1140 		used[i] = parity_map[i];
1141 	}
1142 
1143 	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1144 		if (tt < nmissing_rows &&
1145 		    c == missing_rows[tt] + rm->rm_firstdatacol) {
1146 			tt++;
1147 			continue;
1148 		}
1149 
1150 		ASSERT3S(i, <, n);
1151 		used[i] = c;
1152 		i++;
1153 	}
1154 
1155 	/*
1156 	 * Initialize the interesting rows of the matrix.
1157 	 */
1158 	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1159 
1160 	/*
1161 	 * Invert the matrix.
1162 	 */
1163 	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1164 	    invrows, used);
1165 
1166 	/*
1167 	 * Reconstruct the missing data using the generated matrix.
1168 	 */
1169 	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1170 	    invrows, used);
1171 
1172 	kmem_free(p, psize);
1173 
1174 	return (code);
1175 }
1176 
1177 static int
1178 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1179 {
1180 	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1181 	int ntgts;
1182 	int i, c;
1183 	int code;
1184 	int nbadparity, nbaddata;
1185 	int parity_valid[VDEV_RAIDZ_MAXPARITY];
1186 
1187 	/*
1188 	 * The tgts list must already be sorted.
1189 	 */
1190 	for (i = 1; i < nt; i++) {
1191 		ASSERT(t[i] > t[i - 1]);
1192 	}
1193 
1194 	nbadparity = rm->rm_firstdatacol;
1195 	nbaddata = rm->rm_cols - nbadparity;
1196 	ntgts = 0;
1197 	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1198 		if (c < rm->rm_firstdatacol)
1199 			parity_valid[c] = B_FALSE;
1200 
1201 		if (i < nt && c == t[i]) {
1202 			tgts[ntgts++] = c;
1203 			i++;
1204 		} else if (rm->rm_col[c].rc_error != 0) {
1205 			tgts[ntgts++] = c;
1206 		} else if (c >= rm->rm_firstdatacol) {
1207 			nbaddata--;
1208 		} else {
1209 			parity_valid[c] = B_TRUE;
1210 			nbadparity--;
1211 		}
1212 	}
1213 
1214 	ASSERT(ntgts >= nt);
1215 	ASSERT(nbaddata >= 0);
1216 	ASSERT(nbaddata + nbadparity == ntgts);
1217 
1218 	dt = &tgts[nbadparity];
1219 
1220 	/*
1221 	 * See if we can use any of our optimized reconstruction routines.
1222 	 */
1223 	if (!vdev_raidz_default_to_general) {
1224 		switch (nbaddata) {
1225 		case 1:
1226 			if (parity_valid[VDEV_RAIDZ_P])
1227 				return (vdev_raidz_reconstruct_p(rm, dt, 1));
1228 
1229 			ASSERT(rm->rm_firstdatacol > 1);
1230 
1231 			if (parity_valid[VDEV_RAIDZ_Q])
1232 				return (vdev_raidz_reconstruct_q(rm, dt, 1));
1233 
1234 			ASSERT(rm->rm_firstdatacol > 2);
1235 			break;
1236 
1237 		case 2:
1238 			ASSERT(rm->rm_firstdatacol > 1);
1239 
1240 			if (parity_valid[VDEV_RAIDZ_P] &&
1241 			    parity_valid[VDEV_RAIDZ_Q])
1242 				return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1243 
1244 			ASSERT(rm->rm_firstdatacol > 2);
1245 
1246 			break;
1247 		}
1248 	}
1249 
1250 	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1251 	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1252 	ASSERT(code > 0);
1253 	return (code);
1254 }
1255 
1256 static int
1257 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
1258 {
1259 	vdev_t *cvd;
1260 	uint64_t nparity = vd->vdev_nparity;
1261 	int c;
1262 	int lasterror = 0;
1263 	int numerrors = 0;
1264 
1265 	ASSERT(nparity > 0);
1266 
1267 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
1268 	    vd->vdev_children < nparity + 1) {
1269 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1270 		return (EINVAL);
1271 	}
1272 
1273 	vdev_open_children(vd);
1274 
1275 	for (c = 0; c < vd->vdev_children; c++) {
1276 		cvd = vd->vdev_child[c];
1277 
1278 		if (cvd->vdev_open_error != 0) {
1279 			lasterror = cvd->vdev_open_error;
1280 			numerrors++;
1281 			continue;
1282 		}
1283 
1284 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1285 		*ashift = MAX(*ashift, cvd->vdev_ashift);
1286 	}
1287 
1288 	*asize *= vd->vdev_children;
1289 
1290 	if (numerrors > nparity) {
1291 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1292 		return (lasterror);
1293 	}
1294 
1295 	return (0);
1296 }
1297 
1298 static void
1299 vdev_raidz_close(vdev_t *vd)
1300 {
1301 	int c;
1302 
1303 	for (c = 0; c < vd->vdev_children; c++)
1304 		vdev_close(vd->vdev_child[c]);
1305 }
1306 
1307 static uint64_t
1308 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1309 {
1310 	uint64_t asize;
1311 	uint64_t ashift = vd->vdev_top->vdev_ashift;
1312 	uint64_t cols = vd->vdev_children;
1313 	uint64_t nparity = vd->vdev_nparity;
1314 
1315 	asize = ((psize - 1) >> ashift) + 1;
1316 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1317 	asize = roundup(asize, nparity + 1) << ashift;
1318 
1319 	return (asize);
1320 }
1321 
1322 static void
1323 vdev_raidz_child_done(zio_t *zio)
1324 {
1325 	raidz_col_t *rc = zio->io_private;
1326 
1327 	rc->rc_error = zio->io_error;
1328 	rc->rc_tried = 1;
1329 	rc->rc_skipped = 0;
1330 }
1331 
1332 static int
1333 vdev_raidz_io_start(zio_t *zio)
1334 {
1335 	vdev_t *vd = zio->io_vd;
1336 	vdev_t *tvd = vd->vdev_top;
1337 	vdev_t *cvd;
1338 	blkptr_t *bp = zio->io_bp;
1339 	raidz_map_t *rm;
1340 	raidz_col_t *rc;
1341 	int c, i;
1342 
1343 	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
1344 	    vd->vdev_nparity);
1345 
1346 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1347 
1348 	if (zio->io_type == ZIO_TYPE_WRITE) {
1349 		vdev_raidz_generate_parity(rm);
1350 
1351 		for (c = 0; c < rm->rm_cols; c++) {
1352 			rc = &rm->rm_col[c];
1353 			cvd = vd->vdev_child[rc->rc_devidx];
1354 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1355 			    rc->rc_offset, rc->rc_data, rc->rc_size,
1356 			    zio->io_type, zio->io_priority, 0,
1357 			    vdev_raidz_child_done, rc));
1358 		}
1359 
1360 		/*
1361 		 * Generate optional I/Os for any skipped sectors to improve
1362 		 * aggregation contiguity.
1363 		 */
1364 		for (c = rm->rm_bigcols, i = 0; i < rm->rm_skipped; c++, i++) {
1365 			ASSERT(c <= rm->rm_scols);
1366 			if (c == rm->rm_scols)
1367 				c = 0;
1368 			rc = &rm->rm_col[c];
1369 			cvd = vd->vdev_child[rc->rc_devidx];
1370 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1371 			    rc->rc_offset + rc->rc_size, NULL,
1372 			    1 << tvd->vdev_ashift,
1373 			    zio->io_type, zio->io_priority,
1374 			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1375 		}
1376 
1377 		return (ZIO_PIPELINE_CONTINUE);
1378 	}
1379 
1380 	ASSERT(zio->io_type == ZIO_TYPE_READ);
1381 
1382 	/*
1383 	 * Iterate over the columns in reverse order so that we hit the parity
1384 	 * last -- any errors along the way will force us to read the parity.
1385 	 */
1386 	for (c = rm->rm_cols - 1; c >= 0; c--) {
1387 		rc = &rm->rm_col[c];
1388 		cvd = vd->vdev_child[rc->rc_devidx];
1389 		if (!vdev_readable(cvd)) {
1390 			if (c >= rm->rm_firstdatacol)
1391 				rm->rm_missingdata++;
1392 			else
1393 				rm->rm_missingparity++;
1394 			rc->rc_error = ENXIO;
1395 			rc->rc_tried = 1;	/* don't even try */
1396 			rc->rc_skipped = 1;
1397 			continue;
1398 		}
1399 		if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) {
1400 			if (c >= rm->rm_firstdatacol)
1401 				rm->rm_missingdata++;
1402 			else
1403 				rm->rm_missingparity++;
1404 			rc->rc_error = ESTALE;
1405 			rc->rc_skipped = 1;
1406 			continue;
1407 		}
1408 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
1409 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1410 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1411 			    rc->rc_offset, rc->rc_data, rc->rc_size,
1412 			    zio->io_type, zio->io_priority, 0,
1413 			    vdev_raidz_child_done, rc));
1414 		}
1415 	}
1416 
1417 	return (ZIO_PIPELINE_CONTINUE);
1418 }
1419 
1420 /*
1421  * Report a checksum error for a child of a RAID-Z device.
1422  */
1423 static void
1424 raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
1425 {
1426 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1427 
1428 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1429 		mutex_enter(&vd->vdev_stat_lock);
1430 		vd->vdev_stat.vs_checksum_errors++;
1431 		mutex_exit(&vd->vdev_stat_lock);
1432 	}
1433 
1434 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
1435 		zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
1436 		    zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
1437 }
1438 
1439 /*
1440  * Generate the parity from the data columns. If we tried and were able to
1441  * read the parity without error, verify that the generated parity matches the
1442  * data we read. If it doesn't, we fire off a checksum error. Return the
1443  * number such failures.
1444  */
1445 static int
1446 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1447 {
1448 	void *orig[VDEV_RAIDZ_MAXPARITY];
1449 	int c, ret = 0;
1450 	raidz_col_t *rc;
1451 
1452 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1453 		rc = &rm->rm_col[c];
1454 		if (!rc->rc_tried || rc->rc_error != 0)
1455 			continue;
1456 		orig[c] = zio_buf_alloc(rc->rc_size);
1457 		bcopy(rc->rc_data, orig[c], rc->rc_size);
1458 	}
1459 
1460 	vdev_raidz_generate_parity(rm);
1461 
1462 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1463 		rc = &rm->rm_col[c];
1464 		if (!rc->rc_tried || rc->rc_error != 0)
1465 			continue;
1466 		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1467 			raidz_checksum_error(zio, rc);
1468 			rc->rc_error = ECKSUM;
1469 			ret++;
1470 		}
1471 		zio_buf_free(orig[c], rc->rc_size);
1472 	}
1473 
1474 	return (ret);
1475 }
1476 
1477 /*
1478  * Keep statistics on all the ways that we used parity to correct data.
1479  */
1480 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1481 
1482 static int
1483 vdev_raidz_worst_error(raidz_map_t *rm)
1484 {
1485 	int error = 0;
1486 
1487 	for (int c = 0; c < rm->rm_cols; c++)
1488 		error = zio_worst_error(error, rm->rm_col[c].rc_error);
1489 
1490 	return (error);
1491 }
1492 
1493 /*
1494  * Iterate over all combinations of bad data and attempt a reconstruction.
1495  * Note that the algorithm below is non-optimal because it doesn't take into
1496  * account how reconstruction is actually performed. For example, with
1497  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1498  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1499  * cases we'd only use parity information in column 0.
1500  */
1501 static int
1502 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1503 {
1504 	raidz_map_t *rm = zio->io_vsd;
1505 	raidz_col_t *rc;
1506 	void *orig[VDEV_RAIDZ_MAXPARITY];
1507 	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1508 	int *tgts = &tstore[1];
1509 	int current, next, i, c, n;
1510 	int code, ret = 0;
1511 
1512 	ASSERT(total_errors < rm->rm_firstdatacol);
1513 
1514 	/*
1515 	 * This simplifies one edge condition.
1516 	 */
1517 	tgts[-1] = -1;
1518 
1519 	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1520 		/*
1521 		 * Initialize the targets array by finding the first n columns
1522 		 * that contain no error.
1523 		 *
1524 		 * If there were no data errors, we need to ensure that we're
1525 		 * always explicitly attempting to reconstruct at least one
1526 		 * data column. To do this, we simply push the highest target
1527 		 * up into the data columns.
1528 		 */
1529 		for (c = 0, i = 0; i < n; i++) {
1530 			if (i == n - 1 && data_errors == 0 &&
1531 			    c < rm->rm_firstdatacol) {
1532 				c = rm->rm_firstdatacol;
1533 			}
1534 
1535 			while (rm->rm_col[c].rc_error != 0) {
1536 				c++;
1537 				ASSERT3S(c, <, rm->rm_cols);
1538 			}
1539 
1540 			tgts[i] = c++;
1541 		}
1542 
1543 		/*
1544 		 * Setting tgts[n] simplifies the other edge condition.
1545 		 */
1546 		tgts[n] = rm->rm_cols;
1547 
1548 		/*
1549 		 * These buffers were allocated in previous iterations.
1550 		 */
1551 		for (i = 0; i < n - 1; i++) {
1552 			ASSERT(orig[i] != NULL);
1553 		}
1554 
1555 		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
1556 
1557 		current = 0;
1558 		next = tgts[current];
1559 
1560 		while (current != n) {
1561 			tgts[current] = next;
1562 			current = 0;
1563 
1564 			/*
1565 			 * Save off the original data that we're going to
1566 			 * attempt to reconstruct.
1567 			 */
1568 			for (i = 0; i < n; i++) {
1569 				ASSERT(orig[i] != NULL);
1570 				c = tgts[i];
1571 				ASSERT3S(c, >=, 0);
1572 				ASSERT3S(c, <, rm->rm_cols);
1573 				rc = &rm->rm_col[c];
1574 				bcopy(rc->rc_data, orig[i], rc->rc_size);
1575 			}
1576 
1577 			/*
1578 			 * Attempt a reconstruction and exit the outer loop on
1579 			 * success.
1580 			 */
1581 			code = vdev_raidz_reconstruct(rm, tgts, n);
1582 			if (zio_checksum_error(zio) == 0) {
1583 				atomic_inc_64(&raidz_corrected[code]);
1584 
1585 				for (i = 0; i < n; i++) {
1586 					c = tgts[i];
1587 					rc = &rm->rm_col[c];
1588 					ASSERT(rc->rc_error == 0);
1589 					if (rc->rc_tried)
1590 						raidz_checksum_error(zio, rc);
1591 					rc->rc_error = ECKSUM;
1592 				}
1593 
1594 				ret = code;
1595 				goto done;
1596 			}
1597 
1598 			/*
1599 			 * Restore the original data.
1600 			 */
1601 			for (i = 0; i < n; i++) {
1602 				c = tgts[i];
1603 				rc = &rm->rm_col[c];
1604 				bcopy(orig[i], rc->rc_data, rc->rc_size);
1605 			}
1606 
1607 			do {
1608 				/*
1609 				 * Find the next valid column after the current
1610 				 * position..
1611 				 */
1612 				for (next = tgts[current] + 1;
1613 				    next < rm->rm_cols &&
1614 				    rm->rm_col[next].rc_error != 0; next++)
1615 					continue;
1616 
1617 				ASSERT(next <= tgts[current + 1]);
1618 
1619 				/*
1620 				 * If that spot is available, we're done here.
1621 				 */
1622 				if (next != tgts[current + 1])
1623 					break;
1624 
1625 				/*
1626 				 * Otherwise, find the next valid column after
1627 				 * the previous position.
1628 				 */
1629 				for (c = tgts[current - 1] + 1;
1630 				    rm->rm_col[c].rc_error != 0; c++)
1631 					continue;
1632 
1633 				tgts[current] = c;
1634 				current++;
1635 
1636 			} while (current != n);
1637 		}
1638 	}
1639 	n--;
1640 done:
1641 	for (i = 0; i < n; i++) {
1642 		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
1643 	}
1644 
1645 	return (ret);
1646 }
1647 
1648 static void
1649 vdev_raidz_io_done(zio_t *zio)
1650 {
1651 	vdev_t *vd = zio->io_vd;
1652 	vdev_t *cvd;
1653 	raidz_map_t *rm = zio->io_vsd;
1654 	raidz_col_t *rc;
1655 	int unexpected_errors = 0;
1656 	int parity_errors = 0;
1657 	int parity_untried = 0;
1658 	int data_errors = 0;
1659 	int total_errors = 0;
1660 	int n, c;
1661 	int tgts[VDEV_RAIDZ_MAXPARITY];
1662 	int code;
1663 
1664 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
1665 
1666 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1667 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1668 
1669 	for (c = 0; c < rm->rm_cols; c++) {
1670 		rc = &rm->rm_col[c];
1671 
1672 		if (rc->rc_error) {
1673 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
1674 
1675 			if (c < rm->rm_firstdatacol)
1676 				parity_errors++;
1677 			else
1678 				data_errors++;
1679 
1680 			if (!rc->rc_skipped)
1681 				unexpected_errors++;
1682 
1683 			total_errors++;
1684 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1685 			parity_untried++;
1686 		}
1687 	}
1688 
1689 	if (zio->io_type == ZIO_TYPE_WRITE) {
1690 		/*
1691 		 * XXX -- for now, treat partial writes as a success.
1692 		 * (If we couldn't write enough columns to reconstruct
1693 		 * the data, the I/O failed.  Otherwise, good enough.)
1694 		 *
1695 		 * Now that we support write reallocation, it would be better
1696 		 * to treat partial failure as real failure unless there are
1697 		 * no non-degraded top-level vdevs left, and not update DTLs
1698 		 * if we intend to reallocate.
1699 		 */
1700 		/* XXPOLICY */
1701 		if (total_errors > rm->rm_firstdatacol)
1702 			zio->io_error = vdev_raidz_worst_error(rm);
1703 
1704 		return;
1705 	}
1706 
1707 	ASSERT(zio->io_type == ZIO_TYPE_READ);
1708 	/*
1709 	 * There are three potential phases for a read:
1710 	 *	1. produce valid data from the columns read
1711 	 *	2. read all disks and try again
1712 	 *	3. perform combinatorial reconstruction
1713 	 *
1714 	 * Each phase is progressively both more expensive and less likely to
1715 	 * occur. If we encounter more errors than we can repair or all phases
1716 	 * fail, we have no choice but to return an error.
1717 	 */
1718 
1719 	/*
1720 	 * If the number of errors we saw was correctable -- less than or equal
1721 	 * to the number of parity disks read -- attempt to produce data that
1722 	 * has a valid checksum. Naturally, this case applies in the absence of
1723 	 * any errors.
1724 	 */
1725 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1726 		if (data_errors == 0) {
1727 			if (zio_checksum_error(zio) == 0) {
1728 				/*
1729 				 * If we read parity information (unnecessarily
1730 				 * as it happens since no reconstruction was
1731 				 * needed) regenerate and verify the parity.
1732 				 * We also regenerate parity when resilvering
1733 				 * so we can write it out to the failed device
1734 				 * later.
1735 				 */
1736 				if (parity_errors + parity_untried <
1737 				    rm->rm_firstdatacol ||
1738 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
1739 					n = raidz_parity_verify(zio, rm);
1740 					unexpected_errors += n;
1741 					ASSERT(parity_errors + n <=
1742 					    rm->rm_firstdatacol);
1743 				}
1744 				goto done;
1745 			}
1746 		} else {
1747 			/*
1748 			 * We either attempt to read all the parity columns or
1749 			 * none of them. If we didn't try to read parity, we
1750 			 * wouldn't be here in the correctable case. There must
1751 			 * also have been fewer parity errors than parity
1752 			 * columns or, again, we wouldn't be in this code path.
1753 			 */
1754 			ASSERT(parity_untried == 0);
1755 			ASSERT(parity_errors < rm->rm_firstdatacol);
1756 
1757 			/*
1758 			 * Identify the data columns that reported an error.
1759 			 */
1760 			n = 0;
1761 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1762 				rc = &rm->rm_col[c];
1763 				if (rc->rc_error != 0) {
1764 					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1765 					tgts[n++] = c;
1766 				}
1767 			}
1768 
1769 			ASSERT(rm->rm_firstdatacol >= n);
1770 
1771 			code = vdev_raidz_reconstruct(rm, tgts, n);
1772 
1773 			if (zio_checksum_error(zio) == 0) {
1774 				atomic_inc_64(&raidz_corrected[code]);
1775 
1776 				/*
1777 				 * If we read more parity disks than were used
1778 				 * for reconstruction, confirm that the other
1779 				 * parity disks produced correct data. This
1780 				 * routine is suboptimal in that it regenerates
1781 				 * the parity that we already used in addition
1782 				 * to the parity that we're attempting to
1783 				 * verify, but this should be a relatively
1784 				 * uncommon case, and can be optimized if it
1785 				 * becomes a problem. Note that we regenerate
1786 				 * parity when resilvering so we can write it
1787 				 * out to failed devices later.
1788 				 */
1789 				if (parity_errors < rm->rm_firstdatacol - n ||
1790 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
1791 					n = raidz_parity_verify(zio, rm);
1792 					unexpected_errors += n;
1793 					ASSERT(parity_errors + n <=
1794 					    rm->rm_firstdatacol);
1795 				}
1796 
1797 				goto done;
1798 			}
1799 		}
1800 	}
1801 
1802 	/*
1803 	 * This isn't a typical situation -- either we got a read error or
1804 	 * a child silently returned bad data. Read every block so we can
1805 	 * try again with as much data and parity as we can track down. If
1806 	 * we've already been through once before, all children will be marked
1807 	 * as tried so we'll proceed to combinatorial reconstruction.
1808 	 */
1809 	unexpected_errors = 1;
1810 	rm->rm_missingdata = 0;
1811 	rm->rm_missingparity = 0;
1812 
1813 	for (c = 0; c < rm->rm_cols; c++) {
1814 		if (rm->rm_col[c].rc_tried)
1815 			continue;
1816 
1817 		zio_vdev_io_redone(zio);
1818 		do {
1819 			rc = &rm->rm_col[c];
1820 			if (rc->rc_tried)
1821 				continue;
1822 			zio_nowait(zio_vdev_child_io(zio, NULL,
1823 			    vd->vdev_child[rc->rc_devidx],
1824 			    rc->rc_offset, rc->rc_data, rc->rc_size,
1825 			    zio->io_type, zio->io_priority, 0,
1826 			    vdev_raidz_child_done, rc));
1827 		} while (++c < rm->rm_cols);
1828 
1829 		return;
1830 	}
1831 
1832 	/*
1833 	 * At this point we've attempted to reconstruct the data given the
1834 	 * errors we detected, and we've attempted to read all columns. There
1835 	 * must, therefore, be one or more additional problems -- silent errors
1836 	 * resulting in invalid data rather than explicit I/O errors resulting
1837 	 * in absent data. We check if there is enough additional data to
1838 	 * possibly reconstruct the data and then perform combinatorial
1839 	 * reconstruction over all possible combinations. If that fails,
1840 	 * we're cooked.
1841 	 */
1842 	if (total_errors >= rm->rm_firstdatacol) {
1843 		zio->io_error = vdev_raidz_worst_error(rm);
1844 		/*
1845 		 * If there were exactly as many device errors as parity
1846 		 * columns, yet we couldn't reconstruct the data, then at
1847 		 * least one device must have returned bad data silently.
1848 		 */
1849 		if (total_errors == rm->rm_firstdatacol)
1850 			zio->io_error = zio_worst_error(zio->io_error, ECKSUM);
1851 
1852 	} else if ((code = vdev_raidz_combrec(zio, total_errors,
1853 	    data_errors)) != 0) {
1854 		/*
1855 		 * If we didn't use all the available parity for the
1856 		 * combinatorial reconstruction, verify that the remaining
1857 		 * parity is correct.
1858 		 */
1859 		if (code != (1 << rm->rm_firstdatacol) - 1)
1860 			(void) raidz_parity_verify(zio, rm);
1861 	} else {
1862 		/*
1863 		 * All combinations failed to checksum. Generate checksum
1864 		 * ereports for all children.
1865 		 */
1866 		zio->io_error = ECKSUM;
1867 
1868 		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1869 			for (c = 0; c < rm->rm_cols; c++) {
1870 				rc = &rm->rm_col[c];
1871 				zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
1872 				    zio->io_spa, vd->vdev_child[rc->rc_devidx],
1873 				    zio, rc->rc_offset, rc->rc_size);
1874 			}
1875 		}
1876 	}
1877 
1878 done:
1879 	zio_checksum_verified(zio);
1880 
1881 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
1882 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
1883 		/*
1884 		 * Use the good data we have in hand to repair damaged children.
1885 		 */
1886 		for (c = 0; c < rm->rm_cols; c++) {
1887 			rc = &rm->rm_col[c];
1888 			cvd = vd->vdev_child[rc->rc_devidx];
1889 
1890 			if (rc->rc_error == 0)
1891 				continue;
1892 
1893 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1894 			    rc->rc_offset, rc->rc_data, rc->rc_size,
1895 			    ZIO_TYPE_WRITE, zio->io_priority,
1896 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
1897 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
1898 		}
1899 	}
1900 }
1901 
1902 static void
1903 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
1904 {
1905 	if (faulted > vd->vdev_nparity)
1906 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1907 		    VDEV_AUX_NO_REPLICAS);
1908 	else if (degraded + faulted != 0)
1909 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
1910 	else
1911 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
1912 }
1913 
1914 vdev_ops_t vdev_raidz_ops = {
1915 	vdev_raidz_open,
1916 	vdev_raidz_close,
1917 	vdev_raidz_asize,
1918 	vdev_raidz_io_start,
1919 	vdev_raidz_io_done,
1920 	vdev_raidz_state_change,
1921 	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
1922 	B_FALSE			/* not a leaf vdev */
1923 };
1924