xref: /illumos-gate/usr/src/uts/common/fs/zfs/vdev_raidz.c (revision 88ecc943b4eb72f7c4fbbd8435997b85ef171fc3)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/zfs_context.h>
28 #include <sys/spa.h>
29 #include <sys/vdev_impl.h>
30 #include <sys/zio.h>
31 #include <sys/zio_checksum.h>
32 #include <sys/fs/zfs.h>
33 #include <sys/fm/fs/zfs.h>
34 
35 /*
36  * Virtual device vector for RAID-Z.
37  *
38  * This vdev supports single, double, and triple parity. For single parity,
39  * we use a simple XOR of all the data columns. For double or triple parity,
40  * we use a special case of Reed-Solomon coding. This extends the
41  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
42  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
43  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
44  * former is also based. The latter is designed to provide higher performance
45  * for writes.
46  *
47  * Note that the Plank paper claimed to support arbitrary N+M, but was then
48  * amended six years later identifying a critical flaw that invalidates its
49  * claims. Nevertheless, the technique can be adapted to work for up to
50  * triple parity. For additional parity, the amendment "Note: Correction to
51  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
52  * is viable, but the additional complexity means that write performance will
53  * suffer.
54  *
55  * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
57  * can be expressed with a single byte. Briefly, the operations on the
58  * field are defined as follows:
59  *
60  *   o addition (+) is represented by a bitwise XOR
61  *   o subtraction (-) is therefore identical to addition: A + B = A - B
62  *   o multiplication of A by 2 is defined by the following bitwise expression:
63  *	(A * 2)_7 = A_6
64  *	(A * 2)_6 = A_5
65  *	(A * 2)_5 = A_4
66  *	(A * 2)_4 = A_3 + A_7
67  *	(A * 2)_3 = A_2 + A_7
68  *	(A * 2)_2 = A_1 + A_7
69  *	(A * 2)_1 = A_0
70  *	(A * 2)_0 = A_7
71  *
72  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
73  * As an aside, this multiplication is derived from the error correcting
74  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
75  *
76  * Observe that any number in the field (except for 0) can be expressed as a
77  * power of 2 -- a generator for the field. We store a table of the powers of
78  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
79  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
80  * than field addition). The inverse of a field element A (A^-1) is therefore
81  * A ^ (255 - 1) = A^254.
82  *
83  * The up-to-three parity columns, P, Q, R over several data columns,
84  * D_0, ... D_n-1, can be expressed by field operations:
85  *
86  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
87  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
88  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
89  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
90  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
91  *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
93  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
94  * independent coefficients. (There are no additional coefficients that have
95  * this property which is why the uncorrected Plank method breaks down.)
96  *
 * See the reconstruction code below for how P, Q and R can be used individually
98  * or in concert to recover missing data columns.
99  */
100 
101 typedef struct raidz_col {
102 	uint64_t rc_devidx;		/* child device index for I/O */
103 	uint64_t rc_offset;		/* device offset */
104 	uint64_t rc_size;		/* I/O size */
105 	void *rc_data;			/* I/O data */
106 	int rc_error;			/* I/O error for this device */
107 	uint8_t rc_tried;		/* Did we attempt this I/O column? */
108 	uint8_t rc_skipped;		/* Did we skip this I/O column? */
109 } raidz_col_t;
110 
111 typedef struct raidz_map {
112 	uint64_t rm_cols;		/* Regular column count */
113 	uint64_t rm_scols;		/* Count including skipped columns */
114 	uint64_t rm_bigcols;		/* Number of oversized columns */
115 	uint64_t rm_asize;		/* Actual total I/O size */
116 	uint64_t rm_missingdata;	/* Count of missing data devices */
117 	uint64_t rm_missingparity;	/* Count of missing parity devices */
118 	uint64_t rm_firstdatacol;	/* First data column/parity count */
119 	uint64_t rm_nskip;		/* Skipped sectors for padding */
120 	uint64_t rm_skipstart;	/* Column index of padding start */
121 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
122 } raidz_map_t;
123 
124 #define	VDEV_RAIDZ_P		0
125 #define	VDEV_RAIDZ_Q		1
126 #define	VDEV_RAIDZ_R		2
127 #define	VDEV_RAIDZ_MAXPARITY	3
128 
129 #define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
130 #define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
131 
132 /*
133  * We provide a mechanism to perform the field multiplication operation on a
134  * 64-bit value all at once rather than a byte at a time. This works by
135  * creating a mask from the top bit in each byte and using that to
136  * conditionally apply the XOR of 0x1d.
137  */
138 #define	VDEV_RAIDZ_64MUL_2(x, mask) \
139 { \
140 	(mask) = (x) & 0x8080808080808080ULL; \
141 	(mask) = ((mask) << 1) - ((mask) >> 7); \
142 	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
143 	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
144 }
145 
146 #define	VDEV_RAIDZ_64MUL_4(x, mask) \
147 { \
148 	VDEV_RAIDZ_64MUL_2((x), mask); \
149 	VDEV_RAIDZ_64MUL_2((x), mask); \
150 }
151 
152 /*
153  * Force reconstruction to use the general purpose method.
154  */
155 int vdev_raidz_default_to_general;
156 
157 /*
158  * These two tables represent powers and logs of 2 in the Galois field defined
159  * above. These values were computed by repeatedly multiplying by 2 as above.
160  */
161 static const uint8_t vdev_raidz_pow2[256] = {
162 	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
163 	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
164 	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
165 	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
166 	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
167 	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
168 	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
169 	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
170 	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
171 	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
172 	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
173 	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
174 	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
175 	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
176 	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
177 	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
178 	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
179 	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
180 	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
181 	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
182 	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
183 	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
184 	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
185 	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
186 	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
187 	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
188 	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
189 	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
190 	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
191 	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
192 	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
193 	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
194 };
195 static const uint8_t vdev_raidz_log2[256] = {
196 	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
197 	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
198 	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
199 	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
200 	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
201 	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
202 	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
203 	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
204 	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
205 	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
206 	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
207 	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
208 	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
209 	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
210 	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
211 	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
212 	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
213 	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
214 	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
215 	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
216 	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
217 	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
218 	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
219 	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
220 	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
221 	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
222 	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
223 	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
224 	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
225 	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
226 	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
227 	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
228 };
229 
230 /*
231  * Multiply a given number by 2 raised to the given power.
232  */
233 static uint8_t
234 vdev_raidz_exp2(uint_t a, int exp)
235 {
236 	if (a == 0)
237 		return (0);
238 
239 	ASSERT(exp >= 0);
240 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
241 
242 	exp += vdev_raidz_log2[a];
243 	if (exp > 255)
244 		exp -= 255;
245 
246 	return (vdev_raidz_pow2[exp]);
247 }
248 
249 static void
250 vdev_raidz_map_free(zio_t *zio)
251 {
252 	raidz_map_t *rm = zio->io_vsd;
253 	int c;
254 
255 	for (c = 0; c < rm->rm_firstdatacol; c++)
256 		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
257 
258 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
259 }
260 
261 static raidz_map_t *
262 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
263     uint64_t nparity)
264 {
265 	raidz_map_t *rm;
266 	uint64_t b = zio->io_offset >> unit_shift;
267 	uint64_t s = zio->io_size >> unit_shift;
268 	uint64_t f = b % dcols;
269 	uint64_t o = (b / dcols) << unit_shift;
270 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
271 
272 	q = s / (dcols - nparity);
273 	r = s - q * (dcols - nparity);
274 	bc = (r == 0 ? 0 : r + nparity);
275 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
276 
277 	if (q == 0) {
278 		acols = bc;
279 		scols = MIN(dcols, roundup(bc, nparity + 1));
280 	} else {
281 		acols = dcols;
282 		scols = dcols;
283 	}
284 
285 	ASSERT3U(acols, <=, scols);
286 
287 	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
288 
289 	rm->rm_cols = acols;
290 	rm->rm_scols = scols;
291 	rm->rm_bigcols = bc;
292 	rm->rm_skipstart = bc;
293 	rm->rm_missingdata = 0;
294 	rm->rm_missingparity = 0;
295 	rm->rm_firstdatacol = nparity;
296 
297 	asize = 0;
298 
299 	for (c = 0; c < scols; c++) {
300 		col = f + c;
301 		coff = o;
302 		if (col >= dcols) {
303 			col -= dcols;
304 			coff += 1ULL << unit_shift;
305 		}
306 		rm->rm_col[c].rc_devidx = col;
307 		rm->rm_col[c].rc_offset = coff;
308 		rm->rm_col[c].rc_data = NULL;
309 		rm->rm_col[c].rc_error = 0;
310 		rm->rm_col[c].rc_tried = 0;
311 		rm->rm_col[c].rc_skipped = 0;
312 
313 		if (c >= acols)
314 			rm->rm_col[c].rc_size = 0;
315 		else if (c < bc)
316 			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
317 		else
318 			rm->rm_col[c].rc_size = q << unit_shift;
319 
320 		asize += rm->rm_col[c].rc_size;
321 	}
322 
323 	ASSERT3U(asize, ==, tot << unit_shift);
324 	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
325 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
326 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
327 	ASSERT3U(rm->rm_nskip, <=, nparity);
328 
329 	for (c = 0; c < rm->rm_firstdatacol; c++)
330 		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
331 
332 	rm->rm_col[c].rc_data = zio->io_data;
333 
334 	for (c = c + 1; c < acols; c++)
335 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
336 		    rm->rm_col[c - 1].rc_size;
337 
338 	/*
339 	 * If all data stored spans all columns, there's a danger that parity
340 	 * will always be on the same device and, since parity isn't read
341 	 * during normal operation, that that device's I/O bandwidth won't be
342 	 * used effectively. We therefore switch the parity every 1MB.
343 	 *
344 	 * ... at least that was, ostensibly, the theory. As a practical
345 	 * matter unless we juggle the parity between all devices evenly, we
346 	 * won't see any benefit. Further, occasional writes that aren't a
347 	 * multiple of the LCM of the number of children and the minimum
348 	 * stripe width are sufficient to avoid pessimal behavior.
349 	 * Unfortunately, this decision created an implicit on-disk format
350 	 * requirement that we need to support for all eternity, but only
351 	 * for single-parity RAID-Z.
352 	 *
353 	 * If we intend to skip a sector in the zeroth column for padding
354 	 * we must make sure to note this swap. We will never intend to
355 	 * skip the first column since at least one data and one parity
356 	 * column must appear in each row.
357 	 */
358 	ASSERT(rm->rm_cols >= 2);
359 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
360 
361 	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
362 		devidx = rm->rm_col[0].rc_devidx;
363 		o = rm->rm_col[0].rc_offset;
364 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
365 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
366 		rm->rm_col[1].rc_devidx = devidx;
367 		rm->rm_col[1].rc_offset = o;
368 
369 		if (rm->rm_skipstart == 0)
370 			rm->rm_skipstart = 1;
371 	}
372 
373 	zio->io_vsd = rm;
374 	zio->io_vsd_free = vdev_raidz_map_free;
375 	return (rm);
376 }
377 
378 static void
379 vdev_raidz_generate_parity_p(raidz_map_t *rm)
380 {
381 	uint64_t *p, *src, pcount, ccount, i;
382 	int c;
383 
384 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
385 
386 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
387 		src = rm->rm_col[c].rc_data;
388 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
389 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
390 
391 		if (c == rm->rm_firstdatacol) {
392 			ASSERT(ccount == pcount);
393 			for (i = 0; i < ccount; i++, src++, p++) {
394 				*p = *src;
395 			}
396 		} else {
397 			ASSERT(ccount <= pcount);
398 			for (i = 0; i < ccount; i++, src++, p++) {
399 				*p ^= *src;
400 			}
401 		}
402 	}
403 }
404 
405 static void
406 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
407 {
408 	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
409 	int c;
410 
411 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
412 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
413 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
414 
415 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
416 		src = rm->rm_col[c].rc_data;
417 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
418 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
419 
420 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
421 
422 		if (c == rm->rm_firstdatacol) {
423 			ASSERT(ccnt == pcnt || ccnt == 0);
424 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
425 				*p = *src;
426 				*q = *src;
427 			}
428 			for (; i < pcnt; i++, src++, p++, q++) {
429 				*p = 0;
430 				*q = 0;
431 			}
432 		} else {
433 			ASSERT(ccnt <= pcnt);
434 
435 			/*
436 			 * Apply the algorithm described above by multiplying
437 			 * the previous result and adding in the new value.
438 			 */
439 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
440 				*p ^= *src;
441 
442 				VDEV_RAIDZ_64MUL_2(*q, mask);
443 				*q ^= *src;
444 			}
445 
446 			/*
447 			 * Treat short columns as though they are full of 0s.
448 			 * Note that there's therefore nothing needed for P.
449 			 */
450 			for (; i < pcnt; i++, q++) {
451 				VDEV_RAIDZ_64MUL_2(*q, mask);
452 			}
453 		}
454 	}
455 }
456 
457 static void
458 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
459 {
460 	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
461 	int c;
462 
463 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
464 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
465 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
466 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
467 	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
468 
469 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
470 		src = rm->rm_col[c].rc_data;
471 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
472 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
473 		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
474 
475 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
476 
477 		if (c == rm->rm_firstdatacol) {
478 			ASSERT(ccnt == pcnt || ccnt == 0);
479 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
480 				*p = *src;
481 				*q = *src;
482 				*r = *src;
483 			}
484 			for (; i < pcnt; i++, src++, p++, q++, r++) {
485 				*p = 0;
486 				*q = 0;
487 				*r = 0;
488 			}
489 		} else {
490 			ASSERT(ccnt <= pcnt);
491 
492 			/*
493 			 * Apply the algorithm described above by multiplying
494 			 * the previous result and adding in the new value.
495 			 */
496 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
497 				*p ^= *src;
498 
499 				VDEV_RAIDZ_64MUL_2(*q, mask);
500 				*q ^= *src;
501 
502 				VDEV_RAIDZ_64MUL_4(*r, mask);
503 				*r ^= *src;
504 			}
505 
506 			/*
507 			 * Treat short columns as though they are full of 0s.
508 			 * Note that there's therefore nothing needed for P.
509 			 */
510 			for (; i < pcnt; i++, q++, r++) {
511 				VDEV_RAIDZ_64MUL_2(*q, mask);
512 				VDEV_RAIDZ_64MUL_4(*r, mask);
513 			}
514 		}
515 	}
516 }
517 
518 /*
519  * Generate RAID parity in the first virtual columns according to the number of
520  * parity columns available.
521  */
522 static void
523 vdev_raidz_generate_parity(raidz_map_t *rm)
524 {
525 	switch (rm->rm_firstdatacol) {
526 	case 1:
527 		vdev_raidz_generate_parity_p(rm);
528 		break;
529 	case 2:
530 		vdev_raidz_generate_parity_pq(rm);
531 		break;
532 	case 3:
533 		vdev_raidz_generate_parity_pqr(rm);
534 		break;
535 	default:
536 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
537 	}
538 }
539 
540 static int
541 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
542 {
543 	uint64_t *dst, *src, xcount, ccount, count, i;
544 	int x = tgts[0];
545 	int c;
546 
547 	ASSERT(ntgts == 1);
548 	ASSERT(x >= rm->rm_firstdatacol);
549 	ASSERT(x < rm->rm_cols);
550 
551 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
552 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
553 	ASSERT(xcount > 0);
554 
555 	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
556 	dst = rm->rm_col[x].rc_data;
557 	for (i = 0; i < xcount; i++, dst++, src++) {
558 		*dst = *src;
559 	}
560 
561 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
562 		src = rm->rm_col[c].rc_data;
563 		dst = rm->rm_col[x].rc_data;
564 
565 		if (c == x)
566 			continue;
567 
568 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
569 		count = MIN(ccount, xcount);
570 
571 		for (i = 0; i < count; i++, dst++, src++) {
572 			*dst ^= *src;
573 		}
574 	}
575 
576 	return (1 << VDEV_RAIDZ_P);
577 }
578 
579 static int
580 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
581 {
582 	uint64_t *dst, *src, xcount, ccount, count, mask, i;
583 	uint8_t *b;
584 	int x = tgts[0];
585 	int c, j, exp;
586 
587 	ASSERT(ntgts == 1);
588 
589 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
590 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
591 
592 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
593 		src = rm->rm_col[c].rc_data;
594 		dst = rm->rm_col[x].rc_data;
595 
596 		if (c == x)
597 			ccount = 0;
598 		else
599 			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
600 
601 		count = MIN(ccount, xcount);
602 
603 		if (c == rm->rm_firstdatacol) {
604 			for (i = 0; i < count; i++, dst++, src++) {
605 				*dst = *src;
606 			}
607 			for (; i < xcount; i++, dst++) {
608 				*dst = 0;
609 			}
610 
611 		} else {
612 			for (i = 0; i < count; i++, dst++, src++) {
613 				VDEV_RAIDZ_64MUL_2(*dst, mask);
614 				*dst ^= *src;
615 			}
616 
617 			for (; i < xcount; i++, dst++) {
618 				VDEV_RAIDZ_64MUL_2(*dst, mask);
619 			}
620 		}
621 	}
622 
623 	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
624 	dst = rm->rm_col[x].rc_data;
625 	exp = 255 - (rm->rm_cols - 1 - x);
626 
627 	for (i = 0; i < xcount; i++, dst++, src++) {
628 		*dst ^= *src;
629 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
630 			*b = vdev_raidz_exp2(*b, exp);
631 		}
632 	}
633 
634 	return (1 << VDEV_RAIDZ_Q);
635 }
636 
637 static int
638 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
639 {
640 	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
641 	void *pdata, *qdata;
642 	uint64_t xsize, ysize, i;
643 	int x = tgts[0];
644 	int y = tgts[1];
645 
646 	ASSERT(ntgts == 2);
647 	ASSERT(x < y);
648 	ASSERT(x >= rm->rm_firstdatacol);
649 	ASSERT(y < rm->rm_cols);
650 
651 	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
652 
653 	/*
654 	 * Move the parity data aside -- we're going to compute parity as
655 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
656 	 * reuse the parity generation mechanism without trashing the actual
657 	 * parity so we make those columns appear to be full of zeros by
658 	 * setting their lengths to zero.
659 	 */
660 	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
661 	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
662 	xsize = rm->rm_col[x].rc_size;
663 	ysize = rm->rm_col[y].rc_size;
664 
665 	rm->rm_col[VDEV_RAIDZ_P].rc_data =
666 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
667 	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
668 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
669 	rm->rm_col[x].rc_size = 0;
670 	rm->rm_col[y].rc_size = 0;
671 
672 	vdev_raidz_generate_parity_pq(rm);
673 
674 	rm->rm_col[x].rc_size = xsize;
675 	rm->rm_col[y].rc_size = ysize;
676 
677 	p = pdata;
678 	q = qdata;
679 	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
680 	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
681 	xd = rm->rm_col[x].rc_data;
682 	yd = rm->rm_col[y].rc_data;
683 
684 	/*
685 	 * We now have:
686 	 *	Pxy = P + D_x + D_y
687 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
688 	 *
689 	 * We can then solve for D_x:
690 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
691 	 * where
692 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
693 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
694 	 *
695 	 * With D_x in hand, we can easily solve for D_y:
696 	 *	D_y = P + Pxy + D_x
697 	 */
698 
699 	a = vdev_raidz_pow2[255 + x - y];
700 	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
701 	tmp = 255 - vdev_raidz_log2[a ^ 1];
702 
703 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
704 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
705 
706 	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
707 		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
708 		    vdev_raidz_exp2(*q ^ *qxy, bexp);
709 
710 		if (i < ysize)
711 			*yd = *p ^ *pxy ^ *xd;
712 	}
713 
714 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
715 	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
716 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
717 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
718 
719 	/*
720 	 * Restore the saved parity data.
721 	 */
722 	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
723 	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
724 
725 	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
726 }
727 
728 /* BEGIN CSTYLED */
729 /*
730  * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
732  * the contents of the data and parity disks. This can be expressed with
733  * vectors for the original data (D) and the actual data (d) and parity (p)
734  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
735  *
736  *            __   __                     __     __
737  *            |     |         __     __   |  p_0  |
738  *            |  V  |         |  D_0  |   | p_m-1 |
739  *            |     |    x    |   :   | = |  d_0  |
740  *            |  I  |         | D_n-1 |   |   :   |
741  *            |     |         ~~     ~~   | d_n-1 |
742  *            ~~   ~~                     ~~     ~~
743  *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
746  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
747  * computation as well as linear separability.
748  *
749  *      __               __               __     __
750  *      |   1   ..  1 1 1 |               |  p_0  |
751  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
752  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
753  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
754  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
755  *      |   :       : : : |   |   :   |   |  d_2  |
756  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
757  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
758  *      |   0   ..  0 0 1 |               | d_n-1 |
759  *      ~~               ~~               ~~     ~~
760  *
761  * Note that I, V, d, and p are known. To compute D, we must invert the
762  * matrix and use the known data and parity values to reconstruct the unknown
763  * data values. We begin by removing the rows in V|I and d|p that correspond
764  * to failed or missing columns; we then make V|I square (n x n) and d|p
765  * sized n by removing rows corresponding to unused parity from the bottom up
766  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
767  * using Gauss-Jordan elimination. In the example below we use m=3 parity
768  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
769  *           __                               __
770  *           |  1   1   1   1   1   1   1   1  |
771  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
772  *           |  19 205 116  29  64  16  4   1  |      / /
773  *           |  1   0   0   0   0   0   0   0  |     / /
774  *           |  0   1   0   0   0   0   0   0  | <--' /
775  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
776  *           |  0   0   0   1   0   0   0   0  |
777  *           |  0   0   0   0   1   0   0   0  |
778  *           |  0   0   0   0   0   1   0   0  |
779  *           |  0   0   0   0   0   0   1   0  |
780  *           |  0   0   0   0   0   0   0   1  |
781  *           ~~                               ~~
782  *           __                               __
783  *           |  1   1   1   1   1   1   1   1  |
784  *           | 128  64  32  16  8   4   2   1  |
785  *           |  19 205 116  29  64  16  4   1  |
786  *           |  1   0   0   0   0   0   0   0  |
787  *           |  0   1   0   0   0   0   0   0  |
788  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
789  *           |  0   0   0   1   0   0   0   0  |
790  *           |  0   0   0   0   1   0   0   0  |
791  *           |  0   0   0   0   0   1   0   0  |
792  *           |  0   0   0   0   0   0   1   0  |
793  *           |  0   0   0   0   0   0   0   1  |
794  *           ~~                               ~~
795  *
796  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
797  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
798  * matrix is not singular.
799  * __                                                                 __
800  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
801  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
802  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
803  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
804  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
805  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
806  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
807  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
808  * ~~                                                                 ~~
809  * __                                                                 __
810  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
811  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
812  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
813  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
814  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
815  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
816  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
817  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
818  * ~~                                                                 ~~
819  * __                                                                 __
820  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
821  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
822  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
823  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
824  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
825  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
826  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
827  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
828  * ~~                                                                 ~~
829  * __                                                                 __
830  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
831  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
832  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
833  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
834  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
835  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
836  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
837  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
838  * ~~                                                                 ~~
839  * __                                                                 __
840  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
841  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
842  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
843  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
844  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
845  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
846  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
847  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
848  * ~~                                                                 ~~
849  * __                                                                 __
850  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
851  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
852  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
853  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
854  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
855  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
856  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
857  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
858  * ~~                                                                 ~~
859  *                   __                               __
860  *                   |  0   0   1   0   0   0   0   0  |
861  *                   | 167 100  5   41 159 169 217 208 |
862  *                   | 166 100  4   40 158 168 216 209 |
863  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
864  *                   |  0   0   0   0   1   0   0   0  |
865  *                   |  0   0   0   0   0   1   0   0  |
866  *                   |  0   0   0   0   0   0   1   0  |
867  *                   |  0   0   0   0   0   0   0   1  |
868  *                   ~~                               ~~
869  *
870  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
871  * of the missing data.
872  *
873  * As is apparent from the example above, the only non-trivial rows in the
874  * inverse matrix correspond to the data disks that we're trying to
875  * reconstruct. Indeed, those are the only rows we need as the others would
876  * only be useful for reconstructing data known or assumed to be valid. For
877  * that reason, we only build the coefficients in the rows that correspond to
878  * targeted columns.
879  */
880 /* END CSTYLED */
881 
882 static void
883 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
884     uint8_t **rows)
885 {
886 	int i, j;
887 	int pow;
888 
889 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
890 
891 	/*
892 	 * Fill in the missing rows of interest.
893 	 */
894 	for (i = 0; i < nmap; i++) {
895 		ASSERT3S(0, <=, map[i]);
896 		ASSERT3S(map[i], <=, 2);
897 
898 		pow = map[i] * n;
899 		if (pow > 255)
900 			pow -= 255;
901 		ASSERT(pow <= 255);
902 
903 		for (j = 0; j < n; j++) {
904 			pow -= map[i];
905 			if (pow < 0)
906 				pow += 255;
907 			rows[i][j] = vdev_raidz_pow2[pow];
908 		}
909 	}
910 }
911 
/*
 * Invert, via Gauss-Jordan elimination, the square submatrix of rows that
 * correspond to the targeted columns. On entry rows[] holds the nmissing
 * rows of interest of the (V|I) matrix described in the block comment
 * above; on exit invrows[] holds the matching rows of the inverse, which
 * is all that reconstruction needs. All arithmetic is in GF(2^8): add and
 * subtract are both XOR, and multiplication is performed through the
 * log/exp tables via vdev_raidz_exp2().
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 * Start from the identity; the same row operations applied to rows[]
	 * below are mirrored here, yielding the inverse when rows[] reaches
	 * the identity.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT3U(rows[i][j], ==, 0);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value. The multiplicative
		 * inverse of a = 2^log(a) is 2^(255 - log(a)) since the
		 * group order is 255.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		/*
		 * Eliminate this pivot column from every other row of
		 * interest (XOR is subtraction in GF(2^8)).
		 */
		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT3U(rows[i][j], ==, 0);
			}
		}
	}
}
1005 
/*
 * Regenerate the missing data, computing D = (V|I)'^-1 x (d|p)' (see the
 * block comment above): for each surviving column listed in used[], scale
 * its bytes by the corresponding inverse-row coefficients and XOR the
 * products into each missing column. GF(2^8) multiplication is done by
 * adding logs (mod 255) and exponentiating via vdev_raidz_pow2[].
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	/*
	 * Scratch space for the logs of the inverse-matrix coefficients,
	 * precomputed once so the inner per-byte loop avoids table lookups
	 * for the coefficients.
	 */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			/* log of the source byte; meaningless when zero. */
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				/* Target columns may be shorter; skip. */
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					/* val = *src * invrows[cc][i]. */
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/*
				 * The first source column initializes the
				 * accumulation; subsequent ones XOR in.
				 */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}
1080 
/*
 * Reconstruct an arbitrary combination of failed columns with the general
 * matrix method: select enough surviving parity columns to cover the
 * missing data columns, build the corresponding rows of the matrix,
 * invert them, and multiply by the surviving data and parity. tgts[] is
 * the sorted list of ntgts targeted columns. Returns a non-zero bitmask
 * ("code") identifying which parity columns were used.
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;


	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns. We need one non-targeted parity column per missing
	 * data column; tt walks the (sorted) target list to skip them.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/*
	 * One allocation carved into the row buffers, the inverse-row
	 * buffers, and the used[] column list.
	 */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	/*
	 * used[] lists the n surviving columns we draw from: first the
	 * selected parity columns, then every non-missing data column.
	 */
	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	/* Note: i carries over from the loop above. */
	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}
1186 
/*
 * Reconstruct the data in the targeted columns t[0..nt-1] (which must
 * already be sorted). Any additional columns with recorded errors are
 * added to the target set. When the failure pattern permits (and the
 * vdev_raidz_default_to_general override is off), dispatch to one of the
 * specialized P / Q / PQ routines; otherwise fall back to the general
 * matrix method. Returns a non-zero bitmask of the parity columns used.
 */
static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	/*
	 * Classify every column: explicitly targeted or in error -> bad;
	 * otherwise count it as good data or good parity. nbadparity and
	 * nbaddata start at their maximum possible values and are
	 * decremented for each good column found.
	 */
	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	/*
	 * tgts[] is filled in column order, so the bad parity columns come
	 * first; dt points at the bad data columns that follow them.
	 */
	dt = &tgts[nbadparity];

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	if (!vdev_raidz_default_to_general) {
		switch (nbaddata) {
		case 1:
			if (parity_valid[VDEV_RAIDZ_P])
				return (vdev_raidz_reconstruct_p(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_q(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 2);
			break;

		case 2:
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_P] &&
			    parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_pq(rm, dt, 2));

			ASSERT(rm->rm_firstdatacol > 2);

			break;
		}
	}

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}
1265 
1266 static int
1267 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
1268 {
1269 	vdev_t *cvd;
1270 	uint64_t nparity = vd->vdev_nparity;
1271 	int c;
1272 	int lasterror = 0;
1273 	int numerrors = 0;
1274 
1275 	ASSERT(nparity > 0);
1276 
1277 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
1278 	    vd->vdev_children < nparity + 1) {
1279 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1280 		return (EINVAL);
1281 	}
1282 
1283 	vdev_open_children(vd);
1284 
1285 	for (c = 0; c < vd->vdev_children; c++) {
1286 		cvd = vd->vdev_child[c];
1287 
1288 		if (cvd->vdev_open_error != 0) {
1289 			lasterror = cvd->vdev_open_error;
1290 			numerrors++;
1291 			continue;
1292 		}
1293 
1294 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1295 		*ashift = MAX(*ashift, cvd->vdev_ashift);
1296 	}
1297 
1298 	*asize *= vd->vdev_children;
1299 
1300 	if (numerrors > nparity) {
1301 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1302 		return (lasterror);
1303 	}
1304 
1305 	return (0);
1306 }
1307 
1308 static void
1309 vdev_raidz_close(vdev_t *vd)
1310 {
1311 	int c;
1312 
1313 	for (c = 0; c < vd->vdev_children; c++)
1314 		vdev_close(vd->vdev_child[c]);
1315 }
1316 
1317 static uint64_t
1318 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1319 {
1320 	uint64_t asize;
1321 	uint64_t ashift = vd->vdev_top->vdev_ashift;
1322 	uint64_t cols = vd->vdev_children;
1323 	uint64_t nparity = vd->vdev_nparity;
1324 
1325 	asize = ((psize - 1) >> ashift) + 1;
1326 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1327 	asize = roundup(asize, nparity + 1) << ashift;
1328 
1329 	return (asize);
1330 }
1331 
1332 static void
1333 vdev_raidz_child_done(zio_t *zio)
1334 {
1335 	raidz_col_t *rc = zio->io_private;
1336 
1337 	rc->rc_error = zio->io_error;
1338 	rc->rc_tried = 1;
1339 	rc->rc_skipped = 0;
1340 }
1341 
/*
 * Start an I/O on a RAID-Z vdev. The logical I/O is mapped onto per-child
 * columns; writes generate parity and issue every column (plus optional
 * I/Os for skipped sectors), while reads issue all data columns and any
 * parity required to cover unreadable or stale children.
 */
static int
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	blkptr_t *bp = zio->io_bp;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, i;

	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_generate_parity(rm);

		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		/*
		 * Generate optional I/Os for any skipped sectors to improve
		 * aggregation contiguity. The skipped sectors start at column
		 * rm_skipstart and wrap around to column 0.
		 */
		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
			ASSERT(c <= rm->rm_scols);
			if (c == rm->rm_scols)
				c = 0;
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset + rc->rc_size, NULL,
			    1 << tvd->vdev_ashift,
			    zio->io_type, zio->io_priority,
			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
		}

		return (ZIO_PIPELINE_CONTINUE);
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		if (!vdev_readable(cvd)) {
			/* Device is gone; count it as missing and move on. */
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) {
			/* Device may have stale data for this block. */
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
		/*
		 * Issue reads for all data columns, and for parity columns
		 * only when data is already known missing or when scrubbing
		 * or resilvering (which want to verify everything).
		 */
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}
1429 
1430 /*
1431  * Report a checksum error for a child of a RAID-Z device.
1432  */
1433 static void
1434 raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
1435 {
1436 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1437 
1438 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1439 		mutex_enter(&vd->vdev_stat_lock);
1440 		vd->vdev_stat.vs_checksum_errors++;
1441 		mutex_exit(&vd->vdev_stat_lock);
1442 	}
1443 
1444 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
1445 		zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
1446 		    zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
1447 }
1448 
1449 /*
1450  * Generate the parity from the data columns. If we tried and were able to
1451  * read the parity without error, verify that the generated parity matches the
1452  * data we read. If it doesn't, we fire off a checksum error. Return the
1453  * number such failures.
1454  */
1455 static int
1456 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1457 {
1458 	void *orig[VDEV_RAIDZ_MAXPARITY];
1459 	int c, ret = 0;
1460 	raidz_col_t *rc;
1461 
1462 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1463 		rc = &rm->rm_col[c];
1464 		if (!rc->rc_tried || rc->rc_error != 0)
1465 			continue;
1466 		orig[c] = zio_buf_alloc(rc->rc_size);
1467 		bcopy(rc->rc_data, orig[c], rc->rc_size);
1468 	}
1469 
1470 	vdev_raidz_generate_parity(rm);
1471 
1472 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1473 		rc = &rm->rm_col[c];
1474 		if (!rc->rc_tried || rc->rc_error != 0)
1475 			continue;
1476 		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1477 			raidz_checksum_error(zio, rc);
1478 			rc->rc_error = ECKSUM;
1479 			ret++;
1480 		}
1481 		zio_buf_free(orig[c], rc->rc_size);
1482 	}
1483 
1484 	return (ret);
1485 }
1486 
1487 /*
1488  * Keep statistics on all the ways that we used parity to correct data.
1489  */
1490 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1491 
1492 static int
1493 vdev_raidz_worst_error(raidz_map_t *rm)
1494 {
1495 	int error = 0;
1496 
1497 	for (int c = 0; c < rm->rm_cols; c++)
1498 		error = zio_worst_error(error, rm->rm_col[c].rc_error);
1499 
1500 	return (error);
1501 }
1502 
1503 /*
1504  * Iterate over all combinations of bad data and attempt a reconstruction.
1505  * Note that the algorithm below is non-optimal because it doesn't take into
1506  * account how reconstruction is actually performed. For example, with
1507  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1508  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1509  * cases we'd only use parity information in column 0.
1510  */
1511 static int
1512 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1513 {
1514 	raidz_map_t *rm = zio->io_vsd;
1515 	raidz_col_t *rc;
1516 	void *orig[VDEV_RAIDZ_MAXPARITY];
1517 	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1518 	int *tgts = &tstore[1];
1519 	int current, next, i, c, n;
1520 	int code, ret = 0;
1521 
1522 	ASSERT(total_errors < rm->rm_firstdatacol);
1523 
1524 	/*
1525 	 * This simplifies one edge condition.
1526 	 */
1527 	tgts[-1] = -1;
1528 
1529 	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1530 		/*
1531 		 * Initialize the targets array by finding the first n columns
1532 		 * that contain no error.
1533 		 *
1534 		 * If there were no data errors, we need to ensure that we're
1535 		 * always explicitly attempting to reconstruct at least one
1536 		 * data column. To do this, we simply push the highest target
1537 		 * up into the data columns.
1538 		 */
1539 		for (c = 0, i = 0; i < n; i++) {
1540 			if (i == n - 1 && data_errors == 0 &&
1541 			    c < rm->rm_firstdatacol) {
1542 				c = rm->rm_firstdatacol;
1543 			}
1544 
1545 			while (rm->rm_col[c].rc_error != 0) {
1546 				c++;
1547 				ASSERT3S(c, <, rm->rm_cols);
1548 			}
1549 
1550 			tgts[i] = c++;
1551 		}
1552 
1553 		/*
1554 		 * Setting tgts[n] simplifies the other edge condition.
1555 		 */
1556 		tgts[n] = rm->rm_cols;
1557 
1558 		/*
1559 		 * These buffers were allocated in previous iterations.
1560 		 */
1561 		for (i = 0; i < n - 1; i++) {
1562 			ASSERT(orig[i] != NULL);
1563 		}
1564 
1565 		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
1566 
1567 		current = 0;
1568 		next = tgts[current];
1569 
1570 		while (current != n) {
1571 			tgts[current] = next;
1572 			current = 0;
1573 
1574 			/*
1575 			 * Save off the original data that we're going to
1576 			 * attempt to reconstruct.
1577 			 */
1578 			for (i = 0; i < n; i++) {
1579 				ASSERT(orig[i] != NULL);
1580 				c = tgts[i];
1581 				ASSERT3S(c, >=, 0);
1582 				ASSERT3S(c, <, rm->rm_cols);
1583 				rc = &rm->rm_col[c];
1584 				bcopy(rc->rc_data, orig[i], rc->rc_size);
1585 			}
1586 
1587 			/*
1588 			 * Attempt a reconstruction and exit the outer loop on
1589 			 * success.
1590 			 */
1591 			code = vdev_raidz_reconstruct(rm, tgts, n);
1592 			if (zio_checksum_error(zio) == 0) {
1593 				atomic_inc_64(&raidz_corrected[code]);
1594 
1595 				for (i = 0; i < n; i++) {
1596 					c = tgts[i];
1597 					rc = &rm->rm_col[c];
1598 					ASSERT(rc->rc_error == 0);
1599 					if (rc->rc_tried) {
1600 						if (bcmp(orig[i], rc->rc_data,
1601 						    rc->rc_size) == 0)
1602 							continue;
1603 						raidz_checksum_error(zio, rc);
1604 					}
1605 					rc->rc_error = ECKSUM;
1606 				}
1607 
1608 				ret = code;
1609 				goto done;
1610 			}
1611 
1612 			/*
1613 			 * Restore the original data.
1614 			 */
1615 			for (i = 0; i < n; i++) {
1616 				c = tgts[i];
1617 				rc = &rm->rm_col[c];
1618 				bcopy(orig[i], rc->rc_data, rc->rc_size);
1619 			}
1620 
1621 			do {
1622 				/*
1623 				 * Find the next valid column after the current
1624 				 * position..
1625 				 */
1626 				for (next = tgts[current] + 1;
1627 				    next < rm->rm_cols &&
1628 				    rm->rm_col[next].rc_error != 0; next++)
1629 					continue;
1630 
1631 				ASSERT(next <= tgts[current + 1]);
1632 
1633 				/*
1634 				 * If that spot is available, we're done here.
1635 				 */
1636 				if (next != tgts[current + 1])
1637 					break;
1638 
1639 				/*
1640 				 * Otherwise, find the next valid column after
1641 				 * the previous position.
1642 				 */
1643 				for (c = tgts[current - 1] + 1;
1644 				    rm->rm_col[c].rc_error != 0; c++)
1645 					continue;
1646 
1647 				tgts[current] = c;
1648 				current++;
1649 
1650 			} while (current != n);
1651 		}
1652 	}
1653 	n--;
1654 done:
1655 	for (i = 0; i < n; i++) {
1656 		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
1657 	}
1658 
1659 	return (ret);
1660 }
1661 
/*
 * Complete a RAID-Z I/O. Writes succeed so long as no more than nparity
 * columns failed. Reads walk through up to three escalating recovery
 * phases (documented inline below); on success, any damaged-but-readable
 * children are rewritten with the known-good data.
 */
static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	int unexpected_errors = 0;
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;
	int total_errors = 0;
	int n, c;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Tally the outcome of each column: parity vs. data errors, errors
	 * we didn't anticipate (rc_skipped clear), and parity columns that
	 * were never read.
	 */
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as a success.
		 * (If we couldn't write enough columns to reconstruct
		 * the data, the I/O failed.  Otherwise, good enough.)
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		/* XXPOLICY */
		if (total_errors > rm->rm_firstdatacol)
			zio->io_error = vdev_raidz_worst_error(rm);

		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);
	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (zio_checksum_error(zio) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (zio_checksum_error(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read error or
	 * a child silently returned bad data. Read every block so we can
	 * try again with as much data and parity as we can track down. If
	 * we've already been through once before, all children will be marked
	 * as tried so we'll proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	for (c = 0; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_tried)
			continue;

		/*
		 * At least one column was never read; reissue reads for
		 * every untried column and return to be re-done later.
		 */
		zio_vdev_io_redone(zio);
		do {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_devidx],
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		} while (++c < rm->rm_cols);

		return;
	}

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors >= rm->rm_firstdatacol) {
		zio->io_error = vdev_raidz_worst_error(rm);
		/*
		 * If there were exactly as many device errors as parity
		 * columns, yet we couldn't reconstruct the data, then at
		 * least one device must have returned bad data silently.
		 */
		if (total_errors == rm->rm_firstdatacol)
			zio->io_error = zio_worst_error(zio->io_error, ECKSUM);

	} else if ((code = vdev_raidz_combrec(zio, total_errors,
	    data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(zio, rm);
	} else {
		/*
		 * All combinations failed to checksum. Generate checksum
		 * ereports for all children.
		 */
		zio->io_error = ECKSUM;

		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			for (c = 0; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
				    zio->io_spa, vd->vdev_child[rc->rc_devidx],
				    zio, rc->rc_offset, rc->rc_size);
			}
		}
	}

done:
	zio_checksum_verified(zio);

	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error == 0)
				continue;

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    ZIO_TYPE_WRITE, zio->io_priority,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}
1915 
1916 static void
1917 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
1918 {
1919 	if (faulted > vd->vdev_nparity)
1920 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1921 		    VDEV_AUX_NO_REPLICAS);
1922 	else if (degraded + faulted != 0)
1923 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
1924 	else
1925 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
1926 }
1927 
/*
 * Operations vector wiring the RAID-Z implementation into the generic
 * vdev layer.
 */
vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,	/* open */
	vdev_raidz_close,	/* close */
	vdev_raidz_asize,	/* asize */
	vdev_raidz_io_start,	/* io_start */
	vdev_raidz_io_done,	/* io_done */
	vdev_raidz_state_change,	/* state_change */
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};
1938