xref: /titanic_51/usr/src/uts/common/fs/zfs/vdev_raidz.c (revision b27516f55237249607f754e6e42e865f12456675)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #include <sys/zfs_context.h>
27 #include <sys/spa.h>
28 #include <sys/vdev_impl.h>
29 #include <sys/zio.h>
30 #include <sys/zio_checksum.h>
31 #include <sys/fs/zfs.h>
32 #include <sys/fm/fs/zfs.h>
33 
34 /*
35  * Virtual device vector for RAID-Z.
36  *
37  * This vdev supports single, double, and triple parity. For single parity,
38  * we use a simple XOR of all the data columns. For double or triple parity,
39  * we use a special case of Reed-Solomon coding. This extends the
40  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
41  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
42  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
43  * former is also based. The latter is designed to provide higher performance
44  * for writes.
45  *
46  * Note that the Plank paper claimed to support arbitrary N+M, but was then
47  * amended six years later identifying a critical flaw that invalidates its
48  * claims. Nevertheless, the technique can be adapted to work for up to
49  * triple parity. For additional parity, the amendment "Note: Correction to
50  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
51  * is viable, but the additional complexity means that write performance will
52  * suffer.
53  *
54  * All of the methods above operate on a Galois field, defined over the
55  * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
56  * can be expressed with a single byte. Briefly, the operations on the
57  * field are defined as follows:
58  *
59  *   o addition (+) is represented by a bitwise XOR
60  *   o subtraction (-) is therefore identical to addition: A + B = A - B
61  *   o multiplication of A by 2 is defined by the following bitwise expression:
62  *	(A * 2)_7 = A_6
63  *	(A * 2)_6 = A_5
64  *	(A * 2)_5 = A_4
65  *	(A * 2)_4 = A_3 + A_7
66  *	(A * 2)_3 = A_2 + A_7
67  *	(A * 2)_2 = A_1 + A_7
68  *	(A * 2)_1 = A_0
69  *	(A * 2)_0 = A_7
70  *
71  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
72  * As an aside, this multiplication is derived from the error correcting
73  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
74  *
75  * Observe that any number in the field (except for 0) can be expressed as a
76  * power of 2 -- a generator for the field. We store a table of the powers of
77  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
78  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
79  * than field addition). The inverse of a field element A (A^-1) is therefore
80  * A ^ (255 - 1) = A^254.
81  *
82  * The up-to-three parity columns, P, Q, R over several data columns,
83  * D_0, ... D_n-1, can be expressed by field operations:
84  *
85  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
86  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
87  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
88  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
89  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
90  *
91  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival
92  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
93  * independent coefficients. (There are no additional coefficients that have
94  * this property which is why the uncorrected Plank method breaks down.)
95  *
96  * See the reconstruction code below for how P, Q and R can used individually
97  * or in concert to recover missing data columns.
98  */
99 
100 typedef struct raidz_col {
101 	uint64_t rc_devidx;		/* child device index for I/O */
102 	uint64_t rc_offset;		/* device offset */
103 	uint64_t rc_size;		/* I/O size */
104 	void *rc_data;			/* I/O data */
105 	void *rc_gdata;			/* used to store the "good" version */
106 	int rc_error;			/* I/O error for this device */
107 	uint8_t rc_tried;		/* Did we attempt this I/O column? */
108 	uint8_t rc_skipped;		/* Did we skip this I/O column? */
109 } raidz_col_t;
110 
111 typedef struct raidz_map {
112 	uint64_t rm_cols;		/* Regular column count */
113 	uint64_t rm_scols;		/* Count including skipped columns */
114 	uint64_t rm_bigcols;		/* Number of oversized columns */
115 	uint64_t rm_asize;		/* Actual total I/O size */
116 	uint64_t rm_missingdata;	/* Count of missing data devices */
117 	uint64_t rm_missingparity;	/* Count of missing parity devices */
118 	uint64_t rm_firstdatacol;	/* First data column/parity count */
119 	uint64_t rm_nskip;		/* Skipped sectors for padding */
120 	uint64_t rm_skipstart;	/* Column index of padding start */
121 	void *rm_datacopy;		/* rm_asize-buffer of copied data */
122 	uintptr_t rm_reports;		/* # of referencing checksum reports */
123 	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
124 	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
125 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
126 } raidz_map_t;
127 
128 #define	VDEV_RAIDZ_P		0
129 #define	VDEV_RAIDZ_Q		1
130 #define	VDEV_RAIDZ_R		2
131 
132 #define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
133 #define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
134 
135 /*
136  * We provide a mechanism to perform the field multiplication operation on a
137  * 64-bit value all at once rather than a byte at a time. This works by
138  * creating a mask from the top bit in each byte and using that to
139  * conditionally apply the XOR of 0x1d.
140  */
141 #define	VDEV_RAIDZ_64MUL_2(x, mask) \
142 { \
143 	(mask) = (x) & 0x8080808080808080ULL; \
144 	(mask) = ((mask) << 1) - ((mask) >> 7); \
145 	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
146 	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
147 }
148 
149 #define	VDEV_RAIDZ_64MUL_4(x, mask) \
150 { \
151 	VDEV_RAIDZ_64MUL_2((x), mask); \
152 	VDEV_RAIDZ_64MUL_2((x), mask); \
153 }
154 
155 /*
156  * Force reconstruction to use the general purpose method.
157  */
158 int vdev_raidz_default_to_general;
159 
160 /*
161  * These two tables represent powers and logs of 2 in the Galois field defined
162  * above. These values were computed by repeatedly multiplying by 2 as above.
163  */
164 static const uint8_t vdev_raidz_pow2[256] = {
165 	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
166 	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
167 	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
168 	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
169 	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
170 	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
171 	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
172 	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
173 	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
174 	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
175 	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
176 	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
177 	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
178 	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
179 	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
180 	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
181 	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
182 	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
183 	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
184 	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
185 	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
186 	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
187 	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
188 	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
189 	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
190 	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
191 	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
192 	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
193 	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
194 	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
195 	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
196 	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
197 };
198 static const uint8_t vdev_raidz_log2[256] = {
199 	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
200 	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
201 	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
202 	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
203 	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
204 	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
205 	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
206 	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
207 	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
208 	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
209 	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
210 	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
211 	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
212 	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
213 	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
214 	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
215 	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
216 	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
217 	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
218 	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
219 	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
220 	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
221 	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
222 	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
223 	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
224 	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
225 	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
226 	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
227 	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
228 	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
229 	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
230 	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
231 };
232 
233 static void vdev_raidz_generate_parity(raidz_map_t *rm);
234 
235 /*
236  * Multiply a given number by 2 raised to the given power.
237  */
238 static uint8_t
239 vdev_raidz_exp2(uint_t a, int exp)
240 {
241 	if (a == 0)
242 		return (0);
243 
244 	ASSERT(exp >= 0);
245 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
246 
247 	exp += vdev_raidz_log2[a];
248 	if (exp > 255)
249 		exp -= 255;
250 
251 	return (vdev_raidz_pow2[exp]);
252 }
253 
254 static void
255 vdev_raidz_map_free(raidz_map_t *rm)
256 {
257 	int c;
258 	size_t size;
259 
260 	for (c = 0; c < rm->rm_firstdatacol; c++) {
261 		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
262 
263 		if (rm->rm_col[c].rc_gdata != NULL)
264 			zio_buf_free(rm->rm_col[c].rc_gdata,
265 			    rm->rm_col[c].rc_size);
266 	}
267 
268 	size = 0;
269 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
270 		size += rm->rm_col[c].rc_size;
271 
272 	if (rm->rm_datacopy != NULL)
273 		zio_buf_free(rm->rm_datacopy, size);
274 
275 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
276 }
277 
278 static void
279 vdev_raidz_map_free_vsd(zio_t *zio)
280 {
281 	raidz_map_t *rm = zio->io_vsd;
282 
283 	ASSERT3U(rm->rm_freed, ==, 0);
284 	rm->rm_freed = 1;
285 
286 	if (rm->rm_reports == 0)
287 		vdev_raidz_map_free(rm);
288 }
289 
290 /*ARGSUSED*/
291 static void
292 vdev_raidz_cksum_free(void *arg, size_t ignored)
293 {
294 	raidz_map_t *rm = arg;
295 
296 	ASSERT3U(rm->rm_reports, >, 0);
297 
298 	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
299 		vdev_raidz_map_free(rm);
300 }
301 
302 static void
303 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
304 {
305 	raidz_map_t *rm = zcr->zcr_cbdata;
306 	size_t c = zcr->zcr_cbinfo;
307 	size_t x;
308 
309 	const char *good = NULL;
310 	const char *bad = rm->rm_col[c].rc_data;
311 
312 	if (good_data == NULL) {
313 		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
314 		return;
315 	}
316 
317 	if (c < rm->rm_firstdatacol) {
318 		/*
319 		 * The first time through, calculate the parity blocks for
320 		 * the good data (this relies on the fact that the good
321 		 * data never changes for a given logical ZIO)
322 		 */
323 		if (rm->rm_col[0].rc_gdata == NULL) {
324 			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
325 			char *buf;
326 
327 			/*
328 			 * Set up the rm_col[]s to generate the parity for
329 			 * good_data, first saving the parity bufs and
330 			 * replacing them with buffers to hold the result.
331 			 */
332 			for (x = 0; x < rm->rm_firstdatacol; x++) {
333 				bad_parity[x] = rm->rm_col[x].rc_data;
334 				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
335 				    zio_buf_alloc(rm->rm_col[x].rc_size);
336 			}
337 
338 			/* fill in the data columns from good_data */
339 			buf = (char *)good_data;
340 			for (; x < rm->rm_cols; x++) {
341 				rm->rm_col[x].rc_data = buf;
342 				buf += rm->rm_col[x].rc_size;
343 			}
344 
345 			/*
346 			 * Construct the parity from the good data.
347 			 */
348 			vdev_raidz_generate_parity(rm);
349 
350 			/* restore everything back to its original state */
351 			for (x = 0; x < rm->rm_firstdatacol; x++)
352 				rm->rm_col[x].rc_data = bad_parity[x];
353 
354 			buf = rm->rm_datacopy;
355 			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
356 				rm->rm_col[x].rc_data = buf;
357 				buf += rm->rm_col[x].rc_size;
358 			}
359 		}
360 
361 		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
362 		good = rm->rm_col[c].rc_gdata;
363 	} else {
364 		/* adjust good_data to point at the start of our column */
365 		good = good_data;
366 
367 		for (x = rm->rm_firstdatacol; x < c; x++)
368 			good += rm->rm_col[x].rc_size;
369 	}
370 
371 	/* we drop the ereport if it ends up that the data was good */
372 	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
373 }
374 
375 /*
376  * Invoked indirectly by zfs_ereport_start_checksum(), called
377  * below when our read operation fails completely.  The main point
378  * is to keep a copy of everything we read from disk, so that at
379  * vdev_raidz_cksum_finish() time we can compare it with the good data.
380  */
381 static void
382 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
383 {
384 	size_t c = (size_t)(uintptr_t)arg;
385 	caddr_t buf;
386 
387 	raidz_map_t *rm = zio->io_vsd;
388 	size_t size;
389 
390 	/* set up the report and bump the refcount  */
391 	zcr->zcr_cbdata = rm;
392 	zcr->zcr_cbinfo = c;
393 	zcr->zcr_finish = vdev_raidz_cksum_finish;
394 	zcr->zcr_free = vdev_raidz_cksum_free;
395 
396 	rm->rm_reports++;
397 	ASSERT3U(rm->rm_reports, >, 0);
398 
399 	if (rm->rm_datacopy != NULL)
400 		return;
401 
402 	/*
403 	 * It's the first time we're called for this raidz_map_t, so we need
404 	 * to copy the data aside; there's no guarantee that our zio's buffer
405 	 * won't be re-used for something else.
406 	 *
407 	 * Our parity data is already in separate buffers, so there's no need
408 	 * to copy them.
409 	 */
410 
411 	size = 0;
412 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
413 		size += rm->rm_col[c].rc_size;
414 
415 	buf = rm->rm_datacopy = zio_buf_alloc(size);
416 
417 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
418 		raidz_col_t *col = &rm->rm_col[c];
419 
420 		bcopy(col->rc_data, buf, col->rc_size);
421 		col->rc_data = buf;
422 
423 		buf += col->rc_size;
424 	}
425 	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
426 }
427 
428 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
429 	vdev_raidz_map_free_vsd,
430 	vdev_raidz_cksum_report
431 };
432 
433 static raidz_map_t *
434 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
435     uint64_t nparity)
436 {
437 	raidz_map_t *rm;
438 	uint64_t b = zio->io_offset >> unit_shift;
439 	uint64_t s = zio->io_size >> unit_shift;
440 	uint64_t f = b % dcols;
441 	uint64_t o = (b / dcols) << unit_shift;
442 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
443 
444 	q = s / (dcols - nparity);
445 	r = s - q * (dcols - nparity);
446 	bc = (r == 0 ? 0 : r + nparity);
447 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
448 
449 	if (q == 0) {
450 		acols = bc;
451 		scols = MIN(dcols, roundup(bc, nparity + 1));
452 	} else {
453 		acols = dcols;
454 		scols = dcols;
455 	}
456 
457 	ASSERT3U(acols, <=, scols);
458 
459 	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
460 
461 	rm->rm_cols = acols;
462 	rm->rm_scols = scols;
463 	rm->rm_bigcols = bc;
464 	rm->rm_skipstart = bc;
465 	rm->rm_missingdata = 0;
466 	rm->rm_missingparity = 0;
467 	rm->rm_firstdatacol = nparity;
468 	rm->rm_datacopy = NULL;
469 	rm->rm_reports = 0;
470 	rm->rm_freed = 0;
471 	rm->rm_ecksuminjected = 0;
472 
473 	asize = 0;
474 
475 	for (c = 0; c < scols; c++) {
476 		col = f + c;
477 		coff = o;
478 		if (col >= dcols) {
479 			col -= dcols;
480 			coff += 1ULL << unit_shift;
481 		}
482 		rm->rm_col[c].rc_devidx = col;
483 		rm->rm_col[c].rc_offset = coff;
484 		rm->rm_col[c].rc_data = NULL;
485 		rm->rm_col[c].rc_gdata = NULL;
486 		rm->rm_col[c].rc_error = 0;
487 		rm->rm_col[c].rc_tried = 0;
488 		rm->rm_col[c].rc_skipped = 0;
489 
490 		if (c >= acols)
491 			rm->rm_col[c].rc_size = 0;
492 		else if (c < bc)
493 			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
494 		else
495 			rm->rm_col[c].rc_size = q << unit_shift;
496 
497 		asize += rm->rm_col[c].rc_size;
498 	}
499 
500 	ASSERT3U(asize, ==, tot << unit_shift);
501 	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
502 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
503 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
504 	ASSERT3U(rm->rm_nskip, <=, nparity);
505 
506 	for (c = 0; c < rm->rm_firstdatacol; c++)
507 		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
508 
509 	rm->rm_col[c].rc_data = zio->io_data;
510 
511 	for (c = c + 1; c < acols; c++)
512 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
513 		    rm->rm_col[c - 1].rc_size;
514 
515 	/*
516 	 * If all data stored spans all columns, there's a danger that parity
517 	 * will always be on the same device and, since parity isn't read
518 	 * during normal operation, that that device's I/O bandwidth won't be
519 	 * used effectively. We therefore switch the parity every 1MB.
520 	 *
521 	 * ... at least that was, ostensibly, the theory. As a practical
522 	 * matter unless we juggle the parity between all devices evenly, we
523 	 * won't see any benefit. Further, occasional writes that aren't a
524 	 * multiple of the LCM of the number of children and the minimum
525 	 * stripe width are sufficient to avoid pessimal behavior.
526 	 * Unfortunately, this decision created an implicit on-disk format
527 	 * requirement that we need to support for all eternity, but only
528 	 * for single-parity RAID-Z.
529 	 *
530 	 * If we intend to skip a sector in the zeroth column for padding
531 	 * we must make sure to note this swap. We will never intend to
532 	 * skip the first column since at least one data and one parity
533 	 * column must appear in each row.
534 	 */
535 	ASSERT(rm->rm_cols >= 2);
536 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
537 
538 	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
539 		devidx = rm->rm_col[0].rc_devidx;
540 		o = rm->rm_col[0].rc_offset;
541 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
542 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
543 		rm->rm_col[1].rc_devidx = devidx;
544 		rm->rm_col[1].rc_offset = o;
545 
546 		if (rm->rm_skipstart == 0)
547 			rm->rm_skipstart = 1;
548 	}
549 
550 	zio->io_vsd = rm;
551 	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
552 	return (rm);
553 }
554 
555 static void
556 vdev_raidz_generate_parity_p(raidz_map_t *rm)
557 {
558 	uint64_t *p, *src, pcount, ccount, i;
559 	int c;
560 
561 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
562 
563 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
564 		src = rm->rm_col[c].rc_data;
565 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
566 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
567 
568 		if (c == rm->rm_firstdatacol) {
569 			ASSERT(ccount == pcount);
570 			for (i = 0; i < ccount; i++, src++, p++) {
571 				*p = *src;
572 			}
573 		} else {
574 			ASSERT(ccount <= pcount);
575 			for (i = 0; i < ccount; i++, src++, p++) {
576 				*p ^= *src;
577 			}
578 		}
579 	}
580 }
581 
582 static void
583 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
584 {
585 	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
586 	int c;
587 
588 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
589 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
590 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
591 
592 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
593 		src = rm->rm_col[c].rc_data;
594 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
595 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
596 
597 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
598 
599 		if (c == rm->rm_firstdatacol) {
600 			ASSERT(ccnt == pcnt || ccnt == 0);
601 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
602 				*p = *src;
603 				*q = *src;
604 			}
605 			for (; i < pcnt; i++, src++, p++, q++) {
606 				*p = 0;
607 				*q = 0;
608 			}
609 		} else {
610 			ASSERT(ccnt <= pcnt);
611 
612 			/*
613 			 * Apply the algorithm described above by multiplying
614 			 * the previous result and adding in the new value.
615 			 */
616 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
617 				*p ^= *src;
618 
619 				VDEV_RAIDZ_64MUL_2(*q, mask);
620 				*q ^= *src;
621 			}
622 
623 			/*
624 			 * Treat short columns as though they are full of 0s.
625 			 * Note that there's therefore nothing needed for P.
626 			 */
627 			for (; i < pcnt; i++, q++) {
628 				VDEV_RAIDZ_64MUL_2(*q, mask);
629 			}
630 		}
631 	}
632 }
633 
634 static void
635 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
636 {
637 	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
638 	int c;
639 
640 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
641 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
642 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
643 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
644 	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
645 
646 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
647 		src = rm->rm_col[c].rc_data;
648 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
649 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
650 		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
651 
652 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
653 
654 		if (c == rm->rm_firstdatacol) {
655 			ASSERT(ccnt == pcnt || ccnt == 0);
656 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
657 				*p = *src;
658 				*q = *src;
659 				*r = *src;
660 			}
661 			for (; i < pcnt; i++, src++, p++, q++, r++) {
662 				*p = 0;
663 				*q = 0;
664 				*r = 0;
665 			}
666 		} else {
667 			ASSERT(ccnt <= pcnt);
668 
669 			/*
670 			 * Apply the algorithm described above by multiplying
671 			 * the previous result and adding in the new value.
672 			 */
673 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
674 				*p ^= *src;
675 
676 				VDEV_RAIDZ_64MUL_2(*q, mask);
677 				*q ^= *src;
678 
679 				VDEV_RAIDZ_64MUL_4(*r, mask);
680 				*r ^= *src;
681 			}
682 
683 			/*
684 			 * Treat short columns as though they are full of 0s.
685 			 * Note that there's therefore nothing needed for P.
686 			 */
687 			for (; i < pcnt; i++, q++, r++) {
688 				VDEV_RAIDZ_64MUL_2(*q, mask);
689 				VDEV_RAIDZ_64MUL_4(*r, mask);
690 			}
691 		}
692 	}
693 }
694 
695 /*
696  * Generate RAID parity in the first virtual columns according to the number of
697  * parity columns available.
698  */
699 static void
700 vdev_raidz_generate_parity(raidz_map_t *rm)
701 {
702 	switch (rm->rm_firstdatacol) {
703 	case 1:
704 		vdev_raidz_generate_parity_p(rm);
705 		break;
706 	case 2:
707 		vdev_raidz_generate_parity_pq(rm);
708 		break;
709 	case 3:
710 		vdev_raidz_generate_parity_pqr(rm);
711 		break;
712 	default:
713 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
714 	}
715 }
716 
717 static int
718 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
719 {
720 	uint64_t *dst, *src, xcount, ccount, count, i;
721 	int x = tgts[0];
722 	int c;
723 
724 	ASSERT(ntgts == 1);
725 	ASSERT(x >= rm->rm_firstdatacol);
726 	ASSERT(x < rm->rm_cols);
727 
728 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
729 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
730 	ASSERT(xcount > 0);
731 
732 	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
733 	dst = rm->rm_col[x].rc_data;
734 	for (i = 0; i < xcount; i++, dst++, src++) {
735 		*dst = *src;
736 	}
737 
738 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
739 		src = rm->rm_col[c].rc_data;
740 		dst = rm->rm_col[x].rc_data;
741 
742 		if (c == x)
743 			continue;
744 
745 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
746 		count = MIN(ccount, xcount);
747 
748 		for (i = 0; i < count; i++, dst++, src++) {
749 			*dst ^= *src;
750 		}
751 	}
752 
753 	return (1 << VDEV_RAIDZ_P);
754 }
755 
756 static int
757 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
758 {
759 	uint64_t *dst, *src, xcount, ccount, count, mask, i;
760 	uint8_t *b;
761 	int x = tgts[0];
762 	int c, j, exp;
763 
764 	ASSERT(ntgts == 1);
765 
766 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
767 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
768 
769 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
770 		src = rm->rm_col[c].rc_data;
771 		dst = rm->rm_col[x].rc_data;
772 
773 		if (c == x)
774 			ccount = 0;
775 		else
776 			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
777 
778 		count = MIN(ccount, xcount);
779 
780 		if (c == rm->rm_firstdatacol) {
781 			for (i = 0; i < count; i++, dst++, src++) {
782 				*dst = *src;
783 			}
784 			for (; i < xcount; i++, dst++) {
785 				*dst = 0;
786 			}
787 
788 		} else {
789 			for (i = 0; i < count; i++, dst++, src++) {
790 				VDEV_RAIDZ_64MUL_2(*dst, mask);
791 				*dst ^= *src;
792 			}
793 
794 			for (; i < xcount; i++, dst++) {
795 				VDEV_RAIDZ_64MUL_2(*dst, mask);
796 			}
797 		}
798 	}
799 
800 	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
801 	dst = rm->rm_col[x].rc_data;
802 	exp = 255 - (rm->rm_cols - 1 - x);
803 
804 	for (i = 0; i < xcount; i++, dst++, src++) {
805 		*dst ^= *src;
806 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
807 			*b = vdev_raidz_exp2(*b, exp);
808 		}
809 	}
810 
811 	return (1 << VDEV_RAIDZ_Q);
812 }
813 
814 static int
815 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
816 {
817 	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
818 	void *pdata, *qdata;
819 	uint64_t xsize, ysize, i;
820 	int x = tgts[0];
821 	int y = tgts[1];
822 
823 	ASSERT(ntgts == 2);
824 	ASSERT(x < y);
825 	ASSERT(x >= rm->rm_firstdatacol);
826 	ASSERT(y < rm->rm_cols);
827 
828 	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
829 
830 	/*
831 	 * Move the parity data aside -- we're going to compute parity as
832 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
833 	 * reuse the parity generation mechanism without trashing the actual
834 	 * parity so we make those columns appear to be full of zeros by
835 	 * setting their lengths to zero.
836 	 */
837 	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
838 	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
839 	xsize = rm->rm_col[x].rc_size;
840 	ysize = rm->rm_col[y].rc_size;
841 
842 	rm->rm_col[VDEV_RAIDZ_P].rc_data =
843 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
844 	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
845 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
846 	rm->rm_col[x].rc_size = 0;
847 	rm->rm_col[y].rc_size = 0;
848 
849 	vdev_raidz_generate_parity_pq(rm);
850 
851 	rm->rm_col[x].rc_size = xsize;
852 	rm->rm_col[y].rc_size = ysize;
853 
854 	p = pdata;
855 	q = qdata;
856 	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
857 	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
858 	xd = rm->rm_col[x].rc_data;
859 	yd = rm->rm_col[y].rc_data;
860 
861 	/*
862 	 * We now have:
863 	 *	Pxy = P + D_x + D_y
864 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
865 	 *
866 	 * We can then solve for D_x:
867 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
868 	 * where
869 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
870 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
871 	 *
872 	 * With D_x in hand, we can easily solve for D_y:
873 	 *	D_y = P + Pxy + D_x
874 	 */
875 
876 	a = vdev_raidz_pow2[255 + x - y];
877 	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
878 	tmp = 255 - vdev_raidz_log2[a ^ 1];
879 
880 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
881 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
882 
883 	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
884 		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
885 		    vdev_raidz_exp2(*q ^ *qxy, bexp);
886 
887 		if (i < ysize)
888 			*yd = *p ^ *pxy ^ *xd;
889 	}
890 
891 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
892 	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
893 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
894 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
895 
896 	/*
897 	 * Restore the saved parity data.
898 	 */
899 	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
900 	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
901 
902 	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
903 }
904 
905 /* BEGIN CSTYLED */
906 /*
907  * In the general case of reconstruction, we must solve the system of linear
908  * equations defined by the coeffecients used to generate parity as well as
909  * the contents of the data and parity disks. This can be expressed with
910  * vectors for the original data (D) and the actual data (d) and parity (p)
911  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
912  *
913  *            __   __                     __     __
914  *            |     |         __     __   |  p_0  |
915  *            |  V  |         |  D_0  |   | p_m-1 |
916  *            |     |    x    |   :   | = |  d_0  |
917  *            |  I  |         | D_n-1 |   |   :   |
918  *            |     |         ~~     ~~   | d_n-1 |
919  *            ~~   ~~                     ~~     ~~
920  *
921  * I is simply a square identity matrix of size n, and V is a vandermonde
922  * matrix defined by the coeffecients we chose for the various parity columns
923  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
924  * computation as well as linear separability.
925  *
926  *      __               __               __     __
927  *      |   1   ..  1 1 1 |               |  p_0  |
928  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
929  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
930  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
931  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
932  *      |   :       : : : |   |   :   |   |  d_2  |
933  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
934  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
935  *      |   0   ..  0 0 1 |               | d_n-1 |
936  *      ~~               ~~               ~~     ~~
937  *
938  * Note that I, V, d, and p are known. To compute D, we must invert the
939  * matrix and use the known data and parity values to reconstruct the unknown
940  * data values. We begin by removing the rows in V|I and d|p that correspond
941  * to failed or missing columns; we then make V|I square (n x n) and d|p
942  * sized n by removing rows corresponding to unused parity from the bottom up
943  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
944  * using Gauss-Jordan elimination. In the example below we use m=3 parity
945  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
946  *           __                               __
947  *           |  1   1   1   1   1   1   1   1  |
948  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
949  *           |  19 205 116  29  64  16  4   1  |      / /
950  *           |  1   0   0   0   0   0   0   0  |     / /
951  *           |  0   1   0   0   0   0   0   0  | <--' /
952  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
953  *           |  0   0   0   1   0   0   0   0  |
954  *           |  0   0   0   0   1   0   0   0  |
955  *           |  0   0   0   0   0   1   0   0  |
956  *           |  0   0   0   0   0   0   1   0  |
957  *           |  0   0   0   0   0   0   0   1  |
958  *           ~~                               ~~
959  *           __                               __
960  *           |  1   1   1   1   1   1   1   1  |
961  *           | 128  64  32  16  8   4   2   1  |
962  *           |  19 205 116  29  64  16  4   1  |
963  *           |  1   0   0   0   0   0   0   0  |
964  *           |  0   1   0   0   0   0   0   0  |
965  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
966  *           |  0   0   0   1   0   0   0   0  |
967  *           |  0   0   0   0   1   0   0   0  |
968  *           |  0   0   0   0   0   1   0   0  |
969  *           |  0   0   0   0   0   0   1   0  |
970  *           |  0   0   0   0   0   0   0   1  |
971  *           ~~                               ~~
972  *
973  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
974  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
975  * matrix is not singular.
976  * __                                                                 __
977  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
978  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
979  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
980  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
981  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
982  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
983  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
984  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
985  * ~~                                                                 ~~
986  * __                                                                 __
987  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
988  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
989  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
990  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
991  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
992  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
993  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
994  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
995  * ~~                                                                 ~~
996  * __                                                                 __
997  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
998  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
999  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1000  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1001  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1002  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1003  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1004  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1005  * ~~                                                                 ~~
1006  * __                                                                 __
1007  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1008  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1009  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1010  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1011  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1012  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1013  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1014  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1015  * ~~                                                                 ~~
1016  * __                                                                 __
1017  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1018  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1019  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1020  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1021  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1022  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1023  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1024  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1025  * ~~                                                                 ~~
1026  * __                                                                 __
1027  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1028  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1029  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1030  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1031  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1032  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1033  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1034  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1035  * ~~                                                                 ~~
1036  *                   __                               __
1037  *                   |  0   0   1   0   0   0   0   0  |
1038  *                   | 167 100  5   41 159 169 217 208 |
1039  *                   | 166 100  4   40 158 168 216 209 |
1040  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1041  *                   |  0   0   0   0   1   0   0   0  |
1042  *                   |  0   0   0   0   0   1   0   0  |
1043  *                   |  0   0   0   0   0   0   1   0  |
1044  *                   |  0   0   0   0   0   0   0   1  |
1045  *                   ~~                               ~~
1046  *
1047  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1048  * of the missing data.
1049  *
1050  * As is apparent from the example above, the only non-trivial rows in the
1051  * inverse matrix correspond to the data disks that we're trying to
1052  * reconstruct. Indeed, those are the only rows we need as the others would
1053  * only be useful for reconstructing data known or assumed to be valid. For
1054  * that reason, we only build the coefficients in the rows that correspond to
1055  * targeted columns.
1056  */
1057 /* END CSTYLED */
1058 
1059 static void
1060 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1061     uint8_t **rows)
1062 {
1063 	int i, j;
1064 	int pow;
1065 
1066 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1067 
1068 	/*
1069 	 * Fill in the missing rows of interest.
1070 	 */
1071 	for (i = 0; i < nmap; i++) {
1072 		ASSERT3S(0, <=, map[i]);
1073 		ASSERT3S(map[i], <=, 2);
1074 
1075 		pow = map[i] * n;
1076 		if (pow > 255)
1077 			pow -= 255;
1078 		ASSERT(pow <= 255);
1079 
1080 		for (j = 0; j < n; j++) {
1081 			pow -= map[i];
1082 			if (pow < 0)
1083 				pow += 255;
1084 			rows[i][j] = vdev_raidz_pow2[pow];
1085 		}
1086 	}
1087 }
1088 
1089 static void
1090 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1091     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1092 {
1093 	int i, j, ii, jj;
1094 	uint8_t log;
1095 
1096 	/*
1097 	 * Assert that the first nmissing entries from the array of used
1098 	 * columns correspond to parity columns and that subsequent entries
1099 	 * correspond to data columns.
1100 	 */
1101 	for (i = 0; i < nmissing; i++) {
1102 		ASSERT3S(used[i], <, rm->rm_firstdatacol);
1103 	}
1104 	for (; i < n; i++) {
1105 		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1106 	}
1107 
1108 	/*
1109 	 * First initialize the storage where we'll compute the inverse rows.
1110 	 */
1111 	for (i = 0; i < nmissing; i++) {
1112 		for (j = 0; j < n; j++) {
1113 			invrows[i][j] = (i == j) ? 1 : 0;
1114 		}
1115 	}
1116 
1117 	/*
1118 	 * Subtract all trivial rows from the rows of consequence.
1119 	 */
1120 	for (i = 0; i < nmissing; i++) {
1121 		for (j = nmissing; j < n; j++) {
1122 			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1123 			jj = used[j] - rm->rm_firstdatacol;
1124 			ASSERT3S(jj, <, n);
1125 			invrows[i][j] = rows[i][jj];
1126 			rows[i][jj] = 0;
1127 		}
1128 	}
1129 
1130 	/*
1131 	 * For each of the rows of interest, we must normalize it and subtract
1132 	 * a multiple of it from the other rows.
1133 	 */
1134 	for (i = 0; i < nmissing; i++) {
1135 		for (j = 0; j < missing[i]; j++) {
1136 			ASSERT3U(rows[i][j], ==, 0);
1137 		}
1138 		ASSERT3U(rows[i][missing[i]], !=, 0);
1139 
1140 		/*
1141 		 * Compute the inverse of the first element and multiply each
1142 		 * element in the row by that value.
1143 		 */
1144 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1145 
1146 		for (j = 0; j < n; j++) {
1147 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1148 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1149 		}
1150 
1151 		for (ii = 0; ii < nmissing; ii++) {
1152 			if (i == ii)
1153 				continue;
1154 
1155 			ASSERT3U(rows[ii][missing[i]], !=, 0);
1156 
1157 			log = vdev_raidz_log2[rows[ii][missing[i]]];
1158 
1159 			for (j = 0; j < n; j++) {
1160 				rows[ii][j] ^=
1161 				    vdev_raidz_exp2(rows[i][j], log);
1162 				invrows[ii][j] ^=
1163 				    vdev_raidz_exp2(invrows[i][j], log);
1164 			}
1165 		}
1166 	}
1167 
1168 	/*
1169 	 * Verify that the data that is left in the rows are properly part of
1170 	 * an identity matrix.
1171 	 */
1172 	for (i = 0; i < nmissing; i++) {
1173 		for (j = 0; j < n; j++) {
1174 			if (j == missing[i]) {
1175 				ASSERT3U(rows[i][j], ==, 1);
1176 			} else {
1177 				ASSERT3U(rows[i][j], ==, 0);
1178 			}
1179 		}
1180 	}
1181 }
1182 
1183 static void
1184 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1185     int *missing, uint8_t **invrows, const uint8_t *used)
1186 {
1187 	int i, j, x, cc, c;
1188 	uint8_t *src;
1189 	uint64_t ccount;
1190 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1191 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1192 	uint8_t log, val;
1193 	int ll;
1194 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1195 	uint8_t *p, *pp;
1196 	size_t psize;
1197 
1198 	psize = sizeof (invlog[0][0]) * n * nmissing;
1199 	p = kmem_alloc(psize, KM_SLEEP);
1200 
1201 	for (pp = p, i = 0; i < nmissing; i++) {
1202 		invlog[i] = pp;
1203 		pp += n;
1204 	}
1205 
1206 	for (i = 0; i < nmissing; i++) {
1207 		for (j = 0; j < n; j++) {
1208 			ASSERT3U(invrows[i][j], !=, 0);
1209 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1210 		}
1211 	}
1212 
1213 	for (i = 0; i < n; i++) {
1214 		c = used[i];
1215 		ASSERT3U(c, <, rm->rm_cols);
1216 
1217 		src = rm->rm_col[c].rc_data;
1218 		ccount = rm->rm_col[c].rc_size;
1219 		for (j = 0; j < nmissing; j++) {
1220 			cc = missing[j] + rm->rm_firstdatacol;
1221 			ASSERT3U(cc, >=, rm->rm_firstdatacol);
1222 			ASSERT3U(cc, <, rm->rm_cols);
1223 			ASSERT3U(cc, !=, c);
1224 
1225 			dst[j] = rm->rm_col[cc].rc_data;
1226 			dcount[j] = rm->rm_col[cc].rc_size;
1227 		}
1228 
1229 		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1230 
1231 		for (x = 0; x < ccount; x++, src++) {
1232 			if (*src != 0)
1233 				log = vdev_raidz_log2[*src];
1234 
1235 			for (cc = 0; cc < nmissing; cc++) {
1236 				if (x >= dcount[cc])
1237 					continue;
1238 
1239 				if (*src == 0) {
1240 					val = 0;
1241 				} else {
1242 					if ((ll = log + invlog[cc][i]) >= 255)
1243 						ll -= 255;
1244 					val = vdev_raidz_pow2[ll];
1245 				}
1246 
1247 				if (i == 0)
1248 					dst[cc][x] = val;
1249 				else
1250 					dst[cc][x] ^= val;
1251 			}
1252 		}
1253 	}
1254 
1255 	kmem_free(p, psize);
1256 }
1257 
1258 static int
1259 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1260 {
1261 	int n, i, c, t, tt;
1262 	int nmissing_rows;
1263 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1264 	int parity_map[VDEV_RAIDZ_MAXPARITY];
1265 
1266 	uint8_t *p, *pp;
1267 	size_t psize;
1268 
1269 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1270 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1271 	uint8_t *used;
1272 
1273 	int code = 0;
1274 
1275 
1276 	n = rm->rm_cols - rm->rm_firstdatacol;
1277 
1278 	/*
1279 	 * Figure out which data columns are missing.
1280 	 */
1281 	nmissing_rows = 0;
1282 	for (t = 0; t < ntgts; t++) {
1283 		if (tgts[t] >= rm->rm_firstdatacol) {
1284 			missing_rows[nmissing_rows++] =
1285 			    tgts[t] - rm->rm_firstdatacol;
1286 		}
1287 	}
1288 
1289 	/*
1290 	 * Figure out which parity columns to use to help generate the missing
1291 	 * data columns.
1292 	 */
1293 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1294 		ASSERT(tt < ntgts);
1295 		ASSERT(c < rm->rm_firstdatacol);
1296 
1297 		/*
1298 		 * Skip any targeted parity columns.
1299 		 */
1300 		if (c == tgts[tt]) {
1301 			tt++;
1302 			continue;
1303 		}
1304 
1305 		code |= 1 << c;
1306 
1307 		parity_map[i] = c;
1308 		i++;
1309 	}
1310 
1311 	ASSERT(code != 0);
1312 	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1313 
1314 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1315 	    nmissing_rows * n + sizeof (used[0]) * n;
1316 	p = kmem_alloc(psize, KM_SLEEP);
1317 
1318 	for (pp = p, i = 0; i < nmissing_rows; i++) {
1319 		rows[i] = pp;
1320 		pp += n;
1321 		invrows[i] = pp;
1322 		pp += n;
1323 	}
1324 	used = pp;
1325 
1326 	for (i = 0; i < nmissing_rows; i++) {
1327 		used[i] = parity_map[i];
1328 	}
1329 
1330 	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1331 		if (tt < nmissing_rows &&
1332 		    c == missing_rows[tt] + rm->rm_firstdatacol) {
1333 			tt++;
1334 			continue;
1335 		}
1336 
1337 		ASSERT3S(i, <, n);
1338 		used[i] = c;
1339 		i++;
1340 	}
1341 
1342 	/*
1343 	 * Initialize the interesting rows of the matrix.
1344 	 */
1345 	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1346 
1347 	/*
1348 	 * Invert the matrix.
1349 	 */
1350 	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1351 	    invrows, used);
1352 
1353 	/*
1354 	 * Reconstruct the missing data using the generated matrix.
1355 	 */
1356 	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1357 	    invrows, used);
1358 
1359 	kmem_free(p, psize);
1360 
1361 	return (code);
1362 }
1363 
1364 static int
1365 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1366 {
1367 	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1368 	int ntgts;
1369 	int i, c;
1370 	int code;
1371 	int nbadparity, nbaddata;
1372 	int parity_valid[VDEV_RAIDZ_MAXPARITY];
1373 
1374 	/*
1375 	 * The tgts list must already be sorted.
1376 	 */
1377 	for (i = 1; i < nt; i++) {
1378 		ASSERT(t[i] > t[i - 1]);
1379 	}
1380 
1381 	nbadparity = rm->rm_firstdatacol;
1382 	nbaddata = rm->rm_cols - nbadparity;
1383 	ntgts = 0;
1384 	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1385 		if (c < rm->rm_firstdatacol)
1386 			parity_valid[c] = B_FALSE;
1387 
1388 		if (i < nt && c == t[i]) {
1389 			tgts[ntgts++] = c;
1390 			i++;
1391 		} else if (rm->rm_col[c].rc_error != 0) {
1392 			tgts[ntgts++] = c;
1393 		} else if (c >= rm->rm_firstdatacol) {
1394 			nbaddata--;
1395 		} else {
1396 			parity_valid[c] = B_TRUE;
1397 			nbadparity--;
1398 		}
1399 	}
1400 
1401 	ASSERT(ntgts >= nt);
1402 	ASSERT(nbaddata >= 0);
1403 	ASSERT(nbaddata + nbadparity == ntgts);
1404 
1405 	dt = &tgts[nbadparity];
1406 
1407 	/*
1408 	 * See if we can use any of our optimized reconstruction routines.
1409 	 */
1410 	if (!vdev_raidz_default_to_general) {
1411 		switch (nbaddata) {
1412 		case 1:
1413 			if (parity_valid[VDEV_RAIDZ_P])
1414 				return (vdev_raidz_reconstruct_p(rm, dt, 1));
1415 
1416 			ASSERT(rm->rm_firstdatacol > 1);
1417 
1418 			if (parity_valid[VDEV_RAIDZ_Q])
1419 				return (vdev_raidz_reconstruct_q(rm, dt, 1));
1420 
1421 			ASSERT(rm->rm_firstdatacol > 2);
1422 			break;
1423 
1424 		case 2:
1425 			ASSERT(rm->rm_firstdatacol > 1);
1426 
1427 			if (parity_valid[VDEV_RAIDZ_P] &&
1428 			    parity_valid[VDEV_RAIDZ_Q])
1429 				return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1430 
1431 			ASSERT(rm->rm_firstdatacol > 2);
1432 
1433 			break;
1434 		}
1435 	}
1436 
1437 	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1438 	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1439 	ASSERT(code > 0);
1440 	return (code);
1441 }
1442 
1443 static int
1444 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
1445 {
1446 	vdev_t *cvd;
1447 	uint64_t nparity = vd->vdev_nparity;
1448 	int c;
1449 	int lasterror = 0;
1450 	int numerrors = 0;
1451 
1452 	ASSERT(nparity > 0);
1453 
1454 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
1455 	    vd->vdev_children < nparity + 1) {
1456 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1457 		return (EINVAL);
1458 	}
1459 
1460 	vdev_open_children(vd);
1461 
1462 	for (c = 0; c < vd->vdev_children; c++) {
1463 		cvd = vd->vdev_child[c];
1464 
1465 		if (cvd->vdev_open_error != 0) {
1466 			lasterror = cvd->vdev_open_error;
1467 			numerrors++;
1468 			continue;
1469 		}
1470 
1471 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1472 		*ashift = MAX(*ashift, cvd->vdev_ashift);
1473 	}
1474 
1475 	*asize *= vd->vdev_children;
1476 
1477 	if (numerrors > nparity) {
1478 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1479 		return (lasterror);
1480 	}
1481 
1482 	return (0);
1483 }
1484 
1485 static void
1486 vdev_raidz_close(vdev_t *vd)
1487 {
1488 	int c;
1489 
1490 	for (c = 0; c < vd->vdev_children; c++)
1491 		vdev_close(vd->vdev_child[c]);
1492 }
1493 
1494 static uint64_t
1495 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1496 {
1497 	uint64_t asize;
1498 	uint64_t ashift = vd->vdev_top->vdev_ashift;
1499 	uint64_t cols = vd->vdev_children;
1500 	uint64_t nparity = vd->vdev_nparity;
1501 
1502 	asize = ((psize - 1) >> ashift) + 1;
1503 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1504 	asize = roundup(asize, nparity + 1) << ashift;
1505 
1506 	return (asize);
1507 }
1508 
1509 static void
1510 vdev_raidz_child_done(zio_t *zio)
1511 {
1512 	raidz_col_t *rc = zio->io_private;
1513 
1514 	rc->rc_error = zio->io_error;
1515 	rc->rc_tried = 1;
1516 	rc->rc_skipped = 0;
1517 }
1518 
1519 static int
1520 vdev_raidz_io_start(zio_t *zio)
1521 {
1522 	vdev_t *vd = zio->io_vd;
1523 	vdev_t *tvd = vd->vdev_top;
1524 	vdev_t *cvd;
1525 	raidz_map_t *rm;
1526 	raidz_col_t *rc;
1527 	int c, i;
1528 
1529 	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
1530 	    vd->vdev_nparity);
1531 
1532 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1533 
1534 	if (zio->io_type == ZIO_TYPE_WRITE) {
1535 		vdev_raidz_generate_parity(rm);
1536 
1537 		for (c = 0; c < rm->rm_cols; c++) {
1538 			rc = &rm->rm_col[c];
1539 			cvd = vd->vdev_child[rc->rc_devidx];
1540 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1541 			    rc->rc_offset, rc->rc_data, rc->rc_size,
1542 			    zio->io_type, zio->io_priority, 0,
1543 			    vdev_raidz_child_done, rc));
1544 		}
1545 
1546 		/*
1547 		 * Generate optional I/Os for any skipped sectors to improve
1548 		 * aggregation contiguity.
1549 		 */
1550 		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1551 			ASSERT(c <= rm->rm_scols);
1552 			if (c == rm->rm_scols)
1553 				c = 0;
1554 			rc = &rm->rm_col[c];
1555 			cvd = vd->vdev_child[rc->rc_devidx];
1556 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1557 			    rc->rc_offset + rc->rc_size, NULL,
1558 			    1 << tvd->vdev_ashift,
1559 			    zio->io_type, zio->io_priority,
1560 			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1561 		}
1562 
1563 		return (ZIO_PIPELINE_CONTINUE);
1564 	}
1565 
1566 	ASSERT(zio->io_type == ZIO_TYPE_READ);
1567 
1568 	/*
1569 	 * Iterate over the columns in reverse order so that we hit the parity
1570 	 * last -- any errors along the way will force us to read the parity.
1571 	 */
1572 	for (c = rm->rm_cols - 1; c >= 0; c--) {
1573 		rc = &rm->rm_col[c];
1574 		cvd = vd->vdev_child[rc->rc_devidx];
1575 		if (!vdev_readable(cvd)) {
1576 			if (c >= rm->rm_firstdatacol)
1577 				rm->rm_missingdata++;
1578 			else
1579 				rm->rm_missingparity++;
1580 			rc->rc_error = ENXIO;
1581 			rc->rc_tried = 1;	/* don't even try */
1582 			rc->rc_skipped = 1;
1583 			continue;
1584 		}
1585 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1586 			if (c >= rm->rm_firstdatacol)
1587 				rm->rm_missingdata++;
1588 			else
1589 				rm->rm_missingparity++;
1590 			rc->rc_error = ESTALE;
1591 			rc->rc_skipped = 1;
1592 			continue;
1593 		}
1594 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
1595 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1596 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1597 			    rc->rc_offset, rc->rc_data, rc->rc_size,
1598 			    zio->io_type, zio->io_priority, 0,
1599 			    vdev_raidz_child_done, rc));
1600 		}
1601 	}
1602 
1603 	return (ZIO_PIPELINE_CONTINUE);
1604 }
1605 
1606 
1607 /*
1608  * Report a checksum error for a child of a RAID-Z device.
1609  */
1610 static void
1611 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1612 {
1613 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1614 
1615 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1616 		zio_bad_cksum_t zbc;
1617 		raidz_map_t *rm = zio->io_vsd;
1618 
1619 		mutex_enter(&vd->vdev_stat_lock);
1620 		vd->vdev_stat.vs_checksum_errors++;
1621 		mutex_exit(&vd->vdev_stat_lock);
1622 
1623 		zbc.zbc_has_cksum = 0;
1624 		zbc.zbc_injected = rm->rm_ecksuminjected;
1625 
1626 		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1627 		    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1628 		    &zbc);
1629 	}
1630 }
1631 
1632 /*
1633  * We keep track of whether or not there were any injected errors, so that
1634  * any ereports we generate can note it.
1635  */
1636 static int
1637 raidz_checksum_verify(zio_t *zio)
1638 {
1639 	zio_bad_cksum_t zbc;
1640 	raidz_map_t *rm = zio->io_vsd;
1641 
1642 	int ret = zio_checksum_error(zio, &zbc);
1643 	if (ret != 0 && zbc.zbc_injected != 0)
1644 		rm->rm_ecksuminjected = 1;
1645 
1646 	return (ret);
1647 }
1648 
1649 /*
1650  * Generate the parity from the data columns. If we tried and were able to
1651  * read the parity without error, verify that the generated parity matches the
1652  * data we read. If it doesn't, we fire off a checksum error. Return the
1653  * number such failures.
1654  */
1655 static int
1656 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1657 {
1658 	void *orig[VDEV_RAIDZ_MAXPARITY];
1659 	int c, ret = 0;
1660 	raidz_col_t *rc;
1661 
1662 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1663 		rc = &rm->rm_col[c];
1664 		if (!rc->rc_tried || rc->rc_error != 0)
1665 			continue;
1666 		orig[c] = zio_buf_alloc(rc->rc_size);
1667 		bcopy(rc->rc_data, orig[c], rc->rc_size);
1668 	}
1669 
1670 	vdev_raidz_generate_parity(rm);
1671 
1672 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1673 		rc = &rm->rm_col[c];
1674 		if (!rc->rc_tried || rc->rc_error != 0)
1675 			continue;
1676 		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1677 			raidz_checksum_error(zio, rc, orig[c]);
1678 			rc->rc_error = ECKSUM;
1679 			ret++;
1680 		}
1681 		zio_buf_free(orig[c], rc->rc_size);
1682 	}
1683 
1684 	return (ret);
1685 }
1686 
1687 /*
1688  * Keep statistics on all the ways that we used parity to correct data.
1689  */
1690 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1691 
1692 static int
1693 vdev_raidz_worst_error(raidz_map_t *rm)
1694 {
1695 	int error = 0;
1696 
1697 	for (int c = 0; c < rm->rm_cols; c++)
1698 		error = zio_worst_error(error, rm->rm_col[c].rc_error);
1699 
1700 	return (error);
1701 }
1702 
1703 /*
1704  * Iterate over all combinations of bad data and attempt a reconstruction.
1705  * Note that the algorithm below is non-optimal because it doesn't take into
1706  * account how reconstruction is actually performed. For example, with
1707  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1708  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1709  * cases we'd only use parity information in column 0.
1710  */
1711 static int
1712 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1713 {
1714 	raidz_map_t *rm = zio->io_vsd;
1715 	raidz_col_t *rc;
1716 	void *orig[VDEV_RAIDZ_MAXPARITY];
1717 	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1718 	int *tgts = &tstore[1];
1719 	int current, next, i, c, n;
1720 	int code, ret = 0;
1721 
1722 	ASSERT(total_errors < rm->rm_firstdatacol);
1723 
1724 	/*
1725 	 * This simplifies one edge condition.
1726 	 */
1727 	tgts[-1] = -1;
1728 
1729 	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1730 		/*
1731 		 * Initialize the targets array by finding the first n columns
1732 		 * that contain no error.
1733 		 *
1734 		 * If there were no data errors, we need to ensure that we're
1735 		 * always explicitly attempting to reconstruct at least one
1736 		 * data column. To do this, we simply push the highest target
1737 		 * up into the data columns.
1738 		 */
1739 		for (c = 0, i = 0; i < n; i++) {
1740 			if (i == n - 1 && data_errors == 0 &&
1741 			    c < rm->rm_firstdatacol) {
1742 				c = rm->rm_firstdatacol;
1743 			}
1744 
1745 			while (rm->rm_col[c].rc_error != 0) {
1746 				c++;
1747 				ASSERT3S(c, <, rm->rm_cols);
1748 			}
1749 
1750 			tgts[i] = c++;
1751 		}
1752 
1753 		/*
1754 		 * Setting tgts[n] simplifies the other edge condition.
1755 		 */
1756 		tgts[n] = rm->rm_cols;
1757 
1758 		/*
1759 		 * These buffers were allocated in previous iterations.
1760 		 */
1761 		for (i = 0; i < n - 1; i++) {
1762 			ASSERT(orig[i] != NULL);
1763 		}
1764 
1765 		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
1766 
1767 		current = 0;
1768 		next = tgts[current];
1769 
1770 		while (current != n) {
1771 			tgts[current] = next;
1772 			current = 0;
1773 
1774 			/*
1775 			 * Save off the original data that we're going to
1776 			 * attempt to reconstruct.
1777 			 */
1778 			for (i = 0; i < n; i++) {
1779 				ASSERT(orig[i] != NULL);
1780 				c = tgts[i];
1781 				ASSERT3S(c, >=, 0);
1782 				ASSERT3S(c, <, rm->rm_cols);
1783 				rc = &rm->rm_col[c];
1784 				bcopy(rc->rc_data, orig[i], rc->rc_size);
1785 			}
1786 
1787 			/*
1788 			 * Attempt a reconstruction and exit the outer loop on
1789 			 * success.
1790 			 */
1791 			code = vdev_raidz_reconstruct(rm, tgts, n);
1792 			if (raidz_checksum_verify(zio) == 0) {
1793 				atomic_inc_64(&raidz_corrected[code]);
1794 
1795 				for (i = 0; i < n; i++) {
1796 					c = tgts[i];
1797 					rc = &rm->rm_col[c];
1798 					ASSERT(rc->rc_error == 0);
1799 					if (rc->rc_tried)
1800 						raidz_checksum_error(zio, rc,
1801 						    orig[i]);
1802 					rc->rc_error = ECKSUM;
1803 				}
1804 
1805 				ret = code;
1806 				goto done;
1807 			}
1808 
1809 			/*
1810 			 * Restore the original data.
1811 			 */
1812 			for (i = 0; i < n; i++) {
1813 				c = tgts[i];
1814 				rc = &rm->rm_col[c];
1815 				bcopy(orig[i], rc->rc_data, rc->rc_size);
1816 			}
1817 
1818 			do {
1819 				/*
1820 				 * Find the next valid column after the current
1821 				 * position..
1822 				 */
1823 				for (next = tgts[current] + 1;
1824 				    next < rm->rm_cols &&
1825 				    rm->rm_col[next].rc_error != 0; next++)
1826 					continue;
1827 
1828 				ASSERT(next <= tgts[current + 1]);
1829 
1830 				/*
1831 				 * If that spot is available, we're done here.
1832 				 */
1833 				if (next != tgts[current + 1])
1834 					break;
1835 
1836 				/*
1837 				 * Otherwise, find the next valid column after
1838 				 * the previous position.
1839 				 */
1840 				for (c = tgts[current - 1] + 1;
1841 				    rm->rm_col[c].rc_error != 0; c++)
1842 					continue;
1843 
1844 				tgts[current] = c;
1845 				current++;
1846 
1847 			} while (current != n);
1848 		}
1849 	}
1850 	n--;
1851 done:
1852 	for (i = 0; i < n; i++) {
1853 		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
1854 	}
1855 
1856 	return (ret);
1857 }
1858 
1859 static void
1860 vdev_raidz_io_done(zio_t *zio)
1861 {
1862 	vdev_t *vd = zio->io_vd;
1863 	vdev_t *cvd;
1864 	raidz_map_t *rm = zio->io_vsd;
1865 	raidz_col_t *rc;
1866 	int unexpected_errors = 0;
1867 	int parity_errors = 0;
1868 	int parity_untried = 0;
1869 	int data_errors = 0;
1870 	int total_errors = 0;
1871 	int n, c;
1872 	int tgts[VDEV_RAIDZ_MAXPARITY];
1873 	int code;
1874 
1875 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
1876 
1877 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1878 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1879 
1880 	for (c = 0; c < rm->rm_cols; c++) {
1881 		rc = &rm->rm_col[c];
1882 
1883 		if (rc->rc_error) {
1884 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
1885 
1886 			if (c < rm->rm_firstdatacol)
1887 				parity_errors++;
1888 			else
1889 				data_errors++;
1890 
1891 			if (!rc->rc_skipped)
1892 				unexpected_errors++;
1893 
1894 			total_errors++;
1895 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1896 			parity_untried++;
1897 		}
1898 	}
1899 
1900 	if (zio->io_type == ZIO_TYPE_WRITE) {
1901 		/*
1902 		 * XXX -- for now, treat partial writes as a success.
1903 		 * (If we couldn't write enough columns to reconstruct
1904 		 * the data, the I/O failed.  Otherwise, good enough.)
1905 		 *
1906 		 * Now that we support write reallocation, it would be better
1907 		 * to treat partial failure as real failure unless there are
1908 		 * no non-degraded top-level vdevs left, and not update DTLs
1909 		 * if we intend to reallocate.
1910 		 */
1911 		/* XXPOLICY */
1912 		if (total_errors > rm->rm_firstdatacol)
1913 			zio->io_error = vdev_raidz_worst_error(rm);
1914 
1915 		return;
1916 	}
1917 
1918 	ASSERT(zio->io_type == ZIO_TYPE_READ);
1919 	/*
1920 	 * There are three potential phases for a read:
1921 	 *	1. produce valid data from the columns read
1922 	 *	2. read all disks and try again
1923 	 *	3. perform combinatorial reconstruction
1924 	 *
1925 	 * Each phase is progressively both more expensive and less likely to
1926 	 * occur. If we encounter more errors than we can repair or all phases
1927 	 * fail, we have no choice but to return an error.
1928 	 */
1929 
1930 	/*
1931 	 * If the number of errors we saw was correctable -- less than or equal
1932 	 * to the number of parity disks read -- attempt to produce data that
1933 	 * has a valid checksum. Naturally, this case applies in the absence of
1934 	 * any errors.
1935 	 */
1936 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1937 		if (data_errors == 0) {
1938 			if (raidz_checksum_verify(zio) == 0) {
1939 				/*
1940 				 * If we read parity information (unnecessarily
1941 				 * as it happens since no reconstruction was
1942 				 * needed) regenerate and verify the parity.
1943 				 * We also regenerate parity when resilvering
1944 				 * so we can write it out to the failed device
1945 				 * later.
1946 				 */
1947 				if (parity_errors + parity_untried <
1948 				    rm->rm_firstdatacol ||
1949 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
1950 					n = raidz_parity_verify(zio, rm);
1951 					unexpected_errors += n;
1952 					ASSERT(parity_errors + n <=
1953 					    rm->rm_firstdatacol);
1954 				}
1955 				goto done;
1956 			}
1957 		} else {
1958 			/*
1959 			 * We either attempt to read all the parity columns or
1960 			 * none of them. If we didn't try to read parity, we
1961 			 * wouldn't be here in the correctable case. There must
1962 			 * also have been fewer parity errors than parity
1963 			 * columns or, again, we wouldn't be in this code path.
1964 			 */
1965 			ASSERT(parity_untried == 0);
1966 			ASSERT(parity_errors < rm->rm_firstdatacol);
1967 
1968 			/*
1969 			 * Identify the data columns that reported an error.
1970 			 */
1971 			n = 0;
1972 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1973 				rc = &rm->rm_col[c];
1974 				if (rc->rc_error != 0) {
1975 					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1976 					tgts[n++] = c;
1977 				}
1978 			}
1979 
1980 			ASSERT(rm->rm_firstdatacol >= n);
1981 
1982 			code = vdev_raidz_reconstruct(rm, tgts, n);
1983 
1984 			if (raidz_checksum_verify(zio) == 0) {
1985 				atomic_inc_64(&raidz_corrected[code]);
1986 
1987 				/*
1988 				 * If we read more parity disks than were used
1989 				 * for reconstruction, confirm that the other
1990 				 * parity disks produced correct data. This
1991 				 * routine is suboptimal in that it regenerates
1992 				 * the parity that we already used in addition
1993 				 * to the parity that we're attempting to
1994 				 * verify, but this should be a relatively
1995 				 * uncommon case, and can be optimized if it
1996 				 * becomes a problem. Note that we regenerate
1997 				 * parity when resilvering so we can write it
1998 				 * out to failed devices later.
1999 				 */
2000 				if (parity_errors < rm->rm_firstdatacol - n ||
2001 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2002 					n = raidz_parity_verify(zio, rm);
2003 					unexpected_errors += n;
2004 					ASSERT(parity_errors + n <=
2005 					    rm->rm_firstdatacol);
2006 				}
2007 
2008 				goto done;
2009 			}
2010 		}
2011 	}
2012 
2013 	/*
2014 	 * This isn't a typical situation -- either we got a read error or
2015 	 * a child silently returned bad data. Read every block so we can
2016 	 * try again with as much data and parity as we can track down. If
2017 	 * we've already been through once before, all children will be marked
2018 	 * as tried so we'll proceed to combinatorial reconstruction.
2019 	 */
2020 	unexpected_errors = 1;
2021 	rm->rm_missingdata = 0;
2022 	rm->rm_missingparity = 0;
2023 
2024 	for (c = 0; c < rm->rm_cols; c++) {
2025 		if (rm->rm_col[c].rc_tried)
2026 			continue;
2027 
2028 		zio_vdev_io_redone(zio);
2029 		do {
2030 			rc = &rm->rm_col[c];
2031 			if (rc->rc_tried)
2032 				continue;
2033 			zio_nowait(zio_vdev_child_io(zio, NULL,
2034 			    vd->vdev_child[rc->rc_devidx],
2035 			    rc->rc_offset, rc->rc_data, rc->rc_size,
2036 			    zio->io_type, zio->io_priority, 0,
2037 			    vdev_raidz_child_done, rc));
2038 		} while (++c < rm->rm_cols);
2039 
2040 		return;
2041 	}
2042 
2043 	/*
2044 	 * At this point we've attempted to reconstruct the data given the
2045 	 * errors we detected, and we've attempted to read all columns. There
2046 	 * must, therefore, be one or more additional problems -- silent errors
2047 	 * resulting in invalid data rather than explicit I/O errors resulting
2048 	 * in absent data. We check if there is enough additional data to
2049 	 * possibly reconstruct the data and then perform combinatorial
2050 	 * reconstruction over all possible combinations. If that fails,
2051 	 * we're cooked.
2052 	 */
2053 	if (total_errors > rm->rm_firstdatacol) {
2054 		zio->io_error = vdev_raidz_worst_error(rm);
2055 
2056 	} else if (total_errors < rm->rm_firstdatacol &&
2057 	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2058 		/*
2059 		 * If we didn't use all the available parity for the
2060 		 * combinatorial reconstruction, verify that the remaining
2061 		 * parity is correct.
2062 		 */
2063 		if (code != (1 << rm->rm_firstdatacol) - 1)
2064 			(void) raidz_parity_verify(zio, rm);
2065 	} else {
2066 		/*
2067 		 * We're here because either:
2068 		 *
2069 		 *	total_errors == rm_first_datacol, or
2070 		 *	vdev_raidz_combrec() failed
2071 		 *
2072 		 * In either case, there is enough bad data to prevent
2073 		 * reconstruction.
2074 		 *
2075 		 * Start checksum ereports for all children which haven't
2076 		 * failed, and the IO wasn't speculative.
2077 		 */
2078 		zio->io_error = ECKSUM;
2079 
2080 		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2081 			for (c = 0; c < rm->rm_cols; c++) {
2082 				rc = &rm->rm_col[c];
2083 				if (rc->rc_error == 0) {
2084 					zio_bad_cksum_t zbc;
2085 					zbc.zbc_has_cksum = 0;
2086 					zbc.zbc_injected =
2087 					    rm->rm_ecksuminjected;
2088 
2089 					zfs_ereport_start_checksum(
2090 					    zio->io_spa,
2091 					    vd->vdev_child[rc->rc_devidx],
2092 					    zio, rc->rc_offset, rc->rc_size,
2093 					    (void *)(uintptr_t)c, &zbc);
2094 				}
2095 			}
2096 		}
2097 	}
2098 
2099 done:
2100 	zio_checksum_verified(zio);
2101 
2102 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2103 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2104 		/*
2105 		 * Use the good data we have in hand to repair damaged children.
2106 		 */
2107 		for (c = 0; c < rm->rm_cols; c++) {
2108 			rc = &rm->rm_col[c];
2109 			cvd = vd->vdev_child[rc->rc_devidx];
2110 
2111 			if (rc->rc_error == 0)
2112 				continue;
2113 
2114 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2115 			    rc->rc_offset, rc->rc_data, rc->rc_size,
2116 			    ZIO_TYPE_WRITE, zio->io_priority,
2117 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2118 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2119 		}
2120 	}
2121 }
2122 
2123 static void
2124 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2125 {
2126 	if (faulted > vd->vdev_nparity)
2127 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2128 		    VDEV_AUX_NO_REPLICAS);
2129 	else if (degraded + faulted != 0)
2130 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2131 	else
2132 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2133 }
2134 
2135 vdev_ops_t vdev_raidz_ops = {
2136 	vdev_raidz_open,
2137 	vdev_raidz_close,
2138 	vdev_raidz_asize,
2139 	vdev_raidz_io_start,
2140 	vdev_raidz_io_done,
2141 	vdev_raidz_state_change,
2142 	NULL,
2143 	NULL,
2144 	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
2145 	B_FALSE			/* not a leaf vdev */
2146 };
2147