xref: /illumos-gate/usr/src/uts/common/fs/zfs/vdev_raidz.c (revision 5bbb4db2c3f208d12bf0fd11769728f9e5ba66a2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/zfs_context.h>
28 #include <sys/spa.h>
29 #include <sys/vdev_impl.h>
30 #include <sys/zio.h>
31 #include <sys/zio_checksum.h>
32 #include <sys/fs/zfs.h>
33 #include <sys/fm/fs/zfs.h>
34 
35 /*
36  * Virtual device vector for RAID-Z.
37  *
38  * This vdev supports single, double, and triple parity. For single parity,
39  * we use a simple XOR of all the data columns. For double or triple parity,
40  * we use a special case of Reed-Solomon coding. This extends the
41  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
42  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
43  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
44  * former is also based. The latter is designed to provide higher performance
45  * for writes.
46  *
47  * Note that the Plank paper claimed to support arbitrary N+M, but was then
48  * amended six years later identifying a critical flaw that invalidates its
49  * claims. Nevertheless, the technique can be adapted to work for up to
50  * triple parity. For additional parity, the amendment "Note: Correction to
51  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
52  * is viable, but the additional complexity means that write performance will
53  * suffer.
54  *
55  * All of the methods above operate on a Galois field, defined over the
56  * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
57  * can be expressed with a single byte. Briefly, the operations on the
58  * field are defined as follows:
59  *
60  *   o addition (+) is represented by a bitwise XOR
61  *   o subtraction (-) is therefore identical to addition: A + B = A - B
62  *   o multiplication of A by 2 is defined by the following bitwise expression:
63  *	(A * 2)_7 = A_6
64  *	(A * 2)_6 = A_5
65  *	(A * 2)_5 = A_4
66  *	(A * 2)_4 = A_3 + A_7
67  *	(A * 2)_3 = A_2 + A_7
68  *	(A * 2)_2 = A_1 + A_7
69  *	(A * 2)_1 = A_0
70  *	(A * 2)_0 = A_7
71  *
72  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
73  * As an aside, this multiplication is derived from the error correcting
74  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
75  *
76  * Observe that any number in the field (except for 0) can be expressed as a
77  * power of 2 -- a generator for the field. We store a table of the powers of
78  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
79  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
80  * than field addition). The inverse of a field element A (A^-1) is therefore
81  * A ^ (255 - 1) = A^254.
82  *
83  * The up-to-three parity columns, P, Q, R over several data columns,
84  * D_0, ... D_n-1, can be expressed by field operations:
85  *
86  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
87  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
88  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
89  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
90  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
91  *
92  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival
93  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
94  * independent coefficients. (There are no additional coefficients that have
95  * this property which is why the uncorrected Plank method breaks down.)
96  *
97  * See the reconstruction code below for how P, Q and R can used individually
98  * or in concert to recover missing data columns.
99  */
100 
101 typedef struct raidz_col {
102 	uint64_t rc_devidx;		/* child device index for I/O */
103 	uint64_t rc_offset;		/* device offset */
104 	uint64_t rc_size;		/* I/O size */
105 	void *rc_data;			/* I/O data */
106 	void *rc_gdata;			/* used to store the "good" version */
107 	int rc_error;			/* I/O error for this device */
108 	uint8_t rc_tried;		/* Did we attempt this I/O column? */
109 	uint8_t rc_skipped;		/* Did we skip this I/O column? */
110 } raidz_col_t;
111 
112 typedef struct raidz_map {
113 	uint64_t rm_cols;		/* Regular column count */
114 	uint64_t rm_scols;		/* Count including skipped columns */
115 	uint64_t rm_bigcols;		/* Number of oversized columns */
116 	uint64_t rm_asize;		/* Actual total I/O size */
117 	uint64_t rm_missingdata;	/* Count of missing data devices */
118 	uint64_t rm_missingparity;	/* Count of missing parity devices */
119 	uint64_t rm_firstdatacol;	/* First data column/parity count */
120 	uint64_t rm_nskip;		/* Skipped sectors for padding */
121 	uint64_t rm_skipstart;	/* Column index of padding start */
122 	void *rm_datacopy;		/* rm_asize-buffer of copied data */
123 	uintptr_t rm_reports;		/* # of referencing checksum reports */
124 	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
125 	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
126 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
127 } raidz_map_t;
128 
129 #define	VDEV_RAIDZ_P		0
130 #define	VDEV_RAIDZ_Q		1
131 #define	VDEV_RAIDZ_R		2
132 #define	VDEV_RAIDZ_MAXPARITY	3
133 
134 #define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
135 #define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
136 
137 /*
138  * We provide a mechanism to perform the field multiplication operation on a
139  * 64-bit value all at once rather than a byte at a time. This works by
140  * creating a mask from the top bit in each byte and using that to
141  * conditionally apply the XOR of 0x1d.
142  */
143 #define	VDEV_RAIDZ_64MUL_2(x, mask) \
144 { \
145 	(mask) = (x) & 0x8080808080808080ULL; \
146 	(mask) = ((mask) << 1) - ((mask) >> 7); \
147 	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
148 	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
149 }
150 
151 #define	VDEV_RAIDZ_64MUL_4(x, mask) \
152 { \
153 	VDEV_RAIDZ_64MUL_2((x), mask); \
154 	VDEV_RAIDZ_64MUL_2((x), mask); \
155 }
156 
157 /*
158  * Force reconstruction to use the general purpose method.
159  */
160 int vdev_raidz_default_to_general;
161 
162 /*
163  * These two tables represent powers and logs of 2 in the Galois field defined
164  * above. These values were computed by repeatedly multiplying by 2 as above.
165  */
166 static const uint8_t vdev_raidz_pow2[256] = {
167 	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
168 	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
169 	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
170 	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
171 	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
172 	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
173 	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
174 	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
175 	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
176 	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
177 	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
178 	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
179 	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
180 	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
181 	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
182 	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
183 	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
184 	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
185 	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
186 	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
187 	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
188 	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
189 	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
190 	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
191 	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
192 	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
193 	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
194 	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
195 	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
196 	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
197 	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
198 	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
199 };
200 static const uint8_t vdev_raidz_log2[256] = {
201 	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
202 	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
203 	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
204 	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
205 	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
206 	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
207 	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
208 	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
209 	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
210 	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
211 	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
212 	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
213 	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
214 	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
215 	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
216 	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
217 	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
218 	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
219 	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
220 	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
221 	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
222 	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
223 	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
224 	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
225 	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
226 	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
227 	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
228 	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
229 	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
230 	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
231 	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
232 	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
233 };
234 
235 static void vdev_raidz_generate_parity(raidz_map_t *rm);
236 
237 /*
238  * Multiply a given number by 2 raised to the given power.
239  */
240 static uint8_t
241 vdev_raidz_exp2(uint_t a, int exp)
242 {
243 	if (a == 0)
244 		return (0);
245 
246 	ASSERT(exp >= 0);
247 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
248 
249 	exp += vdev_raidz_log2[a];
250 	if (exp > 255)
251 		exp -= 255;
252 
253 	return (vdev_raidz_pow2[exp]);
254 }
255 
256 static void
257 vdev_raidz_map_free(raidz_map_t *rm)
258 {
259 	int c;
260 	size_t size;
261 
262 	for (c = 0; c < rm->rm_firstdatacol; c++) {
263 		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
264 
265 		if (rm->rm_col[c].rc_gdata != NULL)
266 			zio_buf_free(rm->rm_col[c].rc_gdata,
267 			    rm->rm_col[c].rc_size);
268 	}
269 
270 	size = 0;
271 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
272 		size += rm->rm_col[c].rc_size;
273 
274 	if (rm->rm_datacopy != NULL)
275 		zio_buf_free(rm->rm_datacopy, size);
276 
277 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
278 }
279 
280 static void
281 vdev_raidz_map_free_vsd(zio_t *zio)
282 {
283 	raidz_map_t *rm = zio->io_vsd;
284 
285 	ASSERT3U(rm->rm_freed, ==, 0);
286 	rm->rm_freed = 1;
287 
288 	if (rm->rm_reports == 0)
289 		vdev_raidz_map_free(rm);
290 }
291 
292 /*ARGSUSED*/
293 static void
294 vdev_raidz_cksum_free(void *arg, size_t ignored)
295 {
296 	raidz_map_t *rm = arg;
297 
298 	ASSERT3U(rm->rm_reports, >, 0);
299 
300 	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
301 		vdev_raidz_map_free(rm);
302 }
303 
304 static void
305 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
306 {
307 	raidz_map_t *rm = zcr->zcr_cbdata;
308 	size_t c = zcr->zcr_cbinfo;
309 	size_t x;
310 
311 	const char *good = NULL;
312 	const char *bad = rm->rm_col[c].rc_data;
313 
314 	if (good_data == NULL) {
315 		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
316 		return;
317 	}
318 
319 	if (c < rm->rm_firstdatacol) {
320 		/*
321 		 * The first time through, calculate the parity blocks for
322 		 * the good data (this relies on the fact that the good
323 		 * data never changes for a given logical ZIO)
324 		 */
325 		if (rm->rm_col[0].rc_gdata == NULL) {
326 			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
327 			char *buf;
328 
329 			/*
330 			 * Set up the rm_col[]s to generate the parity for
331 			 * good_data, first saving the parity bufs and
332 			 * replacing them with buffers to hold the result.
333 			 */
334 			for (x = 0; x < rm->rm_firstdatacol; x++) {
335 				bad_parity[x] = rm->rm_col[x].rc_data;
336 				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
337 				    zio_buf_alloc(rm->rm_col[x].rc_size);
338 			}
339 
340 			/* fill in the data columns from good_data */
341 			buf = (char *)good_data;
342 			for (; x < rm->rm_cols; x++) {
343 				rm->rm_col[x].rc_data = buf;
344 				buf += rm->rm_col[x].rc_size;
345 			}
346 
347 			/*
348 			 * Construct the parity from the good data.
349 			 */
350 			vdev_raidz_generate_parity(rm);
351 
352 			/* restore everything back to its original state */
353 			for (x = 0; x < rm->rm_firstdatacol; x++)
354 				rm->rm_col[x].rc_data = bad_parity[x];
355 
356 			buf = rm->rm_datacopy;
357 			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
358 				rm->rm_col[x].rc_data = buf;
359 				buf += rm->rm_col[x].rc_size;
360 			}
361 		}
362 
363 		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
364 		good = rm->rm_col[c].rc_gdata;
365 	} else {
366 		/* adjust good_data to point at the start of our column */
367 		good = good_data;
368 
369 		for (x = rm->rm_firstdatacol; x < c; x++)
370 			good += rm->rm_col[x].rc_size;
371 	}
372 
373 	/* we drop the ereport if it ends up that the data was good */
374 	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
375 }
376 
377 /*
378  * Invoked indirectly by zfs_ereport_start_checksum(), called
379  * below when our read operation fails completely.  The main point
380  * is to keep a copy of everything we read from disk, so that at
381  * vdev_raidz_cksum_finish() time we can compare it with the good data.
382  */
383 static void
384 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
385 {
386 	size_t c = (size_t)(uintptr_t)arg;
387 	caddr_t buf;
388 
389 	raidz_map_t *rm = zio->io_vsd;
390 	size_t size;
391 
392 	/* set up the report and bump the refcount  */
393 	zcr->zcr_cbdata = rm;
394 	zcr->zcr_cbinfo = c;
395 	zcr->zcr_finish = vdev_raidz_cksum_finish;
396 	zcr->zcr_free = vdev_raidz_cksum_free;
397 
398 	rm->rm_reports++;
399 	ASSERT3U(rm->rm_reports, >, 0);
400 
401 	if (rm->rm_datacopy != NULL)
402 		return;
403 
404 	/*
405 	 * It's the first time we're called for this raidz_map_t, so we need
406 	 * to copy the data aside; there's no guarantee that our zio's buffer
407 	 * won't be re-used for something else.
408 	 *
409 	 * Our parity data is already in separate buffers, so there's no need
410 	 * to copy them.
411 	 */
412 
413 	size = 0;
414 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
415 		size += rm->rm_col[c].rc_size;
416 
417 	buf = rm->rm_datacopy = zio_buf_alloc(size);
418 
419 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
420 		raidz_col_t *col = &rm->rm_col[c];
421 
422 		bcopy(col->rc_data, buf, col->rc_size);
423 		col->rc_data = buf;
424 
425 		buf += col->rc_size;
426 	}
427 	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
428 }
429 
430 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
431 	vdev_raidz_map_free_vsd,
432 	vdev_raidz_cksum_report
433 };
434 
435 static raidz_map_t *
436 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
437     uint64_t nparity)
438 {
439 	raidz_map_t *rm;
440 	uint64_t b = zio->io_offset >> unit_shift;
441 	uint64_t s = zio->io_size >> unit_shift;
442 	uint64_t f = b % dcols;
443 	uint64_t o = (b / dcols) << unit_shift;
444 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
445 
446 	q = s / (dcols - nparity);
447 	r = s - q * (dcols - nparity);
448 	bc = (r == 0 ? 0 : r + nparity);
449 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
450 
451 	if (q == 0) {
452 		acols = bc;
453 		scols = MIN(dcols, roundup(bc, nparity + 1));
454 	} else {
455 		acols = dcols;
456 		scols = dcols;
457 	}
458 
459 	ASSERT3U(acols, <=, scols);
460 
461 	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
462 
463 	rm->rm_cols = acols;
464 	rm->rm_scols = scols;
465 	rm->rm_bigcols = bc;
466 	rm->rm_skipstart = bc;
467 	rm->rm_missingdata = 0;
468 	rm->rm_missingparity = 0;
469 	rm->rm_firstdatacol = nparity;
470 	rm->rm_datacopy = NULL;
471 	rm->rm_reports = 0;
472 	rm->rm_freed = 0;
473 	rm->rm_ecksuminjected = 0;
474 
475 	asize = 0;
476 
477 	for (c = 0; c < scols; c++) {
478 		col = f + c;
479 		coff = o;
480 		if (col >= dcols) {
481 			col -= dcols;
482 			coff += 1ULL << unit_shift;
483 		}
484 		rm->rm_col[c].rc_devidx = col;
485 		rm->rm_col[c].rc_offset = coff;
486 		rm->rm_col[c].rc_data = NULL;
487 		rm->rm_col[c].rc_gdata = NULL;
488 		rm->rm_col[c].rc_error = 0;
489 		rm->rm_col[c].rc_tried = 0;
490 		rm->rm_col[c].rc_skipped = 0;
491 
492 		if (c >= acols)
493 			rm->rm_col[c].rc_size = 0;
494 		else if (c < bc)
495 			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
496 		else
497 			rm->rm_col[c].rc_size = q << unit_shift;
498 
499 		asize += rm->rm_col[c].rc_size;
500 	}
501 
502 	ASSERT3U(asize, ==, tot << unit_shift);
503 	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
504 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
505 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
506 	ASSERT3U(rm->rm_nskip, <=, nparity);
507 
508 	for (c = 0; c < rm->rm_firstdatacol; c++)
509 		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
510 
511 	rm->rm_col[c].rc_data = zio->io_data;
512 
513 	for (c = c + 1; c < acols; c++)
514 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
515 		    rm->rm_col[c - 1].rc_size;
516 
517 	/*
518 	 * If all data stored spans all columns, there's a danger that parity
519 	 * will always be on the same device and, since parity isn't read
520 	 * during normal operation, that that device's I/O bandwidth won't be
521 	 * used effectively. We therefore switch the parity every 1MB.
522 	 *
523 	 * ... at least that was, ostensibly, the theory. As a practical
524 	 * matter unless we juggle the parity between all devices evenly, we
525 	 * won't see any benefit. Further, occasional writes that aren't a
526 	 * multiple of the LCM of the number of children and the minimum
527 	 * stripe width are sufficient to avoid pessimal behavior.
528 	 * Unfortunately, this decision created an implicit on-disk format
529 	 * requirement that we need to support for all eternity, but only
530 	 * for single-parity RAID-Z.
531 	 *
532 	 * If we intend to skip a sector in the zeroth column for padding
533 	 * we must make sure to note this swap. We will never intend to
534 	 * skip the first column since at least one data and one parity
535 	 * column must appear in each row.
536 	 */
537 	ASSERT(rm->rm_cols >= 2);
538 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
539 
540 	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
541 		devidx = rm->rm_col[0].rc_devidx;
542 		o = rm->rm_col[0].rc_offset;
543 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
544 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
545 		rm->rm_col[1].rc_devidx = devidx;
546 		rm->rm_col[1].rc_offset = o;
547 
548 		if (rm->rm_skipstart == 0)
549 			rm->rm_skipstart = 1;
550 	}
551 
552 	zio->io_vsd = rm;
553 	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
554 	return (rm);
555 }
556 
557 static void
558 vdev_raidz_generate_parity_p(raidz_map_t *rm)
559 {
560 	uint64_t *p, *src, pcount, ccount, i;
561 	int c;
562 
563 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
564 
565 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
566 		src = rm->rm_col[c].rc_data;
567 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
568 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
569 
570 		if (c == rm->rm_firstdatacol) {
571 			ASSERT(ccount == pcount);
572 			for (i = 0; i < ccount; i++, src++, p++) {
573 				*p = *src;
574 			}
575 		} else {
576 			ASSERT(ccount <= pcount);
577 			for (i = 0; i < ccount; i++, src++, p++) {
578 				*p ^= *src;
579 			}
580 		}
581 	}
582 }
583 
584 static void
585 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
586 {
587 	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
588 	int c;
589 
590 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
591 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
592 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
593 
594 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
595 		src = rm->rm_col[c].rc_data;
596 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
597 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
598 
599 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
600 
601 		if (c == rm->rm_firstdatacol) {
602 			ASSERT(ccnt == pcnt || ccnt == 0);
603 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
604 				*p = *src;
605 				*q = *src;
606 			}
607 			for (; i < pcnt; i++, src++, p++, q++) {
608 				*p = 0;
609 				*q = 0;
610 			}
611 		} else {
612 			ASSERT(ccnt <= pcnt);
613 
614 			/*
615 			 * Apply the algorithm described above by multiplying
616 			 * the previous result and adding in the new value.
617 			 */
618 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
619 				*p ^= *src;
620 
621 				VDEV_RAIDZ_64MUL_2(*q, mask);
622 				*q ^= *src;
623 			}
624 
625 			/*
626 			 * Treat short columns as though they are full of 0s.
627 			 * Note that there's therefore nothing needed for P.
628 			 */
629 			for (; i < pcnt; i++, q++) {
630 				VDEV_RAIDZ_64MUL_2(*q, mask);
631 			}
632 		}
633 	}
634 }
635 
636 static void
637 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
638 {
639 	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
640 	int c;
641 
642 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
643 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
644 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
645 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
646 	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
647 
648 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
649 		src = rm->rm_col[c].rc_data;
650 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
651 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
652 		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
653 
654 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
655 
656 		if (c == rm->rm_firstdatacol) {
657 			ASSERT(ccnt == pcnt || ccnt == 0);
658 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
659 				*p = *src;
660 				*q = *src;
661 				*r = *src;
662 			}
663 			for (; i < pcnt; i++, src++, p++, q++, r++) {
664 				*p = 0;
665 				*q = 0;
666 				*r = 0;
667 			}
668 		} else {
669 			ASSERT(ccnt <= pcnt);
670 
671 			/*
672 			 * Apply the algorithm described above by multiplying
673 			 * the previous result and adding in the new value.
674 			 */
675 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
676 				*p ^= *src;
677 
678 				VDEV_RAIDZ_64MUL_2(*q, mask);
679 				*q ^= *src;
680 
681 				VDEV_RAIDZ_64MUL_4(*r, mask);
682 				*r ^= *src;
683 			}
684 
685 			/*
686 			 * Treat short columns as though they are full of 0s.
687 			 * Note that there's therefore nothing needed for P.
688 			 */
689 			for (; i < pcnt; i++, q++, r++) {
690 				VDEV_RAIDZ_64MUL_2(*q, mask);
691 				VDEV_RAIDZ_64MUL_4(*r, mask);
692 			}
693 		}
694 	}
695 }
696 
697 /*
698  * Generate RAID parity in the first virtual columns according to the number of
699  * parity columns available.
700  */
701 static void
702 vdev_raidz_generate_parity(raidz_map_t *rm)
703 {
704 	switch (rm->rm_firstdatacol) {
705 	case 1:
706 		vdev_raidz_generate_parity_p(rm);
707 		break;
708 	case 2:
709 		vdev_raidz_generate_parity_pq(rm);
710 		break;
711 	case 3:
712 		vdev_raidz_generate_parity_pqr(rm);
713 		break;
714 	default:
715 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
716 	}
717 }
718 
719 static int
720 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
721 {
722 	uint64_t *dst, *src, xcount, ccount, count, i;
723 	int x = tgts[0];
724 	int c;
725 
726 	ASSERT(ntgts == 1);
727 	ASSERT(x >= rm->rm_firstdatacol);
728 	ASSERT(x < rm->rm_cols);
729 
730 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
731 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
732 	ASSERT(xcount > 0);
733 
734 	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
735 	dst = rm->rm_col[x].rc_data;
736 	for (i = 0; i < xcount; i++, dst++, src++) {
737 		*dst = *src;
738 	}
739 
740 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
741 		src = rm->rm_col[c].rc_data;
742 		dst = rm->rm_col[x].rc_data;
743 
744 		if (c == x)
745 			continue;
746 
747 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
748 		count = MIN(ccount, xcount);
749 
750 		for (i = 0; i < count; i++, dst++, src++) {
751 			*dst ^= *src;
752 		}
753 	}
754 
755 	return (1 << VDEV_RAIDZ_P);
756 }
757 
758 static int
759 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
760 {
761 	uint64_t *dst, *src, xcount, ccount, count, mask, i;
762 	uint8_t *b;
763 	int x = tgts[0];
764 	int c, j, exp;
765 
766 	ASSERT(ntgts == 1);
767 
768 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
769 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
770 
771 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
772 		src = rm->rm_col[c].rc_data;
773 		dst = rm->rm_col[x].rc_data;
774 
775 		if (c == x)
776 			ccount = 0;
777 		else
778 			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
779 
780 		count = MIN(ccount, xcount);
781 
782 		if (c == rm->rm_firstdatacol) {
783 			for (i = 0; i < count; i++, dst++, src++) {
784 				*dst = *src;
785 			}
786 			for (; i < xcount; i++, dst++) {
787 				*dst = 0;
788 			}
789 
790 		} else {
791 			for (i = 0; i < count; i++, dst++, src++) {
792 				VDEV_RAIDZ_64MUL_2(*dst, mask);
793 				*dst ^= *src;
794 			}
795 
796 			for (; i < xcount; i++, dst++) {
797 				VDEV_RAIDZ_64MUL_2(*dst, mask);
798 			}
799 		}
800 	}
801 
802 	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
803 	dst = rm->rm_col[x].rc_data;
804 	exp = 255 - (rm->rm_cols - 1 - x);
805 
806 	for (i = 0; i < xcount; i++, dst++, src++) {
807 		*dst ^= *src;
808 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
809 			*b = vdev_raidz_exp2(*b, exp);
810 		}
811 	}
812 
813 	return (1 << VDEV_RAIDZ_Q);
814 }
815 
816 static int
817 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
818 {
819 	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
820 	void *pdata, *qdata;
821 	uint64_t xsize, ysize, i;
822 	int x = tgts[0];
823 	int y = tgts[1];
824 
825 	ASSERT(ntgts == 2);
826 	ASSERT(x < y);
827 	ASSERT(x >= rm->rm_firstdatacol);
828 	ASSERT(y < rm->rm_cols);
829 
830 	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
831 
832 	/*
833 	 * Move the parity data aside -- we're going to compute parity as
834 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
835 	 * reuse the parity generation mechanism without trashing the actual
836 	 * parity so we make those columns appear to be full of zeros by
837 	 * setting their lengths to zero.
838 	 */
839 	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
840 	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
841 	xsize = rm->rm_col[x].rc_size;
842 	ysize = rm->rm_col[y].rc_size;
843 
844 	rm->rm_col[VDEV_RAIDZ_P].rc_data =
845 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
846 	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
847 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
848 	rm->rm_col[x].rc_size = 0;
849 	rm->rm_col[y].rc_size = 0;
850 
851 	vdev_raidz_generate_parity_pq(rm);
852 
853 	rm->rm_col[x].rc_size = xsize;
854 	rm->rm_col[y].rc_size = ysize;
855 
856 	p = pdata;
857 	q = qdata;
858 	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
859 	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
860 	xd = rm->rm_col[x].rc_data;
861 	yd = rm->rm_col[y].rc_data;
862 
863 	/*
864 	 * We now have:
865 	 *	Pxy = P + D_x + D_y
866 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
867 	 *
868 	 * We can then solve for D_x:
869 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
870 	 * where
871 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
872 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
873 	 *
874 	 * With D_x in hand, we can easily solve for D_y:
875 	 *	D_y = P + Pxy + D_x
876 	 */
877 
878 	a = vdev_raidz_pow2[255 + x - y];
879 	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
880 	tmp = 255 - vdev_raidz_log2[a ^ 1];
881 
882 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
883 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
884 
885 	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
886 		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
887 		    vdev_raidz_exp2(*q ^ *qxy, bexp);
888 
889 		if (i < ysize)
890 			*yd = *p ^ *pxy ^ *xd;
891 	}
892 
893 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
894 	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
895 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
896 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
897 
898 	/*
899 	 * Restore the saved parity data.
900 	 */
901 	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
902 	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
903 
904 	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
905 }
906 
907 /* BEGIN CSTYLED */
908 /*
909  * In the general case of reconstruction, we must solve the system of linear
910  * equations defined by the coeffecients used to generate parity as well as
911  * the contents of the data and parity disks. This can be expressed with
912  * vectors for the original data (D) and the actual data (d) and parity (p)
913  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
914  *
915  *            __   __                     __     __
916  *            |     |         __     __   |  p_0  |
917  *            |  V  |         |  D_0  |   | p_m-1 |
918  *            |     |    x    |   :   | = |  d_0  |
919  *            |  I  |         | D_n-1 |   |   :   |
920  *            |     |         ~~     ~~   | d_n-1 |
921  *            ~~   ~~                     ~~     ~~
922  *
923  * I is simply a square identity matrix of size n, and V is a vandermonde
924  * matrix defined by the coeffecients we chose for the various parity columns
925  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
926  * computation as well as linear separability.
927  *
928  *      __               __               __     __
929  *      |   1   ..  1 1 1 |               |  p_0  |
930  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
931  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
932  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
933  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
934  *      |   :       : : : |   |   :   |   |  d_2  |
935  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
936  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
937  *      |   0   ..  0 0 1 |               | d_n-1 |
938  *      ~~               ~~               ~~     ~~
939  *
940  * Note that I, V, d, and p are known. To compute D, we must invert the
941  * matrix and use the known data and parity values to reconstruct the unknown
942  * data values. We begin by removing the rows in V|I and d|p that correspond
943  * to failed or missing columns; we then make V|I square (n x n) and d|p
944  * sized n by removing rows corresponding to unused parity from the bottom up
945  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
946  * using Gauss-Jordan elimination. In the example below we use m=3 parity
947  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
948  *           __                               __
949  *           |  1   1   1   1   1   1   1   1  |
950  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
951  *           |  19 205 116  29  64  16  4   1  |      / /
952  *           |  1   0   0   0   0   0   0   0  |     / /
953  *           |  0   1   0   0   0   0   0   0  | <--' /
954  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
955  *           |  0   0   0   1   0   0   0   0  |
956  *           |  0   0   0   0   1   0   0   0  |
957  *           |  0   0   0   0   0   1   0   0  |
958  *           |  0   0   0   0   0   0   1   0  |
959  *           |  0   0   0   0   0   0   0   1  |
960  *           ~~                               ~~
961  *           __                               __
962  *           |  1   1   1   1   1   1   1   1  |
963  *           | 128  64  32  16  8   4   2   1  |
964  *           |  19 205 116  29  64  16  4   1  |
965  *           |  1   0   0   0   0   0   0   0  |
966  *           |  0   1   0   0   0   0   0   0  |
967  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
968  *           |  0   0   0   1   0   0   0   0  |
969  *           |  0   0   0   0   1   0   0   0  |
970  *           |  0   0   0   0   0   1   0   0  |
971  *           |  0   0   0   0   0   0   1   0  |
972  *           |  0   0   0   0   0   0   0   1  |
973  *           ~~                               ~~
974  *
975  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
976  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
977  * matrix is not singular.
978  * __                                                                 __
979  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
980  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
981  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
982  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
983  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
984  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
985  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
986  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
987  * ~~                                                                 ~~
988  * __                                                                 __
989  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
990  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
991  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
992  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
993  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
994  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
995  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
996  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
997  * ~~                                                                 ~~
998  * __                                                                 __
999  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1000  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1001  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1002  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1003  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1004  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1005  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1006  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1007  * ~~                                                                 ~~
1008  * __                                                                 __
1009  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1010  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1011  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1012  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1013  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1014  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1015  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1016  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1017  * ~~                                                                 ~~
1018  * __                                                                 __
1019  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1020  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1021  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1022  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1023  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1024  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1025  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1026  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1027  * ~~                                                                 ~~
1028  * __                                                                 __
1029  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1030  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1031  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1032  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1033  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1034  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1035  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1036  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1037  * ~~                                                                 ~~
1038  *                   __                               __
1039  *                   |  0   0   1   0   0   0   0   0  |
1040  *                   | 167 100  5   41 159 169 217 208 |
1041  *                   | 166 100  4   40 158 168 216 209 |
1042  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1043  *                   |  0   0   0   0   1   0   0   0  |
1044  *                   |  0   0   0   0   0   1   0   0  |
1045  *                   |  0   0   0   0   0   0   1   0  |
1046  *                   |  0   0   0   0   0   0   0   1  |
1047  *                   ~~                               ~~
1048  *
1049  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1050  * of the missing data.
1051  *
1052  * As is apparent from the example above, the only non-trivial rows in the
1053  * inverse matrix correspond to the data disks that we're trying to
1054  * reconstruct. Indeed, those are the only rows we need as the others would
1055  * only be useful for reconstructing data known or assumed to be valid. For
1056  * that reason, we only build the coefficients in the rows that correspond to
1057  * targeted columns.
1058  */
1059 /* END CSTYLED */
1060 
1061 static void
1062 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1063     uint8_t **rows)
1064 {
1065 	int i, j;
1066 	int pow;
1067 
1068 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1069 
1070 	/*
1071 	 * Fill in the missing rows of interest.
1072 	 */
1073 	for (i = 0; i < nmap; i++) {
1074 		ASSERT3S(0, <=, map[i]);
1075 		ASSERT3S(map[i], <=, 2);
1076 
1077 		pow = map[i] * n;
1078 		if (pow > 255)
1079 			pow -= 255;
1080 		ASSERT(pow <= 255);
1081 
1082 		for (j = 0; j < n; j++) {
1083 			pow -= map[i];
1084 			if (pow < 0)
1085 				pow += 255;
1086 			rows[i][j] = vdev_raidz_pow2[pow];
1087 		}
1088 	}
1089 }
1090 
1091 static void
1092 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1093     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1094 {
1095 	int i, j, ii, jj;
1096 	uint8_t log;
1097 
1098 	/*
1099 	 * Assert that the first nmissing entries from the array of used
1100 	 * columns correspond to parity columns and that subsequent entries
1101 	 * correspond to data columns.
1102 	 */
1103 	for (i = 0; i < nmissing; i++) {
1104 		ASSERT3S(used[i], <, rm->rm_firstdatacol);
1105 	}
1106 	for (; i < n; i++) {
1107 		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1108 	}
1109 
1110 	/*
1111 	 * First initialize the storage where we'll compute the inverse rows.
1112 	 */
1113 	for (i = 0; i < nmissing; i++) {
1114 		for (j = 0; j < n; j++) {
1115 			invrows[i][j] = (i == j) ? 1 : 0;
1116 		}
1117 	}
1118 
1119 	/*
1120 	 * Subtract all trivial rows from the rows of consequence.
1121 	 */
1122 	for (i = 0; i < nmissing; i++) {
1123 		for (j = nmissing; j < n; j++) {
1124 			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1125 			jj = used[j] - rm->rm_firstdatacol;
1126 			ASSERT3S(jj, <, n);
1127 			invrows[i][j] = rows[i][jj];
1128 			rows[i][jj] = 0;
1129 		}
1130 	}
1131 
1132 	/*
1133 	 * For each of the rows of interest, we must normalize it and subtract
1134 	 * a multiple of it from the other rows.
1135 	 */
1136 	for (i = 0; i < nmissing; i++) {
1137 		for (j = 0; j < missing[i]; j++) {
1138 			ASSERT3U(rows[i][j], ==, 0);
1139 		}
1140 		ASSERT3U(rows[i][missing[i]], !=, 0);
1141 
1142 		/*
1143 		 * Compute the inverse of the first element and multiply each
1144 		 * element in the row by that value.
1145 		 */
1146 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1147 
1148 		for (j = 0; j < n; j++) {
1149 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1150 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1151 		}
1152 
1153 		for (ii = 0; ii < nmissing; ii++) {
1154 			if (i == ii)
1155 				continue;
1156 
1157 			ASSERT3U(rows[ii][missing[i]], !=, 0);
1158 
1159 			log = vdev_raidz_log2[rows[ii][missing[i]]];
1160 
1161 			for (j = 0; j < n; j++) {
1162 				rows[ii][j] ^=
1163 				    vdev_raidz_exp2(rows[i][j], log);
1164 				invrows[ii][j] ^=
1165 				    vdev_raidz_exp2(invrows[i][j], log);
1166 			}
1167 		}
1168 	}
1169 
1170 	/*
1171 	 * Verify that the data that is left in the rows are properly part of
1172 	 * an identity matrix.
1173 	 */
1174 	for (i = 0; i < nmissing; i++) {
1175 		for (j = 0; j < n; j++) {
1176 			if (j == missing[i]) {
1177 				ASSERT3U(rows[i][j], ==, 1);
1178 			} else {
1179 				ASSERT3U(rows[i][j], ==, 0);
1180 			}
1181 		}
1182 	}
1183 }
1184 
1185 static void
1186 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1187     int *missing, uint8_t **invrows, const uint8_t *used)
1188 {
1189 	int i, j, x, cc, c;
1190 	uint8_t *src;
1191 	uint64_t ccount;
1192 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1193 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1194 	uint8_t log, val;
1195 	int ll;
1196 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1197 	uint8_t *p, *pp;
1198 	size_t psize;
1199 
1200 	psize = sizeof (invlog[0][0]) * n * nmissing;
1201 	p = kmem_alloc(psize, KM_SLEEP);
1202 
1203 	for (pp = p, i = 0; i < nmissing; i++) {
1204 		invlog[i] = pp;
1205 		pp += n;
1206 	}
1207 
1208 	for (i = 0; i < nmissing; i++) {
1209 		for (j = 0; j < n; j++) {
1210 			ASSERT3U(invrows[i][j], !=, 0);
1211 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1212 		}
1213 	}
1214 
1215 	for (i = 0; i < n; i++) {
1216 		c = used[i];
1217 		ASSERT3U(c, <, rm->rm_cols);
1218 
1219 		src = rm->rm_col[c].rc_data;
1220 		ccount = rm->rm_col[c].rc_size;
1221 		for (j = 0; j < nmissing; j++) {
1222 			cc = missing[j] + rm->rm_firstdatacol;
1223 			ASSERT3U(cc, >=, rm->rm_firstdatacol);
1224 			ASSERT3U(cc, <, rm->rm_cols);
1225 			ASSERT3U(cc, !=, c);
1226 
1227 			dst[j] = rm->rm_col[cc].rc_data;
1228 			dcount[j] = rm->rm_col[cc].rc_size;
1229 		}
1230 
1231 		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1232 
1233 		for (x = 0; x < ccount; x++, src++) {
1234 			if (*src != 0)
1235 				log = vdev_raidz_log2[*src];
1236 
1237 			for (cc = 0; cc < nmissing; cc++) {
1238 				if (x >= dcount[cc])
1239 					continue;
1240 
1241 				if (*src == 0) {
1242 					val = 0;
1243 				} else {
1244 					if ((ll = log + invlog[cc][i]) >= 255)
1245 						ll -= 255;
1246 					val = vdev_raidz_pow2[ll];
1247 				}
1248 
1249 				if (i == 0)
1250 					dst[cc][x] = val;
1251 				else
1252 					dst[cc][x] ^= val;
1253 			}
1254 		}
1255 	}
1256 
1257 	kmem_free(p, psize);
1258 }
1259 
1260 static int
1261 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1262 {
1263 	int n, i, c, t, tt;
1264 	int nmissing_rows;
1265 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1266 	int parity_map[VDEV_RAIDZ_MAXPARITY];
1267 
1268 	uint8_t *p, *pp;
1269 	size_t psize;
1270 
1271 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1272 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1273 	uint8_t *used;
1274 
1275 	int code = 0;
1276 
1277 
1278 	n = rm->rm_cols - rm->rm_firstdatacol;
1279 
1280 	/*
1281 	 * Figure out which data columns are missing.
1282 	 */
1283 	nmissing_rows = 0;
1284 	for (t = 0; t < ntgts; t++) {
1285 		if (tgts[t] >= rm->rm_firstdatacol) {
1286 			missing_rows[nmissing_rows++] =
1287 			    tgts[t] - rm->rm_firstdatacol;
1288 		}
1289 	}
1290 
1291 	/*
1292 	 * Figure out which parity columns to use to help generate the missing
1293 	 * data columns.
1294 	 */
1295 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1296 		ASSERT(tt < ntgts);
1297 		ASSERT(c < rm->rm_firstdatacol);
1298 
1299 		/*
1300 		 * Skip any targeted parity columns.
1301 		 */
1302 		if (c == tgts[tt]) {
1303 			tt++;
1304 			continue;
1305 		}
1306 
1307 		code |= 1 << c;
1308 
1309 		parity_map[i] = c;
1310 		i++;
1311 	}
1312 
1313 	ASSERT(code != 0);
1314 	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1315 
1316 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1317 	    nmissing_rows * n + sizeof (used[0]) * n;
1318 	p = kmem_alloc(psize, KM_SLEEP);
1319 
1320 	for (pp = p, i = 0; i < nmissing_rows; i++) {
1321 		rows[i] = pp;
1322 		pp += n;
1323 		invrows[i] = pp;
1324 		pp += n;
1325 	}
1326 	used = pp;
1327 
1328 	for (i = 0; i < nmissing_rows; i++) {
1329 		used[i] = parity_map[i];
1330 	}
1331 
1332 	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1333 		if (tt < nmissing_rows &&
1334 		    c == missing_rows[tt] + rm->rm_firstdatacol) {
1335 			tt++;
1336 			continue;
1337 		}
1338 
1339 		ASSERT3S(i, <, n);
1340 		used[i] = c;
1341 		i++;
1342 	}
1343 
1344 	/*
1345 	 * Initialize the interesting rows of the matrix.
1346 	 */
1347 	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1348 
1349 	/*
1350 	 * Invert the matrix.
1351 	 */
1352 	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1353 	    invrows, used);
1354 
1355 	/*
1356 	 * Reconstruct the missing data using the generated matrix.
1357 	 */
1358 	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1359 	    invrows, used);
1360 
1361 	kmem_free(p, psize);
1362 
1363 	return (code);
1364 }
1365 
1366 static int
1367 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1368 {
1369 	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1370 	int ntgts;
1371 	int i, c;
1372 	int code;
1373 	int nbadparity, nbaddata;
1374 	int parity_valid[VDEV_RAIDZ_MAXPARITY];
1375 
1376 	/*
1377 	 * The tgts list must already be sorted.
1378 	 */
1379 	for (i = 1; i < nt; i++) {
1380 		ASSERT(t[i] > t[i - 1]);
1381 	}
1382 
1383 	nbadparity = rm->rm_firstdatacol;
1384 	nbaddata = rm->rm_cols - nbadparity;
1385 	ntgts = 0;
1386 	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1387 		if (c < rm->rm_firstdatacol)
1388 			parity_valid[c] = B_FALSE;
1389 
1390 		if (i < nt && c == t[i]) {
1391 			tgts[ntgts++] = c;
1392 			i++;
1393 		} else if (rm->rm_col[c].rc_error != 0) {
1394 			tgts[ntgts++] = c;
1395 		} else if (c >= rm->rm_firstdatacol) {
1396 			nbaddata--;
1397 		} else {
1398 			parity_valid[c] = B_TRUE;
1399 			nbadparity--;
1400 		}
1401 	}
1402 
1403 	ASSERT(ntgts >= nt);
1404 	ASSERT(nbaddata >= 0);
1405 	ASSERT(nbaddata + nbadparity == ntgts);
1406 
1407 	dt = &tgts[nbadparity];
1408 
1409 	/*
1410 	 * See if we can use any of our optimized reconstruction routines.
1411 	 */
1412 	if (!vdev_raidz_default_to_general) {
1413 		switch (nbaddata) {
1414 		case 1:
1415 			if (parity_valid[VDEV_RAIDZ_P])
1416 				return (vdev_raidz_reconstruct_p(rm, dt, 1));
1417 
1418 			ASSERT(rm->rm_firstdatacol > 1);
1419 
1420 			if (parity_valid[VDEV_RAIDZ_Q])
1421 				return (vdev_raidz_reconstruct_q(rm, dt, 1));
1422 
1423 			ASSERT(rm->rm_firstdatacol > 2);
1424 			break;
1425 
1426 		case 2:
1427 			ASSERT(rm->rm_firstdatacol > 1);
1428 
1429 			if (parity_valid[VDEV_RAIDZ_P] &&
1430 			    parity_valid[VDEV_RAIDZ_Q])
1431 				return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1432 
1433 			ASSERT(rm->rm_firstdatacol > 2);
1434 
1435 			break;
1436 		}
1437 	}
1438 
1439 	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1440 	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1441 	ASSERT(code > 0);
1442 	return (code);
1443 }
1444 
1445 static int
1446 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
1447 {
1448 	vdev_t *cvd;
1449 	uint64_t nparity = vd->vdev_nparity;
1450 	int c;
1451 	int lasterror = 0;
1452 	int numerrors = 0;
1453 
1454 	ASSERT(nparity > 0);
1455 
1456 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
1457 	    vd->vdev_children < nparity + 1) {
1458 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1459 		return (EINVAL);
1460 	}
1461 
1462 	vdev_open_children(vd);
1463 
1464 	for (c = 0; c < vd->vdev_children; c++) {
1465 		cvd = vd->vdev_child[c];
1466 
1467 		if (cvd->vdev_open_error != 0) {
1468 			lasterror = cvd->vdev_open_error;
1469 			numerrors++;
1470 			continue;
1471 		}
1472 
1473 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1474 		*ashift = MAX(*ashift, cvd->vdev_ashift);
1475 	}
1476 
1477 	*asize *= vd->vdev_children;
1478 
1479 	if (numerrors > nparity) {
1480 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1481 		return (lasterror);
1482 	}
1483 
1484 	return (0);
1485 }
1486 
1487 static void
1488 vdev_raidz_close(vdev_t *vd)
1489 {
1490 	int c;
1491 
1492 	for (c = 0; c < vd->vdev_children; c++)
1493 		vdev_close(vd->vdev_child[c]);
1494 }
1495 
1496 static uint64_t
1497 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1498 {
1499 	uint64_t asize;
1500 	uint64_t ashift = vd->vdev_top->vdev_ashift;
1501 	uint64_t cols = vd->vdev_children;
1502 	uint64_t nparity = vd->vdev_nparity;
1503 
1504 	asize = ((psize - 1) >> ashift) + 1;
1505 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1506 	asize = roundup(asize, nparity + 1) << ashift;
1507 
1508 	return (asize);
1509 }
1510 
1511 static void
1512 vdev_raidz_child_done(zio_t *zio)
1513 {
1514 	raidz_col_t *rc = zio->io_private;
1515 
1516 	rc->rc_error = zio->io_error;
1517 	rc->rc_tried = 1;
1518 	rc->rc_skipped = 0;
1519 }
1520 
1521 static int
1522 vdev_raidz_io_start(zio_t *zio)
1523 {
1524 	vdev_t *vd = zio->io_vd;
1525 	vdev_t *tvd = vd->vdev_top;
1526 	vdev_t *cvd;
1527 	blkptr_t *bp = zio->io_bp;
1528 	raidz_map_t *rm;
1529 	raidz_col_t *rc;
1530 	int c, i;
1531 
1532 	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
1533 	    vd->vdev_nparity);
1534 
1535 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1536 
1537 	if (zio->io_type == ZIO_TYPE_WRITE) {
1538 		vdev_raidz_generate_parity(rm);
1539 
1540 		for (c = 0; c < rm->rm_cols; c++) {
1541 			rc = &rm->rm_col[c];
1542 			cvd = vd->vdev_child[rc->rc_devidx];
1543 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1544 			    rc->rc_offset, rc->rc_data, rc->rc_size,
1545 			    zio->io_type, zio->io_priority, 0,
1546 			    vdev_raidz_child_done, rc));
1547 		}
1548 
1549 		/*
1550 		 * Generate optional I/Os for any skipped sectors to improve
1551 		 * aggregation contiguity.
1552 		 */
1553 		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1554 			ASSERT(c <= rm->rm_scols);
1555 			if (c == rm->rm_scols)
1556 				c = 0;
1557 			rc = &rm->rm_col[c];
1558 			cvd = vd->vdev_child[rc->rc_devidx];
1559 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1560 			    rc->rc_offset + rc->rc_size, NULL,
1561 			    1 << tvd->vdev_ashift,
1562 			    zio->io_type, zio->io_priority,
1563 			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1564 		}
1565 
1566 		return (ZIO_PIPELINE_CONTINUE);
1567 	}
1568 
1569 	ASSERT(zio->io_type == ZIO_TYPE_READ);
1570 
1571 	/*
1572 	 * Iterate over the columns in reverse order so that we hit the parity
1573 	 * last -- any errors along the way will force us to read the parity.
1574 	 */
1575 	for (c = rm->rm_cols - 1; c >= 0; c--) {
1576 		rc = &rm->rm_col[c];
1577 		cvd = vd->vdev_child[rc->rc_devidx];
1578 		if (!vdev_readable(cvd)) {
1579 			if (c >= rm->rm_firstdatacol)
1580 				rm->rm_missingdata++;
1581 			else
1582 				rm->rm_missingparity++;
1583 			rc->rc_error = ENXIO;
1584 			rc->rc_tried = 1;	/* don't even try */
1585 			rc->rc_skipped = 1;
1586 			continue;
1587 		}
1588 		if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) {
1589 			if (c >= rm->rm_firstdatacol)
1590 				rm->rm_missingdata++;
1591 			else
1592 				rm->rm_missingparity++;
1593 			rc->rc_error = ESTALE;
1594 			rc->rc_skipped = 1;
1595 			continue;
1596 		}
1597 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
1598 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1599 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1600 			    rc->rc_offset, rc->rc_data, rc->rc_size,
1601 			    zio->io_type, zio->io_priority, 0,
1602 			    vdev_raidz_child_done, rc));
1603 		}
1604 	}
1605 
1606 	return (ZIO_PIPELINE_CONTINUE);
1607 }
1608 
1609 /*
1610  * Report a checksum error for a child of a RAID-Z device.
1611  */
1612 static void
1613 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1614 {
1615 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1616 
1617 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1618 		zio_bad_cksum_t zbc;
1619 		raidz_map_t *rm = zio->io_vsd;
1620 
1621 		mutex_enter(&vd->vdev_stat_lock);
1622 		vd->vdev_stat.vs_checksum_errors++;
1623 		mutex_exit(&vd->vdev_stat_lock);
1624 
1625 		zbc.zbc_has_cksum = 0;
1626 		zbc.zbc_injected = rm->rm_ecksuminjected;
1627 
1628 		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1629 		    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1630 		    &zbc);
1631 	}
1632 }
1633 
1634 /*
1635  * We keep track of whether or not there were any injected errors, so that
1636  * any ereports we generate can note it.
1637  */
1638 static int
1639 raidz_checksum_verify(zio_t *zio)
1640 {
1641 	zio_bad_cksum_t zbc;
1642 	raidz_map_t *rm = zio->io_vsd;
1643 
1644 	int ret = zio_checksum_error(zio, &zbc);
1645 	if (ret != 0 && zbc.zbc_injected != 0)
1646 		rm->rm_ecksuminjected = 1;
1647 
1648 	return (ret);
1649 }
1650 
1651 /*
1652  * Generate the parity from the data columns. If we tried and were able to
1653  * read the parity without error, verify that the generated parity matches the
1654  * data we read. If it doesn't, we fire off a checksum error. Return the
1655  * number such failures.
1656  */
1657 static int
1658 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1659 {
1660 	void *orig[VDEV_RAIDZ_MAXPARITY];
1661 	int c, ret = 0;
1662 	raidz_col_t *rc;
1663 
1664 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1665 		rc = &rm->rm_col[c];
1666 		if (!rc->rc_tried || rc->rc_error != 0)
1667 			continue;
1668 		orig[c] = zio_buf_alloc(rc->rc_size);
1669 		bcopy(rc->rc_data, orig[c], rc->rc_size);
1670 	}
1671 
1672 	vdev_raidz_generate_parity(rm);
1673 
1674 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1675 		rc = &rm->rm_col[c];
1676 		if (!rc->rc_tried || rc->rc_error != 0)
1677 			continue;
1678 		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1679 			raidz_checksum_error(zio, rc, orig[c]);
1680 			rc->rc_error = ECKSUM;
1681 			ret++;
1682 		}
1683 		zio_buf_free(orig[c], rc->rc_size);
1684 	}
1685 
1686 	return (ret);
1687 }
1688 
1689 /*
1690  * Keep statistics on all the ways that we used parity to correct data.
1691  */
1692 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1693 
1694 static int
1695 vdev_raidz_worst_error(raidz_map_t *rm)
1696 {
1697 	int error = 0;
1698 
1699 	for (int c = 0; c < rm->rm_cols; c++)
1700 		error = zio_worst_error(error, rm->rm_col[c].rc_error);
1701 
1702 	return (error);
1703 }
1704 
1705 /*
1706  * Iterate over all combinations of bad data and attempt a reconstruction.
1707  * Note that the algorithm below is non-optimal because it doesn't take into
1708  * account how reconstruction is actually performed. For example, with
1709  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1710  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1711  * cases we'd only use parity information in column 0.
1712  */
1713 static int
1714 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1715 {
1716 	raidz_map_t *rm = zio->io_vsd;
1717 	raidz_col_t *rc;
1718 	void *orig[VDEV_RAIDZ_MAXPARITY];
1719 	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1720 	int *tgts = &tstore[1];
1721 	int current, next, i, c, n;
1722 	int code, ret = 0;
1723 
1724 	ASSERT(total_errors < rm->rm_firstdatacol);
1725 
1726 	/*
1727 	 * This simplifies one edge condition.
1728 	 */
1729 	tgts[-1] = -1;
1730 
1731 	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1732 		/*
1733 		 * Initialize the targets array by finding the first n columns
1734 		 * that contain no error.
1735 		 *
1736 		 * If there were no data errors, we need to ensure that we're
1737 		 * always explicitly attempting to reconstruct at least one
1738 		 * data column. To do this, we simply push the highest target
1739 		 * up into the data columns.
1740 		 */
1741 		for (c = 0, i = 0; i < n; i++) {
1742 			if (i == n - 1 && data_errors == 0 &&
1743 			    c < rm->rm_firstdatacol) {
1744 				c = rm->rm_firstdatacol;
1745 			}
1746 
1747 			while (rm->rm_col[c].rc_error != 0) {
1748 				c++;
1749 				ASSERT3S(c, <, rm->rm_cols);
1750 			}
1751 
1752 			tgts[i] = c++;
1753 		}
1754 
1755 		/*
1756 		 * Setting tgts[n] simplifies the other edge condition.
1757 		 */
1758 		tgts[n] = rm->rm_cols;
1759 
1760 		/*
1761 		 * These buffers were allocated in previous iterations.
1762 		 */
1763 		for (i = 0; i < n - 1; i++) {
1764 			ASSERT(orig[i] != NULL);
1765 		}
1766 
1767 		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
1768 
1769 		current = 0;
1770 		next = tgts[current];
1771 
1772 		while (current != n) {
1773 			tgts[current] = next;
1774 			current = 0;
1775 
1776 			/*
1777 			 * Save off the original data that we're going to
1778 			 * attempt to reconstruct.
1779 			 */
1780 			for (i = 0; i < n; i++) {
1781 				ASSERT(orig[i] != NULL);
1782 				c = tgts[i];
1783 				ASSERT3S(c, >=, 0);
1784 				ASSERT3S(c, <, rm->rm_cols);
1785 				rc = &rm->rm_col[c];
1786 				bcopy(rc->rc_data, orig[i], rc->rc_size);
1787 			}
1788 
1789 			/*
1790 			 * Attempt a reconstruction and exit the outer loop on
1791 			 * success.
1792 			 */
1793 			code = vdev_raidz_reconstruct(rm, tgts, n);
1794 			if (raidz_checksum_verify(zio) == 0) {
1795 				atomic_inc_64(&raidz_corrected[code]);
1796 
1797 				for (i = 0; i < n; i++) {
1798 					c = tgts[i];
1799 					rc = &rm->rm_col[c];
1800 					ASSERT(rc->rc_error == 0);
1801 					if (rc->rc_tried)
1802 						raidz_checksum_error(zio, rc,
1803 						    orig[i]);
1804 					rc->rc_error = ECKSUM;
1805 				}
1806 
1807 				ret = code;
1808 				goto done;
1809 			}
1810 
1811 			/*
1812 			 * Restore the original data.
1813 			 */
1814 			for (i = 0; i < n; i++) {
1815 				c = tgts[i];
1816 				rc = &rm->rm_col[c];
1817 				bcopy(orig[i], rc->rc_data, rc->rc_size);
1818 			}
1819 
1820 			do {
1821 				/*
1822 				 * Find the next valid column after the current
1823 				 * position..
1824 				 */
1825 				for (next = tgts[current] + 1;
1826 				    next < rm->rm_cols &&
1827 				    rm->rm_col[next].rc_error != 0; next++)
1828 					continue;
1829 
1830 				ASSERT(next <= tgts[current + 1]);
1831 
1832 				/*
1833 				 * If that spot is available, we're done here.
1834 				 */
1835 				if (next != tgts[current + 1])
1836 					break;
1837 
1838 				/*
1839 				 * Otherwise, find the next valid column after
1840 				 * the previous position.
1841 				 */
1842 				for (c = tgts[current - 1] + 1;
1843 				    rm->rm_col[c].rc_error != 0; c++)
1844 					continue;
1845 
1846 				tgts[current] = c;
1847 				current++;
1848 
1849 			} while (current != n);
1850 		}
1851 	}
1852 	n--;
1853 done:
1854 	for (i = 0; i < n; i++) {
1855 		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
1856 	}
1857 
1858 	return (ret);
1859 }
1860 
1861 static void
1862 vdev_raidz_io_done(zio_t *zio)
1863 {
1864 	vdev_t *vd = zio->io_vd;
1865 	vdev_t *cvd;
1866 	raidz_map_t *rm = zio->io_vsd;
1867 	raidz_col_t *rc;
1868 	int unexpected_errors = 0;
1869 	int parity_errors = 0;
1870 	int parity_untried = 0;
1871 	int data_errors = 0;
1872 	int total_errors = 0;
1873 	int n, c;
1874 	int tgts[VDEV_RAIDZ_MAXPARITY];
1875 	int code;
1876 
1877 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
1878 
1879 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1880 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1881 
1882 	for (c = 0; c < rm->rm_cols; c++) {
1883 		rc = &rm->rm_col[c];
1884 
1885 		if (rc->rc_error) {
1886 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
1887 
1888 			if (c < rm->rm_firstdatacol)
1889 				parity_errors++;
1890 			else
1891 				data_errors++;
1892 
1893 			if (!rc->rc_skipped)
1894 				unexpected_errors++;
1895 
1896 			total_errors++;
1897 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1898 			parity_untried++;
1899 		}
1900 	}
1901 
1902 	if (zio->io_type == ZIO_TYPE_WRITE) {
1903 		/*
1904 		 * XXX -- for now, treat partial writes as a success.
1905 		 * (If we couldn't write enough columns to reconstruct
1906 		 * the data, the I/O failed.  Otherwise, good enough.)
1907 		 *
1908 		 * Now that we support write reallocation, it would be better
1909 		 * to treat partial failure as real failure unless there are
1910 		 * no non-degraded top-level vdevs left, and not update DTLs
1911 		 * if we intend to reallocate.
1912 		 */
1913 		/* XXPOLICY */
1914 		if (total_errors > rm->rm_firstdatacol)
1915 			zio->io_error = vdev_raidz_worst_error(rm);
1916 
1917 		return;
1918 	}
1919 
1920 	ASSERT(zio->io_type == ZIO_TYPE_READ);
1921 	/*
1922 	 * There are three potential phases for a read:
1923 	 *	1. produce valid data from the columns read
1924 	 *	2. read all disks and try again
1925 	 *	3. perform combinatorial reconstruction
1926 	 *
1927 	 * Each phase is progressively both more expensive and less likely to
1928 	 * occur. If we encounter more errors than we can repair or all phases
1929 	 * fail, we have no choice but to return an error.
1930 	 */
1931 
1932 	/*
1933 	 * If the number of errors we saw was correctable -- less than or equal
1934 	 * to the number of parity disks read -- attempt to produce data that
1935 	 * has a valid checksum. Naturally, this case applies in the absence of
1936 	 * any errors.
1937 	 */
1938 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1939 		if (data_errors == 0) {
1940 			if (raidz_checksum_verify(zio) == 0) {
1941 				/*
1942 				 * If we read parity information (unnecessarily
1943 				 * as it happens since no reconstruction was
1944 				 * needed) regenerate and verify the parity.
1945 				 * We also regenerate parity when resilvering
1946 				 * so we can write it out to the failed device
1947 				 * later.
1948 				 */
1949 				if (parity_errors + parity_untried <
1950 				    rm->rm_firstdatacol ||
1951 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
1952 					n = raidz_parity_verify(zio, rm);
1953 					unexpected_errors += n;
1954 					ASSERT(parity_errors + n <=
1955 					    rm->rm_firstdatacol);
1956 				}
1957 				goto done;
1958 			}
1959 		} else {
1960 			/*
1961 			 * We either attempt to read all the parity columns or
1962 			 * none of them. If we didn't try to read parity, we
1963 			 * wouldn't be here in the correctable case. There must
1964 			 * also have been fewer parity errors than parity
1965 			 * columns or, again, we wouldn't be in this code path.
1966 			 */
1967 			ASSERT(parity_untried == 0);
1968 			ASSERT(parity_errors < rm->rm_firstdatacol);
1969 
1970 			/*
1971 			 * Identify the data columns that reported an error.
1972 			 */
1973 			n = 0;
1974 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1975 				rc = &rm->rm_col[c];
1976 				if (rc->rc_error != 0) {
1977 					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1978 					tgts[n++] = c;
1979 				}
1980 			}
1981 
1982 			ASSERT(rm->rm_firstdatacol >= n);
1983 
1984 			code = vdev_raidz_reconstruct(rm, tgts, n);
1985 
1986 			if (raidz_checksum_verify(zio) == 0) {
1987 				atomic_inc_64(&raidz_corrected[code]);
1988 
1989 				/*
1990 				 * If we read more parity disks than were used
1991 				 * for reconstruction, confirm that the other
1992 				 * parity disks produced correct data. This
1993 				 * routine is suboptimal in that it regenerates
1994 				 * the parity that we already used in addition
1995 				 * to the parity that we're attempting to
1996 				 * verify, but this should be a relatively
1997 				 * uncommon case, and can be optimized if it
1998 				 * becomes a problem. Note that we regenerate
1999 				 * parity when resilvering so we can write it
2000 				 * out to failed devices later.
2001 				 */
2002 				if (parity_errors < rm->rm_firstdatacol - n ||
2003 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2004 					n = raidz_parity_verify(zio, rm);
2005 					unexpected_errors += n;
2006 					ASSERT(parity_errors + n <=
2007 					    rm->rm_firstdatacol);
2008 				}
2009 
2010 				goto done;
2011 			}
2012 		}
2013 	}
2014 
2015 	/*
2016 	 * This isn't a typical situation -- either we got a read error or
2017 	 * a child silently returned bad data. Read every block so we can
2018 	 * try again with as much data and parity as we can track down. If
2019 	 * we've already been through once before, all children will be marked
2020 	 * as tried so we'll proceed to combinatorial reconstruction.
2021 	 */
2022 	unexpected_errors = 1;
2023 	rm->rm_missingdata = 0;
2024 	rm->rm_missingparity = 0;
2025 
2026 	for (c = 0; c < rm->rm_cols; c++) {
2027 		if (rm->rm_col[c].rc_tried)
2028 			continue;
2029 
2030 		zio_vdev_io_redone(zio);
2031 		do {
2032 			rc = &rm->rm_col[c];
2033 			if (rc->rc_tried)
2034 				continue;
2035 			zio_nowait(zio_vdev_child_io(zio, NULL,
2036 			    vd->vdev_child[rc->rc_devidx],
2037 			    rc->rc_offset, rc->rc_data, rc->rc_size,
2038 			    zio->io_type, zio->io_priority, 0,
2039 			    vdev_raidz_child_done, rc));
2040 		} while (++c < rm->rm_cols);
2041 
2042 		return;
2043 	}
2044 
2045 	/*
2046 	 * At this point we've attempted to reconstruct the data given the
2047 	 * errors we detected, and we've attempted to read all columns. There
2048 	 * must, therefore, be one or more additional problems -- silent errors
2049 	 * resulting in invalid data rather than explicit I/O errors resulting
2050 	 * in absent data. We check if there is enough additional data to
2051 	 * possibly reconstruct the data and then perform combinatorial
2052 	 * reconstruction over all possible combinations. If that fails,
2053 	 * we're cooked.
2054 	 */
2055 	if (total_errors > rm->rm_firstdatacol) {
2056 		zio->io_error = vdev_raidz_worst_error(rm);
2057 
2058 	} else if (total_errors < rm->rm_firstdatacol &&
2059 	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2060 		/*
2061 		 * If we didn't use all the available parity for the
2062 		 * combinatorial reconstruction, verify that the remaining
2063 		 * parity is correct.
2064 		 */
2065 		if (code != (1 << rm->rm_firstdatacol) - 1)
2066 			(void) raidz_parity_verify(zio, rm);
2067 	} else {
2068 		/*
2069 		 * We're here because either:
2070 		 *
2071 		 *	total_errors == rm_first_datacol, or
2072 		 *	vdev_raidz_combrec() failed
2073 		 *
2074 		 * In either case, there is enough bad data to prevent
2075 		 * reconstruction.
2076 		 *
2077 		 * Start checksum ereports for all children which haven't
2078 		 * failed.
2079 		 */
2080 		zio->io_error = ECKSUM;
2081 
2082 		for (c = 0; c < rm->rm_cols; c++) {
2083 			rc = &rm->rm_col[c];
2084 			if (rc->rc_error == 0) {
2085 				zio_bad_cksum_t zbc;
2086 				zbc.zbc_has_cksum = 0;
2087 				zbc.zbc_injected = rm->rm_ecksuminjected;
2088 
2089 				zfs_ereport_start_checksum(
2090 				    zio->io_spa, vd->vdev_child[rc->rc_devidx],
2091 				    zio, rc->rc_offset, rc->rc_size,
2092 				    (void *)(uintptr_t)c, &zbc);
2093 			}
2094 		}
2095 	}
2096 
2097 done:
2098 	zio_checksum_verified(zio);
2099 
2100 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2101 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2102 		/*
2103 		 * Use the good data we have in hand to repair damaged children.
2104 		 */
2105 		for (c = 0; c < rm->rm_cols; c++) {
2106 			rc = &rm->rm_col[c];
2107 			cvd = vd->vdev_child[rc->rc_devidx];
2108 
2109 			if (rc->rc_error == 0)
2110 				continue;
2111 
2112 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2113 			    rc->rc_offset, rc->rc_data, rc->rc_size,
2114 			    ZIO_TYPE_WRITE, zio->io_priority,
2115 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2116 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2117 		}
2118 	}
2119 }
2120 
2121 static void
2122 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2123 {
2124 	if (faulted > vd->vdev_nparity)
2125 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2126 		    VDEV_AUX_NO_REPLICAS);
2127 	else if (degraded + faulted != 0)
2128 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2129 	else
2130 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2131 }
2132 
2133 vdev_ops_t vdev_raidz_ops = {
2134 	vdev_raidz_open,
2135 	vdev_raidz_close,
2136 	vdev_raidz_asize,
2137 	vdev_raidz_io_start,
2138 	vdev_raidz_io_done,
2139 	vdev_raidz_state_change,
2140 	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
2141 	B_FALSE			/* not a leaf vdev */
2142 };
2143