xref: /titanic_52/usr/src/uts/common/fs/zfs/vdev_raidz.c (revision 0a0e9771ca0211c15f3ac4466b661c145feeb9e4)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/zfs_context.h>
28 #include <sys/spa.h>
29 #include <sys/vdev_impl.h>
30 #include <sys/zio.h>
31 #include <sys/zio_checksum.h>
32 #include <sys/fs/zfs.h>
33 #include <sys/fm/fs/zfs.h>
34 
35 /*
36  * Virtual device vector for RAID-Z.
37  *
38  * This vdev supports single, double, and triple parity. For single parity,
39  * we use a simple XOR of all the data columns. For double or triple parity,
40  * we use a special case of Reed-Solomon coding. This extends the
41  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
42  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
43  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
44  * former is also based. The latter is designed to provide higher performance
45  * for writes.
46  *
47  * Note that the Plank paper claimed to support arbitrary N+M, but was then
48  * amended six years later identifying a critical flaw that invalidates its
49  * claims. Nevertheless, the technique can be adapted to work for up to
50  * triple parity. For additional parity, the amendment "Note: Correction to
51  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
52  * is viable, but the additional complexity means that write performance will
53  * suffer.
54  *
55  * All of the methods above operate on a Galois field, defined over the
56  * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
57  * can be expressed with a single byte. Briefly, the operations on the
58  * field are defined as follows:
59  *
60  *   o addition (+) is represented by a bitwise XOR
61  *   o subtraction (-) is therefore identical to addition: A + B = A - B
62  *   o multiplication of A by 2 is defined by the following bitwise expression:
63  *	(A * 2)_7 = A_6
64  *	(A * 2)_6 = A_5
65  *	(A * 2)_5 = A_4
66  *	(A * 2)_4 = A_3 + A_7
67  *	(A * 2)_3 = A_2 + A_7
68  *	(A * 2)_2 = A_1 + A_7
69  *	(A * 2)_1 = A_0
70  *	(A * 2)_0 = A_7
71  *
72  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
73  * As an aside, this multiplication is derived from the error correcting
74  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
75  *
76  * Observe that any number in the field (except for 0) can be expressed as a
77  * power of 2 -- a generator for the field. We store a table of the powers of
78  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
79  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
80  * than field addition). The inverse of a field element A (A^-1) is therefore
81  * A ^ (255 - 1) = A^254.
82  *
83  * The up-to-three parity columns, P, Q, R over several data columns,
84  * D_0, ... D_n-1, can be expressed by field operations:
85  *
86  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
87  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
88  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
89  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
90  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
91  *
92  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival
93  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
94  * independent coefficients. (There are no additional coefficients that have
95  * this property which is why the uncorrected Plank method breaks down.)
96  *
97  * See the reconstruction code below for how P, Q and R can used individually
98  * or in concert to recover missing data columns.
99  */
100 
101 typedef struct raidz_col {
102 	uint64_t rc_devidx;		/* child device index for I/O */
103 	uint64_t rc_offset;		/* device offset */
104 	uint64_t rc_size;		/* I/O size */
105 	void *rc_data;			/* I/O data */
106 	void *rc_gdata;			/* used to store the "good" version */
107 	int rc_error;			/* I/O error for this device */
108 	uint8_t rc_tried;		/* Did we attempt this I/O column? */
109 	uint8_t rc_skipped;		/* Did we skip this I/O column? */
110 } raidz_col_t;
111 
112 typedef struct raidz_map {
113 	uint64_t rm_cols;		/* Regular column count */
114 	uint64_t rm_scols;		/* Count including skipped columns */
115 	uint64_t rm_bigcols;		/* Number of oversized columns */
116 	uint64_t rm_asize;		/* Actual total I/O size */
117 	uint64_t rm_missingdata;	/* Count of missing data devices */
118 	uint64_t rm_missingparity;	/* Count of missing parity devices */
119 	uint64_t rm_firstdatacol;	/* First data column/parity count */
120 	uint64_t rm_nskip;		/* Skipped sectors for padding */
121 	uint64_t rm_skipstart;	/* Column index of padding start */
122 	void *rm_datacopy;		/* rm_asize-buffer of copied data */
123 	uintptr_t rm_reports;		/* # of referencing checksum reports */
124 	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
125 	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
126 	uint64_t rm_skipped;		/* Skipped sectors for padding */
127 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
128 } raidz_map_t;
129 
130 #define	VDEV_RAIDZ_P		0
131 #define	VDEV_RAIDZ_Q		1
132 #define	VDEV_RAIDZ_R		2
133 #define	VDEV_RAIDZ_MAXPARITY	3
134 
135 #define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
136 #define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
137 
138 /*
139  * We provide a mechanism to perform the field multiplication operation on a
140  * 64-bit value all at once rather than a byte at a time. This works by
141  * creating a mask from the top bit in each byte and using that to
142  * conditionally apply the XOR of 0x1d.
143  */
144 #define	VDEV_RAIDZ_64MUL_2(x, mask) \
145 { \
146 	(mask) = (x) & 0x8080808080808080ULL; \
147 	(mask) = ((mask) << 1) - ((mask) >> 7); \
148 	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
149 	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
150 }
151 
152 #define	VDEV_RAIDZ_64MUL_4(x, mask) \
153 { \
154 	VDEV_RAIDZ_64MUL_2((x), mask); \
155 	VDEV_RAIDZ_64MUL_2((x), mask); \
156 }
157 
158 /*
159  * Force reconstruction to use the general purpose method.
160  */
161 int vdev_raidz_default_to_general;
162 
163 /*
164  * These two tables represent powers and logs of 2 in the Galois field defined
165  * above. These values were computed by repeatedly multiplying by 2 as above.
166  */
167 static const uint8_t vdev_raidz_pow2[256] = {
168 	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
169 	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
170 	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
171 	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
172 	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
173 	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
174 	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
175 	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
176 	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
177 	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
178 	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
179 	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
180 	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
181 	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
182 	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
183 	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
184 	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
185 	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
186 	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
187 	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
188 	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
189 	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
190 	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
191 	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
192 	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
193 	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
194 	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
195 	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
196 	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
197 	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
198 	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
199 	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
200 };
201 static const uint8_t vdev_raidz_log2[256] = {
202 	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
203 	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
204 	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
205 	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
206 	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
207 	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
208 	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
209 	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
210 	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
211 	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
212 	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
213 	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
214 	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
215 	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
216 	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
217 	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
218 	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
219 	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
220 	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
221 	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
222 	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
223 	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
224 	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
225 	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
226 	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
227 	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
228 	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
229 	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
230 	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
231 	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
232 	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
233 	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
234 };
235 
236 static void vdev_raidz_generate_parity(raidz_map_t *rm);
237 
238 /*
239  * Multiply a given number by 2 raised to the given power.
240  */
241 static uint8_t
242 vdev_raidz_exp2(uint_t a, int exp)
243 {
244 	if (a == 0)
245 		return (0);
246 
247 	ASSERT(exp >= 0);
248 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
249 
250 	exp += vdev_raidz_log2[a];
251 	if (exp > 255)
252 		exp -= 255;
253 
254 	return (vdev_raidz_pow2[exp]);
255 }
256 
257 static void
258 vdev_raidz_map_free(raidz_map_t *rm)
259 {
260 	int c;
261 	size_t size = rm->rm_asize;	/* will hold data-size after the loop */
262 
263 	for (c = 0; c < rm->rm_firstdatacol; c++) {
264 		size -= rm->rm_col[c].rc_size;
265 
266 		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
267 
268 		if (rm->rm_col[c].rc_gdata != NULL)
269 			zio_buf_free(rm->rm_col[c].rc_gdata,
270 			    rm->rm_col[c].rc_size);
271 	}
272 
273 	if (rm->rm_datacopy != NULL)
274 		zio_buf_free(rm->rm_datacopy, size);
275 
276 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
277 }
278 
279 static void
280 vdev_raidz_map_free_vsd(zio_t *zio)
281 {
282 	raidz_map_t *rm = zio->io_vsd;
283 
284 	ASSERT3U(rm->rm_freed, ==, 0);
285 	rm->rm_freed = 1;
286 
287 	if (rm->rm_reports == 0)
288 		vdev_raidz_map_free(rm);
289 }
290 
291 /*ARGSUSED*/
292 static void
293 vdev_raidz_cksum_free(void *arg, size_t ignored)
294 {
295 	raidz_map_t *rm = arg;
296 
297 	ASSERT3U(rm->rm_reports, >, 0);
298 	ASSERT3U(rm->rm_freed, !=, 0);
299 
300 	if (--rm->rm_reports == 0)
301 		vdev_raidz_map_free(rm);
302 }
303 
304 static void
305 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
306 {
307 	raidz_map_t *rm = zcr->zcr_cbdata;
308 	size_t c = zcr->zcr_cbinfo;
309 	size_t x;
310 
311 	const char *good = NULL;
312 	const char *bad = rm->rm_col[c].rc_data;
313 
314 	if (good_data == NULL) {
315 		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
316 		return;
317 	}
318 
319 	if (c < rm->rm_firstdatacol) {
320 		/*
321 		 * The first time through, calculate the parity blocks for
322 		 * the good data (this relies on the fact that the good
323 		 * data never changes for a given logical ZIO)
324 		 */
325 		if (rm->rm_col[0].rc_gdata == NULL) {
326 			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
327 			char *buf;
328 
329 			/*
330 			 * Set up the rm_col[]s to generate the parity for
331 			 * good_data, first saving the parity bufs and
332 			 * replacing them with buffers to hold the result.
333 			 */
334 			for (x = 0; x < rm->rm_firstdatacol; x++) {
335 				bad_parity[x] = rm->rm_col[x].rc_data;
336 				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
337 				    zio_buf_alloc(rm->rm_col[x].rc_size);
338 			}
339 
340 			/* fill in the data columns from good_data */
341 			buf = (char *)good_data;
342 			for (; x < rm->rm_cols; x++) {
343 				rm->rm_col[x].rc_data = buf;
344 				buf += rm->rm_col[x].rc_size;
345 			}
346 
347 			/*
348 			 * Construct the parity from the good data.
349 			 */
350 			vdev_raidz_generate_parity(rm);
351 
352 			/* restore everything back to its original state */
353 			for (x = 0; x < rm->rm_firstdatacol; x++)
354 				rm->rm_col[x].rc_data = bad_parity[x];
355 
356 			buf = rm->rm_datacopy;
357 			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
358 				rm->rm_col[x].rc_data = buf;
359 				buf += rm->rm_col[x].rc_size;
360 			}
361 		}
362 
363 		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
364 		good = rm->rm_col[c].rc_gdata;
365 	} else {
366 		/* adjust good_data to point at the start of our column */
367 		good = good_data;
368 
369 		for (x = rm->rm_firstdatacol; x < c; x++)
370 			good += rm->rm_col[x].rc_size;
371 	}
372 
373 	/* we drop the ereport if it ends up that the data was good */
374 	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
375 }
376 
377 /*
378  * Invoked indirectly by zfs_ereport_start_checksum(), called
379  * below when our read operation fails completely.  The main point
380  * is to keep a copy of everything we read from disk, so that at
381  * vdev_raidz_cksum_finish() time we can compare it with the good data.
382  */
383 static void
384 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
385 {
386 	size_t c = (size_t)(uintptr_t)arg;
387 	caddr_t buf;
388 
389 	raidz_map_t *rm = zio->io_vsd;
390 	size_t size;
391 
392 	/* set up the report and bump the refcount  */
393 	zcr->zcr_cbdata = rm;
394 	zcr->zcr_cbinfo = c;
395 	zcr->zcr_finish = vdev_raidz_cksum_finish;
396 	zcr->zcr_free = vdev_raidz_cksum_free;
397 
398 	rm->rm_reports++;
399 	ASSERT3U(rm->rm_reports, >, 0);
400 
401 	if (rm->rm_reports != 1)
402 		return;
403 
404 	/*
405 	 * It's the first time we're called, so we need to copy the data
406 	 * aside; there's no guarantee that our zio's buffer won't be
407 	 * re-used for something else.
408 	 *
409 	 * Our parity data is already in seperate buffers, so there's no need
410 	 * to copy them.
411 	 */
412 	ASSERT3P(rm->rm_datacopy, ==, NULL);
413 
414 	/* rm_asize includes the parity blocks; subtract them out */
415 	size = rm->rm_asize;
416 	for (c = 0; c < rm->rm_firstdatacol; c++)
417 		size -= rm->rm_col[c].rc_size;
418 
419 	buf = rm->rm_datacopy = zio_buf_alloc(size);
420 	for (; c < rm->rm_cols; c++) {
421 		raidz_col_t *col = &rm->rm_col[c];
422 
423 		bcopy(col->rc_data, buf, col->rc_size);
424 		col->rc_data = buf;
425 
426 		buf += col->rc_size;
427 	}
428 	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
429 }
430 
431 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
432 	vdev_raidz_map_free_vsd,
433 	vdev_raidz_cksum_report
434 };
435 
436 static raidz_map_t *
437 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
438     uint64_t nparity)
439 {
440 	raidz_map_t *rm;
441 	uint64_t b = zio->io_offset >> unit_shift;
442 	uint64_t s = zio->io_size >> unit_shift;
443 	uint64_t f = b % dcols;
444 	uint64_t o = (b / dcols) << unit_shift;
445 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
446 
447 	q = s / (dcols - nparity);
448 	r = s - q * (dcols - nparity);
449 	bc = (r == 0 ? 0 : r + nparity);
450 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
451 
452 	if (q == 0) {
453 		acols = bc;
454 		scols = MIN(dcols, roundup(bc, nparity + 1));
455 	} else {
456 		acols = dcols;
457 		scols = dcols;
458 	}
459 
460 	ASSERT3U(acols, <=, scols);
461 
462 	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
463 
464 	rm->rm_cols = acols;
465 	rm->rm_scols = scols;
466 	rm->rm_bigcols = bc;
467 	rm->rm_skipstart = bc;
468 	rm->rm_missingdata = 0;
469 	rm->rm_missingparity = 0;
470 	rm->rm_firstdatacol = nparity;
471 	rm->rm_datacopy = NULL;
472 	rm->rm_reports = 0;
473 	rm->rm_freed = 0;
474 	rm->rm_ecksuminjected = 0;
475 
476 	asize = 0;
477 
478 	for (c = 0; c < scols; c++) {
479 		col = f + c;
480 		coff = o;
481 		if (col >= dcols) {
482 			col -= dcols;
483 			coff += 1ULL << unit_shift;
484 		}
485 		rm->rm_col[c].rc_devidx = col;
486 		rm->rm_col[c].rc_offset = coff;
487 		rm->rm_col[c].rc_data = NULL;
488 		rm->rm_col[c].rc_gdata = NULL;
489 		rm->rm_col[c].rc_error = 0;
490 		rm->rm_col[c].rc_tried = 0;
491 		rm->rm_col[c].rc_skipped = 0;
492 
493 		if (c >= acols)
494 			rm->rm_col[c].rc_size = 0;
495 		else if (c < bc)
496 			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
497 		else
498 			rm->rm_col[c].rc_size = q << unit_shift;
499 
500 		asize += rm->rm_col[c].rc_size;
501 	}
502 
503 	ASSERT3U(asize, ==, tot << unit_shift);
504 	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
505 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
506 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
507 	ASSERT3U(rm->rm_nskip, <=, nparity);
508 
509 	for (c = 0; c < rm->rm_firstdatacol; c++)
510 		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
511 
512 	rm->rm_col[c].rc_data = zio->io_data;
513 
514 	for (c = c + 1; c < acols; c++)
515 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
516 		    rm->rm_col[c - 1].rc_size;
517 
518 	/*
519 	 * If all data stored spans all columns, there's a danger that parity
520 	 * will always be on the same device and, since parity isn't read
521 	 * during normal operation, that that device's I/O bandwidth won't be
522 	 * used effectively. We therefore switch the parity every 1MB.
523 	 *
524 	 * ... at least that was, ostensibly, the theory. As a practical
525 	 * matter unless we juggle the parity between all devices evenly, we
526 	 * won't see any benefit. Further, occasional writes that aren't a
527 	 * multiple of the LCM of the number of children and the minimum
528 	 * stripe width are sufficient to avoid pessimal behavior.
529 	 * Unfortunately, this decision created an implicit on-disk format
530 	 * requirement that we need to support for all eternity, but only
531 	 * for single-parity RAID-Z.
532 	 *
533 	 * If we intend to skip a sector in the zeroth column for padding
534 	 * we must make sure to note this swap. We will never intend to
535 	 * skip the first column since at least one data and one parity
536 	 * column must appear in each row.
537 	 */
538 	ASSERT(rm->rm_cols >= 2);
539 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
540 
541 	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
542 		devidx = rm->rm_col[0].rc_devidx;
543 		o = rm->rm_col[0].rc_offset;
544 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
545 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
546 		rm->rm_col[1].rc_devidx = devidx;
547 		rm->rm_col[1].rc_offset = o;
548 
549 		if (rm->rm_skipstart == 0)
550 			rm->rm_skipstart = 1;
551 	}
552 
553 	zio->io_vsd = rm;
554 	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
555 	return (rm);
556 }
557 
558 static void
559 vdev_raidz_generate_parity_p(raidz_map_t *rm)
560 {
561 	uint64_t *p, *src, pcount, ccount, i;
562 	int c;
563 
564 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
565 
566 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
567 		src = rm->rm_col[c].rc_data;
568 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
569 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
570 
571 		if (c == rm->rm_firstdatacol) {
572 			ASSERT(ccount == pcount);
573 			for (i = 0; i < ccount; i++, src++, p++) {
574 				*p = *src;
575 			}
576 		} else {
577 			ASSERT(ccount <= pcount);
578 			for (i = 0; i < ccount; i++, src++, p++) {
579 				*p ^= *src;
580 			}
581 		}
582 	}
583 }
584 
585 static void
586 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
587 {
588 	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
589 	int c;
590 
591 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
592 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
593 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
594 
595 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
596 		src = rm->rm_col[c].rc_data;
597 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
598 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
599 
600 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
601 
602 		if (c == rm->rm_firstdatacol) {
603 			ASSERT(ccnt == pcnt || ccnt == 0);
604 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
605 				*p = *src;
606 				*q = *src;
607 			}
608 			for (; i < pcnt; i++, src++, p++, q++) {
609 				*p = 0;
610 				*q = 0;
611 			}
612 		} else {
613 			ASSERT(ccnt <= pcnt);
614 
615 			/*
616 			 * Apply the algorithm described above by multiplying
617 			 * the previous result and adding in the new value.
618 			 */
619 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
620 				*p ^= *src;
621 
622 				VDEV_RAIDZ_64MUL_2(*q, mask);
623 				*q ^= *src;
624 			}
625 
626 			/*
627 			 * Treat short columns as though they are full of 0s.
628 			 * Note that there's therefore nothing needed for P.
629 			 */
630 			for (; i < pcnt; i++, q++) {
631 				VDEV_RAIDZ_64MUL_2(*q, mask);
632 			}
633 		}
634 	}
635 }
636 
637 static void
638 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
639 {
640 	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
641 	int c;
642 
643 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
644 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
645 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
646 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
647 	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
648 
649 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
650 		src = rm->rm_col[c].rc_data;
651 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
652 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
653 		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
654 
655 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
656 
657 		if (c == rm->rm_firstdatacol) {
658 			ASSERT(ccnt == pcnt || ccnt == 0);
659 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
660 				*p = *src;
661 				*q = *src;
662 				*r = *src;
663 			}
664 			for (; i < pcnt; i++, src++, p++, q++, r++) {
665 				*p = 0;
666 				*q = 0;
667 				*r = 0;
668 			}
669 		} else {
670 			ASSERT(ccnt <= pcnt);
671 
672 			/*
673 			 * Apply the algorithm described above by multiplying
674 			 * the previous result and adding in the new value.
675 			 */
676 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
677 				*p ^= *src;
678 
679 				VDEV_RAIDZ_64MUL_2(*q, mask);
680 				*q ^= *src;
681 
682 				VDEV_RAIDZ_64MUL_4(*r, mask);
683 				*r ^= *src;
684 			}
685 
686 			/*
687 			 * Treat short columns as though they are full of 0s.
688 			 * Note that there's therefore nothing needed for P.
689 			 */
690 			for (; i < pcnt; i++, q++, r++) {
691 				VDEV_RAIDZ_64MUL_2(*q, mask);
692 				VDEV_RAIDZ_64MUL_4(*r, mask);
693 			}
694 		}
695 	}
696 }
697 
698 /*
699  * Generate RAID parity in the first virtual columns according to the number of
700  * parity columns available.
701  */
702 static void
703 vdev_raidz_generate_parity(raidz_map_t *rm)
704 {
705 	switch (rm->rm_firstdatacol) {
706 	case 1:
707 		vdev_raidz_generate_parity_p(rm);
708 		break;
709 	case 2:
710 		vdev_raidz_generate_parity_pq(rm);
711 		break;
712 	case 3:
713 		vdev_raidz_generate_parity_pqr(rm);
714 		break;
715 	default:
716 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
717 	}
718 }
719 
720 static int
721 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
722 {
723 	uint64_t *dst, *src, xcount, ccount, count, i;
724 	int x = tgts[0];
725 	int c;
726 
727 	ASSERT(ntgts == 1);
728 	ASSERT(x >= rm->rm_firstdatacol);
729 	ASSERT(x < rm->rm_cols);
730 
731 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
732 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
733 	ASSERT(xcount > 0);
734 
735 	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
736 	dst = rm->rm_col[x].rc_data;
737 	for (i = 0; i < xcount; i++, dst++, src++) {
738 		*dst = *src;
739 	}
740 
741 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
742 		src = rm->rm_col[c].rc_data;
743 		dst = rm->rm_col[x].rc_data;
744 
745 		if (c == x)
746 			continue;
747 
748 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
749 		count = MIN(ccount, xcount);
750 
751 		for (i = 0; i < count; i++, dst++, src++) {
752 			*dst ^= *src;
753 		}
754 	}
755 
756 	return (1 << VDEV_RAIDZ_P);
757 }
758 
759 static int
760 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
761 {
762 	uint64_t *dst, *src, xcount, ccount, count, mask, i;
763 	uint8_t *b;
764 	int x = tgts[0];
765 	int c, j, exp;
766 
767 	ASSERT(ntgts == 1);
768 
769 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
770 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
771 
772 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
773 		src = rm->rm_col[c].rc_data;
774 		dst = rm->rm_col[x].rc_data;
775 
776 		if (c == x)
777 			ccount = 0;
778 		else
779 			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
780 
781 		count = MIN(ccount, xcount);
782 
783 		if (c == rm->rm_firstdatacol) {
784 			for (i = 0; i < count; i++, dst++, src++) {
785 				*dst = *src;
786 			}
787 			for (; i < xcount; i++, dst++) {
788 				*dst = 0;
789 			}
790 
791 		} else {
792 			for (i = 0; i < count; i++, dst++, src++) {
793 				VDEV_RAIDZ_64MUL_2(*dst, mask);
794 				*dst ^= *src;
795 			}
796 
797 			for (; i < xcount; i++, dst++) {
798 				VDEV_RAIDZ_64MUL_2(*dst, mask);
799 			}
800 		}
801 	}
802 
803 	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
804 	dst = rm->rm_col[x].rc_data;
805 	exp = 255 - (rm->rm_cols - 1 - x);
806 
807 	for (i = 0; i < xcount; i++, dst++, src++) {
808 		*dst ^= *src;
809 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
810 			*b = vdev_raidz_exp2(*b, exp);
811 		}
812 	}
813 
814 	return (1 << VDEV_RAIDZ_Q);
815 }
816 
817 static int
818 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
819 {
820 	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
821 	void *pdata, *qdata;
822 	uint64_t xsize, ysize, i;
823 	int x = tgts[0];
824 	int y = tgts[1];
825 
826 	ASSERT(ntgts == 2);
827 	ASSERT(x < y);
828 	ASSERT(x >= rm->rm_firstdatacol);
829 	ASSERT(y < rm->rm_cols);
830 
831 	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
832 
833 	/*
834 	 * Move the parity data aside -- we're going to compute parity as
835 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
836 	 * reuse the parity generation mechanism without trashing the actual
837 	 * parity so we make those columns appear to be full of zeros by
838 	 * setting their lengths to zero.
839 	 */
840 	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
841 	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
842 	xsize = rm->rm_col[x].rc_size;
843 	ysize = rm->rm_col[y].rc_size;
844 
845 	rm->rm_col[VDEV_RAIDZ_P].rc_data =
846 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
847 	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
848 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
849 	rm->rm_col[x].rc_size = 0;
850 	rm->rm_col[y].rc_size = 0;
851 
852 	vdev_raidz_generate_parity_pq(rm);
853 
854 	rm->rm_col[x].rc_size = xsize;
855 	rm->rm_col[y].rc_size = ysize;
856 
857 	p = pdata;
858 	q = qdata;
859 	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
860 	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
861 	xd = rm->rm_col[x].rc_data;
862 	yd = rm->rm_col[y].rc_data;
863 
864 	/*
865 	 * We now have:
866 	 *	Pxy = P + D_x + D_y
867 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
868 	 *
869 	 * We can then solve for D_x:
870 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
871 	 * where
872 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
873 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
874 	 *
875 	 * With D_x in hand, we can easily solve for D_y:
876 	 *	D_y = P + Pxy + D_x
877 	 */
878 
879 	a = vdev_raidz_pow2[255 + x - y];
880 	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
881 	tmp = 255 - vdev_raidz_log2[a ^ 1];
882 
883 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
884 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
885 
886 	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
887 		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
888 		    vdev_raidz_exp2(*q ^ *qxy, bexp);
889 
890 		if (i < ysize)
891 			*yd = *p ^ *pxy ^ *xd;
892 	}
893 
894 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
895 	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
896 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
897 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
898 
899 	/*
900 	 * Restore the saved parity data.
901 	 */
902 	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
903 	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
904 
905 	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
906 }
907 
908 /* BEGIN CSTYLED */
909 /*
910  * In the general case of reconstruction, we must solve the system of linear
911  * equations defined by the coeffecients used to generate parity as well as
912  * the contents of the data and parity disks. This can be expressed with
913  * vectors for the original data (D) and the actual data (d) and parity (p)
914  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
915  *
916  *            __   __                     __     __
917  *            |     |         __     __   |  p_0  |
918  *            |  V  |         |  D_0  |   | p_m-1 |
919  *            |     |    x    |   :   | = |  d_0  |
920  *            |  I  |         | D_n-1 |   |   :   |
921  *            |     |         ~~     ~~   | d_n-1 |
922  *            ~~   ~~                     ~~     ~~
923  *
924  * I is simply a square identity matrix of size n, and V is a vandermonde
925  * matrix defined by the coeffecients we chose for the various parity columns
926  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
927  * computation as well as linear separability.
928  *
929  *      __               __               __     __
930  *      |   1   ..  1 1 1 |               |  p_0  |
931  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
932  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
933  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
934  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
935  *      |   :       : : : |   |   :   |   |  d_2  |
936  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
937  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
938  *      |   0   ..  0 0 1 |               | d_n-1 |
939  *      ~~               ~~               ~~     ~~
940  *
941  * Note that I, V, d, and p are known. To compute D, we must invert the
942  * matrix and use the known data and parity values to reconstruct the unknown
943  * data values. We begin by removing the rows in V|I and d|p that correspond
944  * to failed or missing columns; we then make V|I square (n x n) and d|p
945  * sized n by removing rows corresponding to unused parity from the bottom up
946  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
947  * using Gauss-Jordan elimination. In the example below we use m=3 parity
948  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
949  *           __                               __
950  *           |  1   1   1   1   1   1   1   1  |
951  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
952  *           |  19 205 116  29  64  16  4   1  |      / /
953  *           |  1   0   0   0   0   0   0   0  |     / /
954  *           |  0   1   0   0   0   0   0   0  | <--' /
955  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
956  *           |  0   0   0   1   0   0   0   0  |
957  *           |  0   0   0   0   1   0   0   0  |
958  *           |  0   0   0   0   0   1   0   0  |
959  *           |  0   0   0   0   0   0   1   0  |
960  *           |  0   0   0   0   0   0   0   1  |
961  *           ~~                               ~~
962  *           __                               __
963  *           |  1   1   1   1   1   1   1   1  |
964  *           | 128  64  32  16  8   4   2   1  |
965  *           |  19 205 116  29  64  16  4   1  |
966  *           |  1   0   0   0   0   0   0   0  |
967  *           |  0   1   0   0   0   0   0   0  |
968  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
969  *           |  0   0   0   1   0   0   0   0  |
970  *           |  0   0   0   0   1   0   0   0  |
971  *           |  0   0   0   0   0   1   0   0  |
972  *           |  0   0   0   0   0   0   1   0  |
973  *           |  0   0   0   0   0   0   0   1  |
974  *           ~~                               ~~
975  *
976  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
977  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
978  * matrix is not singular.
979  * __                                                                 __
980  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
981  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
982  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
983  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
984  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
985  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
986  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
987  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
988  * ~~                                                                 ~~
989  * __                                                                 __
990  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
991  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
992  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
993  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
994  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
995  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
996  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
997  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
998  * ~~                                                                 ~~
999  * __                                                                 __
1000  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1001  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1002  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1003  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1004  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1005  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1006  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1007  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1008  * ~~                                                                 ~~
1009  * __                                                                 __
1010  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1011  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1012  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1013  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1014  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1015  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1016  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1017  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1018  * ~~                                                                 ~~
1019  * __                                                                 __
1020  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1021  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1022  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1023  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1024  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1025  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1026  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1027  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1028  * ~~                                                                 ~~
1029  * __                                                                 __
1030  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1031  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1032  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1033  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1034  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1035  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1036  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1037  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1038  * ~~                                                                 ~~
1039  *                   __                               __
1040  *                   |  0   0   1   0   0   0   0   0  |
1041  *                   | 167 100  5   41 159 169 217 208 |
1042  *                   | 166 100  4   40 158 168 216 209 |
1043  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1044  *                   |  0   0   0   0   1   0   0   0  |
1045  *                   |  0   0   0   0   0   1   0   0  |
1046  *                   |  0   0   0   0   0   0   1   0  |
1047  *                   |  0   0   0   0   0   0   0   1  |
1048  *                   ~~                               ~~
1049  *
1050  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1051  * of the missing data.
1052  *
1053  * As is apparent from the example above, the only non-trivial rows in the
1054  * inverse matrix correspond to the data disks that we're trying to
1055  * reconstruct. Indeed, those are the only rows we need as the others would
1056  * only be useful for reconstructing data known or assumed to be valid. For
1057  * that reason, we only build the coefficients in the rows that correspond to
1058  * targeted columns.
1059  */
1060 /* END CSTYLED */
1061 
1062 static void
1063 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1064     uint8_t **rows)
1065 {
1066 	int i, j;
1067 	int pow;
1068 
1069 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1070 
1071 	/*
1072 	 * Fill in the missing rows of interest.
1073 	 */
1074 	for (i = 0; i < nmap; i++) {
1075 		ASSERT3S(0, <=, map[i]);
1076 		ASSERT3S(map[i], <=, 2);
1077 
1078 		pow = map[i] * n;
1079 		if (pow > 255)
1080 			pow -= 255;
1081 		ASSERT(pow <= 255);
1082 
1083 		for (j = 0; j < n; j++) {
1084 			pow -= map[i];
1085 			if (pow < 0)
1086 				pow += 255;
1087 			rows[i][j] = vdev_raidz_pow2[pow];
1088 		}
1089 	}
1090 }
1091 
1092 static void
1093 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1094     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1095 {
1096 	int i, j, ii, jj;
1097 	uint8_t log;
1098 
1099 	/*
1100 	 * Assert that the first nmissing entries from the array of used
1101 	 * columns correspond to parity columns and that subsequent entries
1102 	 * correspond to data columns.
1103 	 */
1104 	for (i = 0; i < nmissing; i++) {
1105 		ASSERT3S(used[i], <, rm->rm_firstdatacol);
1106 	}
1107 	for (; i < n; i++) {
1108 		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1109 	}
1110 
1111 	/*
1112 	 * First initialize the storage where we'll compute the inverse rows.
1113 	 */
1114 	for (i = 0; i < nmissing; i++) {
1115 		for (j = 0; j < n; j++) {
1116 			invrows[i][j] = (i == j) ? 1 : 0;
1117 		}
1118 	}
1119 
1120 	/*
1121 	 * Subtract all trivial rows from the rows of consequence.
1122 	 */
1123 	for (i = 0; i < nmissing; i++) {
1124 		for (j = nmissing; j < n; j++) {
1125 			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1126 			jj = used[j] - rm->rm_firstdatacol;
1127 			ASSERT3S(jj, <, n);
1128 			invrows[i][j] = rows[i][jj];
1129 			rows[i][jj] = 0;
1130 		}
1131 	}
1132 
1133 	/*
1134 	 * For each of the rows of interest, we must normalize it and subtract
1135 	 * a multiple of it from the other rows.
1136 	 */
1137 	for (i = 0; i < nmissing; i++) {
1138 		for (j = 0; j < missing[i]; j++) {
1139 			ASSERT3U(rows[i][j], ==, 0);
1140 		}
1141 		ASSERT3U(rows[i][missing[i]], !=, 0);
1142 
1143 		/*
1144 		 * Compute the inverse of the first element and multiply each
1145 		 * element in the row by that value.
1146 		 */
1147 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1148 
1149 		for (j = 0; j < n; j++) {
1150 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1151 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1152 		}
1153 
1154 		for (ii = 0; ii < nmissing; ii++) {
1155 			if (i == ii)
1156 				continue;
1157 
1158 			ASSERT3U(rows[ii][missing[i]], !=, 0);
1159 
1160 			log = vdev_raidz_log2[rows[ii][missing[i]]];
1161 
1162 			for (j = 0; j < n; j++) {
1163 				rows[ii][j] ^=
1164 				    vdev_raidz_exp2(rows[i][j], log);
1165 				invrows[ii][j] ^=
1166 				    vdev_raidz_exp2(invrows[i][j], log);
1167 			}
1168 		}
1169 	}
1170 
1171 	/*
1172 	 * Verify that the data that is left in the rows are properly part of
1173 	 * an identity matrix.
1174 	 */
1175 	for (i = 0; i < nmissing; i++) {
1176 		for (j = 0; j < n; j++) {
1177 			if (j == missing[i]) {
1178 				ASSERT3U(rows[i][j], ==, 1);
1179 			} else {
1180 				ASSERT3U(rows[i][j], ==, 0);
1181 			}
1182 		}
1183 	}
1184 }
1185 
1186 static void
1187 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1188     int *missing, uint8_t **invrows, const uint8_t *used)
1189 {
1190 	int i, j, x, cc, c;
1191 	uint8_t *src;
1192 	uint64_t ccount;
1193 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1194 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1195 	uint8_t log, val;
1196 	int ll;
1197 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1198 	uint8_t *p, *pp;
1199 	size_t psize;
1200 
1201 	psize = sizeof (invlog[0][0]) * n * nmissing;
1202 	p = kmem_alloc(psize, KM_SLEEP);
1203 
1204 	for (pp = p, i = 0; i < nmissing; i++) {
1205 		invlog[i] = pp;
1206 		pp += n;
1207 	}
1208 
1209 	for (i = 0; i < nmissing; i++) {
1210 		for (j = 0; j < n; j++) {
1211 			ASSERT3U(invrows[i][j], !=, 0);
1212 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1213 		}
1214 	}
1215 
1216 	for (i = 0; i < n; i++) {
1217 		c = used[i];
1218 		ASSERT3U(c, <, rm->rm_cols);
1219 
1220 		src = rm->rm_col[c].rc_data;
1221 		ccount = rm->rm_col[c].rc_size;
1222 		for (j = 0; j < nmissing; j++) {
1223 			cc = missing[j] + rm->rm_firstdatacol;
1224 			ASSERT3U(cc, >=, rm->rm_firstdatacol);
1225 			ASSERT3U(cc, <, rm->rm_cols);
1226 			ASSERT3U(cc, !=, c);
1227 
1228 			dst[j] = rm->rm_col[cc].rc_data;
1229 			dcount[j] = rm->rm_col[cc].rc_size;
1230 		}
1231 
1232 		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1233 
1234 		for (x = 0; x < ccount; x++, src++) {
1235 			if (*src != 0)
1236 				log = vdev_raidz_log2[*src];
1237 
1238 			for (cc = 0; cc < nmissing; cc++) {
1239 				if (x >= dcount[cc])
1240 					continue;
1241 
1242 				if (*src == 0) {
1243 					val = 0;
1244 				} else {
1245 					if ((ll = log + invlog[cc][i]) >= 255)
1246 						ll -= 255;
1247 					val = vdev_raidz_pow2[ll];
1248 				}
1249 
1250 				if (i == 0)
1251 					dst[cc][x] = val;
1252 				else
1253 					dst[cc][x] ^= val;
1254 			}
1255 		}
1256 	}
1257 
1258 	kmem_free(p, psize);
1259 }
1260 
1261 static int
1262 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1263 {
1264 	int n, i, c, t, tt;
1265 	int nmissing_rows;
1266 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1267 	int parity_map[VDEV_RAIDZ_MAXPARITY];
1268 
1269 	uint8_t *p, *pp;
1270 	size_t psize;
1271 
1272 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1273 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1274 	uint8_t *used;
1275 
1276 	int code = 0;
1277 
1278 
1279 	n = rm->rm_cols - rm->rm_firstdatacol;
1280 
1281 	/*
1282 	 * Figure out which data columns are missing.
1283 	 */
1284 	nmissing_rows = 0;
1285 	for (t = 0; t < ntgts; t++) {
1286 		if (tgts[t] >= rm->rm_firstdatacol) {
1287 			missing_rows[nmissing_rows++] =
1288 			    tgts[t] - rm->rm_firstdatacol;
1289 		}
1290 	}
1291 
1292 	/*
1293 	 * Figure out which parity columns to use to help generate the missing
1294 	 * data columns.
1295 	 */
1296 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1297 		ASSERT(tt < ntgts);
1298 		ASSERT(c < rm->rm_firstdatacol);
1299 
1300 		/*
1301 		 * Skip any targeted parity columns.
1302 		 */
1303 		if (c == tgts[tt]) {
1304 			tt++;
1305 			continue;
1306 		}
1307 
1308 		code |= 1 << c;
1309 
1310 		parity_map[i] = c;
1311 		i++;
1312 	}
1313 
1314 	ASSERT(code != 0);
1315 	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1316 
1317 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1318 	    nmissing_rows * n + sizeof (used[0]) * n;
1319 	p = kmem_alloc(psize, KM_SLEEP);
1320 
1321 	for (pp = p, i = 0; i < nmissing_rows; i++) {
1322 		rows[i] = pp;
1323 		pp += n;
1324 		invrows[i] = pp;
1325 		pp += n;
1326 	}
1327 	used = pp;
1328 
1329 	for (i = 0; i < nmissing_rows; i++) {
1330 		used[i] = parity_map[i];
1331 	}
1332 
1333 	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1334 		if (tt < nmissing_rows &&
1335 		    c == missing_rows[tt] + rm->rm_firstdatacol) {
1336 			tt++;
1337 			continue;
1338 		}
1339 
1340 		ASSERT3S(i, <, n);
1341 		used[i] = c;
1342 		i++;
1343 	}
1344 
1345 	/*
1346 	 * Initialize the interesting rows of the matrix.
1347 	 */
1348 	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1349 
1350 	/*
1351 	 * Invert the matrix.
1352 	 */
1353 	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1354 	    invrows, used);
1355 
1356 	/*
1357 	 * Reconstruct the missing data using the generated matrix.
1358 	 */
1359 	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1360 	    invrows, used);
1361 
1362 	kmem_free(p, psize);
1363 
1364 	return (code);
1365 }
1366 
1367 static int
1368 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1369 {
1370 	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1371 	int ntgts;
1372 	int i, c;
1373 	int code;
1374 	int nbadparity, nbaddata;
1375 	int parity_valid[VDEV_RAIDZ_MAXPARITY];
1376 
1377 	/*
1378 	 * The tgts list must already be sorted.
1379 	 */
1380 	for (i = 1; i < nt; i++) {
1381 		ASSERT(t[i] > t[i - 1]);
1382 	}
1383 
1384 	nbadparity = rm->rm_firstdatacol;
1385 	nbaddata = rm->rm_cols - nbadparity;
1386 	ntgts = 0;
1387 	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1388 		if (c < rm->rm_firstdatacol)
1389 			parity_valid[c] = B_FALSE;
1390 
1391 		if (i < nt && c == t[i]) {
1392 			tgts[ntgts++] = c;
1393 			i++;
1394 		} else if (rm->rm_col[c].rc_error != 0) {
1395 			tgts[ntgts++] = c;
1396 		} else if (c >= rm->rm_firstdatacol) {
1397 			nbaddata--;
1398 		} else {
1399 			parity_valid[c] = B_TRUE;
1400 			nbadparity--;
1401 		}
1402 	}
1403 
1404 	ASSERT(ntgts >= nt);
1405 	ASSERT(nbaddata >= 0);
1406 	ASSERT(nbaddata + nbadparity == ntgts);
1407 
1408 	dt = &tgts[nbadparity];
1409 
1410 	/*
1411 	 * See if we can use any of our optimized reconstruction routines.
1412 	 */
1413 	if (!vdev_raidz_default_to_general) {
1414 		switch (nbaddata) {
1415 		case 1:
1416 			if (parity_valid[VDEV_RAIDZ_P])
1417 				return (vdev_raidz_reconstruct_p(rm, dt, 1));
1418 
1419 			ASSERT(rm->rm_firstdatacol > 1);
1420 
1421 			if (parity_valid[VDEV_RAIDZ_Q])
1422 				return (vdev_raidz_reconstruct_q(rm, dt, 1));
1423 
1424 			ASSERT(rm->rm_firstdatacol > 2);
1425 			break;
1426 
1427 		case 2:
1428 			ASSERT(rm->rm_firstdatacol > 1);
1429 
1430 			if (parity_valid[VDEV_RAIDZ_P] &&
1431 			    parity_valid[VDEV_RAIDZ_Q])
1432 				return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1433 
1434 			ASSERT(rm->rm_firstdatacol > 2);
1435 
1436 			break;
1437 		}
1438 	}
1439 
1440 	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1441 	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1442 	ASSERT(code > 0);
1443 	return (code);
1444 }
1445 
1446 static int
1447 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
1448 {
1449 	vdev_t *cvd;
1450 	uint64_t nparity = vd->vdev_nparity;
1451 	int c;
1452 	int lasterror = 0;
1453 	int numerrors = 0;
1454 
1455 	ASSERT(nparity > 0);
1456 
1457 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
1458 	    vd->vdev_children < nparity + 1) {
1459 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1460 		return (EINVAL);
1461 	}
1462 
1463 	vdev_open_children(vd);
1464 
1465 	for (c = 0; c < vd->vdev_children; c++) {
1466 		cvd = vd->vdev_child[c];
1467 
1468 		if (cvd->vdev_open_error != 0) {
1469 			lasterror = cvd->vdev_open_error;
1470 			numerrors++;
1471 			continue;
1472 		}
1473 
1474 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1475 		*ashift = MAX(*ashift, cvd->vdev_ashift);
1476 	}
1477 
1478 	*asize *= vd->vdev_children;
1479 
1480 	if (numerrors > nparity) {
1481 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1482 		return (lasterror);
1483 	}
1484 
1485 	return (0);
1486 }
1487 
1488 static void
1489 vdev_raidz_close(vdev_t *vd)
1490 {
1491 	int c;
1492 
1493 	for (c = 0; c < vd->vdev_children; c++)
1494 		vdev_close(vd->vdev_child[c]);
1495 }
1496 
1497 static uint64_t
1498 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1499 {
1500 	uint64_t asize;
1501 	uint64_t ashift = vd->vdev_top->vdev_ashift;
1502 	uint64_t cols = vd->vdev_children;
1503 	uint64_t nparity = vd->vdev_nparity;
1504 
1505 	asize = ((psize - 1) >> ashift) + 1;
1506 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1507 	asize = roundup(asize, nparity + 1) << ashift;
1508 
1509 	return (asize);
1510 }
1511 
1512 static void
1513 vdev_raidz_child_done(zio_t *zio)
1514 {
1515 	raidz_col_t *rc = zio->io_private;
1516 
1517 	rc->rc_error = zio->io_error;
1518 	rc->rc_tried = 1;
1519 	rc->rc_skipped = 0;
1520 }
1521 
1522 static int
1523 vdev_raidz_io_start(zio_t *zio)
1524 {
1525 	vdev_t *vd = zio->io_vd;
1526 	vdev_t *tvd = vd->vdev_top;
1527 	vdev_t *cvd;
1528 	blkptr_t *bp = zio->io_bp;
1529 	raidz_map_t *rm;
1530 	raidz_col_t *rc;
1531 	int c, i;
1532 
1533 	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
1534 	    vd->vdev_nparity);
1535 
1536 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1537 
1538 	if (zio->io_type == ZIO_TYPE_WRITE) {
1539 		vdev_raidz_generate_parity(rm);
1540 
1541 		for (c = 0; c < rm->rm_cols; c++) {
1542 			rc = &rm->rm_col[c];
1543 			cvd = vd->vdev_child[rc->rc_devidx];
1544 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1545 			    rc->rc_offset, rc->rc_data, rc->rc_size,
1546 			    zio->io_type, zio->io_priority, 0,
1547 			    vdev_raidz_child_done, rc));
1548 		}
1549 
1550 		/*
1551 		 * Generate optional I/Os for any skipped sectors to improve
1552 		 * aggregation contiguity.
1553 		 */
1554 		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1555 			ASSERT(c <= rm->rm_scols);
1556 			if (c == rm->rm_scols)
1557 				c = 0;
1558 			rc = &rm->rm_col[c];
1559 			cvd = vd->vdev_child[rc->rc_devidx];
1560 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1561 			    rc->rc_offset + rc->rc_size, NULL,
1562 			    1 << tvd->vdev_ashift,
1563 			    zio->io_type, zio->io_priority,
1564 			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1565 		}
1566 
1567 		return (ZIO_PIPELINE_CONTINUE);
1568 	}
1569 
1570 	ASSERT(zio->io_type == ZIO_TYPE_READ);
1571 
1572 	/*
1573 	 * Iterate over the columns in reverse order so that we hit the parity
1574 	 * last -- any errors along the way will force us to read the parity.
1575 	 */
1576 	for (c = rm->rm_cols - 1; c >= 0; c--) {
1577 		rc = &rm->rm_col[c];
1578 		cvd = vd->vdev_child[rc->rc_devidx];
1579 		if (!vdev_readable(cvd)) {
1580 			if (c >= rm->rm_firstdatacol)
1581 				rm->rm_missingdata++;
1582 			else
1583 				rm->rm_missingparity++;
1584 			rc->rc_error = ENXIO;
1585 			rc->rc_tried = 1;	/* don't even try */
1586 			rc->rc_skipped = 1;
1587 			continue;
1588 		}
1589 		if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) {
1590 			if (c >= rm->rm_firstdatacol)
1591 				rm->rm_missingdata++;
1592 			else
1593 				rm->rm_missingparity++;
1594 			rc->rc_error = ESTALE;
1595 			rc->rc_skipped = 1;
1596 			continue;
1597 		}
1598 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
1599 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1600 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1601 			    rc->rc_offset, rc->rc_data, rc->rc_size,
1602 			    zio->io_type, zio->io_priority, 0,
1603 			    vdev_raidz_child_done, rc));
1604 		}
1605 	}
1606 
1607 	return (ZIO_PIPELINE_CONTINUE);
1608 }
1609 
1610 /*
1611  * Report a checksum error for a child of a RAID-Z device.
1612  */
1613 static void
1614 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1615 {
1616 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1617 
1618 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1619 		zio_bad_cksum_t zbc;
1620 		raidz_map_t *rm = zio->io_vsd;
1621 
1622 		mutex_enter(&vd->vdev_stat_lock);
1623 		vd->vdev_stat.vs_checksum_errors++;
1624 		mutex_exit(&vd->vdev_stat_lock);
1625 
1626 		zbc.zbc_has_cksum = 0;
1627 		zbc.zbc_injected = rm->rm_ecksuminjected;
1628 
1629 		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1630 		    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1631 		    &zbc);
1632 	}
1633 }
1634 
1635 /*
1636  * We keep track of whether or not there were any injected errors, so that
1637  * any ereports we generate can note it.
1638  */
1639 static int
1640 raidz_checksum_verify(zio_t *zio)
1641 {
1642 	zio_bad_cksum_t zbc;
1643 	raidz_map_t *rm = zio->io_vsd;
1644 
1645 	int ret = zio_checksum_error(zio, &zbc);
1646 	if (ret != 0 && zbc.zbc_injected != 0)
1647 		rm->rm_ecksuminjected = 1;
1648 
1649 	return (ret);
1650 }
1651 
1652 /*
1653  * Generate the parity from the data columns. If we tried and were able to
1654  * read the parity without error, verify that the generated parity matches the
1655  * data we read. If it doesn't, we fire off a checksum error. Return the
1656  * number such failures.
1657  */
1658 static int
1659 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1660 {
1661 	void *orig[VDEV_RAIDZ_MAXPARITY];
1662 	int c, ret = 0;
1663 	raidz_col_t *rc;
1664 
1665 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1666 		rc = &rm->rm_col[c];
1667 		if (!rc->rc_tried || rc->rc_error != 0)
1668 			continue;
1669 		orig[c] = zio_buf_alloc(rc->rc_size);
1670 		bcopy(rc->rc_data, orig[c], rc->rc_size);
1671 	}
1672 
1673 	vdev_raidz_generate_parity(rm);
1674 
1675 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1676 		rc = &rm->rm_col[c];
1677 		if (!rc->rc_tried || rc->rc_error != 0)
1678 			continue;
1679 		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1680 			raidz_checksum_error(zio, rc, orig[c]);
1681 			rc->rc_error = ECKSUM;
1682 			ret++;
1683 		}
1684 		zio_buf_free(orig[c], rc->rc_size);
1685 	}
1686 
1687 	return (ret);
1688 }
1689 
1690 /*
1691  * Keep statistics on all the ways that we used parity to correct data.
1692  */
1693 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1694 
1695 static int
1696 vdev_raidz_worst_error(raidz_map_t *rm)
1697 {
1698 	int error = 0;
1699 
1700 	for (int c = 0; c < rm->rm_cols; c++)
1701 		error = zio_worst_error(error, rm->rm_col[c].rc_error);
1702 
1703 	return (error);
1704 }
1705 
1706 /*
1707  * Iterate over all combinations of bad data and attempt a reconstruction.
1708  * Note that the algorithm below is non-optimal because it doesn't take into
1709  * account how reconstruction is actually performed. For example, with
1710  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1711  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1712  * cases we'd only use parity information in column 0.
1713  */
1714 static int
1715 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1716 {
1717 	raidz_map_t *rm = zio->io_vsd;
1718 	raidz_col_t *rc;
1719 	void *orig[VDEV_RAIDZ_MAXPARITY];
1720 	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1721 	int *tgts = &tstore[1];
1722 	int current, next, i, c, n;
1723 	int code, ret = 0;
1724 
1725 	ASSERT(total_errors < rm->rm_firstdatacol);
1726 
1727 	/*
1728 	 * This simplifies one edge condition.
1729 	 */
1730 	tgts[-1] = -1;
1731 
1732 	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1733 		/*
1734 		 * Initialize the targets array by finding the first n columns
1735 		 * that contain no error.
1736 		 *
1737 		 * If there were no data errors, we need to ensure that we're
1738 		 * always explicitly attempting to reconstruct at least one
1739 		 * data column. To do this, we simply push the highest target
1740 		 * up into the data columns.
1741 		 */
1742 		for (c = 0, i = 0; i < n; i++) {
1743 			if (i == n - 1 && data_errors == 0 &&
1744 			    c < rm->rm_firstdatacol) {
1745 				c = rm->rm_firstdatacol;
1746 			}
1747 
1748 			while (rm->rm_col[c].rc_error != 0) {
1749 				c++;
1750 				ASSERT3S(c, <, rm->rm_cols);
1751 			}
1752 
1753 			tgts[i] = c++;
1754 		}
1755 
1756 		/*
1757 		 * Setting tgts[n] simplifies the other edge condition.
1758 		 */
1759 		tgts[n] = rm->rm_cols;
1760 
1761 		/*
1762 		 * These buffers were allocated in previous iterations.
1763 		 */
1764 		for (i = 0; i < n - 1; i++) {
1765 			ASSERT(orig[i] != NULL);
1766 		}
1767 
1768 		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
1769 
1770 		current = 0;
1771 		next = tgts[current];
1772 
1773 		while (current != n) {
1774 			tgts[current] = next;
1775 			current = 0;
1776 
1777 			/*
1778 			 * Save off the original data that we're going to
1779 			 * attempt to reconstruct.
1780 			 */
1781 			for (i = 0; i < n; i++) {
1782 				ASSERT(orig[i] != NULL);
1783 				c = tgts[i];
1784 				ASSERT3S(c, >=, 0);
1785 				ASSERT3S(c, <, rm->rm_cols);
1786 				rc = &rm->rm_col[c];
1787 				bcopy(rc->rc_data, orig[i], rc->rc_size);
1788 			}
1789 
1790 			/*
1791 			 * Attempt a reconstruction and exit the outer loop on
1792 			 * success.
1793 			 */
1794 			code = vdev_raidz_reconstruct(rm, tgts, n);
1795 			if (raidz_checksum_verify(zio) == 0) {
1796 				atomic_inc_64(&raidz_corrected[code]);
1797 
1798 				for (i = 0; i < n; i++) {
1799 					c = tgts[i];
1800 					rc = &rm->rm_col[c];
1801 					ASSERT(rc->rc_error == 0);
1802 					if (rc->rc_tried)
1803 						raidz_checksum_error(zio, rc,
1804 						    orig[i]);
1805 					rc->rc_error = ECKSUM;
1806 				}
1807 
1808 				ret = code;
1809 				goto done;
1810 			}
1811 
1812 			/*
1813 			 * Restore the original data.
1814 			 */
1815 			for (i = 0; i < n; i++) {
1816 				c = tgts[i];
1817 				rc = &rm->rm_col[c];
1818 				bcopy(orig[i], rc->rc_data, rc->rc_size);
1819 			}
1820 
1821 			do {
1822 				/*
1823 				 * Find the next valid column after the current
1824 				 * position..
1825 				 */
1826 				for (next = tgts[current] + 1;
1827 				    next < rm->rm_cols &&
1828 				    rm->rm_col[next].rc_error != 0; next++)
1829 					continue;
1830 
1831 				ASSERT(next <= tgts[current + 1]);
1832 
1833 				/*
1834 				 * If that spot is available, we're done here.
1835 				 */
1836 				if (next != tgts[current + 1])
1837 					break;
1838 
1839 				/*
1840 				 * Otherwise, find the next valid column after
1841 				 * the previous position.
1842 				 */
1843 				for (c = tgts[current - 1] + 1;
1844 				    rm->rm_col[c].rc_error != 0; c++)
1845 					continue;
1846 
1847 				tgts[current] = c;
1848 				current++;
1849 
1850 			} while (current != n);
1851 		}
1852 	}
1853 	n--;
1854 done:
1855 	for (i = 0; i < n; i++) {
1856 		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
1857 	}
1858 
1859 	return (ret);
1860 }
1861 
1862 static void
1863 vdev_raidz_io_done(zio_t *zio)
1864 {
1865 	vdev_t *vd = zio->io_vd;
1866 	vdev_t *cvd;
1867 	raidz_map_t *rm = zio->io_vsd;
1868 	raidz_col_t *rc;
1869 	int unexpected_errors = 0;
1870 	int parity_errors = 0;
1871 	int parity_untried = 0;
1872 	int data_errors = 0;
1873 	int total_errors = 0;
1874 	int n, c;
1875 	int tgts[VDEV_RAIDZ_MAXPARITY];
1876 	int code;
1877 
1878 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
1879 
1880 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1881 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1882 
1883 	for (c = 0; c < rm->rm_cols; c++) {
1884 		rc = &rm->rm_col[c];
1885 
1886 		if (rc->rc_error) {
1887 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
1888 
1889 			if (c < rm->rm_firstdatacol)
1890 				parity_errors++;
1891 			else
1892 				data_errors++;
1893 
1894 			if (!rc->rc_skipped)
1895 				unexpected_errors++;
1896 
1897 			total_errors++;
1898 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1899 			parity_untried++;
1900 		}
1901 	}
1902 
1903 	if (zio->io_type == ZIO_TYPE_WRITE) {
1904 		/*
1905 		 * XXX -- for now, treat partial writes as a success.
1906 		 * (If we couldn't write enough columns to reconstruct
1907 		 * the data, the I/O failed.  Otherwise, good enough.)
1908 		 *
1909 		 * Now that we support write reallocation, it would be better
1910 		 * to treat partial failure as real failure unless there are
1911 		 * no non-degraded top-level vdevs left, and not update DTLs
1912 		 * if we intend to reallocate.
1913 		 */
1914 		/* XXPOLICY */
1915 		if (total_errors > rm->rm_firstdatacol)
1916 			zio->io_error = vdev_raidz_worst_error(rm);
1917 
1918 		return;
1919 	}
1920 
1921 	ASSERT(zio->io_type == ZIO_TYPE_READ);
1922 	/*
1923 	 * There are three potential phases for a read:
1924 	 *	1. produce valid data from the columns read
1925 	 *	2. read all disks and try again
1926 	 *	3. perform combinatorial reconstruction
1927 	 *
1928 	 * Each phase is progressively both more expensive and less likely to
1929 	 * occur. If we encounter more errors than we can repair or all phases
1930 	 * fail, we have no choice but to return an error.
1931 	 */
1932 
1933 	/*
1934 	 * If the number of errors we saw was correctable -- less than or equal
1935 	 * to the number of parity disks read -- attempt to produce data that
1936 	 * has a valid checksum. Naturally, this case applies in the absence of
1937 	 * any errors.
1938 	 */
1939 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1940 		if (data_errors == 0) {
1941 			if (raidz_checksum_verify(zio) == 0) {
1942 				/*
1943 				 * If we read parity information (unnecessarily
1944 				 * as it happens since no reconstruction was
1945 				 * needed) regenerate and verify the parity.
1946 				 * We also regenerate parity when resilvering
1947 				 * so we can write it out to the failed device
1948 				 * later.
1949 				 */
1950 				if (parity_errors + parity_untried <
1951 				    rm->rm_firstdatacol ||
1952 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
1953 					n = raidz_parity_verify(zio, rm);
1954 					unexpected_errors += n;
1955 					ASSERT(parity_errors + n <=
1956 					    rm->rm_firstdatacol);
1957 				}
1958 				goto done;
1959 			}
1960 		} else {
1961 			/*
1962 			 * We either attempt to read all the parity columns or
1963 			 * none of them. If we didn't try to read parity, we
1964 			 * wouldn't be here in the correctable case. There must
1965 			 * also have been fewer parity errors than parity
1966 			 * columns or, again, we wouldn't be in this code path.
1967 			 */
1968 			ASSERT(parity_untried == 0);
1969 			ASSERT(parity_errors < rm->rm_firstdatacol);
1970 
1971 			/*
1972 			 * Identify the data columns that reported an error.
1973 			 */
1974 			n = 0;
1975 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1976 				rc = &rm->rm_col[c];
1977 				if (rc->rc_error != 0) {
1978 					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1979 					tgts[n++] = c;
1980 				}
1981 			}
1982 
1983 			ASSERT(rm->rm_firstdatacol >= n);
1984 
1985 			code = vdev_raidz_reconstruct(rm, tgts, n);
1986 
1987 			if (raidz_checksum_verify(zio) == 0) {
1988 				atomic_inc_64(&raidz_corrected[code]);
1989 
1990 				/*
1991 				 * If we read more parity disks than were used
1992 				 * for reconstruction, confirm that the other
1993 				 * parity disks produced correct data. This
1994 				 * routine is suboptimal in that it regenerates
1995 				 * the parity that we already used in addition
1996 				 * to the parity that we're attempting to
1997 				 * verify, but this should be a relatively
1998 				 * uncommon case, and can be optimized if it
1999 				 * becomes a problem. Note that we regenerate
2000 				 * parity when resilvering so we can write it
2001 				 * out to failed devices later.
2002 				 */
2003 				if (parity_errors < rm->rm_firstdatacol - n ||
2004 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2005 					n = raidz_parity_verify(zio, rm);
2006 					unexpected_errors += n;
2007 					ASSERT(parity_errors + n <=
2008 					    rm->rm_firstdatacol);
2009 				}
2010 
2011 				goto done;
2012 			}
2013 		}
2014 	}
2015 
2016 	/*
2017 	 * This isn't a typical situation -- either we got a read error or
2018 	 * a child silently returned bad data. Read every block so we can
2019 	 * try again with as much data and parity as we can track down. If
2020 	 * we've already been through once before, all children will be marked
2021 	 * as tried so we'll proceed to combinatorial reconstruction.
2022 	 */
2023 	unexpected_errors = 1;
2024 	rm->rm_missingdata = 0;
2025 	rm->rm_missingparity = 0;
2026 
2027 	for (c = 0; c < rm->rm_cols; c++) {
2028 		if (rm->rm_col[c].rc_tried)
2029 			continue;
2030 
2031 		zio_vdev_io_redone(zio);
2032 		do {
2033 			rc = &rm->rm_col[c];
2034 			if (rc->rc_tried)
2035 				continue;
2036 			zio_nowait(zio_vdev_child_io(zio, NULL,
2037 			    vd->vdev_child[rc->rc_devidx],
2038 			    rc->rc_offset, rc->rc_data, rc->rc_size,
2039 			    zio->io_type, zio->io_priority, 0,
2040 			    vdev_raidz_child_done, rc));
2041 		} while (++c < rm->rm_cols);
2042 
2043 		return;
2044 	}
2045 
2046 	/*
2047 	 * At this point we've attempted to reconstruct the data given the
2048 	 * errors we detected, and we've attempted to read all columns. There
2049 	 * must, therefore, be one or more additional problems -- silent errors
2050 	 * resulting in invalid data rather than explicit I/O errors resulting
2051 	 * in absent data. We check if there is enough additional data to
2052 	 * possibly reconstruct the data and then perform combinatorial
2053 	 * reconstruction over all possible combinations. If that fails,
2054 	 * we're cooked.
2055 	 */
2056 	if (total_errors > rm->rm_firstdatacol) {
2057 		zio->io_error = vdev_raidz_worst_error(rm);
2058 
2059 	} else if (total_errors < rm->rm_firstdatacol &&
2060 	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2061 		/*
2062 		 * If we didn't use all the available parity for the
2063 		 * combinatorial reconstruction, verify that the remaining
2064 		 * parity is correct.
2065 		 */
2066 		if (code != (1 << rm->rm_firstdatacol) - 1)
2067 			(void) raidz_parity_verify(zio, rm);
2068 	} else {
2069 		/*
2070 		 * We're here because either:
2071 		 *
2072 		 *	total_errors == rm_first_datacol, or
2073 		 *	vdev_raidz_combrec() failed
2074 		 *
2075 		 * In either case, there is enough bad data to prevent
2076 		 * reconstruction.
2077 		 *
2078 		 * Start checksum ereports for all children which haven't
2079 		 * failed.
2080 		 */
2081 		zio->io_error = ECKSUM;
2082 
2083 		for (c = 0; c < rm->rm_cols; c++) {
2084 			rc = &rm->rm_col[c];
2085 			if (rc->rc_error == 0) {
2086 				zio_bad_cksum_t zbc;
2087 				zbc.zbc_has_cksum = 0;
2088 				zbc.zbc_injected = rm->rm_ecksuminjected;
2089 
2090 				zfs_ereport_start_checksum(
2091 				    zio->io_spa, vd->vdev_child[rc->rc_devidx],
2092 				    zio, rc->rc_offset, rc->rc_size,
2093 				    (void *)(uintptr_t)c, &zbc);
2094 			}
2095 		}
2096 	}
2097 
2098 done:
2099 	zio_checksum_verified(zio);
2100 
2101 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2102 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2103 		/*
2104 		 * Use the good data we have in hand to repair damaged children.
2105 		 */
2106 		for (c = 0; c < rm->rm_cols; c++) {
2107 			rc = &rm->rm_col[c];
2108 			cvd = vd->vdev_child[rc->rc_devidx];
2109 
2110 			if (rc->rc_error == 0)
2111 				continue;
2112 
2113 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2114 			    rc->rc_offset, rc->rc_data, rc->rc_size,
2115 			    ZIO_TYPE_WRITE, zio->io_priority,
2116 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2117 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2118 		}
2119 	}
2120 }
2121 
2122 static void
2123 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2124 {
2125 	if (faulted > vd->vdev_nparity)
2126 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2127 		    VDEV_AUX_NO_REPLICAS);
2128 	else if (degraded + faulted != 0)
2129 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2130 	else
2131 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2132 }
2133 
2134 vdev_ops_t vdev_raidz_ops = {
2135 	vdev_raidz_open,
2136 	vdev_raidz_close,
2137 	vdev_raidz_asize,
2138 	vdev_raidz_io_start,
2139 	vdev_raidz_io_done,
2140 	vdev_raidz_state_change,
2141 	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
2142 	B_FALSE			/* not a leaf vdev */
2143 };
2144