xref: /titanic_44/usr/src/uts/common/fs/zfs/vdev_raidz.c (revision cf7e209d5f01e9f5fe052b444899ba9cba0e9877)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2013 by Delphix. All rights reserved.
25  */
26 
27 #include <sys/zfs_context.h>
28 #include <sys/spa.h>
29 #include <sys/vdev_impl.h>
30 #include <sys/zio.h>
31 #include <sys/zio_checksum.h>
32 #include <sys/fs/zfs.h>
33 #include <sys/fm/fs/zfs.h>
34 
35 /*
36  * Virtual device vector for RAID-Z.
37  *
38  * This vdev supports single, double, and triple parity. For single parity,
39  * we use a simple XOR of all the data columns. For double or triple parity,
40  * we use a special case of Reed-Solomon coding. This extends the
41  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
42  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
43  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
44  * former is also based. The latter is designed to provide higher performance
45  * for writes.
46  *
47  * Note that the Plank paper claimed to support arbitrary N+M, but was then
48  * amended six years later identifying a critical flaw that invalidates its
49  * claims. Nevertheless, the technique can be adapted to work for up to
50  * triple parity. For additional parity, the amendment "Note: Correction to
51  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
52  * is viable, but the additional complexity means that write performance will
53  * suffer.
54  *
55  * All of the methods above operate on a Galois field, defined over the
56  * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
57  * can be expressed with a single byte. Briefly, the operations on the
58  * field are defined as follows:
59  *
60  *   o addition (+) is represented by a bitwise XOR
61  *   o subtraction (-) is therefore identical to addition: A + B = A - B
62  *   o multiplication of A by 2 is defined by the following bitwise expression:
63  *
64  *	(A * 2)_7 = A_6
65  *	(A * 2)_6 = A_5
66  *	(A * 2)_5 = A_4
67  *	(A * 2)_4 = A_3 + A_7
68  *	(A * 2)_3 = A_2 + A_7
69  *	(A * 2)_2 = A_1 + A_7
70  *	(A * 2)_1 = A_0
71  *	(A * 2)_0 = A_7
72  *
73  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
74  * As an aside, this multiplication is derived from the error correcting
75  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
76  *
77  * Observe that any number in the field (except for 0) can be expressed as a
78  * power of 2 -- a generator for the field. We store a table of the powers of
79  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
80  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
81  * than field addition). The inverse of a field element A (A^-1) is therefore
82  * A ^ (255 - 1) = A^254.
83  *
84  * The up-to-three parity columns, P, Q, R over several data columns,
85  * D_0, ... D_n-1, can be expressed by field operations:
86  *
87  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
88  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
89  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
90  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
91  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
92  *
93  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
94  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
95  * independent coefficients. (There are no additional coefficients that have
96  * this property which is why the uncorrected Plank method breaks down.)
97  *
98  * See the reconstruction code below for how P, Q and R can be used individually
99  * or in concert to recover missing data columns.
100  */
101 
/*
 * Per-column (per child device) state for one RAID-Z I/O. The first
 * rm_firstdatacol columns of a map hold parity; the rest hold data.
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;
112 
/*
 * Describes how one logical I/O is spread across the child vdevs.
 * Allocated with trailing space for rm_scols raidz_col_t entries
 * (see the offsetof()-based kmem_alloc in vdev_raidz_map_alloc()).
 */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	void *rm_datacopy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	/* Pre-C99 flexible-array idiom: actually rm_scols entries long. */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;
129 
/* Indices of the parity columns at the front of rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Multiply one GF(2^8) element by 2 (or 4 = 2*2): shift left and, when the
 * high bit overflows, reduce by XORing in 0x1d -- the low byte of the
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1 described above.
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
136 
/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 *
 * The do/while (0) wrapper makes each invocation a single statement, so the
 * macros compose safely with unbraced if/else at call sites (the previous
 * bare-brace form expanded to a compound statement plus a stray ';').
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
do { \
	/* High bit of each byte ... */ \
	(mask) = (x) & 0x8080808080808080ULL; \
	/* ... smeared across its byte: 0x80 -> 0xff, 0x00 -> 0x00. */ \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
} while (0)

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
do { \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
} while (0)
156 
/*
 * Force reconstruction to use the general purpose method.
 * Tunable (settable via mdb/etc.); presumably consulted by the
 * reconstruction dispatch code later in this file -- not visible
 * in this chunk, so confirm before relying on it.
 */
int vdev_raidz_default_to_general;
161 
/*
 * Powers of 2 in the Galois field defined above: vdev_raidz_pow2[i] == 2^i.
 * Note the table wraps: 2^255 == 2^0 == 0x01 (the multiplicative group has
 * order 255), which is why index 255 is a valid lookup in vdev_raidz_exp2().
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/*
 * Logs of 2 in the Galois field defined above: vdev_raidz_log2[v] == i
 * such that 2^i == v. Entry 0 is a placeholder (log of 0 is undefined);
 * callers guard against a == 0 before indexing (see vdev_raidz_exp2()).
 */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
232 
/* Forward declaration: used by vdev_raidz_cksum_finish() before the def. */
static void vdev_raidz_generate_parity(raidz_map_t *rm);
234 
235 /*
236  * Multiply a given number by 2 raised to the given power.
237  */
238 static uint8_t
239 vdev_raidz_exp2(uint_t a, int exp)
240 {
241 	if (a == 0)
242 		return (0);
243 
244 	ASSERT(exp >= 0);
245 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
246 
247 	exp += vdev_raidz_log2[a];
248 	if (exp > 255)
249 		exp -= 255;
250 
251 	return (vdev_raidz_pow2[exp]);
252 }
253 
254 static void
255 vdev_raidz_map_free(raidz_map_t *rm)
256 {
257 	int c;
258 	size_t size;
259 
260 	for (c = 0; c < rm->rm_firstdatacol; c++) {
261 		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
262 
263 		if (rm->rm_col[c].rc_gdata != NULL)
264 			zio_buf_free(rm->rm_col[c].rc_gdata,
265 			    rm->rm_col[c].rc_size);
266 	}
267 
268 	size = 0;
269 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
270 		size += rm->rm_col[c].rc_size;
271 
272 	if (rm->rm_datacopy != NULL)
273 		zio_buf_free(rm->rm_datacopy, size);
274 
275 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
276 }
277 
278 static void
279 vdev_raidz_map_free_vsd(zio_t *zio)
280 {
281 	raidz_map_t *rm = zio->io_vsd;
282 
283 	ASSERT0(rm->rm_freed);
284 	rm->rm_freed = 1;
285 
286 	if (rm->rm_reports == 0)
287 		vdev_raidz_map_free(rm);
288 }
289 
290 /*ARGSUSED*/
291 static void
292 vdev_raidz_cksum_free(void *arg, size_t ignored)
293 {
294 	raidz_map_t *rm = arg;
295 
296 	ASSERT3U(rm->rm_reports, >, 0);
297 
298 	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
299 		vdev_raidz_map_free(rm);
300 }
301 
/*
 * Checksum-report finish callback: compare the "bad" data we read for
 * column c against the known-good data (for data columns) or against
 * parity regenerated from the good data (for parity columns), then
 * finish the ereport.
 */
static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;	/* column stashed at report time */
	size_t x;

	const char *good = NULL;
	const char *bad = rm->rm_col[c].rc_data;

	/* No good data available -- finish without a comparison. */
	if (good_data == NULL) {
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO)
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
			char *buf;

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_data;
				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
				    zio_buf_alloc(rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			buf = (char *)good_data;
			for (; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++)
				rm->rm_col[x].rc_data = bad_parity[x];

			/*
			 * Data columns go back to pointing into rm_datacopy
			 * (populated by vdev_raidz_cksum_report()). The
			 * rc_gdata buffers live on until
			 * vdev_raidz_map_free() releases them.
			 */
			buf = rm->rm_datacopy;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}
		}

		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = rm->rm_col[c].rc_gdata;
	} else {
		/* adjust good_data to point at the start of our column */
		good = good_data;

		for (x = rm->rm_firstdatacol; x < c; x++)
			good += rm->rm_col[x].rc_size;
	}

	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
}
374 
/*
 * Invoked indirectly by zfs_ereport_start_checksum(), called
 * below when our read operation fails completely.  The main point
 * is to keep a copy of everything we read from disk, so that at
 * vdev_raidz_cksum_finish() time we can compare it with the good data.
 */
static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
	size_t c = (size_t)(uintptr_t)arg;	/* suspect column index */
	caddr_t buf;

	raidz_map_t *rm = zio->io_vsd;
	size_t size;

	/* set up the report and bump the refcount  */
	zcr->zcr_cbdata = rm;
	zcr->zcr_cbinfo = c;
	zcr->zcr_finish = vdev_raidz_cksum_finish;
	zcr->zcr_free = vdev_raidz_cksum_free;

	rm->rm_reports++;
	ASSERT3U(rm->rm_reports, >, 0);

	/* One snapshot suffices no matter how many reports reference it. */
	if (rm->rm_datacopy != NULL)
		return;

	/*
	 * It's the first time we're called for this raidz_map_t, so we need
	 * to copy the data aside; there's no guarantee that our zio's buffer
	 * won't be re-used for something else.
	 *
	 * Our parity data is already in separate buffers, so there's no need
	 * to copy them.
	 */

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	buf = rm->rm_datacopy = zio_buf_alloc(size);

	/*
	 * Copy each data column into the snapshot and repoint rc_data at
	 * the copy, so later comparisons see stable bytes.
	 */
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		raidz_col_t *col = &rm->rm_col[c];

		bcopy(col->rc_data, buf, col->rc_size);
		col->rc_data = buf;

		buf += col->rc_size;
	}
	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
}
427 
/*
 * VSD callbacks attached to each raidz zio. Positional initializers --
 * order must match zio_vsd_ops_t's member layout (free callback first,
 * then cksum_report, judging by the function names; confirm against
 * the zio_vsd_ops_t declaration).
 */
static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};
432 
/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 *
 * Builds and returns a raidz_map_t describing the layout: parity
 * columns first (with private buffers), then data columns carving up
 * the zio's own buffer. The map is attached to the zio as its VSD.
 */
static raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
    uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = zio->io_offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = zio->io_size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	/* Variable-length allocation: scols raidz_col_t entries. */
	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_datacopy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		/*
		 * Wrap past the last child back to the first, one sector
		 * further into each device.
		 */
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;	/* skipped column */
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	/* Parity columns get private buffers... */
	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);

	/* ...data columns point consecutively into the zio's buffer. */
	rm->rm_col[c].rc_data = zio->io_data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
	return (rm);
}
581 
582 static void
583 vdev_raidz_generate_parity_p(raidz_map_t *rm)
584 {
585 	uint64_t *p, *src, pcount, ccount, i;
586 	int c;
587 
588 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
589 
590 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
591 		src = rm->rm_col[c].rc_data;
592 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
593 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
594 
595 		if (c == rm->rm_firstdatacol) {
596 			ASSERT(ccount == pcount);
597 			for (i = 0; i < ccount; i++, src++, p++) {
598 				*p = *src;
599 			}
600 		} else {
601 			ASSERT(ccount <= pcount);
602 			for (i = 0; i < ccount; i++, src++, p++) {
603 				*p ^= *src;
604 			}
605 		}
606 	}
607 }
608 
/*
 * Compute P (plain XOR) and Q (Horner evaluation of the generator-2
 * polynomial described in the block comment at the top of this file)
 * over all data columns, 64 bits at a time via VDEV_RAIDZ_64MUL_2.
 */
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First column seeds both parity buffers. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			/* Zero-fill any tail beyond a short first column. */
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}
660 
/*
 * Compute P, Q and R in one pass over the data columns. Q and R are
 * Horner evaluations with generators 2 and 4 respectively (see the
 * P/Q/R equations in the block comment at the top of this file).
 */
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			/* First column seeds all three parity buffers. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			/* Zero-fill any tail beyond a short first column. */
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}
721 
722 /*
723  * Generate RAID parity in the first virtual columns according to the number of
724  * parity columns available.
725  */
726 static void
727 vdev_raidz_generate_parity(raidz_map_t *rm)
728 {
729 	switch (rm->rm_firstdatacol) {
730 	case 1:
731 		vdev_raidz_generate_parity_p(rm);
732 		break;
733 	case 2:
734 		vdev_raidz_generate_parity_pq(rm);
735 		break;
736 	case 3:
737 		vdev_raidz_generate_parity_pqr(rm);
738 		break;
739 	default:
740 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
741 	}
742 }
743 
744 static int
745 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
746 {
747 	uint64_t *dst, *src, xcount, ccount, count, i;
748 	int x = tgts[0];
749 	int c;
750 
751 	ASSERT(ntgts == 1);
752 	ASSERT(x >= rm->rm_firstdatacol);
753 	ASSERT(x < rm->rm_cols);
754 
755 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
756 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
757 	ASSERT(xcount > 0);
758 
759 	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
760 	dst = rm->rm_col[x].rc_data;
761 	for (i = 0; i < xcount; i++, dst++, src++) {
762 		*dst = *src;
763 	}
764 
765 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
766 		src = rm->rm_col[c].rc_data;
767 		dst = rm->rm_col[x].rc_data;
768 
769 		if (c == x)
770 			continue;
771 
772 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
773 		count = MIN(ccount, xcount);
774 
775 		for (i = 0; i < count; i++, dst++, src++) {
776 			*dst ^= *src;
777 		}
778 	}
779 
780 	return (1 << VDEV_RAIDZ_P);
781 }
782 
/*
 * Reconstruct a single data column (tgts[0]) using only the Q parity:
 * replay the Q computation treating the missing column as zeros, XOR
 * against the real Q to isolate the missing column's term, then divide
 * out its coefficient by multiplying with the inverse power of 2.
 */
static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, mask, i;
	uint8_t *b;
	int x = tgts[0];
	int c, j, exp;

	ASSERT(ntgts == 1);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		/* The missing column contributes zeros to the replay. */
		if (c == x)
			ccount = 0;
		else
			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		count = MIN(ccount, xcount);

		if (c == rm->rm_firstdatacol) {
			for (i = 0; i < count; i++, dst++, src++) {
				*dst = *src;
			}
			for (; i < xcount; i++, dst++) {
				*dst = 0;
			}

		} else {
			/* Horner step: double the running total, add column. */
			for (i = 0; i < count; i++, dst++, src++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
				*dst ^= *src;
			}

			for (; i < xcount; i++, dst++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
			}
		}
	}

	/*
	 * dst now holds Q with D_x zeroed; XORing in Q leaves
	 * 2^(cols-1-x) * D_x. Multiplying by 2^(255-(cols-1-x)) cancels
	 * the coefficient (2^255 == 1 in this field), recovering D_x
	 * byte by byte.
	 */
	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	dst = rm->rm_col[x].rc_data;
	exp = 255 - (rm->rm_cols - 1 - x);

	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst ^= *src;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, exp);
		}
	}

	return (1 << VDEV_RAIDZ_Q);
}
840 
/*
 * Reconstruct two missing data columns (tgts[0] < tgts[1]) using P and
 * Q together, solving the 2x2 linear system derived in the comment
 * below.
 */
static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
	void *pdata, *qdata;
	uint64_t xsize, ysize, i;
	int x = tgts[0];
	int y = tgts[1];

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	/* x precedes y, so its column is at least as large. */
	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	/* Restore the true sizes before using them below. */
	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = pdata;
	q = qdata;
	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xd = rm->rm_col[x].rc_data;
	yd = rm->rm_col[y].rc_data;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	/* 255 + x - y keeps the power-table index non-negative (x < y). */
	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];	/* log of (a + 1)^-1 */

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
		    vdev_raidz_exp2(*q ^ *qxy, bexp);

		if (i < ysize)
			*yd = *p ^ *pxy ^ *xd;
	}

	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
931 
932 /* BEGIN CSTYLED */
933 /*
934  * In the general case of reconstruction, we must solve the system of linear
935  * equations defined by the coefficients used to generate parity as well as
936  * the contents of the data and parity disks. This can be expressed with
937  * vectors for the original data (D) and the actual data (d) and parity (p)
938  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
939  *
940  *            __   __                     __     __
941  *            |     |         __     __   |  p_0  |
942  *            |  V  |         |  D_0  |   | p_m-1 |
943  *            |     |    x    |   :   | = |  d_0  |
944  *            |  I  |         | D_n-1 |   |   :   |
945  *            |     |         ~~     ~~   | d_n-1 |
946  *            ~~   ~~                     ~~     ~~
947  *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
950  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
951  * computation as well as linear separability.
952  *
953  *      __               __               __     __
954  *      |   1   ..  1 1 1 |               |  p_0  |
955  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
956  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
957  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
958  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
959  *      |   :       : : : |   |   :   |   |  d_2  |
960  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
961  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
962  *      |   0   ..  0 0 1 |               | d_n-1 |
963  *      ~~               ~~               ~~     ~~
964  *
965  * Note that I, V, d, and p are known. To compute D, we must invert the
966  * matrix and use the known data and parity values to reconstruct the unknown
967  * data values. We begin by removing the rows in V|I and d|p that correspond
968  * to failed or missing columns; we then make V|I square (n x n) and d|p
969  * sized n by removing rows corresponding to unused parity from the bottom up
970  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
971  * using Gauss-Jordan elimination. In the example below we use m=3 parity
972  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
973  *           __                               __
974  *           |  1   1   1   1   1   1   1   1  |
975  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
976  *           |  19 205 116  29  64  16  4   1  |      / /
977  *           |  1   0   0   0   0   0   0   0  |     / /
978  *           |  0   1   0   0   0   0   0   0  | <--' /
979  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
980  *           |  0   0   0   1   0   0   0   0  |
981  *           |  0   0   0   0   1   0   0   0  |
982  *           |  0   0   0   0   0   1   0   0  |
983  *           |  0   0   0   0   0   0   1   0  |
984  *           |  0   0   0   0   0   0   0   1  |
985  *           ~~                               ~~
986  *           __                               __
987  *           |  1   1   1   1   1   1   1   1  |
988  *           | 128  64  32  16  8   4   2   1  |
989  *           |  19 205 116  29  64  16  4   1  |
990  *           |  1   0   0   0   0   0   0   0  |
991  *           |  0   1   0   0   0   0   0   0  |
992  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
993  *           |  0   0   0   1   0   0   0   0  |
994  *           |  0   0   0   0   1   0   0   0  |
995  *           |  0   0   0   0   0   1   0   0  |
996  *           |  0   0   0   0   0   0   1   0  |
997  *           |  0   0   0   0   0   0   0   1  |
998  *           ~~                               ~~
999  *
1000  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1001  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1002  * matrix is not singular.
1003  * __                                                                 __
1004  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1005  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1006  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1007  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1008  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1009  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1010  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1011  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1012  * ~~                                                                 ~~
1013  * __                                                                 __
1014  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1015  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1016  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1017  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1018  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1019  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1020  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1021  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1022  * ~~                                                                 ~~
1023  * __                                                                 __
1024  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1025  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1026  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1027  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1028  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1029  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1030  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1031  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1032  * ~~                                                                 ~~
1033  * __                                                                 __
1034  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1035  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1036  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1037  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1038  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1039  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1040  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1041  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1042  * ~~                                                                 ~~
1043  * __                                                                 __
1044  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1045  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1046  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1047  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1048  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1049  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1050  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1051  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1052  * ~~                                                                 ~~
1053  * __                                                                 __
1054  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1055  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1056  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1057  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1058  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1059  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1060  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1061  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1062  * ~~                                                                 ~~
1063  *                   __                               __
1064  *                   |  0   0   1   0   0   0   0   0  |
1065  *                   | 167 100  5   41 159 169 217 208 |
1066  *                   | 166 100  4   40 158 168 216 209 |
1067  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1068  *                   |  0   0   0   0   1   0   0   0  |
1069  *                   |  0   0   0   0   0   1   0   0  |
1070  *                   |  0   0   0   0   0   0   1   0  |
1071  *                   |  0   0   0   0   0   0   0   1  |
1072  *                   ~~                               ~~
1073  *
1074  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1075  * of the missing data.
1076  *
1077  * As is apparent from the example above, the only non-trivial rows in the
1078  * inverse matrix correspond to the data disks that we're trying to
1079  * reconstruct. Indeed, those are the only rows we need as the others would
1080  * only be useful for reconstructing data known or assumed to be valid. For
1081  * that reason, we only build the coefficients in the rows that correspond to
1082  * targeted columns.
1083  */
1084 /* END CSTYLED */
1085 
/*
 * Fill in the rows of the parity-generation matrix (V, in the comment
 * above) corresponding to the parity columns being used for this
 * reconstruction.
 *
 *	rm	- the raidz map under reconstruction
 *	n	- number of data columns (rm_cols - rm_firstdatacol)
 *	nmap	- number of parity rows to generate
 *	map	- for each row, the log2 of that parity column's generator:
 *		  0 for P (generator 1), 1 for Q (2), 2 for R (4)
 *	rows	- output; rows[i][j] receives the GF(2^8) coefficient
 *		  (2^map[i])^(n - 1 - j) applied to data column j
 */
static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;	/* running exponent into the GF(2^8) power table */

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		/* Only generators for P (0), Q (1), and R (2) are valid. */
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		/*
		 * Start the exponent at generator^n and reduce it modulo
		 * 255 (the order of the multiplicative group of GF(2^8)).
		 * A single subtraction suffices here; the ASSERT below
		 * checks that assumption.
		 */
		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			/* Step down one power per column, wrapping mod 255. */
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
1115 
/*
 * Invert the (nmissing x nmissing) submatrix of interest via Gauss-Jordan
 * elimination. All arithmetic is in GF(2^8): addition/subtraction are XOR,
 * and multiplication is performed via the log/exp tables through
 * vdev_raidz_exp2().
 *
 *	rows	- the non-trivial rows of (V|I)' built by
 *		  vdev_raidz_matrix_init(); destroyed in the process
 *	invrows	- output; receives the corresponding rows of the inverse
 *	missing	- for each row, the index of the missing data column it
 *		  is used to reconstruct (its pivot column)
 *	used	- the n columns (parity first, then data) that survive
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			/* Start from the identity matrix. */
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 * Identity rows for surviving data columns simply move their
	 * coefficient across to the inverse side; the pivot entries for
	 * the missing columns are all that remain in rows[][].
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		/* Columns left of the pivot must already be eliminated. */
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 * (In GF(2^8), x^-1 == x^(255 - log2(x)).)
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		/* Eliminate this pivot column from all other rows. */
		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}
1209 
/*
 * Reconstruct the missing data columns by computing D = (V|I)'^-1 x (d|p)'
 * byte by byte. Multiplication in GF(2^8) is done in the log domain:
 * the logs of the inverse-matrix entries are precomputed into invlog[][],
 * and each product is then a (mod 255) addition of logs followed by an
 * exp-table lookup.
 *
 *	invrows	- the non-trivial rows of the inverted matrix from
 *		  vdev_raidz_matrix_invert()
 *	missing	- the missing data columns, as data-column indices
 *	used	- the n surviving columns (parity first, then data) that
 *		  supply the (d|p)' vector
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log = 0;
	uint8_t val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	/* One scratch row of n log-domain coefficients per missing column. */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	/* Precompute the logs of the inverse-matrix coefficients. */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	/*
	 * Accumulate one source column at a time into all destination
	 * (missing) columns.
	 */
	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			/* log stays stale for *src == 0; the branch below
			 * short-circuits that case to val = 0. */
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				/* Shorter destination columns stop early. */
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					/* Multiply via log addition mod 255. */
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/*
				 * The first source column initializes the
				 * accumulator; the rest XOR (GF add) in.
				 */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}
1285 
/*
 * General-case reconstruction: rebuild the targeted columns (tgts, sorted,
 * ntgts of them) using matrix inversion as described in the block comment
 * above. Returns a bitmask of the parity columns used (1 << VDEV_RAIDZ_P,
 * etc.), which callers use for bookkeeping.
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;


	/* n is the number of data columns. */
	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.  We need one surviving parity column per missing
	 * data column; targeted (bad) parity columns are skipped.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/*
	 * Single allocation for the rows, inverse rows, and the used-column
	 * array; carved up by the pointer walk below.
	 */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	/* The first nmissing_rows used columns are the chosen parities. */
	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	/*
	 * The remaining used columns are the surviving (non-missing) data
	 * columns; note that i carries over from the loop above.
	 */
	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}
1391 
/*
 * Reconstruct the columns listed in t (sorted, nt of them) plus any columns
 * already marked in error, dispatching to an optimized routine (P, Q, or
 * P+Q) when possible and falling back to the general matrix-based method
 * otherwise. Returns a bitmask of the parity columns used.
 */
static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	/*
	 * Walk all columns, merging the caller's targets with any columns
	 * that already have errors, and counting how many parity and data
	 * columns are actually good.
	 */
	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	/*
	 * tgts[] is sorted with parity columns first, so the data targets
	 * start right after the nbadparity parity targets.
	 */
	dt = &tgts[nbadparity];

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 * (vdev_raidz_default_to_general is a tunable defined elsewhere in
	 * this file that forces the general path.)
	 */
	if (!vdev_raidz_default_to_general) {
		switch (nbaddata) {
		case 1:
			if (parity_valid[VDEV_RAIDZ_P])
				return (vdev_raidz_reconstruct_p(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_q(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 2);
			break;

		case 2:
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_P] &&
			    parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_pq(rm, dt, 2));

			ASSERT(rm->rm_firstdatacol > 2);

			break;
		}
	}

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}
1470 
1471 static int
1472 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1473     uint64_t *ashift)
1474 {
1475 	vdev_t *cvd;
1476 	uint64_t nparity = vd->vdev_nparity;
1477 	int c;
1478 	int lasterror = 0;
1479 	int numerrors = 0;
1480 
1481 	ASSERT(nparity > 0);
1482 
1483 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
1484 	    vd->vdev_children < nparity + 1) {
1485 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1486 		return (SET_ERROR(EINVAL));
1487 	}
1488 
1489 	vdev_open_children(vd);
1490 
1491 	for (c = 0; c < vd->vdev_children; c++) {
1492 		cvd = vd->vdev_child[c];
1493 
1494 		if (cvd->vdev_open_error != 0) {
1495 			lasterror = cvd->vdev_open_error;
1496 			numerrors++;
1497 			continue;
1498 		}
1499 
1500 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1501 		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1502 		*ashift = MAX(*ashift, cvd->vdev_ashift);
1503 	}
1504 
1505 	*asize *= vd->vdev_children;
1506 	*max_asize *= vd->vdev_children;
1507 
1508 	if (numerrors > nparity) {
1509 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1510 		return (lasterror);
1511 	}
1512 
1513 	return (0);
1514 }
1515 
1516 static void
1517 vdev_raidz_close(vdev_t *vd)
1518 {
1519 	int c;
1520 
1521 	for (c = 0; c < vd->vdev_children; c++)
1522 		vdev_close(vd->vdev_child[c]);
1523 }
1524 
1525 static uint64_t
1526 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1527 {
1528 	uint64_t asize;
1529 	uint64_t ashift = vd->vdev_top->vdev_ashift;
1530 	uint64_t cols = vd->vdev_children;
1531 	uint64_t nparity = vd->vdev_nparity;
1532 
1533 	asize = ((psize - 1) >> ashift) + 1;
1534 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1535 	asize = roundup(asize, nparity + 1) << ashift;
1536 
1537 	return (asize);
1538 }
1539 
1540 static void
1541 vdev_raidz_child_done(zio_t *zio)
1542 {
1543 	raidz_col_t *rc = zio->io_private;
1544 
1545 	rc->rc_error = zio->io_error;
1546 	rc->rc_tried = 1;
1547 	rc->rc_skipped = 0;
1548 }
1549 
1550 /*
1551  * Start an IO operation on a RAIDZ VDev
1552  *
1553  * Outline:
1554  * - For write operations:
1555  *   1. Generate the parity data
1556  *   2. Create child zio write operations to each column's vdev, for both
1557  *      data and parity.
1558  *   3. If the column skips any sectors for padding, create optional dummy
1559  *      write zio children for those areas to improve aggregation continuity.
1560  * - For read operations:
1561  *   1. Create child zio read operations to each data column's vdev to read
1562  *      the range of data required for zio.
1563  *   2. If this is a scrub or resilver operation, or if any of the data
1564  *      vdevs have had errors, then create zio read operations to the parity
1565  *      columns' VDevs as well.
1566  */
static int
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, i;

	/* Carve the I/O into per-column extents (parity + data). */
	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_generate_parity(rm);

		/* Issue a child write for every column, data and parity. */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		/*
		 * Generate optional I/Os for any skipped sectors to improve
		 * aggregation contiguity.
		 * These carry no data (NODATA|OPTIONAL) and cover one
		 * sector just past each short column, wrapping from
		 * rm_skipstart around to column 0.
		 */
		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
			ASSERT(c <= rm->rm_scols);
			if (c == rm->rm_scols)
				c = 0;
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset + rc->rc_size, NULL,
			    1 << tvd->vdev_ashift,
			    zio->io_type, zio->io_priority,
			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
		}

		return (ZIO_PIPELINE_CONTINUE);
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		/* Unreadable device: mark the column failed up front. */
		if (!vdev_readable(cvd)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ENXIO);
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		/* Device missing this txg's data (DTL): treat as stale. */
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ESTALE);
			rc->rc_skipped = 1;
			continue;
		}
		/*
		 * Read data columns always; read parity columns only when
		 * some data is already known missing or this is a
		 * scrub/resilver.
		 */
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}
1653 
1654 
1655 /*
1656  * Report a checksum error for a child of a RAID-Z device.
1657  */
1658 static void
1659 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1660 {
1661 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1662 
1663 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1664 		zio_bad_cksum_t zbc;
1665 		raidz_map_t *rm = zio->io_vsd;
1666 
1667 		mutex_enter(&vd->vdev_stat_lock);
1668 		vd->vdev_stat.vs_checksum_errors++;
1669 		mutex_exit(&vd->vdev_stat_lock);
1670 
1671 		zbc.zbc_has_cksum = 0;
1672 		zbc.zbc_injected = rm->rm_ecksuminjected;
1673 
1674 		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1675 		    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1676 		    &zbc);
1677 	}
1678 }
1679 
1680 /*
1681  * We keep track of whether or not there were any injected errors, so that
1682  * any ereports we generate can note it.
1683  */
1684 static int
1685 raidz_checksum_verify(zio_t *zio)
1686 {
1687 	zio_bad_cksum_t zbc;
1688 	raidz_map_t *rm = zio->io_vsd;
1689 
1690 	int ret = zio_checksum_error(zio, &zbc);
1691 	if (ret != 0 && zbc.zbc_injected != 0)
1692 		rm->rm_ecksuminjected = 1;
1693 
1694 	return (ret);
1695 }
1696 
1697 /*
1698  * Generate the parity from the data columns. If we tried and were able to
1699  * read the parity without error, verify that the generated parity matches the
1700  * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
1702  */
1703 static int
1704 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1705 {
1706 	void *orig[VDEV_RAIDZ_MAXPARITY];
1707 	int c, ret = 0;
1708 	raidz_col_t *rc;
1709 
1710 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1711 		rc = &rm->rm_col[c];
1712 		if (!rc->rc_tried || rc->rc_error != 0)
1713 			continue;
1714 		orig[c] = zio_buf_alloc(rc->rc_size);
1715 		bcopy(rc->rc_data, orig[c], rc->rc_size);
1716 	}
1717 
1718 	vdev_raidz_generate_parity(rm);
1719 
1720 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1721 		rc = &rm->rm_col[c];
1722 		if (!rc->rc_tried || rc->rc_error != 0)
1723 			continue;
1724 		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1725 			raidz_checksum_error(zio, rc, orig[c]);
1726 			rc->rc_error = SET_ERROR(ECKSUM);
1727 			ret++;
1728 		}
1729 		zio_buf_free(orig[c], rc->rc_size);
1730 	}
1731 
1732 	return (ret);
1733 }
1734 
1735 /*
1736  * Keep statistics on all the ways that we used parity to correct data.
1737  */
1738 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1739 
1740 static int
1741 vdev_raidz_worst_error(raidz_map_t *rm)
1742 {
1743 	int error = 0;
1744 
1745 	for (int c = 0; c < rm->rm_cols; c++)
1746 		error = zio_worst_error(error, rm->rm_col[c].rc_error);
1747 
1748 	return (error);
1749 }
1750 
1751 /*
1752  * Iterate over all combinations of bad data and attempt a reconstruction.
1753  * Note that the algorithm below is non-optimal because it doesn't take into
1754  * account how reconstruction is actually performed. For example, with
1755  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1756  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1757  * cases we'd only use parity information in column 0.
1758  */
/*
 * Combinatorial reconstruction: for n = 1 up to the number of spare
 * parity columns, try every combination of n additional columns as the
 * hypothetical bad set, reconstruct, and test the checksum. Returns the
 * parity bitmask of a successful reconstruction, or 0 if none verified.
 */
static int
vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
{
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	/* tstore has one extra slot on each side so tgts[-1] and tgts[n]
	 * are valid sentinels. */
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			/* Skip columns already known to be in error. */
			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		/* One save buffer per target; all columns share rc_size
		 * of column 0 as the allocation size here. */
		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		/* Enumerate all n-column combinations of valid columns. */
		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * Mark the targeted columns as bad; any we
				 * actually read get a checksum-error report
				 * with the saved (bad) data attached.
				 */
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					if (rc->rc_tried)
						raidz_checksum_error(zio, rc,
						    orig[i]);
					rc->rc_error = SET_ERROR(ECKSUM);
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			/*
			 * Advance to the next combination: odometer-style
			 * increment over the valid (error-free) columns.
			 */
			do {
				/*
				 * Find the next valid column after the current
				 * position..
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	/* The loop above allocated buffers for n targets; adjust n so the
	 * cleanup below frees exactly the buffers that were allocated. */
	n--;
done:
	for (i = 0; i < n; i++) {
		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}
1906 
1907 /*
1908  * Complete an IO operation on a RAIDZ VDev
1909  *
1910  * Outline:
1911  * - For write operations:
1912  *   1. Check for errors on the child IOs.
1913  *   2. Return, setting an error code if too few child VDevs were written
1914  *      to reconstruct the data later.  Note that partial writes are
1915  *      considered successful if they can be reconstructed at all.
1916  * - For read operations:
1917  *   1. Check for errors on the child IOs.
1918  *   2. If data errors occurred:
1919  *      a. Try to reassemble the data from the parity available.
1920  *      b. If we haven't yet read the parity drives, read them now.
1921  *      c. If all parity drives have been read but the data still doesn't
1922  *         reassemble with a correct checksum, then try combinatorial
1923  *         reconstruction.
1924  *      d. If that doesn't work, return an error.
1925  *   3. If there were unexpected errors or this is a resilver operation,
1926  *      rewrite the vdevs that had errors.
1927  */
static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	int unexpected_errors = 0;	/* errors on columns not marked skipped */
	int parity_errors = 0;		/* parity columns that reported errors */
	int parity_untried = 0;		/* parity columns that were never read */
	int data_errors = 0;		/* data columns that reported errors */
	int total_errors = 0;		/* parity_errors + data_errors */
	int n, c;
	int tgts[VDEV_RAIDZ_MAXPARITY];	/* indices of columns to reconstruct */
	int code;			/* value from vdev_raidz_reconstruct() */

	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Classify the outcome of each child I/O.  Columns with index below
	 * rm_firstdatacol hold parity; the remainder hold data.
	 */
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as a success.
		 * (If we couldn't write enough columns to reconstruct
		 * the data, the I/O failed.  Otherwise, good enough.)
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		/* XXPOLICY */
		if (total_errors > rm->rm_firstdatacol)
			zio->io_error = vdev_raidz_worst_error(rm);

		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);
	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			/* All data columns arrived intact; just verify. */
			if (raidz_checksum_verify(zio) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(zio) == 0) {
				/* Reconstruction succeeded; count it. */
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read error or
	 * a child silently returned bad data. Read every block so we can
	 * try again with as much data and parity as we can track down. If
	 * we've already been through once before, all children will be marked
	 * as tried so we'll proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;	/* ensure the repair pass at done: runs */
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	/*
	 * Issue reads for every column not yet tried.  Note the inner
	 * do/while shares the loop index c with the outer for: the inner
	 * `continue` resumes at the `while (++c ...)` test, so each
	 * untried column is issued exactly once before we return.
	 */
	for (c = 0; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_tried)
			continue;

		zio_vdev_io_redone(zio);
		do {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;	/* resumes at the while test */
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_devidx],
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		} while (++c < rm->rm_cols);

		return;
	}

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		/* More failures than parity can cover: unrecoverable. */
		zio->io_error = vdev_raidz_worst_error(rm);

	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(zio, rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_first_datacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		zio->io_error = SET_ERROR(ECKSUM);

		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			for (c = 0; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error == 0) {
					zio_bad_cksum_t zbc;
					zbc.zbc_has_cksum = 0;
					zbc.zbc_injected =
					    rm->rm_ecksuminjected;

					zfs_ereport_start_checksum(
					    zio->io_spa,
					    vd->vdev_child[rc->rc_devidx],
					    zio, rc->rc_offset, rc->rc_size,
					    (void *)(uintptr_t)c, &zbc);
				}
			}
		}
	}

done:
	zio_checksum_verified(zio);

	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error == 0)
				continue;

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    ZIO_TYPE_WRITE, zio->io_priority,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}
2191 
2192 static void
2193 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2194 {
2195 	if (faulted > vd->vdev_nparity)
2196 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2197 		    VDEV_AUX_NO_REPLICAS);
2198 	else if (degraded + faulted != 0)
2199 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2200 	else
2201 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2202 }
2203 
/*
 * Operation vector registering the RAID-Z implementation with the
 * generic vdev layer.  Entries are positional; the order must match
 * vdev_ops_t in vdev_impl.h.
 */
vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	NULL,			/* optional op unused by raidz; presumably hold — confirm in vdev_impl.h */
	NULL,			/* optional op unused by raidz; presumably rele — confirm in vdev_impl.h */
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};
2216