xref: /titanic_41/usr/src/uts/common/fs/zfs/vdev_raidz.c (revision 3e30c24aeefdee1631958ecf17f18da671781956)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2013 by Delphix. All rights reserved.
25  */
26 
27 #include <sys/zfs_context.h>
28 #include <sys/spa.h>
29 #include <sys/vdev_impl.h>
30 #include <sys/zio.h>
31 #include <sys/zio_checksum.h>
32 #include <sys/fs/zfs.h>
33 #include <sys/fm/fs/zfs.h>
34 
35 /*
36  * Virtual device vector for RAID-Z.
37  *
38  * This vdev supports single, double, and triple parity. For single parity,
39  * we use a simple XOR of all the data columns. For double or triple parity,
40  * we use a special case of Reed-Solomon coding. This extends the
41  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
42  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
43  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
44  * former is also based. The latter is designed to provide higher performance
45  * for writes.
46  *
47  * Note that the Plank paper claimed to support arbitrary N+M, but was then
48  * amended six years later identifying a critical flaw that invalidates its
49  * claims. Nevertheless, the technique can be adapted to work for up to
50  * triple parity. For additional parity, the amendment "Note: Correction to
51  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
52  * is viable, but the additional complexity means that write performance will
53  * suffer.
54  *
55  * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
57  * can be expressed with a single byte. Briefly, the operations on the
58  * field are defined as follows:
59  *
60  *   o addition (+) is represented by a bitwise XOR
61  *   o subtraction (-) is therefore identical to addition: A + B = A - B
62  *   o multiplication of A by 2 is defined by the following bitwise expression:
63  *	(A * 2)_7 = A_6
64  *	(A * 2)_6 = A_5
65  *	(A * 2)_5 = A_4
66  *	(A * 2)_4 = A_3 + A_7
67  *	(A * 2)_3 = A_2 + A_7
68  *	(A * 2)_2 = A_1 + A_7
69  *	(A * 2)_1 = A_0
70  *	(A * 2)_0 = A_7
71  *
72  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
73  * As an aside, this multiplication is derived from the error correcting
74  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
75  *
76  * Observe that any number in the field (except for 0) can be expressed as a
77  * power of 2 -- a generator for the field. We store a table of the powers of
78  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
79  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
80  * than field addition). The inverse of a field element A (A^-1) is therefore
81  * A ^ (255 - 1) = A^254.
82  *
83  * The up-to-three parity columns, P, Q, R over several data columns,
84  * D_0, ... D_n-1, can be expressed by field operations:
85  *
86  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
87  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
88  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
89  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
90  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
91  *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
93  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
94  * independent coefficients. (There are no additional coefficients that have
95  * this property which is why the uncorrected Plank method breaks down.)
96  *
 * See the reconstruction code below for how P, Q and R can be used individually
98  * or in concert to recover missing data columns.
99  */
100 
101 typedef struct raidz_col {
102 	uint64_t rc_devidx;		/* child device index for I/O */
103 	uint64_t rc_offset;		/* device offset */
104 	uint64_t rc_size;		/* I/O size */
105 	void *rc_data;			/* I/O data */
106 	void *rc_gdata;			/* used to store the "good" version */
107 	int rc_error;			/* I/O error for this device */
108 	uint8_t rc_tried;		/* Did we attempt this I/O column? */
109 	uint8_t rc_skipped;		/* Did we skip this I/O column? */
110 } raidz_col_t;
111 
112 typedef struct raidz_map {
113 	uint64_t rm_cols;		/* Regular column count */
114 	uint64_t rm_scols;		/* Count including skipped columns */
115 	uint64_t rm_bigcols;		/* Number of oversized columns */
116 	uint64_t rm_asize;		/* Actual total I/O size */
117 	uint64_t rm_missingdata;	/* Count of missing data devices */
118 	uint64_t rm_missingparity;	/* Count of missing parity devices */
119 	uint64_t rm_firstdatacol;	/* First data column/parity count */
120 	uint64_t rm_nskip;		/* Skipped sectors for padding */
121 	uint64_t rm_skipstart;	/* Column index of padding start */
122 	void *rm_datacopy;		/* rm_asize-buffer of copied data */
123 	uintptr_t rm_reports;		/* # of referencing checksum reports */
124 	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
125 	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
126 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
127 } raidz_map_t;
128 
129 #define	VDEV_RAIDZ_P		0
130 #define	VDEV_RAIDZ_Q		1
131 #define	VDEV_RAIDZ_R		2
132 
133 #define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
134 #define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
135 
136 /*
137  * We provide a mechanism to perform the field multiplication operation on a
138  * 64-bit value all at once rather than a byte at a time. This works by
139  * creating a mask from the top bit in each byte and using that to
140  * conditionally apply the XOR of 0x1d.
141  */
142 #define	VDEV_RAIDZ_64MUL_2(x, mask) \
143 { \
144 	(mask) = (x) & 0x8080808080808080ULL; \
145 	(mask) = ((mask) << 1) - ((mask) >> 7); \
146 	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
147 	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
148 }
149 
150 #define	VDEV_RAIDZ_64MUL_4(x, mask) \
151 { \
152 	VDEV_RAIDZ_64MUL_2((x), mask); \
153 	VDEV_RAIDZ_64MUL_2((x), mask); \
154 }
155 
156 /*
157  * Force reconstruction to use the general purpose method.
158  */
159 int vdev_raidz_default_to_general;
160 
161 /*
162  * These two tables represent powers and logs of 2 in the Galois field defined
163  * above. These values were computed by repeatedly multiplying by 2 as above.
164  */
165 static const uint8_t vdev_raidz_pow2[256] = {
166 	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
167 	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
168 	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
169 	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
170 	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
171 	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
172 	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
173 	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
174 	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
175 	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
176 	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
177 	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
178 	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
179 	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
180 	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
181 	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
182 	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
183 	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
184 	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
185 	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
186 	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
187 	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
188 	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
189 	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
190 	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
191 	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
192 	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
193 	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
194 	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
195 	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
196 	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
197 	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
198 };
199 static const uint8_t vdev_raidz_log2[256] = {
200 	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
201 	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
202 	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
203 	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
204 	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
205 	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
206 	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
207 	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
208 	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
209 	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
210 	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
211 	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
212 	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
213 	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
214 	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
215 	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
216 	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
217 	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
218 	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
219 	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
220 	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
221 	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
222 	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
223 	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
224 	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
225 	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
226 	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
227 	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
228 	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
229 	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
230 	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
231 	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
232 };
233 
234 static void vdev_raidz_generate_parity(raidz_map_t *rm);
235 
236 /*
237  * Multiply a given number by 2 raised to the given power.
238  */
239 static uint8_t
240 vdev_raidz_exp2(uint_t a, int exp)
241 {
242 	if (a == 0)
243 		return (0);
244 
245 	ASSERT(exp >= 0);
246 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
247 
248 	exp += vdev_raidz_log2[a];
249 	if (exp > 255)
250 		exp -= 255;
251 
252 	return (vdev_raidz_pow2[exp]);
253 }
254 
255 static void
256 vdev_raidz_map_free(raidz_map_t *rm)
257 {
258 	int c;
259 	size_t size;
260 
261 	for (c = 0; c < rm->rm_firstdatacol; c++) {
262 		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
263 
264 		if (rm->rm_col[c].rc_gdata != NULL)
265 			zio_buf_free(rm->rm_col[c].rc_gdata,
266 			    rm->rm_col[c].rc_size);
267 	}
268 
269 	size = 0;
270 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
271 		size += rm->rm_col[c].rc_size;
272 
273 	if (rm->rm_datacopy != NULL)
274 		zio_buf_free(rm->rm_datacopy, size);
275 
276 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
277 }
278 
279 static void
280 vdev_raidz_map_free_vsd(zio_t *zio)
281 {
282 	raidz_map_t *rm = zio->io_vsd;
283 
284 	ASSERT0(rm->rm_freed);
285 	rm->rm_freed = 1;
286 
287 	if (rm->rm_reports == 0)
288 		vdev_raidz_map_free(rm);
289 }
290 
291 /*ARGSUSED*/
292 static void
293 vdev_raidz_cksum_free(void *arg, size_t ignored)
294 {
295 	raidz_map_t *rm = arg;
296 
297 	ASSERT3U(rm->rm_reports, >, 0);
298 
299 	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
300 		vdev_raidz_map_free(rm);
301 }
302 
303 static void
304 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
305 {
306 	raidz_map_t *rm = zcr->zcr_cbdata;
307 	size_t c = zcr->zcr_cbinfo;
308 	size_t x;
309 
310 	const char *good = NULL;
311 	const char *bad = rm->rm_col[c].rc_data;
312 
313 	if (good_data == NULL) {
314 		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
315 		return;
316 	}
317 
318 	if (c < rm->rm_firstdatacol) {
319 		/*
320 		 * The first time through, calculate the parity blocks for
321 		 * the good data (this relies on the fact that the good
322 		 * data never changes for a given logical ZIO)
323 		 */
324 		if (rm->rm_col[0].rc_gdata == NULL) {
325 			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
326 			char *buf;
327 
328 			/*
329 			 * Set up the rm_col[]s to generate the parity for
330 			 * good_data, first saving the parity bufs and
331 			 * replacing them with buffers to hold the result.
332 			 */
333 			for (x = 0; x < rm->rm_firstdatacol; x++) {
334 				bad_parity[x] = rm->rm_col[x].rc_data;
335 				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
336 				    zio_buf_alloc(rm->rm_col[x].rc_size);
337 			}
338 
339 			/* fill in the data columns from good_data */
340 			buf = (char *)good_data;
341 			for (; x < rm->rm_cols; x++) {
342 				rm->rm_col[x].rc_data = buf;
343 				buf += rm->rm_col[x].rc_size;
344 			}
345 
346 			/*
347 			 * Construct the parity from the good data.
348 			 */
349 			vdev_raidz_generate_parity(rm);
350 
351 			/* restore everything back to its original state */
352 			for (x = 0; x < rm->rm_firstdatacol; x++)
353 				rm->rm_col[x].rc_data = bad_parity[x];
354 
355 			buf = rm->rm_datacopy;
356 			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
357 				rm->rm_col[x].rc_data = buf;
358 				buf += rm->rm_col[x].rc_size;
359 			}
360 		}
361 
362 		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
363 		good = rm->rm_col[c].rc_gdata;
364 	} else {
365 		/* adjust good_data to point at the start of our column */
366 		good = good_data;
367 
368 		for (x = rm->rm_firstdatacol; x < c; x++)
369 			good += rm->rm_col[x].rc_size;
370 	}
371 
372 	/* we drop the ereport if it ends up that the data was good */
373 	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
374 }
375 
376 /*
377  * Invoked indirectly by zfs_ereport_start_checksum(), called
378  * below when our read operation fails completely.  The main point
379  * is to keep a copy of everything we read from disk, so that at
380  * vdev_raidz_cksum_finish() time we can compare it with the good data.
381  */
382 static void
383 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
384 {
385 	size_t c = (size_t)(uintptr_t)arg;
386 	caddr_t buf;
387 
388 	raidz_map_t *rm = zio->io_vsd;
389 	size_t size;
390 
391 	/* set up the report and bump the refcount  */
392 	zcr->zcr_cbdata = rm;
393 	zcr->zcr_cbinfo = c;
394 	zcr->zcr_finish = vdev_raidz_cksum_finish;
395 	zcr->zcr_free = vdev_raidz_cksum_free;
396 
397 	rm->rm_reports++;
398 	ASSERT3U(rm->rm_reports, >, 0);
399 
400 	if (rm->rm_datacopy != NULL)
401 		return;
402 
403 	/*
404 	 * It's the first time we're called for this raidz_map_t, so we need
405 	 * to copy the data aside; there's no guarantee that our zio's buffer
406 	 * won't be re-used for something else.
407 	 *
408 	 * Our parity data is already in separate buffers, so there's no need
409 	 * to copy them.
410 	 */
411 
412 	size = 0;
413 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
414 		size += rm->rm_col[c].rc_size;
415 
416 	buf = rm->rm_datacopy = zio_buf_alloc(size);
417 
418 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
419 		raidz_col_t *col = &rm->rm_col[c];
420 
421 		bcopy(col->rc_data, buf, col->rc_size);
422 		col->rc_data = buf;
423 
424 		buf += col->rc_size;
425 	}
426 	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
427 }
428 
429 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
430 	vdev_raidz_map_free_vsd,
431 	vdev_raidz_cksum_report
432 };
433 
434 /*
435  * Divides the IO evenly across all child vdevs; usually, dcols is
436  * the number of children in the target vdev.
437  */
438 static raidz_map_t *
439 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
440     uint64_t nparity)
441 {
442 	raidz_map_t *rm;
443 	/* The starting RAIDZ (parent) vdev sector of the block. */
444 	uint64_t b = zio->io_offset >> unit_shift;
445 	/* The zio's size in units of the vdev's minimum sector size. */
446 	uint64_t s = zio->io_size >> unit_shift;
447 	/* The first column for this stripe. */
448 	uint64_t f = b % dcols;
449 	/* The starting byte offset on each child vdev. */
450 	uint64_t o = (b / dcols) << unit_shift;
451 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
452 
453 	/*
454 	 * "Quotient": The number of data sectors for this stripe on all but
455 	 * the "big column" child vdevs that also contain "remainder" data.
456 	 */
457 	q = s / (dcols - nparity);
458 
459 	/*
460 	 * "Remainder": The number of partial stripe data sectors in this I/O.
461 	 * This will add a sector to some, but not all, child vdevs.
462 	 */
463 	r = s - q * (dcols - nparity);
464 
465 	/* The number of "big columns" - those which contain remainder data. */
466 	bc = (r == 0 ? 0 : r + nparity);
467 
468 	/*
469 	 * The total number of data and parity sectors associated with
470 	 * this I/O.
471 	 */
472 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
473 
474 	/* acols: The columns that will be accessed. */
475 	/* scols: The columns that will be accessed or skipped. */
476 	if (q == 0) {
477 		/* Our I/O request doesn't span all child vdevs. */
478 		acols = bc;
479 		scols = MIN(dcols, roundup(bc, nparity + 1));
480 	} else {
481 		acols = dcols;
482 		scols = dcols;
483 	}
484 
485 	ASSERT3U(acols, <=, scols);
486 
487 	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
488 
489 	rm->rm_cols = acols;
490 	rm->rm_scols = scols;
491 	rm->rm_bigcols = bc;
492 	rm->rm_skipstart = bc;
493 	rm->rm_missingdata = 0;
494 	rm->rm_missingparity = 0;
495 	rm->rm_firstdatacol = nparity;
496 	rm->rm_datacopy = NULL;
497 	rm->rm_reports = 0;
498 	rm->rm_freed = 0;
499 	rm->rm_ecksuminjected = 0;
500 
501 	asize = 0;
502 
503 	for (c = 0; c < scols; c++) {
504 		col = f + c;
505 		coff = o;
506 		if (col >= dcols) {
507 			col -= dcols;
508 			coff += 1ULL << unit_shift;
509 		}
510 		rm->rm_col[c].rc_devidx = col;
511 		rm->rm_col[c].rc_offset = coff;
512 		rm->rm_col[c].rc_data = NULL;
513 		rm->rm_col[c].rc_gdata = NULL;
514 		rm->rm_col[c].rc_error = 0;
515 		rm->rm_col[c].rc_tried = 0;
516 		rm->rm_col[c].rc_skipped = 0;
517 
518 		if (c >= acols)
519 			rm->rm_col[c].rc_size = 0;
520 		else if (c < bc)
521 			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
522 		else
523 			rm->rm_col[c].rc_size = q << unit_shift;
524 
525 		asize += rm->rm_col[c].rc_size;
526 	}
527 
528 	ASSERT3U(asize, ==, tot << unit_shift);
529 	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
530 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
531 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
532 	ASSERT3U(rm->rm_nskip, <=, nparity);
533 
534 	for (c = 0; c < rm->rm_firstdatacol; c++)
535 		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
536 
537 	rm->rm_col[c].rc_data = zio->io_data;
538 
539 	for (c = c + 1; c < acols; c++)
540 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
541 		    rm->rm_col[c - 1].rc_size;
542 
543 	/*
544 	 * If all data stored spans all columns, there's a danger that parity
545 	 * will always be on the same device and, since parity isn't read
546 	 * during normal operation, that that device's I/O bandwidth won't be
547 	 * used effectively. We therefore switch the parity every 1MB.
548 	 *
549 	 * ... at least that was, ostensibly, the theory. As a practical
550 	 * matter unless we juggle the parity between all devices evenly, we
551 	 * won't see any benefit. Further, occasional writes that aren't a
552 	 * multiple of the LCM of the number of children and the minimum
553 	 * stripe width are sufficient to avoid pessimal behavior.
554 	 * Unfortunately, this decision created an implicit on-disk format
555 	 * requirement that we need to support for all eternity, but only
556 	 * for single-parity RAID-Z.
557 	 *
558 	 * If we intend to skip a sector in the zeroth column for padding
559 	 * we must make sure to note this swap. We will never intend to
560 	 * skip the first column since at least one data and one parity
561 	 * column must appear in each row.
562 	 */
563 	ASSERT(rm->rm_cols >= 2);
564 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
565 
566 	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
567 		devidx = rm->rm_col[0].rc_devidx;
568 		o = rm->rm_col[0].rc_offset;
569 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
570 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
571 		rm->rm_col[1].rc_devidx = devidx;
572 		rm->rm_col[1].rc_offset = o;
573 
574 		if (rm->rm_skipstart == 0)
575 			rm->rm_skipstart = 1;
576 	}
577 
578 	zio->io_vsd = rm;
579 	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
580 	return (rm);
581 }
582 
583 static void
584 vdev_raidz_generate_parity_p(raidz_map_t *rm)
585 {
586 	uint64_t *p, *src, pcount, ccount, i;
587 	int c;
588 
589 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
590 
591 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
592 		src = rm->rm_col[c].rc_data;
593 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
594 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
595 
596 		if (c == rm->rm_firstdatacol) {
597 			ASSERT(ccount == pcount);
598 			for (i = 0; i < ccount; i++, src++, p++) {
599 				*p = *src;
600 			}
601 		} else {
602 			ASSERT(ccount <= pcount);
603 			for (i = 0; i < ccount; i++, src++, p++) {
604 				*p ^= *src;
605 			}
606 		}
607 	}
608 }
609 
610 static void
611 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
612 {
613 	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
614 	int c;
615 
616 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
617 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
618 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
619 
620 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
621 		src = rm->rm_col[c].rc_data;
622 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
623 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
624 
625 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
626 
627 		if (c == rm->rm_firstdatacol) {
628 			ASSERT(ccnt == pcnt || ccnt == 0);
629 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
630 				*p = *src;
631 				*q = *src;
632 			}
633 			for (; i < pcnt; i++, src++, p++, q++) {
634 				*p = 0;
635 				*q = 0;
636 			}
637 		} else {
638 			ASSERT(ccnt <= pcnt);
639 
640 			/*
641 			 * Apply the algorithm described above by multiplying
642 			 * the previous result and adding in the new value.
643 			 */
644 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
645 				*p ^= *src;
646 
647 				VDEV_RAIDZ_64MUL_2(*q, mask);
648 				*q ^= *src;
649 			}
650 
651 			/*
652 			 * Treat short columns as though they are full of 0s.
653 			 * Note that there's therefore nothing needed for P.
654 			 */
655 			for (; i < pcnt; i++, q++) {
656 				VDEV_RAIDZ_64MUL_2(*q, mask);
657 			}
658 		}
659 	}
660 }
661 
662 static void
663 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
664 {
665 	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
666 	int c;
667 
668 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
669 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
670 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
671 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
672 	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
673 
674 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
675 		src = rm->rm_col[c].rc_data;
676 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
677 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
678 		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
679 
680 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
681 
682 		if (c == rm->rm_firstdatacol) {
683 			ASSERT(ccnt == pcnt || ccnt == 0);
684 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
685 				*p = *src;
686 				*q = *src;
687 				*r = *src;
688 			}
689 			for (; i < pcnt; i++, src++, p++, q++, r++) {
690 				*p = 0;
691 				*q = 0;
692 				*r = 0;
693 			}
694 		} else {
695 			ASSERT(ccnt <= pcnt);
696 
697 			/*
698 			 * Apply the algorithm described above by multiplying
699 			 * the previous result and adding in the new value.
700 			 */
701 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
702 				*p ^= *src;
703 
704 				VDEV_RAIDZ_64MUL_2(*q, mask);
705 				*q ^= *src;
706 
707 				VDEV_RAIDZ_64MUL_4(*r, mask);
708 				*r ^= *src;
709 			}
710 
711 			/*
712 			 * Treat short columns as though they are full of 0s.
713 			 * Note that there's therefore nothing needed for P.
714 			 */
715 			for (; i < pcnt; i++, q++, r++) {
716 				VDEV_RAIDZ_64MUL_2(*q, mask);
717 				VDEV_RAIDZ_64MUL_4(*r, mask);
718 			}
719 		}
720 	}
721 }
722 
723 /*
724  * Generate RAID parity in the first virtual columns according to the number of
725  * parity columns available.
726  */
727 static void
728 vdev_raidz_generate_parity(raidz_map_t *rm)
729 {
730 	switch (rm->rm_firstdatacol) {
731 	case 1:
732 		vdev_raidz_generate_parity_p(rm);
733 		break;
734 	case 2:
735 		vdev_raidz_generate_parity_pq(rm);
736 		break;
737 	case 3:
738 		vdev_raidz_generate_parity_pqr(rm);
739 		break;
740 	default:
741 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
742 	}
743 }
744 
745 static int
746 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
747 {
748 	uint64_t *dst, *src, xcount, ccount, count, i;
749 	int x = tgts[0];
750 	int c;
751 
752 	ASSERT(ntgts == 1);
753 	ASSERT(x >= rm->rm_firstdatacol);
754 	ASSERT(x < rm->rm_cols);
755 
756 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
757 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
758 	ASSERT(xcount > 0);
759 
760 	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
761 	dst = rm->rm_col[x].rc_data;
762 	for (i = 0; i < xcount; i++, dst++, src++) {
763 		*dst = *src;
764 	}
765 
766 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
767 		src = rm->rm_col[c].rc_data;
768 		dst = rm->rm_col[x].rc_data;
769 
770 		if (c == x)
771 			continue;
772 
773 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
774 		count = MIN(ccount, xcount);
775 
776 		for (i = 0; i < count; i++, dst++, src++) {
777 			*dst ^= *src;
778 		}
779 	}
780 
781 	return (1 << VDEV_RAIDZ_P);
782 }
783 
784 static int
785 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
786 {
787 	uint64_t *dst, *src, xcount, ccount, count, mask, i;
788 	uint8_t *b;
789 	int x = tgts[0];
790 	int c, j, exp;
791 
792 	ASSERT(ntgts == 1);
793 
794 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
795 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
796 
797 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
798 		src = rm->rm_col[c].rc_data;
799 		dst = rm->rm_col[x].rc_data;
800 
801 		if (c == x)
802 			ccount = 0;
803 		else
804 			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
805 
806 		count = MIN(ccount, xcount);
807 
808 		if (c == rm->rm_firstdatacol) {
809 			for (i = 0; i < count; i++, dst++, src++) {
810 				*dst = *src;
811 			}
812 			for (; i < xcount; i++, dst++) {
813 				*dst = 0;
814 			}
815 
816 		} else {
817 			for (i = 0; i < count; i++, dst++, src++) {
818 				VDEV_RAIDZ_64MUL_2(*dst, mask);
819 				*dst ^= *src;
820 			}
821 
822 			for (; i < xcount; i++, dst++) {
823 				VDEV_RAIDZ_64MUL_2(*dst, mask);
824 			}
825 		}
826 	}
827 
828 	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
829 	dst = rm->rm_col[x].rc_data;
830 	exp = 255 - (rm->rm_cols - 1 - x);
831 
832 	for (i = 0; i < xcount; i++, dst++, src++) {
833 		*dst ^= *src;
834 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
835 			*b = vdev_raidz_exp2(*b, exp);
836 		}
837 	}
838 
839 	return (1 << VDEV_RAIDZ_Q);
840 }
841 
842 static int
843 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
844 {
845 	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
846 	void *pdata, *qdata;
847 	uint64_t xsize, ysize, i;
848 	int x = tgts[0];
849 	int y = tgts[1];
850 
851 	ASSERT(ntgts == 2);
852 	ASSERT(x < y);
853 	ASSERT(x >= rm->rm_firstdatacol);
854 	ASSERT(y < rm->rm_cols);
855 
856 	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
857 
858 	/*
859 	 * Move the parity data aside -- we're going to compute parity as
860 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
861 	 * reuse the parity generation mechanism without trashing the actual
862 	 * parity so we make those columns appear to be full of zeros by
863 	 * setting their lengths to zero.
864 	 */
865 	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
866 	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
867 	xsize = rm->rm_col[x].rc_size;
868 	ysize = rm->rm_col[y].rc_size;
869 
870 	rm->rm_col[VDEV_RAIDZ_P].rc_data =
871 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
872 	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
873 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
874 	rm->rm_col[x].rc_size = 0;
875 	rm->rm_col[y].rc_size = 0;
876 
877 	vdev_raidz_generate_parity_pq(rm);
878 
879 	rm->rm_col[x].rc_size = xsize;
880 	rm->rm_col[y].rc_size = ysize;
881 
882 	p = pdata;
883 	q = qdata;
884 	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
885 	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
886 	xd = rm->rm_col[x].rc_data;
887 	yd = rm->rm_col[y].rc_data;
888 
889 	/*
890 	 * We now have:
891 	 *	Pxy = P + D_x + D_y
892 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
893 	 *
894 	 * We can then solve for D_x:
895 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
896 	 * where
897 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
898 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
899 	 *
900 	 * With D_x in hand, we can easily solve for D_y:
901 	 *	D_y = P + Pxy + D_x
902 	 */
903 
904 	a = vdev_raidz_pow2[255 + x - y];
905 	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
906 	tmp = 255 - vdev_raidz_log2[a ^ 1];
907 
908 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
909 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
910 
911 	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
912 		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
913 		    vdev_raidz_exp2(*q ^ *qxy, bexp);
914 
915 		if (i < ysize)
916 			*yd = *p ^ *pxy ^ *xd;
917 	}
918 
919 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
920 	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
921 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
922 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
923 
924 	/*
925 	 * Restore the saved parity data.
926 	 */
927 	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
928 	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
929 
930 	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
931 }
932 
933 /* BEGIN CSTYLED */
934 /*
935  * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
937  * the contents of the data and parity disks. This can be expressed with
938  * vectors for the original data (D) and the actual data (d) and parity (p)
939  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
940  *
941  *            __   __                     __     __
942  *            |     |         __     __   |  p_0  |
943  *            |  V  |         |  D_0  |   | p_m-1 |
944  *            |     |    x    |   :   | = |  d_0  |
945  *            |  I  |         | D_n-1 |   |   :   |
946  *            |     |         ~~     ~~   | d_n-1 |
947  *            ~~   ~~                     ~~     ~~
948  *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
951  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
952  * computation as well as linear separability.
953  *
954  *      __               __               __     __
955  *      |   1   ..  1 1 1 |               |  p_0  |
956  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
957  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
958  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
959  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
960  *      |   :       : : : |   |   :   |   |  d_2  |
961  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
962  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
963  *      |   0   ..  0 0 1 |               | d_n-1 |
964  *      ~~               ~~               ~~     ~~
965  *
966  * Note that I, V, d, and p are known. To compute D, we must invert the
967  * matrix and use the known data and parity values to reconstruct the unknown
968  * data values. We begin by removing the rows in V|I and d|p that correspond
969  * to failed or missing columns; we then make V|I square (n x n) and d|p
970  * sized n by removing rows corresponding to unused parity from the bottom up
971  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
972  * using Gauss-Jordan elimination. In the example below we use m=3 parity
973  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
974  *           __                               __
975  *           |  1   1   1   1   1   1   1   1  |
976  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
977  *           |  19 205 116  29  64  16  4   1  |      / /
978  *           |  1   0   0   0   0   0   0   0  |     / /
979  *           |  0   1   0   0   0   0   0   0  | <--' /
980  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
981  *           |  0   0   0   1   0   0   0   0  |
982  *           |  0   0   0   0   1   0   0   0  |
983  *           |  0   0   0   0   0   1   0   0  |
984  *           |  0   0   0   0   0   0   1   0  |
985  *           |  0   0   0   0   0   0   0   1  |
986  *           ~~                               ~~
987  *           __                               __
988  *           |  1   1   1   1   1   1   1   1  |
989  *           | 128  64  32  16  8   4   2   1  |
990  *           |  19 205 116  29  64  16  4   1  |
991  *           |  1   0   0   0   0   0   0   0  |
992  *           |  0   1   0   0   0   0   0   0  |
993  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
994  *           |  0   0   0   1   0   0   0   0  |
995  *           |  0   0   0   0   1   0   0   0  |
996  *           |  0   0   0   0   0   1   0   0  |
997  *           |  0   0   0   0   0   0   1   0  |
998  *           |  0   0   0   0   0   0   0   1  |
999  *           ~~                               ~~
1000  *
1001  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1002  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1003  * matrix is not singular.
1004  * __                                                                 __
1005  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1006  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1007  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1008  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1009  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1010  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1011  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1012  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1013  * ~~                                                                 ~~
1014  * __                                                                 __
1015  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1016  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1017  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1018  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1019  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1020  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1021  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1022  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1023  * ~~                                                                 ~~
1024  * __                                                                 __
1025  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1026  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1027  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1028  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1029  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1030  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1031  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1032  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1033  * ~~                                                                 ~~
1034  * __                                                                 __
1035  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1036  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1037  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1038  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1039  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1040  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1041  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1042  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1043  * ~~                                                                 ~~
1044  * __                                                                 __
1045  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1046  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1047  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1048  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1049  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1050  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1051  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1052  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1053  * ~~                                                                 ~~
1054  * __                                                                 __
1055  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1056  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1057  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1058  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1059  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1060  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1061  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1062  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1063  * ~~                                                                 ~~
1064  *                   __                               __
1065  *                   |  0   0   1   0   0   0   0   0  |
1066  *                   | 167 100  5   41 159 169 217 208 |
1067  *                   | 166 100  4   40 158 168 216 209 |
1068  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1069  *                   |  0   0   0   0   1   0   0   0  |
1070  *                   |  0   0   0   0   0   1   0   0  |
1071  *                   |  0   0   0   0   0   0   1   0  |
1072  *                   |  0   0   0   0   0   0   0   1  |
1073  *                   ~~                               ~~
1074  *
1075  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1076  * of the missing data.
1077  *
1078  * As is apparent from the example above, the only non-trivial rows in the
1079  * inverse matrix correspond to the data disks that we're trying to
1080  * reconstruct. Indeed, those are the only rows we need as the others would
1081  * only be useful for reconstructing data known or assumed to be valid. For
1082  * that reason, we only build the coefficients in the rows that correspond to
1083  * targeted columns.
1084  */
1085 /* END CSTYLED */
1086 
1087 static void
1088 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1089     uint8_t **rows)
1090 {
1091 	int i, j;
1092 	int pow;
1093 
1094 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1095 
1096 	/*
1097 	 * Fill in the missing rows of interest.
1098 	 */
1099 	for (i = 0; i < nmap; i++) {
1100 		ASSERT3S(0, <=, map[i]);
1101 		ASSERT3S(map[i], <=, 2);
1102 
1103 		pow = map[i] * n;
1104 		if (pow > 255)
1105 			pow -= 255;
1106 		ASSERT(pow <= 255);
1107 
1108 		for (j = 0; j < n; j++) {
1109 			pow -= map[i];
1110 			if (pow < 0)
1111 				pow += 255;
1112 			rows[i][j] = vdev_raidz_pow2[pow];
1113 		}
1114 	}
1115 }
1116 
/*
 * Invert the nmissing rows of interest of the n x n matrix 'rows' in
 * place via Gauss-Jordan elimination over GF(2^8), accumulating the
 * corresponding rows of the inverse into 'invrows'.  See the block
 * comment above for the derivation.
 *
 *	rm	 - the RAID-Z map for this I/O
 *	n	 - number of data columns (rm_cols - rm_firstdatacol)
 *	nmissing - number of missing data columns
 *	missing	 - indices (relative to the first data column) of the
 *		   missing columns; these are the pivot positions
 *	rows	 - the rows of (V|I)' built by vdev_raidz_matrix_init()
 *	invrows	 - receives the matching rows of the inverse
 *	used	 - the n surviving columns feeding reconstruction,
 *		   parity columns first, then data columns
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 * Start from the identity; row operations applied below mirror the
	 * ones applied to 'rows'.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.  The
	 * identity rows for surviving data columns are eliminated by
	 * moving their coefficients into invrows and zeroing them here.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.  In GF(2^8),
		 * a^-1 == 2^(255 - log2(a)).
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		/* Eliminate this pivot column from every other row. */
		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows is properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}
1210 
/*
 * Rebuild the missing data columns by multiplying the inverse matrix by
 * the surviving columns.  Each surviving column used[i] is scanned once;
 * for every missing column cc we accumulate (in GF(2^8), via the log/exp
 * tables)
 *
 *	dst[cc] ^= invrows[cc][i] * used[i]'s data
 *
 * with a straight assignment on the first column (i == 0) to initialize
 * the destination buffers.
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log = 0;
	uint8_t val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	/* Scratch space holding the log of every inverse-matrix element. */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	/* Precompute the logs; zero has no log, hence the assertion. */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		/*
		 * NOTE(review): on the first pass the source column must
		 * cover the first missing column so every destination byte
		 * is initialized -- presumably guaranteed by column sizing
		 * in the map; the assertion enforces it.
		 */
		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			/* log2(0) is undefined; the zero case is below. */
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					/* Multiply: add logs mod 255. */
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}
1286 
/*
 * General-case reconstruction: rebuild any combination of missing data
 * columns (tgts, ntgts of them) using the matrix method described in the
 * block comment above.  Builds the relevant rows of the dispersal matrix,
 * inverts them, and multiplies the surviving columns through the inverse.
 * Returns a bitmask of the parity columns that were used.
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;


	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.  One surviving parity column is needed per missing
	 * data column; each one chosen is recorded in the code bitmask.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/*
	 * Carve a single allocation into the rows and invrows matrices
	 * (each nmissing_rows x n) followed by the used[] column list.
	 */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	/*
	 * used[] lists the surviving columns: the chosen parity columns
	 * first, then every non-missing data column.
	 */
	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	/* i carries over from above; append the surviving data columns. */
	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}
1392 
/*
 * Reconstruct the data in the columns listed in t (sorted, nt entries),
 * along with any other columns already marked in error on the map.
 * Dispatches to the streamlined P/Q routines where they apply and falls
 * back to the general matrix-based method otherwise.  Returns a bitmask
 * identifying the parity columns used (the reconstruction "code").
 */
static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	/*
	 * Merge the explicit targets with any columns already in error,
	 * counting how many parity and data columns are bad and noting
	 * which parity columns remain valid.
	 */
	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	/*
	 * tgts is sorted, so its first nbadparity entries are parity
	 * columns; dt points at the data-column targets that follow,
	 * which is all the optimized routines operate on.
	 */
	dt = &tgts[nbadparity];

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	if (!vdev_raidz_default_to_general) {
		switch (nbaddata) {
		case 1:
			if (parity_valid[VDEV_RAIDZ_P])
				return (vdev_raidz_reconstruct_p(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_q(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 2);
			break;

		case 2:
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_P] &&
			    parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_pq(rm, dt, 2));

			ASSERT(rm->rm_firstdatacol > 2);

			break;
		}
	}

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}
1471 
1472 static int
1473 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1474     uint64_t *ashift)
1475 {
1476 	vdev_t *cvd;
1477 	uint64_t nparity = vd->vdev_nparity;
1478 	int c;
1479 	int lasterror = 0;
1480 	int numerrors = 0;
1481 
1482 	ASSERT(nparity > 0);
1483 
1484 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
1485 	    vd->vdev_children < nparity + 1) {
1486 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1487 		return (SET_ERROR(EINVAL));
1488 	}
1489 
1490 	vdev_open_children(vd);
1491 
1492 	for (c = 0; c < vd->vdev_children; c++) {
1493 		cvd = vd->vdev_child[c];
1494 
1495 		if (cvd->vdev_open_error != 0) {
1496 			lasterror = cvd->vdev_open_error;
1497 			numerrors++;
1498 			continue;
1499 		}
1500 
1501 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1502 		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1503 		*ashift = MAX(*ashift, cvd->vdev_ashift);
1504 	}
1505 
1506 	*asize *= vd->vdev_children;
1507 	*max_asize *= vd->vdev_children;
1508 
1509 	if (numerrors > nparity) {
1510 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1511 		return (lasterror);
1512 	}
1513 
1514 	return (0);
1515 }
1516 
1517 static void
1518 vdev_raidz_close(vdev_t *vd)
1519 {
1520 	int c;
1521 
1522 	for (c = 0; c < vd->vdev_children; c++)
1523 		vdev_close(vd->vdev_child[c]);
1524 }
1525 
1526 static uint64_t
1527 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1528 {
1529 	uint64_t asize;
1530 	uint64_t ashift = vd->vdev_top->vdev_ashift;
1531 	uint64_t cols = vd->vdev_children;
1532 	uint64_t nparity = vd->vdev_nparity;
1533 
1534 	asize = ((psize - 1) >> ashift) + 1;
1535 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1536 	asize = roundup(asize, nparity + 1) << ashift;
1537 
1538 	return (asize);
1539 }
1540 
1541 static void
1542 vdev_raidz_child_done(zio_t *zio)
1543 {
1544 	raidz_col_t *rc = zio->io_private;
1545 
1546 	rc->rc_error = zio->io_error;
1547 	rc->rc_tried = 1;
1548 	rc->rc_skipped = 0;
1549 }
1550 
1551 /*
1552  * Start an IO operation on a RAIDZ VDev
1553  *
1554  * Outline:
1555  * - For write operations:
1556  *   1. Generate the parity data
1557  *   2. Create child zio write operations to each column's vdev, for both
1558  *      data and parity.
1559  *   3. If the column skips any sectors for padding, create optional dummy
 *      write zio children for those areas to improve aggregation contiguity.
1561  * - For read operations:
1562  *   1. Create child zio read operations to each data column's vdev to read
1563  *      the range of data required for zio.
1564  *   2. If this is a scrub or resilver operation, or if any of the data
1565  *      vdevs have had errors, then create zio read operations to the parity
1566  *      columns' VDevs as well.
1567  */
static int
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, i;

	/*
	 * Lay out this I/O across the columns of the stripe; the map is
	 * also what vdev_raidz_io_done() consults (via io_vsd).
	 */
	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_generate_parity(rm);

		/* Issue a child write for every column, parity included. */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		/*
		 * Generate optional I/Os for any skipped sectors to improve
		 * aggregation contiguity.  The column index wraps around
		 * at rm_scols.
		 */
		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
			ASSERT(c <= rm->rm_scols);
			if (c == rm->rm_scols)
				c = 0;
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset + rc->rc_size, NULL,
			    1 << tvd->vdev_ashift,
			    zio->io_type, zio->io_priority,
			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
		}

		return (ZIO_PIPELINE_CONTINUE);
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		/* Unreadable device: mark the column in error, don't issue. */
		if (!vdev_readable(cvd)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ENXIO);
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		/*
		 * The device's DTL says this txg is missing, so its copy
		 * of the data would be stale; skip the read.
		 */
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ESTALE);
			rc->rc_skipped = 1;
			continue;
		}
		/*
		 * Always read the data columns; read parity only if data
		 * is already known missing or this is a scrub/resilver.
		 */
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}
1654 
1655 
1656 /*
1657  * Report a checksum error for a child of a RAID-Z device.
1658  */
1659 static void
1660 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1661 {
1662 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1663 
1664 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1665 		zio_bad_cksum_t zbc;
1666 		raidz_map_t *rm = zio->io_vsd;
1667 
1668 		mutex_enter(&vd->vdev_stat_lock);
1669 		vd->vdev_stat.vs_checksum_errors++;
1670 		mutex_exit(&vd->vdev_stat_lock);
1671 
1672 		zbc.zbc_has_cksum = 0;
1673 		zbc.zbc_injected = rm->rm_ecksuminjected;
1674 
1675 		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1676 		    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1677 		    &zbc);
1678 	}
1679 }
1680 
1681 /*
1682  * We keep track of whether or not there were any injected errors, so that
1683  * any ereports we generate can note it.
1684  */
1685 static int
1686 raidz_checksum_verify(zio_t *zio)
1687 {
1688 	zio_bad_cksum_t zbc;
1689 	raidz_map_t *rm = zio->io_vsd;
1690 
1691 	int ret = zio_checksum_error(zio, &zbc);
1692 	if (ret != 0 && zbc.zbc_injected != 0)
1693 		rm->rm_ecksuminjected = 1;
1694 
1695 	return (ret);
1696 }
1697 
1698 /*
1699  * Generate the parity from the data columns. If we tried and were able to
1700  * read the parity without error, verify that the generated parity matches the
1701  * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
1703  */
1704 static int
1705 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1706 {
1707 	void *orig[VDEV_RAIDZ_MAXPARITY];
1708 	int c, ret = 0;
1709 	raidz_col_t *rc;
1710 
1711 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1712 		rc = &rm->rm_col[c];
1713 		if (!rc->rc_tried || rc->rc_error != 0)
1714 			continue;
1715 		orig[c] = zio_buf_alloc(rc->rc_size);
1716 		bcopy(rc->rc_data, orig[c], rc->rc_size);
1717 	}
1718 
1719 	vdev_raidz_generate_parity(rm);
1720 
1721 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1722 		rc = &rm->rm_col[c];
1723 		if (!rc->rc_tried || rc->rc_error != 0)
1724 			continue;
1725 		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1726 			raidz_checksum_error(zio, rc, orig[c]);
1727 			rc->rc_error = SET_ERROR(ECKSUM);
1728 			ret++;
1729 		}
1730 		zio_buf_free(orig[c], rc->rc_size);
1731 	}
1732 
1733 	return (ret);
1734 }
1735 
1736 /*
1737  * Keep statistics on all the ways that we used parity to correct data.
1738  */
1739 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1740 
1741 static int
1742 vdev_raidz_worst_error(raidz_map_t *rm)
1743 {
1744 	int error = 0;
1745 
1746 	for (int c = 0; c < rm->rm_cols; c++)
1747 		error = zio_worst_error(error, rm->rm_col[c].rc_error);
1748 
1749 	return (error);
1750 }
1751 
1752 /*
1753  * Iterate over all combinations of bad data and attempt a reconstruction.
1754  * Note that the algorithm below is non-optimal because it doesn't take into
1755  * account how reconstruction is actually performed. For example, with
1756  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1757  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1758  * cases we'd only use parity information in column 0.
1759  */
static int
vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
{
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition: tgts points one past the
	 * start of tstore, so tgts[-1] is a valid slot and the combination
	 * generator below can reference tgts[current - 1] when current == 0.
	 */
	tgts[-1] = -1;

	/* Try ever-larger sets of presumed-bad columns, n at a time. */
	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		/* One new save buffer per outer iteration; freed at done. */
		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		/* Enumerate every combination of n error-free columns. */
		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * The targeted columns held bad data; report
				 * a checksum error for any that were actually
				 * read from disk, and mark them all ECKSUM so
				 * they get rewritten.
				 */
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					if (rc->rc_tried)
						raidz_checksum_error(zio, rc,
						    orig[i]);
					rc->rc_error = SET_ERROR(ECKSUM);
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	/*
	 * All combinations failed; n overshot the last allocation by one
	 * (orig[0 .. n-2] were allocated), so back it off before freeing.
	 */
	n--;
done:
	for (i = 0; i < n; i++) {
		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}
1907 
1908 /*
1909  * Complete an IO operation on a RAIDZ VDev
1910  *
1911  * Outline:
1912  * - For write operations:
1913  *   1. Check for errors on the child IOs.
1914  *   2. Return, setting an error code if too few child VDevs were written
1915  *      to reconstruct the data later.  Note that partial writes are
1916  *      considered successful if they can be reconstructed at all.
1917  * - For read operations:
1918  *   1. Check for errors on the child IOs.
1919  *   2. If data errors occurred:
1920  *      a. Try to reassemble the data from the parity available.
1921  *      b. If we haven't yet read the parity drives, read them now.
1922  *      c. If all parity drives have been read but the data still doesn't
1923  *         reassemble with a correct checksum, then try combinatorial
1924  *         reconstruction.
1925  *      d. If that doesn't work, return an error.
1926  *   3. If there were unexpected errors or this is a resilver operation,
1927  *      rewrite the vdevs that had errors.
1928  */
static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	int unexpected_errors = 0;	/* errors on columns not skipped */
	int parity_errors = 0;		/* I/O errors on parity columns */
	int parity_untried = 0;		/* parity columns never read */
	int data_errors = 0;		/* I/O errors on data columns */
	int total_errors = 0;		/* parity_errors + data_errors */
	int n, c;
	int tgts[VDEV_RAIDZ_MAXPARITY];	/* data columns to reconstruct */
	int code;			/* reconstruction method bitmask */

	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Tally the outcome of each column's child I/O.  Columns with
	 * rc_skipped set were deliberately not issued, so an error there
	 * is expected; an error on any other column is unexpected.
	 */
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as a success.
		 * (If we couldn't write enough columns to reconstruct
		 * the data, the I/O failed.  Otherwise, good enough.)
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		/* XXPOLICY */
		if (total_errors > rm->rm_firstdatacol)
			zio->io_error = vdev_raidz_worst_error(rm);

		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);
	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(zio) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read error or
	 * a child silently returned bad data. Read every block so we can
	 * try again with as much data and parity as we can track down. If
	 * we've already been through once before, all children will be marked
	 * as tried so we'll proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	/*
	 * Find the first untried column; if there is one, mark the zio for
	 * re-execution (once) and issue reads for every remaining untried
	 * column before returning to await those child I/Os.
	 */
	for (c = 0; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_tried)
			continue;

		zio_vdev_io_redone(zio);
		do {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_devidx],
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		} while (++c < rm->rm_cols);

		return;
	}

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		zio->io_error = vdev_raidz_worst_error(rm);

	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(zio, rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		zio->io_error = SET_ERROR(ECKSUM);

		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			for (c = 0; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error == 0) {
					zio_bad_cksum_t zbc;
					zbc.zbc_has_cksum = 0;
					zbc.zbc_injected =
					    rm->rm_ecksuminjected;

					zfs_ereport_start_checksum(
					    zio->io_spa,
					    vd->vdev_child[rc->rc_devidx],
					    zio, rc->rc_offset, rc->rc_size,
					    (void *)(uintptr_t)c, &zbc);
				}
			}
		}
	}

done:
	/* The data now has a verified checksum; mark it so. */
	zio_checksum_verified(zio);

	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error == 0)
				continue;

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    ZIO_TYPE_WRITE, zio->io_priority,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}
2192 
2193 static void
2194 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2195 {
2196 	if (faulted > vd->vdev_nparity)
2197 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2198 		    VDEV_AUX_NO_REPLICAS);
2199 	else if (degraded + faulted != 0)
2200 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2201 	else
2202 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2203 }
2204 
/*
 * Operation vector wiring the RAID-Z implementation into the generic
 * vdev framework (slot order defined by vdev_ops_t in vdev_impl.h).
 */
vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	NULL,			/* unused slot -- see vdev_ops_t */
	NULL,			/* unused slot -- see vdev_ops_t */
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};
2217