xref: /illumos-gate/usr/src/uts/common/fs/zfs/vdev_raidz.c (revision dfc0fed81813380fb526f1a003d3be17ab26ce3b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2013 by Delphix. All rights reserved.
25  */
26 
27 #include <sys/zfs_context.h>
28 #include <sys/spa.h>
29 #include <sys/vdev_impl.h>
30 #include <sys/zio.h>
31 #include <sys/zio_checksum.h>
32 #include <sys/fs/zfs.h>
33 #include <sys/fm/fs/zfs.h>
34 
35 /*
36  * Virtual device vector for RAID-Z.
37  *
38  * This vdev supports single, double, and triple parity. For single parity,
39  * we use a simple XOR of all the data columns. For double or triple parity,
40  * we use a special case of Reed-Solomon coding. This extends the
41  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
42  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
43  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
44  * former is also based. The latter is designed to provide higher performance
45  * for writes.
46  *
47  * Note that the Plank paper claimed to support arbitrary N+M, but was then
48  * amended six years later identifying a critical flaw that invalidates its
49  * claims. Nevertheless, the technique can be adapted to work for up to
50  * triple parity. For additional parity, the amendment "Note: Correction to
51  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
52  * is viable, but the additional complexity means that write performance will
53  * suffer.
54  *
55  * All of the methods above operate on a Galois field, defined over the
56  * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
57  * can be expressed with a single byte. Briefly, the operations on the
58  * field are defined as follows:
59  *
60  *   o addition (+) is represented by a bitwise XOR
61  *   o subtraction (-) is therefore identical to addition: A + B = A - B
62  *   o multiplication of A by 2 is defined by the following bitwise expression:
63  *	(A * 2)_7 = A_6
64  *	(A * 2)_6 = A_5
65  *	(A * 2)_5 = A_4
66  *	(A * 2)_4 = A_3 + A_7
67  *	(A * 2)_3 = A_2 + A_7
68  *	(A * 2)_2 = A_1 + A_7
69  *	(A * 2)_1 = A_0
70  *	(A * 2)_0 = A_7
71  *
72  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
73  * As an aside, this multiplication is derived from the error correcting
74  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
75  *
76  * Observe that any number in the field (except for 0) can be expressed as a
77  * power of 2 -- a generator for the field. We store a table of the powers of
78  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
79  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
80  * than field addition). The inverse of a field element A (A^-1) is therefore
81  * A ^ (255 - 1) = A^254.
82  *
83  * The up-to-three parity columns, P, Q, R over several data columns,
84  * D_0, ... D_n-1, can be expressed by field operations:
85  *
86  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
87  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
88  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
89  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
90  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
91  *
92  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival
93  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
94  * independent coefficients. (There are no additional coefficients that have
95  * this property which is why the uncorrected Plank method breaks down.)
96  *
97  * See the reconstruction code below for how P, Q and R can used individually
98  * or in concert to recover missing data columns.
99  */
100 
101 typedef struct raidz_col {
102 	uint64_t rc_devidx;		/* child device index for I/O */
103 	uint64_t rc_offset;		/* device offset */
104 	uint64_t rc_size;		/* I/O size */
105 	void *rc_data;			/* I/O data */
106 	void *rc_gdata;			/* used to store the "good" version */
107 	int rc_error;			/* I/O error for this device */
108 	uint8_t rc_tried;		/* Did we attempt this I/O column? */
109 	uint8_t rc_skipped;		/* Did we skip this I/O column? */
110 } raidz_col_t;
111 
112 typedef struct raidz_map {
113 	uint64_t rm_cols;		/* Regular column count */
114 	uint64_t rm_scols;		/* Count including skipped columns */
115 	uint64_t rm_bigcols;		/* Number of oversized columns */
116 	uint64_t rm_asize;		/* Actual total I/O size */
117 	uint64_t rm_missingdata;	/* Count of missing data devices */
118 	uint64_t rm_missingparity;	/* Count of missing parity devices */
119 	uint64_t rm_firstdatacol;	/* First data column/parity count */
120 	uint64_t rm_nskip;		/* Skipped sectors for padding */
121 	uint64_t rm_skipstart;	/* Column index of padding start */
122 	void *rm_datacopy;		/* rm_asize-buffer of copied data */
123 	uintptr_t rm_reports;		/* # of referencing checksum reports */
124 	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
125 	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
126 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
127 } raidz_map_t;
128 
129 #define	VDEV_RAIDZ_P		0
130 #define	VDEV_RAIDZ_Q		1
131 #define	VDEV_RAIDZ_R		2
132 
133 #define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
134 #define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
135 
136 /*
137  * We provide a mechanism to perform the field multiplication operation on a
138  * 64-bit value all at once rather than a byte at a time. This works by
139  * creating a mask from the top bit in each byte and using that to
140  * conditionally apply the XOR of 0x1d.
141  */
142 #define	VDEV_RAIDZ_64MUL_2(x, mask) \
143 { \
144 	(mask) = (x) & 0x8080808080808080ULL; \
145 	(mask) = ((mask) << 1) - ((mask) >> 7); \
146 	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
147 	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
148 }
149 
150 #define	VDEV_RAIDZ_64MUL_4(x, mask) \
151 { \
152 	VDEV_RAIDZ_64MUL_2((x), mask); \
153 	VDEV_RAIDZ_64MUL_2((x), mask); \
154 }
155 
156 /*
157  * Force reconstruction to use the general purpose method.
158  */
159 int vdev_raidz_default_to_general;
160 
161 /*
162  * These two tables represent powers and logs of 2 in the Galois field defined
163  * above. These values were computed by repeatedly multiplying by 2 as above.
164  */
165 static const uint8_t vdev_raidz_pow2[256] = {
166 	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
167 	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
168 	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
169 	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
170 	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
171 	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
172 	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
173 	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
174 	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
175 	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
176 	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
177 	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
178 	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
179 	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
180 	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
181 	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
182 	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
183 	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
184 	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
185 	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
186 	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
187 	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
188 	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
189 	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
190 	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
191 	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
192 	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
193 	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
194 	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
195 	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
196 	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
197 	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
198 };
199 static const uint8_t vdev_raidz_log2[256] = {
200 	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
201 	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
202 	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
203 	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
204 	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
205 	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
206 	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
207 	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
208 	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
209 	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
210 	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
211 	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
212 	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
213 	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
214 	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
215 	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
216 	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
217 	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
218 	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
219 	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
220 	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
221 	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
222 	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
223 	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
224 	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
225 	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
226 	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
227 	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
228 	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
229 	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
230 	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
231 	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
232 };
233 
234 static void vdev_raidz_generate_parity(raidz_map_t *rm);
235 
236 /*
237  * Multiply a given number by 2 raised to the given power.
238  */
239 static uint8_t
240 vdev_raidz_exp2(uint_t a, int exp)
241 {
242 	if (a == 0)
243 		return (0);
244 
245 	ASSERT(exp >= 0);
246 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
247 
248 	exp += vdev_raidz_log2[a];
249 	if (exp > 255)
250 		exp -= 255;
251 
252 	return (vdev_raidz_pow2[exp]);
253 }
254 
255 static void
256 vdev_raidz_map_free(raidz_map_t *rm)
257 {
258 	int c;
259 	size_t size;
260 
261 	for (c = 0; c < rm->rm_firstdatacol; c++) {
262 		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
263 
264 		if (rm->rm_col[c].rc_gdata != NULL)
265 			zio_buf_free(rm->rm_col[c].rc_gdata,
266 			    rm->rm_col[c].rc_size);
267 	}
268 
269 	size = 0;
270 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
271 		size += rm->rm_col[c].rc_size;
272 
273 	if (rm->rm_datacopy != NULL)
274 		zio_buf_free(rm->rm_datacopy, size);
275 
276 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
277 }
278 
279 static void
280 vdev_raidz_map_free_vsd(zio_t *zio)
281 {
282 	raidz_map_t *rm = zio->io_vsd;
283 
284 	ASSERT0(rm->rm_freed);
285 	rm->rm_freed = 1;
286 
287 	if (rm->rm_reports == 0)
288 		vdev_raidz_map_free(rm);
289 }
290 
291 /*ARGSUSED*/
292 static void
293 vdev_raidz_cksum_free(void *arg, size_t ignored)
294 {
295 	raidz_map_t *rm = arg;
296 
297 	ASSERT3U(rm->rm_reports, >, 0);
298 
299 	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
300 		vdev_raidz_map_free(rm);
301 }
302 
303 static void
304 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
305 {
306 	raidz_map_t *rm = zcr->zcr_cbdata;
307 	size_t c = zcr->zcr_cbinfo;
308 	size_t x;
309 
310 	const char *good = NULL;
311 	const char *bad = rm->rm_col[c].rc_data;
312 
313 	if (good_data == NULL) {
314 		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
315 		return;
316 	}
317 
318 	if (c < rm->rm_firstdatacol) {
319 		/*
320 		 * The first time through, calculate the parity blocks for
321 		 * the good data (this relies on the fact that the good
322 		 * data never changes for a given logical ZIO)
323 		 */
324 		if (rm->rm_col[0].rc_gdata == NULL) {
325 			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
326 			char *buf;
327 
328 			/*
329 			 * Set up the rm_col[]s to generate the parity for
330 			 * good_data, first saving the parity bufs and
331 			 * replacing them with buffers to hold the result.
332 			 */
333 			for (x = 0; x < rm->rm_firstdatacol; x++) {
334 				bad_parity[x] = rm->rm_col[x].rc_data;
335 				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
336 				    zio_buf_alloc(rm->rm_col[x].rc_size);
337 			}
338 
339 			/* fill in the data columns from good_data */
340 			buf = (char *)good_data;
341 			for (; x < rm->rm_cols; x++) {
342 				rm->rm_col[x].rc_data = buf;
343 				buf += rm->rm_col[x].rc_size;
344 			}
345 
346 			/*
347 			 * Construct the parity from the good data.
348 			 */
349 			vdev_raidz_generate_parity(rm);
350 
351 			/* restore everything back to its original state */
352 			for (x = 0; x < rm->rm_firstdatacol; x++)
353 				rm->rm_col[x].rc_data = bad_parity[x];
354 
355 			buf = rm->rm_datacopy;
356 			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
357 				rm->rm_col[x].rc_data = buf;
358 				buf += rm->rm_col[x].rc_size;
359 			}
360 		}
361 
362 		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
363 		good = rm->rm_col[c].rc_gdata;
364 	} else {
365 		/* adjust good_data to point at the start of our column */
366 		good = good_data;
367 
368 		for (x = rm->rm_firstdatacol; x < c; x++)
369 			good += rm->rm_col[x].rc_size;
370 	}
371 
372 	/* we drop the ereport if it ends up that the data was good */
373 	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
374 }
375 
376 /*
377  * Invoked indirectly by zfs_ereport_start_checksum(), called
378  * below when our read operation fails completely.  The main point
379  * is to keep a copy of everything we read from disk, so that at
380  * vdev_raidz_cksum_finish() time we can compare it with the good data.
381  */
382 static void
383 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
384 {
385 	size_t c = (size_t)(uintptr_t)arg;
386 	caddr_t buf;
387 
388 	raidz_map_t *rm = zio->io_vsd;
389 	size_t size;
390 
391 	/* set up the report and bump the refcount  */
392 	zcr->zcr_cbdata = rm;
393 	zcr->zcr_cbinfo = c;
394 	zcr->zcr_finish = vdev_raidz_cksum_finish;
395 	zcr->zcr_free = vdev_raidz_cksum_free;
396 
397 	rm->rm_reports++;
398 	ASSERT3U(rm->rm_reports, >, 0);
399 
400 	if (rm->rm_datacopy != NULL)
401 		return;
402 
403 	/*
404 	 * It's the first time we're called for this raidz_map_t, so we need
405 	 * to copy the data aside; there's no guarantee that our zio's buffer
406 	 * won't be re-used for something else.
407 	 *
408 	 * Our parity data is already in separate buffers, so there's no need
409 	 * to copy them.
410 	 */
411 
412 	size = 0;
413 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
414 		size += rm->rm_col[c].rc_size;
415 
416 	buf = rm->rm_datacopy = zio_buf_alloc(size);
417 
418 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
419 		raidz_col_t *col = &rm->rm_col[c];
420 
421 		bcopy(col->rc_data, buf, col->rc_size);
422 		col->rc_data = buf;
423 
424 		buf += col->rc_size;
425 	}
426 	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
427 }
428 
429 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
430 	vdev_raidz_map_free_vsd,
431 	vdev_raidz_cksum_report
432 };
433 
434 static raidz_map_t *
435 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
436     uint64_t nparity)
437 {
438 	raidz_map_t *rm;
439 	uint64_t b = zio->io_offset >> unit_shift;
440 	uint64_t s = zio->io_size >> unit_shift;
441 	uint64_t f = b % dcols;
442 	uint64_t o = (b / dcols) << unit_shift;
443 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
444 
445 	q = s / (dcols - nparity);
446 	r = s - q * (dcols - nparity);
447 	bc = (r == 0 ? 0 : r + nparity);
448 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
449 
450 	if (q == 0) {
451 		acols = bc;
452 		scols = MIN(dcols, roundup(bc, nparity + 1));
453 	} else {
454 		acols = dcols;
455 		scols = dcols;
456 	}
457 
458 	ASSERT3U(acols, <=, scols);
459 
460 	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
461 
462 	rm->rm_cols = acols;
463 	rm->rm_scols = scols;
464 	rm->rm_bigcols = bc;
465 	rm->rm_skipstart = bc;
466 	rm->rm_missingdata = 0;
467 	rm->rm_missingparity = 0;
468 	rm->rm_firstdatacol = nparity;
469 	rm->rm_datacopy = NULL;
470 	rm->rm_reports = 0;
471 	rm->rm_freed = 0;
472 	rm->rm_ecksuminjected = 0;
473 
474 	asize = 0;
475 
476 	for (c = 0; c < scols; c++) {
477 		col = f + c;
478 		coff = o;
479 		if (col >= dcols) {
480 			col -= dcols;
481 			coff += 1ULL << unit_shift;
482 		}
483 		rm->rm_col[c].rc_devidx = col;
484 		rm->rm_col[c].rc_offset = coff;
485 		rm->rm_col[c].rc_data = NULL;
486 		rm->rm_col[c].rc_gdata = NULL;
487 		rm->rm_col[c].rc_error = 0;
488 		rm->rm_col[c].rc_tried = 0;
489 		rm->rm_col[c].rc_skipped = 0;
490 
491 		if (c >= acols)
492 			rm->rm_col[c].rc_size = 0;
493 		else if (c < bc)
494 			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
495 		else
496 			rm->rm_col[c].rc_size = q << unit_shift;
497 
498 		asize += rm->rm_col[c].rc_size;
499 	}
500 
501 	ASSERT3U(asize, ==, tot << unit_shift);
502 	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
503 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
504 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
505 	ASSERT3U(rm->rm_nskip, <=, nparity);
506 
507 	for (c = 0; c < rm->rm_firstdatacol; c++)
508 		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
509 
510 	rm->rm_col[c].rc_data = zio->io_data;
511 
512 	for (c = c + 1; c < acols; c++)
513 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
514 		    rm->rm_col[c - 1].rc_size;
515 
516 	/*
517 	 * If all data stored spans all columns, there's a danger that parity
518 	 * will always be on the same device and, since parity isn't read
519 	 * during normal operation, that that device's I/O bandwidth won't be
520 	 * used effectively. We therefore switch the parity every 1MB.
521 	 *
522 	 * ... at least that was, ostensibly, the theory. As a practical
523 	 * matter unless we juggle the parity between all devices evenly, we
524 	 * won't see any benefit. Further, occasional writes that aren't a
525 	 * multiple of the LCM of the number of children and the minimum
526 	 * stripe width are sufficient to avoid pessimal behavior.
527 	 * Unfortunately, this decision created an implicit on-disk format
528 	 * requirement that we need to support for all eternity, but only
529 	 * for single-parity RAID-Z.
530 	 *
531 	 * If we intend to skip a sector in the zeroth column for padding
532 	 * we must make sure to note this swap. We will never intend to
533 	 * skip the first column since at least one data and one parity
534 	 * column must appear in each row.
535 	 */
536 	ASSERT(rm->rm_cols >= 2);
537 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
538 
539 	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
540 		devidx = rm->rm_col[0].rc_devidx;
541 		o = rm->rm_col[0].rc_offset;
542 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
543 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
544 		rm->rm_col[1].rc_devidx = devidx;
545 		rm->rm_col[1].rc_offset = o;
546 
547 		if (rm->rm_skipstart == 0)
548 			rm->rm_skipstart = 1;
549 	}
550 
551 	zio->io_vsd = rm;
552 	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
553 	return (rm);
554 }
555 
556 static void
557 vdev_raidz_generate_parity_p(raidz_map_t *rm)
558 {
559 	uint64_t *p, *src, pcount, ccount, i;
560 	int c;
561 
562 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
563 
564 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
565 		src = rm->rm_col[c].rc_data;
566 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
567 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
568 
569 		if (c == rm->rm_firstdatacol) {
570 			ASSERT(ccount == pcount);
571 			for (i = 0; i < ccount; i++, src++, p++) {
572 				*p = *src;
573 			}
574 		} else {
575 			ASSERT(ccount <= pcount);
576 			for (i = 0; i < ccount; i++, src++, p++) {
577 				*p ^= *src;
578 			}
579 		}
580 	}
581 }
582 
583 static void
584 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
585 {
586 	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
587 	int c;
588 
589 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
590 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
591 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
592 
593 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
594 		src = rm->rm_col[c].rc_data;
595 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
596 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
597 
598 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
599 
600 		if (c == rm->rm_firstdatacol) {
601 			ASSERT(ccnt == pcnt || ccnt == 0);
602 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
603 				*p = *src;
604 				*q = *src;
605 			}
606 			for (; i < pcnt; i++, src++, p++, q++) {
607 				*p = 0;
608 				*q = 0;
609 			}
610 		} else {
611 			ASSERT(ccnt <= pcnt);
612 
613 			/*
614 			 * Apply the algorithm described above by multiplying
615 			 * the previous result and adding in the new value.
616 			 */
617 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
618 				*p ^= *src;
619 
620 				VDEV_RAIDZ_64MUL_2(*q, mask);
621 				*q ^= *src;
622 			}
623 
624 			/*
625 			 * Treat short columns as though they are full of 0s.
626 			 * Note that there's therefore nothing needed for P.
627 			 */
628 			for (; i < pcnt; i++, q++) {
629 				VDEV_RAIDZ_64MUL_2(*q, mask);
630 			}
631 		}
632 	}
633 }
634 
635 static void
636 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
637 {
638 	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
639 	int c;
640 
641 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
642 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
643 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
644 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
645 	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
646 
647 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
648 		src = rm->rm_col[c].rc_data;
649 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
650 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
651 		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
652 
653 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
654 
655 		if (c == rm->rm_firstdatacol) {
656 			ASSERT(ccnt == pcnt || ccnt == 0);
657 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
658 				*p = *src;
659 				*q = *src;
660 				*r = *src;
661 			}
662 			for (; i < pcnt; i++, src++, p++, q++, r++) {
663 				*p = 0;
664 				*q = 0;
665 				*r = 0;
666 			}
667 		} else {
668 			ASSERT(ccnt <= pcnt);
669 
670 			/*
671 			 * Apply the algorithm described above by multiplying
672 			 * the previous result and adding in the new value.
673 			 */
674 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
675 				*p ^= *src;
676 
677 				VDEV_RAIDZ_64MUL_2(*q, mask);
678 				*q ^= *src;
679 
680 				VDEV_RAIDZ_64MUL_4(*r, mask);
681 				*r ^= *src;
682 			}
683 
684 			/*
685 			 * Treat short columns as though they are full of 0s.
686 			 * Note that there's therefore nothing needed for P.
687 			 */
688 			for (; i < pcnt; i++, q++, r++) {
689 				VDEV_RAIDZ_64MUL_2(*q, mask);
690 				VDEV_RAIDZ_64MUL_4(*r, mask);
691 			}
692 		}
693 	}
694 }
695 
696 /*
697  * Generate RAID parity in the first virtual columns according to the number of
698  * parity columns available.
699  */
700 static void
701 vdev_raidz_generate_parity(raidz_map_t *rm)
702 {
703 	switch (rm->rm_firstdatacol) {
704 	case 1:
705 		vdev_raidz_generate_parity_p(rm);
706 		break;
707 	case 2:
708 		vdev_raidz_generate_parity_pq(rm);
709 		break;
710 	case 3:
711 		vdev_raidz_generate_parity_pqr(rm);
712 		break;
713 	default:
714 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
715 	}
716 }
717 
718 static int
719 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
720 {
721 	uint64_t *dst, *src, xcount, ccount, count, i;
722 	int x = tgts[0];
723 	int c;
724 
725 	ASSERT(ntgts == 1);
726 	ASSERT(x >= rm->rm_firstdatacol);
727 	ASSERT(x < rm->rm_cols);
728 
729 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
730 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
731 	ASSERT(xcount > 0);
732 
733 	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
734 	dst = rm->rm_col[x].rc_data;
735 	for (i = 0; i < xcount; i++, dst++, src++) {
736 		*dst = *src;
737 	}
738 
739 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
740 		src = rm->rm_col[c].rc_data;
741 		dst = rm->rm_col[x].rc_data;
742 
743 		if (c == x)
744 			continue;
745 
746 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
747 		count = MIN(ccount, xcount);
748 
749 		for (i = 0; i < count; i++, dst++, src++) {
750 			*dst ^= *src;
751 		}
752 	}
753 
754 	return (1 << VDEV_RAIDZ_P);
755 }
756 
757 static int
758 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
759 {
760 	uint64_t *dst, *src, xcount, ccount, count, mask, i;
761 	uint8_t *b;
762 	int x = tgts[0];
763 	int c, j, exp;
764 
765 	ASSERT(ntgts == 1);
766 
767 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
768 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
769 
770 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
771 		src = rm->rm_col[c].rc_data;
772 		dst = rm->rm_col[x].rc_data;
773 
774 		if (c == x)
775 			ccount = 0;
776 		else
777 			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
778 
779 		count = MIN(ccount, xcount);
780 
781 		if (c == rm->rm_firstdatacol) {
782 			for (i = 0; i < count; i++, dst++, src++) {
783 				*dst = *src;
784 			}
785 			for (; i < xcount; i++, dst++) {
786 				*dst = 0;
787 			}
788 
789 		} else {
790 			for (i = 0; i < count; i++, dst++, src++) {
791 				VDEV_RAIDZ_64MUL_2(*dst, mask);
792 				*dst ^= *src;
793 			}
794 
795 			for (; i < xcount; i++, dst++) {
796 				VDEV_RAIDZ_64MUL_2(*dst, mask);
797 			}
798 		}
799 	}
800 
801 	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
802 	dst = rm->rm_col[x].rc_data;
803 	exp = 255 - (rm->rm_cols - 1 - x);
804 
805 	for (i = 0; i < xcount; i++, dst++, src++) {
806 		*dst ^= *src;
807 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
808 			*b = vdev_raidz_exp2(*b, exp);
809 		}
810 	}
811 
812 	return (1 << VDEV_RAIDZ_Q);
813 }
814 
815 static int
816 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
817 {
818 	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
819 	void *pdata, *qdata;
820 	uint64_t xsize, ysize, i;
821 	int x = tgts[0];
822 	int y = tgts[1];
823 
824 	ASSERT(ntgts == 2);
825 	ASSERT(x < y);
826 	ASSERT(x >= rm->rm_firstdatacol);
827 	ASSERT(y < rm->rm_cols);
828 
829 	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
830 
831 	/*
832 	 * Move the parity data aside -- we're going to compute parity as
833 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
834 	 * reuse the parity generation mechanism without trashing the actual
835 	 * parity so we make those columns appear to be full of zeros by
836 	 * setting their lengths to zero.
837 	 */
838 	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
839 	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
840 	xsize = rm->rm_col[x].rc_size;
841 	ysize = rm->rm_col[y].rc_size;
842 
843 	rm->rm_col[VDEV_RAIDZ_P].rc_data =
844 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
845 	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
846 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
847 	rm->rm_col[x].rc_size = 0;
848 	rm->rm_col[y].rc_size = 0;
849 
850 	vdev_raidz_generate_parity_pq(rm);
851 
852 	rm->rm_col[x].rc_size = xsize;
853 	rm->rm_col[y].rc_size = ysize;
854 
855 	p = pdata;
856 	q = qdata;
857 	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
858 	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
859 	xd = rm->rm_col[x].rc_data;
860 	yd = rm->rm_col[y].rc_data;
861 
862 	/*
863 	 * We now have:
864 	 *	Pxy = P + D_x + D_y
865 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
866 	 *
867 	 * We can then solve for D_x:
868 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
869 	 * where
870 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
871 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
872 	 *
873 	 * With D_x in hand, we can easily solve for D_y:
874 	 *	D_y = P + Pxy + D_x
875 	 */
876 
877 	a = vdev_raidz_pow2[255 + x - y];
878 	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
879 	tmp = 255 - vdev_raidz_log2[a ^ 1];
880 
881 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
882 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
883 
884 	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
885 		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
886 		    vdev_raidz_exp2(*q ^ *qxy, bexp);
887 
888 		if (i < ysize)
889 			*yd = *p ^ *pxy ^ *xd;
890 	}
891 
892 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
893 	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
894 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
895 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
896 
897 	/*
898 	 * Restore the saved parity data.
899 	 */
900 	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
901 	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
902 
903 	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
904 }
905 
906 /* BEGIN CSTYLED */
907 /*
908  * In the general case of reconstruction, we must solve the system of linear
909  * equations defined by the coeffecients used to generate parity as well as
910  * the contents of the data and parity disks. This can be expressed with
911  * vectors for the original data (D) and the actual data (d) and parity (p)
912  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
913  *
914  *            __   __                     __     __
915  *            |     |         __     __   |  p_0  |
916  *            |  V  |         |  D_0  |   | p_m-1 |
917  *            |     |    x    |   :   | = |  d_0  |
918  *            |  I  |         | D_n-1 |   |   :   |
919  *            |     |         ~~     ~~   | d_n-1 |
920  *            ~~   ~~                     ~~     ~~
921  *
922  * I is simply a square identity matrix of size n, and V is a vandermonde
923  * matrix defined by the coeffecients we chose for the various parity columns
924  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
925  * computation as well as linear separability.
926  *
927  *      __               __               __     __
928  *      |   1   ..  1 1 1 |               |  p_0  |
929  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
930  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
931  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
932  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
933  *      |   :       : : : |   |   :   |   |  d_2  |
934  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
935  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
936  *      |   0   ..  0 0 1 |               | d_n-1 |
937  *      ~~               ~~               ~~     ~~
938  *
939  * Note that I, V, d, and p are known. To compute D, we must invert the
940  * matrix and use the known data and parity values to reconstruct the unknown
941  * data values. We begin by removing the rows in V|I and d|p that correspond
942  * to failed or missing columns; we then make V|I square (n x n) and d|p
943  * sized n by removing rows corresponding to unused parity from the bottom up
944  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
945  * using Gauss-Jordan elimination. In the example below we use m=3 parity
946  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
947  *           __                               __
948  *           |  1   1   1   1   1   1   1   1  |
949  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
950  *           |  19 205 116  29  64  16  4   1  |      / /
951  *           |  1   0   0   0   0   0   0   0  |     / /
952  *           |  0   1   0   0   0   0   0   0  | <--' /
953  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
954  *           |  0   0   0   1   0   0   0   0  |
955  *           |  0   0   0   0   1   0   0   0  |
956  *           |  0   0   0   0   0   1   0   0  |
957  *           |  0   0   0   0   0   0   1   0  |
958  *           |  0   0   0   0   0   0   0   1  |
959  *           ~~                               ~~
960  *           __                               __
961  *           |  1   1   1   1   1   1   1   1  |
962  *           | 128  64  32  16  8   4   2   1  |
963  *           |  19 205 116  29  64  16  4   1  |
964  *           |  1   0   0   0   0   0   0   0  |
965  *           |  0   1   0   0   0   0   0   0  |
966  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
967  *           |  0   0   0   1   0   0   0   0  |
968  *           |  0   0   0   0   1   0   0   0  |
969  *           |  0   0   0   0   0   1   0   0  |
970  *           |  0   0   0   0   0   0   1   0  |
971  *           |  0   0   0   0   0   0   0   1  |
972  *           ~~                               ~~
973  *
974  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
975  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
976  * matrix is not singular.
977  * __                                                                 __
978  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
979  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
980  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
981  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
982  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
983  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
984  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
985  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
986  * ~~                                                                 ~~
987  * __                                                                 __
988  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
989  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
990  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
991  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
992  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
993  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
994  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
995  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
996  * ~~                                                                 ~~
997  * __                                                                 __
998  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
999  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1000  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1001  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1002  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1003  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1004  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1005  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1006  * ~~                                                                 ~~
1007  * __                                                                 __
1008  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1009  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1010  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1011  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1012  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1013  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1014  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1015  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1016  * ~~                                                                 ~~
1017  * __                                                                 __
1018  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1019  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1020  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1021  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1022  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1023  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1024  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1025  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1026  * ~~                                                                 ~~
1027  * __                                                                 __
1028  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1029  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1030  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1031  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1032  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1033  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1034  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1035  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1036  * ~~                                                                 ~~
1037  *                   __                               __
1038  *                   |  0   0   1   0   0   0   0   0  |
1039  *                   | 167 100  5   41 159 169 217 208 |
1040  *                   | 166 100  4   40 158 168 216 209 |
1041  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1042  *                   |  0   0   0   0   1   0   0   0  |
1043  *                   |  0   0   0   0   0   1   0   0  |
1044  *                   |  0   0   0   0   0   0   1   0  |
1045  *                   |  0   0   0   0   0   0   0   1  |
1046  *                   ~~                               ~~
1047  *
1048  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1049  * of the missing data.
1050  *
1051  * As is apparent from the example above, the only non-trivial rows in the
1052  * inverse matrix correspond to the data disks that we're trying to
1053  * reconstruct. Indeed, those are the only rows we need as the others would
1054  * only be useful for reconstructing data known or assumed to be valid. For
1055  * that reason, we only build the coefficients in the rows that correspond to
1056  * targeted columns.
1057  */
1058 /* END CSTYLED */
1059 
1060 static void
1061 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1062     uint8_t **rows)
1063 {
1064 	int i, j;
1065 	int pow;
1066 
1067 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1068 
1069 	/*
1070 	 * Fill in the missing rows of interest.
1071 	 */
1072 	for (i = 0; i < nmap; i++) {
1073 		ASSERT3S(0, <=, map[i]);
1074 		ASSERT3S(map[i], <=, 2);
1075 
1076 		pow = map[i] * n;
1077 		if (pow > 255)
1078 			pow -= 255;
1079 		ASSERT(pow <= 255);
1080 
1081 		for (j = 0; j < n; j++) {
1082 			pow -= map[i];
1083 			if (pow < 0)
1084 				pow += 255;
1085 			rows[i][j] = vdev_raidz_pow2[pow];
1086 		}
1087 	}
1088 }
1089 
1090 static void
1091 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1092     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1093 {
1094 	int i, j, ii, jj;
1095 	uint8_t log;
1096 
1097 	/*
1098 	 * Assert that the first nmissing entries from the array of used
1099 	 * columns correspond to parity columns and that subsequent entries
1100 	 * correspond to data columns.
1101 	 */
1102 	for (i = 0; i < nmissing; i++) {
1103 		ASSERT3S(used[i], <, rm->rm_firstdatacol);
1104 	}
1105 	for (; i < n; i++) {
1106 		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1107 	}
1108 
1109 	/*
1110 	 * First initialize the storage where we'll compute the inverse rows.
1111 	 */
1112 	for (i = 0; i < nmissing; i++) {
1113 		for (j = 0; j < n; j++) {
1114 			invrows[i][j] = (i == j) ? 1 : 0;
1115 		}
1116 	}
1117 
1118 	/*
1119 	 * Subtract all trivial rows from the rows of consequence.
1120 	 */
1121 	for (i = 0; i < nmissing; i++) {
1122 		for (j = nmissing; j < n; j++) {
1123 			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1124 			jj = used[j] - rm->rm_firstdatacol;
1125 			ASSERT3S(jj, <, n);
1126 			invrows[i][j] = rows[i][jj];
1127 			rows[i][jj] = 0;
1128 		}
1129 	}
1130 
1131 	/*
1132 	 * For each of the rows of interest, we must normalize it and subtract
1133 	 * a multiple of it from the other rows.
1134 	 */
1135 	for (i = 0; i < nmissing; i++) {
1136 		for (j = 0; j < missing[i]; j++) {
1137 			ASSERT0(rows[i][j]);
1138 		}
1139 		ASSERT3U(rows[i][missing[i]], !=, 0);
1140 
1141 		/*
1142 		 * Compute the inverse of the first element and multiply each
1143 		 * element in the row by that value.
1144 		 */
1145 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1146 
1147 		for (j = 0; j < n; j++) {
1148 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1149 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1150 		}
1151 
1152 		for (ii = 0; ii < nmissing; ii++) {
1153 			if (i == ii)
1154 				continue;
1155 
1156 			ASSERT3U(rows[ii][missing[i]], !=, 0);
1157 
1158 			log = vdev_raidz_log2[rows[ii][missing[i]]];
1159 
1160 			for (j = 0; j < n; j++) {
1161 				rows[ii][j] ^=
1162 				    vdev_raidz_exp2(rows[i][j], log);
1163 				invrows[ii][j] ^=
1164 				    vdev_raidz_exp2(invrows[i][j], log);
1165 			}
1166 		}
1167 	}
1168 
1169 	/*
1170 	 * Verify that the data that is left in the rows are properly part of
1171 	 * an identity matrix.
1172 	 */
1173 	for (i = 0; i < nmissing; i++) {
1174 		for (j = 0; j < n; j++) {
1175 			if (j == missing[i]) {
1176 				ASSERT3U(rows[i][j], ==, 1);
1177 			} else {
1178 				ASSERT0(rows[i][j]);
1179 			}
1180 		}
1181 	}
1182 }
1183 
1184 static void
1185 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1186     int *missing, uint8_t **invrows, const uint8_t *used)
1187 {
1188 	int i, j, x, cc, c;
1189 	uint8_t *src;
1190 	uint64_t ccount;
1191 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1192 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1193 	uint8_t log = 0;
1194 	uint8_t val;
1195 	int ll;
1196 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1197 	uint8_t *p, *pp;
1198 	size_t psize;
1199 
1200 	psize = sizeof (invlog[0][0]) * n * nmissing;
1201 	p = kmem_alloc(psize, KM_SLEEP);
1202 
1203 	for (pp = p, i = 0; i < nmissing; i++) {
1204 		invlog[i] = pp;
1205 		pp += n;
1206 	}
1207 
1208 	for (i = 0; i < nmissing; i++) {
1209 		for (j = 0; j < n; j++) {
1210 			ASSERT3U(invrows[i][j], !=, 0);
1211 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1212 		}
1213 	}
1214 
1215 	for (i = 0; i < n; i++) {
1216 		c = used[i];
1217 		ASSERT3U(c, <, rm->rm_cols);
1218 
1219 		src = rm->rm_col[c].rc_data;
1220 		ccount = rm->rm_col[c].rc_size;
1221 		for (j = 0; j < nmissing; j++) {
1222 			cc = missing[j] + rm->rm_firstdatacol;
1223 			ASSERT3U(cc, >=, rm->rm_firstdatacol);
1224 			ASSERT3U(cc, <, rm->rm_cols);
1225 			ASSERT3U(cc, !=, c);
1226 
1227 			dst[j] = rm->rm_col[cc].rc_data;
1228 			dcount[j] = rm->rm_col[cc].rc_size;
1229 		}
1230 
1231 		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1232 
1233 		for (x = 0; x < ccount; x++, src++) {
1234 			if (*src != 0)
1235 				log = vdev_raidz_log2[*src];
1236 
1237 			for (cc = 0; cc < nmissing; cc++) {
1238 				if (x >= dcount[cc])
1239 					continue;
1240 
1241 				if (*src == 0) {
1242 					val = 0;
1243 				} else {
1244 					if ((ll = log + invlog[cc][i]) >= 255)
1245 						ll -= 255;
1246 					val = vdev_raidz_pow2[ll];
1247 				}
1248 
1249 				if (i == 0)
1250 					dst[cc][x] = val;
1251 				else
1252 					dst[cc][x] ^= val;
1253 			}
1254 		}
1255 	}
1256 
1257 	kmem_free(p, psize);
1258 }
1259 
1260 static int
1261 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1262 {
1263 	int n, i, c, t, tt;
1264 	int nmissing_rows;
1265 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1266 	int parity_map[VDEV_RAIDZ_MAXPARITY];
1267 
1268 	uint8_t *p, *pp;
1269 	size_t psize;
1270 
1271 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1272 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1273 	uint8_t *used;
1274 
1275 	int code = 0;
1276 
1277 
1278 	n = rm->rm_cols - rm->rm_firstdatacol;
1279 
1280 	/*
1281 	 * Figure out which data columns are missing.
1282 	 */
1283 	nmissing_rows = 0;
1284 	for (t = 0; t < ntgts; t++) {
1285 		if (tgts[t] >= rm->rm_firstdatacol) {
1286 			missing_rows[nmissing_rows++] =
1287 			    tgts[t] - rm->rm_firstdatacol;
1288 		}
1289 	}
1290 
1291 	/*
1292 	 * Figure out which parity columns to use to help generate the missing
1293 	 * data columns.
1294 	 */
1295 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1296 		ASSERT(tt < ntgts);
1297 		ASSERT(c < rm->rm_firstdatacol);
1298 
1299 		/*
1300 		 * Skip any targeted parity columns.
1301 		 */
1302 		if (c == tgts[tt]) {
1303 			tt++;
1304 			continue;
1305 		}
1306 
1307 		code |= 1 << c;
1308 
1309 		parity_map[i] = c;
1310 		i++;
1311 	}
1312 
1313 	ASSERT(code != 0);
1314 	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1315 
1316 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1317 	    nmissing_rows * n + sizeof (used[0]) * n;
1318 	p = kmem_alloc(psize, KM_SLEEP);
1319 
1320 	for (pp = p, i = 0; i < nmissing_rows; i++) {
1321 		rows[i] = pp;
1322 		pp += n;
1323 		invrows[i] = pp;
1324 		pp += n;
1325 	}
1326 	used = pp;
1327 
1328 	for (i = 0; i < nmissing_rows; i++) {
1329 		used[i] = parity_map[i];
1330 	}
1331 
1332 	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1333 		if (tt < nmissing_rows &&
1334 		    c == missing_rows[tt] + rm->rm_firstdatacol) {
1335 			tt++;
1336 			continue;
1337 		}
1338 
1339 		ASSERT3S(i, <, n);
1340 		used[i] = c;
1341 		i++;
1342 	}
1343 
1344 	/*
1345 	 * Initialize the interesting rows of the matrix.
1346 	 */
1347 	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1348 
1349 	/*
1350 	 * Invert the matrix.
1351 	 */
1352 	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1353 	    invrows, used);
1354 
1355 	/*
1356 	 * Reconstruct the missing data using the generated matrix.
1357 	 */
1358 	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1359 	    invrows, used);
1360 
1361 	kmem_free(p, psize);
1362 
1363 	return (code);
1364 }
1365 
1366 static int
1367 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1368 {
1369 	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1370 	int ntgts;
1371 	int i, c;
1372 	int code;
1373 	int nbadparity, nbaddata;
1374 	int parity_valid[VDEV_RAIDZ_MAXPARITY];
1375 
1376 	/*
1377 	 * The tgts list must already be sorted.
1378 	 */
1379 	for (i = 1; i < nt; i++) {
1380 		ASSERT(t[i] > t[i - 1]);
1381 	}
1382 
1383 	nbadparity = rm->rm_firstdatacol;
1384 	nbaddata = rm->rm_cols - nbadparity;
1385 	ntgts = 0;
1386 	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1387 		if (c < rm->rm_firstdatacol)
1388 			parity_valid[c] = B_FALSE;
1389 
1390 		if (i < nt && c == t[i]) {
1391 			tgts[ntgts++] = c;
1392 			i++;
1393 		} else if (rm->rm_col[c].rc_error != 0) {
1394 			tgts[ntgts++] = c;
1395 		} else if (c >= rm->rm_firstdatacol) {
1396 			nbaddata--;
1397 		} else {
1398 			parity_valid[c] = B_TRUE;
1399 			nbadparity--;
1400 		}
1401 	}
1402 
1403 	ASSERT(ntgts >= nt);
1404 	ASSERT(nbaddata >= 0);
1405 	ASSERT(nbaddata + nbadparity == ntgts);
1406 
1407 	dt = &tgts[nbadparity];
1408 
1409 	/*
1410 	 * See if we can use any of our optimized reconstruction routines.
1411 	 */
1412 	if (!vdev_raidz_default_to_general) {
1413 		switch (nbaddata) {
1414 		case 1:
1415 			if (parity_valid[VDEV_RAIDZ_P])
1416 				return (vdev_raidz_reconstruct_p(rm, dt, 1));
1417 
1418 			ASSERT(rm->rm_firstdatacol > 1);
1419 
1420 			if (parity_valid[VDEV_RAIDZ_Q])
1421 				return (vdev_raidz_reconstruct_q(rm, dt, 1));
1422 
1423 			ASSERT(rm->rm_firstdatacol > 2);
1424 			break;
1425 
1426 		case 2:
1427 			ASSERT(rm->rm_firstdatacol > 1);
1428 
1429 			if (parity_valid[VDEV_RAIDZ_P] &&
1430 			    parity_valid[VDEV_RAIDZ_Q])
1431 				return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1432 
1433 			ASSERT(rm->rm_firstdatacol > 2);
1434 
1435 			break;
1436 		}
1437 	}
1438 
1439 	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1440 	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1441 	ASSERT(code > 0);
1442 	return (code);
1443 }
1444 
1445 static int
1446 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1447     uint64_t *ashift)
1448 {
1449 	vdev_t *cvd;
1450 	uint64_t nparity = vd->vdev_nparity;
1451 	int c;
1452 	int lasterror = 0;
1453 	int numerrors = 0;
1454 
1455 	ASSERT(nparity > 0);
1456 
1457 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
1458 	    vd->vdev_children < nparity + 1) {
1459 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1460 		return (SET_ERROR(EINVAL));
1461 	}
1462 
1463 	vdev_open_children(vd);
1464 
1465 	for (c = 0; c < vd->vdev_children; c++) {
1466 		cvd = vd->vdev_child[c];
1467 
1468 		if (cvd->vdev_open_error != 0) {
1469 			lasterror = cvd->vdev_open_error;
1470 			numerrors++;
1471 			continue;
1472 		}
1473 
1474 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1475 		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1476 		*ashift = MAX(*ashift, cvd->vdev_ashift);
1477 	}
1478 
1479 	*asize *= vd->vdev_children;
1480 	*max_asize *= vd->vdev_children;
1481 
1482 	if (numerrors > nparity) {
1483 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1484 		return (lasterror);
1485 	}
1486 
1487 	return (0);
1488 }
1489 
1490 static void
1491 vdev_raidz_close(vdev_t *vd)
1492 {
1493 	int c;
1494 
1495 	for (c = 0; c < vd->vdev_children; c++)
1496 		vdev_close(vd->vdev_child[c]);
1497 }
1498 
1499 static uint64_t
1500 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1501 {
1502 	uint64_t asize;
1503 	uint64_t ashift = vd->vdev_top->vdev_ashift;
1504 	uint64_t cols = vd->vdev_children;
1505 	uint64_t nparity = vd->vdev_nparity;
1506 
1507 	asize = ((psize - 1) >> ashift) + 1;
1508 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1509 	asize = roundup(asize, nparity + 1) << ashift;
1510 
1511 	return (asize);
1512 }
1513 
1514 static void
1515 vdev_raidz_child_done(zio_t *zio)
1516 {
1517 	raidz_col_t *rc = zio->io_private;
1518 
1519 	rc->rc_error = zio->io_error;
1520 	rc->rc_tried = 1;
1521 	rc->rc_skipped = 0;
1522 }
1523 
1524 static int
1525 vdev_raidz_io_start(zio_t *zio)
1526 {
1527 	vdev_t *vd = zio->io_vd;
1528 	vdev_t *tvd = vd->vdev_top;
1529 	vdev_t *cvd;
1530 	raidz_map_t *rm;
1531 	raidz_col_t *rc;
1532 	int c, i;
1533 
1534 	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
1535 	    vd->vdev_nparity);
1536 
1537 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1538 
1539 	if (zio->io_type == ZIO_TYPE_WRITE) {
1540 		vdev_raidz_generate_parity(rm);
1541 
1542 		for (c = 0; c < rm->rm_cols; c++) {
1543 			rc = &rm->rm_col[c];
1544 			cvd = vd->vdev_child[rc->rc_devidx];
1545 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1546 			    rc->rc_offset, rc->rc_data, rc->rc_size,
1547 			    zio->io_type, zio->io_priority, 0,
1548 			    vdev_raidz_child_done, rc));
1549 		}
1550 
1551 		/*
1552 		 * Generate optional I/Os for any skipped sectors to improve
1553 		 * aggregation contiguity.
1554 		 */
1555 		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1556 			ASSERT(c <= rm->rm_scols);
1557 			if (c == rm->rm_scols)
1558 				c = 0;
1559 			rc = &rm->rm_col[c];
1560 			cvd = vd->vdev_child[rc->rc_devidx];
1561 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1562 			    rc->rc_offset + rc->rc_size, NULL,
1563 			    1 << tvd->vdev_ashift,
1564 			    zio->io_type, zio->io_priority,
1565 			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1566 		}
1567 
1568 		return (ZIO_PIPELINE_CONTINUE);
1569 	}
1570 
1571 	ASSERT(zio->io_type == ZIO_TYPE_READ);
1572 
1573 	/*
1574 	 * Iterate over the columns in reverse order so that we hit the parity
1575 	 * last -- any errors along the way will force us to read the parity.
1576 	 */
1577 	for (c = rm->rm_cols - 1; c >= 0; c--) {
1578 		rc = &rm->rm_col[c];
1579 		cvd = vd->vdev_child[rc->rc_devidx];
1580 		if (!vdev_readable(cvd)) {
1581 			if (c >= rm->rm_firstdatacol)
1582 				rm->rm_missingdata++;
1583 			else
1584 				rm->rm_missingparity++;
1585 			rc->rc_error = SET_ERROR(ENXIO);
1586 			rc->rc_tried = 1;	/* don't even try */
1587 			rc->rc_skipped = 1;
1588 			continue;
1589 		}
1590 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1591 			if (c >= rm->rm_firstdatacol)
1592 				rm->rm_missingdata++;
1593 			else
1594 				rm->rm_missingparity++;
1595 			rc->rc_error = SET_ERROR(ESTALE);
1596 			rc->rc_skipped = 1;
1597 			continue;
1598 		}
1599 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
1600 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1601 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1602 			    rc->rc_offset, rc->rc_data, rc->rc_size,
1603 			    zio->io_type, zio->io_priority, 0,
1604 			    vdev_raidz_child_done, rc));
1605 		}
1606 	}
1607 
1608 	return (ZIO_PIPELINE_CONTINUE);
1609 }
1610 
1611 
1612 /*
1613  * Report a checksum error for a child of a RAID-Z device.
1614  */
1615 static void
1616 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1617 {
1618 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1619 
1620 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1621 		zio_bad_cksum_t zbc;
1622 		raidz_map_t *rm = zio->io_vsd;
1623 
1624 		mutex_enter(&vd->vdev_stat_lock);
1625 		vd->vdev_stat.vs_checksum_errors++;
1626 		mutex_exit(&vd->vdev_stat_lock);
1627 
1628 		zbc.zbc_has_cksum = 0;
1629 		zbc.zbc_injected = rm->rm_ecksuminjected;
1630 
1631 		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1632 		    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1633 		    &zbc);
1634 	}
1635 }
1636 
1637 /*
1638  * We keep track of whether or not there were any injected errors, so that
1639  * any ereports we generate can note it.
1640  */
1641 static int
1642 raidz_checksum_verify(zio_t *zio)
1643 {
1644 	zio_bad_cksum_t zbc;
1645 	raidz_map_t *rm = zio->io_vsd;
1646 
1647 	int ret = zio_checksum_error(zio, &zbc);
1648 	if (ret != 0 && zbc.zbc_injected != 0)
1649 		rm->rm_ecksuminjected = 1;
1650 
1651 	return (ret);
1652 }
1653 
1654 /*
1655  * Generate the parity from the data columns. If we tried and were able to
1656  * read the parity without error, verify that the generated parity matches the
1657  * data we read. If it doesn't, we fire off a checksum error. Return the
1658  * number such failures.
1659  */
1660 static int
1661 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1662 {
1663 	void *orig[VDEV_RAIDZ_MAXPARITY];
1664 	int c, ret = 0;
1665 	raidz_col_t *rc;
1666 
1667 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1668 		rc = &rm->rm_col[c];
1669 		if (!rc->rc_tried || rc->rc_error != 0)
1670 			continue;
1671 		orig[c] = zio_buf_alloc(rc->rc_size);
1672 		bcopy(rc->rc_data, orig[c], rc->rc_size);
1673 	}
1674 
1675 	vdev_raidz_generate_parity(rm);
1676 
1677 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1678 		rc = &rm->rm_col[c];
1679 		if (!rc->rc_tried || rc->rc_error != 0)
1680 			continue;
1681 		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1682 			raidz_checksum_error(zio, rc, orig[c]);
1683 			rc->rc_error = SET_ERROR(ECKSUM);
1684 			ret++;
1685 		}
1686 		zio_buf_free(orig[c], rc->rc_size);
1687 	}
1688 
1689 	return (ret);
1690 }
1691 
1692 /*
1693  * Keep statistics on all the ways that we used parity to correct data.
1694  */
1695 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1696 
1697 static int
1698 vdev_raidz_worst_error(raidz_map_t *rm)
1699 {
1700 	int error = 0;
1701 
1702 	for (int c = 0; c < rm->rm_cols; c++)
1703 		error = zio_worst_error(error, rm->rm_col[c].rc_error);
1704 
1705 	return (error);
1706 }
1707 
1708 /*
1709  * Iterate over all combinations of bad data and attempt a reconstruction.
1710  * Note that the algorithm below is non-optimal because it doesn't take into
1711  * account how reconstruction is actually performed. For example, with
1712  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1713  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1714  * cases we'd only use parity information in column 0.
1715  */
1716 static int
1717 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1718 {
1719 	raidz_map_t *rm = zio->io_vsd;
1720 	raidz_col_t *rc;
1721 	void *orig[VDEV_RAIDZ_MAXPARITY];
1722 	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1723 	int *tgts = &tstore[1];
1724 	int current, next, i, c, n;
1725 	int code, ret = 0;
1726 
1727 	ASSERT(total_errors < rm->rm_firstdatacol);
1728 
1729 	/*
1730 	 * This simplifies one edge condition.
1731 	 */
1732 	tgts[-1] = -1;
1733 
1734 	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1735 		/*
1736 		 * Initialize the targets array by finding the first n columns
1737 		 * that contain no error.
1738 		 *
1739 		 * If there were no data errors, we need to ensure that we're
1740 		 * always explicitly attempting to reconstruct at least one
1741 		 * data column. To do this, we simply push the highest target
1742 		 * up into the data columns.
1743 		 */
1744 		for (c = 0, i = 0; i < n; i++) {
1745 			if (i == n - 1 && data_errors == 0 &&
1746 			    c < rm->rm_firstdatacol) {
1747 				c = rm->rm_firstdatacol;
1748 			}
1749 
1750 			while (rm->rm_col[c].rc_error != 0) {
1751 				c++;
1752 				ASSERT3S(c, <, rm->rm_cols);
1753 			}
1754 
1755 			tgts[i] = c++;
1756 		}
1757 
1758 		/*
1759 		 * Setting tgts[n] simplifies the other edge condition.
1760 		 */
1761 		tgts[n] = rm->rm_cols;
1762 
1763 		/*
1764 		 * These buffers were allocated in previous iterations.
1765 		 */
1766 		for (i = 0; i < n - 1; i++) {
1767 			ASSERT(orig[i] != NULL);
1768 		}
1769 
1770 		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
1771 
1772 		current = 0;
1773 		next = tgts[current];
1774 
1775 		while (current != n) {
1776 			tgts[current] = next;
1777 			current = 0;
1778 
1779 			/*
1780 			 * Save off the original data that we're going to
1781 			 * attempt to reconstruct.
1782 			 */
1783 			for (i = 0; i < n; i++) {
1784 				ASSERT(orig[i] != NULL);
1785 				c = tgts[i];
1786 				ASSERT3S(c, >=, 0);
1787 				ASSERT3S(c, <, rm->rm_cols);
1788 				rc = &rm->rm_col[c];
1789 				bcopy(rc->rc_data, orig[i], rc->rc_size);
1790 			}
1791 
1792 			/*
1793 			 * Attempt a reconstruction and exit the outer loop on
1794 			 * success.
1795 			 */
1796 			code = vdev_raidz_reconstruct(rm, tgts, n);
1797 			if (raidz_checksum_verify(zio) == 0) {
1798 				atomic_inc_64(&raidz_corrected[code]);
1799 
1800 				for (i = 0; i < n; i++) {
1801 					c = tgts[i];
1802 					rc = &rm->rm_col[c];
1803 					ASSERT(rc->rc_error == 0);
1804 					if (rc->rc_tried)
1805 						raidz_checksum_error(zio, rc,
1806 						    orig[i]);
1807 					rc->rc_error = SET_ERROR(ECKSUM);
1808 				}
1809 
1810 				ret = code;
1811 				goto done;
1812 			}
1813 
1814 			/*
1815 			 * Restore the original data.
1816 			 */
1817 			for (i = 0; i < n; i++) {
1818 				c = tgts[i];
1819 				rc = &rm->rm_col[c];
1820 				bcopy(orig[i], rc->rc_data, rc->rc_size);
1821 			}
1822 
1823 			do {
1824 				/*
1825 				 * Find the next valid column after the current
1826 				 * position..
1827 				 */
1828 				for (next = tgts[current] + 1;
1829 				    next < rm->rm_cols &&
1830 				    rm->rm_col[next].rc_error != 0; next++)
1831 					continue;
1832 
1833 				ASSERT(next <= tgts[current + 1]);
1834 
1835 				/*
1836 				 * If that spot is available, we're done here.
1837 				 */
1838 				if (next != tgts[current + 1])
1839 					break;
1840 
1841 				/*
1842 				 * Otherwise, find the next valid column after
1843 				 * the previous position.
1844 				 */
1845 				for (c = tgts[current - 1] + 1;
1846 				    rm->rm_col[c].rc_error != 0; c++)
1847 					continue;
1848 
1849 				tgts[current] = c;
1850 				current++;
1851 
1852 			} while (current != n);
1853 		}
1854 	}
1855 	n--;
1856 done:
1857 	for (i = 0; i < n; i++) {
1858 		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
1859 	}
1860 
1861 	return (ret);
1862 }
1863 
1864 static void
1865 vdev_raidz_io_done(zio_t *zio)
1866 {
1867 	vdev_t *vd = zio->io_vd;
1868 	vdev_t *cvd;
1869 	raidz_map_t *rm = zio->io_vsd;
1870 	raidz_col_t *rc;
1871 	int unexpected_errors = 0;
1872 	int parity_errors = 0;
1873 	int parity_untried = 0;
1874 	int data_errors = 0;
1875 	int total_errors = 0;
1876 	int n, c;
1877 	int tgts[VDEV_RAIDZ_MAXPARITY];
1878 	int code;
1879 
1880 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
1881 
1882 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1883 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1884 
1885 	for (c = 0; c < rm->rm_cols; c++) {
1886 		rc = &rm->rm_col[c];
1887 
1888 		if (rc->rc_error) {
1889 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
1890 
1891 			if (c < rm->rm_firstdatacol)
1892 				parity_errors++;
1893 			else
1894 				data_errors++;
1895 
1896 			if (!rc->rc_skipped)
1897 				unexpected_errors++;
1898 
1899 			total_errors++;
1900 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1901 			parity_untried++;
1902 		}
1903 	}
1904 
1905 	if (zio->io_type == ZIO_TYPE_WRITE) {
1906 		/*
1907 		 * XXX -- for now, treat partial writes as a success.
1908 		 * (If we couldn't write enough columns to reconstruct
1909 		 * the data, the I/O failed.  Otherwise, good enough.)
1910 		 *
1911 		 * Now that we support write reallocation, it would be better
1912 		 * to treat partial failure as real failure unless there are
1913 		 * no non-degraded top-level vdevs left, and not update DTLs
1914 		 * if we intend to reallocate.
1915 		 */
1916 		/* XXPOLICY */
1917 		if (total_errors > rm->rm_firstdatacol)
1918 			zio->io_error = vdev_raidz_worst_error(rm);
1919 
1920 		return;
1921 	}
1922 
1923 	ASSERT(zio->io_type == ZIO_TYPE_READ);
1924 	/*
1925 	 * There are three potential phases for a read:
1926 	 *	1. produce valid data from the columns read
1927 	 *	2. read all disks and try again
1928 	 *	3. perform combinatorial reconstruction
1929 	 *
1930 	 * Each phase is progressively both more expensive and less likely to
1931 	 * occur. If we encounter more errors than we can repair or all phases
1932 	 * fail, we have no choice but to return an error.
1933 	 */
1934 
1935 	/*
1936 	 * If the number of errors we saw was correctable -- less than or equal
1937 	 * to the number of parity disks read -- attempt to produce data that
1938 	 * has a valid checksum. Naturally, this case applies in the absence of
1939 	 * any errors.
1940 	 */
1941 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1942 		if (data_errors == 0) {
1943 			if (raidz_checksum_verify(zio) == 0) {
1944 				/*
1945 				 * If we read parity information (unnecessarily
1946 				 * as it happens since no reconstruction was
1947 				 * needed) regenerate and verify the parity.
1948 				 * We also regenerate parity when resilvering
1949 				 * so we can write it out to the failed device
1950 				 * later.
1951 				 */
1952 				if (parity_errors + parity_untried <
1953 				    rm->rm_firstdatacol ||
1954 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
1955 					n = raidz_parity_verify(zio, rm);
1956 					unexpected_errors += n;
1957 					ASSERT(parity_errors + n <=
1958 					    rm->rm_firstdatacol);
1959 				}
1960 				goto done;
1961 			}
1962 		} else {
1963 			/*
1964 			 * We either attempt to read all the parity columns or
1965 			 * none of them. If we didn't try to read parity, we
1966 			 * wouldn't be here in the correctable case. There must
1967 			 * also have been fewer parity errors than parity
1968 			 * columns or, again, we wouldn't be in this code path.
1969 			 */
1970 			ASSERT(parity_untried == 0);
1971 			ASSERT(parity_errors < rm->rm_firstdatacol);
1972 
1973 			/*
1974 			 * Identify the data columns that reported an error.
1975 			 */
1976 			n = 0;
1977 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1978 				rc = &rm->rm_col[c];
1979 				if (rc->rc_error != 0) {
1980 					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1981 					tgts[n++] = c;
1982 				}
1983 			}
1984 
1985 			ASSERT(rm->rm_firstdatacol >= n);
1986 
1987 			code = vdev_raidz_reconstruct(rm, tgts, n);
1988 
1989 			if (raidz_checksum_verify(zio) == 0) {
1990 				atomic_inc_64(&raidz_corrected[code]);
1991 
1992 				/*
1993 				 * If we read more parity disks than were used
1994 				 * for reconstruction, confirm that the other
1995 				 * parity disks produced correct data. This
1996 				 * routine is suboptimal in that it regenerates
1997 				 * the parity that we already used in addition
1998 				 * to the parity that we're attempting to
1999 				 * verify, but this should be a relatively
2000 				 * uncommon case, and can be optimized if it
2001 				 * becomes a problem. Note that we regenerate
2002 				 * parity when resilvering so we can write it
2003 				 * out to failed devices later.
2004 				 */
2005 				if (parity_errors < rm->rm_firstdatacol - n ||
2006 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2007 					n = raidz_parity_verify(zio, rm);
2008 					unexpected_errors += n;
2009 					ASSERT(parity_errors + n <=
2010 					    rm->rm_firstdatacol);
2011 				}
2012 
2013 				goto done;
2014 			}
2015 		}
2016 	}
2017 
2018 	/*
2019 	 * This isn't a typical situation -- either we got a read error or
2020 	 * a child silently returned bad data. Read every block so we can
2021 	 * try again with as much data and parity as we can track down. If
2022 	 * we've already been through once before, all children will be marked
2023 	 * as tried so we'll proceed to combinatorial reconstruction.
2024 	 */
2025 	unexpected_errors = 1;
2026 	rm->rm_missingdata = 0;
2027 	rm->rm_missingparity = 0;
2028 
2029 	for (c = 0; c < rm->rm_cols; c++) {
2030 		if (rm->rm_col[c].rc_tried)
2031 			continue;
2032 
2033 		zio_vdev_io_redone(zio);
2034 		do {
2035 			rc = &rm->rm_col[c];
2036 			if (rc->rc_tried)
2037 				continue;
2038 			zio_nowait(zio_vdev_child_io(zio, NULL,
2039 			    vd->vdev_child[rc->rc_devidx],
2040 			    rc->rc_offset, rc->rc_data, rc->rc_size,
2041 			    zio->io_type, zio->io_priority, 0,
2042 			    vdev_raidz_child_done, rc));
2043 		} while (++c < rm->rm_cols);
2044 
2045 		return;
2046 	}
2047 
2048 	/*
2049 	 * At this point we've attempted to reconstruct the data given the
2050 	 * errors we detected, and we've attempted to read all columns. There
2051 	 * must, therefore, be one or more additional problems -- silent errors
2052 	 * resulting in invalid data rather than explicit I/O errors resulting
2053 	 * in absent data. We check if there is enough additional data to
2054 	 * possibly reconstruct the data and then perform combinatorial
2055 	 * reconstruction over all possible combinations. If that fails,
2056 	 * we're cooked.
2057 	 */
2058 	if (total_errors > rm->rm_firstdatacol) {
2059 		zio->io_error = vdev_raidz_worst_error(rm);
2060 
2061 	} else if (total_errors < rm->rm_firstdatacol &&
2062 	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2063 		/*
2064 		 * If we didn't use all the available parity for the
2065 		 * combinatorial reconstruction, verify that the remaining
2066 		 * parity is correct.
2067 		 */
2068 		if (code != (1 << rm->rm_firstdatacol) - 1)
2069 			(void) raidz_parity_verify(zio, rm);
2070 	} else {
2071 		/*
2072 		 * We're here because either:
2073 		 *
2074 		 *	total_errors == rm_first_datacol, or
2075 		 *	vdev_raidz_combrec() failed
2076 		 *
2077 		 * In either case, there is enough bad data to prevent
2078 		 * reconstruction.
2079 		 *
2080 		 * Start checksum ereports for all children which haven't
2081 		 * failed, and the IO wasn't speculative.
2082 		 */
2083 		zio->io_error = SET_ERROR(ECKSUM);
2084 
2085 		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2086 			for (c = 0; c < rm->rm_cols; c++) {
2087 				rc = &rm->rm_col[c];
2088 				if (rc->rc_error == 0) {
2089 					zio_bad_cksum_t zbc;
2090 					zbc.zbc_has_cksum = 0;
2091 					zbc.zbc_injected =
2092 					    rm->rm_ecksuminjected;
2093 
2094 					zfs_ereport_start_checksum(
2095 					    zio->io_spa,
2096 					    vd->vdev_child[rc->rc_devidx],
2097 					    zio, rc->rc_offset, rc->rc_size,
2098 					    (void *)(uintptr_t)c, &zbc);
2099 				}
2100 			}
2101 		}
2102 	}
2103 
2104 done:
2105 	zio_checksum_verified(zio);
2106 
2107 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2108 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2109 		/*
2110 		 * Use the good data we have in hand to repair damaged children.
2111 		 */
2112 		for (c = 0; c < rm->rm_cols; c++) {
2113 			rc = &rm->rm_col[c];
2114 			cvd = vd->vdev_child[rc->rc_devidx];
2115 
2116 			if (rc->rc_error == 0)
2117 				continue;
2118 
2119 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2120 			    rc->rc_offset, rc->rc_data, rc->rc_size,
2121 			    ZIO_TYPE_WRITE, zio->io_priority,
2122 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2123 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2124 		}
2125 	}
2126 }
2127 
2128 static void
2129 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2130 {
2131 	if (faulted > vd->vdev_nparity)
2132 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2133 		    VDEV_AUX_NO_REPLICAS);
2134 	else if (degraded + faulted != 0)
2135 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2136 	else
2137 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2138 }
2139 
2140 vdev_ops_t vdev_raidz_ops = {
2141 	vdev_raidz_open,
2142 	vdev_raidz_close,
2143 	vdev_raidz_asize,
2144 	vdev_raidz_io_start,
2145 	vdev_raidz_io_done,
2146 	vdev_raidz_state_change,
2147 	NULL,
2148 	NULL,
2149 	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
2150 	B_FALSE			/* not a leaf vdev */
2151 };
2152