xref: /illumos-gate/usr/src/uts/common/fs/zfs/vdev_raidz.c (revision ff7af0d3beb1bddf8bb93afc2e9042dc3828be3d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
25  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
26  * Copyright (c) 2014 Integros [integros.com]
27  */
28 
29 #include <sys/zfs_context.h>
30 #include <sys/spa.h>
31 #include <sys/vdev_impl.h>
32 #include <sys/vdev_disk.h>
33 #include <sys/vdev_file.h>
34 #include <sys/vdev_raidz.h>
35 #include <sys/zio.h>
36 #include <sys/zio_checksum.h>
37 #include <sys/abd.h>
38 #include <sys/fs/zfs.h>
39 #include <sys/fm/fs/zfs.h>
40 
41 /*
42  * Virtual device vector for RAID-Z.
43  *
44  * This vdev supports single, double, and triple parity. For single parity,
45  * we use a simple XOR of all the data columns. For double or triple parity,
46  * we use a special case of Reed-Solomon coding. This extends the
47  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
48  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
49  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
50  * former is also based. The latter is designed to provide higher performance
51  * for writes.
52  *
53  * Note that the Plank paper claimed to support arbitrary N+M, but was then
54  * amended six years later identifying a critical flaw that invalidates its
55  * claims. Nevertheless, the technique can be adapted to work for up to
56  * triple parity. For additional parity, the amendment "Note: Correction to
57  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
58  * is viable, but the additional complexity means that write performance will
59  * suffer.
60  *
61  * All of the methods above operate on a Galois field, defined over the
62  * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
63  * can be expressed with a single byte. Briefly, the operations on the
64  * field are defined as follows:
65  *
66  *   o addition (+) is represented by a bitwise XOR
67  *   o subtraction (-) is therefore identical to addition: A + B = A - B
68  *   o multiplication of A by 2 is defined by the following bitwise expression:
69  *
70  *	(A * 2)_7 = A_6
71  *	(A * 2)_6 = A_5
72  *	(A * 2)_5 = A_4
73  *	(A * 2)_4 = A_3 + A_7
74  *	(A * 2)_3 = A_2 + A_7
75  *	(A * 2)_2 = A_1 + A_7
76  *	(A * 2)_1 = A_0
77  *	(A * 2)_0 = A_7
78  *
79  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
80  * As an aside, this multiplication is derived from the error correcting
81  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
82  *
83  * Observe that any number in the field (except for 0) can be expressed as a
84  * power of 2 -- a generator for the field. We store a table of the powers of
85  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
86  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
87  * than field addition). The inverse of a field element A (A^-1) is therefore
88  * A ^ (255 - 1) = A^254.
89  *
90  * The up-to-three parity columns, P, Q, R over several data columns,
91  * D_0, ... D_n-1, can be expressed by field operations:
92  *
93  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
94  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
95  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
96  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
97  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
98  *
99  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
100  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
101  * independent coefficients. (There are no additional coefficients that have
102  * this property which is why the uncorrected Plank method breaks down.)
103  *
104  * See the reconstruction code below for how P, Q and R can be used individually
105  * or in concert to recover missing data columns.
106  */
107 
/*
 * Per-column state for a single RAID-Z I/O: one entry per child vdev
 * (parity or data) that the logical I/O touches.
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	abd_t *rc_abd;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;
118 
/*
 * State for one logical RAID-Z I/O: the stripe geometry plus the array of
 * per-column state.  Columns [0, rm_firstdatacol) are parity; the rest
 * are data.  The structure is allocated with rm_scols entries in rm_col.
 */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	abd_t *rm_abd_copy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;
135 
/* Indices of the parity columns at the front of rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Multiply one byte by 2 (resp. 4) in the Galois field: shift left and,
 * if the high bit was set, reduce by XORing with 0x1d (the low byte of
 * the primitive polynomial x^8 + x^4 + x^3 + x^2 + 1; see the block
 * comment at the top of this file).
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}

/* Offset x past the leading vdev labels (VDEV_LABEL_START_SIZE bytes). */
#define	VDEV_LABEL_OFFSET(x)	(x + VDEV_LABEL_START_SIZE)

/*
 * Force reconstruction to use the general purpose method.
 */
int vdev_raidz_default_to_general;
169 
/*
 * Powers of 2 in the Galois field defined above: vdev_raidz_pow2[i] = 2^i.
 * Note that 2 is a generator, so the 255 nonzero field elements all appear
 * (2^255 == 2^0 == 1; index 255 is never needed).
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/*
 * Logs base 2 in the Galois field defined above: the inverse of the table
 * above, i.e. vdev_raidz_pow2[vdev_raidz_log2[a]] == a for a != 0.
 * log2[0] is undefined (0 has no logarithm); callers must special-case 0.
 */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
240 
241 static void vdev_raidz_generate_parity(raidz_map_t *rm);
242 
243 /*
244  * Multiply a given number by 2 raised to the given power.
245  */
246 static uint8_t
247 vdev_raidz_exp2(uint_t a, int exp)
248 {
249 	if (a == 0)
250 		return (0);
251 
252 	ASSERT(exp >= 0);
253 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
254 
255 	exp += vdev_raidz_log2[a];
256 	if (exp > 255)
257 		exp -= 255;
258 
259 	return (vdev_raidz_pow2[exp]);
260 }
261 
262 static void
263 vdev_raidz_map_free(raidz_map_t *rm)
264 {
265 	int c;
266 	size_t size;
267 
268 	for (c = 0; c < rm->rm_firstdatacol; c++) {
269 		abd_free(rm->rm_col[c].rc_abd);
270 
271 		if (rm->rm_col[c].rc_gdata != NULL)
272 			zio_buf_free(rm->rm_col[c].rc_gdata,
273 			    rm->rm_col[c].rc_size);
274 	}
275 
276 	size = 0;
277 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
278 		abd_put(rm->rm_col[c].rc_abd);
279 		size += rm->rm_col[c].rc_size;
280 	}
281 
282 	if (rm->rm_abd_copy != NULL)
283 		abd_free(rm->rm_abd_copy);
284 
285 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
286 }
287 
/*
 * zio vsd-free callback: the owning zio is done with the map.  Free it
 * now unless outstanding checksum reports still reference it, in which
 * case vdev_raidz_cksum_free() will free it when the last report drops.
 */
static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	ASSERT0(rm->rm_freed);
	rm->rm_freed = 1;

	if (rm->rm_reports == 0)
		vdev_raidz_map_free(rm);
}
299 
/*
 * Checksum-report free callback: drop one report reference; free the map
 * if this was the last reference and the owning zio has already let go.
 */
/*ARGSUSED*/
static void
vdev_raidz_cksum_free(void *arg, size_t ignored)
{
	raidz_map_t *rm = arg;

	ASSERT3U(rm->rm_reports, >, 0);

	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
		vdev_raidz_map_free(rm);
}
311 
/*
 * Finish a checksum ereport for column c (zcr_cbinfo) of the map: compute
 * the "good" bytes for that column and hand good/bad to
 * zfs_ereport_finish_checksum().  For a data column the good bytes are
 * simply the matching slice of good_data; for a parity column the parity
 * must be regenerated from good_data (done once per map, then cached in
 * rc_gdata for subsequent reports).
 */
static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;	/* column this report is about */
	size_t x;

	const char *good = NULL;
	char *bad;

	/* Without the good data there is nothing to compare against. */
	if (good_data == NULL) {
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO)
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
			char *buf;
			int offset;

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_abd;
				rm->rm_col[x].rc_gdata =
				    zio_buf_alloc(rm->rm_col[x].rc_size);
				rm->rm_col[x].rc_abd =
				    abd_get_from_buf(rm->rm_col[x].rc_gdata,
				    rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			buf = (char *)good_data;
			for (; x < rm->rm_cols; x++) {
				abd_put(rm->rm_col[x].rc_abd);
				rm->rm_col[x].rc_abd = abd_get_from_buf(buf,
				    rm->rm_col[x].rc_size);
				buf += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				abd_put(rm->rm_col[x].rc_abd);
				rm->rm_col[x].rc_abd = bad_parity[x];
			}

			/*
			 * Point the data columns back at the copy made by
			 * vdev_raidz_cksum_report() (rm_abd_copy).
			 */
			offset = 0;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				abd_put(rm->rm_col[x].rc_abd);
				rm->rm_col[x].rc_abd = abd_get_offset(
				    rm->rm_abd_copy, offset);
				offset += rm->rm_col[x].rc_size;
			}
		}

		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = rm->rm_col[c].rc_gdata;
	} else {
		/* adjust good_data to point at the start of our column */
		good = good_data;

		for (x = rm->rm_firstdatacol; x < c; x++)
			good += rm->rm_col[x].rc_size;
	}

	bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size);
	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
	abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size);
}
396 
/*
 * Invoked indirectly by zfs_ereport_start_checksum(), called
 * below when our read operation fails completely.  The main point
 * is to keep a copy of everything we read from disk, so that at
 * vdev_raidz_cksum_finish() time we can compare it with the good data.
 */
static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
	size_t c = (size_t)(uintptr_t)arg;	/* column of the bad data */
	size_t offset;

	raidz_map_t *rm = zio->io_vsd;
	size_t size;

	/* set up the report and bump the refcount  */
	zcr->zcr_cbdata = rm;
	zcr->zcr_cbinfo = c;
	zcr->zcr_finish = vdev_raidz_cksum_finish;
	zcr->zcr_free = vdev_raidz_cksum_free;

	rm->rm_reports++;
	ASSERT3U(rm->rm_reports, >, 0);

	/* A previous report on this map already copied the data aside. */
	if (rm->rm_abd_copy != NULL)
		return;

	/*
	 * It's the first time we're called for this raidz_map_t, so we need
	 * to copy the data aside; there's no guarantee that our zio's buffer
	 * won't be re-used for something else.
	 *
	 * Our parity data is already in separate buffers, so there's no need
	 * to copy them.
	 */

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	rm->rm_abd_copy =
	    abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size);

	/* Copy each data column into the aside buffer and repoint it. */
	for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		raidz_col_t *col = &rm->rm_col[c];
		abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset);

		abd_copy(tmp, col->rc_abd, col->rc_size);
		abd_put(col->rc_abd);
		col->rc_abd = tmp;

		offset += col->rc_size;
	}
	ASSERT3U(offset, ==, size);
}
452 
/*
 * vsd callbacks installed on raidz zios: free the map when the zio is
 * done, and copy data aside when a checksum ereport is started.
 */
static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};
457 
/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 *
 * Builds and returns a raidz_map_t describing where every sector of the
 * logical I/O (parity and data) lands on the children.  Parity columns
 * get freshly allocated buffers; data columns reference slices of the
 * caller's abd.  The caller frees the map via vdev_raidz_map_free().
 */
static raidz_map_t *
vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset,
    uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
	uint64_t off = 0;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_abd_copy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	/* Lay out each column: device index, offset on device, and size. */
	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			/* Wrapped around to the next stripe row. */
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_abd = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	/* Parity columns get their own buffers ... */
	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_abd =
		    abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);

	/* ... data columns reference slices of the caller's abd. */
	rm->rm_col[c].rc_abd = abd_get_offset(abd, 0);
	off = rm->rm_col[c].rc_size;

	for (c = c + 1; c < acols; c++) {
		rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
		off += rm->rm_col[c].rc_size;
	}

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
608 
/*
 * Cursors into the P, Q and R parity buffers for the parity-generation
 * iterator callbacks below.  A NULL member means that parity level is
 * not being generated; the callbacks advance the non-NULL cursors.
 */
struct pqr_struct {
	uint64_t *p;
	uint64_t *q;
	uint64_t *r;
};
614 
615 static int
616 vdev_raidz_p_func(void *buf, size_t size, void *private)
617 {
618 	struct pqr_struct *pqr = private;
619 	const uint64_t *src = buf;
620 	int i, cnt = size / sizeof (src[0]);
621 
622 	ASSERT(pqr->p && !pqr->q && !pqr->r);
623 
624 	for (i = 0; i < cnt; i++, src++, pqr->p++)
625 		*pqr->p ^= *src;
626 
627 	return (0);
628 }
629 
630 static int
631 vdev_raidz_pq_func(void *buf, size_t size, void *private)
632 {
633 	struct pqr_struct *pqr = private;
634 	const uint64_t *src = buf;
635 	uint64_t mask;
636 	int i, cnt = size / sizeof (src[0]);
637 
638 	ASSERT(pqr->p && pqr->q && !pqr->r);
639 
640 	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
641 		*pqr->p ^= *src;
642 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
643 		*pqr->q ^= *src;
644 	}
645 
646 	return (0);
647 }
648 
649 static int
650 vdev_raidz_pqr_func(void *buf, size_t size, void *private)
651 {
652 	struct pqr_struct *pqr = private;
653 	const uint64_t *src = buf;
654 	uint64_t mask;
655 	int i, cnt = size / sizeof (src[0]);
656 
657 	ASSERT(pqr->p && pqr->q && pqr->r);
658 
659 	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
660 		*pqr->p ^= *src;
661 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
662 		*pqr->q ^= *src;
663 		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
664 		*pqr->r ^= *src;
665 	}
666 
667 	return (0);
668 }
669 
/*
 * Generate single (P / XOR) parity: the first data column is copied into
 * the parity buffer, and each subsequent column is XORed on top via
 * vdev_raidz_p_func().
 */
static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p;
	int c;
	abd_t *src;

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);

		if (c == rm->rm_firstdatacol) {
			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
		} else {
			struct pqr_struct pqr = { p, NULL, NULL };
			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_p_func, &pqr);
		}
	}
}
690 
/*
 * Generate double (P and Q) parity.  P is the plain XOR of the data
 * columns; Q uses generator 2 as described in the block comment at the
 * top of this file.  Data columns shorter than the parity columns are
 * treated as if zero-padded to full parity length.
 */
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, pcnt, ccnt, mask, i;
	int c;
	abd_t *src;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);

		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);

		if (c == rm->rm_firstdatacol) {
			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
			(void) memcpy(q, p, rm->rm_col[c].rc_size);
		} else {
			struct pqr_struct pqr = { p, q, NULL };
			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_pq_func, &pqr);
		}

		if (c == rm->rm_firstdatacol) {
			/* Zero the tail the first column didn't cover. */
			for (i = ccnt; i < pcnt; i++) {
				p[i] = 0;
				q[i] = 0;
			}
		} else {
			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (i = ccnt; i < pcnt; i++) {
				VDEV_RAIDZ_64MUL_2(q[i], mask);
			}
		}
	}
}
735 
/*
 * Generate triple (P, Q and R) parity.  P is the plain XOR; Q and R use
 * generators 2 and 4 respectively (see the block comment at the top of
 * this file).  Data columns shorter than the parity columns are treated
 * as if zero-padded to full parity length.
 */
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
	int c;
	abd_t *src;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
		r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);

		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);

		if (c == rm->rm_firstdatacol) {
			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
			(void) memcpy(q, p, rm->rm_col[c].rc_size);
			(void) memcpy(r, p, rm->rm_col[c].rc_size);
		} else {
			struct pqr_struct pqr = { p, q, r };
			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_pqr_func, &pqr);
		}

		if (c == rm->rm_firstdatacol) {
			/* Zero the tail the first column didn't cover. */
			for (i = ccnt; i < pcnt; i++) {
				p[i] = 0;
				q[i] = 0;
				r[i] = 0;
			}
		} else {
			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (i = ccnt; i < pcnt; i++) {
				VDEV_RAIDZ_64MUL_2(q[i], mask);
				VDEV_RAIDZ_64MUL_4(r[i], mask);
			}
		}
	}
}
785 
/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	/* rm_firstdatacol doubles as the parity count (1 = P, 2 = PQ, 3 = PQR). */
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
	}
}
807 
/* ARGSUSED */
/*
 * abd iterator callback used when rebuilding a data column from P
 * parity: dst ^= src, 64 bits at a time.
 */
static int
vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
{
	uint64_t *d = dbuf;
	const uint64_t *s = sbuf;
	size_t n = size / sizeof (d[0]);

	for (size_t i = 0; i < n; i++)
		d[i] ^= s[i];

	return (0);
}
822 
/* ARGSUSED */
/*
 * abd iterator callback for the accumulation phase of Q reconstruction:
 * multiply the running result by 2 in GF(2^8), then fold in the next
 * data column (Horner's-rule evaluation of the Q polynomial).
 */
static int
vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
    void *private)
{
	uint64_t *d = dbuf;
	const uint64_t *s = sbuf;
	uint64_t mask;
	size_t n = size / sizeof (d[0]);

	for (size_t i = 0; i < n; i++) {
		VDEV_RAIDZ_64MUL_2(d[i], mask);
		d[i] ^= s[i];
	}

	return (0);
}
840 
/* ARGSUSED */
/*
 * Tail variant of vdev_raidz_reconst_q_pre_func() for the region past
 * the end of a short source column: only the multiply-by-2 step (the
 * source contributes zeros there).
 */
static int
vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
{
	uint64_t *d = buf;
	uint64_t mask;
	size_t n = size / sizeof (d[0]);

	for (size_t i = 0; i < n; i++) {
		/* same operation as vdev_raidz_reconst_q_pre_func() on d */
		VDEV_RAIDZ_64MUL_2(d[i], mask);
	}

	return (0);
}
856 
/*
 * State for vdev_raidz_reconst_q_post_func(): a cursor into the stored Q
 * parity and the exponent used to scale the result back to data.
 */
struct reconst_q_struct {
	uint64_t *q;
	int exp;
};
861 
862 static int
863 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
864 {
865 	struct reconst_q_struct *rq = private;
866 	uint64_t *dst = buf;
867 	int cnt = size / sizeof (dst[0]);
868 
869 	for (int i = 0; i < cnt; i++, dst++, rq->q++) {
870 		*dst ^= *rq->q;
871 
872 		int j;
873 		uint8_t *b;
874 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
875 			*b = vdev_raidz_exp2(*b, rq->exp);
876 		}
877 	}
878 
879 	return (0);
880 }
881 
/*
 * State for the P+Q reconstruction callbacks: cursors into the stored
 * parity (p, q) and the partial parity computed with the two missing
 * columns zeroed (pxy, qxy), plus the two scaling exponents.
 */
struct reconst_pq_struct {
	uint8_t *p;
	uint8_t *q;
	uint8_t *pxy;
	uint8_t *qxy;
	int aexp;
	int bexp;
};
890 
891 static int
892 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
893 {
894 	struct reconst_pq_struct *rpq = private;
895 	uint8_t *xd = xbuf;
896 	uint8_t *yd = ybuf;
897 
898 	for (int i = 0; i < size;
899 	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
900 		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
901 		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
902 		*yd = *rpq->p ^ *rpq->pxy ^ *xd;
903 	}
904 
905 	return (0);
906 }
907 
908 static int
909 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
910 {
911 	struct reconst_pq_struct *rpq = private;
912 	uint8_t *xd = xbuf;
913 
914 	for (int i = 0; i < size;
915 	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
916 		/* same operation as vdev_raidz_reconst_pq_func() on xd */
917 		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
918 		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
919 	}
920 
921 	return (0);
922 }
923 
/*
 * Reconstruct a single missing data column (tgts[0]) from P parity: copy
 * P into the target, then XOR every surviving data column on top.
 * Returns a bitmask naming the parity used (1 << VDEV_RAIDZ_P).
 */
static int
vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
{
	int x = tgts[0];
	int c;
	abd_t *dst, *src;

	ASSERT(ntgts == 1);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(x < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
	ASSERT(rm->rm_col[x].rc_size > 0);

	src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
	dst = rm->rm_col[x].rc_abd;

	/* Start with the parity itself. */
	abd_copy_from_buf(dst, abd_to_buf(src), rm->rm_col[x].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		uint64_t size = MIN(rm->rm_col[x].rc_size,
		    rm->rm_col[c].rc_size);

		src = rm->rm_col[c].rc_abd;
		dst = rm->rm_col[x].rc_abd;

		/* The missing column obviously can't contribute. */
		if (c == x)
			continue;

		(void) abd_iterate_func2(dst, src, 0, 0, size,
		    vdev_raidz_reconst_p_func, NULL);
	}

	return (1 << VDEV_RAIDZ_P);
}
959 
/*
 * Reconstruct a single missing data column (tgts[0]) from Q parity.
 * First accumulate into the target the partial Q of the surviving data
 * columns (multiply-by-2 then XOR per column, treating the missing
 * column and short tails as zeros); then XOR in the stored Q and scale
 * every byte by 2^exp, exp = 255 - (rm_cols - 1 - x), to undo the
 * missing column's coefficient.  Returns (1 << VDEV_RAIDZ_Q).
 */
static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	int x = tgts[0];
	int c, exp;
	abd_t *dst, *src;

	ASSERT(ntgts == 1);

	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		/* The missing column contributes zeros (size forced to 0). */
		uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
		    rm->rm_col[c].rc_size);

		src = rm->rm_col[c].rc_abd;
		dst = rm->rm_col[x].rc_abd;

		if (c == rm->rm_firstdatacol) {
			abd_copy(dst, src, size);
			if (rm->rm_col[x].rc_size > size)
				abd_zero_off(dst, size,
				    rm->rm_col[x].rc_size - size);
		} else {
			ASSERT3U(size, <=, rm->rm_col[x].rc_size);
			(void) abd_iterate_func2(dst, src, 0, 0, size,
			    vdev_raidz_reconst_q_pre_func, NULL);
			(void) abd_iterate_func(dst,
			    size, rm->rm_col[x].rc_size - size,
			    vdev_raidz_reconst_q_pre_tail_func, NULL);
		}
	}

	src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
	dst = rm->rm_col[x].rc_abd;
	exp = 255 - (rm->rm_cols - 1 - x);

	struct reconst_q_struct rq = { abd_to_buf(src), exp };
	(void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
	    vdev_raidz_reconst_q_post_func, &rq);

	return (1 << VDEV_RAIDZ_Q);
}
1003 
/*
 * Reconstruct two missing data columns x and y (x < y) using both the P
 * and Q parity.  Returns the parity-usage bitmap for P and Q.
 */
static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
	abd_t *pdata, *qdata;
	uint64_t xsize, ysize;
	int x = tgts[0];
	int y = tgts[1];
	abd_t *xd, *yd;

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	/* Temporary linear buffers to receive Pxy and Qxy. */
	rm->rm_col[VDEV_RAIDZ_P].rc_abd =
	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
	rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = abd_to_buf(pdata);
	q = abd_to_buf(qdata);
	pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
	qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
	xd = rm->rm_col[x].rc_abd;
	yd = rm->rm_col[y].rc_abd;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	/*
	 * a = 2^(x - y) and b is derived from column x's Q coefficient;
	 * tmp is the log of (a + 1)^-1 (a ^ 1 is a + 1 in GF(2^8), and
	 * 255 - log(v) is the log of v's inverse).  aexp and bexp are the
	 * A and B coefficients above, kept in log form for the per-byte
	 * vdev_raidz_exp2() calls in the reconst helpers.
	 */
	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	/*
	 * Solve for D_x and D_y over the region both columns cover, then
	 * finish the tail of the longer x column where no D_y exists.
	 */
	ASSERT3U(xsize, >=, ysize);
	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
	    vdev_raidz_reconst_pq_func, &rpq);
	(void) abd_iterate_func(xd, ysize, xsize - ysize,
	    vdev_raidz_reconst_pq_tail_func, &rpq);

	abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
	abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
1092 
1093 /* BEGIN CSTYLED */
1094 /*
1095  * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
1097  * the contents of the data and parity disks. This can be expressed with
1098  * vectors for the original data (D) and the actual data (d) and parity (p)
1099  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
1100  *
1101  *            __   __                     __     __
1102  *            |     |         __     __   |  p_0  |
1103  *            |  V  |         |  D_0  |   | p_m-1 |
1104  *            |     |    x    |   :   | = |  d_0  |
1105  *            |  I  |         | D_n-1 |   |   :   |
1106  *            |     |         ~~     ~~   | d_n-1 |
1107  *            ~~   ~~                     ~~     ~~
1108  *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
1111  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
1112  * computation as well as linear separability.
1113  *
1114  *      __               __               __     __
1115  *      |   1   ..  1 1 1 |               |  p_0  |
1116  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
1117  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
1118  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
1119  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
1120  *      |   :       : : : |   |   :   |   |  d_2  |
1121  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
1122  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
1123  *      |   0   ..  0 0 1 |               | d_n-1 |
1124  *      ~~               ~~               ~~     ~~
1125  *
1126  * Note that I, V, d, and p are known. To compute D, we must invert the
1127  * matrix and use the known data and parity values to reconstruct the unknown
1128  * data values. We begin by removing the rows in V|I and d|p that correspond
1129  * to failed or missing columns; we then make V|I square (n x n) and d|p
1130  * sized n by removing rows corresponding to unused parity from the bottom up
1131  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
1132  * using Gauss-Jordan elimination. In the example below we use m=3 parity
1133  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
1134  *           __                               __
1135  *           |  1   1   1   1   1   1   1   1  |
1136  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
1137  *           |  19 205 116  29  64  16  4   1  |      / /
1138  *           |  1   0   0   0   0   0   0   0  |     / /
1139  *           |  0   1   0   0   0   0   0   0  | <--' /
1140  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
1141  *           |  0   0   0   1   0   0   0   0  |
1142  *           |  0   0   0   0   1   0   0   0  |
1143  *           |  0   0   0   0   0   1   0   0  |
1144  *           |  0   0   0   0   0   0   1   0  |
1145  *           |  0   0   0   0   0   0   0   1  |
1146  *           ~~                               ~~
1147  *           __                               __
1148  *           |  1   1   1   1   1   1   1   1  |
1149  *           |  19 205 116  29  64  16  4   1  |
1150  *           |  1   0   0   0   0   0   0   0  |
1151  *  (V|I)' = |  0   0   0   1   0   0   0   0  |
1152  *           |  0   0   0   0   1   0   0   0  |
1153  *           |  0   0   0   0   0   1   0   0  |
1154  *           |  0   0   0   0   0   0   1   0  |
1155  *           |  0   0   0   0   0   0   0   1  |
1156  *           ~~                               ~~
1157  *
1158  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1159  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1160  * matrix is not singular.
1161  * __                                                                 __
1162  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1163  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1164  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1165  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1166  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1167  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1168  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1169  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1170  * ~~                                                                 ~~
1171  * __                                                                 __
1172  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1173  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1174  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1175  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1176  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1177  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1178  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1179  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1180  * ~~                                                                 ~~
1181  * __                                                                 __
1182  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1183  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1184  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1185  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1186  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1187  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1188  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1189  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1190  * ~~                                                                 ~~
1191  * __                                                                 __
1192  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1193  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1194  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1195  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1196  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1197  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1198  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1199  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1200  * ~~                                                                 ~~
1201  * __                                                                 __
1202  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1203  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1204  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1205  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1206  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1207  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1208  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1209  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1210  * ~~                                                                 ~~
1211  * __                                                                 __
1212  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1213  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1214  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1215  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1216  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1217  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1218  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1219  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1220  * ~~                                                                 ~~
1221  *                   __                               __
1222  *                   |  0   0   1   0   0   0   0   0  |
1223  *                   | 167 100  5   41 159 169 217 208 |
1224  *                   | 166 100  4   40 158 168 216 209 |
1225  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1226  *                   |  0   0   0   0   1   0   0   0  |
1227  *                   |  0   0   0   0   0   1   0   0  |
1228  *                   |  0   0   0   0   0   0   1   0  |
1229  *                   |  0   0   0   0   0   0   0   1  |
1230  *                   ~~                               ~~
1231  *
1232  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1233  * of the missing data.
1234  *
1235  * As is apparent from the example above, the only non-trivial rows in the
1236  * inverse matrix correspond to the data disks that we're trying to
1237  * reconstruct. Indeed, those are the only rows we need as the others would
1238  * only be useful for reconstructing data known or assumed to be valid. For
1239  * that reason, we only build the coefficients in the rows that correspond to
1240  * targeted columns.
1241  */
1242 /* END CSTYLED */
1243 
/*
 * Build the rows of the dispersal matrix V (see the comment above) that
 * correspond to the parity columns listed in map[].  map[i] is the parity
 * index m (0 for P, 1 for Q, 2 for R), and row i receives the coefficients
 *	[ 2^(m*(n-1)), ..., 2^(2m), 2^m, 2^0 ]
 * with all exponents taken mod 255, matching GF(2^8) exponent arithmetic.
 */
static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		/*
		 * Start from exponent m*n and count down by m per column.
		 * Since map[i] <= 2, m*n <= 510 and a single reduction by
		 * 255 is sufficient to bring the exponent into range.
		 */
		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
1273 
/*
 * Invert the rows of interest of the (V|I)' matrix via Gauss-Jordan
 * elimination in GF(2^8) (see the large comment above).  On entry, rows[]
 * holds the nmissing coefficient rows built by vdev_raidz_matrix_init();
 * on return, invrows[] holds the corresponding rows of the inverse.
 * used[] lists the n surviving columns supplying known values: the first
 * nmissing entries are parity columns, the rest are data columns.
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		/*
		 * Eliminate this row's pivot column from every other row
		 * (addition and subtraction are both XOR in GF(2^8)).
		 */
		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}
1367 
/*
 * Apply the inverse rows computed by vdev_raidz_matrix_invert() to the
 * surviving columns (used[]) to rebuild the missing data columns in place.
 * For each byte offset x, missing column j accumulates
 *	sum_i invrows[j][i] * used_column_i[x]
 * in GF(2^8): the product is done by adding logs mod 255 and looking up
 * pow2[], and the sum is XOR.
 */
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log = 0;
	uint8_t val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	/* Scratch space holding the inverse matrix entries in log form. */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	/*
	 * Precompute the logs of the inverse entries; the invert step
	 * guarantees the entries of interest are non-zero.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	/*
	 * Walk the surviving columns one at a time, folding each column's
	 * contribution into every missing column.
	 */
	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = abd_to_buf(rm->rm_col[c].rc_abd);
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				/* Short destination columns end early. */
				if (x >= dcount[cc])
					continue;

				/*
				 * Multiply *src by invrows[cc][i]: zero
				 * stays zero; otherwise add logs mod 255
				 * and look up the result.
				 */
				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/*
				 * The first surviving column initializes
				 * the accumulator; later ones XOR into it.
				 */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}
1443 
/*
 * Reconstruct an arbitrary set of missing columns using the matrix method
 * described in the large comment above.  Returns a bitmap of the parity
 * columns used (1 << parity index).
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	abd_t **bufs = NULL;

	int code = 0;

	/*
	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
	 * temporary linear ABDs.
	 */
	if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
		bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);

		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
			raidz_col_t *col = &rm->rm_col[c];

			bufs[c] = col->rc_abd;
			col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
			abd_copy(col->rc_abd, bufs[c], col->rc_size);
		}
	}

	/* n is the number of data columns. */
	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	/*
	 * One scratch allocation holds the coefficient rows, the inverse
	 * rows, and the used[] column list, carved up below.
	 */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	/* The first nmissing_rows used[] entries are the chosen parities. */
	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	/*
	 * Append the surviving data columns.  Note that i carries over from
	 * the loop above (i == nmissing_rows here) so used[] is filled
	 * contiguously: parity columns first, then data columns.
	 */
	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	/*
	 * copy back from temporary linear abds and free them
	 */
	if (bufs) {
		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
			raidz_col_t *col = &rm->rm_col[c];

			abd_copy(bufs[c], col->rc_abd, col->rc_size);
			abd_free(col->rc_abd);
			col->rc_abd = bufs[c];
		}
		kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
	}

	return (code);
}
1580 
/*
 * Top-level reconstruction dispatch.  t/nt is the caller's sorted list of
 * column indices to reconstruct; any additional columns with rc_error set
 * are added to the target list as well.  Uses the fast P/Q/PQ routines
 * when the failure pattern allows, falling back to the general matrix
 * method otherwise.  Returns the bitmap of parity columns used.
 */
static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	/*
	 * Merge the requested targets with any errored columns, keeping
	 * tgts[] sorted, while counting bad parity and bad data columns
	 * and recording which parity columns are intact.
	 */
	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	/*
	 * tgts[] is sorted and parity columns have the lowest indices, so
	 * the first nbadparity entries are parity and dt points at the
	 * first bad data column.
	 */
	dt = &tgts[nbadparity];

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	if (!vdev_raidz_default_to_general) {
		switch (nbaddata) {
		case 1:
			if (parity_valid[VDEV_RAIDZ_P])
				return (vdev_raidz_reconstruct_p(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_q(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 2);
			break;

		case 2:
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_P] &&
			    parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_pq(rm, dt, 2));

			ASSERT(rm->rm_firstdatacol > 2);

			break;
		}
	}

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}
1659 
1660 static int
1661 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1662     uint64_t *ashift)
1663 {
1664 	vdev_t *cvd;
1665 	uint64_t nparity = vd->vdev_nparity;
1666 	int c;
1667 	int lasterror = 0;
1668 	int numerrors = 0;
1669 
1670 	ASSERT(nparity > 0);
1671 
1672 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
1673 	    vd->vdev_children < nparity + 1) {
1674 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1675 		return (SET_ERROR(EINVAL));
1676 	}
1677 
1678 	vdev_open_children(vd);
1679 
1680 	for (c = 0; c < vd->vdev_children; c++) {
1681 		cvd = vd->vdev_child[c];
1682 
1683 		if (cvd->vdev_open_error != 0) {
1684 			lasterror = cvd->vdev_open_error;
1685 			numerrors++;
1686 			continue;
1687 		}
1688 
1689 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1690 		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1691 		*ashift = MAX(*ashift, cvd->vdev_ashift);
1692 	}
1693 
1694 	*asize *= vd->vdev_children;
1695 	*max_asize *= vd->vdev_children;
1696 
1697 	if (numerrors > nparity) {
1698 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1699 		return (lasterror);
1700 	}
1701 
1702 	return (0);
1703 }
1704 
1705 static void
1706 vdev_raidz_close(vdev_t *vd)
1707 {
1708 	int c;
1709 
1710 	for (c = 0; c < vd->vdev_children; c++)
1711 		vdev_close(vd->vdev_child[c]);
1712 }
1713 
1714 /*
1715  * Handle a read or write I/O to a RAID-Z dump device.
1716  *
1717  * The dump device is in a unique situation compared to other ZFS datasets:
1718  * writing to this device should be as simple and fast as possible.  In
1719  * addition, durability matters much less since the dump will be extracted
1720  * once the machine reboots.  For that reason, this function eschews parity for
1721  * performance and simplicity.  The dump device uses the checksum setting
1722  * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1723  * dataset.
1724  *
1725  * Blocks of size 128 KB have been preallocated for this volume.  I/Os less than
1726  * 128 KB will not fill an entire block; in addition, they may not be properly
1727  * aligned.  In that case, this function uses the preallocated 128 KB block and
1728  * omits reading or writing any "empty" portions of that block, as opposed to
1729  * allocating a fresh appropriately-sized block.
1730  *
1731  * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1732  *
1733  *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1734  *
1735  * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1736  * allocated which spans all five child vdevs.  8 KB of data would be written to
1737  * each of four vdevs, with the fifth containing the parity bits.
1738  *
1739  *       parity    data     data     data     data
1740  *     |   PP   |   XX   |   XX   |   XX   |   XX   |
1741  *         ^        ^        ^        ^        ^
1742  *         |        |        |        |        |
1743  *   8 KB parity    ------8 KB data blocks------
1744  *
1745  * However, when writing to the dump device, the behavior is different:
1746  *
1747  *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
1748  *
1749  * Unlike the normal RAID-Z case in which the block is allocated based on the
1750  * I/O size, reads and writes here always use a 128 KB logical I/O size.  If the
1751  * I/O size is less than 128 KB, only the actual portions of data are written.
1752  * In this example the data is written to the third data vdev since that vdev
1753  * contains the offset [64 KB, 96 KB).
1754  *
1755  *       parity    data     data     data     data
1756  *     |        |        |        |   XX   |        |
1757  *                                    ^
1758  *                                    |
1759  *                             32 KB data block
1760  *
1761  * As a result, an individual I/O may not span all child vdevs; moreover, a
1762  * small I/O may only operate on a single child vdev.
1763  *
1764  * Note that since there are no parity bits calculated or written, this format
1765  * remains the same no matter how many parity bits are used in a normal RAID-Z
1766  * stripe.  On a RAID-Z3 configuration with seven child vdevs, the example above
1767  * would look like:
1768  *
1769  *       parity   parity   parity    data     data     data     data
1770  *     |        |        |        |        |        |   XX   |        |
1771  *                                                      ^
1772  *                                                      |
1773  *                                               32 KB data block
1774  */
1775 int
1776 vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
1777     uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
1778 {
1779 	vdev_t *tvd = vd->vdev_top;
1780 	vdev_t *cvd;
1781 	raidz_map_t *rm;
1782 	raidz_col_t *rc;
1783 	int c, err = 0;
1784 
1785 	uint64_t start, end, colstart, colend;
1786 	uint64_t coloffset, colsize, colskip;
1787 
1788 	int flags = doread ? B_READ : B_WRITE;
1789 
1790 #ifdef	_KERNEL
1791 
1792 	/*
1793 	 * Don't write past the end of the block
1794 	 */
1795 	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
1796 
1797 	start = offset;
1798 	end = start + size;
1799 
1800 	/*
1801 	 * Allocate a RAID-Z map for this block.  Note that this block starts
1802 	 * from the "original" offset, this is, the offset of the extent which
1803 	 * contains the requisite offset of the data being read or written.
1804 	 *
1805 	 * Even if this I/O operation doesn't span the full block size, let's
1806 	 * treat the on-disk format as if the only blocks are the complete 128
1807 	 * KB size.
1808 	 */
1809 	abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
1810 	    SPA_OLD_MAXBLOCKSIZE);
1811 	rm = vdev_raidz_map_alloc(abd,
1812 	    SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
1813 	    vd->vdev_children, vd->vdev_nparity);
1814 
1815 	coloffset = origoffset;
1816 
1817 	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
1818 	    c++, coloffset += rc->rc_size) {
1819 		rc = &rm->rm_col[c];
1820 		cvd = vd->vdev_child[rc->rc_devidx];
1821 
1822 		/*
1823 		 * Find the start and end of this column in the RAID-Z map,
1824 		 * keeping in mind that the stated size and offset of the
1825 		 * operation may not fill the entire column for this vdev.
1826 		 *
1827 		 * If any portion of the data spans this column, issue the
1828 		 * appropriate operation to the vdev.
1829 		 */
1830 		if (coloffset + rc->rc_size <= start)
1831 			continue;
1832 		if (coloffset >= end)
1833 			continue;
1834 
1835 		colstart = MAX(coloffset, start);
1836 		colend = MIN(end, coloffset + rc->rc_size);
1837 		colsize = colend - colstart;
1838 		colskip = colstart - coloffset;
1839 
1840 		VERIFY3U(colsize, <=, rc->rc_size);
1841 		VERIFY3U(colskip, <=, rc->rc_size);
1842 
1843 		/*
1844 		 * Note that the child vdev will have a vdev label at the start
1845 		 * of its range of offsets, hence the need for
1846 		 * VDEV_LABEL_OFFSET().  See zio_vdev_child_io() for another
1847 		 * example of why this calculation is needed.
1848 		 */
1849 		if ((err = vdev_disk_physio(cvd,
1850 		    ((char *)rc->rc_abd) + colskip, colsize,
1851 		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
1852 		    flags, isdump)) != 0)
1853 			break;
1854 	}
1855 
1856 	vdev_raidz_map_free(rm);
1857 	abd_put(abd);
1858 #endif	/* KERNEL */
1859 
1860 	return (err);
1861 }
1862 
1863 static uint64_t
1864 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1865 {
1866 	uint64_t asize;
1867 	uint64_t ashift = vd->vdev_top->vdev_ashift;
1868 	uint64_t cols = vd->vdev_children;
1869 	uint64_t nparity = vd->vdev_nparity;
1870 
1871 	asize = ((psize - 1) >> ashift) + 1;
1872 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1873 	asize = roundup(asize, nparity + 1) << ashift;
1874 
1875 	return (asize);
1876 }
1877 
1878 static void
1879 vdev_raidz_child_done(zio_t *zio)
1880 {
1881 	raidz_col_t *rc = zio->io_private;
1882 
1883 	rc->rc_error = zio->io_error;
1884 	rc->rc_tried = 1;
1885 	rc->rc_skipped = 0;
1886 }
1887 
1888 /*
1889  * Start an IO operation on a RAIDZ VDev
1890  *
1891  * Outline:
1892  * - For write operations:
1893  *   1. Generate the parity data
1894  *   2. Create child zio write operations to each column's vdev, for both
1895  *      data and parity.
1896  *   3. If the column skips any sectors for padding, create optional dummy
1897  *      write zio children for those areas to improve aggregation continuity.
1898  * - For read operations:
1899  *   1. Create child zio read operations to each data column's vdev to read
1900  *      the range of data required for zio.
1901  *   2. If this is a scrub or resilver operation, or if any of the data
1902  *      vdevs have had errors, then create zio read operations to the parity
1903  *      columns' VDevs as well.
1904  */
static void
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, i;

	/*
	 * Lay this I/O out across the children: compute each column's
	 * target device, offset, and size for this vdev's geometry.
	 */
	rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
	    tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	/* The map is torn down via io_vsd_ops when the zio completes. */
	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_generate_parity(rm);

		/* Issue one child write per column, parity and data alike. */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		/*
		 * Generate optional I/Os for any skipped sectors to improve
		 * aggregation contiguity.
		 */
		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
			ASSERT(c <= rm->rm_scols);
			if (c == rm->rm_scols)
				c = 0;	/* wrap to the first column */
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset + rc->rc_size, NULL,
			    1 << tvd->vdev_ashift,
			    zio->io_type, zio->io_priority,
			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
		}

		zio_execute(zio);
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		if (!vdev_readable(cvd)) {
			/* Device is unreadable; count the missing column. */
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ENXIO);
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			/*
			 * The device is readable but its DTL says this txg's
			 * data may be missing; treat the column as stale.
			 */
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ESTALE);
			rc->rc_skipped = 1;
			continue;
		}
		/*
		 * Always read data columns.  Read parity columns only when
		 * data is already known missing, or when scrubbing or
		 * resilvering, in which case every column is wanted.
		 */
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}

	zio_execute(zio);
}
1996 
1997 
1998 /*
1999  * Report a checksum error for a child of a RAID-Z device.
2000  */
2001 static void
2002 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
2003 {
2004 	void *buf;
2005 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
2006 
2007 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2008 		zio_bad_cksum_t zbc;
2009 		raidz_map_t *rm = zio->io_vsd;
2010 
2011 		mutex_enter(&vd->vdev_stat_lock);
2012 		vd->vdev_stat.vs_checksum_errors++;
2013 		mutex_exit(&vd->vdev_stat_lock);
2014 
2015 		zbc.zbc_has_cksum = 0;
2016 		zbc.zbc_injected = rm->rm_ecksuminjected;
2017 
2018 		buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size);
2019 		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
2020 		    rc->rc_offset, rc->rc_size, buf, bad_data,
2021 		    &zbc);
2022 		abd_return_buf(rc->rc_abd, buf, rc->rc_size);
2023 	}
2024 }
2025 
2026 /*
2027  * We keep track of whether or not there were any injected errors, so that
2028  * any ereports we generate can note it.
2029  */
2030 static int
2031 raidz_checksum_verify(zio_t *zio)
2032 {
2033 	zio_bad_cksum_t zbc;
2034 	raidz_map_t *rm = zio->io_vsd;
2035 
2036 	int ret = zio_checksum_error(zio, &zbc);
2037 	if (ret != 0 && zbc.zbc_injected != 0)
2038 		rm->rm_ecksuminjected = 1;
2039 
2040 	return (ret);
2041 }
2042 
2043 /*
2044  * Generate the parity from the data columns. If we tried and were able to
2045  * read the parity without error, verify that the generated parity matches the
2046  * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
2048  */
static int
raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	blkptr_t *bp = zio->io_bp;
	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));

	/* Blocks checksummed without parity coverage cannot be verified. */
	if (checksum == ZIO_CHECKSUM_NOPARITY)
		return (ret);

	/*
	 * Stash a copy of each parity column that was actually read without
	 * error, so it can be compared against freshly generated parity.
	 */
	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zio_buf_alloc(rc->rc_size);
		abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size);
	}

	/* Regenerate parity in place from the (checksum-valid) data. */
	vdev_raidz_generate_parity(rm);

	/*
	 * Any mismatch between saved and regenerated parity means the child
	 * silently returned bad parity; report it and mark the column.
	 * Note: only the columns copied above are compared and freed here,
	 * keeping allocations and frees exactly paired.
	 */
	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (bcmp(orig[c], abd_to_buf(rc->rc_abd), rc->rc_size) != 0) {
			raidz_checksum_error(zio, rc, orig[c]);
			rc->rc_error = SET_ERROR(ECKSUM);
			ret++;
		}
		zio_buf_free(orig[c], rc->rc_size);
	}

	return (ret);
}
2087 
2088 /*
2089  * Keep statistics on all the ways that we used parity to correct data.
2090  */
2091 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
2092 
2093 static int
2094 vdev_raidz_worst_error(raidz_map_t *rm)
2095 {
2096 	int error = 0;
2097 
2098 	for (int c = 0; c < rm->rm_cols; c++)
2099 		error = zio_worst_error(error, rm->rm_col[c].rc_error);
2100 
2101 	return (error);
2102 }
2103 
2104 /*
2105  * Iterate over all combinations of bad data and attempt a reconstruction.
2106  * Note that the algorithm below is non-optimal because it doesn't take into
2107  * account how reconstruction is actually performed. For example, with
2108  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
2109  * is targeted as invalid as if columns 1 and 4 are targeted since in both
2110  * cases we'd only use parity information in column 0.
2111  */
static int
vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
{
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];	/* leaves room for tgts[-1] and tgts[n] */
	int current, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	/*
	 * n is the number of additional columns we pretend are bad on each
	 * pass, up to the parity still unaccounted for by known errors.
	 */
	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);

		current = 0;
		next = tgts[current];

		/* Enumerate every combination of n error-free columns. */
		while (current != n) {
			tgts[current] = next;
			current = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				abd_copy_to_buf(orig[i], rc->rc_abd,
				    rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					if (rc->rc_tried)
						raidz_checksum_error(zio, rc,
						    orig[i]);
					rc->rc_error = SET_ERROR(ECKSUM);
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				abd_copy_from_buf(rc->rc_abd, orig[i],
				    rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the current
				 * position.
				 */
				for (next = tgts[current] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[current + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[current + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[current - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[current] = c;
				current++;

			} while (current != n);
		}
	}
	/*
	 * All attempts failed; undo the final increment of the outer loop so
	 * that the cleanup below frees exactly the buffers allocated.
	 */
	n--;
done:
	for (i = 0; i < n; i++) {
		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}
2261 
2262 /*
2263  * Complete an IO operation on a RAIDZ VDev
2264  *
2265  * Outline:
2266  * - For write operations:
2267  *   1. Check for errors on the child IOs.
2268  *   2. Return, setting an error code if too few child VDevs were written
2269  *      to reconstruct the data later.  Note that partial writes are
2270  *      considered successful if they can be reconstructed at all.
2271  * - For read operations:
2272  *   1. Check for errors on the child IOs.
2273  *   2. If data errors occurred:
2274  *      a. Try to reassemble the data from the parity available.
2275  *      b. If we haven't yet read the parity drives, read them now.
2276  *      c. If all parity drives have been read but the data still doesn't
2277  *         reassemble with a correct checksum, then try combinatorial
2278  *         reconstruction.
2279  *      d. If that doesn't work, return an error.
2280  *   3. If there were unexpected errors or this is a resilver operation,
2281  *      rewrite the vdevs that had errors.
2282  */
static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	int unexpected_errors = 0;
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;
	int total_errors = 0;
	int n, c;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	/* Tally the outcome of each column's child I/O. */
	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as a success.
		 * (If we couldn't write enough columns to reconstruct
		 * the data, the I/O failed.  Otherwise, good enough.)
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		/* XXPOLICY */
		if (total_errors > rm->rm_firstdatacol)
			zio->io_error = vdev_raidz_worst_error(rm);

		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);
	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			/* Phase 1: the data we read checksums cleanly. */
			if (raidz_checksum_verify(zio) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read error or
	 * a child silently returned bad data. Read every block so we can
	 * try again with as much data and parity as we can track down. If
	 * we've already been through once before, all children will be marked
	 * as tried so we'll proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	/* Phase 2: reissue reads for every column not yet tried. */
	for (c = 0; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_tried)
			continue;

		zio_vdev_io_redone(zio);
		do {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_devidx],
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		} while (++c < rm->rm_cols);

		/* io_done will be called again once the reissued reads land */
		return;
	}

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		zio->io_error = vdev_raidz_worst_error(rm);

	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(zio, rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		zio->io_error = SET_ERROR(ECKSUM);

		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			for (c = 0; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error == 0) {
					zio_bad_cksum_t zbc;
					zbc.zbc_has_cksum = 0;
					zbc.zbc_injected =
					    rm->rm_ecksuminjected;

					zfs_ereport_start_checksum(
					    zio->io_spa,
					    vd->vdev_child[rc->rc_devidx],
					    zio, rc->rc_offset, rc->rc_size,
					    (void *)(uintptr_t)c, &zbc);
				}
			}
		}
	}

done:
	zio_checksum_verified(zio);

	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error == 0)
				continue;

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}
2546 
2547 static void
2548 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2549 {
2550 	if (faulted > vd->vdev_nparity)
2551 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2552 		    VDEV_AUX_NO_REPLICAS);
2553 	else if (degraded + faulted != 0)
2554 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2555 	else
2556 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2557 }
2558 
/*
 * Operation vector wiring the RAID-Z implementation into the vdev layer.
 */
vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	NULL,			/* optional callback, unused by raidz */
	NULL,			/* optional callback, unused by raidz */
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};
2571