xref: /illumos-gate/usr/src/uts/common/fs/zfs/vdev_raidz.c (revision d1aea6f139360e9e7f1504facb24f8521047b15c)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_file.h>
#include <sys/vdev_raidz.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>

/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all
 * elements can be expressed with a single byte. Briefly, the operations on
 * the field are defined as follows:
 *
 *   o addition (+) is represented by a bitwise XOR
 *   o subtraction (-) is therefore identical to addition: A + B = A - B
 *   o multiplication of A by 2 is defined by the following bitwise expression:
 *
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). The inverse of a field element A (A^-1) is therefore
 * A ^ (255 - 1) = A^254.
 *
 * The up-to-three parity columns, P, Q, and R, over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property which is why the uncorrected Plank method breaks down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */
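
/*
 * Illustrative sketch (not part of the driver): the multiply-by-2 rule
 * above generalizes to arbitrary GF(2^8) multiplication via the usual
 * shift-and-add loop. The helper name below is hypothetical and exists
 * only to make the field arithmetic concrete.
 */
static uint8_t
gf256_mul_example(uint8_t a, uint8_t b)
{
	uint8_t p = 0;

	while (b != 0) {
		if (b & 1)
			p ^= a;		/* field addition is XOR */
		/* a *= 2 in GF(2^8), reducing by the primitive polynomial */
		a = (a << 1) ^ ((a & 0x80) ? 0x1d : 0);
		b >>= 1;
	}
	return (p);
}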

typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	abd_t *rc_abd;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	abd_t *rm_abd_copy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
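
/*
 * Illustrative check (a user-space sketch, not used by the driver):
 * applying VDEV_RAIDZ_64MUL_2 to a packed 64-bit word is equivalent to
 * applying VDEV_RAIDZ_MUL_2 to each of its eight bytes independently.
 */
static boolean_t
vdev_raidz_64mul_check_example(uint64_t x)
{
	uint64_t packed = x, mask;

	VDEV_RAIDZ_64MUL_2(packed, mask);

	for (int i = 0; i < 8; i++) {
		uint8_t b = (x >> (i * 8)) & 0xff;

		if (((packed >> (i * 8)) & 0xff) !=
		    (uint8_t)VDEV_RAIDZ_MUL_2(b))
			return (B_FALSE);
	}
	return (B_TRUE);
}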

#define	VDEV_LABEL_OFFSET(x)	(x + VDEV_LABEL_START_SIZE)

/*
 * Force reconstruction to use the general purpose method.
 */
int vdev_raidz_default_to_general;

/* Powers of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/* Logs base 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

static void vdev_raidz_generate_parity(raidz_map_t *rm);

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}
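
/*
 * For illustration only (a sketch, not used elsewhere in this file):
 * general multiplication in the field follows from vdev_raidz_exp2()
 * and the log table, since A * B = 2^(log_2(A) + log_2(B)).
 */
static uint8_t
vdev_raidz_mul_example(uint8_t a, uint8_t b)
{
	if (a == 0 || b == 0)
		return (0);

	return (vdev_raidz_exp2(a, vdev_raidz_log2[b]));
}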

static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;
	size_t size;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		abd_free(rm->rm_col[c].rc_abd);

		if (rm->rm_col[c].rc_gdata != NULL)
			zio_buf_free(rm->rm_col[c].rc_gdata,
			    rm->rm_col[c].rc_size);
	}

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		abd_put(rm->rm_col[c].rc_abd);
		size += rm->rm_col[c].rc_size;
	}

	if (rm->rm_abd_copy != NULL)
		abd_free(rm->rm_abd_copy);

	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

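/*
 * The map is torn down only when both of its reference classes have
 * dropped: the issuing ZIO (tracked by rm_freed, below) and any
 * checksum reports that still point into it (tracked by rm_reports;
 * see vdev_raidz_cksum_free()).
 */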
static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	ASSERT0(rm->rm_freed);
	rm->rm_freed = 1;

	if (rm->rm_reports == 0)
		vdev_raidz_map_free(rm);
}

/*ARGSUSED*/
static void
vdev_raidz_cksum_free(void *arg, size_t ignored)
{
	raidz_map_t *rm = arg;

	ASSERT3U(rm->rm_reports, >, 0);

	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
		vdev_raidz_map_free(rm);
}

static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;
	size_t x;

	const char *good = NULL;
	char *bad;

	if (good_data == NULL) {
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO)
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
			char *buf;
			int offset;

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_abd;
				rm->rm_col[x].rc_gdata =
				    zio_buf_alloc(rm->rm_col[x].rc_size);
				rm->rm_col[x].rc_abd =
				    abd_get_from_buf(rm->rm_col[x].rc_gdata,
				    rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			buf = (char *)good_data;
			for (; x < rm->rm_cols; x++) {
				abd_put(rm->rm_col[x].rc_abd);
				rm->rm_col[x].rc_abd = abd_get_from_buf(buf,
				    rm->rm_col[x].rc_size);
				buf += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				abd_put(rm->rm_col[x].rc_abd);
				rm->rm_col[x].rc_abd = bad_parity[x];
			}

			offset = 0;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				abd_put(rm->rm_col[x].rc_abd);
				rm->rm_col[x].rc_abd = abd_get_offset(
				    rm->rm_abd_copy, offset);
				offset += rm->rm_col[x].rc_size;
			}
		}

		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = rm->rm_col[c].rc_gdata;
	} else {
		/* adjust good_data to point at the start of our column */
		good = good_data;

		for (x = rm->rm_firstdatacol; x < c; x++)
			good += rm->rm_col[x].rc_size;
	}

	bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size);
	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
	abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size);
}

/*
 * Invoked indirectly by zfs_ereport_start_checksum(), called
 * below when our read operation fails completely.  The main point
 * is to keep a copy of everything we read from disk, so that at
 * vdev_raidz_cksum_finish() time we can compare it with the good data.
 */
static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
	size_t c = (size_t)(uintptr_t)arg;
	size_t offset;

	raidz_map_t *rm = zio->io_vsd;
	size_t size;

	/* set up the report and bump the refcount  */
	zcr->zcr_cbdata = rm;
	zcr->zcr_cbinfo = c;
	zcr->zcr_finish = vdev_raidz_cksum_finish;
	zcr->zcr_free = vdev_raidz_cksum_free;

	rm->rm_reports++;
	ASSERT3U(rm->rm_reports, >, 0);

	if (rm->rm_abd_copy != NULL)
		return;

	/*
	 * It's the first time we're called for this raidz_map_t, so we need
	 * to copy the data aside; there's no guarantee that our zio's buffer
	 * won't be re-used for something else.
	 *
	 * Our parity data is already in separate buffers, so there's no need
	 * to copy them.
	 */

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	rm->rm_abd_copy =
	    abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size);

	for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		raidz_col_t *col = &rm->rm_col[c];
		abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset);

		abd_copy(tmp, col->rc_abd, col->rc_size);
		abd_put(col->rc_abd);
		col->rc_abd = tmp;

		offset += col->rc_size;
	}
	ASSERT3U(offset, ==, size);
}

static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};

/*
 * Divides the I/O evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 */
static raidz_map_t *
vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset,
    uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
	uint64_t off = 0;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_abd_copy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_abd = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_abd =
		    abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);

	rm->rm_col[c].rc_abd = abd_get_offset(abd, 0);
	off = rm->rm_col[c].rc_size;

	for (c = c + 1; c < acols; c++) {
		rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
		off += rm->rm_col[c].rc_size;
	}

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
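
/*
 * A worked instance of the geometry above (illustrative only): for a
 * 6-wide raidz2 (dcols = 6, nparity = 2) and an I/O of s = 11 sectors,
 * q = 11 / 4 = 2 and r = 3, so bc = r + nparity = 5 "big columns"
 * receive q + 1 = 3 sectors while the remaining column receives q = 2,
 * for tot = 11 + 2 * 3 = 17 sectors of data plus parity. Rounding the
 * allocation to a multiple of nparity + 1 = 3 then yields
 * rm_nskip = roundup(17, 3) - 17 = 1 padding sector.
 */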

struct pqr_struct {
	uint64_t *p;
	uint64_t *q;
	uint64_t *r;
};

static int
vdev_raidz_p_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && !pqr->q && !pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++)
		*pqr->p ^= *src;

	return (0);
}

static int
vdev_raidz_pq_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	uint64_t mask;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && pqr->q && !pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
		*pqr->p ^= *src;
		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
		*pqr->q ^= *src;
	}

	return (0);
}

static int
vdev_raidz_pqr_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	uint64_t mask;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && pqr->q && pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
		*pqr->p ^= *src;
		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
		*pqr->q ^= *src;
		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
		*pqr->r ^= *src;
	}

	return (0);
}

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p;
	int c;
	abd_t *src;

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);

		if (c == rm->rm_firstdatacol) {
			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
		} else {
			struct pqr_struct pqr = { p, NULL, NULL };
			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_p_func, &pqr);
		}
	}
}

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, pcnt, ccnt, mask, i;
	int c;
	abd_t *src;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);

		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);

		if (c == rm->rm_firstdatacol) {
			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
			(void) memcpy(q, p, rm->rm_col[c].rc_size);
		} else {
			struct pqr_struct pqr = { p, q, NULL };
			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_pq_func, &pqr);
		}

		if (c == rm->rm_firstdatacol) {
			for (i = ccnt; i < pcnt; i++) {
				p[i] = 0;
				q[i] = 0;
			}
		} else {
			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (i = ccnt; i < pcnt; i++) {
				VDEV_RAIDZ_64MUL_2(q[i], mask);
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
	int c;
	abd_t *src;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_abd;
		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
		r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);

		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);

		if (c == rm->rm_firstdatacol) {
			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
			(void) memcpy(q, p, rm->rm_col[c].rc_size);
			(void) memcpy(r, p, rm->rm_col[c].rc_size);
		} else {
			struct pqr_struct pqr = { p, q, r };
			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
			    vdev_raidz_pqr_func, &pqr);
		}

		if (c == rm->rm_firstdatacol) {
			for (i = ccnt; i < pcnt; i++) {
				p[i] = 0;
				q[i] = 0;
				r[i] = 0;
			}
		} else {
			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (i = ccnt; i < pcnt; i++) {
				VDEV_RAIDZ_64MUL_2(q[i], mask);
				VDEV_RAIDZ_64MUL_4(r[i], mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
	}
}
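
/*
 * Byte-level sketch of the P/Q construction above (illustrative only,
 * not a driver path): parity over equal-length byte buffers, using the
 * same Horner-style evaluation of Q as vdev_raidz_pq_func(). The
 * function name and flat-buffer interface are hypothetical.
 */
static void
raidz_pq_example(const uint8_t **data, int ncols, size_t len,
    uint8_t *p, uint8_t *q)
{
	for (size_t i = 0; i < len; i++) {
		uint8_t pv = 0, qv = 0;

		for (int c = 0; c < ncols; c++) {
			pv ^= data[c][i];
			/* Q = ((D_0 * 2 + D_1) * 2 + ...) * 2 + D_n-1 */
			qv = VDEV_RAIDZ_MUL_2(qv) ^ data[c][i];
		}
		p[i] = pv;
		q[i] = qv;
	}
}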

/* ARGSUSED */
static int
vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
{
	uint64_t *dst = dbuf;
	uint64_t *src = sbuf;
	int cnt = size / sizeof (src[0]);

	for (int i = 0; i < cnt; i++) {
		dst[i] ^= src[i];
	}

	return (0);
}

/* ARGSUSED */
static int
vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
    void *private)
{
	uint64_t *dst = dbuf;
	uint64_t *src = sbuf;
	uint64_t mask;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++, src++) {
		VDEV_RAIDZ_64MUL_2(*dst, mask);
		*dst ^= *src;
	}

	return (0);
}

/* ARGSUSED */
static int
vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
{
	uint64_t *dst = buf;
	uint64_t mask;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++) {
		/* same operation as vdev_raidz_reconst_q_pre_func() on dst */
		VDEV_RAIDZ_64MUL_2(*dst, mask);
	}

	return (0);
}

struct reconst_q_struct {
	uint64_t *q;
	int exp;
};

static int
vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
{
	struct reconst_q_struct *rq = private;
	uint64_t *dst = buf;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++, rq->q++) {
		*dst ^= *rq->q;

		int j;
		uint8_t *b;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, rq->exp);
		}
	}

	return (0);
}

struct reconst_pq_struct {
	uint8_t *p;
	uint8_t *q;
	uint8_t *pxy;
	uint8_t *qxy;
	int aexp;
	int bexp;
};

static int
vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
{
	struct reconst_pq_struct *rpq = private;
	uint8_t *xd = xbuf;
	uint8_t *yd = ybuf;

	for (int i = 0; i < size;
	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
		*yd = *rpq->p ^ *rpq->pxy ^ *xd;
	}

	return (0);
}

static int
vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
{
	struct reconst_pq_struct *rpq = private;
	uint8_t *xd = xbuf;

	for (int i = 0; i < size;
	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
		/* same operation as vdev_raidz_reconst_pq_func() on xd */
		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
	}

	return (0);
}

static int
vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
{
	int x = tgts[0];
	int c;
	abd_t *dst, *src;

	ASSERT(ntgts == 1);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(x < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
	ASSERT(rm->rm_col[x].rc_size > 0);

	src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
	dst = rm->rm_col[x].rc_abd;

	abd_copy(dst, src, rm->rm_col[x].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		uint64_t size = MIN(rm->rm_col[x].rc_size,
		    rm->rm_col[c].rc_size);

		src = rm->rm_col[c].rc_abd;
		dst = rm->rm_col[x].rc_abd;

		if (c == x)
			continue;

		(void) abd_iterate_func2(dst, src, 0, 0, size,
		    vdev_raidz_reconst_p_func, NULL);
	}

	return (1 << VDEV_RAIDZ_P);
}

static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	int x = tgts[0];
	int c, exp;
	abd_t *dst, *src;

	ASSERT(ntgts == 1);

	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
		    rm->rm_col[c].rc_size);

		src = rm->rm_col[c].rc_abd;
		dst = rm->rm_col[x].rc_abd;

		if (c == rm->rm_firstdatacol) {
			abd_copy(dst, src, size);
			if (rm->rm_col[x].rc_size > size)
				abd_zero_off(dst, size,
				    rm->rm_col[x].rc_size - size);
		} else {
			ASSERT3U(size, <=, rm->rm_col[x].rc_size);
			(void) abd_iterate_func2(dst, src, 0, 0, size,
			    vdev_raidz_reconst_q_pre_func, NULL);
			(void) abd_iterate_func(dst,
			    size, rm->rm_col[x].rc_size - size,
			    vdev_raidz_reconst_q_pre_tail_func, NULL);
		}
	}

	src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
	dst = rm->rm_col[x].rc_abd;
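	/*
	 * Undoing the column's coefficient 2^(ndevs - 1 - x) means
	 * multiplying by its inverse, which in a field with 255 nonzero
	 * elements is 2^(255 - (ndevs - 1 - x)).
	 */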
	exp = 255 - (rm->rm_cols - 1 - x);

	struct reconst_q_struct rq = { abd_to_buf(src), exp };
	(void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
	    vdev_raidz_reconst_q_post_func, &rq);

	return (1 << VDEV_RAIDZ_Q);
}

static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
	abd_t *pdata, *qdata;
	uint64_t xsize, ysize;
	int x = tgts[0];
	int y = tgts[1];
	abd_t *xd, *yd;

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_abd =
	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
	rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = abd_to_buf(pdata);
	q = abd_to_buf(qdata);
	pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
	qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
	xd = rm->rm_col[x].rc_abd;
	yd = rm->rm_col[y].rc_abd;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	ASSERT3U(xsize, >=, ysize);
	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
	    vdev_raidz_reconst_pq_func, &rpq);
	(void) abd_iterate_func(xd, ysize, xsize - ysize,
	    vdev_raidz_reconst_pq_tail_func, &rpq);

	abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
	abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
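
/*
 * Byte-level sketch of the solve above (illustrative only): given the
 * stored parity bytes p and q and the recomputed pxy and qxy for one
 * byte position, recover the missing bytes dx and dy. This mirrors the
 * coefficient setup in vdev_raidz_reconstruct_pq(); the function name
 * and flat-byte interface are hypothetical.
 */
static void
raidz_pq_solve_example(uint8_t p, uint8_t q, uint8_t pxy, uint8_t qxy,
    int x, int y, int ndevs, uint8_t *dx, uint8_t *dy)
{
	uint8_t a = vdev_raidz_pow2[255 + x - y];
	uint8_t b = vdev_raidz_pow2[255 - (ndevs - 1 - x)];
	uint8_t tmp = 255 - vdev_raidz_log2[a ^ 1];
	uint8_t aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	uint8_t bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	*dx = vdev_raidz_exp2(p ^ pxy, aexp) ^ vdev_raidz_exp2(q ^ qxy, bexp);
	*dy = p ^ pxy ^ *dx;
}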

/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity and speed of
 * computation as well as for linear separability.
 *
 *      __               __               __     __
 *      |   1   ..  1 1 1 |               |  p_0  |
 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
 *      |   :       : : : |   |   :   |   |  d_2  |
 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
 *      |   0   ..  0 0 1 |               | d_n-1 |
 *      ~~               ~~               ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
 *           |  19 205 116  29  64  16  4   1  |      / /
 *           |  1   0   0   0   0   0   0   0  |     / /
 *           |  0   1   0   0   0   0   0   0  | <--' /
 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
 *           |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           |  19 205 116  29  64  16  4   1  |
 *           |  1   0   0   0   0   0   0   0  |
 *  (V|I)' = |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 * __                                                                 __
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 *                   __                               __
 *                   |  0   0   1   0   0   0   0   0  |
 *                   | 167 100  5   41 159 169 217 208 |
 *                   | 166 100  4   40 158 168 216 209 |
 *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
 *                   |  0   0   0   0   1   0   0   0  |
 *                   |  0   0   0   0   0   1   0   0  |
 *                   |  0   0   0   0   0   0   1   0  |
 *                   |  0   0   0   0   0   0   0   1  |
 *                   ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */

static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}

static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data left in the rows forms the expected part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}
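
/*
 * A small worked instance of the elimination above (illustrative): with
 * n = 2 data columns, both missing, and parities P and Q surviving, the
 * rows of interest are [1 1] (from P) and [2 1] (from Q). Normalizing
 * and eliminating over GF(2^8) inverts [[1 1] [2 1]] to
 * [[0xf4 0xf4] [0xf5 0xf4]], since the determinant 1*1 + 1*2 = 3 has
 * inverse 2^(255 - 25) = 0xf4 and 2 * 0xf4 = 0xf5.
 */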

static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log = 0;
	uint8_t val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = abd_to_buf(rm->rm_col[c].rc_abd);
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}

static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	abd_t **bufs = NULL;

	int code = 0;

	/*
	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
	 * temporary linear ABDs.
	 */
	if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
		bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);

		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
			raidz_col_t *col = &rm->rm_col[c];

			bufs[c] = col->rc_abd;
			col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
			abd_copy(col->rc_abd, bufs[c], col->rc_size);
		}
	}

	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	/*
	 * copy back from temporary linear abds and free them
	 */
	if (bufs) {
		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
			raidz_col_t *col = &rm->rm_col[c];

			abd_copy(bufs[c], col->rc_abd, col->rc_size);
			abd_free(col->rc_abd);
			col->rc_abd = bufs[c];
		}
		kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
	}

	return (code);
}

static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	dt = &tgts[nbadparity];

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	if (!vdev_raidz_default_to_general) {
		switch (nbaddata) {
		case 1:
			if (parity_valid[VDEV_RAIDZ_P])
				return (vdev_raidz_reconstruct_p(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_q(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 2);
			break;

		case 2:
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_P] &&
			    parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_pq(rm, dt, 2));

			ASSERT(rm->rm_firstdatacol > 2);

			break;
		}
	}

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}

static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
    uint64_t *ashift)
{
	vdev_t *cvd;
	uint64_t nparity = vd->vdev_nparity;
	int c;
	int lasterror = 0;
	int numerrors = 0;

	ASSERT(nparity > 0);

	if (nparity > VDEV_RAIDZ_MAXPARITY ||
	    vd->vdev_children < nparity + 1) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	vdev_open_children(vd);

	for (c = 0; c < vd->vdev_children; c++) {
		cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error != 0) {
			lasterror = cvd->vdev_open_error;
			numerrors++;
			continue;
		}

1688 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1689 		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1690 		*ashift = MAX(*ashift, cvd->vdev_ashift);
1691 	}
1692 
1693 	*asize *= vd->vdev_children;
1694 	*max_asize *= vd->vdev_children;
1695 
1696 	if (numerrors > nparity) {
1697 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1698 		return (lasterror);
1699 	}
1700 
1701 	return (0);
1702 }
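
/*
 * A note on the MIN(x - 1, y - 1) + 1 idiom above, assuming the caller
 * seeds *asize and *max_asize with zero (as vdev_open() does): the
 * subtraction wraps zero around to UINT64_MAX, so the first healthy
 * child simply latches its own size.  The long-hand equivalent would
 * be:
 *
 *	if (*asize == 0 || cvd->vdev_asize < *asize)
 *		*asize = cvd->vdev_asize;
 */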
1703 
1704 static void
1705 vdev_raidz_close(vdev_t *vd)
1706 {
1707 	int c;
1708 
1709 	for (c = 0; c < vd->vdev_children; c++)
1710 		vdev_close(vd->vdev_child[c]);
1711 }
1712 
1713 /*
1714  * Handle a read or write I/O to a RAID-Z dump device.
1715  *
1716  * The dump device is in a unique situation compared to other ZFS datasets:
1717  * writing to this device should be as simple and fast as possible.  In
1718  * addition, durability matters much less since the dump will be extracted
1719  * once the machine reboots.  For that reason, this function eschews parity for
1720  * performance and simplicity.  The dump device uses the checksum setting
1721  * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1722  * dataset.
1723  *
1724  * Blocks of size 128 KB have been preallocated for this volume.  I/Os less than
1725  * 128 KB will not fill an entire block; in addition, they may not be properly
1726  * aligned.  In that case, this function uses the preallocated 128 KB block and
1727  * omits reading or writing any "empty" portions of that block, as opposed to
1728  * allocating a fresh appropriately-sized block.
1729  *
1730  * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1731  *
1732  *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1733  *
1734  * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1735  * allocated which spans all five child vdevs.  8 KB of data would be written to
1736  * each of four vdevs, with the fifth containing the parity bits.
1737  *
1738  *       parity    data     data     data     data
1739  *     |   PP   |   XX   |   XX   |   XX   |   XX   |
1740  *         ^        ^        ^        ^        ^
1741  *         |        |        |        |        |
1742  *   8 KB parity    ------8 KB data blocks------
1743  *
1744  * However, when writing to the dump device, the behavior is different:
1745  *
1746  *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
1747  *
1748  * Unlike the normal RAID-Z case in which the block is allocated based on the
1749  * I/O size, reads and writes here always use a 128 KB logical I/O size.  If the
1750  * I/O size is less than 128 KB, only the actual portions of data are written.
1751  * In this example the data is written to the third data vdev since that vdev
1752  * contains the offset [64 KB, 96 KB).
1753  *
1754  *       parity    data     data     data     data
1755  *     |        |        |        |   XX   |        |
1756  *                                    ^
1757  *                                    |
1758  *                             32 KB data block
1759  *
1760  * As a result, an individual I/O may not span all child vdevs; moreover, a
1761  * small I/O may only operate on a single child vdev.
1762  *
1763  * Note that since there are no parity bits calculated or written, this format
1764  * remains the same no matter how many parity bits are used in a normal RAID-Z
1765  * stripe.  On a RAID-Z3 configuration with seven child vdevs, the example above
1766  * would look like:
1767  *
1768  *       parity   parity   parity    data     data     data     data
1769  *     |        |        |        |        |        |   XX   |        |
1770  *                                                      ^
1771  *                                                      |
1772  *                                               32 KB data block
1773  */
1774 int
1775 vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
1776     uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
1777 {
1778 	vdev_t *tvd = vd->vdev_top;
1779 	vdev_t *cvd;
1780 	raidz_map_t *rm;
1781 	raidz_col_t *rc;
1782 	int c, err = 0;
1783 
1784 	uint64_t start, end, colstart, colend;
1785 	uint64_t coloffset, colsize, colskip;
1786 
1787 	int flags = doread ? B_READ : B_WRITE;
1788 
1789 #ifdef	_KERNEL
1790 
1791 	/*
1792 	 * Don't write past the end of the block
1793 	 */
1794 	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
1795 
1796 	start = offset;
1797 	end = start + size;
1798 
1799 	/*
1800 	 * Allocate a RAID-Z map for this block.  Note that this block starts
1801 	 * from the "original" offset, that is, the offset of the extent which
1802 	 * contains the requested offset of the data being read or written.
1803 	 *
1804 	 * Even if this I/O operation doesn't span the full block size, treat
1805 	 * the on-disk format as if the only blocks were complete 128 KB
1806 	 * blocks.
1807 	 */
1808 	abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
1809 	    SPA_OLD_MAXBLOCKSIZE);
1810 	rm = vdev_raidz_map_alloc(abd,
1811 	    SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
1812 	    vd->vdev_children, vd->vdev_nparity);
1813 
1814 	coloffset = origoffset;
1815 
1816 	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
1817 	    c++, coloffset += rc->rc_size) {
1818 		rc = &rm->rm_col[c];
1819 		cvd = vd->vdev_child[rc->rc_devidx];
1820 
1821 		/*
1822 		 * Find the start and end of this column in the RAID-Z map,
1823 		 * keeping in mind that the stated size and offset of the
1824 		 * operation may not fill the entire column for this vdev.
1825 		 *
1826 		 * If any portion of the data spans this column, issue the
1827 		 * appropriate operation to the vdev.
1828 		 */
1829 		if (coloffset + rc->rc_size <= start)
1830 			continue;
1831 		if (coloffset >= end)
1832 			continue;
1833 
1834 		colstart = MAX(coloffset, start);
1835 		colend = MIN(end, coloffset + rc->rc_size);
1836 		colsize = colend - colstart;
1837 		colskip = colstart - coloffset;
1838 
1839 		VERIFY3U(colsize, <=, rc->rc_size);
1840 		VERIFY3U(colskip, <=, rc->rc_size);
1841 
1842 		/*
1843 		 * Note that the child vdev will have a vdev label at the start
1844 		 * of its range of offsets, hence the need for
1845 		 * VDEV_LABEL_OFFSET().  See zio_vdev_child_io() for another
1846 		 * example of why this calculation is needed.
1847 		 */
1848 		if ((err = vdev_disk_physio(cvd,
1849 		    ((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize,
1850 		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
1851 		    flags, isdump)) != 0)
1852 			break;
1853 	}
1854 
1855 	vdev_raidz_map_free(rm);
1856 	abd_put(abd);
1857 #endif	/* _KERNEL */
1858 
1859 	return (err);
1860 }
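
/*
 * Working the clipping math for the 32 KB example in the comment above
 * (five children, four 32 KB data columns, and assuming the containing
 * extent starts at origoffset 0): start = 64 KB and end = 96 KB.  The
 * first two data columns fail the (coloffset + rc_size <= start) test,
 * the last fails (coloffset >= end), and only the third column issues
 * I/O, with colstart = 64 KB, colend = 96 KB, colsize = 32 KB, and
 * colskip = 0.
 */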
1861 
1862 static uint64_t
1863 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1864 {
1865 	uint64_t asize;
1866 	uint64_t ashift = vd->vdev_top->vdev_ashift;
1867 	uint64_t cols = vd->vdev_children;
1868 	uint64_t nparity = vd->vdev_nparity;
1869 
1870 	asize = ((psize - 1) >> ashift) + 1;
1871 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1872 	asize = roundup(asize, nparity + 1) << ashift;
1873 
1874 	return (asize);
1875 }
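
/*
 * A quick sanity check of the arithmetic above: for a raidz1 vdev with
 * five children and ashift = 9 (512-byte sectors), an 8 KB psize is
 * 16 data sectors striped over four data columns, which requires
 * ceil(16 / 4) = 4 parity sectors.  The 20-sector total is already a
 * multiple of nparity + 1 = 2, so asize = 20 << 9 = 10240 bytes.
 */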
1876 
1877 static void
1878 vdev_raidz_child_done(zio_t *zio)
1879 {
1880 	raidz_col_t *rc = zio->io_private;
1881 
1882 	rc->rc_error = zio->io_error;
1883 	rc->rc_tried = 1;
1884 	rc->rc_skipped = 0;
1885 }
1886 
1887 /*
1888  * Start an I/O operation on a RAID-Z vdev
1889  *
1890  * Outline:
1891  * - For write operations:
1892  *   1. Generate the parity data
1893  *   2. Create child zio write operations to each column's vdev, for both
1894  *      data and parity.
1895  *   3. If the column skips any sectors for padding, create optional dummy
1896  *      write zio children for those areas to improve aggregation contiguity.
1897  * - For read operations:
1898  *   1. Create child zio read operations to each data column's vdev to read
1899  *      the range of data required for zio.
1900  *   2. If this is a scrub or resilver operation, or if any of the data
1901  *      vdevs have had errors, then create zio read operations to the parity
1902  *      columns' vdevs as well.
1903  */
1904 static void
1905 vdev_raidz_io_start(zio_t *zio)
1906 {
1907 	vdev_t *vd = zio->io_vd;
1908 	vdev_t *tvd = vd->vdev_top;
1909 	vdev_t *cvd;
1910 	raidz_map_t *rm;
1911 	raidz_col_t *rc;
1912 	int c, i;
1913 
1914 	rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
1915 	    tvd->vdev_ashift, vd->vdev_children,
1916 	    vd->vdev_nparity);
1917 
1918 	zio->io_vsd = rm;
1919 	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
1920 
1921 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1922 
1923 	if (zio->io_type == ZIO_TYPE_WRITE) {
1924 		vdev_raidz_generate_parity(rm);
1925 
1926 		for (c = 0; c < rm->rm_cols; c++) {
1927 			rc = &rm->rm_col[c];
1928 			cvd = vd->vdev_child[rc->rc_devidx];
1929 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1930 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
1931 			    zio->io_type, zio->io_priority, 0,
1932 			    vdev_raidz_child_done, rc));
1933 		}
1934 
1935 		/*
1936 		 * Generate optional I/Os for any skipped sectors to improve
1937 		 * aggregation contiguity.
1938 		 */
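		/*
		 * For instance, with rm_scols == 5, rm_skipstart == 4, and
		 * rm_nskip == 2, the loop below pads column 4 and then wraps
		 * around to pad column 0, issuing one ashift-sized, data-less
		 * optional child I/O just past each column's real data.
		 */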
1939 		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1940 			ASSERT(c <= rm->rm_scols);
1941 			if (c == rm->rm_scols)
1942 				c = 0;
1943 			rc = &rm->rm_col[c];
1944 			cvd = vd->vdev_child[rc->rc_devidx];
1945 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1946 			    rc->rc_offset + rc->rc_size, NULL,
1947 			    1 << tvd->vdev_ashift,
1948 			    zio->io_type, zio->io_priority,
1949 			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1950 		}
1951 
1952 		zio_execute(zio);
1953 		return;
1954 	}
1955 
1956 	ASSERT(zio->io_type == ZIO_TYPE_READ);
1957 
1958 	/*
1959 	 * Iterate over the columns in reverse order so that we hit the parity
1960 	 * last -- any errors along the way will force us to read the parity.
1961 	 */
1962 	for (c = rm->rm_cols - 1; c >= 0; c--) {
1963 		rc = &rm->rm_col[c];
1964 		cvd = vd->vdev_child[rc->rc_devidx];
1965 		if (!vdev_readable(cvd)) {
1966 			if (c >= rm->rm_firstdatacol)
1967 				rm->rm_missingdata++;
1968 			else
1969 				rm->rm_missingparity++;
1970 			rc->rc_error = SET_ERROR(ENXIO);
1971 			rc->rc_tried = 1;	/* don't even try */
1972 			rc->rc_skipped = 1;
1973 			continue;
1974 		}
1975 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1976 			if (c >= rm->rm_firstdatacol)
1977 				rm->rm_missingdata++;
1978 			else
1979 				rm->rm_missingparity++;
1980 			rc->rc_error = SET_ERROR(ESTALE);
1981 			rc->rc_skipped = 1;
1982 			continue;
1983 		}
1984 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
1985 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1986 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1987 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
1988 			    zio->io_type, zio->io_priority, 0,
1989 			    vdev_raidz_child_done, rc));
1990 		}
1991 	}
1992 
1993 	zio_execute(zio);
1994 }
1995 
1996 
1997 /*
1998  * Report a checksum error for a child of a RAID-Z device.
1999  */
2000 static void
2001 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
2002 {
2003 	void *buf;
2004 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
2005 
2006 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2007 		zio_bad_cksum_t zbc;
2008 		raidz_map_t *rm = zio->io_vsd;
2009 
2010 		mutex_enter(&vd->vdev_stat_lock);
2011 		vd->vdev_stat.vs_checksum_errors++;
2012 		mutex_exit(&vd->vdev_stat_lock);
2013 
2014 		zbc.zbc_has_cksum = 0;
2015 		zbc.zbc_injected = rm->rm_ecksuminjected;
2016 
2017 		buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size);
2018 		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
2019 		    rc->rc_offset, rc->rc_size, buf, bad_data,
2020 		    &zbc);
2021 		abd_return_buf(rc->rc_abd, buf, rc->rc_size);
2022 	}
2023 }
2024 
2025 /*
2026  * We keep track of whether or not there were any injected errors, so that
2027  * any ereports we generate can note it.
2028  */
2029 static int
2030 raidz_checksum_verify(zio_t *zio)
2031 {
2032 	zio_bad_cksum_t zbc;
2033 	raidz_map_t *rm = zio->io_vsd;
2034 
2035 	int ret = zio_checksum_error(zio, &zbc);
2036 	if (ret != 0 && zbc.zbc_injected != 0)
2037 		rm->rm_ecksuminjected = 1;
2038 
2039 	return (ret);
2040 }
2041 
2042 /*
2043  * Generate the parity from the data columns. If we tried and were able to
2044  * read the parity without error, verify that the generated parity matches the
2045  * data we read. If it doesn't, we fire off a checksum error. Return the
2046  * number of such failures.
2047  */
2048 static int
2049 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
2050 {
2051 	void *orig[VDEV_RAIDZ_MAXPARITY];
2052 	int c, ret = 0;
2053 	raidz_col_t *rc;
2054 
2055 	blkptr_t *bp = zio->io_bp;
2056 	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
2057 	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
2058 
2059 	if (checksum == ZIO_CHECKSUM_NOPARITY)
2060 		return (ret);
2061 
2062 	for (c = 0; c < rm->rm_firstdatacol; c++) {
2063 		rc = &rm->rm_col[c];
2064 		if (!rc->rc_tried || rc->rc_error != 0)
2065 			continue;
2066 		orig[c] = zio_buf_alloc(rc->rc_size);
2067 		abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size);
2068 	}
2069 
2070 	vdev_raidz_generate_parity(rm);
2071 
2072 	for (c = 0; c < rm->rm_firstdatacol; c++) {
2073 		rc = &rm->rm_col[c];
2074 		if (!rc->rc_tried || rc->rc_error != 0)
2075 			continue;
2076 		if (abd_cmp_buf(rc->rc_abd, orig[c], rc->rc_size) != 0) {
2077 			raidz_checksum_error(zio, rc, orig[c]);
2078 			rc->rc_error = SET_ERROR(ECKSUM);
2079 			ret++;
2080 		}
2081 		zio_buf_free(orig[c], rc->rc_size);
2082 	}
2083 
2084 	return (ret);
2085 }
2086 
2087 /*
2088  * Keep statistics on all the ways that we used parity to correct data.
2089  */
2090 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
2091 
2092 static int
2093 vdev_raidz_worst_error(raidz_map_t *rm)
2094 {
2095 	int error = 0;
2096 
2097 	for (int c = 0; c < rm->rm_cols; c++)
2098 		error = zio_worst_error(error, rm->rm_col[c].rc_error);
2099 
2100 	return (error);
2101 }
2102 
2103 /*
2104  * Iterate over all combinations of bad data and attempt a reconstruction.
2105  * Note that the algorithm below is non-optimal because it doesn't take into
2106  * account how reconstruction is actually performed. For example, with
2107  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
2108  * is targeted as invalid as if columns 1 and 4 are targeted, since in both
2109  * cases we'd only use parity information in column 0.
2110  */
2111 static int
2112 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
2113 {
2114 	raidz_map_t *rm = zio->io_vsd;
2115 	raidz_col_t *rc;
2116 	void *orig[VDEV_RAIDZ_MAXPARITY];
2117 	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
2118 	int *tgts = &tstore[1];
2119 	int current, next, i, c, n;
2120 	int code, ret = 0;
2121 
2122 	ASSERT(total_errors < rm->rm_firstdatacol);
2123 
2124 	/*
2125 	 * This simplifies one edge condition.
2126 	 */
2127 	tgts[-1] = -1;
2128 
2129 	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
2130 		/*
2131 		 * Initialize the targets array by finding the first n columns
2132 		 * that contain no error.
2133 		 *
2134 		 * If there were no data errors, we need to ensure that we're
2135 		 * always explicitly attempting to reconstruct at least one
2136 		 * data column. To do this, we simply push the highest target
2137 		 * up into the data columns.
2138 		 */
2139 		for (c = 0, i = 0; i < n; i++) {
2140 			if (i == n - 1 && data_errors == 0 &&
2141 			    c < rm->rm_firstdatacol) {
2142 				c = rm->rm_firstdatacol;
2143 			}
2144 
2145 			while (rm->rm_col[c].rc_error != 0) {
2146 				c++;
2147 				ASSERT3S(c, <, rm->rm_cols);
2148 			}
2149 
2150 			tgts[i] = c++;
2151 		}
2152 
2153 		/*
2154 		 * Setting tgts[n] simplifies the other edge condition.
2155 		 */
2156 		tgts[n] = rm->rm_cols;
2157 
2158 		/*
2159 		 * These buffers were allocated in previous iterations.
2160 		 */
2161 		for (i = 0; i < n - 1; i++) {
2162 			ASSERT(orig[i] != NULL);
2163 		}
2164 
2165 		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
2166 
2167 		current = 0;
2168 		next = tgts[current];
2169 
2170 		while (current != n) {
2171 			tgts[current] = next;
2172 			current = 0;
2173 
2174 			/*
2175 			 * Save off the original data that we're going to
2176 			 * attempt to reconstruct.
2177 			 */
2178 			for (i = 0; i < n; i++) {
2179 				ASSERT(orig[i] != NULL);
2180 				c = tgts[i];
2181 				ASSERT3S(c, >=, 0);
2182 				ASSERT3S(c, <, rm->rm_cols);
2183 				rc = &rm->rm_col[c];
2184 				abd_copy_to_buf(orig[i], rc->rc_abd,
2185 				    rc->rc_size);
2186 			}
2187 
2188 			/*
2189 			 * Attempt a reconstruction and exit the outer loop on
2190 			 * success.
2191 			 */
2192 			code = vdev_raidz_reconstruct(rm, tgts, n);
2193 			if (raidz_checksum_verify(zio) == 0) {
2194 				atomic_inc_64(&raidz_corrected[code]);
2195 
2196 				for (i = 0; i < n; i++) {
2197 					c = tgts[i];
2198 					rc = &rm->rm_col[c];
2199 					ASSERT(rc->rc_error == 0);
2200 					if (rc->rc_tried)
2201 						raidz_checksum_error(zio, rc,
2202 						    orig[i]);
2203 					rc->rc_error = SET_ERROR(ECKSUM);
2204 				}
2205 
2206 				ret = code;
2207 				goto done;
2208 			}
2209 
2210 			/*
2211 			 * Restore the original data.
2212 			 */
2213 			for (i = 0; i < n; i++) {
2214 				c = tgts[i];
2215 				rc = &rm->rm_col[c];
2216 				abd_copy_from_buf(rc->rc_abd, orig[i],
2217 				    rc->rc_size);
2218 			}
2219 
2220 			do {
2221 				/*
2222 				 * Find the next valid column after the current
2223 				 * position.
2224 				 */
2225 				for (next = tgts[current] + 1;
2226 				    next < rm->rm_cols &&
2227 				    rm->rm_col[next].rc_error != 0; next++)
2228 					continue;
2229 
2230 				ASSERT(next <= tgts[current + 1]);
2231 
2232 				/*
2233 				 * If that spot is available, we're done here.
2234 				 */
2235 				if (next != tgts[current + 1])
2236 					break;
2237 
2238 				/*
2239 				 * Otherwise, find the next valid column after
2240 				 * the previous position.
2241 				 */
2242 				for (c = tgts[current - 1] + 1;
2243 				    rm->rm_col[c].rc_error != 0; c++)
2244 					continue;
2245 
2246 				tgts[current] = c;
2247 				current++;
2248 
2249 			} while (current != n);
2250 		}
2251 	}
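	/*
	 * The loop above exits with n one past the last combination size
	 * attempted; step back so the cleanup below frees exactly the
	 * orig[] buffers that were allocated.
	 */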
2252 	n--;
2253 done:
2254 	for (i = 0; i < n; i++) {
2255 		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
2256 	}
2257 
2258 	return (ret);
2259 }
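
/*
 * The target-advancing do/while loop above enumerates k-combinations of
 * the non-errored columns.  For reference, a minimal standalone sketch
 * of the same enumeration (enumerate_combinations() is a hypothetical
 * illustration, not a helper used by this file, and it visits the
 * combinations in a different order than the loop above):
 *
 *	static void
 *	enumerate_combinations(const int *cols, int ncols, int k)
 *	{
 *		int i, j, idx[VDEV_RAIDZ_MAXPARITY + 2];
 *
 *		for (i = 0; i < k; i++)
 *			idx[i] = i;			// first combination
 *		for (;;) {
 *			// visit cols[idx[0]] .. cols[idx[k - 1]] here
 *			for (i = k - 1; i >= 0; i--)
 *				if (idx[i] != ncols - k + i)
 *					break;		// idx[i] can advance
 *			if (i < 0)
 *				break;			// all combinations seen
 *			idx[i]++;
 *			for (j = i + 1; j < k; j++)
 *				idx[j] = idx[j - 1] + 1;
 *		}
 *	}
 */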
2260 
2261 /*
2262  * Complete an I/O operation on a RAID-Z vdev
2263  *
2264  * Outline:
2265  * - For write operations:
2266  *   1. Check for errors on the child IOs.
2267  *   2. Return, setting an error code if too few child vdevs were written
2268  *      to reconstruct the data later.  Note that partial writes are
2269  *      considered successful if they can be reconstructed at all.
2270  * - For read operations:
2271  *   1. Check for errors on the child IOs.
2272  *   2. If data errors occurred:
2273  *      a. Try to reassemble the data from the parity available.
2274  *      b. If we haven't yet read the parity drives, read them now.
2275  *      c. If all parity drives have been read but the data still doesn't
2276  *         reassemble with a correct checksum, then try combinatorial
2277  *         reconstruction.
2278  *      d. If that doesn't work, return an error.
2279  *   3. If there were unexpected errors or this is a resilver operation,
2280  *      rewrite the vdevs that had errors.
2281  */
2282 static void
2283 vdev_raidz_io_done(zio_t *zio)
2284 {
2285 	vdev_t *vd = zio->io_vd;
2286 	vdev_t *cvd;
2287 	raidz_map_t *rm = zio->io_vsd;
2288 	raidz_col_t *rc;
2289 	int unexpected_errors = 0;
2290 	int parity_errors = 0;
2291 	int parity_untried = 0;
2292 	int data_errors = 0;
2293 	int total_errors = 0;
2294 	int n, c;
2295 	int tgts[VDEV_RAIDZ_MAXPARITY];
2296 	int code;
2297 
2298 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
2299 
2300 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
2301 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
2302 
2303 	for (c = 0; c < rm->rm_cols; c++) {
2304 		rc = &rm->rm_col[c];
2305 
2306 		if (rc->rc_error) {
2307 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
2308 
2309 			if (c < rm->rm_firstdatacol)
2310 				parity_errors++;
2311 			else
2312 				data_errors++;
2313 
2314 			if (!rc->rc_skipped)
2315 				unexpected_errors++;
2316 
2317 			total_errors++;
2318 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
2319 			parity_untried++;
2320 		}
2321 	}
2322 
2323 	if (zio->io_type == ZIO_TYPE_WRITE) {
2324 		/*
2325 		 * XXX -- for now, treat partial writes as a success.
2326 		 * (If we couldn't write enough columns to reconstruct
2327 		 * the data, the I/O failed.  Otherwise, good enough.)
2328 		 *
2329 		 * Now that we support write reallocation, it would be better
2330 		 * to treat partial failure as real failure unless there are
2331 		 * no non-degraded top-level vdevs left, and not update DTLs
2332 		 * if we intend to reallocate.
2333 		 */
2334 		/* XXPOLICY */
2335 		if (total_errors > rm->rm_firstdatacol)
2336 			zio->io_error = vdev_raidz_worst_error(rm);
2337 
2338 		return;
2339 	}
2340 
2341 	ASSERT(zio->io_type == ZIO_TYPE_READ);
2342 	/*
2343 	 * There are three potential phases for a read:
2344 	 *	1. produce valid data from the columns read
2345 	 *	2. read all disks and try again
2346 	 *	3. perform combinatorial reconstruction
2347 	 *
2348 	 * Each phase is progressively both more expensive and less likely to
2349 	 * occur. If we encounter more errors than we can repair or all phases
2350 	 * fail, we have no choice but to return an error.
2351 	 */
2352 
2353 	/*
2354 	 * If the number of errors we saw was correctable -- less than or equal
2355 	 * to the number of parity disks read -- attempt to produce data that
2356 	 * has a valid checksum. Naturally, this case applies in the absence of
2357 	 * any errors.
2358 	 */
2359 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
2360 		if (data_errors == 0) {
2361 			if (raidz_checksum_verify(zio) == 0) {
2362 				/*
2363 				 * If we read parity information (unnecessarily
2364 				 * as it happens since no reconstruction was
2365 				 * needed), regenerate and verify the parity.
2366 				 * We also regenerate parity when resilvering
2367 				 * so we can write it out to the failed device
2368 				 * later.
2369 				 */
2370 				if (parity_errors + parity_untried <
2371 				    rm->rm_firstdatacol ||
2372 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2373 					n = raidz_parity_verify(zio, rm);
2374 					unexpected_errors += n;
2375 					ASSERT(parity_errors + n <=
2376 					    rm->rm_firstdatacol);
2377 				}
2378 				goto done;
2379 			}
2380 		} else {
2381 			/*
2382 			 * We either attempt to read all the parity columns or
2383 			 * none of them. If we didn't try to read parity, we
2384 			 * wouldn't be here in the correctable case. There must
2385 			 * also have been fewer parity errors than parity
2386 			 * columns or, again, we wouldn't be in this code path.
2387 			 */
2388 			ASSERT(parity_untried == 0);
2389 			ASSERT(parity_errors < rm->rm_firstdatacol);
2390 
2391 			/*
2392 			 * Identify the data columns that reported an error.
2393 			 */
2394 			n = 0;
2395 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
2396 				rc = &rm->rm_col[c];
2397 				if (rc->rc_error != 0) {
2398 					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2399 					tgts[n++] = c;
2400 				}
2401 			}
2402 
2403 			ASSERT(rm->rm_firstdatacol >= n);
2404 
2405 			code = vdev_raidz_reconstruct(rm, tgts, n);
2406 
2407 			if (raidz_checksum_verify(zio) == 0) {
2408 				atomic_inc_64(&raidz_corrected[code]);
2409 
2410 				/*
2411 				 * If we read more parity disks than were used
2412 				 * for reconstruction, confirm that the other
2413 				 * parity disks produced correct data. This
2414 				 * routine is suboptimal in that it regenerates
2415 				 * the parity that we already used in addition
2416 				 * to the parity that we're attempting to
2417 				 * verify, but this should be a relatively
2418 				 * uncommon case, and can be optimized if it
2419 				 * becomes a problem. Note that we regenerate
2420 				 * parity when resilvering so we can write it
2421 				 * out to failed devices later.
2422 				 */
2423 				if (parity_errors < rm->rm_firstdatacol - n ||
2424 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2425 					n = raidz_parity_verify(zio, rm);
2426 					unexpected_errors += n;
2427 					ASSERT(parity_errors + n <=
2428 					    rm->rm_firstdatacol);
2429 				}
2430 
2431 				goto done;
2432 			}
2433 		}
2434 	}
2435 
2436 	/*
2437 	 * This isn't a typical situation -- either we got a read error or
2438 	 * a child silently returned bad data. Read every block so we can
2439 	 * try again with as much data and parity as we can track down. If
2440 	 * we've already been through once before, all children will be marked
2441 	 * as tried so we'll proceed to combinatorial reconstruction.
2442 	 */
2443 	unexpected_errors = 1;
2444 	rm->rm_missingdata = 0;
2445 	rm->rm_missingparity = 0;
2446 
2447 	for (c = 0; c < rm->rm_cols; c++) {
2448 		if (rm->rm_col[c].rc_tried)
2449 			continue;
2450 
2451 		zio_vdev_io_redone(zio);
2452 		do {
2453 			rc = &rm->rm_col[c];
2454 			if (rc->rc_tried)
2455 				continue;
2456 			zio_nowait(zio_vdev_child_io(zio, NULL,
2457 			    vd->vdev_child[rc->rc_devidx],
2458 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
2459 			    zio->io_type, zio->io_priority, 0,
2460 			    vdev_raidz_child_done, rc));
2461 		} while (++c < rm->rm_cols);
2462 
2463 		return;
2464 	}
2465 
2466 	/*
2467 	 * At this point we've attempted to reconstruct the data given the
2468 	 * errors we detected, and we've attempted to read all columns. There
2469 	 * must, therefore, be one or more additional problems -- silent errors
2470 	 * resulting in invalid data rather than explicit I/O errors resulting
2471 	 * in absent data. We check if there is enough additional data to
2472 	 * possibly reconstruct the data and then perform combinatorial
2473 	 * reconstruction over all possible combinations. If that fails,
2474 	 * we're cooked.
2475 	 */
2476 	if (total_errors > rm->rm_firstdatacol) {
2477 		zio->io_error = vdev_raidz_worst_error(rm);
2478 
2479 	} else if (total_errors < rm->rm_firstdatacol &&
2480 	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2481 		/*
2482 		 * If we didn't use all the available parity for the
2483 		 * combinatorial reconstruction, verify that the remaining
2484 		 * parity is correct.
2485 		 */
2486 		if (code != (1 << rm->rm_firstdatacol) - 1)
2487 			(void) raidz_parity_verify(zio, rm);
2488 	} else {
2489 		/*
2490 		 * We're here because either:
2491 		 *
2492 		 *	total_errors == rm_firstdatacol, or
2493 		 *	vdev_raidz_combrec() failed
2494 		 *
2495 		 * In either case, there is enough bad data to prevent
2496 		 * reconstruction.
2497 		 *
2498 		 * Start checksum ereports for all children which haven't
2499 		 * failed, provided the I/O wasn't speculative.
2500 		 */
2501 		zio->io_error = SET_ERROR(ECKSUM);
2502 
2503 		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2504 			for (c = 0; c < rm->rm_cols; c++) {
2505 				rc = &rm->rm_col[c];
2506 				if (rc->rc_error == 0) {
2507 					zio_bad_cksum_t zbc;
2508 					zbc.zbc_has_cksum = 0;
2509 					zbc.zbc_injected =
2510 					    rm->rm_ecksuminjected;
2511 
2512 					zfs_ereport_start_checksum(
2513 					    zio->io_spa,
2514 					    vd->vdev_child[rc->rc_devidx],
2515 					    zio, rc->rc_offset, rc->rc_size,
2516 					    (void *)(uintptr_t)c, &zbc);
2517 				}
2518 			}
2519 		}
2520 	}
2521 
2522 done:
2523 	zio_checksum_verified(zio);
2524 
2525 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2526 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2527 		/*
2528 		 * Use the good data we have in hand to repair damaged children.
2529 		 */
2530 		for (c = 0; c < rm->rm_cols; c++) {
2531 			rc = &rm->rm_col[c];
2532 			cvd = vd->vdev_child[rc->rc_devidx];
2533 
2534 			if (rc->rc_error == 0)
2535 				continue;
2536 
2537 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2538 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
2539 			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2540 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2541 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2542 		}
2543 	}
2544 }
2545 
2546 static void
2547 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2548 {
2549 	if (faulted > vd->vdev_nparity)
2550 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2551 		    VDEV_AUX_NO_REPLICAS);
2552 	else if (degraded + faulted != 0)
2553 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2554 	else
2555 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2556 }
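
/*
 * For example, a raidz2 vdev (vdev_nparity == 2) with two faulted
 * children can still reconstruct every block and is merely marked
 * DEGRADED above; a third faulted child exceeds the available parity
 * and the vdev transitions to CANT_OPEN with VDEV_AUX_NO_REPLICAS.
 */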
2557 
2558 vdev_ops_t vdev_raidz_ops = {
2559 	vdev_raidz_open,
2560 	vdev_raidz_close,
2561 	vdev_raidz_asize,
2562 	vdev_raidz_io_start,
2563 	vdev_raidz_io_done,
2564 	vdev_raidz_state_change,
2565 	NULL,
2566 	NULL,
2567 	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
2568 	B_FALSE			/* not a leaf vdev */
2569 };
2570