xref: /illumos-gate/usr/src/uts/common/fs/zfs/vdev_raidz.c (revision c5806743f70246f7f708e57514b9103a6291d629)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
25  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
26  * Copyright (c) 2014 Integros [integros.com]
27  */
28 
29 #include <sys/zfs_context.h>
30 #include <sys/spa.h>
31 #include <sys/vdev_impl.h>
32 #include <sys/vdev_disk.h>
33 #include <sys/vdev_file.h>
34 #include <sys/vdev_raidz.h>
35 #include <sys/zio.h>
36 #include <sys/zio_checksum.h>
37 #include <sys/abd.h>
38 #include <sys/fs/zfs.h>
39 #include <sys/fm/fs/zfs.h>
40 
41 #ifdef ZFS_DEBUG
42 #include <sys/vdev_initialize.h>	/* vdev_xlate testing */
43 #endif
44 
45 /*
46  * Virtual device vector for RAID-Z.
47  *
48  * This vdev supports single, double, and triple parity. For single parity,
49  * we use a simple XOR of all the data columns. For double or triple parity,
50  * we use a special case of Reed-Solomon coding. This extends the
51  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
52  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
53  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
54  * former is also based. The latter is designed to provide higher performance
55  * for writes.
56  *
57  * Note that the Plank paper claimed to support arbitrary N+M, but was then
58  * amended six years later identifying a critical flaw that invalidates its
59  * claims. Nevertheless, the technique can be adapted to work for up to
60  * triple parity. For additional parity, the amendment "Note: Correction to
61  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
62  * is viable, but the additional complexity means that write performance will
63  * suffer.
64  *
 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all
 * elements can be expressed with a single byte. Briefly, the operations on
 * the field are defined as follows:
69  *
70  *   o addition (+) is represented by a bitwise XOR
71  *   o subtraction (-) is therefore identical to addition: A + B = A - B
72  *   o multiplication of A by 2 is defined by the following bitwise expression:
73  *
74  *	(A * 2)_7 = A_6
75  *	(A * 2)_6 = A_5
76  *	(A * 2)_5 = A_4
77  *	(A * 2)_4 = A_3 + A_7
78  *	(A * 2)_3 = A_2 + A_7
79  *	(A * 2)_2 = A_1 + A_7
80  *	(A * 2)_1 = A_0
81  *	(A * 2)_0 = A_7
82  *
83  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
84  * As an aside, this multiplication is derived from the error correcting
85  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
86  *
87  * Observe that any number in the field (except for 0) can be expressed as a
88  * power of 2 -- a generator for the field. We store a table of the powers of
89  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
90  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
91  * than field addition). The inverse of a field element A (A^-1) is therefore
92  * A ^ (255 - 1) = A^254.
93  *
94  * The up-to-three parity columns, P, Q, R over several data columns,
95  * D_0, ... D_n-1, can be expressed by field operations:
96  *
97  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
98  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
99  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
100  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
101  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
102  *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
104  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
105  * independent coefficients. (There are no additional coefficients that have
106  * this property which is why the uncorrected Plank method breaks down.)
107  *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
110  */
111 
/*
 * One column of a RAID-Z map: the portion of a logical I/O that is directed
 * at a single child device, together with its buffer and per-column state.
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	abd_t *rc_abd;			/* I/O data */
	/*
	 * Only set on parity columns: vdev_raidz_cksum_finish() stores here
	 * the buffer it allocates to hold parity regenerated from known-good
	 * data, and vdev_raidz_map_free() releases it with abd_free().
	 */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;
122 
/*
 * Describes how one logical I/O is laid out across the children of a RAID-Z
 * vdev. Columns [0, rm_firstdatacol) are parity, columns
 * [rm_firstdatacol, rm_cols) carry data, and columns [rm_cols, rm_scols)
 * are zero-sized entries that exist only to describe skipped sectors.
 *
 * Lifetime: the map is destroyed once it has been marked rm_freed and
 * rm_reports has dropped to zero (see vdev_raidz_map_free_vsd() and
 * vdev_raidz_cksum_free()).
 */
typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	abd_t *rm_abd_copy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
	/*
	 * Declared with a single element but allocated with rm_scols of
	 * them: the map is sized as offsetof(raidz_map_t, rm_col[rm_scols])
	 * when it is allocated and when it is freed.
	 */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;
139 
/* Indices of the P, Q and R parity columns within rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Multiply a single field element by 2 or by 4. The expression is evaluated
 * in (at least) int, so the result may have bits set above the low eight;
 * callers are expected to store it into a uint8_t, which discards them.
 * Note that x is evaluated more than once.
 */
#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 *
 * x must be a modifiable 64-bit lvalue and is updated in place; mask is a
 * 64-bit scratch lvalue that is clobbered. Both are evaluated several times.
 * The bodies are wrapped in do { } while (0) so that an invocation followed
 * by a semicolon is exactly one statement, even in an unbraced if/else.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
do { \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
} while (0)

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
do { \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
} while (0)
166 
/*
 * Add VDEV_LABEL_START_SIZE to the offset x. The argument is parenthesized
 * so that expressions using operators of lower precedence than '+' (for
 * example a & b or a << b) are offset as a whole.
 */
#define	VDEV_LABEL_OFFSET(x)	((x) + VDEV_LABEL_START_SIZE)
168 
/*
 * Force reconstruction to use the general purpose method.
 *
 * File-scope object with no initializer, so it starts out as zero. It has
 * external linkage, which allows it to be changed from outside this file.
 */
int vdev_raidz_default_to_general;
173 
/*
 * Powers of 2 in the Galois field defined above: vdev_raidz_pow2[i] is 2^i.
 * The last entry wraps around to 0x01 again (2^255 == 2^0), which lets
 * vdev_raidz_exp2() index the table with an exponent of exactly 255.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
/*
 * Logs base 2 in the Galois field defined above: for a != 0,
 * vdev_raidz_log2[a] is the exponent i for which 2^i == a. Zero has no
 * logarithm; its slot holds 0 as a placeholder and is never consulted by
 * vdev_raidz_exp2(), which returns early when a == 0.
 */
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
244 
/*
 * Forward declaration: vdev_raidz_cksum_finish() regenerates parity and is
 * defined ahead of the parity generation code.
 */
static void vdev_raidz_generate_parity(raidz_map_t *rm);
246 
247 /*
248  * Multiply a given number by 2 raised to the given power.
249  */
250 static uint8_t
251 vdev_raidz_exp2(uint_t a, int exp)
252 {
253 	if (a == 0)
254 		return (0);
255 
256 	ASSERT(exp >= 0);
257 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
258 
259 	exp += vdev_raidz_log2[a];
260 	if (exp > 255)
261 		exp -= 255;
262 
263 	return (vdev_raidz_pow2[exp]);
264 }
265 
266 static void
267 vdev_raidz_map_free(raidz_map_t *rm)
268 {
269 	int c;
270 
271 	for (c = 0; c < rm->rm_firstdatacol; c++) {
272 		abd_free(rm->rm_col[c].rc_abd);
273 
274 		if (rm->rm_col[c].rc_gdata != NULL)
275 			abd_free(rm->rm_col[c].rc_gdata);
276 
277 	}
278 
279 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
280 		abd_put(rm->rm_col[c].rc_abd);
281 
282 	if (rm->rm_abd_copy != NULL)
283 		abd_free(rm->rm_abd_copy);
284 
285 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
286 }
287 
288 static void
289 vdev_raidz_map_free_vsd(zio_t *zio)
290 {
291 	raidz_map_t *rm = zio->io_vsd;
292 
293 	ASSERT0(rm->rm_freed);
294 	rm->rm_freed = 1;
295 
296 	if (rm->rm_reports == 0)
297 		vdev_raidz_map_free(rm);
298 }
299 
300 /*ARGSUSED*/
301 static void
302 vdev_raidz_cksum_free(void *arg, size_t ignored)
303 {
304 	raidz_map_t *rm = arg;
305 
306 	ASSERT3U(rm->rm_reports, >, 0);
307 
308 	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
309 		vdev_raidz_map_free(rm);
310 }
311 
/*
 * Finish callback of a checksum report (installed as zcr_finish by
 * vdev_raidz_cksum_report()). Pairs what was read for the reported column
 * with the corresponding slice of known-good data and passes both to
 * zfs_ereport_finish_checksum().
 *
 * zcr:		the report; zcr_cbdata is the raidz map and zcr_cbinfo the
 *		index of the column the report is about.
 * good_data:	the good contents of the data columns, laid out back to back
 *		in column order. May be NULL, in which case the report is
 *		finished without any buffers to compare.
 *
 * For a parity column the good parity is not part of good_data, so it is
 * regenerated from good_data once per map and cached in rc_gdata.
 */
static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;
	size_t x, offset;

	const abd_t *good = NULL;
	/*
	 * Captured up front. The parity branch below temporarily replaces
	 * the parity columns' rc_abd but puts the saved pointers back, so
	 * this keeps referring to the buffer that was read.
	 */
	const abd_t *bad = rm->rm_col[c].rc_abd;

	if (good_data == NULL) {
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO)
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_abd;
				rm->rm_col[x].rc_abd =
				    rm->rm_col[x].rc_gdata =
				    abd_alloc_sametype(rm->rm_col[x].rc_abd,
				    rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			offset = 0;
			for (; x < rm->rm_cols; x++) {
				abd_put(rm->rm_col[x].rc_abd);

				rm->rm_col[x].rc_abd =
				    abd_get_offset_size((abd_t *)good_data,
				    offset, rm->rm_col[x].rc_size);
				offset += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++)
				rm->rm_col[x].rc_abd = bad_parity[x];

			/*
			 * The data columns go back to pointing into
			 * rm_abd_copy, the private copy of the data that
			 * vdev_raidz_cksum_report() made.
			 */
			offset = 0;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				abd_put(rm->rm_col[x].rc_abd);
				rm->rm_col[x].rc_abd = abd_get_offset_size(
				    rm->rm_abd_copy, offset,
				    rm->rm_col[x].rc_size);
				offset += rm->rm_col[x].rc_size;
			}
		}

		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = abd_get_offset_size(rm->rm_col[c].rc_gdata, 0,
		    rm->rm_col[c].rc_size);
	} else {
		/* adjust good_data to point at the start of our column */
		offset = 0;
		for (x = rm->rm_firstdatacol; x < c; x++)
			offset += rm->rm_col[x].rc_size;

		good = abd_get_offset_size((abd_t *)good_data, offset,
		    rm->rm_col[c].rc_size);
	}

	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
	/* Both branches above obtained 'good' from abd_get_offset_size(). */
	abd_put((abd_t *)good);
}
396 
397 /*
398  * Invoked indirectly by zfs_ereport_start_checksum(), called
399  * below when our read operation fails completely.  The main point
400  * is to keep a copy of everything we read from disk, so that at
401  * vdev_raidz_cksum_finish() time we can compare it with the good data.
402  */
403 static void
404 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
405 {
406 	size_t c = (size_t)(uintptr_t)arg;
407 	size_t offset;
408 
409 	raidz_map_t *rm = zio->io_vsd;
410 	size_t size;
411 
412 	/* set up the report and bump the refcount  */
413 	zcr->zcr_cbdata = rm;
414 	zcr->zcr_cbinfo = c;
415 	zcr->zcr_finish = vdev_raidz_cksum_finish;
416 	zcr->zcr_free = vdev_raidz_cksum_free;
417 
418 	rm->rm_reports++;
419 	ASSERT3U(rm->rm_reports, >, 0);
420 
421 	if (rm->rm_abd_copy != NULL)
422 		return;
423 
424 	/*
425 	 * It's the first time we're called for this raidz_map_t, so we need
426 	 * to copy the data aside; there's no guarantee that our zio's buffer
427 	 * won't be re-used for something else.
428 	 *
429 	 * Our parity data is already in separate buffers, so there's no need
430 	 * to copy them.
431 	 */
432 
433 	size = 0;
434 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
435 		size += rm->rm_col[c].rc_size;
436 
437 	rm->rm_abd_copy = abd_alloc_for_io(size, B_FALSE);
438 
439 	for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
440 		raidz_col_t *col = &rm->rm_col[c];
441 		abd_t *tmp = abd_get_offset_size(rm->rm_abd_copy, offset,
442 		    col->rc_size);
443 
444 		ASSERT3S(tmp->abd_size, >=, col->rc_size);
445 		ASSERT3S(col->rc_abd->abd_size, >=, col->rc_size);
446 		abd_copy_off(tmp, col->rc_abd, 0, 0, col->rc_size);
447 		abd_put(col->rc_abd);
448 		col->rc_abd = tmp;
449 
450 		offset += col->rc_size;
451 	}
452 	ASSERT3U(offset, ==, size);
453 }
454 
/*
 * Operations vector for the raidz map stored in a zio's io_vsd. The
 * initializer is positional, so the entries must stay in the member order
 * of zio_vsd_ops_t (declared elsewhere): first the callback that frees the
 * map, then the callback that sets up a checksum report.
 */
static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};
459 
/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 *
 * abd:		buffer with the data of the logical I/O; every data column
 *		is obtained from it with abd_get_offset_size() at
 *		increasing offsets.
 * size:	size of the I/O in bytes.
 * offset:	byte offset of the I/O on the RAID-Z (parent) vdev.
 * unit_shift:	shift that converts between bytes and sectors.
 * dcols:	number of columns to spread the I/O over.
 * nparity:	number of parity columns; stored as rm_firstdatacol.
 *
 * Returns a map allocated with KM_SLEEP. Each parity column gets a newly
 * allocated linear buffer of the column's size; the parity itself is not
 * computed here. The map is released with vdev_raidz_map_free().
 */
static raidz_map_t *
vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset,
    uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
	uint64_t off = 0;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	/* Sized for scols columns; vdev_raidz_map_free() frees the same size. */
	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_abd_copy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		/*
		 * Columns are assigned to consecutive children starting at
		 * f; on running past the last child, wrap around to child 0
		 * and move one sector further into each device.
		 */
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_abd = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		/*
		 * Skip-only columns carry no I/O, big columns carry one
		 * sector more than the rest.
		 */
		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	/* The allocated size is padded to a multiple of (nparity + 1) sectors. */
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	/* Parity columns get buffers of their own. */
	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_abd =
		    abd_alloc_linear(rm->rm_col[c].rc_size, B_FALSE);

	/*
	 * c now indexes the first data column, which starts at offset 0 of
	 * abd; each following data column starts where the previous ended.
	 */
	rm->rm_col[c].rc_abd = abd_get_offset_size(abd, 0,
	    rm->rm_col[c].rc_size);
	off = rm->rm_col[c].rc_size;

	for (c = c + 1; c < acols; c++) {
		rm->rm_col[c].rc_abd = abd_get_offset_size(abd, off,
		    rm->rm_col[c].rc_size);
		off += rm->rm_col[c].rc_size;
	}

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	/*
	 * Single parity only: when bit 20 of the offset is set, the parity
	 * column and the first data column trade device and offset.
	 */
	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	return (rm);
}
612 
/*
 * Write cursors into the parity buffers, passed as the private argument to
 * vdev_raidz_p_func(), vdev_raidz_pq_func() and vdev_raidz_pqr_func(). The
 * callbacks advance the pointers by one word per word of data consumed.
 * Each callback asserts that exactly the pointers it uses are set and that
 * the remaining ones are NULL.
 */
struct pqr_struct {
	uint64_t *p;
	uint64_t *q;
	uint64_t *r;
};
618 
619 static int
620 vdev_raidz_p_func(void *buf, size_t size, void *private)
621 {
622 	struct pqr_struct *pqr = private;
623 	const uint64_t *src = buf;
624 	int i, cnt = size / sizeof (src[0]);
625 
626 	ASSERT(pqr->p && !pqr->q && !pqr->r);
627 
628 	for (i = 0; i < cnt; i++, src++, pqr->p++)
629 		*pqr->p ^= *src;
630 
631 	return (0);
632 }
633 
634 static int
635 vdev_raidz_pq_func(void *buf, size_t size, void *private)
636 {
637 	struct pqr_struct *pqr = private;
638 	const uint64_t *src = buf;
639 	uint64_t mask;
640 	int i, cnt = size / sizeof (src[0]);
641 
642 	ASSERT(pqr->p && pqr->q && !pqr->r);
643 
644 	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
645 		*pqr->p ^= *src;
646 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
647 		*pqr->q ^= *src;
648 	}
649 
650 	return (0);
651 }
652 
653 static int
654 vdev_raidz_pqr_func(void *buf, size_t size, void *private)
655 {
656 	struct pqr_struct *pqr = private;
657 	const uint64_t *src = buf;
658 	uint64_t mask;
659 	int i, cnt = size / sizeof (src[0]);
660 
661 	ASSERT(pqr->p && pqr->q && pqr->r);
662 
663 	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
664 		*pqr->p ^= *src;
665 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
666 		*pqr->q ^= *src;
667 		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
668 		*pqr->r ^= *src;
669 	}
670 
671 	return (0);
672 }
673 
674 static void
675 vdev_raidz_generate_parity_p(raidz_map_t *rm)
676 {
677 	uint64_t *p;
678 	int c;
679 	abd_t *src;
680 
681 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
682 		src = rm->rm_col[c].rc_abd;
683 		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
684 
685 		if (c == rm->rm_firstdatacol) {
686 			ASSERT3U(src->abd_size, >=, rm->rm_col[c].rc_size);
687 			abd_copy_to_buf_off(p, src, 0, rm->rm_col[c].rc_size);
688 		} else {
689 			struct pqr_struct pqr = { p, NULL, NULL };
690 			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
691 			    vdev_raidz_p_func, &pqr);
692 		}
693 	}
694 }
695 
696 static void
697 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
698 {
699 	uint64_t *p, *q, pcnt, ccnt, mask, i;
700 	int c;
701 	abd_t *src;
702 
703 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
704 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
705 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
706 
707 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
708 		src = rm->rm_col[c].rc_abd;
709 		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
710 		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
711 
712 		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
713 
714 		if (c == rm->rm_firstdatacol) {
715 			ASSERT(ccnt == pcnt || ccnt == 0);
716 
717 			abd_copy_to_buf_off(p, src, 0, rm->rm_col[c].rc_size);
718 			(void) memcpy(q, p, rm->rm_col[c].rc_size);
719 			for (i = ccnt; i < pcnt; i++) {
720 				p[i] = 0;
721 				q[i] = 0;
722 			}
723 		} else {
724 			struct pqr_struct pqr = { p, q, NULL };
725 
726 			ASSERT(ccnt <= pcnt);
727 
728 			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
729 			    vdev_raidz_pq_func, &pqr);
730 
731 			/*
732 			 * Treat short columns as though they are full of 0s.
733 			 * Note that there's therefore nothing needed for P.
734 			 */
735 			for (i = ccnt; i < pcnt; i++) {
736 				VDEV_RAIDZ_64MUL_2(q[i], mask);
737 			}
738 		}
739 	}
740 }
741 
742 static void
743 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
744 {
745 	uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
746 	int c;
747 	abd_t *src;
748 
749 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
750 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
751 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
752 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
753 	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
754 
755 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
756 		src = rm->rm_col[c].rc_abd;
757 		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
758 		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
759 		r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);
760 
761 		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
762 
763 		if (c == rm->rm_firstdatacol) {
764 			ASSERT3S(src->abd_size, >=, rm->rm_col[c].rc_size);
765 			ASSERT(ccnt == pcnt || ccnt == 0);
766 			abd_copy_to_buf_off(p, src, 0, rm->rm_col[c].rc_size);
767 			(void) memcpy(q, p, rm->rm_col[c].rc_size);
768 			(void) memcpy(r, p, rm->rm_col[c].rc_size);
769 
770 			for (i = ccnt; i < pcnt; i++) {
771 				p[i] = 0;
772 				q[i] = 0;
773 				r[i] = 0;
774 			}
775 		} else {
776 			struct pqr_struct pqr = { p, q, r };
777 
778 			ASSERT(ccnt <= pcnt);
779 			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
780 			    vdev_raidz_pqr_func, &pqr);
781 
782 			/*
783 			 * Treat short columns as though they are full of 0s.
784 			 * Note that there's therefore nothing needed for P.
785 			 */
786 			for (i = ccnt; i < pcnt; i++) {
787 				VDEV_RAIDZ_64MUL_2(q[i], mask);
788 				VDEV_RAIDZ_64MUL_4(r[i], mask);
789 			}
790 		}
791 	}
792 }
793 
794 /*
795  * Generate RAID parity in the first virtual columns according to the number of
796  * parity columns available.
797  */
798 static void
799 vdev_raidz_generate_parity(raidz_map_t *rm)
800 {
801 	switch (rm->rm_firstdatacol) {
802 	case 1:
803 		vdev_raidz_generate_parity_p(rm);
804 		break;
805 	case 2:
806 		vdev_raidz_generate_parity_pq(rm);
807 		break;
808 	case 3:
809 		vdev_raidz_generate_parity_pqr(rm);
810 		break;
811 	default:
812 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
813 	}
814 }
815 
/*
 * Two-buffer iteration callback for P reconstruction: XOR every 64-bit
 * word of sbuf into the corresponding word of dbuf. Always returns 0.
 */
/* ARGSUSED */
static int
vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
{
	uint64_t *out = dbuf;
	uint64_t *in = sbuf;
	int remaining = size / sizeof (in[0]);

	for (; remaining > 0; remaining--)
		*out++ ^= *in++;

	return (0);
}
830 
/*
 * Two-buffer iteration callback for Q reconstruction: multiply every
 * 64-bit word of dbuf by 2 in the field, then XOR in the corresponding
 * word of sbuf. Always returns 0.
 */
/* ARGSUSED */
static int
vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
    void *private)
{
	uint64_t *acc = dbuf;
	uint64_t *data = sbuf;
	uint64_t mask;
	int nwords = size / sizeof (acc[0]);

	for (int w = 0; w < nwords; w++) {
		VDEV_RAIDZ_64MUL_2(acc[w], mask);
		acc[w] ^= data[w];
	}

	return (0);
}
848 
/*
 * Single-buffer counterpart of vdev_raidz_reconst_q_pre_func() for the part
 * of the destination that has no matching source data: every 64-bit word
 * of buf is multiplied by 2 in the field and nothing is added. Always
 * returns 0.
 */
/* ARGSUSED */
static int
vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
{
	uint64_t *acc = buf;
	uint64_t mask;
	int nwords = size / sizeof (acc[0]);

	for (int w = 0; w < nwords; w++) {
		VDEV_RAIDZ_64MUL_2(acc[w], mask);
	}

	return (0);
}
864 
/*
 * State handed to vdev_raidz_reconst_q_post_func() as its private argument.
 */
struct reconst_q_struct {
	uint64_t *q;	/* read cursor into the Q buffer; advanced per word */
	int exp;	/* exponent given to vdev_raidz_exp2() for each byte */
};
869 
870 static int
871 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
872 {
873 	struct reconst_q_struct *rq = private;
874 	uint64_t *dst = buf;
875 	int cnt = size / sizeof (dst[0]);
876 
877 	for (int i = 0; i < cnt; i++, dst++, rq->q++) {
878 		*dst ^= *rq->q;
879 
880 		int j;
881 		uint8_t *b;
882 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
883 			*b = vdev_raidz_exp2(*b, rq->exp);
884 		}
885 	}
886 
887 	return (0);
888 }
889 
/*
 * State handed to vdev_raidz_reconst_pq_func() and
 * vdev_raidz_reconst_pq_tail_func() as their private argument. The four
 * pointers are byte cursors that the callbacks read and advance in
 * lockstep, one byte per byte of output.
 *
 * NOTE(review): the setup code is not visible here; presumably p and q
 * are the parity as read and pxy and qxy the parity recomputed without the
 * two missing columns -- confirm against vdev_raidz_reconstruct_pq().
 */
struct reconst_pq_struct {
	uint8_t *p;
	uint8_t *q;
	uint8_t *pxy;
	uint8_t *qxy;
	int aexp;	/* exponent applied to (p ^ pxy) via vdev_raidz_exp2 */
	int bexp;	/* exponent applied to (q ^ qxy) via vdev_raidz_exp2 */
};
898 
899 static int
900 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
901 {
902 	struct reconst_pq_struct *rpq = private;
903 	uint8_t *xd = xbuf;
904 	uint8_t *yd = ybuf;
905 
906 	for (int i = 0; i < size;
907 	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
908 		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
909 		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
910 		*yd = *rpq->p ^ *rpq->pxy ^ *xd;
911 	}
912 
913 	return (0);
914 }
915 
916 static int
917 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
918 {
919 	struct reconst_pq_struct *rpq = private;
920 	uint8_t *xd = xbuf;
921 
922 	for (int i = 0; i < size;
923 	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
924 		/* same operation as vdev_raidz_reconst_pq_func() on xd */
925 		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
926 		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
927 	}
928 
929 	return (0);
930 }
931 
932 static int
933 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
934 {
935 	int x = tgts[0];
936 	int c;
937 	abd_t *dst, *src;
938 
939 	ASSERT(ntgts == 1);
940 	ASSERT(x >= rm->rm_firstdatacol);
941 	ASSERT(x < rm->rm_cols);
942 
943 	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
944 	ASSERT(rm->rm_col[x].rc_size > 0);
945 
946 	src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
947 	dst = rm->rm_col[x].rc_abd;
948 
949 	ASSERT3S(dst->abd_size, >=, rm->rm_col[x].rc_size);
950 	ASSERT3S(src->abd_size, >=, rm->rm_col[x].rc_size);
951 	abd_copy_off(dst, src, 0, 0, rm->rm_col[x].rc_size);
952 
953 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
954 		uint64_t size = MIN(rm->rm_col[x].rc_size,
955 		    rm->rm_col[c].rc_size);
956 
957 		src = rm->rm_col[c].rc_abd;
958 		dst = rm->rm_col[x].rc_abd;
959 
960 		if (c == x)
961 			continue;
962 
963 		(void) abd_iterate_func2(dst, src, 0, 0, size,
964 		    vdev_raidz_reconst_p_func, NULL);
965 	}
966 
967 	return (1 << VDEV_RAIDZ_P);
968 }
969 
970 static int
971 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
972 {
973 	int x = tgts[0];
974 	int c, exp;
975 	abd_t *dst, *src;
976 
977 	ASSERT(ntgts == 1);
978 
979 	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);
980 
981 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
982 		uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
983 		    rm->rm_col[c].rc_size);
984 
985 		src = rm->rm_col[c].rc_abd;
986 		dst = rm->rm_col[x].rc_abd;
987 
988 		if (c == rm->rm_firstdatacol) {
989 			if (dst != src) {
990 				ASSERT3S(dst->abd_size, >=, size);
991 				ASSERT3S(src->abd_size, >=, size);
992 				abd_copy_off(dst, src, 0, 0, size);
993 			}
994 			if (rm->rm_col[x].rc_size > size)
995 				abd_zero_off(dst, size,
996 				    rm->rm_col[x].rc_size - size);
997 		} else {
998 			ASSERT3U(size, <=, rm->rm_col[x].rc_size);
999 			if (src != dst)
1000 				(void) abd_iterate_func2(dst, src, 0, 0, size,
1001 				    vdev_raidz_reconst_q_pre_func, NULL);
1002 			(void) abd_iterate_func(dst,
1003 			    size, rm->rm_col[x].rc_size - size,
1004 			    vdev_raidz_reconst_q_pre_tail_func, NULL);
1005 		}
1006 	}
1007 
1008 	src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
1009 	dst = rm->rm_col[x].rc_abd;
1010 	exp = 255 - (rm->rm_cols - 1 - x);
1011 
1012 	struct reconst_q_struct rq = { abd_to_buf(src), exp };
1013 	(void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
1014 	    vdev_raidz_reconst_q_post_func, &rq);
1015 
1016 	return (1 << VDEV_RAIDZ_Q);
1017 }
1018 
1019 static int
1020 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
1021 {
1022 	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
1023 	abd_t *pdata, *qdata;
1024 	uint64_t xsize, ysize;
1025 	int x = tgts[0];
1026 	int y = tgts[1];
1027 	abd_t *xd, *yd;
1028 
1029 	ASSERT(ntgts == 2);
1030 	ASSERT(x < y);
1031 	ASSERT(x >= rm->rm_firstdatacol);
1032 	ASSERT(y < rm->rm_cols);
1033 
1034 	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
1035 
1036 	/*
1037 	 * Move the parity data aside -- we're going to compute parity as
1038 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
1039 	 * reuse the parity generation mechanism without trashing the actual
1040 	 * parity so we make those columns appear to be full of zeros by
1041 	 * setting their lengths to zero.
1042 	 */
1043 	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
1044 	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
1045 	xsize = rm->rm_col[x].rc_size;
1046 	ysize = rm->rm_col[y].rc_size;
1047 
1048 	rm->rm_col[VDEV_RAIDZ_P].rc_abd =
1049 	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
1050 	rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
1051 	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
1052 	rm->rm_col[x].rc_size = 0;
1053 	rm->rm_col[y].rc_size = 0;
1054 
1055 	vdev_raidz_generate_parity_pq(rm);
1056 
1057 	rm->rm_col[x].rc_size = xsize;
1058 	rm->rm_col[y].rc_size = ysize;
1059 
1060 	p = abd_to_buf(pdata);
1061 	q = abd_to_buf(qdata);
1062 	pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
1063 	qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
1064 	xd = rm->rm_col[x].rc_abd;
1065 	yd = rm->rm_col[y].rc_abd;
1066 
1067 	/*
1068 	 * We now have:
1069 	 *	Pxy = P + D_x + D_y
1070 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
1071 	 *
1072 	 * We can then solve for D_x:
1073 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
1074 	 * where
1075 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
1076 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
1077 	 *
1078 	 * With D_x in hand, we can easily solve for D_y:
1079 	 *	D_y = P + Pxy + D_x
1080 	 */
1081 
1082 	a = vdev_raidz_pow2[255 + x - y];
1083 	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
1084 	tmp = 255 - vdev_raidz_log2[a ^ 1];
1085 
1086 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
1087 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
1088 
1089 	ASSERT3U(xsize, >=, ysize);
1090 	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
1091 	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
1092 	    vdev_raidz_reconst_pq_func, &rpq);
1093 	(void) abd_iterate_func(xd, ysize, xsize - ysize,
1094 	    vdev_raidz_reconst_pq_tail_func, &rpq);
1095 
1096 	abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
1097 	abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
1098 
1099 	/*
1100 	 * Restore the saved parity data.
1101 	 */
1102 	rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
1103 	rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;
1104 
1105 	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
1106 }
1107 
1108 /* BEGIN CSTYLED */
1109 /*
1110  * In the general case of reconstruction, we must solve the system of linear
1111  * equations defined by the coefficients used to generate parity as well as
1112  * the contents of the data and parity disks. This can be expressed with
1113  * vectors for the original data (D) and the actual data (d) and parity (p)
1114  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
1115  *
1116  *            __   __                     __     __
1117  *            |     |         __     __   |  p_0  |
1118  *            |  V  |         |  D_0  |   | p_m-1 |
1119  *            |     |    x    |   :   | = |  d_0  |
1120  *            |  I  |         | D_n-1 |   |   :   |
1121  *            |     |         ~~     ~~   | d_n-1 |
1122  *            ~~   ~~                     ~~     ~~
1123  *
1124  * I is simply a square identity matrix of size n, and V is a vandermonde
1125  * matrix defined by the coefficients we chose for the various parity columns
1126  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
1127  * computation as well as linear separability.
1128  *
1129  *      __               __               __     __
1130  *      |   1   ..  1 1 1 |               |  p_0  |
1131  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
1132  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
1133  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
1134  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
1135  *      |   :       : : : |   |   :   |   |  d_2  |
1136  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
1137  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
1138  *      |   0   ..  0 0 1 |               | d_n-1 |
1139  *      ~~               ~~               ~~     ~~
1140  *
1141  * Note that I, V, d, and p are known. To compute D, we must invert the
1142  * matrix and use the known data and parity values to reconstruct the unknown
1143  * data values. We begin by removing the rows in V|I and d|p that correspond
1144  * to failed or missing columns; we then make V|I square (n x n) and d|p
1145  * sized n by removing rows corresponding to unused parity from the bottom up
1146  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
1147  * using Gauss-Jordan elimination. In the example below we use m=3 parity
1148  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
1149  *           __                               __
1150  *           |  1   1   1   1   1   1   1   1  |
1151  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
1152  *           |  19 205 116  29  64  16  4   1  |      / /
1153  *           |  1   0   0   0   0   0   0   0  |     / /
1154  *           |  0   1   0   0   0   0   0   0  | <--' /
1155  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
1156  *           |  0   0   0   1   0   0   0   0  |
1157  *           |  0   0   0   0   1   0   0   0  |
1158  *           |  0   0   0   0   0   1   0   0  |
1159  *           |  0   0   0   0   0   0   1   0  |
1160  *           |  0   0   0   0   0   0   0   1  |
1161  *           ~~                               ~~
1162  *           __                               __
1163  *           |  1   1   1   1   1   1   1   1  |
1164  *           |  19 205 116  29  64  16  4   1  |
1165  *           |  1   0   0   0   0   0   0   0  |
1166  *  (V|I)' = |  0   0   0   1   0   0   0   0  |
1167  *           |  0   0   0   0   1   0   0   0  |
1168  *           |  0   0   0   0   0   1   0   0  |
1169  *           |  0   0   0   0   0   0   1   0  |
1170  *           |  0   0   0   0   0   0   0   1  |
1171  *           ~~                               ~~
1172  *
1173  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1174  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1175  * matrix is not singular.
1176  * __                                                                 __
1177  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1178  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1179  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1180  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1181  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1182  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1183  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1184  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1185  * ~~                                                                 ~~
1186  * __                                                                 __
1187  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1188  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1189  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1190  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1191  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1192  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1193  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1194  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1195  * ~~                                                                 ~~
1196  * __                                                                 __
1197  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1198  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1199  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1200  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1201  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1202  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1203  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1204  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1205  * ~~                                                                 ~~
1206  * __                                                                 __
1207  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1208  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1209  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1210  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1211  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1212  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1213  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1214  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1215  * ~~                                                                 ~~
1216  * __                                                                 __
1217  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1218  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1219  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1220  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1221  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1222  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1223  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1224  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1225  * ~~                                                                 ~~
1226  * __                                                                 __
1227  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1228  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1229  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1230  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1231  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1232  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1233  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1234  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1235  * ~~                                                                 ~~
1236  *                   __                               __
1237  *                   |  0   0   1   0   0   0   0   0  |
1238  *                   | 167 100  5   41 159 169 217 208 |
1239  *                   | 166 100  4   40 158 168 216 209 |
1240  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1241  *                   |  0   0   0   0   1   0   0   0  |
1242  *                   |  0   0   0   0   0   1   0   0  |
1243  *                   |  0   0   0   0   0   0   1   0  |
1244  *                   |  0   0   0   0   0   0   0   1  |
1245  *                   ~~                               ~~
1246  *
1247  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1248  * of the missing data.
1249  *
1250  * As is apparent from the example above, the only non-trivial rows in the
1251  * inverse matrix correspond to the data disks that we're trying to
1252  * reconstruct. Indeed, those are the only rows we need as the others would
1253  * only be useful for reconstructing data known or assumed to be valid. For
1254  * that reason, we only build the coefficients in the rows that correspond to
1255  * targeted columns.
1256  */
1257 /* END CSTYLED */
1258 
1259 static void
1260 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1261     uint8_t **rows)
1262 {
1263 	int i, j;
1264 	int pow;
1265 
1266 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1267 
1268 	/*
1269 	 * Fill in the missing rows of interest.
1270 	 */
1271 	for (i = 0; i < nmap; i++) {
1272 		ASSERT3S(0, <=, map[i]);
1273 		ASSERT3S(map[i], <=, 2);
1274 
1275 		pow = map[i] * n;
1276 		if (pow > 255)
1277 			pow -= 255;
1278 		ASSERT(pow <= 255);
1279 
1280 		for (j = 0; j < n; j++) {
1281 			pow -= map[i];
1282 			if (pow < 0)
1283 				pow += 255;
1284 			rows[i][j] = vdev_raidz_pow2[pow];
1285 		}
1286 	}
1287 }
1288 
1289 static void
1290 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1291     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1292 {
1293 	int i, j, ii, jj;
1294 	uint8_t log;
1295 
1296 	/*
1297 	 * Assert that the first nmissing entries from the array of used
1298 	 * columns correspond to parity columns and that subsequent entries
1299 	 * correspond to data columns.
1300 	 */
1301 	for (i = 0; i < nmissing; i++) {
1302 		ASSERT3S(used[i], <, rm->rm_firstdatacol);
1303 	}
1304 	for (; i < n; i++) {
1305 		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1306 	}
1307 
1308 	/*
1309 	 * First initialize the storage where we'll compute the inverse rows.
1310 	 */
1311 	for (i = 0; i < nmissing; i++) {
1312 		for (j = 0; j < n; j++) {
1313 			invrows[i][j] = (i == j) ? 1 : 0;
1314 		}
1315 	}
1316 
1317 	/*
1318 	 * Subtract all trivial rows from the rows of consequence.
1319 	 */
1320 	for (i = 0; i < nmissing; i++) {
1321 		for (j = nmissing; j < n; j++) {
1322 			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1323 			jj = used[j] - rm->rm_firstdatacol;
1324 			ASSERT3S(jj, <, n);
1325 			invrows[i][j] = rows[i][jj];
1326 			rows[i][jj] = 0;
1327 		}
1328 	}
1329 
1330 	/*
1331 	 * For each of the rows of interest, we must normalize it and subtract
1332 	 * a multiple of it from the other rows.
1333 	 */
1334 	for (i = 0; i < nmissing; i++) {
1335 		for (j = 0; j < missing[i]; j++) {
1336 			ASSERT0(rows[i][j]);
1337 		}
1338 		ASSERT3U(rows[i][missing[i]], !=, 0);
1339 
1340 		/*
1341 		 * Compute the inverse of the first element and multiply each
1342 		 * element in the row by that value.
1343 		 */
1344 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1345 
1346 		for (j = 0; j < n; j++) {
1347 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1348 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1349 		}
1350 
1351 		for (ii = 0; ii < nmissing; ii++) {
1352 			if (i == ii)
1353 				continue;
1354 
1355 			ASSERT3U(rows[ii][missing[i]], !=, 0);
1356 
1357 			log = vdev_raidz_log2[rows[ii][missing[i]]];
1358 
1359 			for (j = 0; j < n; j++) {
1360 				rows[ii][j] ^=
1361 				    vdev_raidz_exp2(rows[i][j], log);
1362 				invrows[ii][j] ^=
1363 				    vdev_raidz_exp2(invrows[i][j], log);
1364 			}
1365 		}
1366 	}
1367 
1368 	/*
1369 	 * Verify that the data that is left in the rows are properly part of
1370 	 * an identity matrix.
1371 	 */
1372 	for (i = 0; i < nmissing; i++) {
1373 		for (j = 0; j < n; j++) {
1374 			if (j == missing[i]) {
1375 				ASSERT3U(rows[i][j], ==, 1);
1376 			} else {
1377 				ASSERT0(rows[i][j]);
1378 			}
1379 		}
1380 	}
1381 }
1382 
1383 static void
1384 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1385     int *missing, uint8_t **invrows, const uint8_t *used)
1386 {
1387 	int i, j, x, cc, c;
1388 	uint8_t *src;
1389 	uint64_t ccount;
1390 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1391 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1392 	uint8_t log = 0;
1393 	uint8_t val;
1394 	int ll;
1395 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1396 	uint8_t *p, *pp;
1397 	size_t psize;
1398 
1399 	psize = sizeof (invlog[0][0]) * n * nmissing;
1400 	p = kmem_alloc(psize, KM_SLEEP);
1401 
1402 	for (pp = p, i = 0; i < nmissing; i++) {
1403 		invlog[i] = pp;
1404 		pp += n;
1405 	}
1406 
1407 	for (i = 0; i < nmissing; i++) {
1408 		for (j = 0; j < n; j++) {
1409 			ASSERT3U(invrows[i][j], !=, 0);
1410 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1411 		}
1412 	}
1413 
1414 	for (i = 0; i < n; i++) {
1415 		c = used[i];
1416 		ASSERT3U(c, <, rm->rm_cols);
1417 
1418 		src = abd_to_buf(rm->rm_col[c].rc_abd);
1419 		ccount = rm->rm_col[c].rc_size;
1420 		for (j = 0; j < nmissing; j++) {
1421 			cc = missing[j] + rm->rm_firstdatacol;
1422 			ASSERT3U(cc, >=, rm->rm_firstdatacol);
1423 			ASSERT3U(cc, <, rm->rm_cols);
1424 			ASSERT3U(cc, !=, c);
1425 
1426 			dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
1427 			dcount[j] = rm->rm_col[cc].rc_size;
1428 		}
1429 
1430 		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1431 
1432 		for (x = 0; x < ccount; x++, src++) {
1433 			if (*src != 0)
1434 				log = vdev_raidz_log2[*src];
1435 
1436 			for (cc = 0; cc < nmissing; cc++) {
1437 				if (x >= dcount[cc])
1438 					continue;
1439 
1440 				if (*src == 0) {
1441 					val = 0;
1442 				} else {
1443 					if ((ll = log + invlog[cc][i]) >= 255)
1444 						ll -= 255;
1445 					val = vdev_raidz_pow2[ll];
1446 				}
1447 
1448 				if (i == 0)
1449 					dst[cc][x] = val;
1450 				else
1451 					dst[cc][x] ^= val;
1452 			}
1453 		}
1454 	}
1455 
1456 	kmem_free(p, psize);
1457 }
1458 
1459 static int
1460 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1461 {
1462 	int n, i, c, t, tt;
1463 	int nmissing_rows;
1464 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1465 	int parity_map[VDEV_RAIDZ_MAXPARITY];
1466 
1467 	uint8_t *p, *pp;
1468 	size_t psize;
1469 
1470 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1471 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1472 	uint8_t *used;
1473 
1474 	abd_t **bufs = NULL;
1475 
1476 	int code = 0;
1477 
1478 	/*
1479 	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
1480 	 * temporary linear ABDs.
1481 	 */
1482 	if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
1483 		bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);
1484 
1485 		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1486 			raidz_col_t *col = &rm->rm_col[c];
1487 
1488 			bufs[c] = col->rc_abd;
1489 			col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
1490 			ASSERT3S(col->rc_abd->abd_size, >=, col->rc_size);
1491 			ASSERT3S(bufs[c]->abd_size, >=, col->rc_size);
1492 			abd_copy_off(col->rc_abd, bufs[c], 0, 0, col->rc_size);
1493 		}
1494 	}
1495 
1496 	n = rm->rm_cols - rm->rm_firstdatacol;
1497 
1498 	/*
1499 	 * Figure out which data columns are missing.
1500 	 */
1501 	nmissing_rows = 0;
1502 	for (t = 0; t < ntgts; t++) {
1503 		if (tgts[t] >= rm->rm_firstdatacol) {
1504 			missing_rows[nmissing_rows++] =
1505 			    tgts[t] - rm->rm_firstdatacol;
1506 		}
1507 	}
1508 
1509 	/*
1510 	 * Figure out which parity columns to use to help generate the missing
1511 	 * data columns.
1512 	 */
1513 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1514 		ASSERT(tt < ntgts);
1515 		ASSERT(c < rm->rm_firstdatacol);
1516 
1517 		/*
1518 		 * Skip any targeted parity columns.
1519 		 */
1520 		if (c == tgts[tt]) {
1521 			tt++;
1522 			continue;
1523 		}
1524 
1525 		code |= 1 << c;
1526 
1527 		parity_map[i] = c;
1528 		i++;
1529 	}
1530 
1531 	ASSERT(code != 0);
1532 	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1533 
1534 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1535 	    nmissing_rows * n + sizeof (used[0]) * n;
1536 	p = kmem_alloc(psize, KM_SLEEP);
1537 
1538 	for (pp = p, i = 0; i < nmissing_rows; i++) {
1539 		rows[i] = pp;
1540 		pp += n;
1541 		invrows[i] = pp;
1542 		pp += n;
1543 	}
1544 	used = pp;
1545 
1546 	for (i = 0; i < nmissing_rows; i++) {
1547 		used[i] = parity_map[i];
1548 	}
1549 
1550 	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1551 		if (tt < nmissing_rows &&
1552 		    c == missing_rows[tt] + rm->rm_firstdatacol) {
1553 			tt++;
1554 			continue;
1555 		}
1556 
1557 		ASSERT3S(i, <, n);
1558 		used[i] = c;
1559 		i++;
1560 	}
1561 
1562 	/*
1563 	 * Initialize the interesting rows of the matrix.
1564 	 */
1565 	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1566 
1567 	/*
1568 	 * Invert the matrix.
1569 	 */
1570 	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1571 	    invrows, used);
1572 
1573 	/*
1574 	 * Reconstruct the missing data using the generated matrix.
1575 	 */
1576 	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1577 	    invrows, used);
1578 
1579 	kmem_free(p, psize);
1580 
1581 	/*
1582 	 * copy back from temporary linear abds and free them
1583 	 */
1584 	if (bufs) {
1585 		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1586 			raidz_col_t *col = &rm->rm_col[c];
1587 
1588 			ASSERT3S(bufs[c]->abd_size, >=, col->rc_size);
1589 			ASSERT3S(col->rc_abd->abd_size, >=, col->rc_size);
1590 			abd_copy_off(bufs[c], col->rc_abd, 0, 0, col->rc_size);
1591 			abd_free(col->rc_abd);
1592 			col->rc_abd = bufs[c];
1593 		}
1594 		kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
1595 	}
1596 
1597 	return (code);
1598 }
1599 
1600 static int
1601 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1602 {
1603 	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1604 	int ntgts;
1605 	int i, c;
1606 	int code;
1607 	int nbadparity, nbaddata;
1608 	int parity_valid[VDEV_RAIDZ_MAXPARITY];
1609 
1610 	/*
1611 	 * The tgts list must already be sorted.
1612 	 */
1613 	for (i = 1; i < nt; i++) {
1614 		ASSERT(t[i] > t[i - 1]);
1615 	}
1616 
1617 	nbadparity = rm->rm_firstdatacol;
1618 	nbaddata = rm->rm_cols - nbadparity;
1619 	ntgts = 0;
1620 	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1621 		if (c < rm->rm_firstdatacol)
1622 			parity_valid[c] = B_FALSE;
1623 
1624 		if (i < nt && c == t[i]) {
1625 			tgts[ntgts++] = c;
1626 			i++;
1627 		} else if (rm->rm_col[c].rc_error != 0) {
1628 			tgts[ntgts++] = c;
1629 		} else if (c >= rm->rm_firstdatacol) {
1630 			nbaddata--;
1631 		} else {
1632 			parity_valid[c] = B_TRUE;
1633 			nbadparity--;
1634 		}
1635 	}
1636 
1637 	ASSERT(ntgts >= nt);
1638 	ASSERT(nbaddata >= 0);
1639 	ASSERT(nbaddata + nbadparity == ntgts);
1640 
1641 	dt = &tgts[nbadparity];
1642 
1643 	/*
1644 	 * See if we can use any of our optimized reconstruction routines.
1645 	 */
1646 	if (!vdev_raidz_default_to_general) {
1647 		switch (nbaddata) {
1648 		case 1:
1649 			if (parity_valid[VDEV_RAIDZ_P])
1650 				return (vdev_raidz_reconstruct_p(rm, dt, 1));
1651 
1652 			ASSERT(rm->rm_firstdatacol > 1);
1653 
1654 			if (parity_valid[VDEV_RAIDZ_Q])
1655 				return (vdev_raidz_reconstruct_q(rm, dt, 1));
1656 
1657 			ASSERT(rm->rm_firstdatacol > 2);
1658 			break;
1659 
1660 		case 2:
1661 			ASSERT(rm->rm_firstdatacol > 1);
1662 
1663 			if (parity_valid[VDEV_RAIDZ_P] &&
1664 			    parity_valid[VDEV_RAIDZ_Q])
1665 				return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1666 
1667 			ASSERT(rm->rm_firstdatacol > 2);
1668 
1669 			break;
1670 		}
1671 	}
1672 
1673 	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1674 	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1675 	ASSERT(code > 0);
1676 	return (code);
1677 }
1678 
1679 static int
1680 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1681     uint64_t *ashift)
1682 {
1683 	vdev_t *cvd;
1684 	uint64_t nparity = vd->vdev_nparity;
1685 	int c;
1686 	int lasterror = 0;
1687 	int numerrors = 0;
1688 
1689 	ASSERT(nparity > 0);
1690 
1691 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
1692 	    vd->vdev_children < nparity + 1) {
1693 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1694 		return (SET_ERROR(EINVAL));
1695 	}
1696 
1697 	vdev_open_children(vd);
1698 
1699 	for (c = 0; c < vd->vdev_children; c++) {
1700 		cvd = vd->vdev_child[c];
1701 
1702 		if (cvd->vdev_open_error != 0) {
1703 			lasterror = cvd->vdev_open_error;
1704 			numerrors++;
1705 			continue;
1706 		}
1707 
1708 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1709 		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1710 		*ashift = MAX(*ashift, cvd->vdev_ashift);
1711 	}
1712 
1713 	*asize *= vd->vdev_children;
1714 	*max_asize *= vd->vdev_children;
1715 
1716 	if (numerrors > nparity) {
1717 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1718 		return (lasterror);
1719 	}
1720 
1721 	return (0);
1722 }
1723 
1724 static void
1725 vdev_raidz_close(vdev_t *vd)
1726 {
1727 	int c;
1728 
1729 	for (c = 0; c < vd->vdev_children; c++)
1730 		vdev_close(vd->vdev_child[c]);
1731 }
1732 
1733 /*
1734  * Handle a read or write I/O to a RAID-Z dump device.
1735  *
1736  * The dump device is in a unique situation compared to other ZFS datasets:
1737  * writing to this device should be as simple and fast as possible.  In
1738  * addition, durability matters much less since the dump will be extracted
1739  * once the machine reboots.  For that reason, this function eschews parity for
1740  * performance and simplicity.  The dump device uses the checksum setting
1741  * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1742  * dataset.
1743  *
1744  * Blocks of size 128 KB have been preallocated for this volume.  I/Os less than
1745  * 128 KB will not fill an entire block; in addition, they may not be properly
1746  * aligned.  In that case, this function uses the preallocated 128 KB block and
1747  * omits reading or writing any "empty" portions of that block, as opposed to
1748  * allocating a fresh appropriately-sized block.
1749  *
1750  * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1751  *
1752  *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1753  *
1754  * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1755  * allocated which spans all five child vdevs.  8 KB of data would be written to
1756  * each of four vdevs, with the fifth containing the parity bits.
1757  *
1758  *       parity    data     data     data     data
1759  *     |   PP   |   XX   |   XX   |   XX   |   XX   |
1760  *         ^        ^        ^        ^        ^
1761  *         |        |        |        |        |
1762  *   8 KB parity    ------8 KB data blocks------
1763  *
1764  * However, when writing to the dump device, the behavior is different:
1765  *
1766  *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
1767  *
1768  * Unlike the normal RAID-Z case in which the block is allocated based on the
1769  * I/O size, reads and writes here always use a 128 KB logical I/O size.  If the
1770  * I/O size is less than 128 KB, only the actual portions of data are written.
1771  * In this example the data is written to the third data vdev since that vdev
1772  * contains the offset [64 KB, 96 KB).
1773  *
1774  *       parity    data     data     data     data
1775  *     |        |        |        |   XX   |        |
1776  *                                    ^
1777  *                                    |
1778  *                             32 KB data block
1779  *
1780  * As a result, an individual I/O may not span all child vdevs; moreover, a
1781  * small I/O may only operate on a single child vdev.
1782  *
1783  * Note that since there are no parity bits calculated or written, this format
1784  * remains the same no matter how many parity bits are used in a normal RAID-Z
1785  * stripe.  On a RAID-Z3 configuration with seven child vdevs, the example above
1786  * would look like:
1787  *
1788  *       parity   parity   parity    data     data     data     data
1789  *     |        |        |        |        |        |   XX   |        |
1790  *                                                      ^
1791  *                                                      |
1792  *                                               32 KB data block
1793  */
1794 int
1795 vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
1796     uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
1797 {
1798 	vdev_t *tvd = vd->vdev_top;
1799 	vdev_t *cvd;
1800 	raidz_map_t *rm;
1801 	raidz_col_t *rc;
1802 	int c, err = 0;
1803 
1804 	uint64_t start, end, colstart, colend;
1805 	uint64_t coloffset, colsize, colskip;
1806 
1807 	int flags = doread ? B_READ : B_WRITE;
1808 
1809 #ifdef	_KERNEL
1810 
1811 	/*
1812 	 * Don't write past the end of the block
1813 	 */
1814 	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
1815 
1816 	start = offset;
1817 	end = start + size;
1818 
1819 	/*
1820 	 * Allocate a RAID-Z map for this block.  Note that this block starts
1821 	 * from the "original" offset, this is, the offset of the extent which
1822 	 * contains the requisite offset of the data being read or written.
1823 	 *
1824 	 * Even if this I/O operation doesn't span the full block size, let's
1825 	 * treat the on-disk format as if the only blocks are the complete 128
1826 	 * KB size.
1827 	 */
1828 	abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
1829 	    SPA_OLD_MAXBLOCKSIZE);
1830 	rm = vdev_raidz_map_alloc(abd,
1831 	    SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
1832 	    vd->vdev_children, vd->vdev_nparity);
1833 
1834 	coloffset = origoffset;
1835 
1836 	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
1837 	    c++, coloffset += rc->rc_size) {
1838 		rc = &rm->rm_col[c];
1839 		cvd = vd->vdev_child[rc->rc_devidx];
1840 
1841 		/*
1842 		 * Find the start and end of this column in the RAID-Z map,
1843 		 * keeping in mind that the stated size and offset of the
1844 		 * operation may not fill the entire column for this vdev.
1845 		 *
1846 		 * If any portion of the data spans this column, issue the
1847 		 * appropriate operation to the vdev.
1848 		 */
1849 		if (coloffset + rc->rc_size <= start)
1850 			continue;
1851 		if (coloffset >= end)
1852 			continue;
1853 
1854 		colstart = MAX(coloffset, start);
1855 		colend = MIN(end, coloffset + rc->rc_size);
1856 		colsize = colend - colstart;
1857 		colskip = colstart - coloffset;
1858 
1859 		VERIFY3U(colsize, <=, rc->rc_size);
1860 		VERIFY3U(colskip, <=, rc->rc_size);
1861 
1862 		/*
1863 		 * Note that the child vdev will have a vdev label at the start
1864 		 * of its range of offsets, hence the need for
1865 		 * VDEV_LABEL_OFFSET().  See zio_vdev_child_io() for another
1866 		 * example of why this calculation is needed.
1867 		 */
1868 		if ((err = vdev_disk_physio(cvd,
1869 		    ((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize,
1870 		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
1871 		    flags, isdump)) != 0)
1872 			break;
1873 	}
1874 
1875 	vdev_raidz_map_free(rm);
1876 	abd_put(abd);
1877 #endif	/* KERNEL */
1878 
1879 	return (err);
1880 }
1881 
1882 static uint64_t
1883 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1884 {
1885 	uint64_t asize;
1886 	uint64_t ashift = vd->vdev_top->vdev_ashift;
1887 	uint64_t cols = vd->vdev_children;
1888 	uint64_t nparity = vd->vdev_nparity;
1889 
1890 	asize = ((psize - 1) >> ashift) + 1;
1891 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1892 	asize = roundup(asize, nparity + 1) << ashift;
1893 
1894 	return (asize);
1895 }
1896 
1897 static void
1898 vdev_raidz_child_done(zio_t *zio)
1899 {
1900 	raidz_col_t *rc = zio->io_private;
1901 
1902 	rc->rc_error = zio->io_error;
1903 	rc->rc_tried = 1;
1904 	rc->rc_skipped = 0;
1905 }
1906 
1907 static void
1908 vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col)
1909 {
1910 #ifdef ZFS_DEBUG
1911 	vdev_t *vd = zio->io_vd;
1912 	vdev_t *tvd = vd->vdev_top;
1913 
1914 	range_seg_t logical_rs, physical_rs;
1915 	logical_rs.rs_start = zio->io_offset;
1916 	logical_rs.rs_end = logical_rs.rs_start +
1917 	    vdev_raidz_asize(zio->io_vd, zio->io_size);
1918 
1919 	raidz_col_t *rc = &rm->rm_col[col];
1920 	vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
1921 
1922 	vdev_xlate(cvd, &logical_rs, &physical_rs);
1923 	ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
1924 	ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
1925 	/*
1926 	 * It would be nice to assert that rs_end is equal
1927 	 * to rc_offset + rc_size but there might be an
1928 	 * optional I/O at the end that is not accounted in
1929 	 * rc_size.
1930 	 */
1931 	if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
1932 		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
1933 		    rc->rc_size + (1 << tvd->vdev_ashift));
1934 	} else {
1935 		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
1936 	}
1937 #endif
1938 }
1939 
1940 /*
1941  * Start an IO operation on a RAIDZ VDev
1942  *
1943  * Outline:
1944  * - For write operations:
1945  *   1. Generate the parity data
1946  *   2. Create child zio write operations to each column's vdev, for both
1947  *      data and parity.
1948  *   3. If the column skips any sectors for padding, create optional dummy
1949  *      write zio children for those areas to improve aggregation continuity.
1950  * - For read operations:
1951  *   1. Create child zio read operations to each data column's vdev to read
1952  *      the range of data required for zio.
1953  *   2. If this is a scrub or resilver operation, or if any of the data
1954  *      vdevs have had errors, then create zio read operations to the parity
1955  *      columns' VDevs as well.
1956  */
1957 static void
1958 vdev_raidz_io_start(zio_t *zio)
1959 {
1960 	vdev_t *vd = zio->io_vd;
1961 	vdev_t *tvd = vd->vdev_top;
1962 	vdev_t *cvd;
1963 	raidz_map_t *rm;
1964 	raidz_col_t *rc;
1965 	int c, i;
1966 
1967 	rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
1968 	    tvd->vdev_ashift, vd->vdev_children,
1969 	    vd->vdev_nparity);
1970 
1971 	zio->io_vsd = rm;
1972 	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
1973 
1974 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1975 
1976 	if (zio->io_type == ZIO_TYPE_WRITE) {
1977 		vdev_raidz_generate_parity(rm);
1978 
1979 		for (c = 0; c < rm->rm_cols; c++) {
1980 			rc = &rm->rm_col[c];
1981 			cvd = vd->vdev_child[rc->rc_devidx];
1982 
1983 			/*
1984 			 * Verify physical to logical translation.
1985 			 */
1986 			vdev_raidz_io_verify(zio, rm, c);
1987 
1988 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1989 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
1990 			    zio->io_type, zio->io_priority, 0,
1991 			    vdev_raidz_child_done, rc));
1992 		}
1993 
1994 		/*
1995 		 * Generate optional I/Os for any skipped sectors to improve
1996 		 * aggregation contiguity.
1997 		 */
1998 		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1999 			ASSERT(c <= rm->rm_scols);
2000 			if (c == rm->rm_scols)
2001 				c = 0;
2002 			rc = &rm->rm_col[c];
2003 			cvd = vd->vdev_child[rc->rc_devidx];
2004 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2005 			    rc->rc_offset + rc->rc_size, NULL,
2006 			    1 << tvd->vdev_ashift,
2007 			    zio->io_type, zio->io_priority,
2008 			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
2009 		}
2010 
2011 		zio_execute(zio);
2012 		return;
2013 	}
2014 
2015 	ASSERT(zio->io_type == ZIO_TYPE_READ);
2016 
2017 	/*
2018 	 * Iterate over the columns in reverse order so that we hit the parity
2019 	 * last -- any errors along the way will force us to read the parity.
2020 	 */
2021 	for (c = rm->rm_cols - 1; c >= 0; c--) {
2022 		rc = &rm->rm_col[c];
2023 		cvd = vd->vdev_child[rc->rc_devidx];
2024 		if (!vdev_readable(cvd)) {
2025 			if (c >= rm->rm_firstdatacol)
2026 				rm->rm_missingdata++;
2027 			else
2028 				rm->rm_missingparity++;
2029 			rc->rc_error = SET_ERROR(ENXIO);
2030 			rc->rc_tried = 1;	/* don't even try */
2031 			rc->rc_skipped = 1;
2032 			continue;
2033 		}
2034 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2035 			if (c >= rm->rm_firstdatacol)
2036 				rm->rm_missingdata++;
2037 			else
2038 				rm->rm_missingparity++;
2039 			rc->rc_error = SET_ERROR(ESTALE);
2040 			rc->rc_skipped = 1;
2041 			continue;
2042 		}
2043 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
2044 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
2045 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2046 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
2047 			    zio->io_type, zio->io_priority, 0,
2048 			    vdev_raidz_child_done, rc));
2049 		}
2050 	}
2051 
2052 	zio_execute(zio);
2053 }
2054 
2055 
2056 /*
2057  * Report a checksum error for a child of a RAID-Z device.
2058  */
2059 static void
2060 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
2061 {
2062 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
2063 
2064 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2065 		zio_bad_cksum_t zbc;
2066 		raidz_map_t *rm = zio->io_vsd;
2067 
2068 		mutex_enter(&vd->vdev_stat_lock);
2069 		vd->vdev_stat.vs_checksum_errors++;
2070 		mutex_exit(&vd->vdev_stat_lock);
2071 
2072 		zbc.zbc_has_cksum = 0;
2073 		zbc.zbc_injected = rm->rm_ecksuminjected;
2074 
2075 		zfs_ereport_post_checksum(zio->io_spa, vd,
2076 		    &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
2077 		    rc->rc_abd, bad_data, &zbc);
2078 	}
2079 }
2080 
2081 /*
2082  * We keep track of whether or not there were any injected errors, so that
2083  * any ereports we generate can note it.
2084  */
2085 static int
2086 raidz_checksum_verify(zio_t *zio)
2087 {
2088 	zio_bad_cksum_t zbc;
2089 	raidz_map_t *rm = zio->io_vsd;
2090 
2091 	int ret = zio_checksum_error(zio, &zbc);
2092 	if (ret != 0 && zbc.zbc_injected != 0)
2093 		rm->rm_ecksuminjected = 1;
2094 
2095 	return (ret);
2096 }
2097 
2098 /*
2099  * Generate the parity from the data columns. If we tried and were able to
2100  * read the parity without error, verify that the generated parity matches the
2101  * data we read. If it doesn't, we fire off a checksum error. Return the
2102  * number such failures.
2103  */
2104 static int
2105 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
2106 {
2107 	abd_t *orig[VDEV_RAIDZ_MAXPARITY];
2108 	int c, ret = 0;
2109 	raidz_col_t *rc;
2110 
2111 	blkptr_t *bp = zio->io_bp;
2112 	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
2113 	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
2114 
2115 	if (checksum == ZIO_CHECKSUM_NOPARITY)
2116 		return (ret);
2117 
2118 	for (c = 0; c < rm->rm_firstdatacol; c++) {
2119 		rc = &rm->rm_col[c];
2120 		if (!rc->rc_tried || rc->rc_error != 0)
2121 			continue;
2122 		orig[c] = abd_alloc_sametype(rc->rc_abd, rc->rc_size);
2123 		abd_copy(orig[c], rc->rc_abd, rc->rc_size);
2124 	}
2125 
2126 	vdev_raidz_generate_parity(rm);
2127 
2128 	for (c = 0; c < rm->rm_firstdatacol; c++) {
2129 		rc = &rm->rm_col[c];
2130 		if (!rc->rc_tried || rc->rc_error != 0)
2131 			continue;
2132 		if (abd_cmp(orig[c], rc->rc_abd, rc->rc_abd->abd_size) != 0) {
2133 			raidz_checksum_error(zio, rc, orig[c]);
2134 			rc->rc_error = SET_ERROR(ECKSUM);
2135 			ret++;
2136 		}
2137 		abd_free(orig[c]);
2138 	}
2139 
2140 	return (ret);
2141 }
2142 
2143 /*
2144  * Keep statistics on all the ways that we used parity to correct data.
 *
 * After a reconstruction that passes checksum verification, the slot
 * selected by the code that vdev_raidz_reconstruct() returned is bumped
 * with atomic_inc_64().
 *
 * NOTE(review): the code appears to be a bitmask of the parity columns
 * that were used, which would explain the 1 << VDEV_RAIDZ_MAXPARITY
 * slots -- confirm against vdev_raidz_reconstruct().
2145  */
2146 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
2147 
2148 static int
2149 vdev_raidz_worst_error(raidz_map_t *rm)
2150 {
2151 	int error = 0;
2152 
2153 	for (int c = 0; c < rm->rm_cols; c++)
2154 		error = zio_worst_error(error, rm->rm_col[c].rc_error);
2155 
2156 	return (error);
2157 }
2158 
2159 /*
2160  * Iterate over all combinations of bad data and attempt a reconstruction.
2161  * Note that the algorithm below is non-optimal because it doesn't take into
2162  * account how reconstruction is actually performed. For example, with
2163  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
2164  * is targeted as invalid as if columns 1 and 4 are targeted since in both
2165  * cases we'd only use parity information in column 0.
2166  */
2167 static int
2168 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
2169 {
2170 	raidz_map_t *rm = zio->io_vsd;
2171 	raidz_col_t *rc;
2172 	abd_t *orig[VDEV_RAIDZ_MAXPARITY];
2173 	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
2174 	int *tgts = &tstore[1];
2175 	int current, next, i, c, n;
2176 	int code, ret = 0;
2177 
2178 	ASSERT(total_errors < rm->rm_firstdatacol);
2179 
2180 	/*
2181 	 * This simplifies one edge condition.
2182 	 */
2183 	tgts[-1] = -1;
2184 
2185 	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
2186 		/*
2187 		 * Initialize the targets array by finding the first n columns
2188 		 * that contain no error.
2189 		 *
2190 		 * If there were no data errors, we need to ensure that we're
2191 		 * always explicitly attempting to reconstruct at least one
2192 		 * data column. To do this, we simply push the highest target
2193 		 * up into the data columns.
2194 		 */
2195 		for (c = 0, i = 0; i < n; i++) {
2196 			if (i == n - 1 && data_errors == 0 &&
2197 			    c < rm->rm_firstdatacol) {
2198 				c = rm->rm_firstdatacol;
2199 			}
2200 
2201 			while (rm->rm_col[c].rc_error != 0) {
2202 				c++;
2203 				ASSERT3S(c, <, rm->rm_cols);
2204 			}
2205 
2206 			tgts[i] = c++;
2207 		}
2208 
2209 		/*
2210 		 * Setting tgts[n] simplifies the other edge condition.
2211 		 */
2212 		tgts[n] = rm->rm_cols;
2213 
2214 		/*
2215 		 * These buffers were allocated in previous iterations.
2216 		 */
2217 		for (i = 0; i < n - 1; i++) {
2218 			ASSERT(orig[i] != NULL);
2219 		}
2220 
2221 		orig[n - 1] = abd_alloc_sametype(rm->rm_col[0].rc_abd,
2222 		    rm->rm_col[0].rc_size);
2223 
2224 		current = 0;
2225 		next = tgts[current];
2226 
2227 		while (current != n) {
2228 			tgts[current] = next;
2229 			current = 0;
2230 
2231 			/*
2232 			 * Save off the original data that we're going to
2233 			 * attempt to reconstruct.
2234 			 */
2235 			for (i = 0; i < n; i++) {
2236 				ASSERT(orig[i] != NULL);
2237 				c = tgts[i];
2238 				ASSERT3S(c, >=, 0);
2239 				ASSERT3S(c, <, rm->rm_cols);
2240 				rc = &rm->rm_col[c];
2241 				ASSERT3S(orig[i]->abd_size, >=, rc->rc_size);
2242 				ASSERT3S(rc->rc_abd->abd_size, >=, rc->rc_size);
2243 				abd_copy_off(orig[i], rc->rc_abd, 0, 0,
2244 				    rc->rc_size);
2245 			}
2246 
2247 			/*
2248 			 * Attempt a reconstruction and exit the outer loop on
2249 			 * success.
2250 			 */
2251 			code = vdev_raidz_reconstruct(rm, tgts, n);
2252 			if (raidz_checksum_verify(zio) == 0) {
2253 				atomic_inc_64(&raidz_corrected[code]);
2254 
2255 				for (i = 0; i < n; i++) {
2256 					c = tgts[i];
2257 					rc = &rm->rm_col[c];
2258 					ASSERT(rc->rc_error == 0);
2259 					if (rc->rc_tried)
2260 						raidz_checksum_error(zio, rc,
2261 						    orig[i]);
2262 					rc->rc_error = SET_ERROR(ECKSUM);
2263 				}
2264 
2265 				ret = code;
2266 				goto done;
2267 			}
2268 
2269 			/*
2270 			 * Restore the original data.
2271 			 */
2272 			for (i = 0; i < n; i++) {
2273 				c = tgts[i];
2274 				rc = &rm->rm_col[c];
2275 				ASSERT3S(rc->rc_abd->abd_size, >=, rc->rc_size);
2276 				ASSERT3S(orig[i]->abd_size, >=, rc->rc_size);
2277 				abd_copy_off(rc->rc_abd, orig[i], 0, 0,
2278 				    rc->rc_size);
2279 			}
2280 
2281 			do {
2282 				/*
2283 				 * Find the next valid column after the current
2284 				 * position..
2285 				 */
2286 				for (next = tgts[current] + 1;
2287 				    next < rm->rm_cols &&
2288 				    rm->rm_col[next].rc_error != 0; next++)
2289 					continue;
2290 
2291 				ASSERT(next <= tgts[current + 1]);
2292 
2293 				/*
2294 				 * If that spot is available, we're done here.
2295 				 */
2296 				if (next != tgts[current + 1])
2297 					break;
2298 
2299 				/*
2300 				 * Otherwise, find the next valid column after
2301 				 * the previous position.
2302 				 */
2303 				for (c = tgts[current - 1] + 1;
2304 				    rm->rm_col[c].rc_error != 0; c++)
2305 					continue;
2306 
2307 				tgts[current] = c;
2308 				current++;
2309 
2310 			} while (current != n);
2311 		}
2312 	}
2313 	n--;
2314 done:
2315 	for (i = 0; i < n; i++)
2316 		abd_free(orig[i]);
2317 
2318 	return (ret);
2319 }
2320 
2321 /*
2322  * Complete an IO operation on a RAIDZ VDev
2323  *
2324  * Outline:
2325  * - For write operations:
2326  *   1. Check for errors on the child IOs.
2327  *   2. Return, setting an error code if too few child VDevs were written
2328  *      to reconstruct the data later.  Note that partial writes are
2329  *      considered successful if they can be reconstructed at all.
2330  * - For read operations:
2331  *   1. Check for errors on the child IOs.
2332  *   2. If data errors occurred:
2333  *      a. Try to reassemble the data from the parity available.
2334  *      b. If we haven't yet read the parity drives, read them now.
2335  *      c. If all parity drives have been read but the data still doesn't
2336  *         reassemble with a correct checksum, then try combinatorial
2337  *         reconstruction.
2338  *      d. If that doesn't work, return an error.
2339  *   3. If there were unexpected errors or this is a resilver operation,
2340  *      rewrite the vdevs that had errors.
2341  */
2342 static void
2343 vdev_raidz_io_done(zio_t *zio)
2344 {
2345 	vdev_t *vd = zio->io_vd;
2346 	vdev_t *cvd;
2347 	raidz_map_t *rm = zio->io_vsd;
2348 	raidz_col_t *rc;
2349 	int unexpected_errors = 0;
2350 	int parity_errors = 0;
2351 	int parity_untried = 0;
2352 	int data_errors = 0;
2353 	int total_errors = 0;
2354 	int n, c;
2355 	int tgts[VDEV_RAIDZ_MAXPARITY];
2356 	int code;
2357 
2358 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
2359 
2360 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
2361 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
2362 
2363 	for (c = 0; c < rm->rm_cols; c++) {
2364 		rc = &rm->rm_col[c];
2365 
2366 		if (rc->rc_error) {
2367 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
2368 
2369 			if (c < rm->rm_firstdatacol)
2370 				parity_errors++;
2371 			else
2372 				data_errors++;
2373 
2374 			if (!rc->rc_skipped)
2375 				unexpected_errors++;
2376 
2377 			total_errors++;
2378 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
2379 			parity_untried++;
2380 		}
2381 	}
2382 
2383 	if (zio->io_type == ZIO_TYPE_WRITE) {
2384 		/*
2385 		 * XXX -- for now, treat partial writes as a success.
2386 		 * (If we couldn't write enough columns to reconstruct
2387 		 * the data, the I/O failed.  Otherwise, good enough.)
2388 		 *
2389 		 * Now that we support write reallocation, it would be better
2390 		 * to treat partial failure as real failure unless there are
2391 		 * no non-degraded top-level vdevs left, and not update DTLs
2392 		 * if we intend to reallocate.
2393 		 */
2394 		/* XXPOLICY */
2395 		if (total_errors > rm->rm_firstdatacol)
2396 			zio->io_error = vdev_raidz_worst_error(rm);
2397 
2398 		return;
2399 	}
2400 
2401 	ASSERT(zio->io_type == ZIO_TYPE_READ);
2402 	/*
2403 	 * There are three potential phases for a read:
2404 	 *	1. produce valid data from the columns read
2405 	 *	2. read all disks and try again
2406 	 *	3. perform combinatorial reconstruction
2407 	 *
2408 	 * Each phase is progressively both more expensive and less likely to
2409 	 * occur. If we encounter more errors than we can repair or all phases
2410 	 * fail, we have no choice but to return an error.
2411 	 */
2412 
2413 	/*
2414 	 * If the number of errors we saw was correctable -- less than or equal
2415 	 * to the number of parity disks read -- attempt to produce data that
2416 	 * has a valid checksum. Naturally, this case applies in the absence of
2417 	 * any errors.
2418 	 */
2419 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
2420 		if (data_errors == 0) {
2421 			if (raidz_checksum_verify(zio) == 0) {
2422 				/*
2423 				 * If we read parity information (unnecessarily
2424 				 * as it happens since no reconstruction was
2425 				 * needed) regenerate and verify the parity.
2426 				 * We also regenerate parity when resilvering
2427 				 * so we can write it out to the failed device
2428 				 * later.
2429 				 */
2430 				if (parity_errors + parity_untried <
2431 				    rm->rm_firstdatacol ||
2432 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2433 					n = raidz_parity_verify(zio, rm);
2434 					unexpected_errors += n;
2435 					ASSERT(parity_errors + n <=
2436 					    rm->rm_firstdatacol);
2437 				}
2438 				goto done;
2439 			}
2440 		} else {
2441 			/*
2442 			 * We either attempt to read all the parity columns or
2443 			 * none of them. If we didn't try to read parity, we
2444 			 * wouldn't be here in the correctable case. There must
2445 			 * also have been fewer parity errors than parity
2446 			 * columns or, again, we wouldn't be in this code path.
2447 			 */
2448 			ASSERT(parity_untried == 0);
2449 			ASSERT(parity_errors < rm->rm_firstdatacol);
2450 
2451 			/*
2452 			 * Identify the data columns that reported an error.
2453 			 */
2454 			n = 0;
2455 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
2456 				rc = &rm->rm_col[c];
2457 				if (rc->rc_error != 0) {
2458 					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2459 					tgts[n++] = c;
2460 				}
2461 			}
2462 
2463 			ASSERT(rm->rm_firstdatacol >= n);
2464 
2465 			code = vdev_raidz_reconstruct(rm, tgts, n);
2466 
2467 			if (raidz_checksum_verify(zio) == 0) {
2468 				atomic_inc_64(&raidz_corrected[code]);
2469 
2470 				/*
2471 				 * If we read more parity disks than were used
2472 				 * for reconstruction, confirm that the other
2473 				 * parity disks produced correct data. This
2474 				 * routine is suboptimal in that it regenerates
2475 				 * the parity that we already used in addition
2476 				 * to the parity that we're attempting to
2477 				 * verify, but this should be a relatively
2478 				 * uncommon case, and can be optimized if it
2479 				 * becomes a problem. Note that we regenerate
2480 				 * parity when resilvering so we can write it
2481 				 * out to failed devices later.
2482 				 */
2483 				if (parity_errors < rm->rm_firstdatacol - n ||
2484 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2485 					n = raidz_parity_verify(zio, rm);
2486 					unexpected_errors += n;
2487 					ASSERT(parity_errors + n <=
2488 					    rm->rm_firstdatacol);
2489 				}
2490 
2491 				goto done;
2492 			}
2493 		}
2494 	}
2495 
2496 	/*
2497 	 * This isn't a typical situation -- either we got a read error or
2498 	 * a child silently returned bad data. Read every block so we can
2499 	 * try again with as much data and parity as we can track down. If
2500 	 * we've already been through once before, all children will be marked
2501 	 * as tried so we'll proceed to combinatorial reconstruction.
2502 	 */
2503 	unexpected_errors = 1;
2504 	rm->rm_missingdata = 0;
2505 	rm->rm_missingparity = 0;
2506 
2507 	for (c = 0; c < rm->rm_cols; c++) {
2508 		if (rm->rm_col[c].rc_tried)
2509 			continue;
2510 
2511 		zio_vdev_io_redone(zio);
2512 		do {
2513 			rc = &rm->rm_col[c];
2514 			if (rc->rc_tried)
2515 				continue;
2516 			zio_nowait(zio_vdev_child_io(zio, NULL,
2517 			    vd->vdev_child[rc->rc_devidx],
2518 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
2519 			    zio->io_type, zio->io_priority, 0,
2520 			    vdev_raidz_child_done, rc));
2521 		} while (++c < rm->rm_cols);
2522 
2523 		return;
2524 	}
2525 
2526 	/*
2527 	 * At this point we've attempted to reconstruct the data given the
2528 	 * errors we detected, and we've attempted to read all columns. There
2529 	 * must, therefore, be one or more additional problems -- silent errors
2530 	 * resulting in invalid data rather than explicit I/O errors resulting
2531 	 * in absent data. We check if there is enough additional data to
2532 	 * possibly reconstruct the data and then perform combinatorial
2533 	 * reconstruction over all possible combinations. If that fails,
2534 	 * we're cooked.
2535 	 */
2536 	if (total_errors > rm->rm_firstdatacol) {
2537 		zio->io_error = vdev_raidz_worst_error(rm);
2538 
2539 	} else if (total_errors < rm->rm_firstdatacol &&
2540 	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2541 		/*
2542 		 * If we didn't use all the available parity for the
2543 		 * combinatorial reconstruction, verify that the remaining
2544 		 * parity is correct.
2545 		 */
2546 		if (code != (1 << rm->rm_firstdatacol) - 1)
2547 			(void) raidz_parity_verify(zio, rm);
2548 	} else {
2549 		/*
2550 		 * We're here because either:
2551 		 *
2552 		 *	total_errors == rm_first_datacol, or
2553 		 *	vdev_raidz_combrec() failed
2554 		 *
2555 		 * In either case, there is enough bad data to prevent
2556 		 * reconstruction.
2557 		 *
2558 		 * Start checksum ereports for all children which haven't
2559 		 * failed, and the IO wasn't speculative.
2560 		 */
2561 		zio->io_error = SET_ERROR(ECKSUM);
2562 
2563 		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2564 			for (c = 0; c < rm->rm_cols; c++) {
2565 				rc = &rm->rm_col[c];
2566 				if (rc->rc_error == 0) {
2567 					zio_bad_cksum_t zbc;
2568 					zbc.zbc_has_cksum = 0;
2569 					zbc.zbc_injected =
2570 					    rm->rm_ecksuminjected;
2571 
2572 					zfs_ereport_start_checksum(
2573 					    zio->io_spa,
2574 					    vd->vdev_child[rc->rc_devidx],
2575 					    &zio->io_bookmark, zio,
2576 					    rc->rc_offset, rc->rc_size,
2577 					    (void *)(uintptr_t)c, &zbc);
2578 				}
2579 			}
2580 		}
2581 	}
2582 
2583 done:
2584 	zio_checksum_verified(zio);
2585 
2586 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2587 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2588 		/*
2589 		 * Use the good data we have in hand to repair damaged children.
2590 		 */
2591 		for (c = 0; c < rm->rm_cols; c++) {
2592 			rc = &rm->rm_col[c];
2593 			cvd = vd->vdev_child[rc->rc_devidx];
2594 
2595 			if (rc->rc_error == 0)
2596 				continue;
2597 
2598 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2599 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
2600 			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2601 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2602 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2603 		}
2604 	}
2605 }
2606 
2607 static void
2608 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2609 {
2610 	if (faulted > vd->vdev_nparity)
2611 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2612 		    VDEV_AUX_NO_REPLICAS);
2613 	else if (degraded + faulted != 0)
2614 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2615 	else
2616 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2617 }
2618 
2619 /*
2620  * Determine if any portion of the provided block resides on a child vdev
2621  * with a dirty DTL and therefore needs to be resilvered.  The function
2622  * assumes that at least one DTL is dirty which imples that full stripe
2623  * width blocks must be resilvered.
2624  */
2625 static boolean_t
2626 vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
2627 {
2628 	uint64_t dcols = vd->vdev_children;
2629 	uint64_t nparity = vd->vdev_nparity;
2630 	uint64_t ashift = vd->vdev_top->vdev_ashift;
2631 	/* The starting RAIDZ (parent) vdev sector of the block. */
2632 	uint64_t b = offset >> ashift;
2633 	/* The zio's size in units of the vdev's minimum sector size. */
2634 	uint64_t s = ((psize - 1) >> ashift) + 1;
2635 	/* The first column for this stripe. */
2636 	uint64_t f = b % dcols;
2637 
2638 	if (s + nparity >= dcols)
2639 		return (B_TRUE);
2640 
2641 	for (uint64_t c = 0; c < s + nparity; c++) {
2642 		uint64_t devidx = (f + c) % dcols;
2643 		vdev_t *cvd = vd->vdev_child[devidx];
2644 
2645 		/*
2646 		 * dsl_scan_need_resilver() already checked vd with
2647 		 * vdev_dtl_contains(). So here just check cvd with
2648 		 * vdev_dtl_empty(), cheaper and a good approximation.
2649 		 */
2650 		if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
2651 			return (B_TRUE);
2652 	}
2653 
2654 	return (B_FALSE);
2655 }
2656 
2657 static void
2658 vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res)
2659 {
2660 	vdev_t *raidvd = cvd->vdev_parent;
2661 	ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
2662 
2663 	uint64_t width = raidvd->vdev_children;
2664 	uint64_t tgt_col = cvd->vdev_id;
2665 	uint64_t ashift = raidvd->vdev_top->vdev_ashift;
2666 
2667 	/* make sure the offsets are block-aligned */
2668 	ASSERT0(in->rs_start % (1 << ashift));
2669 	ASSERT0(in->rs_end % (1 << ashift));
2670 	uint64_t b_start = in->rs_start >> ashift;
2671 	uint64_t b_end = in->rs_end >> ashift;
2672 
2673 	uint64_t start_row = 0;
2674 	if (b_start > tgt_col) /* avoid underflow */
2675 		start_row = ((b_start - tgt_col - 1) / width) + 1;
2676 
2677 	uint64_t end_row = 0;
2678 	if (b_end > tgt_col)
2679 		end_row = ((b_end - tgt_col - 1) / width) + 1;
2680 
2681 	res->rs_start = start_row << ashift;
2682 	res->rs_end = end_row << ashift;
2683 
2684 	ASSERT3U(res->rs_start, <=, in->rs_start);
2685 	ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start);
2686 }
2687 
/*
 * Operations vector for the RAID-Z vdev type.
 *
 * The asize, io_start, io_done, state_change, need_resilver and xlate
 * entries point at the functions of the same name in this file; the open
 * and close entries refer to vdev_raidz_open() and vdev_raidz_close().
 * The hold, rele and remap entries are left NULL.
 */
2688 vdev_ops_t vdev_raidz_ops = {
2689 	.vdev_op_open = vdev_raidz_open,
2690 	.vdev_op_close = vdev_raidz_close,
2691 	.vdev_op_asize = vdev_raidz_asize,
2692 	.vdev_op_io_start = vdev_raidz_io_start,
2693 	.vdev_op_io_done = vdev_raidz_io_done,
2694 	.vdev_op_state_change = vdev_raidz_state_change,
2695 	.vdev_op_need_resilver = vdev_raidz_need_resilver,
2696 	.vdev_op_hold = NULL,
2697 	.vdev_op_rele = NULL,
2698 	.vdev_op_remap = NULL,
2699 	.vdev_op_xlate = vdev_raidz_xlate,
2700 	.vdev_op_type = VDEV_TYPE_RAIDZ,	/* name of this vdev type */
2701 	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
2702 };
2703