xref: /illumos-gate/usr/src/uts/common/fs/zfs/vdev_raidz.c (revision ac04831df9010f533275633d3bb9829b86c171d3)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
25  * Copyright 2019 Joyent, Inc.
26  * Copyright (c) 2014 Integros [integros.com]
27  */
28 
29 #include <sys/zfs_context.h>
30 #include <sys/spa.h>
31 #include <sys/vdev_impl.h>
32 #include <sys/vdev_file.h>
33 #include <sys/zio.h>
34 #include <sys/zio_checksum.h>
35 #include <sys/abd.h>
36 #include <sys/fs/zfs.h>
37 #include <sys/fm/fs/zfs.h>
38 
39 #ifdef ZFS_DEBUG
40 #include <sys/vdev.h>	/* For vdev_xlate() in vdev_raidz_io_verify() */
41 #endif
42 
43 /*
44  * Virtual device vector for RAID-Z.
45  *
46  * This vdev supports single, double, and triple parity. For single parity,
47  * we use a simple XOR of all the data columns. For double or triple parity,
48  * we use a special case of Reed-Solomon coding. This extends the
49  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
50  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
51  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
52  * former is also based. The latter is designed to provide higher performance
53  * for writes.
54  *
55  * Note that the Plank paper claimed to support arbitrary N+M, but was then
56  * amended six years later identifying a critical flaw that invalidates its
57  * claims. Nevertheless, the technique can be adapted to work for up to
58  * triple parity. For additional parity, the amendment "Note: Correction to
59  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
60  * is viable, but the additional complexity means that write performance will
61  * suffer.
62  *
63  * All of the methods above operate on a Galois field, defined over the
64  * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
65  * can be expressed with a single byte. Briefly, the operations on the
66  * field are defined as follows:
67  *
68  *   o addition (+) is represented by a bitwise XOR
69  *   o subtraction (-) is therefore identical to addition: A + B = A - B
70  *   o multiplication of A by 2 is defined by the following bitwise expression:
71  *
72  *	(A * 2)_7 = A_6
73  *	(A * 2)_6 = A_5
74  *	(A * 2)_5 = A_4
75  *	(A * 2)_4 = A_3 + A_7
76  *	(A * 2)_3 = A_2 + A_7
77  *	(A * 2)_2 = A_1 + A_7
78  *	(A * 2)_1 = A_0
79  *	(A * 2)_0 = A_7
80  *
81  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
82  * As an aside, this multiplication is derived from the error correcting
83  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
84  *
85  * Observe that any number in the field (except for 0) can be expressed as a
86  * power of 2 -- a generator for the field. We store a table of the powers of
87  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
88  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
89  * than field addition). The inverse of a field element A (A^-1) is therefore
90  * A ^ (255 - 1) = A^254.
91  *
92  * The up-to-three parity columns, P, Q, R over several data columns,
93  * D_0, ... D_n-1, can be expressed by field operations:
94  *
95  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
96  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
97  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
98  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
99  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
100  *
101  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival
102  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
103  * independent coefficients. (There are no additional coefficients that have
104  * this property which is why the uncorrected Plank method breaks down.)
105  *
106  * See the reconstruction code below for how P, Q and R can used individually
107  * or in concert to recover missing data columns.
108  */
109 
110 typedef struct raidz_col {
111 	uint64_t rc_devidx;		/* child device index for I/O */
112 	uint64_t rc_offset;		/* device offset */
113 	uint64_t rc_size;		/* I/O size */
114 	abd_t *rc_abd;			/* I/O data */
115 	void *rc_gdata;			/* used to store the "good" version */
116 	int rc_error;			/* I/O error for this device */
117 	uint8_t rc_tried;		/* Did we attempt this I/O column? */
118 	uint8_t rc_skipped;		/* Did we skip this I/O column? */
119 } raidz_col_t;
120 
121 typedef struct raidz_map {
122 	uint64_t rm_cols;		/* Regular column count */
123 	uint64_t rm_scols;		/* Count including skipped columns */
124 	uint64_t rm_bigcols;		/* Number of oversized columns */
125 	uint64_t rm_asize;		/* Actual total I/O size */
126 	uint64_t rm_missingdata;	/* Count of missing data devices */
127 	uint64_t rm_missingparity;	/* Count of missing parity devices */
128 	uint64_t rm_firstdatacol;	/* First data column/parity count */
129 	uint64_t rm_nskip;		/* Skipped sectors for padding */
130 	uint64_t rm_skipstart;		/* Column index of padding start */
131 	abd_t *rm_abd_copy;		/* rm_asize-buffer of copied data */
132 	uintptr_t rm_reports;		/* # of referencing checksum reports */
133 	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
134 	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
135 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
136 } raidz_map_t;
137 
138 #define	VDEV_RAIDZ_P		0
139 #define	VDEV_RAIDZ_Q		1
140 #define	VDEV_RAIDZ_R		2
141 
142 #define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
143 #define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
144 
145 /*
146  * We provide a mechanism to perform the field multiplication operation on a
147  * 64-bit value all at once rather than a byte at a time. This works by
148  * creating a mask from the top bit in each byte and using that to
149  * conditionally apply the XOR of 0x1d.
150  */
151 #define	VDEV_RAIDZ_64MUL_2(x, mask) \
152 { \
153 	(mask) = (x) & 0x8080808080808080ULL; \
154 	(mask) = ((mask) << 1) - ((mask) >> 7); \
155 	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
156 	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
157 }
158 
159 #define	VDEV_RAIDZ_64MUL_4(x, mask) \
160 { \
161 	VDEV_RAIDZ_64MUL_2((x), mask); \
162 	VDEV_RAIDZ_64MUL_2((x), mask); \
163 }
164 
165 #define	VDEV_LABEL_OFFSET(x)	(x + VDEV_LABEL_START_SIZE)
166 
167 /*
168  * Force reconstruction to use the general purpose method.
169  */
170 int vdev_raidz_default_to_general;
171 
172 /* Powers of 2 in the Galois field defined above. */
173 static const uint8_t vdev_raidz_pow2[256] = {
174 	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
175 	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
176 	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
177 	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
178 	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
179 	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
180 	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
181 	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
182 	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
183 	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
184 	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
185 	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
186 	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
187 	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
188 	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
189 	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
190 	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
191 	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
192 	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
193 	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
194 	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
195 	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
196 	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
197 	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
198 	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
199 	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
200 	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
201 	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
202 	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
203 	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
204 	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
205 	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
206 };
207 /* Logs of 2 in the Galois field defined above. */
208 static const uint8_t vdev_raidz_log2[256] = {
209 	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
210 	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
211 	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
212 	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
213 	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
214 	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
215 	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
216 	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
217 	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
218 	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
219 	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
220 	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
221 	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
222 	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
223 	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
224 	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
225 	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
226 	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
227 	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
228 	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
229 	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
230 	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
231 	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
232 	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
233 	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
234 	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
235 	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
236 	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
237 	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
238 	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
239 	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
240 	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
241 };
242 
243 static void vdev_raidz_generate_parity(raidz_map_t *rm);
244 
245 /*
246  * Multiply a given number by 2 raised to the given power.
247  */
248 static uint8_t
249 vdev_raidz_exp2(uint_t a, int exp)
250 {
251 	if (a == 0)
252 		return (0);
253 
254 	ASSERT(exp >= 0);
255 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
256 
257 	exp += vdev_raidz_log2[a];
258 	if (exp > 255)
259 		exp -= 255;
260 
261 	return (vdev_raidz_pow2[exp]);
262 }
263 
264 static void
265 vdev_raidz_map_free(raidz_map_t *rm)
266 {
267 	int c;
268 
269 	for (c = 0; c < rm->rm_firstdatacol; c++) {
270 		abd_free(rm->rm_col[c].rc_abd);
271 
272 		if (rm->rm_col[c].rc_gdata != NULL)
273 			abd_free(rm->rm_col[c].rc_gdata);
274 
275 	}
276 
277 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
278 		abd_put(rm->rm_col[c].rc_abd);
279 
280 	if (rm->rm_abd_copy != NULL)
281 		abd_free(rm->rm_abd_copy);
282 
283 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
284 }
285 
286 static void
287 vdev_raidz_map_free_vsd(zio_t *zio)
288 {
289 	raidz_map_t *rm = zio->io_vsd;
290 
291 	ASSERT0(rm->rm_freed);
292 	rm->rm_freed = 1;
293 
294 	if (rm->rm_reports == 0)
295 		vdev_raidz_map_free(rm);
296 }
297 
298 /*ARGSUSED*/
299 static void
300 vdev_raidz_cksum_free(void *arg, size_t ignored)
301 {
302 	raidz_map_t *rm = arg;
303 
304 	ASSERT3U(rm->rm_reports, >, 0);
305 
306 	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
307 		vdev_raidz_map_free(rm);
308 }
309 
310 static void
311 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data)
312 {
313 	raidz_map_t *rm = zcr->zcr_cbdata;
314 	size_t c = zcr->zcr_cbinfo;
315 	size_t x, offset;
316 
317 	const abd_t *good = NULL;
318 	const abd_t *bad = rm->rm_col[c].rc_abd;
319 
320 	if (good_data == NULL) {
321 		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
322 		return;
323 	}
324 
325 	if (c < rm->rm_firstdatacol) {
326 		/*
327 		 * The first time through, calculate the parity blocks for
328 		 * the good data (this relies on the fact that the good
329 		 * data never changes for a given logical ZIO)
330 		 */
331 		if (rm->rm_col[0].rc_gdata == NULL) {
332 			abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
333 
334 			/*
335 			 * Set up the rm_col[]s to generate the parity for
336 			 * good_data, first saving the parity bufs and
337 			 * replacing them with buffers to hold the result.
338 			 */
339 			for (x = 0; x < rm->rm_firstdatacol; x++) {
340 				bad_parity[x] = rm->rm_col[x].rc_abd;
341 				rm->rm_col[x].rc_abd =
342 				    rm->rm_col[x].rc_gdata =
343 				    abd_alloc_sametype(rm->rm_col[x].rc_abd,
344 				    rm->rm_col[x].rc_size);
345 			}
346 
347 			/* fill in the data columns from good_data */
348 			offset = 0;
349 			for (; x < rm->rm_cols; x++) {
350 				abd_put(rm->rm_col[x].rc_abd);
351 
352 				rm->rm_col[x].rc_abd =
353 				    abd_get_offset_size((abd_t *)good_data,
354 				    offset, rm->rm_col[x].rc_size);
355 				offset += rm->rm_col[x].rc_size;
356 			}
357 
358 			/*
359 			 * Construct the parity from the good data.
360 			 */
361 			vdev_raidz_generate_parity(rm);
362 
363 			/* restore everything back to its original state */
364 			for (x = 0; x < rm->rm_firstdatacol; x++)
365 				rm->rm_col[x].rc_abd = bad_parity[x];
366 
367 			offset = 0;
368 			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
369 				abd_put(rm->rm_col[x].rc_abd);
370 				rm->rm_col[x].rc_abd = abd_get_offset_size(
371 				    rm->rm_abd_copy, offset,
372 				    rm->rm_col[x].rc_size);
373 				offset += rm->rm_col[x].rc_size;
374 			}
375 		}
376 
377 		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
378 		good = abd_get_offset_size(rm->rm_col[c].rc_gdata, 0,
379 		    rm->rm_col[c].rc_size);
380 	} else {
381 		/* adjust good_data to point at the start of our column */
382 		offset = 0;
383 		for (x = rm->rm_firstdatacol; x < c; x++)
384 			offset += rm->rm_col[x].rc_size;
385 
386 		good = abd_get_offset_size((abd_t *)good_data, offset,
387 		    rm->rm_col[c].rc_size);
388 	}
389 
390 	/* we drop the ereport if it ends up that the data was good */
391 	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
392 	abd_put((abd_t *)good);
393 }
394 
395 /*
396  * Invoked indirectly by zfs_ereport_start_checksum(), called
397  * below when our read operation fails completely.  The main point
398  * is to keep a copy of everything we read from disk, so that at
399  * vdev_raidz_cksum_finish() time we can compare it with the good data.
400  */
401 static void
402 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
403 {
404 	size_t c = (size_t)(uintptr_t)arg;
405 	size_t offset;
406 
407 	raidz_map_t *rm = zio->io_vsd;
408 	size_t size;
409 
410 	/* set up the report and bump the refcount  */
411 	zcr->zcr_cbdata = rm;
412 	zcr->zcr_cbinfo = c;
413 	zcr->zcr_finish = vdev_raidz_cksum_finish;
414 	zcr->zcr_free = vdev_raidz_cksum_free;
415 
416 	rm->rm_reports++;
417 	ASSERT3U(rm->rm_reports, >, 0);
418 
419 	if (rm->rm_abd_copy != NULL)
420 		return;
421 
422 	/*
423 	 * It's the first time we're called for this raidz_map_t, so we need
424 	 * to copy the data aside; there's no guarantee that our zio's buffer
425 	 * won't be re-used for something else.
426 	 *
427 	 * Our parity data is already in separate buffers, so there's no need
428 	 * to copy them.
429 	 */
430 
431 	size = 0;
432 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
433 		size += rm->rm_col[c].rc_size;
434 
435 	rm->rm_abd_copy = abd_alloc_for_io(size, B_FALSE);
436 
437 	for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
438 		raidz_col_t *col = &rm->rm_col[c];
439 		abd_t *tmp = abd_get_offset_size(rm->rm_abd_copy, offset,
440 		    col->rc_size);
441 
442 		ASSERT3S(tmp->abd_size, >=, col->rc_size);
443 		ASSERT3S(col->rc_abd->abd_size, >=, col->rc_size);
444 		abd_copy_off(tmp, col->rc_abd, 0, 0, col->rc_size);
445 		abd_put(col->rc_abd);
446 		col->rc_abd = tmp;
447 
448 		offset += col->rc_size;
449 	}
450 	ASSERT3U(offset, ==, size);
451 }
452 
453 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
454 	vdev_raidz_map_free_vsd,
455 	vdev_raidz_cksum_report
456 };
457 
458 /*
459  * Divides the IO evenly across all child vdevs; usually, dcols is
460  * the number of children in the target vdev.
461  */
462 static raidz_map_t *
463 vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset,
464     uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
465 {
466 	raidz_map_t *rm;
467 	/* The starting RAIDZ (parent) vdev sector of the block. */
468 	uint64_t b = offset >> unit_shift;
469 	/* The zio's size in units of the vdev's minimum sector size. */
470 	uint64_t s = size >> unit_shift;
471 	/* The first column for this stripe. */
472 	uint64_t f = b % dcols;
473 	/* The starting byte offset on each child vdev. */
474 	uint64_t o = (b / dcols) << unit_shift;
475 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
476 	uint64_t off = 0;
477 
478 	/*
479 	 * "Quotient": The number of data sectors for this stripe on all but
480 	 * the "big column" child vdevs that also contain "remainder" data.
481 	 */
482 	q = s / (dcols - nparity);
483 
484 	/*
485 	 * "Remainder": The number of partial stripe data sectors in this I/O.
486 	 * This will add a sector to some, but not all, child vdevs.
487 	 */
488 	r = s - q * (dcols - nparity);
489 
490 	/* The number of "big columns" - those which contain remainder data. */
491 	bc = (r == 0 ? 0 : r + nparity);
492 
493 	/*
494 	 * The total number of data and parity sectors associated with
495 	 * this I/O.
496 	 */
497 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
498 
499 	/* acols: The columns that will be accessed. */
500 	/* scols: The columns that will be accessed or skipped. */
501 	if (q == 0) {
502 		/* Our I/O request doesn't span all child vdevs. */
503 		acols = bc;
504 		scols = MIN(dcols, roundup(bc, nparity + 1));
505 	} else {
506 		acols = dcols;
507 		scols = dcols;
508 	}
509 
510 	ASSERT3U(acols, <=, scols);
511 
512 	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
513 
514 	rm->rm_cols = acols;
515 	rm->rm_scols = scols;
516 	rm->rm_bigcols = bc;
517 	rm->rm_skipstart = bc;
518 	rm->rm_missingdata = 0;
519 	rm->rm_missingparity = 0;
520 	rm->rm_firstdatacol = nparity;
521 	rm->rm_abd_copy = NULL;
522 	rm->rm_reports = 0;
523 	rm->rm_freed = 0;
524 	rm->rm_ecksuminjected = 0;
525 
526 	asize = 0;
527 
528 	for (c = 0; c < scols; c++) {
529 		col = f + c;
530 		coff = o;
531 		if (col >= dcols) {
532 			col -= dcols;
533 			coff += 1ULL << unit_shift;
534 		}
535 		rm->rm_col[c].rc_devidx = col;
536 		rm->rm_col[c].rc_offset = coff;
537 		rm->rm_col[c].rc_abd = NULL;
538 		rm->rm_col[c].rc_gdata = NULL;
539 		rm->rm_col[c].rc_error = 0;
540 		rm->rm_col[c].rc_tried = 0;
541 		rm->rm_col[c].rc_skipped = 0;
542 
543 		if (c >= acols)
544 			rm->rm_col[c].rc_size = 0;
545 		else if (c < bc)
546 			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
547 		else
548 			rm->rm_col[c].rc_size = q << unit_shift;
549 
550 		asize += rm->rm_col[c].rc_size;
551 	}
552 
553 	ASSERT3U(asize, ==, tot << unit_shift);
554 	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
555 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
556 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
557 	ASSERT3U(rm->rm_nskip, <=, nparity);
558 
559 	for (c = 0; c < rm->rm_firstdatacol; c++)
560 		rm->rm_col[c].rc_abd =
561 		    abd_alloc_linear(rm->rm_col[c].rc_size, B_FALSE);
562 
563 	rm->rm_col[c].rc_abd = abd_get_offset_size(abd, 0,
564 	    rm->rm_col[c].rc_size);
565 	off = rm->rm_col[c].rc_size;
566 
567 	for (c = c + 1; c < acols; c++) {
568 		rm->rm_col[c].rc_abd = abd_get_offset_size(abd, off,
569 		    rm->rm_col[c].rc_size);
570 		off += rm->rm_col[c].rc_size;
571 	}
572 
573 	/*
574 	 * If all data stored spans all columns, there's a danger that parity
575 	 * will always be on the same device and, since parity isn't read
576 	 * during normal operation, that that device's I/O bandwidth won't be
577 	 * used effectively. We therefore switch the parity every 1MB.
578 	 *
579 	 * ... at least that was, ostensibly, the theory. As a practical
580 	 * matter unless we juggle the parity between all devices evenly, we
581 	 * won't see any benefit. Further, occasional writes that aren't a
582 	 * multiple of the LCM of the number of children and the minimum
583 	 * stripe width are sufficient to avoid pessimal behavior.
584 	 * Unfortunately, this decision created an implicit on-disk format
585 	 * requirement that we need to support for all eternity, but only
586 	 * for single-parity RAID-Z.
587 	 *
588 	 * If we intend to skip a sector in the zeroth column for padding
589 	 * we must make sure to note this swap. We will never intend to
590 	 * skip the first column since at least one data and one parity
591 	 * column must appear in each row.
592 	 */
593 	ASSERT(rm->rm_cols >= 2);
594 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
595 
596 	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
597 		devidx = rm->rm_col[0].rc_devidx;
598 		o = rm->rm_col[0].rc_offset;
599 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
600 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
601 		rm->rm_col[1].rc_devidx = devidx;
602 		rm->rm_col[1].rc_offset = o;
603 
604 		if (rm->rm_skipstart == 0)
605 			rm->rm_skipstart = 1;
606 	}
607 
608 	return (rm);
609 }
610 
611 struct pqr_struct {
612 	uint64_t *p;
613 	uint64_t *q;
614 	uint64_t *r;
615 };
616 
617 static int
618 vdev_raidz_p_func(void *buf, size_t size, void *private)
619 {
620 	struct pqr_struct *pqr = private;
621 	const uint64_t *src = buf;
622 	int i, cnt = size / sizeof (src[0]);
623 
624 	ASSERT(pqr->p && !pqr->q && !pqr->r);
625 
626 	for (i = 0; i < cnt; i++, src++, pqr->p++)
627 		*pqr->p ^= *src;
628 
629 	return (0);
630 }
631 
632 static int
633 vdev_raidz_pq_func(void *buf, size_t size, void *private)
634 {
635 	struct pqr_struct *pqr = private;
636 	const uint64_t *src = buf;
637 	uint64_t mask;
638 	int i, cnt = size / sizeof (src[0]);
639 
640 	ASSERT(pqr->p && pqr->q && !pqr->r);
641 
642 	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
643 		*pqr->p ^= *src;
644 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
645 		*pqr->q ^= *src;
646 	}
647 
648 	return (0);
649 }
650 
651 static int
652 vdev_raidz_pqr_func(void *buf, size_t size, void *private)
653 {
654 	struct pqr_struct *pqr = private;
655 	const uint64_t *src = buf;
656 	uint64_t mask;
657 	int i, cnt = size / sizeof (src[0]);
658 
659 	ASSERT(pqr->p && pqr->q && pqr->r);
660 
661 	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
662 		*pqr->p ^= *src;
663 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
664 		*pqr->q ^= *src;
665 		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
666 		*pqr->r ^= *src;
667 	}
668 
669 	return (0);
670 }
671 
672 static void
673 vdev_raidz_generate_parity_p(raidz_map_t *rm)
674 {
675 	uint64_t *p;
676 	int c;
677 	abd_t *src;
678 
679 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
680 		src = rm->rm_col[c].rc_abd;
681 		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
682 
683 		if (c == rm->rm_firstdatacol) {
684 			ASSERT3U(src->abd_size, >=, rm->rm_col[c].rc_size);
685 			abd_copy_to_buf_off(p, src, 0, rm->rm_col[c].rc_size);
686 		} else {
687 			struct pqr_struct pqr = { p, NULL, NULL };
688 			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
689 			    vdev_raidz_p_func, &pqr);
690 		}
691 	}
692 }
693 
694 static void
695 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
696 {
697 	uint64_t *p, *q, pcnt, ccnt, mask, i;
698 	int c;
699 	abd_t *src;
700 
701 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
702 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
703 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
704 
705 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
706 		src = rm->rm_col[c].rc_abd;
707 		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
708 		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
709 
710 		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
711 
712 		if (c == rm->rm_firstdatacol) {
713 			ASSERT(ccnt == pcnt || ccnt == 0);
714 
715 			abd_copy_to_buf_off(p, src, 0, rm->rm_col[c].rc_size);
716 			(void) memcpy(q, p, rm->rm_col[c].rc_size);
717 			for (i = ccnt; i < pcnt; i++) {
718 				p[i] = 0;
719 				q[i] = 0;
720 			}
721 		} else {
722 			struct pqr_struct pqr = { p, q, NULL };
723 
724 			ASSERT(ccnt <= pcnt);
725 
726 			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
727 			    vdev_raidz_pq_func, &pqr);
728 
729 			/*
730 			 * Treat short columns as though they are full of 0s.
731 			 * Note that there's therefore nothing needed for P.
732 			 */
733 			for (i = ccnt; i < pcnt; i++) {
734 				VDEV_RAIDZ_64MUL_2(q[i], mask);
735 			}
736 		}
737 	}
738 }
739 
740 static void
741 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
742 {
743 	uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
744 	int c;
745 	abd_t *src;
746 
747 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
748 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
749 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
750 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
751 	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
752 
753 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
754 		src = rm->rm_col[c].rc_abd;
755 		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
756 		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
757 		r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);
758 
759 		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
760 
761 		if (c == rm->rm_firstdatacol) {
762 			ASSERT3S(src->abd_size, >=, rm->rm_col[c].rc_size);
763 			ASSERT(ccnt == pcnt || ccnt == 0);
764 			abd_copy_to_buf_off(p, src, 0, rm->rm_col[c].rc_size);
765 			(void) memcpy(q, p, rm->rm_col[c].rc_size);
766 			(void) memcpy(r, p, rm->rm_col[c].rc_size);
767 
768 			for (i = ccnt; i < pcnt; i++) {
769 				p[i] = 0;
770 				q[i] = 0;
771 				r[i] = 0;
772 			}
773 		} else {
774 			struct pqr_struct pqr = { p, q, r };
775 
776 			ASSERT(ccnt <= pcnt);
777 			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
778 			    vdev_raidz_pqr_func, &pqr);
779 
780 			/*
781 			 * Treat short columns as though they are full of 0s.
782 			 * Note that there's therefore nothing needed for P.
783 			 */
784 			for (i = ccnt; i < pcnt; i++) {
785 				VDEV_RAIDZ_64MUL_2(q[i], mask);
786 				VDEV_RAIDZ_64MUL_4(r[i], mask);
787 			}
788 		}
789 	}
790 }
791 
792 /*
793  * Generate RAID parity in the first virtual columns according to the number of
794  * parity columns available.
795  */
796 static void
797 vdev_raidz_generate_parity(raidz_map_t *rm)
798 {
799 	switch (rm->rm_firstdatacol) {
800 	case 1:
801 		vdev_raidz_generate_parity_p(rm);
802 		break;
803 	case 2:
804 		vdev_raidz_generate_parity_pq(rm);
805 		break;
806 	case 3:
807 		vdev_raidz_generate_parity_pqr(rm);
808 		break;
809 	default:
810 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
811 	}
812 }
813 
814 /* ARGSUSED */
815 static int
816 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
817 {
818 	uint64_t *dst = dbuf;
819 	uint64_t *src = sbuf;
820 	int cnt = size / sizeof (src[0]);
821 
822 	for (int i = 0; i < cnt; i++) {
823 		dst[i] ^= src[i];
824 	}
825 
826 	return (0);
827 }
828 
829 /* ARGSUSED */
830 static int
831 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
832     void *private)
833 {
834 	uint64_t *dst = dbuf;
835 	uint64_t *src = sbuf;
836 	uint64_t mask;
837 	int cnt = size / sizeof (dst[0]);
838 
839 	for (int i = 0; i < cnt; i++, dst++, src++) {
840 		VDEV_RAIDZ_64MUL_2(*dst, mask);
841 		*dst ^= *src;
842 	}
843 
844 	return (0);
845 }
846 
847 /* ARGSUSED */
848 static int
849 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
850 {
851 	uint64_t *dst = buf;
852 	uint64_t mask;
853 	int cnt = size / sizeof (dst[0]);
854 
855 	for (int i = 0; i < cnt; i++, dst++) {
856 		/* same operation as vdev_raidz_reconst_q_pre_func() on dst */
857 		VDEV_RAIDZ_64MUL_2(*dst, mask);
858 	}
859 
860 	return (0);
861 }
862 
863 struct reconst_q_struct {
864 	uint64_t *q;
865 	int exp;
866 };
867 
868 static int
869 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
870 {
871 	struct reconst_q_struct *rq = private;
872 	uint64_t *dst = buf;
873 	int cnt = size / sizeof (dst[0]);
874 
875 	for (int i = 0; i < cnt; i++, dst++, rq->q++) {
876 		*dst ^= *rq->q;
877 
878 		int j;
879 		uint8_t *b;
880 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
881 			*b = vdev_raidz_exp2(*b, rq->exp);
882 		}
883 	}
884 
885 	return (0);
886 }
887 
888 struct reconst_pq_struct {
889 	uint8_t *p;
890 	uint8_t *q;
891 	uint8_t *pxy;
892 	uint8_t *qxy;
893 	int aexp;
894 	int bexp;
895 };
896 
897 static int
898 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
899 {
900 	struct reconst_pq_struct *rpq = private;
901 	uint8_t *xd = xbuf;
902 	uint8_t *yd = ybuf;
903 
904 	for (int i = 0; i < size;
905 	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
906 		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
907 		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
908 		*yd = *rpq->p ^ *rpq->pxy ^ *xd;
909 	}
910 
911 	return (0);
912 }
913 
914 static int
915 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
916 {
917 	struct reconst_pq_struct *rpq = private;
918 	uint8_t *xd = xbuf;
919 
920 	for (int i = 0; i < size;
921 	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
922 		/* same operation as vdev_raidz_reconst_pq_func() on xd */
923 		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
924 		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
925 	}
926 
927 	return (0);
928 }
929 
930 static int
931 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
932 {
933 	int x = tgts[0];
934 	int c;
935 	abd_t *dst, *src;
936 
937 	ASSERT(ntgts == 1);
938 	ASSERT(x >= rm->rm_firstdatacol);
939 	ASSERT(x < rm->rm_cols);
940 
941 	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
942 	ASSERT(rm->rm_col[x].rc_size > 0);
943 
944 	src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
945 	dst = rm->rm_col[x].rc_abd;
946 
947 	ASSERT3S(dst->abd_size, >=, rm->rm_col[x].rc_size);
948 	ASSERT3S(src->abd_size, >=, rm->rm_col[x].rc_size);
949 	abd_copy_off(dst, src, 0, 0, rm->rm_col[x].rc_size);
950 
951 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
952 		uint64_t size = MIN(rm->rm_col[x].rc_size,
953 		    rm->rm_col[c].rc_size);
954 
955 		src = rm->rm_col[c].rc_abd;
956 		dst = rm->rm_col[x].rc_abd;
957 
958 		if (c == x)
959 			continue;
960 
961 		(void) abd_iterate_func2(dst, src, 0, 0, size,
962 		    vdev_raidz_reconst_p_func, NULL);
963 	}
964 
965 	return (1 << VDEV_RAIDZ_P);
966 }
967 
968 static int
969 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
970 {
971 	int x = tgts[0];
972 	int c, exp;
973 	abd_t *dst, *src;
974 
975 	ASSERT(ntgts == 1);
976 
977 	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);
978 
979 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
980 		uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
981 		    rm->rm_col[c].rc_size);
982 
983 		src = rm->rm_col[c].rc_abd;
984 		dst = rm->rm_col[x].rc_abd;
985 
986 		if (c == rm->rm_firstdatacol) {
987 			if (dst != src) {
988 				ASSERT3S(dst->abd_size, >=, size);
989 				ASSERT3S(src->abd_size, >=, size);
990 				abd_copy_off(dst, src, 0, 0, size);
991 			}
992 			if (rm->rm_col[x].rc_size > size)
993 				abd_zero_off(dst, size,
994 				    rm->rm_col[x].rc_size - size);
995 		} else {
996 			ASSERT3U(size, <=, rm->rm_col[x].rc_size);
997 			if (src != dst)
998 				(void) abd_iterate_func2(dst, src, 0, 0, size,
999 				    vdev_raidz_reconst_q_pre_func, NULL);
1000 			(void) abd_iterate_func(dst,
1001 			    size, rm->rm_col[x].rc_size - size,
1002 			    vdev_raidz_reconst_q_pre_tail_func, NULL);
1003 		}
1004 	}
1005 
1006 	src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
1007 	dst = rm->rm_col[x].rc_abd;
1008 	exp = 255 - (rm->rm_cols - 1 - x);
1009 
1010 	struct reconst_q_struct rq = { abd_to_buf(src), exp };
1011 	(void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
1012 	    vdev_raidz_reconst_q_post_func, &rq);
1013 
1014 	return (1 << VDEV_RAIDZ_Q);
1015 }
1016 
1017 static int
1018 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
1019 {
1020 	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
1021 	abd_t *pdata, *qdata;
1022 	uint64_t xsize, ysize;
1023 	int x = tgts[0];
1024 	int y = tgts[1];
1025 	abd_t *xd, *yd;
1026 
1027 	ASSERT(ntgts == 2);
1028 	ASSERT(x < y);
1029 	ASSERT(x >= rm->rm_firstdatacol);
1030 	ASSERT(y < rm->rm_cols);
1031 
1032 	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
1033 
1034 	/*
1035 	 * Move the parity data aside -- we're going to compute parity as
1036 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
1037 	 * reuse the parity generation mechanism without trashing the actual
1038 	 * parity so we make those columns appear to be full of zeros by
1039 	 * setting their lengths to zero.
1040 	 */
1041 	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
1042 	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
1043 	xsize = rm->rm_col[x].rc_size;
1044 	ysize = rm->rm_col[y].rc_size;
1045 
1046 	rm->rm_col[VDEV_RAIDZ_P].rc_abd =
1047 	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
1048 	rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
1049 	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
1050 	rm->rm_col[x].rc_size = 0;
1051 	rm->rm_col[y].rc_size = 0;
1052 
1053 	vdev_raidz_generate_parity_pq(rm);
1054 
1055 	rm->rm_col[x].rc_size = xsize;
1056 	rm->rm_col[y].rc_size = ysize;
1057 
1058 	p = abd_to_buf(pdata);
1059 	q = abd_to_buf(qdata);
1060 	pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
1061 	qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
1062 	xd = rm->rm_col[x].rc_abd;
1063 	yd = rm->rm_col[y].rc_abd;
1064 
1065 	/*
1066 	 * We now have:
1067 	 *	Pxy = P + D_x + D_y
1068 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
1069 	 *
1070 	 * We can then solve for D_x:
1071 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
1072 	 * where
1073 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
1074 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
1075 	 *
1076 	 * With D_x in hand, we can easily solve for D_y:
1077 	 *	D_y = P + Pxy + D_x
1078 	 */
1079 
1080 	a = vdev_raidz_pow2[255 + x - y];
1081 	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
1082 	tmp = 255 - vdev_raidz_log2[a ^ 1];
1083 
1084 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
1085 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
1086 
1087 	ASSERT3U(xsize, >=, ysize);
1088 	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
1089 	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
1090 	    vdev_raidz_reconst_pq_func, &rpq);
1091 	(void) abd_iterate_func(xd, ysize, xsize - ysize,
1092 	    vdev_raidz_reconst_pq_tail_func, &rpq);
1093 
1094 	abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
1095 	abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
1096 
1097 	/*
1098 	 * Restore the saved parity data.
1099 	 */
1100 	rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
1101 	rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;
1102 
1103 	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
1104 }
1105 
1106 /* BEGIN CSTYLED */
1107 /*
1108  * In the general case of reconstruction, we must solve the system of linear
1109  * equations defined by the coeffecients used to generate parity as well as
1110  * the contents of the data and parity disks. This can be expressed with
1111  * vectors for the original data (D) and the actual data (d) and parity (p)
1112  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
1113  *
1114  *            __   __                     __     __
1115  *            |     |         __     __   |  p_0  |
1116  *            |  V  |         |  D_0  |   | p_m-1 |
1117  *            |     |    x    |   :   | = |  d_0  |
1118  *            |  I  |         | D_n-1 |   |   :   |
1119  *            |     |         ~~     ~~   | d_n-1 |
1120  *            ~~   ~~                     ~~     ~~
1121  *
1122  * I is simply a square identity matrix of size n, and V is a vandermonde
1123  * matrix defined by the coeffecients we chose for the various parity columns
1124  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
1125  * computation as well as linear separability.
1126  *
1127  *      __               __               __     __
1128  *      |   1   ..  1 1 1 |               |  p_0  |
1129  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
1130  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
1131  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
1132  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
1133  *      |   :       : : : |   |   :   |   |  d_2  |
1134  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
1135  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
1136  *      |   0   ..  0 0 1 |               | d_n-1 |
1137  *      ~~               ~~               ~~     ~~
1138  *
1139  * Note that I, V, d, and p are known. To compute D, we must invert the
1140  * matrix and use the known data and parity values to reconstruct the unknown
1141  * data values. We begin by removing the rows in V|I and d|p that correspond
1142  * to failed or missing columns; we then make V|I square (n x n) and d|p
1143  * sized n by removing rows corresponding to unused parity from the bottom up
1144  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
1145  * using Gauss-Jordan elimination. In the example below we use m=3 parity
1146  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
1147  *           __                               __
1148  *           |  1   1   1   1   1   1   1   1  |
1149  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
1150  *           |  19 205 116  29  64  16  4   1  |      / /
1151  *           |  1   0   0   0   0   0   0   0  |     / /
1152  *           |  0   1   0   0   0   0   0   0  | <--' /
1153  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
1154  *           |  0   0   0   1   0   0   0   0  |
1155  *           |  0   0   0   0   1   0   0   0  |
1156  *           |  0   0   0   0   0   1   0   0  |
1157  *           |  0   0   0   0   0   0   1   0  |
1158  *           |  0   0   0   0   0   0   0   1  |
1159  *           ~~                               ~~
1160  *           __                               __
1161  *           |  1   1   1   1   1   1   1   1  |
1162  *           |  19 205 116  29  64  16  4   1  |
1163  *           |  1   0   0   0   0   0   0   0  |
1164  *  (V|I)' = |  0   0   0   1   0   0   0   0  |
1165  *           |  0   0   0   0   1   0   0   0  |
1166  *           |  0   0   0   0   0   1   0   0  |
1167  *           |  0   0   0   0   0   0   1   0  |
1168  *           |  0   0   0   0   0   0   0   1  |
1169  *           ~~                               ~~
1170  *
1171  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1172  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1173  * matrix is not singular.
1174  * __                                                                 __
1175  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1176  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1177  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1178  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1179  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1180  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1181  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1182  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1183  * ~~                                                                 ~~
1184  * __                                                                 __
1185  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1186  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1187  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1188  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1189  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1190  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1191  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1192  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1193  * ~~                                                                 ~~
1194  * __                                                                 __
1195  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1196  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1197  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1198  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1199  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1200  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1201  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1202  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1203  * ~~                                                                 ~~
1204  * __                                                                 __
1205  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1206  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1207  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1208  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1209  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1210  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1211  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1212  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1213  * ~~                                                                 ~~
1214  * __                                                                 __
1215  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1216  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1217  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1218  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1219  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1220  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1221  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1222  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1223  * ~~                                                                 ~~
1224  * __                                                                 __
1225  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1226  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1227  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1228  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1229  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1230  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1231  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1232  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1233  * ~~                                                                 ~~
1234  *                   __                               __
1235  *                   |  0   0   1   0   0   0   0   0  |
1236  *                   | 167 100  5   41 159 169 217 208 |
1237  *                   | 166 100  4   40 158 168 216 209 |
1238  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1239  *                   |  0   0   0   0   1   0   0   0  |
1240  *                   |  0   0   0   0   0   1   0   0  |
1241  *                   |  0   0   0   0   0   0   1   0  |
1242  *                   |  0   0   0   0   0   0   0   1  |
1243  *                   ~~                               ~~
1244  *
1245  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1246  * of the missing data.
1247  *
1248  * As is apparent from the example above, the only non-trivial rows in the
1249  * inverse matrix correspond to the data disks that we're trying to
1250  * reconstruct. Indeed, those are the only rows we need as the others would
1251  * only be useful for reconstructing data known or assumed to be valid. For
1252  * that reason, we only build the coefficients in the rows that correspond to
1253  * targeted columns.
1254  */
1255 /* END CSTYLED */
1256 
1257 static void
1258 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1259     uint8_t **rows)
1260 {
1261 	int i, j;
1262 	int pow;
1263 
1264 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1265 
1266 	/*
1267 	 * Fill in the missing rows of interest.
1268 	 */
1269 	for (i = 0; i < nmap; i++) {
1270 		ASSERT3S(0, <=, map[i]);
1271 		ASSERT3S(map[i], <=, 2);
1272 
1273 		pow = map[i] * n;
1274 		if (pow > 255)
1275 			pow -= 255;
1276 		ASSERT(pow <= 255);
1277 
1278 		for (j = 0; j < n; j++) {
1279 			pow -= map[i];
1280 			if (pow < 0)
1281 				pow += 255;
1282 			rows[i][j] = vdev_raidz_pow2[pow];
1283 		}
1284 	}
1285 }
1286 
1287 static void
1288 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1289     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1290 {
1291 	int i, j, ii, jj;
1292 	uint8_t log;
1293 
1294 	/*
1295 	 * Assert that the first nmissing entries from the array of used
1296 	 * columns correspond to parity columns and that subsequent entries
1297 	 * correspond to data columns.
1298 	 */
1299 	for (i = 0; i < nmissing; i++) {
1300 		ASSERT3S(used[i], <, rm->rm_firstdatacol);
1301 	}
1302 	for (; i < n; i++) {
1303 		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1304 	}
1305 
1306 	/*
1307 	 * First initialize the storage where we'll compute the inverse rows.
1308 	 */
1309 	for (i = 0; i < nmissing; i++) {
1310 		for (j = 0; j < n; j++) {
1311 			invrows[i][j] = (i == j) ? 1 : 0;
1312 		}
1313 	}
1314 
1315 	/*
1316 	 * Subtract all trivial rows from the rows of consequence.
1317 	 */
1318 	for (i = 0; i < nmissing; i++) {
1319 		for (j = nmissing; j < n; j++) {
1320 			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1321 			jj = used[j] - rm->rm_firstdatacol;
1322 			ASSERT3S(jj, <, n);
1323 			invrows[i][j] = rows[i][jj];
1324 			rows[i][jj] = 0;
1325 		}
1326 	}
1327 
1328 	/*
1329 	 * For each of the rows of interest, we must normalize it and subtract
1330 	 * a multiple of it from the other rows.
1331 	 */
1332 	for (i = 0; i < nmissing; i++) {
1333 		for (j = 0; j < missing[i]; j++) {
1334 			ASSERT0(rows[i][j]);
1335 		}
1336 		ASSERT3U(rows[i][missing[i]], !=, 0);
1337 
1338 		/*
1339 		 * Compute the inverse of the first element and multiply each
1340 		 * element in the row by that value.
1341 		 */
1342 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1343 
1344 		for (j = 0; j < n; j++) {
1345 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1346 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1347 		}
1348 
1349 		for (ii = 0; ii < nmissing; ii++) {
1350 			if (i == ii)
1351 				continue;
1352 
1353 			ASSERT3U(rows[ii][missing[i]], !=, 0);
1354 
1355 			log = vdev_raidz_log2[rows[ii][missing[i]]];
1356 
1357 			for (j = 0; j < n; j++) {
1358 				rows[ii][j] ^=
1359 				    vdev_raidz_exp2(rows[i][j], log);
1360 				invrows[ii][j] ^=
1361 				    vdev_raidz_exp2(invrows[i][j], log);
1362 			}
1363 		}
1364 	}
1365 
1366 	/*
1367 	 * Verify that the data that is left in the rows are properly part of
1368 	 * an identity matrix.
1369 	 */
1370 	for (i = 0; i < nmissing; i++) {
1371 		for (j = 0; j < n; j++) {
1372 			if (j == missing[i]) {
1373 				ASSERT3U(rows[i][j], ==, 1);
1374 			} else {
1375 				ASSERT0(rows[i][j]);
1376 			}
1377 		}
1378 	}
1379 }
1380 
1381 static void
1382 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1383     int *missing, uint8_t **invrows, const uint8_t *used)
1384 {
1385 	int i, j, x, cc, c;
1386 	uint8_t *src;
1387 	uint64_t ccount;
1388 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1389 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1390 	uint8_t log = 0;
1391 	uint8_t val;
1392 	int ll;
1393 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1394 	uint8_t *p, *pp;
1395 	size_t psize;
1396 
1397 	psize = sizeof (invlog[0][0]) * n * nmissing;
1398 	p = kmem_alloc(psize, KM_SLEEP);
1399 
1400 	for (pp = p, i = 0; i < nmissing; i++) {
1401 		invlog[i] = pp;
1402 		pp += n;
1403 	}
1404 
1405 	for (i = 0; i < nmissing; i++) {
1406 		for (j = 0; j < n; j++) {
1407 			ASSERT3U(invrows[i][j], !=, 0);
1408 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1409 		}
1410 	}
1411 
1412 	for (i = 0; i < n; i++) {
1413 		c = used[i];
1414 		ASSERT3U(c, <, rm->rm_cols);
1415 
1416 		src = abd_to_buf(rm->rm_col[c].rc_abd);
1417 		ccount = rm->rm_col[c].rc_size;
1418 		for (j = 0; j < nmissing; j++) {
1419 			cc = missing[j] + rm->rm_firstdatacol;
1420 			ASSERT3U(cc, >=, rm->rm_firstdatacol);
1421 			ASSERT3U(cc, <, rm->rm_cols);
1422 			ASSERT3U(cc, !=, c);
1423 
1424 			dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
1425 			dcount[j] = rm->rm_col[cc].rc_size;
1426 		}
1427 
1428 		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1429 
1430 		for (x = 0; x < ccount; x++, src++) {
1431 			if (*src != 0)
1432 				log = vdev_raidz_log2[*src];
1433 
1434 			for (cc = 0; cc < nmissing; cc++) {
1435 				if (x >= dcount[cc])
1436 					continue;
1437 
1438 				if (*src == 0) {
1439 					val = 0;
1440 				} else {
1441 					if ((ll = log + invlog[cc][i]) >= 255)
1442 						ll -= 255;
1443 					val = vdev_raidz_pow2[ll];
1444 				}
1445 
1446 				if (i == 0)
1447 					dst[cc][x] = val;
1448 				else
1449 					dst[cc][x] ^= val;
1450 			}
1451 		}
1452 	}
1453 
1454 	kmem_free(p, psize);
1455 }
1456 
1457 static int
1458 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1459 {
1460 	int n, i, c, t, tt;
1461 	int nmissing_rows;
1462 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1463 	int parity_map[VDEV_RAIDZ_MAXPARITY];
1464 
1465 	uint8_t *p, *pp;
1466 	size_t psize;
1467 
1468 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1469 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1470 	uint8_t *used;
1471 
1472 	abd_t **bufs = NULL;
1473 
1474 	int code = 0;
1475 
1476 	/*
1477 	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
1478 	 * temporary linear ABDs.
1479 	 */
1480 	if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
1481 		bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);
1482 
1483 		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1484 			raidz_col_t *col = &rm->rm_col[c];
1485 
1486 			bufs[c] = col->rc_abd;
1487 			col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
1488 			ASSERT3S(col->rc_abd->abd_size, >=, col->rc_size);
1489 			ASSERT3S(bufs[c]->abd_size, >=, col->rc_size);
1490 			abd_copy_off(col->rc_abd, bufs[c], 0, 0, col->rc_size);
1491 		}
1492 	}
1493 
1494 	n = rm->rm_cols - rm->rm_firstdatacol;
1495 
1496 	/*
1497 	 * Figure out which data columns are missing.
1498 	 */
1499 	nmissing_rows = 0;
1500 	for (t = 0; t < ntgts; t++) {
1501 		if (tgts[t] >= rm->rm_firstdatacol) {
1502 			missing_rows[nmissing_rows++] =
1503 			    tgts[t] - rm->rm_firstdatacol;
1504 		}
1505 	}
1506 
1507 	/*
1508 	 * Figure out which parity columns to use to help generate the missing
1509 	 * data columns.
1510 	 */
1511 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1512 		ASSERT(tt < ntgts);
1513 		ASSERT(c < rm->rm_firstdatacol);
1514 
1515 		/*
1516 		 * Skip any targeted parity columns.
1517 		 */
1518 		if (c == tgts[tt]) {
1519 			tt++;
1520 			continue;
1521 		}
1522 
1523 		code |= 1 << c;
1524 
1525 		parity_map[i] = c;
1526 		i++;
1527 	}
1528 
1529 	ASSERT(code != 0);
1530 	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1531 
1532 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1533 	    nmissing_rows * n + sizeof (used[0]) * n;
1534 	p = kmem_alloc(psize, KM_SLEEP);
1535 
1536 	for (pp = p, i = 0; i < nmissing_rows; i++) {
1537 		rows[i] = pp;
1538 		pp += n;
1539 		invrows[i] = pp;
1540 		pp += n;
1541 	}
1542 	used = pp;
1543 
1544 	for (i = 0; i < nmissing_rows; i++) {
1545 		used[i] = parity_map[i];
1546 	}
1547 
1548 	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1549 		if (tt < nmissing_rows &&
1550 		    c == missing_rows[tt] + rm->rm_firstdatacol) {
1551 			tt++;
1552 			continue;
1553 		}
1554 
1555 		ASSERT3S(i, <, n);
1556 		used[i] = c;
1557 		i++;
1558 	}
1559 
1560 	/*
1561 	 * Initialize the interesting rows of the matrix.
1562 	 */
1563 	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1564 
1565 	/*
1566 	 * Invert the matrix.
1567 	 */
1568 	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1569 	    invrows, used);
1570 
1571 	/*
1572 	 * Reconstruct the missing data using the generated matrix.
1573 	 */
1574 	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1575 	    invrows, used);
1576 
1577 	kmem_free(p, psize);
1578 
1579 	/*
1580 	 * copy back from temporary linear abds and free them
1581 	 */
1582 	if (bufs) {
1583 		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1584 			raidz_col_t *col = &rm->rm_col[c];
1585 
1586 			ASSERT3S(bufs[c]->abd_size, >=, col->rc_size);
1587 			ASSERT3S(col->rc_abd->abd_size, >=, col->rc_size);
1588 			abd_copy_off(bufs[c], col->rc_abd, 0, 0, col->rc_size);
1589 			abd_free(col->rc_abd);
1590 			col->rc_abd = bufs[c];
1591 		}
1592 		kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
1593 	}
1594 
1595 	return (code);
1596 }
1597 
1598 static int
1599 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1600 {
1601 	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1602 	int ntgts;
1603 	int i, c;
1604 	int code;
1605 	int nbadparity, nbaddata;
1606 	int parity_valid[VDEV_RAIDZ_MAXPARITY];
1607 
1608 	/*
1609 	 * The tgts list must already be sorted.
1610 	 */
1611 	for (i = 1; i < nt; i++) {
1612 		ASSERT(t[i] > t[i - 1]);
1613 	}
1614 
1615 	nbadparity = rm->rm_firstdatacol;
1616 	nbaddata = rm->rm_cols - nbadparity;
1617 	ntgts = 0;
1618 	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1619 		if (c < rm->rm_firstdatacol)
1620 			parity_valid[c] = B_FALSE;
1621 
1622 		if (i < nt && c == t[i]) {
1623 			tgts[ntgts++] = c;
1624 			i++;
1625 		} else if (rm->rm_col[c].rc_error != 0) {
1626 			tgts[ntgts++] = c;
1627 		} else if (c >= rm->rm_firstdatacol) {
1628 			nbaddata--;
1629 		} else {
1630 			parity_valid[c] = B_TRUE;
1631 			nbadparity--;
1632 		}
1633 	}
1634 
1635 	ASSERT(ntgts >= nt);
1636 	ASSERT(nbaddata >= 0);
1637 	ASSERT(nbaddata + nbadparity == ntgts);
1638 
1639 	dt = &tgts[nbadparity];
1640 
1641 	/*
1642 	 * See if we can use any of our optimized reconstruction routines.
1643 	 */
1644 	if (!vdev_raidz_default_to_general) {
1645 		switch (nbaddata) {
1646 		case 1:
1647 			if (parity_valid[VDEV_RAIDZ_P])
1648 				return (vdev_raidz_reconstruct_p(rm, dt, 1));
1649 
1650 			ASSERT(rm->rm_firstdatacol > 1);
1651 
1652 			if (parity_valid[VDEV_RAIDZ_Q])
1653 				return (vdev_raidz_reconstruct_q(rm, dt, 1));
1654 
1655 			ASSERT(rm->rm_firstdatacol > 2);
1656 			break;
1657 
1658 		case 2:
1659 			ASSERT(rm->rm_firstdatacol > 1);
1660 
1661 			if (parity_valid[VDEV_RAIDZ_P] &&
1662 			    parity_valid[VDEV_RAIDZ_Q])
1663 				return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1664 
1665 			ASSERT(rm->rm_firstdatacol > 2);
1666 
1667 			break;
1668 		}
1669 	}
1670 
1671 	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1672 	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1673 	ASSERT(code > 0);
1674 	return (code);
1675 }
1676 
1677 static int
1678 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1679     uint64_t *ashift)
1680 {
1681 	vdev_t *cvd;
1682 	uint64_t nparity = vd->vdev_nparity;
1683 	int c;
1684 	int lasterror = 0;
1685 	int numerrors = 0;
1686 
1687 	ASSERT(nparity > 0);
1688 
1689 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
1690 	    vd->vdev_children < nparity + 1) {
1691 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1692 		return (SET_ERROR(EINVAL));
1693 	}
1694 
1695 	vdev_open_children(vd);
1696 
1697 	for (c = 0; c < vd->vdev_children; c++) {
1698 		cvd = vd->vdev_child[c];
1699 
1700 		if (cvd->vdev_open_error != 0) {
1701 			lasterror = cvd->vdev_open_error;
1702 			numerrors++;
1703 			continue;
1704 		}
1705 
1706 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1707 		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1708 		*ashift = MAX(*ashift, cvd->vdev_ashift);
1709 	}
1710 
1711 	*asize *= vd->vdev_children;
1712 	*max_asize *= vd->vdev_children;
1713 
1714 	if (numerrors > nparity) {
1715 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1716 		return (lasterror);
1717 	}
1718 
1719 	return (0);
1720 }
1721 
1722 static void
1723 vdev_raidz_close(vdev_t *vd)
1724 {
1725 	int c;
1726 
1727 	for (c = 0; c < vd->vdev_children; c++)
1728 		vdev_close(vd->vdev_child[c]);
1729 }
1730 
1731 /*
1732  * Handle a read or write I/O to a RAID-Z dump device.
1733  *
1734  * The dump device is in a unique situation compared to other ZFS datasets:
1735  * writing to this device should be as simple and fast as possible.  In
1736  * addition, durability matters much less since the dump will be extracted
1737  * once the machine reboots.  For that reason, this function eschews parity for
1738  * performance and simplicity.  The dump device uses the checksum setting
1739  * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1740  * dataset.
1741  *
1742  * Blocks of size 128 KB have been preallocated for this volume.  I/Os less than
1743  * 128 KB will not fill an entire block; in addition, they may not be properly
1744  * aligned.  In that case, this function uses the preallocated 128 KB block and
1745  * omits reading or writing any "empty" portions of that block, as opposed to
1746  * allocating a fresh appropriately-sized block.
1747  *
1748  * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1749  *
1750  *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1751  *
1752  * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1753  * allocated which spans all five child vdevs.  8 KB of data would be written to
1754  * each of four vdevs, with the fifth containing the parity bits.
1755  *
1756  *       parity    data     data     data     data
1757  *     |   PP   |   XX   |   XX   |   XX   |   XX   |
1758  *         ^        ^        ^        ^        ^
1759  *         |        |        |        |        |
1760  *   8 KB parity    ------8 KB data blocks------
1761  *
1762  * However, when writing to the dump device, the behavior is different:
1763  *
1764  *     vdev_raidz_dumpio(data, size: 32 KB, offset: 64 KB)
1765  *
1766  * Unlike the normal RAID-Z case in which the block is allocated based on the
1767  * I/O size, reads and writes here always use a 128 KB logical I/O size.  If the
1768  * I/O size is less than 128 KB, only the actual portions of data are written.
1769  * In this example the data is written to the third data vdev since that vdev
1770  * contains the offset [64 KB, 96 KB).
1771  *
1772  *       parity    data     data     data     data
1773  *     |        |        |        |   XX   |        |
1774  *                                    ^
1775  *                                    |
1776  *                             32 KB data block
1777  *
1778  * As a result, an individual I/O may not span all child vdevs; moreover, a
1779  * small I/O may only operate on a single child vdev.
1780  *
1781  * Note that since there are no parity bits calculated or written, this format
1782  * remains the same no matter how many parity bits are used in a normal RAID-Z
1783  * stripe.  On a RAID-Z3 configuration with seven child vdevs, the example above
1784  * would look like:
1785  *
1786  *       parity   parity   parity    data     data     data     data
1787  *     |        |        |        |        |        |   XX   |        |
1788  *                                                      ^
1789  *                                                      |
1790  *                                               32 KB data block
1791  */
1792 static int
1793 vdev_raidz_dumpio(vdev_t *vd, caddr_t data, size_t size,
1794     uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
1795 {
1796 	vdev_t *tvd = vd->vdev_top;
1797 	vdev_t *cvd;
1798 	raidz_map_t *rm;
1799 	raidz_col_t *rc;
1800 	int c, err = 0;
1801 
1802 	uint64_t start, end, colstart, colend;
1803 	uint64_t coloffset, colsize, colskip;
1804 
1805 #ifdef	_KERNEL
1806 
1807 	/*
1808 	 * Don't write past the end of the block
1809 	 */
1810 	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
1811 
1812 	start = offset;
1813 	end = start + size;
1814 
1815 	/*
1816 	 * Allocate a RAID-Z map for this block.  Note that this block starts
1817 	 * from the "original" offset, this is, the offset of the extent which
1818 	 * contains the requisite offset of the data being read or written.
1819 	 *
1820 	 * Even if this I/O operation doesn't span the full block size, let's
1821 	 * treat the on-disk format as if the only blocks are the complete 128
1822 	 * KB size.
1823 	 */
1824 	abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
1825 	    SPA_OLD_MAXBLOCKSIZE);
1826 	rm = vdev_raidz_map_alloc(abd,
1827 	    SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
1828 	    vd->vdev_children, vd->vdev_nparity);
1829 
1830 	coloffset = origoffset;
1831 
1832 	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
1833 	    c++, coloffset += rc->rc_size) {
1834 		rc = &rm->rm_col[c];
1835 		cvd = vd->vdev_child[rc->rc_devidx];
1836 
1837 		if (cvd->vdev_ops->vdev_op_dumpio == NULL) {
1838 			err = EINVAL;
1839 			break;
1840 		}
1841 
1842 		/*
1843 		 * Find the start and end of this column in the RAID-Z map,
1844 		 * keeping in mind that the stated size and offset of the
1845 		 * operation may not fill the entire column for this vdev.
1846 		 *
1847 		 * If any portion of the data spans this column, issue the
1848 		 * appropriate operation to the vdev.
1849 		 */
1850 		if (coloffset + rc->rc_size <= start)
1851 			continue;
1852 		if (coloffset >= end)
1853 			continue;
1854 
1855 		colstart = MAX(coloffset, start);
1856 		colend = MIN(end, coloffset + rc->rc_size);
1857 		colsize = colend - colstart;
1858 		colskip = colstart - coloffset;
1859 
1860 		VERIFY3U(colsize, <=, rc->rc_size);
1861 		VERIFY3U(colskip, <=, rc->rc_size);
1862 
1863 		/*
1864 		 * Note that the child vdev will have a vdev label at the start
1865 		 * of its range of offsets, hence the need for
1866 		 * VDEV_LABEL_OFFSET().  See zio_vdev_child_io() for another
1867 		 * example of why this calculation is needed.
1868 		 */
1869 		if ((err = cvd->vdev_ops->vdev_op_dumpio(cvd,
1870 		    ((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize,
1871 		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip, 0,
1872 		    doread, isdump)) != 0)
1873 			break;
1874 	}
1875 
1876 	vdev_raidz_map_free(rm);
1877 	abd_put(abd);
1878 #endif	/* KERNEL */
1879 
1880 	return (err);
1881 }
1882 
1883 static uint64_t
1884 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1885 {
1886 	uint64_t asize;
1887 	uint64_t ashift = vd->vdev_top->vdev_ashift;
1888 	uint64_t cols = vd->vdev_children;
1889 	uint64_t nparity = vd->vdev_nparity;
1890 
1891 	asize = ((psize - 1) >> ashift) + 1;
1892 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1893 	asize = roundup(asize, nparity + 1) << ashift;
1894 
1895 	return (asize);
1896 }
1897 
1898 static void
1899 vdev_raidz_child_done(zio_t *zio)
1900 {
1901 	raidz_col_t *rc = zio->io_private;
1902 
1903 	rc->rc_error = zio->io_error;
1904 	rc->rc_tried = 1;
1905 	rc->rc_skipped = 0;
1906 }
1907 
1908 static void
1909 vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col)
1910 {
1911 #ifdef ZFS_DEBUG
1912 	vdev_t *vd = zio->io_vd;
1913 	vdev_t *tvd = vd->vdev_top;
1914 
1915 	range_seg64_t logical_rs, physical_rs;
1916 	logical_rs.rs_start = zio->io_offset;
1917 	logical_rs.rs_end = logical_rs.rs_start +
1918 	    vdev_raidz_asize(zio->io_vd, zio->io_size);
1919 
1920 	raidz_col_t *rc = &rm->rm_col[col];
1921 	vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
1922 
1923 	vdev_xlate(cvd, &logical_rs, &physical_rs);
1924 	ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
1925 	ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
1926 	/*
1927 	 * It would be nice to assert that rs_end is equal
1928 	 * to rc_offset + rc_size but there might be an
1929 	 * optional I/O at the end that is not accounted in
1930 	 * rc_size.
1931 	 */
1932 	if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
1933 		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
1934 		    rc->rc_size + (1 << tvd->vdev_ashift));
1935 	} else {
1936 		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
1937 	}
1938 #endif
1939 }
1940 
1941 /*
1942  * Start an IO operation on a RAIDZ VDev
1943  *
1944  * Outline:
1945  * - For write operations:
1946  *   1. Generate the parity data
1947  *   2. Create child zio write operations to each column's vdev, for both
1948  *      data and parity.
1949  *   3. If the column skips any sectors for padding, create optional dummy
1950  *      write zio children for those areas to improve aggregation continuity.
1951  * - For read operations:
1952  *   1. Create child zio read operations to each data column's vdev to read
1953  *      the range of data required for zio.
1954  *   2. If this is a scrub or resilver operation, or if any of the data
1955  *      vdevs have had errors, then create zio read operations to the parity
1956  *      columns' VDevs as well.
1957  */
1958 static void
1959 vdev_raidz_io_start(zio_t *zio)
1960 {
1961 	vdev_t *vd = zio->io_vd;
1962 	vdev_t *tvd = vd->vdev_top;
1963 	vdev_t *cvd;
1964 	raidz_map_t *rm;
1965 	raidz_col_t *rc;
1966 	int c, i;
1967 
1968 	rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
1969 	    tvd->vdev_ashift, vd->vdev_children,
1970 	    vd->vdev_nparity);
1971 
1972 	zio->io_vsd = rm;
1973 	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
1974 
1975 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1976 
1977 	if (zio->io_type == ZIO_TYPE_WRITE) {
1978 		vdev_raidz_generate_parity(rm);
1979 
1980 		for (c = 0; c < rm->rm_cols; c++) {
1981 			rc = &rm->rm_col[c];
1982 			cvd = vd->vdev_child[rc->rc_devidx];
1983 
1984 			/*
1985 			 * Verify physical to logical translation.
1986 			 */
1987 			vdev_raidz_io_verify(zio, rm, c);
1988 
1989 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1990 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
1991 			    zio->io_type, zio->io_priority, 0,
1992 			    vdev_raidz_child_done, rc));
1993 		}
1994 
1995 		/*
1996 		 * Generate optional I/Os for any skipped sectors to improve
1997 		 * aggregation contiguity.
1998 		 */
1999 		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
2000 			ASSERT(c <= rm->rm_scols);
2001 			if (c == rm->rm_scols)
2002 				c = 0;
2003 			rc = &rm->rm_col[c];
2004 			cvd = vd->vdev_child[rc->rc_devidx];
2005 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2006 			    rc->rc_offset + rc->rc_size, NULL,
2007 			    1 << tvd->vdev_ashift,
2008 			    zio->io_type, zio->io_priority,
2009 			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
2010 		}
2011 
2012 		zio_execute(zio);
2013 		return;
2014 	}
2015 
2016 	ASSERT(zio->io_type == ZIO_TYPE_READ);
2017 
2018 	/*
2019 	 * Iterate over the columns in reverse order so that we hit the parity
2020 	 * last -- any errors along the way will force us to read the parity.
2021 	 */
2022 	for (c = rm->rm_cols - 1; c >= 0; c--) {
2023 		rc = &rm->rm_col[c];
2024 		cvd = vd->vdev_child[rc->rc_devidx];
2025 		if (!vdev_readable(cvd)) {
2026 			if (c >= rm->rm_firstdatacol)
2027 				rm->rm_missingdata++;
2028 			else
2029 				rm->rm_missingparity++;
2030 			rc->rc_error = SET_ERROR(ENXIO);
2031 			rc->rc_tried = 1;	/* don't even try */
2032 			rc->rc_skipped = 1;
2033 			continue;
2034 		}
2035 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2036 			if (c >= rm->rm_firstdatacol)
2037 				rm->rm_missingdata++;
2038 			else
2039 				rm->rm_missingparity++;
2040 			rc->rc_error = SET_ERROR(ESTALE);
2041 			rc->rc_skipped = 1;
2042 			continue;
2043 		}
2044 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
2045 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
2046 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2047 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
2048 			    zio->io_type, zio->io_priority, 0,
2049 			    vdev_raidz_child_done, rc));
2050 		}
2051 	}
2052 
2053 	zio_execute(zio);
2054 }
2055 
2056 
2057 /*
2058  * Report a checksum error for a child of a RAID-Z device.
2059  */
2060 static void
2061 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
2062 {
2063 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
2064 
2065 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2066 		zio_bad_cksum_t zbc;
2067 		raidz_map_t *rm = zio->io_vsd;
2068 
2069 		mutex_enter(&vd->vdev_stat_lock);
2070 		vd->vdev_stat.vs_checksum_errors++;
2071 		mutex_exit(&vd->vdev_stat_lock);
2072 
2073 		zbc.zbc_has_cksum = 0;
2074 		zbc.zbc_injected = rm->rm_ecksuminjected;
2075 
2076 		zfs_ereport_post_checksum(zio->io_spa, vd,
2077 		    &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
2078 		    rc->rc_abd, bad_data, &zbc);
2079 	}
2080 }
2081 
2082 /*
2083  * We keep track of whether or not there were any injected errors, so that
2084  * any ereports we generate can note it.
2085  */
2086 static int
2087 raidz_checksum_verify(zio_t *zio)
2088 {
2089 	zio_bad_cksum_t zbc;
2090 	raidz_map_t *rm = zio->io_vsd;
2091 
2092 	int ret = zio_checksum_error(zio, &zbc);
2093 	if (ret != 0 && zbc.zbc_injected != 0)
2094 		rm->rm_ecksuminjected = 1;
2095 
2096 	return (ret);
2097 }
2098 
2099 /*
2100  * Generate the parity from the data columns. If we tried and were able to
2101  * read the parity without error, verify that the generated parity matches the
2102  * data we read. If it doesn't, we fire off a checksum error. Return the
2103  * number such failures.
2104  */
2105 static int
2106 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
2107 {
2108 	abd_t *orig[VDEV_RAIDZ_MAXPARITY];
2109 	int c, ret = 0;
2110 	raidz_col_t *rc;
2111 
2112 	blkptr_t *bp = zio->io_bp;
2113 	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
2114 	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
2115 
2116 	if (checksum == ZIO_CHECKSUM_NOPARITY)
2117 		return (ret);
2118 
2119 	for (c = 0; c < rm->rm_firstdatacol; c++) {
2120 		rc = &rm->rm_col[c];
2121 		if (!rc->rc_tried || rc->rc_error != 0)
2122 			continue;
2123 		orig[c] = abd_alloc_sametype(rc->rc_abd, rc->rc_size);
2124 		abd_copy(orig[c], rc->rc_abd, rc->rc_size);
2125 	}
2126 
2127 	vdev_raidz_generate_parity(rm);
2128 
2129 	for (c = 0; c < rm->rm_firstdatacol; c++) {
2130 		rc = &rm->rm_col[c];
2131 		if (!rc->rc_tried || rc->rc_error != 0)
2132 			continue;
2133 		if (abd_cmp(orig[c], rc->rc_abd, rc->rc_abd->abd_size) != 0) {
2134 			raidz_checksum_error(zio, rc, orig[c]);
2135 			rc->rc_error = SET_ERROR(ECKSUM);
2136 			ret++;
2137 		}
2138 		abd_free(orig[c]);
2139 	}
2140 
2141 	return (ret);
2142 }
2143 
2144 /*
2145  * Keep statistics on all the ways that we used parity to correct data.
2146  */
2147 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
2148 
2149 static int
2150 vdev_raidz_worst_error(raidz_map_t *rm)
2151 {
2152 	int error = 0;
2153 
2154 	for (int c = 0; c < rm->rm_cols; c++)
2155 		error = zio_worst_error(error, rm->rm_col[c].rc_error);
2156 
2157 	return (error);
2158 }
2159 
2160 /*
2161  * Iterate over all combinations of bad data and attempt a reconstruction.
2162  * Note that the algorithm below is non-optimal because it doesn't take into
2163  * account how reconstruction is actually performed. For example, with
2164  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
2165  * is targeted as invalid as if columns 1 and 4 are targeted since in both
2166  * cases we'd only use parity information in column 0.
2167  */
2168 static int
2169 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
2170 {
2171 	raidz_map_t *rm = zio->io_vsd;
2172 	raidz_col_t *rc;
2173 	abd_t *orig[VDEV_RAIDZ_MAXPARITY];
2174 	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
2175 	int *tgts = &tstore[1];
2176 	int current, next, i, c, n;
2177 	int code, ret = 0;
2178 
2179 	ASSERT(total_errors < rm->rm_firstdatacol);
2180 
2181 	/*
2182 	 * This simplifies one edge condition.
2183 	 */
2184 	tgts[-1] = -1;
2185 
2186 	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
2187 		/*
2188 		 * Initialize the targets array by finding the first n columns
2189 		 * that contain no error.
2190 		 *
2191 		 * If there were no data errors, we need to ensure that we're
2192 		 * always explicitly attempting to reconstruct at least one
2193 		 * data column. To do this, we simply push the highest target
2194 		 * up into the data columns.
2195 		 */
2196 		for (c = 0, i = 0; i < n; i++) {
2197 			if (i == n - 1 && data_errors == 0 &&
2198 			    c < rm->rm_firstdatacol) {
2199 				c = rm->rm_firstdatacol;
2200 			}
2201 
2202 			while (rm->rm_col[c].rc_error != 0) {
2203 				c++;
2204 				ASSERT3S(c, <, rm->rm_cols);
2205 			}
2206 
2207 			tgts[i] = c++;
2208 		}
2209 
2210 		/*
2211 		 * Setting tgts[n] simplifies the other edge condition.
2212 		 */
2213 		tgts[n] = rm->rm_cols;
2214 
2215 		/*
2216 		 * These buffers were allocated in previous iterations.
2217 		 */
2218 		for (i = 0; i < n - 1; i++) {
2219 			ASSERT(orig[i] != NULL);
2220 		}
2221 
2222 		orig[n - 1] = abd_alloc_sametype(rm->rm_col[0].rc_abd,
2223 		    rm->rm_col[0].rc_size);
2224 
2225 		current = 0;
2226 		next = tgts[current];
2227 
2228 		while (current != n) {
2229 			tgts[current] = next;
2230 			current = 0;
2231 
2232 			/*
2233 			 * Save off the original data that we're going to
2234 			 * attempt to reconstruct.
2235 			 */
2236 			for (i = 0; i < n; i++) {
2237 				ASSERT(orig[i] != NULL);
2238 				c = tgts[i];
2239 				ASSERT3S(c, >=, 0);
2240 				ASSERT3S(c, <, rm->rm_cols);
2241 				rc = &rm->rm_col[c];
2242 				ASSERT3S(orig[i]->abd_size, >=, rc->rc_size);
2243 				ASSERT3S(rc->rc_abd->abd_size, >=, rc->rc_size);
2244 				abd_copy_off(orig[i], rc->rc_abd, 0, 0,
2245 				    rc->rc_size);
2246 			}
2247 
2248 			/*
2249 			 * Attempt a reconstruction and exit the outer loop on
2250 			 * success.
2251 			 */
2252 			code = vdev_raidz_reconstruct(rm, tgts, n);
2253 			if (raidz_checksum_verify(zio) == 0) {
2254 				atomic_inc_64(&raidz_corrected[code]);
2255 
2256 				for (i = 0; i < n; i++) {
2257 					c = tgts[i];
2258 					rc = &rm->rm_col[c];
2259 					ASSERT(rc->rc_error == 0);
2260 					if (rc->rc_tried)
2261 						raidz_checksum_error(zio, rc,
2262 						    orig[i]);
2263 					rc->rc_error = SET_ERROR(ECKSUM);
2264 				}
2265 
2266 				ret = code;
2267 				goto done;
2268 			}
2269 
2270 			/*
2271 			 * Restore the original data.
2272 			 */
2273 			for (i = 0; i < n; i++) {
2274 				c = tgts[i];
2275 				rc = &rm->rm_col[c];
2276 				ASSERT3S(rc->rc_abd->abd_size, >=, rc->rc_size);
2277 				ASSERT3S(orig[i]->abd_size, >=, rc->rc_size);
2278 				abd_copy_off(rc->rc_abd, orig[i], 0, 0,
2279 				    rc->rc_size);
2280 			}
2281 
2282 			do {
2283 				/*
2284 				 * Find the next valid column after the current
2285 				 * position..
2286 				 */
2287 				for (next = tgts[current] + 1;
2288 				    next < rm->rm_cols &&
2289 				    rm->rm_col[next].rc_error != 0; next++)
2290 					continue;
2291 
2292 				ASSERT(next <= tgts[current + 1]);
2293 
2294 				/*
2295 				 * If that spot is available, we're done here.
2296 				 */
2297 				if (next != tgts[current + 1])
2298 					break;
2299 
2300 				/*
2301 				 * Otherwise, find the next valid column after
2302 				 * the previous position.
2303 				 */
2304 				for (c = tgts[current - 1] + 1;
2305 				    rm->rm_col[c].rc_error != 0; c++)
2306 					continue;
2307 
2308 				tgts[current] = c;
2309 				current++;
2310 
2311 			} while (current != n);
2312 		}
2313 	}
2314 	n--;
2315 done:
2316 	for (i = 0; i < n; i++)
2317 		abd_free(orig[i]);
2318 
2319 	return (ret);
2320 }
2321 
2322 /*
2323  * Complete an IO operation on a RAIDZ VDev
2324  *
2325  * Outline:
2326  * - For write operations:
2327  *   1. Check for errors on the child IOs.
2328  *   2. Return, setting an error code if too few child VDevs were written
2329  *      to reconstruct the data later.  Note that partial writes are
2330  *      considered successful if they can be reconstructed at all.
2331  * - For read operations:
2332  *   1. Check for errors on the child IOs.
2333  *   2. If data errors occurred:
2334  *      a. Try to reassemble the data from the parity available.
2335  *      b. If we haven't yet read the parity drives, read them now.
2336  *      c. If all parity drives have been read but the data still doesn't
2337  *         reassemble with a correct checksum, then try combinatorial
2338  *         reconstruction.
2339  *      d. If that doesn't work, return an error.
2340  *   3. If there were unexpected errors or this is a resilver operation,
2341  *      rewrite the vdevs that had errors.
2342  */
2343 static void
2344 vdev_raidz_io_done(zio_t *zio)
2345 {
2346 	vdev_t *vd = zio->io_vd;
2347 	vdev_t *cvd;
2348 	raidz_map_t *rm = zio->io_vsd;
2349 	raidz_col_t *rc;
2350 	int unexpected_errors = 0;
2351 	int parity_errors = 0;
2352 	int parity_untried = 0;
2353 	int data_errors = 0;
2354 	int total_errors = 0;
2355 	int n, c;
2356 	int tgts[VDEV_RAIDZ_MAXPARITY];
2357 	int code;
2358 
2359 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
2360 
2361 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
2362 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
2363 
2364 	for (c = 0; c < rm->rm_cols; c++) {
2365 		rc = &rm->rm_col[c];
2366 
2367 		if (rc->rc_error) {
2368 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
2369 
2370 			if (c < rm->rm_firstdatacol)
2371 				parity_errors++;
2372 			else
2373 				data_errors++;
2374 
2375 			if (!rc->rc_skipped)
2376 				unexpected_errors++;
2377 
2378 			total_errors++;
2379 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
2380 			parity_untried++;
2381 		}
2382 	}
2383 
2384 	if (zio->io_type == ZIO_TYPE_WRITE) {
2385 		/*
2386 		 * XXX -- for now, treat partial writes as a success.
2387 		 * (If we couldn't write enough columns to reconstruct
2388 		 * the data, the I/O failed.  Otherwise, good enough.)
2389 		 *
2390 		 * Now that we support write reallocation, it would be better
2391 		 * to treat partial failure as real failure unless there are
2392 		 * no non-degraded top-level vdevs left, and not update DTLs
2393 		 * if we intend to reallocate.
2394 		 */
2395 		/* XXPOLICY */
2396 		if (total_errors > rm->rm_firstdatacol)
2397 			zio->io_error = vdev_raidz_worst_error(rm);
2398 
2399 		return;
2400 	}
2401 
2402 	ASSERT(zio->io_type == ZIO_TYPE_READ);
2403 	/*
2404 	 * There are three potential phases for a read:
2405 	 *	1. produce valid data from the columns read
2406 	 *	2. read all disks and try again
2407 	 *	3. perform combinatorial reconstruction
2408 	 *
2409 	 * Each phase is progressively both more expensive and less likely to
2410 	 * occur. If we encounter more errors than we can repair or all phases
2411 	 * fail, we have no choice but to return an error.
2412 	 */
2413 
2414 	/*
2415 	 * If the number of errors we saw was correctable -- less than or equal
2416 	 * to the number of parity disks read -- attempt to produce data that
2417 	 * has a valid checksum. Naturally, this case applies in the absence of
2418 	 * any errors.
2419 	 */
2420 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
2421 		if (data_errors == 0) {
2422 			if (raidz_checksum_verify(zio) == 0) {
2423 				/*
2424 				 * If we read parity information (unnecessarily
2425 				 * as it happens since no reconstruction was
2426 				 * needed) regenerate and verify the parity.
2427 				 * We also regenerate parity when resilvering
2428 				 * so we can write it out to the failed device
2429 				 * later.
2430 				 */
2431 				if (parity_errors + parity_untried <
2432 				    rm->rm_firstdatacol ||
2433 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2434 					n = raidz_parity_verify(zio, rm);
2435 					unexpected_errors += n;
2436 					ASSERT(parity_errors + n <=
2437 					    rm->rm_firstdatacol);
2438 				}
2439 				goto done;
2440 			}
2441 		} else {
2442 			/*
2443 			 * We either attempt to read all the parity columns or
2444 			 * none of them. If we didn't try to read parity, we
2445 			 * wouldn't be here in the correctable case. There must
2446 			 * also have been fewer parity errors than parity
2447 			 * columns or, again, we wouldn't be in this code path.
2448 			 */
2449 			ASSERT(parity_untried == 0);
2450 			ASSERT(parity_errors < rm->rm_firstdatacol);
2451 
2452 			/*
2453 			 * Identify the data columns that reported an error.
2454 			 */
2455 			n = 0;
2456 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
2457 				rc = &rm->rm_col[c];
2458 				if (rc->rc_error != 0) {
2459 					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2460 					tgts[n++] = c;
2461 				}
2462 			}
2463 
2464 			ASSERT(rm->rm_firstdatacol >= n);
2465 
2466 			code = vdev_raidz_reconstruct(rm, tgts, n);
2467 
2468 			if (raidz_checksum_verify(zio) == 0) {
2469 				atomic_inc_64(&raidz_corrected[code]);
2470 
2471 				/*
2472 				 * If we read more parity disks than were used
2473 				 * for reconstruction, confirm that the other
2474 				 * parity disks produced correct data. This
2475 				 * routine is suboptimal in that it regenerates
2476 				 * the parity that we already used in addition
2477 				 * to the parity that we're attempting to
2478 				 * verify, but this should be a relatively
2479 				 * uncommon case, and can be optimized if it
2480 				 * becomes a problem. Note that we regenerate
2481 				 * parity when resilvering so we can write it
2482 				 * out to failed devices later.
2483 				 */
2484 				if (parity_errors < rm->rm_firstdatacol - n ||
2485 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2486 					n = raidz_parity_verify(zio, rm);
2487 					unexpected_errors += n;
2488 					ASSERT(parity_errors + n <=
2489 					    rm->rm_firstdatacol);
2490 				}
2491 
2492 				goto done;
2493 			}
2494 		}
2495 	}
2496 
2497 	/*
2498 	 * This isn't a typical situation -- either we got a read error or
2499 	 * a child silently returned bad data. Read every block so we can
2500 	 * try again with as much data and parity as we can track down. If
2501 	 * we've already been through once before, all children will be marked
2502 	 * as tried so we'll proceed to combinatorial reconstruction.
2503 	 */
2504 	unexpected_errors = 1;
2505 	rm->rm_missingdata = 0;
2506 	rm->rm_missingparity = 0;
2507 
2508 	for (c = 0; c < rm->rm_cols; c++) {
2509 		if (rm->rm_col[c].rc_tried)
2510 			continue;
2511 
2512 		zio_vdev_io_redone(zio);
2513 		do {
2514 			rc = &rm->rm_col[c];
2515 			if (rc->rc_tried)
2516 				continue;
2517 			zio_nowait(zio_vdev_child_io(zio, NULL,
2518 			    vd->vdev_child[rc->rc_devidx],
2519 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
2520 			    zio->io_type, zio->io_priority, 0,
2521 			    vdev_raidz_child_done, rc));
2522 		} while (++c < rm->rm_cols);
2523 
2524 		return;
2525 	}
2526 
2527 	/*
2528 	 * At this point we've attempted to reconstruct the data given the
2529 	 * errors we detected, and we've attempted to read all columns. There
2530 	 * must, therefore, be one or more additional problems -- silent errors
2531 	 * resulting in invalid data rather than explicit I/O errors resulting
2532 	 * in absent data. We check if there is enough additional data to
2533 	 * possibly reconstruct the data and then perform combinatorial
2534 	 * reconstruction over all possible combinations. If that fails,
2535 	 * we're cooked.
2536 	 */
2537 	if (total_errors > rm->rm_firstdatacol) {
2538 		zio->io_error = vdev_raidz_worst_error(rm);
2539 
2540 	} else if (total_errors < rm->rm_firstdatacol &&
2541 	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2542 		/*
2543 		 * If we didn't use all the available parity for the
2544 		 * combinatorial reconstruction, verify that the remaining
2545 		 * parity is correct.
2546 		 */
2547 		if (code != (1 << rm->rm_firstdatacol) - 1)
2548 			(void) raidz_parity_verify(zio, rm);
2549 	} else {
2550 		/*
2551 		 * We're here because either:
2552 		 *
2553 		 *	total_errors == rm_first_datacol, or
2554 		 *	vdev_raidz_combrec() failed
2555 		 *
2556 		 * In either case, there is enough bad data to prevent
2557 		 * reconstruction.
2558 		 *
2559 		 * Start checksum ereports for all children which haven't
2560 		 * failed, and the IO wasn't speculative.
2561 		 */
2562 		zio->io_error = SET_ERROR(ECKSUM);
2563 
2564 		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2565 			for (c = 0; c < rm->rm_cols; c++) {
2566 				rc = &rm->rm_col[c];
2567 				if (rc->rc_error == 0) {
2568 					zio_bad_cksum_t zbc;
2569 					zbc.zbc_has_cksum = 0;
2570 					zbc.zbc_injected =
2571 					    rm->rm_ecksuminjected;
2572 
2573 					zfs_ereport_start_checksum(
2574 					    zio->io_spa,
2575 					    vd->vdev_child[rc->rc_devidx],
2576 					    &zio->io_bookmark, zio,
2577 					    rc->rc_offset, rc->rc_size,
2578 					    (void *)(uintptr_t)c, &zbc);
2579 				}
2580 			}
2581 		}
2582 	}
2583 
2584 done:
2585 	zio_checksum_verified(zio);
2586 
2587 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2588 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2589 		/*
2590 		 * Use the good data we have in hand to repair damaged children.
2591 		 */
2592 		for (c = 0; c < rm->rm_cols; c++) {
2593 			rc = &rm->rm_col[c];
2594 			cvd = vd->vdev_child[rc->rc_devidx];
2595 
2596 			if (rc->rc_error == 0)
2597 				continue;
2598 
2599 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2600 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
2601 			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2602 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2603 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2604 		}
2605 	}
2606 }
2607 
2608 static void
2609 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2610 {
2611 	if (faulted > vd->vdev_nparity)
2612 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2613 		    VDEV_AUX_NO_REPLICAS);
2614 	else if (degraded + faulted != 0)
2615 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2616 	else
2617 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2618 }
2619 
2620 /*
2621  * Determine if any portion of the provided block resides on a child vdev
2622  * with a dirty DTL and therefore needs to be resilvered.  The function
2623  * assumes that at least one DTL is dirty which imples that full stripe
2624  * width blocks must be resilvered.
2625  */
2626 static boolean_t
2627 vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
2628 {
2629 	uint64_t dcols = vd->vdev_children;
2630 	uint64_t nparity = vd->vdev_nparity;
2631 	uint64_t ashift = vd->vdev_top->vdev_ashift;
2632 	/* The starting RAIDZ (parent) vdev sector of the block. */
2633 	uint64_t b = offset >> ashift;
2634 	/* The zio's size in units of the vdev's minimum sector size. */
2635 	uint64_t s = ((psize - 1) >> ashift) + 1;
2636 	/* The first column for this stripe. */
2637 	uint64_t f = b % dcols;
2638 
2639 	if (s + nparity >= dcols)
2640 		return (B_TRUE);
2641 
2642 	for (uint64_t c = 0; c < s + nparity; c++) {
2643 		uint64_t devidx = (f + c) % dcols;
2644 		vdev_t *cvd = vd->vdev_child[devidx];
2645 
2646 		/*
2647 		 * dsl_scan_need_resilver() already checked vd with
2648 		 * vdev_dtl_contains(). So here just check cvd with
2649 		 * vdev_dtl_empty(), cheaper and a good approximation.
2650 		 */
2651 		if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
2652 			return (B_TRUE);
2653 	}
2654 
2655 	return (B_FALSE);
2656 }
2657 
2658 static void
2659 vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *in, range_seg64_t *res)
2660 {
2661 	vdev_t *raidvd = cvd->vdev_parent;
2662 	ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
2663 
2664 	uint64_t width = raidvd->vdev_children;
2665 	uint64_t tgt_col = cvd->vdev_id;
2666 	uint64_t ashift = raidvd->vdev_top->vdev_ashift;
2667 
2668 	/* make sure the offsets are block-aligned */
2669 	ASSERT0(in->rs_start % (1 << ashift));
2670 	ASSERT0(in->rs_end % (1 << ashift));
2671 	uint64_t b_start = in->rs_start >> ashift;
2672 	uint64_t b_end = in->rs_end >> ashift;
2673 
2674 	uint64_t start_row = 0;
2675 	if (b_start > tgt_col) /* avoid underflow */
2676 		start_row = ((b_start - tgt_col - 1) / width) + 1;
2677 
2678 	uint64_t end_row = 0;
2679 	if (b_end > tgt_col)
2680 		end_row = ((b_end - tgt_col - 1) / width) + 1;
2681 
2682 	res->rs_start = start_row << ashift;
2683 	res->rs_end = end_row << ashift;
2684 
2685 	ASSERT3U(res->rs_start, <=, in->rs_start);
2686 	ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start);
2687 }
2688 
2689 vdev_ops_t vdev_raidz_ops = {
2690 	.vdev_op_open = vdev_raidz_open,
2691 	.vdev_op_close = vdev_raidz_close,
2692 	.vdev_op_asize = vdev_raidz_asize,
2693 	.vdev_op_io_start = vdev_raidz_io_start,
2694 	.vdev_op_io_done = vdev_raidz_io_done,
2695 	.vdev_op_state_change = vdev_raidz_state_change,
2696 	.vdev_op_need_resilver = vdev_raidz_need_resilver,
2697 	.vdev_op_hold = NULL,
2698 	.vdev_op_rele = NULL,
2699 	.vdev_op_remap = NULL,
2700 	.vdev_op_xlate = vdev_raidz_xlate,
2701 	.vdev_op_dumpio = vdev_raidz_dumpio,
2702 	.vdev_op_type = VDEV_TYPE_RAIDZ,	/* name of this vdev type */
2703 	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
2704 };
2705