1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
25  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
26  * Copyright (c) 2014 Integros [integros.com]
27  */
28 
29 #include <sys/zfs_context.h>
30 #include <sys/spa.h>
31 #include <sys/vdev_impl.h>
32 #include <sys/vdev_disk.h>
33 #include <sys/vdev_file.h>
34 #include <sys/vdev_raidz.h>
35 #include <sys/zio.h>
36 #include <sys/zio_checksum.h>
37 #include <sys/abd.h>
38 #include <sys/fs/zfs.h>
39 #include <sys/fm/fs/zfs.h>
40 
41 #ifdef ZFS_DEBUG
42 #include <sys/vdev_initialize.h>	/* vdev_xlate testing */
43 #endif
44 
45 /*
46  * Virtual device vector for RAID-Z.
47  *
48  * This vdev supports single, double, and triple parity. For single parity,
49  * we use a simple XOR of all the data columns. For double or triple parity,
50  * we use a special case of Reed-Solomon coding. This extends the
51  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
52  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
53  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
54  * former is also based. The latter is designed to provide higher performance
55  * for writes.
56  *
57  * Note that the Plank paper claimed to support arbitrary N+M, but was then
58  * amended six years later identifying a critical flaw that invalidates its
59  * claims. Nevertheless, the technique can be adapted to work for up to
60  * triple parity. For additional parity, the amendment "Note: Correction to
61  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
62  * is viable, but the additional complexity means that write performance will
63  * suffer.
64  *
65  * All of the methods above operate on the Galois field GF(2^N). In our case
66  * we choose N=8, i.e. GF(2^8), so that each element can be expressed with a
67  * single byte. Briefly, the operations on the
68  * field are defined as follows:
69  *
70  *   o addition (+) is represented by a bitwise XOR
71  *   o subtraction (-) is therefore identical to addition: A + B = A - B
72  *   o multiplication of A by 2 is defined by the following bitwise expression:
73  *
74  *	(A * 2)_7 = A_6
75  *	(A * 2)_6 = A_5
76  *	(A * 2)_5 = A_4
77  *	(A * 2)_4 = A_3 + A_7
78  *	(A * 2)_3 = A_2 + A_7
79  *	(A * 2)_2 = A_1 + A_7
80  *	(A * 2)_1 = A_0
81  *	(A * 2)_0 = A_7
82  *
83  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
84  * As an aside, this multiplication is derived from the error correcting
85  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
86  *
87  * Observe that any number in the field (except for 0) can be expressed as a
88  * power of 2 -- a generator for the field. We store a table of the powers of
89  * 2 and logs base 2 for quick lookups, and exploit the fact that A * B can
90  * be rewritten as 2^((log_2(A) + log_2(B)) mod 255) (where '+' is normal
91  * addition rather than field addition). Since A^255 = 1 for any non-zero A,
92  * the inverse of a field element A (A^-1) is therefore A^(255 - 1) = A^254.
93  *
94  * The up-to-three parity columns, P, Q, R, over several data columns,
95  * D_0, ... D_n-1, can be expressed by field operations:
96  *
97  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
98  *	Q = 2^(n-1) * D_0 + 2^(n-2) * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
99  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
100  *	R = 4^(n-1) * D_0 + 4^(n-2) * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
101  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
102  *
103  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
104  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
105  * independent coefficients. (There are no additional coefficients that have
106  * this property, which is why the uncorrected Plank method breaks down.)
107  *
108  * See the reconstruction code below for how P, Q and R can be used individually
109  * or in concert to recover missing data columns.
110  */
111 
112 typedef struct raidz_col {
113 	uint64_t rc_devidx;		/* child device index for I/O */
114 	uint64_t rc_offset;		/* device offset */
115 	uint64_t rc_size;		/* I/O size */
116 	abd_t *rc_abd;			/* I/O data */
117 	void *rc_gdata;			/* used to store the "good" version */
118 	int rc_error;			/* I/O error for this device */
119 	uint8_t rc_tried;		/* Did we attempt this I/O column? */
120 	uint8_t rc_skipped;		/* Did we skip this I/O column? */
121 } raidz_col_t;
122 
123 typedef struct raidz_map {
124 	uint64_t rm_cols;		/* Regular column count */
125 	uint64_t rm_scols;		/* Count including skipped columns */
126 	uint64_t rm_bigcols;		/* Number of oversized columns */
127 	uint64_t rm_asize;		/* Actual total I/O size */
128 	uint64_t rm_missingdata;	/* Count of missing data devices */
129 	uint64_t rm_missingparity;	/* Count of missing parity devices */
130 	uint64_t rm_firstdatacol;	/* First data column/parity count */
131 	uint64_t rm_nskip;		/* Skipped sectors for padding */
132 	uint64_t rm_skipstart;		/* Column index of padding start */
133 	abd_t *rm_abd_copy;		/* rm_asize-buffer of copied data */
134 	uintptr_t rm_reports;		/* # of referencing checksum reports */
135 	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
136 	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
137 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
138 } raidz_map_t;
139 
140 #define	VDEV_RAIDZ_P		0
141 #define	VDEV_RAIDZ_Q		1
142 #define	VDEV_RAIDZ_R		2
143 
144 #define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
145 #define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
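
/*
 * Illustrative sketch (hypothetical; compiled out by default): computing the
 * P and Q parity bytes for a single byte offset across n data columns, using
 * the Horner-style evaluation of Q described in the block comment above.
 * The driver itself operates 64 bits at a time; see
 * vdev_raidz_generate_parity_pq() below.
 */
#ifdef RAIDZ_GF_EXAMPLE
static void
raidz_pq_byte_example(const uint8_t *d, int n, uint8_t *p, uint8_t *q)
{
	int i;

	*p = *q = 0;
	for (i = 0; i < n; i++) {
		*p ^= d[i];				/* P: plain XOR */
		*q = VDEV_RAIDZ_MUL_2(*q) ^ d[i];	/* Q: Horner's rule */
	}
}
#endif	/* RAIDZ_GF_EXAMPLE */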
146 
147 /*
148  * We provide a mechanism to perform the field multiplication operation on a
149  * 64-bit value all at once rather than a byte at a time. This works by
150  * creating a mask from the top bit in each byte and using that to
151  * conditionally apply the XOR of 0x1d.
152  */
153 #define	VDEV_RAIDZ_64MUL_2(x, mask) \
154 { \
155 	(mask) = (x) & 0x8080808080808080ULL; \
156 	(mask) = ((mask) << 1) - ((mask) >> 7); \
157 	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
158 	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
159 }
160 
161 #define	VDEV_RAIDZ_64MUL_4(x, mask) \
162 { \
163 	VDEV_RAIDZ_64MUL_2((x), mask); \
164 	VDEV_RAIDZ_64MUL_2((x), mask); \
165 }
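
/*
 * Sanity sketch (hypothetical; compiled out by default): the lane-parallel
 * 64-bit multiply above must agree with VDEV_RAIDZ_MUL_2() applied to each
 * of the eight byte lanes independently.
 */
#ifdef RAIDZ_GF_EXAMPLE
static void
raidz_64mul_check(uint64_t x)
{
	uint64_t packed = x, mask;
	int i;

	VDEV_RAIDZ_64MUL_2(packed, mask);

	for (i = 0; i < 8; i++) {
		uint8_t lane = (x >> (i * 8)) & 0xff;

		ASSERT3U((packed >> (i * 8)) & 0xff, ==,
		    VDEV_RAIDZ_MUL_2(lane) & 0xff);
	}
}
#endif	/* RAIDZ_GF_EXAMPLE */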
166 
167 #define	VDEV_LABEL_OFFSET(x)	((x) + VDEV_LABEL_START_SIZE)
168 
169 /*
170  * Force reconstruction to use the general purpose method.
171  */
172 int vdev_raidz_default_to_general;
173 
174 /* Powers of 2 in the Galois field defined above. */
175 static const uint8_t vdev_raidz_pow2[256] = {
176 	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
177 	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
178 	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
179 	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
180 	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
181 	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
182 	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
183 	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
184 	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
185 	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
186 	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
187 	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
188 	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
189 	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
190 	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
191 	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
192 	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
193 	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
194 	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
195 	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
196 	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
197 	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
198 	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
199 	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
200 	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
201 	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
202 	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
203 	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
204 	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
205 	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
206 	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
207 	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
208 };
209 /* Logs base 2 in the Galois field defined above. */
210 static const uint8_t vdev_raidz_log2[256] = {
211 	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
212 	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
213 	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
214 	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
215 	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
216 	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
217 	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
218 	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
219 	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
220 	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
221 	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
222 	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
223 	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
224 	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
225 	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
226 	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
227 	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
228 	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
229 	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
230 	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
231 	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
232 	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
233 	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
234 	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
235 	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
236 	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
237 	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
238 	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
239 	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
240 	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
241 	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
242 	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
243 };
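
/*
 * Worked example (hypothetical; compiled out by default): field
 * multiplication and inversion via the tables above, using
 * A * B = 2^((log_2(A) + log_2(B)) mod 255) and A^-1 = 2^(255 - log_2(A)).
 * The driver itself only needs vdev_raidz_exp2() below; gf_mul() and
 * gf_inv() are purely illustrative.
 */
#ifdef RAIDZ_GF_EXAMPLE
static uint8_t
gf_mul(uint8_t a, uint8_t b)
{
	int exp;

	if (a == 0 || b == 0)
		return (0);

	exp = vdev_raidz_log2[a] + vdev_raidz_log2[b];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}

static uint8_t
gf_inv(uint8_t a)
{
	ASSERT3U(a, !=, 0);

	return (vdev_raidz_pow2[255 - vdev_raidz_log2[a]]);
}
#endif	/* RAIDZ_GF_EXAMPLE */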
244 
245 static void vdev_raidz_generate_parity(raidz_map_t *rm);
246 
247 /*
248  * Multiply a given number by 2 raised to the given power.
249  */
250 static uint8_t
251 vdev_raidz_exp2(uint_t a, int exp)
252 {
253 	if (a == 0)
254 		return (0);
255 
256 	ASSERT(exp >= 0);
257 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
258 
259 	exp += vdev_raidz_log2[a];
260 	if (exp > 255)
261 		exp -= 255;
262 
263 	return (vdev_raidz_pow2[exp]);
264 }
265 
266 static void
267 vdev_raidz_map_free(raidz_map_t *rm)
268 {
269 	int c;
270 	size_t size;
271 
272 	for (c = 0; c < rm->rm_firstdatacol; c++) {
273 		abd_free(rm->rm_col[c].rc_abd);
274 
275 		if (rm->rm_col[c].rc_gdata != NULL)
276 			zio_buf_free(rm->rm_col[c].rc_gdata,
277 			    rm->rm_col[c].rc_size);
278 	}
279 
280 	size = 0;
281 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
282 		abd_put(rm->rm_col[c].rc_abd);
283 		size += rm->rm_col[c].rc_size;
284 	}
285 
286 	if (rm->rm_abd_copy != NULL)
287 		abd_free(rm->rm_abd_copy);
288 
289 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
290 }
291 
292 static void
293 vdev_raidz_map_free_vsd(zio_t *zio)
294 {
295 	raidz_map_t *rm = zio->io_vsd;
296 
297 	ASSERT0(rm->rm_freed);
298 	rm->rm_freed = 1;
299 
300 	if (rm->rm_reports == 0)
301 		vdev_raidz_map_free(rm);
302 }
303 
304 /*ARGSUSED*/
305 static void
306 vdev_raidz_cksum_free(void *arg, size_t ignored)
307 {
308 	raidz_map_t *rm = arg;
309 
310 	ASSERT3U(rm->rm_reports, >, 0);
311 
312 	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
313 		vdev_raidz_map_free(rm);
314 }
315 
316 static void
317 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
318 {
319 	raidz_map_t *rm = zcr->zcr_cbdata;
320 	size_t c = zcr->zcr_cbinfo;
321 	size_t x;
322 
323 	const char *good = NULL;
324 	char *bad;
325 
326 	if (good_data == NULL) {
327 		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
328 		return;
329 	}
330 
331 	if (c < rm->rm_firstdatacol) {
332 		/*
333 		 * The first time through, calculate the parity blocks for
334 		 * the good data (this relies on the fact that the good
335 		 * data never changes for a given logical ZIO)
336 		 */
337 		if (rm->rm_col[0].rc_gdata == NULL) {
338 			abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
339 			char *buf;
340 			int offset;
341 
342 			/*
343 			 * Set up the rm_col[]s to generate the parity for
344 			 * good_data, first saving the parity bufs and
345 			 * replacing them with buffers to hold the result.
346 			 */
347 			for (x = 0; x < rm->rm_firstdatacol; x++) {
348 				bad_parity[x] = rm->rm_col[x].rc_abd;
349 				rm->rm_col[x].rc_gdata =
350 				    zio_buf_alloc(rm->rm_col[x].rc_size);
351 				rm->rm_col[x].rc_abd =
352 				    abd_get_from_buf(rm->rm_col[x].rc_gdata,
353 				    rm->rm_col[x].rc_size);
354 			}
355 
356 			/* fill in the data columns from good_data */
357 			buf = (char *)good_data;
358 			for (; x < rm->rm_cols; x++) {
359 				abd_put(rm->rm_col[x].rc_abd);
360 				rm->rm_col[x].rc_abd = abd_get_from_buf(buf,
361 				    rm->rm_col[x].rc_size);
362 				buf += rm->rm_col[x].rc_size;
363 			}
364 
365 			/*
366 			 * Construct the parity from the good data.
367 			 */
368 			vdev_raidz_generate_parity(rm);
369 
370 			/* restore everything back to its original state */
371 			for (x = 0; x < rm->rm_firstdatacol; x++) {
372 				abd_put(rm->rm_col[x].rc_abd);
373 				rm->rm_col[x].rc_abd = bad_parity[x];
374 			}
375 
376 			offset = 0;
377 			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
378 				abd_put(rm->rm_col[x].rc_abd);
379 				rm->rm_col[x].rc_abd = abd_get_offset(
380 				    rm->rm_abd_copy, offset);
381 				offset += rm->rm_col[x].rc_size;
382 			}
383 		}
384 
385 		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
386 		good = rm->rm_col[c].rc_gdata;
387 	} else {
388 		/* adjust good_data to point at the start of our column */
389 		good = good_data;
390 
391 		for (x = rm->rm_firstdatacol; x < c; x++)
392 			good += rm->rm_col[x].rc_size;
393 	}
394 
395 	bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size);
396 	/* we drop the ereport if it ends up that the data was good */
397 	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
398 	abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size);
399 }
400 
401 /*
402  * Invoked indirectly by zfs_ereport_start_checksum(), called
403  * below when our read operation fails completely.  The main point
404  * is to keep a copy of everything we read from disk, so that at
405  * vdev_raidz_cksum_finish() time we can compare it with the good data.
406  */
407 static void
408 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
409 {
410 	size_t c = (size_t)(uintptr_t)arg;
411 	size_t offset;
412 
413 	raidz_map_t *rm = zio->io_vsd;
414 	size_t size;
415 
416 	/* set up the report and bump the refcount  */
417 	zcr->zcr_cbdata = rm;
418 	zcr->zcr_cbinfo = c;
419 	zcr->zcr_finish = vdev_raidz_cksum_finish;
420 	zcr->zcr_free = vdev_raidz_cksum_free;
421 
422 	rm->rm_reports++;
423 	ASSERT3U(rm->rm_reports, >, 0);
424 
425 	if (rm->rm_abd_copy != NULL)
426 		return;
427 
428 	/*
429 	 * It's the first time we're called for this raidz_map_t, so we need
430 	 * to copy the data aside; there's no guarantee that our zio's buffer
431 	 * won't be re-used for something else.
432 	 *
433 	 * Our parity data is already in separate buffers, so there's no need
434 	 * to copy them.
435 	 */
436 
437 	size = 0;
438 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
439 		size += rm->rm_col[c].rc_size;
440 
441 	rm->rm_abd_copy =
442 	    abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size);
443 
444 	for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
445 		raidz_col_t *col = &rm->rm_col[c];
446 		abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset);
447 
448 		abd_copy(tmp, col->rc_abd, col->rc_size);
449 		abd_put(col->rc_abd);
450 		col->rc_abd = tmp;
451 
452 		offset += col->rc_size;
453 	}
454 	ASSERT3U(offset, ==, size);
455 }
456 
457 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
458 	vdev_raidz_map_free_vsd,
459 	vdev_raidz_cksum_report
460 };
461 
462 /*
463  * Divides the I/O evenly across all child vdevs; usually, dcols is
464  * the number of children in the target vdev.
465  */
466 static raidz_map_t *
467 vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset,
468     uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
469 {
470 	raidz_map_t *rm;
471 	/* The starting RAIDZ (parent) vdev sector of the block. */
472 	uint64_t b = offset >> unit_shift;
473 	/* The zio's size in units of the vdev's minimum sector size. */
474 	uint64_t s = size >> unit_shift;
475 	/* The first column for this stripe. */
476 	uint64_t f = b % dcols;
477 	/* The starting byte offset on each child vdev. */
478 	uint64_t o = (b / dcols) << unit_shift;
479 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
480 	uint64_t off = 0;
481 
482 	/*
483 	 * "Quotient": The number of data sectors for this stripe on all but
484 	 * the "big column" child vdevs that also contain "remainder" data.
485 	 */
486 	q = s / (dcols - nparity);
487 
488 	/*
489 	 * "Remainder": The number of partial stripe data sectors in this I/O.
490 	 * This will add a sector to some, but not all, child vdevs.
491 	 */
492 	r = s - q * (dcols - nparity);
493 
494 	/* The number of "big columns" - those which contain remainder data. */
495 	bc = (r == 0 ? 0 : r + nparity);
496 
497 	/*
498 	 * The total number of data and parity sectors associated with
499 	 * this I/O.
500 	 */
501 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
502 
503 	/* acols: The columns that will be accessed. */
504 	/* scols: The columns that will be accessed or skipped. */
505 	if (q == 0) {
506 		/* Our I/O request doesn't span all child vdevs. */
507 		acols = bc;
508 		scols = MIN(dcols, roundup(bc, nparity + 1));
509 	} else {
510 		acols = dcols;
511 		scols = dcols;
512 	}
513 
514 	ASSERT3U(acols, <=, scols);
515 
516 	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
517 
518 	rm->rm_cols = acols;
519 	rm->rm_scols = scols;
520 	rm->rm_bigcols = bc;
521 	rm->rm_skipstart = bc;
522 	rm->rm_missingdata = 0;
523 	rm->rm_missingparity = 0;
524 	rm->rm_firstdatacol = nparity;
525 	rm->rm_abd_copy = NULL;
526 	rm->rm_reports = 0;
527 	rm->rm_freed = 0;
528 	rm->rm_ecksuminjected = 0;
529 
530 	asize = 0;
531 
532 	for (c = 0; c < scols; c++) {
533 		col = f + c;
534 		coff = o;
535 		if (col >= dcols) {
536 			col -= dcols;
537 			coff += 1ULL << unit_shift;
538 		}
539 		rm->rm_col[c].rc_devidx = col;
540 		rm->rm_col[c].rc_offset = coff;
541 		rm->rm_col[c].rc_abd = NULL;
542 		rm->rm_col[c].rc_gdata = NULL;
543 		rm->rm_col[c].rc_error = 0;
544 		rm->rm_col[c].rc_tried = 0;
545 		rm->rm_col[c].rc_skipped = 0;
546 
547 		if (c >= acols)
548 			rm->rm_col[c].rc_size = 0;
549 		else if (c < bc)
550 			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
551 		else
552 			rm->rm_col[c].rc_size = q << unit_shift;
553 
554 		asize += rm->rm_col[c].rc_size;
555 	}
556 
557 	ASSERT3U(asize, ==, tot << unit_shift);
558 	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
559 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
560 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
561 	ASSERT3U(rm->rm_nskip, <=, nparity);
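
	/*
	 * Worked example (for illustration): a two-sector write (s = 2) to
	 * a single-parity raidz of five children (dcols = 5, nparity = 1)
	 * gives q = 0, r = 2, bc = 3 and tot = 3, so acols = 3 and
	 * scols = 4.  One sector is skipped (rm_nskip = roundup(3, 2) - 3
	 * = 1) so that rm_asize stays a multiple of (nparity + 1) sectors.
	 */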
562 
563 	for (c = 0; c < rm->rm_firstdatacol; c++)
564 		rm->rm_col[c].rc_abd =
565 		    abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);
566 
567 	rm->rm_col[c].rc_abd = abd_get_offset(abd, 0);
568 	off = rm->rm_col[c].rc_size;
569 
570 	for (c = c + 1; c < acols; c++) {
571 		rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
572 		off += rm->rm_col[c].rc_size;
573 	}
574 
575 	/*
576 	 * If all data stored spans all columns, there's a danger that parity
577 	 * will always be on the same device and, since parity isn't read
578 	 * during normal operation, that that device's I/O bandwidth won't be
579 	 * used effectively. We therefore switch the parity every 1MB.
580 	 *
581 	 * ... at least that was, ostensibly, the theory. As a practical
582 	 * matter unless we juggle the parity between all devices evenly, we
583 	 * won't see any benefit. Further, occasional writes that aren't a
584 	 * multiple of the LCM of the number of children and the minimum
585 	 * stripe width are sufficient to avoid pessimal behavior.
586 	 * Unfortunately, this decision created an implicit on-disk format
587 	 * requirement that we need to support for all eternity, but only
588 	 * for single-parity RAID-Z.
589 	 *
590 	 * If we intend to skip a sector in the zeroth column for padding
591 	 * we must make sure to note this swap. We will never intend to
592 	 * skip the first column since at least one data and one parity
593 	 * column must appear in each row.
594 	 */
595 	ASSERT(rm->rm_cols >= 2);
596 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
597 
598 	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
599 		devidx = rm->rm_col[0].rc_devidx;
600 		o = rm->rm_col[0].rc_offset;
601 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
602 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
603 		rm->rm_col[1].rc_devidx = devidx;
604 		rm->rm_col[1].rc_offset = o;
605 
606 		if (rm->rm_skipstart == 0)
607 			rm->rm_skipstart = 1;
608 	}
609 
610 	return (rm);
611 }
612 
613 struct pqr_struct {
614 	uint64_t *p;
615 	uint64_t *q;
616 	uint64_t *r;
617 };
618 
619 static int
620 vdev_raidz_p_func(void *buf, size_t size, void *private)
621 {
622 	struct pqr_struct *pqr = private;
623 	const uint64_t *src = buf;
624 	int i, cnt = size / sizeof (src[0]);
625 
626 	ASSERT(pqr->p && !pqr->q && !pqr->r);
627 
628 	for (i = 0; i < cnt; i++, src++, pqr->p++)
629 		*pqr->p ^= *src;
630 
631 	return (0);
632 }
633 
634 static int
635 vdev_raidz_pq_func(void *buf, size_t size, void *private)
636 {
637 	struct pqr_struct *pqr = private;
638 	const uint64_t *src = buf;
639 	uint64_t mask;
640 	int i, cnt = size / sizeof (src[0]);
641 
642 	ASSERT(pqr->p && pqr->q && !pqr->r);
643 
644 	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
645 		*pqr->p ^= *src;
646 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
647 		*pqr->q ^= *src;
648 	}
649 
650 	return (0);
651 }
652 
653 static int
654 vdev_raidz_pqr_func(void *buf, size_t size, void *private)
655 {
656 	struct pqr_struct *pqr = private;
657 	const uint64_t *src = buf;
658 	uint64_t mask;
659 	int i, cnt = size / sizeof (src[0]);
660 
661 	ASSERT(pqr->p && pqr->q && pqr->r);
662 
663 	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
664 		*pqr->p ^= *src;
665 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
666 		*pqr->q ^= *src;
667 		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
668 		*pqr->r ^= *src;
669 	}
670 
671 	return (0);
672 }
673 
674 static void
675 vdev_raidz_generate_parity_p(raidz_map_t *rm)
676 {
677 	uint64_t *p;
678 	int c;
679 	abd_t *src;
680 
681 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
682 		src = rm->rm_col[c].rc_abd;
683 		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
684 
685 		if (c == rm->rm_firstdatacol) {
686 			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
687 		} else {
688 			struct pqr_struct pqr = { p, NULL, NULL };
689 			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
690 			    vdev_raidz_p_func, &pqr);
691 		}
692 	}
693 }
694 
695 static void
696 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
697 {
698 	uint64_t *p, *q, pcnt, ccnt, mask, i;
699 	int c;
700 	abd_t *src;
701 
702 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
703 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
704 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
705 
706 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
707 		src = rm->rm_col[c].rc_abd;
708 		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
709 		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
710 
711 		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
712 
713 		if (c == rm->rm_firstdatacol) {
714 			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
715 			(void) memcpy(q, p, rm->rm_col[c].rc_size);
716 		} else {
717 			struct pqr_struct pqr = { p, q, NULL };
718 			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
719 			    vdev_raidz_pq_func, &pqr);
720 		}
721 
722 		if (c == rm->rm_firstdatacol) {
723 			for (i = ccnt; i < pcnt; i++) {
724 				p[i] = 0;
725 				q[i] = 0;
726 			}
727 		} else {
728 			/*
729 			 * Treat short columns as though they are full of 0s.
730 			 * Note that there's therefore nothing needed for P.
731 			 */
732 			for (i = ccnt; i < pcnt; i++) {
733 				VDEV_RAIDZ_64MUL_2(q[i], mask);
734 			}
735 		}
736 	}
737 }
738 
739 static void
740 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
741 {
742 	uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
743 	int c;
744 	abd_t *src;
745 
746 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
747 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
748 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
749 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
750 	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
751 
752 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
753 		src = rm->rm_col[c].rc_abd;
754 		p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
755 		q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
756 		r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);
757 
758 		ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
759 
760 		if (c == rm->rm_firstdatacol) {
761 			abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
762 			(void) memcpy(q, p, rm->rm_col[c].rc_size);
763 			(void) memcpy(r, p, rm->rm_col[c].rc_size);
764 		} else {
765 			struct pqr_struct pqr = { p, q, r };
766 			(void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
767 			    vdev_raidz_pqr_func, &pqr);
768 		}
769 
770 		if (c == rm->rm_firstdatacol) {
771 			for (i = ccnt; i < pcnt; i++) {
772 				p[i] = 0;
773 				q[i] = 0;
774 				r[i] = 0;
775 			}
776 		} else {
777 			/*
778 			 * Treat short columns as though they are full of 0s.
779 			 * Note that there's therefore nothing needed for P.
780 			 */
781 			for (i = ccnt; i < pcnt; i++) {
782 				VDEV_RAIDZ_64MUL_2(q[i], mask);
783 				VDEV_RAIDZ_64MUL_4(r[i], mask);
784 			}
785 		}
786 	}
787 }
788 
789 /*
790  * Generate RAID parity in the first virtual columns according to the number of
791  * parity columns available.
792  */
793 static void
794 vdev_raidz_generate_parity(raidz_map_t *rm)
795 {
796 	switch (rm->rm_firstdatacol) {
797 	case 1:
798 		vdev_raidz_generate_parity_p(rm);
799 		break;
800 	case 2:
801 		vdev_raidz_generate_parity_pq(rm);
802 		break;
803 	case 3:
804 		vdev_raidz_generate_parity_pqr(rm);
805 		break;
806 	default:
807 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
808 	}
809 }
810 
811 /* ARGSUSED */
812 static int
813 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
814 {
815 	uint64_t *dst = dbuf;
816 	uint64_t *src = sbuf;
817 	int cnt = size / sizeof (src[0]);
818 
819 	for (int i = 0; i < cnt; i++) {
820 		dst[i] ^= src[i];
821 	}
822 
823 	return (0);
824 }
825 
826 /* ARGSUSED */
827 static int
828 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
829     void *private)
830 {
831 	uint64_t *dst = dbuf;
832 	uint64_t *src = sbuf;
833 	uint64_t mask;
834 	int cnt = size / sizeof (dst[0]);
835 
836 	for (int i = 0; i < cnt; i++, dst++, src++) {
837 		VDEV_RAIDZ_64MUL_2(*dst, mask);
838 		*dst ^= *src;
839 	}
840 
841 	return (0);
842 }
843 
844 /* ARGSUSED */
845 static int
846 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
847 {
848 	uint64_t *dst = buf;
849 	uint64_t mask;
850 	int cnt = size / sizeof (dst[0]);
851 
852 	for (int i = 0; i < cnt; i++, dst++) {
853 		/* same operation as vdev_raidz_reconst_q_pre_func() on dst */
854 		VDEV_RAIDZ_64MUL_2(*dst, mask);
855 	}
856 
857 	return (0);
858 }
859 
860 struct reconst_q_struct {
861 	uint64_t *q;
862 	int exp;
863 };
864 
865 static int
866 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
867 {
868 	struct reconst_q_struct *rq = private;
869 	uint64_t *dst = buf;
870 	int cnt = size / sizeof (dst[0]);
871 
872 	for (int i = 0; i < cnt; i++, dst++, rq->q++) {
873 		*dst ^= *rq->q;
874 
875 		int j;
876 		uint8_t *b;
877 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
878 			*b = vdev_raidz_exp2(*b, rq->exp);
879 		}
880 	}
881 
882 	return (0);
883 }
884 
885 struct reconst_pq_struct {
886 	uint8_t *p;
887 	uint8_t *q;
888 	uint8_t *pxy;
889 	uint8_t *qxy;
890 	int aexp;
891 	int bexp;
892 };
893 
894 static int
895 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
896 {
897 	struct reconst_pq_struct *rpq = private;
898 	uint8_t *xd = xbuf;
899 	uint8_t *yd = ybuf;
900 
901 	for (int i = 0; i < size;
902 	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
903 		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
904 		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
905 		*yd = *rpq->p ^ *rpq->pxy ^ *xd;
906 	}
907 
908 	return (0);
909 }
910 
911 static int
912 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
913 {
914 	struct reconst_pq_struct *rpq = private;
915 	uint8_t *xd = xbuf;
916 
917 	for (int i = 0; i < size;
918 	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
919 		/* same operation as vdev_raidz_reconst_pq_func() on xd */
920 		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
921 		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
922 	}
923 
924 	return (0);
925 }
926 
927 static int
928 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
929 {
930 	int x = tgts[0];
931 	int c;
932 	abd_t *dst, *src;
933 
934 	ASSERT(ntgts == 1);
935 	ASSERT(x >= rm->rm_firstdatacol);
936 	ASSERT(x < rm->rm_cols);
937 
938 	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
939 	ASSERT(rm->rm_col[x].rc_size > 0);
940 
941 	src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
942 	dst = rm->rm_col[x].rc_abd;
943 
944 	abd_copy(dst, src, rm->rm_col[x].rc_size);
945 
946 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
947 		uint64_t size = MIN(rm->rm_col[x].rc_size,
948 		    rm->rm_col[c].rc_size);
949 
950 		src = rm->rm_col[c].rc_abd;
951 		dst = rm->rm_col[x].rc_abd;
952 
953 		if (c == x)
954 			continue;
955 
956 		(void) abd_iterate_func2(dst, src, 0, 0, size,
957 		    vdev_raidz_reconst_p_func, NULL);
958 	}
959 
960 	return (1 << VDEV_RAIDZ_P);
961 }
962 
963 static int
964 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
965 {
966 	int x = tgts[0];
967 	int c, exp;
968 	abd_t *dst, *src;
969 
970 	ASSERT(ntgts == 1);
971 
972 	ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);
973 
974 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
975 		uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
976 		    rm->rm_col[c].rc_size);
977 
978 		src = rm->rm_col[c].rc_abd;
979 		dst = rm->rm_col[x].rc_abd;
980 
981 		if (c == rm->rm_firstdatacol) {
982 			abd_copy(dst, src, size);
983 			if (rm->rm_col[x].rc_size > size)
984 				abd_zero_off(dst, size,
985 				    rm->rm_col[x].rc_size - size);
986 		} else {
987 			ASSERT3U(size, <=, rm->rm_col[x].rc_size);
988 			(void) abd_iterate_func2(dst, src, 0, 0, size,
989 			    vdev_raidz_reconst_q_pre_func, NULL);
990 			(void) abd_iterate_func(dst,
991 			    size, rm->rm_col[x].rc_size - size,
992 			    vdev_raidz_reconst_q_pre_tail_func, NULL);
993 		}
994 	}
995 
996 	src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
997 	dst = rm->rm_col[x].rc_abd;
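	/*
	 * Divide out D_x's Q coefficient, 2^(ndevs - 1 - x): multiplying by
	 * 2^(255 - (ndevs - 1 - x)) is multiplication by its inverse, since
	 * 2^255 = 1 in this field.
	 */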
998 	exp = 255 - (rm->rm_cols - 1 - x);
999 
1000 	struct reconst_q_struct rq = { abd_to_buf(src), exp };
1001 	(void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
1002 	    vdev_raidz_reconst_q_post_func, &rq);
1003 
1004 	return (1 << VDEV_RAIDZ_Q);
1005 }
1006 
1007 static int
1008 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
1009 {
1010 	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
1011 	abd_t *pdata, *qdata;
1012 	uint64_t xsize, ysize;
1013 	int x = tgts[0];
1014 	int y = tgts[1];
1015 	abd_t *xd, *yd;
1016 
1017 	ASSERT(ntgts == 2);
1018 	ASSERT(x < y);
1019 	ASSERT(x >= rm->rm_firstdatacol);
1020 	ASSERT(y < rm->rm_cols);
1021 
1022 	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
1023 
1024 	/*
1025 	 * Move the parity data aside -- we're going to compute parity as
1026 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
1027 	 * reuse the parity generation mechanism without trashing the actual
1028 	 * parity so we make those columns appear to be full of zeros by
1029 	 * setting their lengths to zero.
1030 	 */
1031 	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
1032 	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
1033 	xsize = rm->rm_col[x].rc_size;
1034 	ysize = rm->rm_col[y].rc_size;
1035 
1036 	rm->rm_col[VDEV_RAIDZ_P].rc_abd =
1037 	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
1038 	rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
1039 	    abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
1040 	rm->rm_col[x].rc_size = 0;
1041 	rm->rm_col[y].rc_size = 0;
1042 
1043 	vdev_raidz_generate_parity_pq(rm);
1044 
1045 	rm->rm_col[x].rc_size = xsize;
1046 	rm->rm_col[y].rc_size = ysize;
1047 
1048 	p = abd_to_buf(pdata);
1049 	q = abd_to_buf(qdata);
1050 	pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
1051 	qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
1052 	xd = rm->rm_col[x].rc_abd;
1053 	yd = rm->rm_col[y].rc_abd;
1054 
1055 	/*
1056 	 * We now have:
1057 	 *	Pxy = P + D_x + D_y
1058 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
1059 	 *
1060 	 * We can then solve for D_x:
1061 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
1062 	 * where
1063 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
1064 	 *	B = 2^-(ndevs - 1 - x) * (2^(x - y) + 1)^-1
1065 	 *
1066 	 * With D_x in hand, we can easily solve for D_y:
1067 	 *	D_y = P + Pxy + D_x
1068 	 */
1069 
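	/*
	 * a is 2^(x - y), b is the inverse of D_x's Q coefficient, and tmp
	 * is the log of (2^(x - y) + 1)^-1; note that a ^ 1 is the field
	 * sum a + 1, since field addition is XOR.
	 */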
1070 	a = vdev_raidz_pow2[255 + x - y];
1071 	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
1072 	tmp = 255 - vdev_raidz_log2[a ^ 1];
1073 
1074 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
1075 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
1076 
1077 	ASSERT3U(xsize, >=, ysize);
1078 	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
1079 	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
1080 	    vdev_raidz_reconst_pq_func, &rpq);
1081 	(void) abd_iterate_func(xd, ysize, xsize - ysize,
1082 	    vdev_raidz_reconst_pq_tail_func, &rpq);
1083 
1084 	abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
1085 	abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
1086 
1087 	/*
1088 	 * Restore the saved parity data.
1089 	 */
1090 	rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
1091 	rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;
1092 
1093 	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
1094 }
1095 
1096 /* BEGIN CSTYLED */
1097 /*
1098  * In the general case of reconstruction, we must solve the system of linear
1099  * equations defined by the coefficients used to generate parity as well as
1100  * the contents of the data and parity disks. This can be expressed with
1101  * vectors for the original data (D) and the actual data (d) and parity (p)
1102  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
1103  *
1104  *            __   __                     __     __
1105  *            |     |         __     __   |  p_0  |
1106  *            |  V  |         |  D_0  |   | p_m-1 |
1107  *            |     |    x    |   :   | = |  d_0  |
1108  *            |  I  |         | D_n-1 |   |   :   |
1109  *            |     |         ~~     ~~   | d_n-1 |
1110  *            ~~   ~~                     ~~     ~~
1111  *
1112  * I is simply a square identity matrix of size n, and V is a Vandermonde
1113  * matrix defined by the coefficients we chose for the various parity columns
1114  * (1, 2, 4). Note that these values were chosen for simplicity and speed of
1115  * computation, as well as for linear separability.
1116  *
1117  *      __               __               __     __
1118  *      |   1   ..  1 1 1 |               |  p_0  |
1119  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
1120  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
1121  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
1122  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
1123  *      |   :       : : : |   |   :   |   |  d_2  |
1124  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
1125  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
1126  *      |   0   ..  0 0 1 |               | d_n-1 |
1127  *      ~~               ~~               ~~     ~~
1128  *
1129  * Note that I, V, d, and p are known. To compute D, we must invert the
1130  * matrix and use the known data and parity values to reconstruct the unknown
1131  * data values. We begin by removing the rows in V|I and d|p that correspond
1132  * to failed or missing columns; we then make V|I square (n x n) and d|p
1133  * sized n by removing rows corresponding to unused parity from the bottom up
1134  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
1135  * using Gauss-Jordan elimination. In the example below we use m=3 parity
1136  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
1137  *           __                               __
1138  *           |  1   1   1   1   1   1   1   1  |
1139  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
1140  *           |  19 205 116  29  64  16  4   1  |      / /
1141  *           |  1   0   0   0   0   0   0   0  |     / /
1142  *           |  0   1   0   0   0   0   0   0  | <--' /
1143  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
1144  *           |  0   0   0   1   0   0   0   0  |
1145  *           |  0   0   0   0   1   0   0   0  |
1146  *           |  0   0   0   0   0   1   0   0  |
1147  *           |  0   0   0   0   0   0   1   0  |
1148  *           |  0   0   0   0   0   0   0   1  |
1149  *           ~~                               ~~
1150  *           __                               __
1151  *           |  1   1   1   1   1   1   1   1  |
1152  *           |  19 205 116  29  64  16  4   1  |
1153  *           |  1   0   0   0   0   0   0   0  |
1154  *  (V|I)' = |  0   0   0   1   0   0   0   0  |
1155  *           |  0   0   0   0   1   0   0   0  |
1156  *           |  0   0   0   0   0   1   0   0  |
1157  *           |  0   0   0   0   0   0   1   0  |
1158  *           |  0   0   0   0   0   0   0   1  |
1159  *           ~~                               ~~
1160  *
1161  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1162  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1163  * matrix is not singular.
1164  * __                                                                 __
1165  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1166  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1167  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1168  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1169  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1170  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1171  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1172  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1173  * ~~                                                                 ~~
1174  * __                                                                 __
1175  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1176  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1177  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1178  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1179  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1180  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1181  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1182  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1183  * ~~                                                                 ~~
1184  * __                                                                 __
1185  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1186  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1187  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1188  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1189  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1190  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1191  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1192  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1193  * ~~                                                                 ~~
1194  * __                                                                 __
1195  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1196  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1197  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1198  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1199  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1200  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1201  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1202  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1203  * ~~                                                                 ~~
1204  * __                                                                 __
1205  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1206  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1207  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1208  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1209  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1210  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1211  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1212  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1213  * ~~                                                                 ~~
1214  * __                                                                 __
1215  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1216  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1217  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1218  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1219  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1220  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1221  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1222  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1223  * ~~                                                                 ~~
1224  *                   __                               __
1225  *                   |  0   0   1   0   0   0   0   0  |
1226  *                   | 167 100  5   41 159 169 217 208 |
1227  *                   | 166 100  4   40 158 168 216 209 |
1228  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1229  *                   |  0   0   0   0   1   0   0   0  |
1230  *                   |  0   0   0   0   0   1   0   0  |
1231  *                   |  0   0   0   0   0   0   1   0  |
1232  *                   |  0   0   0   0   0   0   0   1  |
1233  *                   ~~                               ~~
1234  *
1235  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1236  * of the missing data.
1237  *
1238  * As is apparent from the example above, the only non-trivial rows in the
1239  * inverse matrix correspond to the data disks that we're trying to
1240  * reconstruct. Indeed, those are the only rows we need as the others would
1241  * only be useful for reconstructing data known or assumed to be valid. For
1242  * that reason, we only build the coefficients in the rows that correspond to
1243  * targeted columns.
1244  */
1245 /* END CSTYLED */
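
/*
 * Worked example (for illustration): with n = 4 data columns, the rows of V
 * that vdev_raidz_matrix_init() below generates for P, Q and R (map values
 * 0, 1 and 2) are
 *
 *	P: {  1,  1,  1,  1 }		entry j is 1^(n-1-j)
 *	Q: {  8,  4,  2,  1 }		entry j is 2^(n-1-j)
 *	R: { 64, 16,  4,  1 }		entry j is 4^(n-1-j)
 *
 * matching the Vandermonde rows in the diagram above.
 */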
1246 
1247 static void
1248 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1249     uint8_t **rows)
1250 {
1251 	int i, j;
1252 	int pow;
1253 
1254 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1255 
1256 	/*
1257 	 * Fill in the missing rows of interest.
1258 	 */
1259 	for (i = 0; i < nmap; i++) {
1260 		ASSERT3S(0, <=, map[i]);
1261 		ASSERT3S(map[i], <=, 2);
1262 
1263 		pow = map[i] * n;
1264 		if (pow > 255)
1265 			pow -= 255;
1266 		ASSERT(pow <= 255);
1267 
1268 		for (j = 0; j < n; j++) {
1269 			pow -= map[i];
1270 			if (pow < 0)
1271 				pow += 255;
1272 			rows[i][j] = vdev_raidz_pow2[pow];
1273 		}
1274 	}
1275 }
1276 
1277 static void
1278 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1279     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1280 {
1281 	int i, j, ii, jj;
1282 	uint8_t log;
1283 
1284 	/*
1285 	 * Assert that the first nmissing entries from the array of used
1286 	 * columns correspond to parity columns and that subsequent entries
1287 	 * correspond to data columns.
1288 	 */
1289 	for (i = 0; i < nmissing; i++) {
1290 		ASSERT3S(used[i], <, rm->rm_firstdatacol);
1291 	}
1292 	for (; i < n; i++) {
1293 		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1294 	}
1295 
1296 	/*
1297 	 * First initialize the storage where we'll compute the inverse rows.
1298 	 */
1299 	for (i = 0; i < nmissing; i++) {
1300 		for (j = 0; j < n; j++) {
1301 			invrows[i][j] = (i == j) ? 1 : 0;
1302 		}
1303 	}
1304 
1305 	/*
1306 	 * Subtract all trivial rows from the rows of consequence.
1307 	 */
1308 	for (i = 0; i < nmissing; i++) {
1309 		for (j = nmissing; j < n; j++) {
1310 			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1311 			jj = used[j] - rm->rm_firstdatacol;
1312 			ASSERT3S(jj, <, n);
1313 			invrows[i][j] = rows[i][jj];
1314 			rows[i][jj] = 0;
1315 		}
1316 	}
1317 
1318 	/*
1319 	 * For each of the rows of interest, we must normalize it and subtract
1320 	 * a multiple of it from the other rows.
1321 	 */
1322 	for (i = 0; i < nmissing; i++) {
1323 		for (j = 0; j < missing[i]; j++) {
1324 			ASSERT0(rows[i][j]);
1325 		}
1326 		ASSERT3U(rows[i][missing[i]], !=, 0);
1327 
1328 		/*
1329 		 * Compute the inverse of the first element and multiply each
1330 		 * element in the row by that value.
1331 		 */
1332 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1333 
1334 		for (j = 0; j < n; j++) {
1335 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1336 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1337 		}
1338 
1339 		for (ii = 0; ii < nmissing; ii++) {
1340 			if (i == ii)
1341 				continue;
1342 
1343 			ASSERT3U(rows[ii][missing[i]], !=, 0);
1344 
1345 			log = vdev_raidz_log2[rows[ii][missing[i]]];
1346 
1347 			for (j = 0; j < n; j++) {
1348 				rows[ii][j] ^=
1349 				    vdev_raidz_exp2(rows[i][j], log);
1350 				invrows[ii][j] ^=
1351 				    vdev_raidz_exp2(invrows[i][j], log);
1352 			}
1353 		}
1354 	}
1355 
1356 	/*
1357 	 * Verify that the data that is left in the rows is properly part of
1358 	 * an identity matrix.
1359 	 */
1360 	for (i = 0; i < nmissing; i++) {
1361 		for (j = 0; j < n; j++) {
1362 			if (j == missing[i]) {
1363 				ASSERT3U(rows[i][j], ==, 1);
1364 			} else {
1365 				ASSERT0(rows[i][j]);
1366 			}
1367 		}
1368 	}
1369 }
1370 
1371 static void
1372 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1373     int *missing, uint8_t **invrows, const uint8_t *used)
1374 {
1375 	int i, j, x, cc, c;
1376 	uint8_t *src;
1377 	uint64_t ccount;
1378 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1379 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1380 	uint8_t log = 0;
1381 	uint8_t val;
1382 	int ll;
1383 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1384 	uint8_t *p, *pp;
1385 	size_t psize;
1386 
1387 	psize = sizeof (invlog[0][0]) * n * nmissing;
1388 	p = kmem_alloc(psize, KM_SLEEP);
1389 
1390 	for (pp = p, i = 0; i < nmissing; i++) {
1391 		invlog[i] = pp;
1392 		pp += n;
1393 	}
1394 
1395 	for (i = 0; i < nmissing; i++) {
1396 		for (j = 0; j < n; j++) {
1397 			ASSERT3U(invrows[i][j], !=, 0);
1398 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1399 		}
1400 	}
1401 
1402 	for (i = 0; i < n; i++) {
1403 		c = used[i];
1404 		ASSERT3U(c, <, rm->rm_cols);
1405 
1406 		src = abd_to_buf(rm->rm_col[c].rc_abd);
1407 		ccount = rm->rm_col[c].rc_size;
1408 		for (j = 0; j < nmissing; j++) {
1409 			cc = missing[j] + rm->rm_firstdatacol;
1410 			ASSERT3U(cc, >=, rm->rm_firstdatacol);
1411 			ASSERT3U(cc, <, rm->rm_cols);
1412 			ASSERT3U(cc, !=, c);
1413 
1414 			dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
1415 			dcount[j] = rm->rm_col[cc].rc_size;
1416 		}
1417 
1418 		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1419 
1420 		for (x = 0; x < ccount; x++, src++) {
1421 			if (*src != 0)
1422 				log = vdev_raidz_log2[*src];
1423 
1424 			for (cc = 0; cc < nmissing; cc++) {
1425 				if (x >= dcount[cc])
1426 					continue;
1427 
1428 				if (*src == 0) {
1429 					val = 0;
1430 				} else {
1431 					if ((ll = log + invlog[cc][i]) >= 255)
1432 						ll -= 255;
1433 					val = vdev_raidz_pow2[ll];
1434 				}
1435 
1436 				if (i == 0)
1437 					dst[cc][x] = val;
1438 				else
1439 					dst[cc][x] ^= val;
1440 			}
1441 		}
1442 	}
1443 
1444 	kmem_free(p, psize);
1445 }
1446 
1447 static int
1448 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1449 {
1450 	int n, i, c, t, tt;
1451 	int nmissing_rows;
1452 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1453 	int parity_map[VDEV_RAIDZ_MAXPARITY];
1454 
1455 	uint8_t *p, *pp;
1456 	size_t psize;
1457 
1458 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1459 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1460 	uint8_t *used;
1461 
1462 	abd_t **bufs = NULL;
1463 
1464 	int code = 0;
1465 
1466 	/*
1467 	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
1468 	 * temporary linear ABDs.
1469 	 */
1470 	if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
1471 		bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);
1472 
1473 		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1474 			raidz_col_t *col = &rm->rm_col[c];
1475 
1476 			bufs[c] = col->rc_abd;
1477 			col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
1478 			abd_copy(col->rc_abd, bufs[c], col->rc_size);
1479 		}
1480 	}
1481 
1482 	n = rm->rm_cols - rm->rm_firstdatacol;
1483 
1484 	/*
1485 	 * Figure out which data columns are missing.
1486 	 */
1487 	nmissing_rows = 0;
1488 	for (t = 0; t < ntgts; t++) {
1489 		if (tgts[t] >= rm->rm_firstdatacol) {
1490 			missing_rows[nmissing_rows++] =
1491 			    tgts[t] - rm->rm_firstdatacol;
1492 		}
1493 	}
1494 
1495 	/*
1496 	 * Figure out which parity columns to use to help generate the missing
1497 	 * data columns.
1498 	 */
1499 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1500 		ASSERT(tt < ntgts);
1501 		ASSERT(c < rm->rm_firstdatacol);
1502 
1503 		/*
1504 		 * Skip any targeted parity columns.
1505 		 */
1506 		if (c == tgts[tt]) {
1507 			tt++;
1508 			continue;
1509 		}
1510 
1511 		code |= 1 << c;
1512 
1513 		parity_map[i] = c;
1514 		i++;
1515 	}
1516 
1517 	ASSERT(code != 0);
1518 	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1519 
1520 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1521 	    nmissing_rows * n + sizeof (used[0]) * n;
1522 	p = kmem_alloc(psize, KM_SLEEP);
1523 
1524 	for (pp = p, i = 0; i < nmissing_rows; i++) {
1525 		rows[i] = pp;
1526 		pp += n;
1527 		invrows[i] = pp;
1528 		pp += n;
1529 	}
1530 	used = pp;
1531 
1532 	for (i = 0; i < nmissing_rows; i++) {
1533 		used[i] = parity_map[i];
1534 	}
1535 
1536 	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1537 		if (tt < nmissing_rows &&
1538 		    c == missing_rows[tt] + rm->rm_firstdatacol) {
1539 			tt++;
1540 			continue;
1541 		}
1542 
1543 		ASSERT3S(i, <, n);
1544 		used[i] = c;
1545 		i++;
1546 	}
1547 
1548 	/*
1549 	 * Initialize the interesting rows of the matrix.
1550 	 */
1551 	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1552 
1553 	/*
1554 	 * Invert the matrix.
1555 	 */
1556 	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1557 	    invrows, used);
1558 
1559 	/*
1560 	 * Reconstruct the missing data using the generated matrix.
1561 	 */
1562 	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1563 	    invrows, used);
1564 
1565 	kmem_free(p, psize);
1566 
1567 	/*
1568 	 * copy back from temporary linear abds and free them
1569 	 */
1570 	if (bufs) {
1571 		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1572 			raidz_col_t *col = &rm->rm_col[c];
1573 
1574 			abd_copy(bufs[c], col->rc_abd, col->rc_size);
1575 			abd_free(col->rc_abd);
1576 			col->rc_abd = bufs[c];
1577 		}
1578 		kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
1579 	}
1580 
1581 	return (code);
1582 }
1583 
1584 static int
1585 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1586 {
1587 	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1588 	int ntgts;
1589 	int i, c;
1590 	int code;
1591 	int nbadparity, nbaddata;
1592 	int parity_valid[VDEV_RAIDZ_MAXPARITY];
1593 
1594 	/*
1595 	 * The tgts list must already be sorted.
1596 	 */
1597 	for (i = 1; i < nt; i++) {
1598 		ASSERT(t[i] > t[i - 1]);
1599 	}
1600 
1601 	nbadparity = rm->rm_firstdatacol;
1602 	nbaddata = rm->rm_cols - nbadparity;
1603 	ntgts = 0;
1604 	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1605 		if (c < rm->rm_firstdatacol)
1606 			parity_valid[c] = B_FALSE;
1607 
1608 		if (i < nt && c == t[i]) {
1609 			tgts[ntgts++] = c;
1610 			i++;
1611 		} else if (rm->rm_col[c].rc_error != 0) {
1612 			tgts[ntgts++] = c;
1613 		} else if (c >= rm->rm_firstdatacol) {
1614 			nbaddata--;
1615 		} else {
1616 			parity_valid[c] = B_TRUE;
1617 			nbadparity--;
1618 		}
1619 	}
1620 
1621 	ASSERT(ntgts >= nt);
1622 	ASSERT(nbaddata >= 0);
1623 	ASSERT(nbaddata + nbadparity == ntgts);
1624 
1625 	dt = &tgts[nbadparity];
1626 
1627 	/*
1628 	 * See if we can use any of our optimized reconstruction routines.
1629 	 */
1630 	if (!vdev_raidz_default_to_general) {
1631 		switch (nbaddata) {
1632 		case 1:
1633 			if (parity_valid[VDEV_RAIDZ_P])
1634 				return (vdev_raidz_reconstruct_p(rm, dt, 1));
1635 
1636 			ASSERT(rm->rm_firstdatacol > 1);
1637 
1638 			if (parity_valid[VDEV_RAIDZ_Q])
1639 				return (vdev_raidz_reconstruct_q(rm, dt, 1));
1640 
1641 			ASSERT(rm->rm_firstdatacol > 2);
1642 			break;
1643 
1644 		case 2:
1645 			ASSERT(rm->rm_firstdatacol > 1);
1646 
1647 			if (parity_valid[VDEV_RAIDZ_P] &&
1648 			    parity_valid[VDEV_RAIDZ_Q])
1649 				return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1650 
1651 			ASSERT(rm->rm_firstdatacol > 2);
1652 
1653 			break;
1654 		}
1655 	}
1656 
1657 	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1658 	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1659 	ASSERT(code > 0);
1660 	return (code);
1661 }
1662 
1663 static int
1664 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1665     uint64_t *ashift)
1666 {
1667 	vdev_t *cvd;
1668 	uint64_t nparity = vd->vdev_nparity;
1669 	int c;
1670 	int lasterror = 0;
1671 	int numerrors = 0;
1672 
1673 	ASSERT(nparity > 0);
1674 
1675 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
1676 	    vd->vdev_children < nparity + 1) {
1677 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1678 		return (SET_ERROR(EINVAL));
1679 	}
1680 
1681 	vdev_open_children(vd);
1682 
1683 	for (c = 0; c < vd->vdev_children; c++) {
1684 		cvd = vd->vdev_child[c];
1685 
1686 		if (cvd->vdev_open_error != 0) {
1687 			lasterror = cvd->vdev_open_error;
1688 			numerrors++;
1689 			continue;
1690 		}
1691 
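		/*
		 * The caller (vdev_open()) is assumed to seed *asize and
		 * *max_asize with 0, so the unsigned wrap in the -1/+1
		 * arithmetic below lets the first healthy child establish
		 * the initial minimum.
		 */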
1692 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1693 		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1694 		*ashift = MAX(*ashift, cvd->vdev_ashift);
1695 	}
1696 
1697 	*asize *= vd->vdev_children;
1698 	*max_asize *= vd->vdev_children;
1699 
1700 	if (numerrors > nparity) {
1701 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1702 		return (lasterror);
1703 	}
1704 
1705 	return (0);
1706 }
1707 
1708 static void
1709 vdev_raidz_close(vdev_t *vd)
1710 {
1711 	int c;
1712 
1713 	for (c = 0; c < vd->vdev_children; c++)
1714 		vdev_close(vd->vdev_child[c]);
1715 }
1716 
1717 /*
1718  * Handle a read or write I/O to a RAID-Z dump device.
1719  *
1720  * The dump device is in a unique situation compared to other ZFS datasets:
1721  * writing to this device should be as simple and fast as possible.  In
1722  * addition, durability matters much less since the dump will be extracted
1723  * once the machine reboots.  For that reason, this function eschews parity for
1724  * performance and simplicity.  The dump device uses the checksum setting
1725  * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1726  * dataset.
1727  *
1728  * Blocks of size 128 KB have been preallocated for this volume.  I/Os less than
1729  * 128 KB will not fill an entire block; in addition, they may not be properly
1730  * aligned.  In that case, this function uses the preallocated 128 KB block and
1731  * omits reading or writing any "empty" portions of that block, as opposed to
1732  * allocating a fresh appropriately-sized block.
1733  *
1734  * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1735  *
1736  *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1737  *
1738  * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1739  * allocated which spans all five child vdevs.  8 KB of data would be written to
1740  * each of four vdevs, with the fifth containing the parity bits.
1741  *
1742  *       parity    data     data     data     data
1743  *     |   PP   |   XX   |   XX   |   XX   |   XX   |
1744  *         ^        ^        ^        ^        ^
1745  *         |        |        |        |        |
1746  *   8 KB parity    ------8 KB data blocks------
1747  *
1748  * However, when writing to the dump device, the behavior is different:
1749  *
1750  *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
1751  *
1752  * Unlike the normal RAID-Z case in which the block is allocated based on the
1753  * I/O size, reads and writes here always use a 128 KB logical I/O size.  If the
1754  * I/O size is less than 128 KB, only the actual portions of data are written.
1755  * In this example the data is written to the third data vdev since that vdev
1756  * contains the offset [64 KB, 96 KB).
1757  *
1758  *       parity    data     data     data     data
1759  *     |        |        |        |   XX   |        |
1760  *                                    ^
1761  *                                    |
1762  *                             32 KB data block
1763  *
1764  * As a result, an individual I/O may not span all child vdevs; moreover, a
1765  * small I/O may only operate on a single child vdev.
1766  *
1767  * Note that since there are no parity bits calculated or written, this format
1768  * remains the same no matter how many parity bits are used in a normal RAID-Z
1769  * stripe.  On a RAID-Z3 configuration with seven child vdevs, the example above
1770  * would look like:
1771  *
1772  *       parity   parity   parity    data     data     data     data
1773  *     |        |        |        |        |        |   XX   |        |
1774  *                                                      ^
1775  *                                                      |
1776  *                                               32 KB data block
1777  */
1778 int
1779 vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
1780     uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
1781 {
1782 	vdev_t *tvd = vd->vdev_top;
1783 	vdev_t *cvd;
1784 	raidz_map_t *rm;
1785 	raidz_col_t *rc;
1786 	int c, err = 0;
1787 
1788 	uint64_t start, end, colstart, colend;
1789 	uint64_t coloffset, colsize, colskip;
1790 
1791 	int flags = doread ? B_READ : B_WRITE;
1792 
1793 #ifdef	_KERNEL
1794 
1795 	/*
1796 	 * Don't write past the end of the block
1797 	 */
1798 	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
1799 
1800 	start = offset;
1801 	end = start + size;
1802 
1803 	/*
1804 	 * Allocate a RAID-Z map for this block.  Note that this block starts
1805 	 * from the "original" offset, that is, the offset of the extent which
1806 	 * contains the requisite offset of the data being read or written.
1807 	 *
1808 	 * Even if this I/O operation doesn't span the full block size, we
1809 	 * treat the on-disk format as if the only blocks that exist are
1810 	 * complete 128 KB blocks.
1811 	 */
1812 	abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
1813 	    SPA_OLD_MAXBLOCKSIZE);
1814 	rm = vdev_raidz_map_alloc(abd,
1815 	    SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
1816 	    vd->vdev_children, vd->vdev_nparity);
1817 
1818 	coloffset = origoffset;
1819 
1820 	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
1821 	    c++, coloffset += rc->rc_size) {
1822 		rc = &rm->rm_col[c];
1823 		cvd = vd->vdev_child[rc->rc_devidx];
1824 
1825 		/*
1826 		 * Find the start and end of this column in the RAID-Z map,
1827 		 * keeping in mind that the stated size and offset of the
1828 		 * operation may not fill the entire column for this vdev.
1829 		 *
1830 		 * If any portion of the data spans this column, issue the
1831 		 * appropriate operation to the vdev.
1832 		 */
1833 		if (coloffset + rc->rc_size <= start)
1834 			continue;
1835 		if (coloffset >= end)
1836 			continue;
1837 
1838 		colstart = MAX(coloffset, start);
1839 		colend = MIN(end, coloffset + rc->rc_size);
1840 		colsize = colend - colstart;
1841 		colskip = colstart - coloffset;
1842 
1843 		VERIFY3U(colsize, <=, rc->rc_size);
1844 		VERIFY3U(colskip, <=, rc->rc_size);
1845 
1846 		/*
1847 		 * Note that the child vdev will have a vdev label at the start
1848 		 * of its range of offsets, hence the need for
1849 		 * VDEV_LABEL_OFFSET().  See zio_vdev_child_io() for another
1850 		 * example of why this calculation is needed.
1851 		 */
1852 		if ((err = vdev_disk_physio(cvd,
1853 		    ((char *)abd_to_buf(rc->rc_abd)) + colskip, colsize,
1854 		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
1855 		    flags, isdump)) != 0)
1856 			break;
1857 	}
1858 
1859 	vdev_raidz_map_free(rm);
1860 	abd_put(abd);
1861 #endif	/* _KERNEL */
1862 
1863 	return (err);
1864 }
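
#if 0	/* Illustrative sketch -- not part of the driver. */
/*
 * A minimal model of the dump layout described above, assuming the
 * 128 KB block divides evenly among the data columns so that each data
 * column holds an equal, contiguous slice.  The helper and its
 * stand-alone form are hypothetical.
 */
#include <stdint.h>

#define	DUMP_BLOCKSIZE	(128 * 1024)	/* i.e. SPA_OLD_MAXBLOCKSIZE */

/* Which data column holds a given byte offset within the block. */
static int
dump_data_column(uint64_t offset, int children, int nparity)
{
	uint64_t colsize = DUMP_BLOCKSIZE / (children - nparity);

	return ((int)(offset / colsize));
}

/*
 * For the example above: dump_data_column(64 * 1024, 5, 1) == 2,
 * i.e. the third data vdev, matching the diagram.
 */
#endif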
1865 
1866 static uint64_t
1867 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1868 {
1869 	uint64_t asize;
1870 	uint64_t ashift = vd->vdev_top->vdev_ashift;
1871 	uint64_t cols = vd->vdev_children;
1872 	uint64_t nparity = vd->vdev_nparity;
1873 
1874 	asize = ((psize - 1) >> ashift) + 1;
1875 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1876 	asize = roundup(asize, nparity + 1) << ashift;
1877 
1878 	return (asize);
1879 }
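
/*
 * Worked example of the arithmetic above: a 32 KB block on the 5-wide
 * single-parity vdev pictured earlier, with ashift = 9:
 *
 *	asize = ((32768 - 1) >> 9) + 1	      = 64 data sectors
 *	asize += 1 * ((64 + 5 - 1 - 1) / 4)   = 64 + 16 = 80 sectors
 *	asize = roundup(80, 2) << 9	      = 40960 bytes (40 KB)
 *
 * matching the "at least 40 KB" figure in the dump-device comment above.
 */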
1880 
1881 static void
1882 vdev_raidz_child_done(zio_t *zio)
1883 {
1884 	raidz_col_t *rc = zio->io_private;
1885 
1886 	rc->rc_error = zio->io_error;
1887 	rc->rc_tried = 1;
1888 	rc->rc_skipped = 0;
1889 }
1890 
1891 static void
1892 vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col)
1893 {
1894 #ifdef ZFS_DEBUG
1895 	vdev_t *vd = zio->io_vd;
1896 	vdev_t *tvd = vd->vdev_top;
1897 
1898 	range_seg_t logical_rs, physical_rs;
1899 	logical_rs.rs_start = zio->io_offset;
1900 	logical_rs.rs_end = logical_rs.rs_start +
1901 	    vdev_raidz_asize(zio->io_vd, zio->io_size);
1902 
1903 	raidz_col_t *rc = &rm->rm_col[col];
1904 	vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
1905 
1906 	vdev_xlate(cvd, &logical_rs, &physical_rs);
1907 	ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
1908 	ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
1909 	/*
1910 	 * It would be nice to assert that rs_end is equal
1911 	 * to rc_offset + rc_size but there might be an
1912 	 * optional I/O at the end that is not accounted in
1913 	 * rc_size.
1914 	 */
1915 	if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
1916 		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
1917 		    rc->rc_size + (1 << tvd->vdev_ashift));
1918 	} else {
1919 		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
1920 	}
1921 #endif
1922 }
1923 
1924 /*
1925  * Start an I/O operation on a RAID-Z vdev
1926  *
1927  * Outline:
1928  * - For write operations:
1929  *   1. Generate the parity data
1930  *   2. Create child zio write operations to each column's vdev, for both
1931  *      data and parity.
1932  *   3. If the column skips any sectors for padding, create optional dummy
1933  *      write zio children for those areas to improve aggregation contiguity.
1934  * - For read operations:
1935  *   1. Create child zio read operations to each data column's vdev to read
1936  *      the range of data required for zio.
1937  *   2. If this is a scrub or resilver operation, or if any of the data
1938  *      vdevs have had errors, then create zio read operations to the parity
1939  *      columns' vdevs as well.
1940  */
1941 static void
1942 vdev_raidz_io_start(zio_t *zio)
1943 {
1944 	vdev_t *vd = zio->io_vd;
1945 	vdev_t *tvd = vd->vdev_top;
1946 	vdev_t *cvd;
1947 	raidz_map_t *rm;
1948 	raidz_col_t *rc;
1949 	int c, i;
1950 
1951 	rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
1952 	    tvd->vdev_ashift, vd->vdev_children,
1953 	    vd->vdev_nparity);
1954 
1955 	zio->io_vsd = rm;
1956 	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
1957 
1958 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1959 
1960 	if (zio->io_type == ZIO_TYPE_WRITE) {
1961 		vdev_raidz_generate_parity(rm);
1962 
1963 		for (c = 0; c < rm->rm_cols; c++) {
1964 			rc = &rm->rm_col[c];
1965 			cvd = vd->vdev_child[rc->rc_devidx];
1966 
1967 			/*
1968 			 * Verify physical to logical translation.
1969 			 */
1970 			vdev_raidz_io_verify(zio, rm, c);
1971 
1972 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1973 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
1974 			    zio->io_type, zio->io_priority, 0,
1975 			    vdev_raidz_child_done, rc));
1976 		}
1977 
1978 		/*
1979 		 * Generate optional I/Os for any skipped sectors to improve
1980 		 * aggregation contiguity.
1981 		 */
1982 		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1983 			ASSERT(c <= rm->rm_scols);
1984 			if (c == rm->rm_scols)
1985 				c = 0;
1986 			rc = &rm->rm_col[c];
1987 			cvd = vd->vdev_child[rc->rc_devidx];
1988 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1989 			    rc->rc_offset + rc->rc_size, NULL,
1990 			    1 << tvd->vdev_ashift,
1991 			    zio->io_type, zio->io_priority,
1992 			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1993 		}
1994 
1995 		zio_execute(zio);
1996 		return;
1997 	}
1998 
1999 	ASSERT(zio->io_type == ZIO_TYPE_READ);
2000 
2001 	/*
2002 	 * Iterate over the columns in reverse order so that we hit the parity
2003 	 * last -- any errors along the way will force us to read the parity.
2004 	 */
2005 	for (c = rm->rm_cols - 1; c >= 0; c--) {
2006 		rc = &rm->rm_col[c];
2007 		cvd = vd->vdev_child[rc->rc_devidx];
2008 		if (!vdev_readable(cvd)) {
2009 			if (c >= rm->rm_firstdatacol)
2010 				rm->rm_missingdata++;
2011 			else
2012 				rm->rm_missingparity++;
2013 			rc->rc_error = SET_ERROR(ENXIO);
2014 			rc->rc_tried = 1;	/* don't even try */
2015 			rc->rc_skipped = 1;
2016 			continue;
2017 		}
2018 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2019 			if (c >= rm->rm_firstdatacol)
2020 				rm->rm_missingdata++;
2021 			else
2022 				rm->rm_missingparity++;
2023 			rc->rc_error = SET_ERROR(ESTALE);
2024 			rc->rc_skipped = 1;
2025 			continue;
2026 		}
2027 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
2028 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
2029 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2030 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
2031 			    zio->io_type, zio->io_priority, 0,
2032 			    vdev_raidz_child_done, rc));
2033 		}
2034 	}
2035 
2036 	zio_execute(zio);
2037 }
2038 
2039 
2040 /*
2041  * Report a checksum error for a child of a RAID-Z device.
2042  */
2043 static void
2044 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
2045 {
2046 	void *buf;
2047 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
2048 
2049 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2050 		zio_bad_cksum_t zbc;
2051 		raidz_map_t *rm = zio->io_vsd;
2052 
2053 		mutex_enter(&vd->vdev_stat_lock);
2054 		vd->vdev_stat.vs_checksum_errors++;
2055 		mutex_exit(&vd->vdev_stat_lock);
2056 
2057 		zbc.zbc_has_cksum = 0;
2058 		zbc.zbc_injected = rm->rm_ecksuminjected;
2059 
2060 		buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size);
2061 		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
2062 		    rc->rc_offset, rc->rc_size, buf, bad_data,
2063 		    &zbc);
2064 		abd_return_buf(rc->rc_abd, buf, rc->rc_size);
2065 	}
2066 }
2067 
2068 /*
2069  * We keep track of whether or not there were any injected errors, so that
2070  * any ereports we generate can note it.
2071  */
2072 static int
2073 raidz_checksum_verify(zio_t *zio)
2074 {
2075 	zio_bad_cksum_t zbc;
2076 	raidz_map_t *rm = zio->io_vsd;
2077 
2078 	int ret = zio_checksum_error(zio, &zbc);
2079 	if (ret != 0 && zbc.zbc_injected != 0)
2080 		rm->rm_ecksuminjected = 1;
2081 
2082 	return (ret);
2083 }
2084 
2085 /*
2086  * Generate the parity from the data columns. If we tried and were able to
2087  * read the parity without error, verify that the generated parity matches the
2088  * data we read. If it doesn't, we fire off a checksum error. Return the
2089  * number of such failures.
2090  */
2091 static int
2092 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
2093 {
2094 	void *orig[VDEV_RAIDZ_MAXPARITY];
2095 	int c, ret = 0;
2096 	raidz_col_t *rc;
2097 
2098 	blkptr_t *bp = zio->io_bp;
2099 	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
2100 	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
2101 
2102 	if (checksum == ZIO_CHECKSUM_NOPARITY)
2103 		return (ret);
2104 
2105 	for (c = 0; c < rm->rm_firstdatacol; c++) {
2106 		rc = &rm->rm_col[c];
2107 		if (!rc->rc_tried || rc->rc_error != 0)
2108 			continue;
2109 		orig[c] = zio_buf_alloc(rc->rc_size);
2110 		abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size);
2111 	}
2112 
2113 	vdev_raidz_generate_parity(rm);
2114 
2115 	for (c = 0; c < rm->rm_firstdatacol; c++) {
2116 		rc = &rm->rm_col[c];
2117 		if (!rc->rc_tried || rc->rc_error != 0)
2118 			continue;
2119 		if (abd_cmp_buf(rc->rc_abd, orig[c], rc->rc_size) != 0) {
2120 			raidz_checksum_error(zio, rc, orig[c]);
2121 			rc->rc_error = SET_ERROR(ECKSUM);
2122 			ret++;
2123 		}
2124 		zio_buf_free(orig[c], rc->rc_size);
2125 	}
2126 
2127 	return (ret);
2128 }
2129 
2130 /*
2131  * Keep statistics on all the ways that we used parity to correct data.
2132  */
2133 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
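/*
 * The index is the code returned by vdev_raidz_reconstruct(): a bitmask
 * of the parity columns used for the repair, so with the usual P = 0,
 * Q = 1, R = 2 numbering, raidz_corrected[0x1] counts P-only repairs,
 * [0x2] Q-only, [0x3] P and Q together, and so on.
 */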
2134 
2135 static int
2136 vdev_raidz_worst_error(raidz_map_t *rm)
2137 {
2138 	int error = 0;
2139 
2140 	for (int c = 0; c < rm->rm_cols; c++)
2141 		error = zio_worst_error(error, rm->rm_col[c].rc_error);
2142 
2143 	return (error);
2144 }
2145 
2146 /*
2147  * Iterate over all combinations of bad data and attempt a reconstruction.
2148  * Note that the algorithm below is non-optimal because it doesn't take into
2149  * account how reconstruction is actually performed. For example, with
2150  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
2151  * is targeted as invalid as if columns 1 and 4 are targeted since in both
2152  * cases we'd only use parity information in column 0.
2153  */
2154 static int
2155 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
2156 {
2157 	raidz_map_t *rm = zio->io_vsd;
2158 	raidz_col_t *rc;
2159 	void *orig[VDEV_RAIDZ_MAXPARITY];
2160 	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
2161 	int *tgts = &tstore[1];
2162 	int current, next, i, c, n;
2163 	int code, ret = 0;
2164 
2165 	ASSERT(total_errors < rm->rm_firstdatacol);
2166 
2167 	/*
2168 	 * This simplifies one edge condition.
2169 	 */
2170 	tgts[-1] = -1;
2171 
2172 	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
2173 		/*
2174 		 * Initialize the targets array by finding the first n columns
2175 		 * that contain no error.
2176 		 *
2177 		 * If there were no data errors, we need to ensure that we're
2178 		 * always explicitly attempting to reconstruct at least one
2179 		 * data column. To do this, we simply push the highest target
2180 		 * up into the data columns.
2181 		 */
2182 		for (c = 0, i = 0; i < n; i++) {
2183 			if (i == n - 1 && data_errors == 0 &&
2184 			    c < rm->rm_firstdatacol) {
2185 				c = rm->rm_firstdatacol;
2186 			}
2187 
2188 			while (rm->rm_col[c].rc_error != 0) {
2189 				c++;
2190 				ASSERT3S(c, <, rm->rm_cols);
2191 			}
2192 
2193 			tgts[i] = c++;
2194 		}
2195 
2196 		/*
2197 		 * Setting tgts[n] simplifies the other edge condition.
2198 		 */
2199 		tgts[n] = rm->rm_cols;
2200 
2201 		/*
2202 		 * These buffers were allocated in previous iterations.
2203 		 */
2204 		for (i = 0; i < n - 1; i++) {
2205 			ASSERT(orig[i] != NULL);
2206 		}
2207 
2208 		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
2209 
2210 		current = 0;
2211 		next = tgts[current];
2212 
2213 		while (current != n) {
2214 			tgts[current] = next;
2215 			current = 0;
2216 
2217 			/*
2218 			 * Save off the original data that we're going to
2219 			 * attempt to reconstruct.
2220 			 */
2221 			for (i = 0; i < n; i++) {
2222 				ASSERT(orig[i] != NULL);
2223 				c = tgts[i];
2224 				ASSERT3S(c, >=, 0);
2225 				ASSERT3S(c, <, rm->rm_cols);
2226 				rc = &rm->rm_col[c];
2227 				abd_copy_to_buf(orig[i], rc->rc_abd,
2228 				    rc->rc_size);
2229 			}
2230 
2231 			/*
2232 			 * Attempt a reconstruction and exit the outer loop on
2233 			 * success.
2234 			 */
2235 			code = vdev_raidz_reconstruct(rm, tgts, n);
2236 			if (raidz_checksum_verify(zio) == 0) {
2237 				atomic_inc_64(&raidz_corrected[code]);
2238 
2239 				for (i = 0; i < n; i++) {
2240 					c = tgts[i];
2241 					rc = &rm->rm_col[c];
2242 					ASSERT(rc->rc_error == 0);
2243 					if (rc->rc_tried)
2244 						raidz_checksum_error(zio, rc,
2245 						    orig[i]);
2246 					rc->rc_error = SET_ERROR(ECKSUM);
2247 				}
2248 
2249 				ret = code;
2250 				goto done;
2251 			}
2252 
2253 			/*
2254 			 * Restore the original data.
2255 			 */
2256 			for (i = 0; i < n; i++) {
2257 				c = tgts[i];
2258 				rc = &rm->rm_col[c];
2259 				abd_copy_from_buf(rc->rc_abd, orig[i],
2260 				    rc->rc_size);
2261 			}
2262 
2263 			do {
2264 				/*
2265 				 * Find the next valid column after the current
2266 				 * position.
2267 				 */
2268 				for (next = tgts[current] + 1;
2269 				    next < rm->rm_cols &&
2270 				    rm->rm_col[next].rc_error != 0; next++)
2271 					continue;
2272 
2273 				ASSERT(next <= tgts[current + 1]);
2274 
2275 				/*
2276 				 * If that spot is available, we're done here.
2277 				 */
2278 				if (next != tgts[current + 1])
2279 					break;
2280 
2281 				/*
2282 				 * Otherwise, find the next valid column after
2283 				 * the previous position.
2284 				 */
2285 				for (c = tgts[current - 1] + 1;
2286 				    rm->rm_col[c].rc_error != 0; c++)
2287 					continue;
2288 
2289 				tgts[current] = c;
2290 				current++;
2291 
2292 			} while (current != n);
2293 		}
2294 	}
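	/*
	 * Undo the final increment of the outer loop so that we free
	 * exactly the orig[] buffers that were allocated.
	 */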
2295 	n--;
2296 done:
2297 	for (i = 0; i < n; i++) {
2298 		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
2299 	}
2300 
2301 	return (ret);
2302 }
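
#if 0	/* Illustrative sketch -- not part of the driver. */
/*
 * The tgts[] walk in vdev_raidz_combrec() above is an odometer over the
 * non-failed columns: the lowest slot advances first and carries into
 * the slots above it.  A hypothetical stand-alone version (with no
 * failed columns to skip) prints, for ncols = 4 and n = 2:
 *
 *	0,1  0,2  1,2  0,3  1,3  2,3
 */
#include <stdio.h>

int
main(void)
{
	int ncols = 4, n = 2;
	int tgts[4];
	int i;

	for (i = 0; i < n; i++)
		tgts[i] = i;		/* smallest combination */

	for (;;) {
		for (i = 0; i < n; i++)
			printf("%d%c", tgts[i], i == n - 1 ? '\n' : ',');

		/* advance the lowest slot, carrying on collision */
		for (i = 0; i < n - 1 && tgts[i] + 1 == tgts[i + 1]; i++)
			tgts[i] = i;	/* reset this slot, carry upward */
		if (i == n - 1 && tgts[i] + 1 == ncols)
			break;		/* odometer rolled over: done */
		tgts[i]++;
	}
	return (0);
}
#endif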
2303 
2304 /*
2305  * Complete an I/O operation on a RAID-Z vdev
2306  *
2307  * Outline:
2308  * - For write operations:
2309  *   1. Check for errors on the child I/Os.
2310  *   2. Return, setting an error code if too few child vdevs were written
2311  *      to reconstruct the data later.  Note that partial writes are
2312  *      considered successful if they can be reconstructed at all.
2313  * - For read operations:
2314  *   1. Check for errors on the child I/Os.
2315  *   2. If data errors occurred:
2316  *      a. Try to reassemble the data from the parity available.
2317  *      b. If we haven't yet read the parity drives, read them now.
2318  *      c. If all parity drives have been read but the data still doesn't
2319  *         reassemble with a correct checksum, then try combinatorial
2320  *         reconstruction.
2321  *      d. If that doesn't work, return an error.
2322  *   3. If there were unexpected errors or this is a resilver operation,
2323  *      rewrite the vdevs that had errors.
2324  */
2325 static void
2326 vdev_raidz_io_done(zio_t *zio)
2327 {
2328 	vdev_t *vd = zio->io_vd;
2329 	vdev_t *cvd;
2330 	raidz_map_t *rm = zio->io_vsd;
2331 	raidz_col_t *rc;
2332 	int unexpected_errors = 0;
2333 	int parity_errors = 0;
2334 	int parity_untried = 0;
2335 	int data_errors = 0;
2336 	int total_errors = 0;
2337 	int n, c;
2338 	int tgts[VDEV_RAIDZ_MAXPARITY];
2339 	int code;
2340 
2341 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
2342 
2343 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
2344 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
2345 
2346 	for (c = 0; c < rm->rm_cols; c++) {
2347 		rc = &rm->rm_col[c];
2348 
2349 		if (rc->rc_error) {
2350 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
2351 
2352 			if (c < rm->rm_firstdatacol)
2353 				parity_errors++;
2354 			else
2355 				data_errors++;
2356 
2357 			if (!rc->rc_skipped)
2358 				unexpected_errors++;
2359 
2360 			total_errors++;
2361 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
2362 			parity_untried++;
2363 		}
2364 	}
2365 
2366 	if (zio->io_type == ZIO_TYPE_WRITE) {
2367 		/*
2368 		 * XXX -- for now, treat partial writes as a success.
2369 		 * (If we couldn't write enough columns to reconstruct
2370 		 * the data, the I/O failed.  Otherwise, good enough.)
2371 		 *
2372 		 * Now that we support write reallocation, it would be better
2373 		 * to treat partial failure as real failure unless there are
2374 		 * no non-degraded top-level vdevs left, and not update DTLs
2375 		 * if we intend to reallocate.
2376 		 */
2377 		/* XXPOLICY */
2378 		if (total_errors > rm->rm_firstdatacol)
2379 			zio->io_error = vdev_raidz_worst_error(rm);
2380 
2381 		return;
2382 	}
2383 
2384 	ASSERT(zio->io_type == ZIO_TYPE_READ);
2385 	/*
2386 	 * There are three potential phases for a read:
2387 	 *	1. produce valid data from the columns read
2388 	 *	2. read all disks and try again
2389 	 *	3. perform combinatorial reconstruction
2390 	 *
2391 	 * Each phase is progressively both more expensive and less likely to
2392 	 * occur. If we encounter more errors than we can repair or all phases
2393 	 * fail, we have no choice but to return an error.
2394 	 */
2395 
2396 	/*
2397 	 * If the number of errors we saw was correctable -- less than or equal
2398 	 * to the number of parity disks read -- attempt to produce data that
2399 	 * has a valid checksum. Naturally, this case applies in the absence of
2400 	 * any errors.
2401 	 */
2402 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
2403 		if (data_errors == 0) {
2404 			if (raidz_checksum_verify(zio) == 0) {
2405 				/*
2406 				 * If we read parity information (unnecessarily
2407 				 * as it happens since no reconstruction was
2408 				 * needed) regenerate and verify the parity.
2409 				 * We also regenerate parity when resilvering
2410 				 * so we can write it out to the failed device
2411 				 * later.
2412 				 */
2413 				if (parity_errors + parity_untried <
2414 				    rm->rm_firstdatacol ||
2415 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2416 					n = raidz_parity_verify(zio, rm);
2417 					unexpected_errors += n;
2418 					ASSERT(parity_errors + n <=
2419 					    rm->rm_firstdatacol);
2420 				}
2421 				goto done;
2422 			}
2423 		} else {
2424 			/*
2425 			 * We either attempt to read all the parity columns or
2426 			 * none of them. If we didn't try to read parity, we
2427 			 * wouldn't be here in the correctable case. There must
2428 			 * also have been fewer parity errors than parity
2429 			 * columns or, again, we wouldn't be in this code path.
2430 			 */
2431 			ASSERT(parity_untried == 0);
2432 			ASSERT(parity_errors < rm->rm_firstdatacol);
2433 
2434 			/*
2435 			 * Identify the data columns that reported an error.
2436 			 */
2437 			n = 0;
2438 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
2439 				rc = &rm->rm_col[c];
2440 				if (rc->rc_error != 0) {
2441 					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2442 					tgts[n++] = c;
2443 				}
2444 			}
2445 
2446 			ASSERT(rm->rm_firstdatacol >= n);
2447 
2448 			code = vdev_raidz_reconstruct(rm, tgts, n);
2449 
2450 			if (raidz_checksum_verify(zio) == 0) {
2451 				atomic_inc_64(&raidz_corrected[code]);
2452 
2453 				/*
2454 				 * If we read more parity disks than were used
2455 				 * for reconstruction, confirm that the other
2456 				 * parity disks produced correct data. This
2457 				 * routine is suboptimal in that it regenerates
2458 				 * the parity that we already used in addition
2459 				 * to the parity that we're attempting to
2460 				 * verify, but this should be a relatively
2461 				 * uncommon case, and can be optimized if it
2462 				 * becomes a problem. Note that we regenerate
2463 				 * parity when resilvering so we can write it
2464 				 * out to failed devices later.
2465 				 */
2466 				if (parity_errors < rm->rm_firstdatacol - n ||
2467 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2468 					n = raidz_parity_verify(zio, rm);
2469 					unexpected_errors += n;
2470 					ASSERT(parity_errors + n <=
2471 					    rm->rm_firstdatacol);
2472 				}
2473 
2474 				goto done;
2475 			}
2476 		}
2477 	}
2478 
2479 	/*
2480 	 * This isn't a typical situation -- either we got a read error or
2481 	 * a child silently returned bad data. Read every block so we can
2482 	 * try again with as much data and parity as we can track down. If
2483 	 * we've already been through once before, all children will be marked
2484 	 * as tried so we'll proceed to combinatorial reconstruction.
2485 	 */
2486 	unexpected_errors = 1;
2487 	rm->rm_missingdata = 0;
2488 	rm->rm_missingparity = 0;
2489 
2490 	for (c = 0; c < rm->rm_cols; c++) {
2491 		if (rm->rm_col[c].rc_tried)
2492 			continue;
2493 
2494 		zio_vdev_io_redone(zio);
2495 		do {
2496 			rc = &rm->rm_col[c];
2497 			if (rc->rc_tried)
2498 				continue;
2499 			zio_nowait(zio_vdev_child_io(zio, NULL,
2500 			    vd->vdev_child[rc->rc_devidx],
2501 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
2502 			    zio->io_type, zio->io_priority, 0,
2503 			    vdev_raidz_child_done, rc));
2504 		} while (++c < rm->rm_cols);
2505 
2506 		return;
2507 	}
2508 
2509 	/*
2510 	 * At this point we've attempted to reconstruct the data given the
2511 	 * errors we detected, and we've attempted to read all columns. There
2512 	 * must, therefore, be one or more additional problems -- silent errors
2513 	 * resulting in invalid data rather than explicit I/O errors resulting
2514 	 * in absent data. We check if there is enough additional data to
2515 	 * possibly reconstruct the data and then perform combinatorial
2516 	 * reconstruction over all possible combinations. If that fails,
2517 	 * we're cooked.
2518 	 */
2519 	if (total_errors > rm->rm_firstdatacol) {
2520 		zio->io_error = vdev_raidz_worst_error(rm);
2521 
2522 	} else if (total_errors < rm->rm_firstdatacol &&
2523 	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2524 		/*
2525 		 * If we didn't use all the available parity for the
2526 		 * combinatorial reconstruction, verify that the remaining
2527 		 * parity is correct.
2528 		 */
2529 		if (code != (1 << rm->rm_firstdatacol) - 1)
2530 			(void) raidz_parity_verify(zio, rm);
2531 	} else {
2532 		/*
2533 		 * We're here because either:
2534 		 *
2535 		 *	total_errors == rm_firstdatacol, or
2536 		 *	vdev_raidz_combrec() failed
2537 		 *
2538 		 * In either case, there is enough bad data to prevent
2539 		 * reconstruction.
2540 		 *
2541 		 * Start checksum ereports for all children which haven't
2542 		 * failed, and the IO wasn't speculative.
2543 		 */
2544 		zio->io_error = SET_ERROR(ECKSUM);
2545 
2546 		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2547 			for (c = 0; c < rm->rm_cols; c++) {
2548 				rc = &rm->rm_col[c];
2549 				if (rc->rc_error == 0) {
2550 					zio_bad_cksum_t zbc;
2551 					zbc.zbc_has_cksum = 0;
2552 					zbc.zbc_injected =
2553 					    rm->rm_ecksuminjected;
2554 
2555 					zfs_ereport_start_checksum(
2556 					    zio->io_spa,
2557 					    vd->vdev_child[rc->rc_devidx],
2558 					    zio, rc->rc_offset, rc->rc_size,
2559 					    (void *)(uintptr_t)c, &zbc);
2560 				}
2561 			}
2562 		}
2563 	}
2564 
2565 done:
2566 	zio_checksum_verified(zio);
2567 
2568 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2569 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2570 		/*
2571 		 * Use the good data we have in hand to repair damaged children.
2572 		 */
2573 		for (c = 0; c < rm->rm_cols; c++) {
2574 			rc = &rm->rm_col[c];
2575 			cvd = vd->vdev_child[rc->rc_devidx];
2576 
2577 			if (rc->rc_error == 0)
2578 				continue;
2579 
2580 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2581 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
2582 			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2583 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2584 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2585 		}
2586 	}
2587 }
2588 
2589 static void
2590 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2591 {
2592 	if (faulted > vd->vdev_nparity)
2593 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2594 		    VDEV_AUX_NO_REPLICAS);
2595 	else if (degraded + faulted != 0)
2596 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2597 	else
2598 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2599 }
2600 
2601 /*
2602  * Determine if any portion of the provided block resides on a child vdev
2603  * with a dirty DTL and therefore needs to be resilvered.  The function
2604  * assumes that at least one DTL is dirty, which implies that full stripe
2605  * width blocks must be resilvered.
2606  */
2607 static boolean_t
2608 vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
2609 {
2610 	uint64_t dcols = vd->vdev_children;
2611 	uint64_t nparity = vd->vdev_nparity;
2612 	uint64_t ashift = vd->vdev_top->vdev_ashift;
2613 	/* The starting RAIDZ (parent) vdev sector of the block. */
2614 	uint64_t b = offset >> ashift;
2615 	/* The zio's size in units of the vdev's minimum sector size. */
2616 	uint64_t s = ((psize - 1) >> ashift) + 1;
2617 	/* The first column for this stripe. */
2618 	uint64_t f = b % dcols;
2619 
2620 	if (s + nparity >= dcols)
2621 		return (B_TRUE);
2622 
2623 	for (uint64_t c = 0; c < s + nparity; c++) {
2624 		uint64_t devidx = (f + c) % dcols;
2625 		vdev_t *cvd = vd->vdev_child[devidx];
2626 
2627 		/*
2628 		 * dsl_scan_need_resilver() already checked vd with
2629 		 * vdev_dtl_contains().  So here we just check cvd with
2630 		 * vdev_dtl_empty(), which is cheaper and a good approximation.
2631 		 */
2632 		if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
2633 			return (B_TRUE);
2634 	}
2635 
2636 	return (B_FALSE);
2637 }
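
/*
 * Example of the mapping above: with 6 children, single parity and
 * ashift = 12, a 16 KB block at offset 1 MB gives b = 256, s = 4 and
 * f = 256 % 6 = 4, so the five children at devidx 4, 5, 0, 1 and 2
 * are checked for a non-empty DTL.
 */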
2638 
2639 static void
2640 vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res)
2641 {
2642 	vdev_t *raidvd = cvd->vdev_parent;
2643 	ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
2644 
2645 	uint64_t width = raidvd->vdev_children;
2646 	uint64_t tgt_col = cvd->vdev_id;
2647 	uint64_t ashift = raidvd->vdev_top->vdev_ashift;
2648 
2649 	/* make sure the offsets are block-aligned */
2650 	ASSERT0(in->rs_start % (1 << ashift));
2651 	ASSERT0(in->rs_end % (1 << ashift));
2652 	uint64_t b_start = in->rs_start >> ashift;
2653 	uint64_t b_end = in->rs_end >> ashift;
2654 
2655 	uint64_t start_row = 0;
2656 	if (b_start > tgt_col) /* avoid underflow */
2657 		start_row = ((b_start - tgt_col - 1) / width) + 1;
2658 
2659 	uint64_t end_row = 0;
2660 	if (b_end > tgt_col)
2661 		end_row = ((b_end - tgt_col - 1) / width) + 1;
2662 
2663 	res->rs_start = start_row << ashift;
2664 	res->rs_end = end_row << ashift;
2665 
2666 	ASSERT3U(res->rs_start, <=, in->rs_start);
2667 	ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start);
2668 }
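
/*
 * For example, translating the parent range [0, 4096) with ashift = 9
 * for the child at column 1 of a 4-wide raidz: b_start = 0 and
 * b_end = 8, so start_row = 0 and end_row = ((8 - 1 - 1) / 4) + 1 = 2,
 * giving the child range [0, 1024) -- two sectors, matching the two
 * parent sectors (1 and 5) that column 1 holds in [0, 8).
 */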
2669 
2670 vdev_ops_t vdev_raidz_ops = {
2671 	.vdev_op_open = vdev_raidz_open,
2672 	.vdev_op_close = vdev_raidz_close,
2673 	.vdev_op_asize = vdev_raidz_asize,
2674 	.vdev_op_io_start = vdev_raidz_io_start,
2675 	.vdev_op_io_done = vdev_raidz_io_done,
2676 	.vdev_op_state_change = vdev_raidz_state_change,
2677 	.vdev_op_need_resilver = vdev_raidz_need_resilver,
2678 	.vdev_op_hold = NULL,
2679 	.vdev_op_rele = NULL,
2680 	.vdev_op_remap = NULL,
2681 	.vdev_op_xlate = vdev_raidz_xlate,
2682 	.vdev_op_type = VDEV_TYPE_RAIDZ,	/* name of this vdev type */
2683 	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
2684 };
2685