xref: /titanic_44/usr/src/uts/common/fs/zfs/vdev_raidz.c (revision 1db2880b3a411e3c56e50c7dc42d3b137fcc4e48)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
25  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
26  */
27 
28 #include <sys/zfs_context.h>
29 #include <sys/spa.h>
30 #include <sys/vdev_impl.h>
31 #include <sys/vdev_disk.h>
32 #include <sys/vdev_file.h>
33 #include <sys/vdev_raidz.h>
34 #include <sys/zio.h>
35 #include <sys/zio_checksum.h>
36 #include <sys/fs/zfs.h>
37 #include <sys/fm/fs/zfs.h>
38 
39 /*
40  * Virtual device vector for RAID-Z.
41  *
42  * This vdev supports single, double, and triple parity. For single parity,
43  * we use a simple XOR of all the data columns. For double or triple parity,
44  * we use a special case of Reed-Solomon coding. This extends the
45  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
46  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
47  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
48  * former is also based. The latter is designed to provide higher performance
49  * for writes.
50  *
51  * Note that the Plank paper claimed to support arbitrary N+M, but was then
52  * amended six years later identifying a critical flaw that invalidates its
53  * claims. Nevertheless, the technique can be adapted to work for up to
54  * triple parity. For additional parity, the amendment "Note: Correction to
55  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
56  * is viable, but the additional complexity means that write performance will
57  * suffer.
58  *
59  * All of the methods above operate on a Galois field, defined over the
60  * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
61  * can be expressed with a single byte. Briefly, the operations on the
62  * field are defined as follows:
63  *
64  *   o addition (+) is represented by a bitwise XOR
65  *   o subtraction (-) is therefore identical to addition: A + B = A - B
66  *   o multiplication of A by 2 is defined by the following bitwise expression:
67  *
68  *	(A * 2)_7 = A_6
69  *	(A * 2)_6 = A_5
70  *	(A * 2)_5 = A_4
71  *	(A * 2)_4 = A_3 + A_7
72  *	(A * 2)_3 = A_2 + A_7
73  *	(A * 2)_2 = A_1 + A_7
74  *	(A * 2)_1 = A_0
75  *	(A * 2)_0 = A_7
76  *
77  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
78  * As an aside, this multiplication is derived from the error correcting
79  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
80  *
81  * Observe that any number in the field (except for 0) can be expressed as a
82  * power of 2 -- a generator for the field. We store a table of the powers of
83  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
84  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
85  * than field addition). The inverse of a field element A (A^-1) is therefore
86  * A ^ (255 - 1) = A^254.
87  *
88  * The up-to-three parity columns, P, Q, R over several data columns,
89  * D_0, ... D_n-1, can be expressed by field operations:
90  *
91  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
92  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
93  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
94  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
95  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
96  *
97  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival
98  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
99  * independent coefficients. (There are no additional coefficients that have
100  * this property which is why the uncorrected Plank method breaks down.)
101  *
102  * See the reconstruction code below for how P, Q and R can used individually
103  * or in concert to recover missing data columns.
104  */
105 
106 typedef struct raidz_col {
107 	uint64_t rc_devidx;		/* child device index for I/O */
108 	uint64_t rc_offset;		/* device offset */
109 	uint64_t rc_size;		/* I/O size */
110 	void *rc_data;			/* I/O data */
111 	void *rc_gdata;			/* used to store the "good" version */
112 	int rc_error;			/* I/O error for this device */
113 	uint8_t rc_tried;		/* Did we attempt this I/O column? */
114 	uint8_t rc_skipped;		/* Did we skip this I/O column? */
115 } raidz_col_t;
116 
117 typedef struct raidz_map {
118 	uint64_t rm_cols;		/* Regular column count */
119 	uint64_t rm_scols;		/* Count including skipped columns */
120 	uint64_t rm_bigcols;		/* Number of oversized columns */
121 	uint64_t rm_asize;		/* Actual total I/O size */
122 	uint64_t rm_missingdata;	/* Count of missing data devices */
123 	uint64_t rm_missingparity;	/* Count of missing parity devices */
124 	uint64_t rm_firstdatacol;	/* First data column/parity count */
125 	uint64_t rm_nskip;		/* Skipped sectors for padding */
126 	uint64_t rm_skipstart;		/* Column index of padding start */
127 	void *rm_datacopy;		/* rm_asize-buffer of copied data */
128 	uintptr_t rm_reports;		/* # of referencing checksum reports */
129 	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
130 	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
131 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
132 } raidz_map_t;
133 
134 #define	VDEV_RAIDZ_P		0
135 #define	VDEV_RAIDZ_Q		1
136 #define	VDEV_RAIDZ_R		2
137 
138 #define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
139 #define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
140 
141 /*
142  * We provide a mechanism to perform the field multiplication operation on a
143  * 64-bit value all at once rather than a byte at a time. This works by
144  * creating a mask from the top bit in each byte and using that to
145  * conditionally apply the XOR of 0x1d.
146  */
147 #define	VDEV_RAIDZ_64MUL_2(x, mask) \
148 { \
149 	(mask) = (x) & 0x8080808080808080ULL; \
150 	(mask) = ((mask) << 1) - ((mask) >> 7); \
151 	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
152 	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
153 }
154 
155 #define	VDEV_RAIDZ_64MUL_4(x, mask) \
156 { \
157 	VDEV_RAIDZ_64MUL_2((x), mask); \
158 	VDEV_RAIDZ_64MUL_2((x), mask); \
159 }
160 
161 #define	VDEV_LABEL_OFFSET(x)	(x + VDEV_LABEL_START_SIZE)
162 
163 /*
164  * Force reconstruction to use the general purpose method.
165  */
166 int vdev_raidz_default_to_general;
167 
168 /* Powers of 2 in the Galois field defined above. */
169 static const uint8_t vdev_raidz_pow2[256] = {
170 	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
171 	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
172 	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
173 	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
174 	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
175 	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
176 	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
177 	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
178 	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
179 	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
180 	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
181 	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
182 	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
183 	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
184 	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
185 	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
186 	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
187 	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
188 	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
189 	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
190 	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
191 	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
192 	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
193 	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
194 	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
195 	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
196 	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
197 	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
198 	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
199 	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
200 	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
201 	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
202 };
203 /* Logs of 2 in the Galois field defined above. */
204 static const uint8_t vdev_raidz_log2[256] = {
205 	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
206 	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
207 	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
208 	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
209 	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
210 	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
211 	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
212 	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
213 	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
214 	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
215 	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
216 	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
217 	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
218 	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
219 	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
220 	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
221 	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
222 	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
223 	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
224 	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
225 	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
226 	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
227 	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
228 	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
229 	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
230 	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
231 	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
232 	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
233 	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
234 	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
235 	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
236 	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
237 };
238 
239 static void vdev_raidz_generate_parity(raidz_map_t *rm);
240 
241 /*
242  * Multiply a given number by 2 raised to the given power.
243  */
244 static uint8_t
245 vdev_raidz_exp2(uint_t a, int exp)
246 {
247 	if (a == 0)
248 		return (0);
249 
250 	ASSERT(exp >= 0);
251 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
252 
253 	exp += vdev_raidz_log2[a];
254 	if (exp > 255)
255 		exp -= 255;
256 
257 	return (vdev_raidz_pow2[exp]);
258 }
259 
260 static void
261 vdev_raidz_map_free(raidz_map_t *rm)
262 {
263 	int c;
264 	size_t size;
265 
266 	for (c = 0; c < rm->rm_firstdatacol; c++) {
267 		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
268 
269 		if (rm->rm_col[c].rc_gdata != NULL)
270 			zio_buf_free(rm->rm_col[c].rc_gdata,
271 			    rm->rm_col[c].rc_size);
272 	}
273 
274 	size = 0;
275 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
276 		size += rm->rm_col[c].rc_size;
277 
278 	if (rm->rm_datacopy != NULL)
279 		zio_buf_free(rm->rm_datacopy, size);
280 
281 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
282 }
283 
284 static void
285 vdev_raidz_map_free_vsd(zio_t *zio)
286 {
287 	raidz_map_t *rm = zio->io_vsd;
288 
289 	ASSERT0(rm->rm_freed);
290 	rm->rm_freed = 1;
291 
292 	if (rm->rm_reports == 0)
293 		vdev_raidz_map_free(rm);
294 }
295 
296 /*ARGSUSED*/
297 static void
298 vdev_raidz_cksum_free(void *arg, size_t ignored)
299 {
300 	raidz_map_t *rm = arg;
301 
302 	ASSERT3U(rm->rm_reports, >, 0);
303 
304 	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
305 		vdev_raidz_map_free(rm);
306 }
307 
308 static void
309 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
310 {
311 	raidz_map_t *rm = zcr->zcr_cbdata;
312 	size_t c = zcr->zcr_cbinfo;
313 	size_t x;
314 
315 	const char *good = NULL;
316 	const char *bad = rm->rm_col[c].rc_data;
317 
318 	if (good_data == NULL) {
319 		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
320 		return;
321 	}
322 
323 	if (c < rm->rm_firstdatacol) {
324 		/*
325 		 * The first time through, calculate the parity blocks for
326 		 * the good data (this relies on the fact that the good
327 		 * data never changes for a given logical ZIO)
328 		 */
329 		if (rm->rm_col[0].rc_gdata == NULL) {
330 			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
331 			char *buf;
332 
333 			/*
334 			 * Set up the rm_col[]s to generate the parity for
335 			 * good_data, first saving the parity bufs and
336 			 * replacing them with buffers to hold the result.
337 			 */
338 			for (x = 0; x < rm->rm_firstdatacol; x++) {
339 				bad_parity[x] = rm->rm_col[x].rc_data;
340 				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
341 				    zio_buf_alloc(rm->rm_col[x].rc_size);
342 			}
343 
344 			/* fill in the data columns from good_data */
345 			buf = (char *)good_data;
346 			for (; x < rm->rm_cols; x++) {
347 				rm->rm_col[x].rc_data = buf;
348 				buf += rm->rm_col[x].rc_size;
349 			}
350 
351 			/*
352 			 * Construct the parity from the good data.
353 			 */
354 			vdev_raidz_generate_parity(rm);
355 
356 			/* restore everything back to its original state */
357 			for (x = 0; x < rm->rm_firstdatacol; x++)
358 				rm->rm_col[x].rc_data = bad_parity[x];
359 
360 			buf = rm->rm_datacopy;
361 			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
362 				rm->rm_col[x].rc_data = buf;
363 				buf += rm->rm_col[x].rc_size;
364 			}
365 		}
366 
367 		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
368 		good = rm->rm_col[c].rc_gdata;
369 	} else {
370 		/* adjust good_data to point at the start of our column */
371 		good = good_data;
372 
373 		for (x = rm->rm_firstdatacol; x < c; x++)
374 			good += rm->rm_col[x].rc_size;
375 	}
376 
377 	/* we drop the ereport if it ends up that the data was good */
378 	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
379 }
380 
381 /*
382  * Invoked indirectly by zfs_ereport_start_checksum(), called
383  * below when our read operation fails completely.  The main point
384  * is to keep a copy of everything we read from disk, so that at
385  * vdev_raidz_cksum_finish() time we can compare it with the good data.
386  */
387 static void
388 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
389 {
390 	size_t c = (size_t)(uintptr_t)arg;
391 	caddr_t buf;
392 
393 	raidz_map_t *rm = zio->io_vsd;
394 	size_t size;
395 
396 	/* set up the report and bump the refcount  */
397 	zcr->zcr_cbdata = rm;
398 	zcr->zcr_cbinfo = c;
399 	zcr->zcr_finish = vdev_raidz_cksum_finish;
400 	zcr->zcr_free = vdev_raidz_cksum_free;
401 
402 	rm->rm_reports++;
403 	ASSERT3U(rm->rm_reports, >, 0);
404 
405 	if (rm->rm_datacopy != NULL)
406 		return;
407 
408 	/*
409 	 * It's the first time we're called for this raidz_map_t, so we need
410 	 * to copy the data aside; there's no guarantee that our zio's buffer
411 	 * won't be re-used for something else.
412 	 *
413 	 * Our parity data is already in separate buffers, so there's no need
414 	 * to copy them.
415 	 */
416 
417 	size = 0;
418 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
419 		size += rm->rm_col[c].rc_size;
420 
421 	buf = rm->rm_datacopy = zio_buf_alloc(size);
422 
423 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
424 		raidz_col_t *col = &rm->rm_col[c];
425 
426 		bcopy(col->rc_data, buf, col->rc_size);
427 		col->rc_data = buf;
428 
429 		buf += col->rc_size;
430 	}
431 	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
432 }
433 
434 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
435 	vdev_raidz_map_free_vsd,
436 	vdev_raidz_cksum_report
437 };
438 
439 /*
440  * Divides the IO evenly across all child vdevs; usually, dcols is
441  * the number of children in the target vdev.
442  */
443 static raidz_map_t *
444 vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset,
445     uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
446 {
447 	raidz_map_t *rm;
448 	/* The starting RAIDZ (parent) vdev sector of the block. */
449 	uint64_t b = offset >> unit_shift;
450 	/* The zio's size in units of the vdev's minimum sector size. */
451 	uint64_t s = size >> unit_shift;
452 	/* The first column for this stripe. */
453 	uint64_t f = b % dcols;
454 	/* The starting byte offset on each child vdev. */
455 	uint64_t o = (b / dcols) << unit_shift;
456 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
457 
458 	/*
459 	 * "Quotient": The number of data sectors for this stripe on all but
460 	 * the "big column" child vdevs that also contain "remainder" data.
461 	 */
462 	q = s / (dcols - nparity);
463 
464 	/*
465 	 * "Remainder": The number of partial stripe data sectors in this I/O.
466 	 * This will add a sector to some, but not all, child vdevs.
467 	 */
468 	r = s - q * (dcols - nparity);
469 
470 	/* The number of "big columns" - those which contain remainder data. */
471 	bc = (r == 0 ? 0 : r + nparity);
472 
473 	/*
474 	 * The total number of data and parity sectors associated with
475 	 * this I/O.
476 	 */
477 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
478 
479 	/* acols: The columns that will be accessed. */
480 	/* scols: The columns that will be accessed or skipped. */
481 	if (q == 0) {
482 		/* Our I/O request doesn't span all child vdevs. */
483 		acols = bc;
484 		scols = MIN(dcols, roundup(bc, nparity + 1));
485 	} else {
486 		acols = dcols;
487 		scols = dcols;
488 	}
489 
490 	ASSERT3U(acols, <=, scols);
491 
492 	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
493 
494 	rm->rm_cols = acols;
495 	rm->rm_scols = scols;
496 	rm->rm_bigcols = bc;
497 	rm->rm_skipstart = bc;
498 	rm->rm_missingdata = 0;
499 	rm->rm_missingparity = 0;
500 	rm->rm_firstdatacol = nparity;
501 	rm->rm_datacopy = NULL;
502 	rm->rm_reports = 0;
503 	rm->rm_freed = 0;
504 	rm->rm_ecksuminjected = 0;
505 
506 	asize = 0;
507 
508 	for (c = 0; c < scols; c++) {
509 		col = f + c;
510 		coff = o;
511 		if (col >= dcols) {
512 			col -= dcols;
513 			coff += 1ULL << unit_shift;
514 		}
515 		rm->rm_col[c].rc_devidx = col;
516 		rm->rm_col[c].rc_offset = coff;
517 		rm->rm_col[c].rc_data = NULL;
518 		rm->rm_col[c].rc_gdata = NULL;
519 		rm->rm_col[c].rc_error = 0;
520 		rm->rm_col[c].rc_tried = 0;
521 		rm->rm_col[c].rc_skipped = 0;
522 
523 		if (c >= acols)
524 			rm->rm_col[c].rc_size = 0;
525 		else if (c < bc)
526 			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
527 		else
528 			rm->rm_col[c].rc_size = q << unit_shift;
529 
530 		asize += rm->rm_col[c].rc_size;
531 	}
532 
533 	ASSERT3U(asize, ==, tot << unit_shift);
534 	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
535 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
536 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
537 	ASSERT3U(rm->rm_nskip, <=, nparity);
538 
539 	for (c = 0; c < rm->rm_firstdatacol; c++)
540 		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
541 
542 	rm->rm_col[c].rc_data = data;
543 
544 	for (c = c + 1; c < acols; c++)
545 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
546 		    rm->rm_col[c - 1].rc_size;
547 
548 	/*
549 	 * If all data stored spans all columns, there's a danger that parity
550 	 * will always be on the same device and, since parity isn't read
551 	 * during normal operation, that that device's I/O bandwidth won't be
552 	 * used effectively. We therefore switch the parity every 1MB.
553 	 *
554 	 * ... at least that was, ostensibly, the theory. As a practical
555 	 * matter unless we juggle the parity between all devices evenly, we
556 	 * won't see any benefit. Further, occasional writes that aren't a
557 	 * multiple of the LCM of the number of children and the minimum
558 	 * stripe width are sufficient to avoid pessimal behavior.
559 	 * Unfortunately, this decision created an implicit on-disk format
560 	 * requirement that we need to support for all eternity, but only
561 	 * for single-parity RAID-Z.
562 	 *
563 	 * If we intend to skip a sector in the zeroth column for padding
564 	 * we must make sure to note this swap. We will never intend to
565 	 * skip the first column since at least one data and one parity
566 	 * column must appear in each row.
567 	 */
568 	ASSERT(rm->rm_cols >= 2);
569 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
570 
571 	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
572 		devidx = rm->rm_col[0].rc_devidx;
573 		o = rm->rm_col[0].rc_offset;
574 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
575 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
576 		rm->rm_col[1].rc_devidx = devidx;
577 		rm->rm_col[1].rc_offset = o;
578 
579 		if (rm->rm_skipstart == 0)
580 			rm->rm_skipstart = 1;
581 	}
582 
583 	return (rm);
584 }
585 
586 static void
587 vdev_raidz_generate_parity_p(raidz_map_t *rm)
588 {
589 	uint64_t *p, *src, pcount, ccount, i;
590 	int c;
591 
592 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
593 
594 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
595 		src = rm->rm_col[c].rc_data;
596 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
597 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
598 
599 		if (c == rm->rm_firstdatacol) {
600 			ASSERT(ccount == pcount);
601 			for (i = 0; i < ccount; i++, src++, p++) {
602 				*p = *src;
603 			}
604 		} else {
605 			ASSERT(ccount <= pcount);
606 			for (i = 0; i < ccount; i++, src++, p++) {
607 				*p ^= *src;
608 			}
609 		}
610 	}
611 }
612 
613 static void
614 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
615 {
616 	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
617 	int c;
618 
619 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
620 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
621 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
622 
623 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
624 		src = rm->rm_col[c].rc_data;
625 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
626 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
627 
628 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
629 
630 		if (c == rm->rm_firstdatacol) {
631 			ASSERT(ccnt == pcnt || ccnt == 0);
632 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
633 				*p = *src;
634 				*q = *src;
635 			}
636 			for (; i < pcnt; i++, src++, p++, q++) {
637 				*p = 0;
638 				*q = 0;
639 			}
640 		} else {
641 			ASSERT(ccnt <= pcnt);
642 
643 			/*
644 			 * Apply the algorithm described above by multiplying
645 			 * the previous result and adding in the new value.
646 			 */
647 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
648 				*p ^= *src;
649 
650 				VDEV_RAIDZ_64MUL_2(*q, mask);
651 				*q ^= *src;
652 			}
653 
654 			/*
655 			 * Treat short columns as though they are full of 0s.
656 			 * Note that there's therefore nothing needed for P.
657 			 */
658 			for (; i < pcnt; i++, q++) {
659 				VDEV_RAIDZ_64MUL_2(*q, mask);
660 			}
661 		}
662 	}
663 }
664 
665 static void
666 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
667 {
668 	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
669 	int c;
670 
671 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
672 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
673 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
674 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
675 	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
676 
677 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
678 		src = rm->rm_col[c].rc_data;
679 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
680 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
681 		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
682 
683 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
684 
685 		if (c == rm->rm_firstdatacol) {
686 			ASSERT(ccnt == pcnt || ccnt == 0);
687 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
688 				*p = *src;
689 				*q = *src;
690 				*r = *src;
691 			}
692 			for (; i < pcnt; i++, src++, p++, q++, r++) {
693 				*p = 0;
694 				*q = 0;
695 				*r = 0;
696 			}
697 		} else {
698 			ASSERT(ccnt <= pcnt);
699 
700 			/*
701 			 * Apply the algorithm described above by multiplying
702 			 * the previous result and adding in the new value.
703 			 */
704 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
705 				*p ^= *src;
706 
707 				VDEV_RAIDZ_64MUL_2(*q, mask);
708 				*q ^= *src;
709 
710 				VDEV_RAIDZ_64MUL_4(*r, mask);
711 				*r ^= *src;
712 			}
713 
714 			/*
715 			 * Treat short columns as though they are full of 0s.
716 			 * Note that there's therefore nothing needed for P.
717 			 */
718 			for (; i < pcnt; i++, q++, r++) {
719 				VDEV_RAIDZ_64MUL_2(*q, mask);
720 				VDEV_RAIDZ_64MUL_4(*r, mask);
721 			}
722 		}
723 	}
724 }
725 
726 /*
727  * Generate RAID parity in the first virtual columns according to the number of
728  * parity columns available.
729  */
730 static void
731 vdev_raidz_generate_parity(raidz_map_t *rm)
732 {
733 	switch (rm->rm_firstdatacol) {
734 	case 1:
735 		vdev_raidz_generate_parity_p(rm);
736 		break;
737 	case 2:
738 		vdev_raidz_generate_parity_pq(rm);
739 		break;
740 	case 3:
741 		vdev_raidz_generate_parity_pqr(rm);
742 		break;
743 	default:
744 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
745 	}
746 }
747 
748 static int
749 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
750 {
751 	uint64_t *dst, *src, xcount, ccount, count, i;
752 	int x = tgts[0];
753 	int c;
754 
755 	ASSERT(ntgts == 1);
756 	ASSERT(x >= rm->rm_firstdatacol);
757 	ASSERT(x < rm->rm_cols);
758 
759 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
760 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
761 	ASSERT(xcount > 0);
762 
763 	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
764 	dst = rm->rm_col[x].rc_data;
765 	for (i = 0; i < xcount; i++, dst++, src++) {
766 		*dst = *src;
767 	}
768 
769 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
770 		src = rm->rm_col[c].rc_data;
771 		dst = rm->rm_col[x].rc_data;
772 
773 		if (c == x)
774 			continue;
775 
776 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
777 		count = MIN(ccount, xcount);
778 
779 		for (i = 0; i < count; i++, dst++, src++) {
780 			*dst ^= *src;
781 		}
782 	}
783 
784 	return (1 << VDEV_RAIDZ_P);
785 }
786 
787 static int
788 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
789 {
790 	uint64_t *dst, *src, xcount, ccount, count, mask, i;
791 	uint8_t *b;
792 	int x = tgts[0];
793 	int c, j, exp;
794 
795 	ASSERT(ntgts == 1);
796 
797 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
798 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
799 
800 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
801 		src = rm->rm_col[c].rc_data;
802 		dst = rm->rm_col[x].rc_data;
803 
804 		if (c == x)
805 			ccount = 0;
806 		else
807 			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
808 
809 		count = MIN(ccount, xcount);
810 
811 		if (c == rm->rm_firstdatacol) {
812 			for (i = 0; i < count; i++, dst++, src++) {
813 				*dst = *src;
814 			}
815 			for (; i < xcount; i++, dst++) {
816 				*dst = 0;
817 			}
818 
819 		} else {
820 			for (i = 0; i < count; i++, dst++, src++) {
821 				VDEV_RAIDZ_64MUL_2(*dst, mask);
822 				*dst ^= *src;
823 			}
824 
825 			for (; i < xcount; i++, dst++) {
826 				VDEV_RAIDZ_64MUL_2(*dst, mask);
827 			}
828 		}
829 	}
830 
831 	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
832 	dst = rm->rm_col[x].rc_data;
833 	exp = 255 - (rm->rm_cols - 1 - x);
834 
835 	for (i = 0; i < xcount; i++, dst++, src++) {
836 		*dst ^= *src;
837 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
838 			*b = vdev_raidz_exp2(*b, exp);
839 		}
840 	}
841 
842 	return (1 << VDEV_RAIDZ_Q);
843 }
844 
845 static int
846 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
847 {
848 	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
849 	void *pdata, *qdata;
850 	uint64_t xsize, ysize, i;
851 	int x = tgts[0];
852 	int y = tgts[1];
853 
854 	ASSERT(ntgts == 2);
855 	ASSERT(x < y);
856 	ASSERT(x >= rm->rm_firstdatacol);
857 	ASSERT(y < rm->rm_cols);
858 
859 	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
860 
861 	/*
862 	 * Move the parity data aside -- we're going to compute parity as
863 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
864 	 * reuse the parity generation mechanism without trashing the actual
865 	 * parity so we make those columns appear to be full of zeros by
866 	 * setting their lengths to zero.
867 	 */
868 	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
869 	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
870 	xsize = rm->rm_col[x].rc_size;
871 	ysize = rm->rm_col[y].rc_size;
872 
873 	rm->rm_col[VDEV_RAIDZ_P].rc_data =
874 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
875 	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
876 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
877 	rm->rm_col[x].rc_size = 0;
878 	rm->rm_col[y].rc_size = 0;
879 
880 	vdev_raidz_generate_parity_pq(rm);
881 
882 	rm->rm_col[x].rc_size = xsize;
883 	rm->rm_col[y].rc_size = ysize;
884 
885 	p = pdata;
886 	q = qdata;
887 	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
888 	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
889 	xd = rm->rm_col[x].rc_data;
890 	yd = rm->rm_col[y].rc_data;
891 
892 	/*
893 	 * We now have:
894 	 *	Pxy = P + D_x + D_y
895 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
896 	 *
897 	 * We can then solve for D_x:
898 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
899 	 * where
900 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
901 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
902 	 *
903 	 * With D_x in hand, we can easily solve for D_y:
904 	 *	D_y = P + Pxy + D_x
905 	 */
906 
907 	a = vdev_raidz_pow2[255 + x - y];
908 	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
909 	tmp = 255 - vdev_raidz_log2[a ^ 1];
910 
911 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
912 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
913 
914 	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
915 		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
916 		    vdev_raidz_exp2(*q ^ *qxy, bexp);
917 
918 		if (i < ysize)
919 			*yd = *p ^ *pxy ^ *xd;
920 	}
921 
922 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
923 	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
924 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
925 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
926 
927 	/*
928 	 * Restore the saved parity data.
929 	 */
930 	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
931 	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
932 
933 	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
934 }
935 
936 /* BEGIN CSTYLED */
937 /*
938  * In the general case of reconstruction, we must solve the system of linear
939  * equations defined by the coeffecients used to generate parity as well as
940  * the contents of the data and parity disks. This can be expressed with
941  * vectors for the original data (D) and the actual data (d) and parity (p)
942  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
943  *
944  *            __   __                     __     __
945  *            |     |         __     __   |  p_0  |
946  *            |  V  |         |  D_0  |   | p_m-1 |
947  *            |     |    x    |   :   | = |  d_0  |
948  *            |  I  |         | D_n-1 |   |   :   |
949  *            |     |         ~~     ~~   | d_n-1 |
950  *            ~~   ~~                     ~~     ~~
951  *
952  * I is simply a square identity matrix of size n, and V is a vandermonde
953  * matrix defined by the coeffecients we chose for the various parity columns
954  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
955  * computation as well as linear separability.
956  *
957  *      __               __               __     __
958  *      |   1   ..  1 1 1 |               |  p_0  |
959  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
960  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
961  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
962  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
963  *      |   :       : : : |   |   :   |   |  d_2  |
964  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
965  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
966  *      |   0   ..  0 0 1 |               | d_n-1 |
967  *      ~~               ~~               ~~     ~~
968  *
969  * Note that I, V, d, and p are known. To compute D, we must invert the
970  * matrix and use the known data and parity values to reconstruct the unknown
971  * data values. We begin by removing the rows in V|I and d|p that correspond
972  * to failed or missing columns; we then make V|I square (n x n) and d|p
973  * sized n by removing rows corresponding to unused parity from the bottom up
974  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
975  * using Gauss-Jordan elimination. In the example below we use m=3 parity
976  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
977  *           __                               __
978  *           |  1   1   1   1   1   1   1   1  |
979  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
980  *           |  19 205 116  29  64  16  4   1  |      / /
981  *           |  1   0   0   0   0   0   0   0  |     / /
982  *           |  0   1   0   0   0   0   0   0  | <--' /
983  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
984  *           |  0   0   0   1   0   0   0   0  |
985  *           |  0   0   0   0   1   0   0   0  |
986  *           |  0   0   0   0   0   1   0   0  |
987  *           |  0   0   0   0   0   0   1   0  |
988  *           |  0   0   0   0   0   0   0   1  |
989  *           ~~                               ~~
990  *           __                               __
991  *           |  1   1   1   1   1   1   1   1  |
992  *           |  19 205 116  29  64  16  4   1  |
993  *           |  1   0   0   0   0   0   0   0  |
994  *  (V|I)' = |  0   0   0   1   0   0   0   0  |
995  *           |  0   0   0   0   1   0   0   0  |
996  *           |  0   0   0   0   0   1   0   0  |
997  *           |  0   0   0   0   0   0   1   0  |
998  *           |  0   0   0   0   0   0   0   1  |
999  *           ~~                               ~~
1000  *
1001  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1002  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1003  * matrix is not singular.
1004  * __                                                                 __
1005  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1006  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1007  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1008  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1009  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1010  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1011  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1012  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1013  * ~~                                                                 ~~
1014  * __                                                                 __
1015  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1016  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1017  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1018  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1019  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1020  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1021  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1022  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1023  * ~~                                                                 ~~
1024  * __                                                                 __
1025  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1026  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1027  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1028  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1029  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1030  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1031  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1032  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1033  * ~~                                                                 ~~
1034  * __                                                                 __
1035  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1036  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1037  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1038  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1039  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1040  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1041  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1042  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1043  * ~~                                                                 ~~
1044  * __                                                                 __
1045  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1046  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1047  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1048  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1049  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1050  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1051  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1052  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1053  * ~~                                                                 ~~
1054  * __                                                                 __
1055  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1056  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1057  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1058  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1059  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1060  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1061  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1062  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1063  * ~~                                                                 ~~
1064  *                   __                               __
1065  *                   |  0   0   1   0   0   0   0   0  |
1066  *                   | 167 100  5   41 159 169 217 208 |
1067  *                   | 166 100  4   40 158 168 216 209 |
1068  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1069  *                   |  0   0   0   0   1   0   0   0  |
1070  *                   |  0   0   0   0   0   1   0   0  |
1071  *                   |  0   0   0   0   0   0   1   0  |
1072  *                   |  0   0   0   0   0   0   0   1  |
1073  *                   ~~                               ~~
1074  *
1075  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1076  * of the missing data.
1077  *
1078  * As is apparent from the example above, the only non-trivial rows in the
1079  * inverse matrix correspond to the data disks that we're trying to
1080  * reconstruct. Indeed, those are the only rows we need as the others would
1081  * only be useful for reconstructing data known or assumed to be valid. For
1082  * that reason, we only build the coefficients in the rows that correspond to
1083  * targeted columns.
1084  */
1085 /* END CSTYLED */
1086 
1087 static void
1088 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1089     uint8_t **rows)
1090 {
1091 	int i, j;
1092 	int pow;
1093 
1094 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1095 
1096 	/*
1097 	 * Fill in the missing rows of interest.
1098 	 */
1099 	for (i = 0; i < nmap; i++) {
1100 		ASSERT3S(0, <=, map[i]);
1101 		ASSERT3S(map[i], <=, 2);
1102 
1103 		pow = map[i] * n;
1104 		if (pow > 255)
1105 			pow -= 255;
1106 		ASSERT(pow <= 255);
1107 
1108 		for (j = 0; j < n; j++) {
1109 			pow -= map[i];
1110 			if (pow < 0)
1111 				pow += 255;
1112 			rows[i][j] = vdev_raidz_pow2[pow];
1113 		}
1114 	}
1115 }
1116 
1117 static void
1118 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1119     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1120 {
1121 	int i, j, ii, jj;
1122 	uint8_t log;
1123 
1124 	/*
1125 	 * Assert that the first nmissing entries from the array of used
1126 	 * columns correspond to parity columns and that subsequent entries
1127 	 * correspond to data columns.
1128 	 */
1129 	for (i = 0; i < nmissing; i++) {
1130 		ASSERT3S(used[i], <, rm->rm_firstdatacol);
1131 	}
1132 	for (; i < n; i++) {
1133 		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1134 	}
1135 
1136 	/*
1137 	 * First initialize the storage where we'll compute the inverse rows.
1138 	 */
1139 	for (i = 0; i < nmissing; i++) {
1140 		for (j = 0; j < n; j++) {
1141 			invrows[i][j] = (i == j) ? 1 : 0;
1142 		}
1143 	}
1144 
1145 	/*
1146 	 * Subtract all trivial rows from the rows of consequence.
1147 	 */
1148 	for (i = 0; i < nmissing; i++) {
1149 		for (j = nmissing; j < n; j++) {
1150 			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1151 			jj = used[j] - rm->rm_firstdatacol;
1152 			ASSERT3S(jj, <, n);
1153 			invrows[i][j] = rows[i][jj];
1154 			rows[i][jj] = 0;
1155 		}
1156 	}
1157 
1158 	/*
1159 	 * For each of the rows of interest, we must normalize it and subtract
1160 	 * a multiple of it from the other rows.
1161 	 */
1162 	for (i = 0; i < nmissing; i++) {
1163 		for (j = 0; j < missing[i]; j++) {
1164 			ASSERT0(rows[i][j]);
1165 		}
1166 		ASSERT3U(rows[i][missing[i]], !=, 0);
1167 
1168 		/*
1169 		 * Compute the inverse of the first element and multiply each
1170 		 * element in the row by that value.
1171 		 */
1172 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1173 
1174 		for (j = 0; j < n; j++) {
1175 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1176 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1177 		}
1178 
1179 		for (ii = 0; ii < nmissing; ii++) {
1180 			if (i == ii)
1181 				continue;
1182 
1183 			ASSERT3U(rows[ii][missing[i]], !=, 0);
1184 
1185 			log = vdev_raidz_log2[rows[ii][missing[i]]];
1186 
1187 			for (j = 0; j < n; j++) {
1188 				rows[ii][j] ^=
1189 				    vdev_raidz_exp2(rows[i][j], log);
1190 				invrows[ii][j] ^=
1191 				    vdev_raidz_exp2(invrows[i][j], log);
1192 			}
1193 		}
1194 	}
1195 
1196 	/*
1197 	 * Verify that the data that is left in the rows are properly part of
1198 	 * an identity matrix.
1199 	 */
1200 	for (i = 0; i < nmissing; i++) {
1201 		for (j = 0; j < n; j++) {
1202 			if (j == missing[i]) {
1203 				ASSERT3U(rows[i][j], ==, 1);
1204 			} else {
1205 				ASSERT0(rows[i][j]);
1206 			}
1207 		}
1208 	}
1209 }
1210 
1211 static void
1212 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1213     int *missing, uint8_t **invrows, const uint8_t *used)
1214 {
1215 	int i, j, x, cc, c;
1216 	uint8_t *src;
1217 	uint64_t ccount;
1218 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1219 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1220 	uint8_t log = 0;
1221 	uint8_t val;
1222 	int ll;
1223 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1224 	uint8_t *p, *pp;
1225 	size_t psize;
1226 
1227 	psize = sizeof (invlog[0][0]) * n * nmissing;
1228 	p = kmem_alloc(psize, KM_SLEEP);
1229 
1230 	for (pp = p, i = 0; i < nmissing; i++) {
1231 		invlog[i] = pp;
1232 		pp += n;
1233 	}
1234 
1235 	for (i = 0; i < nmissing; i++) {
1236 		for (j = 0; j < n; j++) {
1237 			ASSERT3U(invrows[i][j], !=, 0);
1238 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1239 		}
1240 	}
1241 
1242 	for (i = 0; i < n; i++) {
1243 		c = used[i];
1244 		ASSERT3U(c, <, rm->rm_cols);
1245 
1246 		src = rm->rm_col[c].rc_data;
1247 		ccount = rm->rm_col[c].rc_size;
1248 		for (j = 0; j < nmissing; j++) {
1249 			cc = missing[j] + rm->rm_firstdatacol;
1250 			ASSERT3U(cc, >=, rm->rm_firstdatacol);
1251 			ASSERT3U(cc, <, rm->rm_cols);
1252 			ASSERT3U(cc, !=, c);
1253 
1254 			dst[j] = rm->rm_col[cc].rc_data;
1255 			dcount[j] = rm->rm_col[cc].rc_size;
1256 		}
1257 
1258 		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1259 
1260 		for (x = 0; x < ccount; x++, src++) {
1261 			if (*src != 0)
1262 				log = vdev_raidz_log2[*src];
1263 
1264 			for (cc = 0; cc < nmissing; cc++) {
1265 				if (x >= dcount[cc])
1266 					continue;
1267 
1268 				if (*src == 0) {
1269 					val = 0;
1270 				} else {
1271 					if ((ll = log + invlog[cc][i]) >= 255)
1272 						ll -= 255;
1273 					val = vdev_raidz_pow2[ll];
1274 				}
1275 
1276 				if (i == 0)
1277 					dst[cc][x] = val;
1278 				else
1279 					dst[cc][x] ^= val;
1280 			}
1281 		}
1282 	}
1283 
1284 	kmem_free(p, psize);
1285 }
1286 
1287 static int
1288 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1289 {
1290 	int n, i, c, t, tt;
1291 	int nmissing_rows;
1292 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1293 	int parity_map[VDEV_RAIDZ_MAXPARITY];
1294 
1295 	uint8_t *p, *pp;
1296 	size_t psize;
1297 
1298 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1299 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1300 	uint8_t *used;
1301 
1302 	int code = 0;
1303 
1304 
1305 	n = rm->rm_cols - rm->rm_firstdatacol;
1306 
1307 	/*
1308 	 * Figure out which data columns are missing.
1309 	 */
1310 	nmissing_rows = 0;
1311 	for (t = 0; t < ntgts; t++) {
1312 		if (tgts[t] >= rm->rm_firstdatacol) {
1313 			missing_rows[nmissing_rows++] =
1314 			    tgts[t] - rm->rm_firstdatacol;
1315 		}
1316 	}
1317 
1318 	/*
1319 	 * Figure out which parity columns to use to help generate the missing
1320 	 * data columns.
1321 	 */
1322 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1323 		ASSERT(tt < ntgts);
1324 		ASSERT(c < rm->rm_firstdatacol);
1325 
1326 		/*
1327 		 * Skip any targeted parity columns.
1328 		 */
1329 		if (c == tgts[tt]) {
1330 			tt++;
1331 			continue;
1332 		}
1333 
1334 		code |= 1 << c;
1335 
1336 		parity_map[i] = c;
1337 		i++;
1338 	}
1339 
1340 	ASSERT(code != 0);
1341 	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1342 
1343 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1344 	    nmissing_rows * n + sizeof (used[0]) * n;
1345 	p = kmem_alloc(psize, KM_SLEEP);
1346 
1347 	for (pp = p, i = 0; i < nmissing_rows; i++) {
1348 		rows[i] = pp;
1349 		pp += n;
1350 		invrows[i] = pp;
1351 		pp += n;
1352 	}
1353 	used = pp;
1354 
1355 	for (i = 0; i < nmissing_rows; i++) {
1356 		used[i] = parity_map[i];
1357 	}
1358 
1359 	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1360 		if (tt < nmissing_rows &&
1361 		    c == missing_rows[tt] + rm->rm_firstdatacol) {
1362 			tt++;
1363 			continue;
1364 		}
1365 
1366 		ASSERT3S(i, <, n);
1367 		used[i] = c;
1368 		i++;
1369 	}
1370 
1371 	/*
1372 	 * Initialize the interesting rows of the matrix.
1373 	 */
1374 	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1375 
1376 	/*
1377 	 * Invert the matrix.
1378 	 */
1379 	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1380 	    invrows, used);
1381 
1382 	/*
1383 	 * Reconstruct the missing data using the generated matrix.
1384 	 */
1385 	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1386 	    invrows, used);
1387 
1388 	kmem_free(p, psize);
1389 
1390 	return (code);
1391 }
1392 
1393 static int
1394 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1395 {
1396 	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1397 	int ntgts;
1398 	int i, c;
1399 	int code;
1400 	int nbadparity, nbaddata;
1401 	int parity_valid[VDEV_RAIDZ_MAXPARITY];
1402 
1403 	/*
1404 	 * The tgts list must already be sorted.
1405 	 */
1406 	for (i = 1; i < nt; i++) {
1407 		ASSERT(t[i] > t[i - 1]);
1408 	}
1409 
1410 	nbadparity = rm->rm_firstdatacol;
1411 	nbaddata = rm->rm_cols - nbadparity;
1412 	ntgts = 0;
1413 	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1414 		if (c < rm->rm_firstdatacol)
1415 			parity_valid[c] = B_FALSE;
1416 
1417 		if (i < nt && c == t[i]) {
1418 			tgts[ntgts++] = c;
1419 			i++;
1420 		} else if (rm->rm_col[c].rc_error != 0) {
1421 			tgts[ntgts++] = c;
1422 		} else if (c >= rm->rm_firstdatacol) {
1423 			nbaddata--;
1424 		} else {
1425 			parity_valid[c] = B_TRUE;
1426 			nbadparity--;
1427 		}
1428 	}
1429 
1430 	ASSERT(ntgts >= nt);
1431 	ASSERT(nbaddata >= 0);
1432 	ASSERT(nbaddata + nbadparity == ntgts);
1433 
1434 	dt = &tgts[nbadparity];
1435 
1436 	/*
1437 	 * See if we can use any of our optimized reconstruction routines.
1438 	 */
1439 	if (!vdev_raidz_default_to_general) {
1440 		switch (nbaddata) {
1441 		case 1:
1442 			if (parity_valid[VDEV_RAIDZ_P])
1443 				return (vdev_raidz_reconstruct_p(rm, dt, 1));
1444 
1445 			ASSERT(rm->rm_firstdatacol > 1);
1446 
1447 			if (parity_valid[VDEV_RAIDZ_Q])
1448 				return (vdev_raidz_reconstruct_q(rm, dt, 1));
1449 
1450 			ASSERT(rm->rm_firstdatacol > 2);
1451 			break;
1452 
1453 		case 2:
1454 			ASSERT(rm->rm_firstdatacol > 1);
1455 
1456 			if (parity_valid[VDEV_RAIDZ_P] &&
1457 			    parity_valid[VDEV_RAIDZ_Q])
1458 				return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1459 
1460 			ASSERT(rm->rm_firstdatacol > 2);
1461 
1462 			break;
1463 		}
1464 	}
1465 
1466 	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1467 	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1468 	ASSERT(code > 0);
1469 	return (code);
1470 }
1471 
1472 static int
1473 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1474     uint64_t *ashift)
1475 {
1476 	vdev_t *cvd;
1477 	uint64_t nparity = vd->vdev_nparity;
1478 	int c;
1479 	int lasterror = 0;
1480 	int numerrors = 0;
1481 
1482 	ASSERT(nparity > 0);
1483 
1484 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
1485 	    vd->vdev_children < nparity + 1) {
1486 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1487 		return (SET_ERROR(EINVAL));
1488 	}
1489 
1490 	vdev_open_children(vd);
1491 
1492 	for (c = 0; c < vd->vdev_children; c++) {
1493 		cvd = vd->vdev_child[c];
1494 
1495 		if (cvd->vdev_open_error != 0) {
1496 			lasterror = cvd->vdev_open_error;
1497 			numerrors++;
1498 			continue;
1499 		}
1500 
1501 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1502 		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1503 		*ashift = MAX(*ashift, cvd->vdev_ashift);
1504 	}
1505 
1506 	*asize *= vd->vdev_children;
1507 	*max_asize *= vd->vdev_children;
1508 
1509 	if (numerrors > nparity) {
1510 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1511 		return (lasterror);
1512 	}
1513 
1514 	return (0);
1515 }
1516 
1517 static void
1518 vdev_raidz_close(vdev_t *vd)
1519 {
1520 	int c;
1521 
1522 	for (c = 0; c < vd->vdev_children; c++)
1523 		vdev_close(vd->vdev_child[c]);
1524 }
1525 
1526 /*
1527  * Handle a read or write I/O to a RAID-Z dump device.
1528  *
1529  * The dump device is in a unique situation compared to other ZFS datasets:
1530  * writing to this device should be as simple and fast as possible.  In
1531  * addition, durability matters much less since the dump will be extracted
1532  * once the machine reboots.  For that reason, this function eschews parity for
1533  * performance and simplicity.  The dump device uses the checksum setting
1534  * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1535  * dataset.
1536  *
1537  * Blocks of size 128 KB have been preallocated for this volume.  I/Os less than
1538  * 128 KB will not fill an entire block; in addition, they may not be properly
1539  * aligned.  In that case, this function uses the preallocated 128 KB block and
1540  * omits reading or writing any "empty" portions of that block, as opposed to
1541  * allocating a fresh appropriately-sized block.
1542  *
1543  * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1544  *
1545  *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1546  *
1547  * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1548  * allocated which spans all five child vdevs.  8 KB of data would be written to
1549  * each of four vdevs, with the fifth containing the parity bits.
1550  *
1551  *       parity    data     data     data     data
1552  *     |   PP   |   XX   |   XX   |   XX   |   XX   |
1553  *         ^        ^        ^        ^        ^
1554  *         |        |        |        |        |
1555  *   8 KB parity    ------8 KB data blocks------
1556  *
1557  * However, when writing to the dump device, the behavior is different:
1558  *
1559  *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
1560  *
1561  * Unlike the normal RAID-Z case in which the block is allocated based on the
1562  * I/O size, reads and writes here always use a 128 KB logical I/O size.  If the
1563  * I/O size is less than 128 KB, only the actual portions of data are written.
1564  * In this example the data is written to the third data vdev since that vdev
1565  * contains the offset [64 KB, 96 KB).
1566  *
1567  *       parity    data     data     data     data
1568  *     |        |        |        |   XX   |        |
1569  *                                    ^
1570  *                                    |
1571  *                             32 KB data block
1572  *
1573  * As a result, an individual I/O may not span all child vdevs; moreover, a
1574  * small I/O may only operate on a single child vdev.
1575  *
1576  * Note that since there are no parity bits calculated or written, this format
1577  * remains the same no matter how many parity bits are used in a normal RAID-Z
1578  * stripe.  On a RAID-Z3 configuration with seven child vdevs, the example above
1579  * would look like:
1580  *
1581  *       parity   parity   parity    data     data     data     data
1582  *     |        |        |        |        |        |   XX   |        |
1583  *                                                      ^
1584  *                                                      |
1585  *                                               32 KB data block
1586  */
1587 int
1588 vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
1589     uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
1590 {
1591 	vdev_t *tvd = vd->vdev_top;
1592 	vdev_t *cvd;
1593 	raidz_map_t *rm;
1594 	raidz_col_t *rc;
1595 	int c, err = 0;
1596 
1597 	uint64_t start, end, colstart, colend;
1598 	uint64_t coloffset, colsize, colskip;
1599 
1600 	int flags = doread ? B_READ : B_WRITE;
1601 
1602 #ifdef	_KERNEL
1603 
1604 	/*
1605 	 * Don't write past the end of the block
1606 	 */
1607 	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
1608 
1609 	start = offset;
1610 	end = start + size;
1611 
1612 	/*
1613 	 * Allocate a RAID-Z map for this block.  Note that this block starts
1614 	 * from the "original" offset, this is, the offset of the extent which
1615 	 * contains the requisite offset of the data being read or written.
1616 	 *
1617 	 * Even if this I/O operation doesn't span the full block size, let's
1618 	 * treat the on-disk format as if the only blocks are the complete 128
1619 	 * KB size.
1620 	 */
1621 	rm = vdev_raidz_map_alloc(data - (offset - origoffset),
1622 	    SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
1623 	    vd->vdev_children, vd->vdev_nparity);
1624 
1625 	coloffset = origoffset;
1626 
1627 	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
1628 	    c++, coloffset += rc->rc_size) {
1629 		rc = &rm->rm_col[c];
1630 		cvd = vd->vdev_child[rc->rc_devidx];
1631 
1632 		/*
1633 		 * Find the start and end of this column in the RAID-Z map,
1634 		 * keeping in mind that the stated size and offset of the
1635 		 * operation may not fill the entire column for this vdev.
1636 		 *
1637 		 * If any portion of the data spans this column, issue the
1638 		 * appropriate operation to the vdev.
1639 		 */
1640 		if (coloffset + rc->rc_size <= start)
1641 			continue;
1642 		if (coloffset >= end)
1643 			continue;
1644 
1645 		colstart = MAX(coloffset, start);
1646 		colend = MIN(end, coloffset + rc->rc_size);
1647 		colsize = colend - colstart;
1648 		colskip = colstart - coloffset;
1649 
1650 		VERIFY3U(colsize, <=, rc->rc_size);
1651 		VERIFY3U(colskip, <=, rc->rc_size);
1652 
1653 		/*
1654 		 * Note that the child vdev will have a vdev label at the start
1655 		 * of its range of offsets, hence the need for
1656 		 * VDEV_LABEL_OFFSET().  See zio_vdev_child_io() for another
1657 		 * example of why this calculation is needed.
1658 		 */
1659 		if ((err = vdev_disk_physio(cvd,
1660 		    ((char *)rc->rc_data) + colskip, colsize,
1661 		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
1662 		    flags, isdump)) != 0)
1663 			break;
1664 	}
1665 
1666 	vdev_raidz_map_free(rm);
1667 #endif	/* KERNEL */
1668 
1669 	return (err);
1670 }
1671 
1672 static uint64_t
1673 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1674 {
1675 	uint64_t asize;
1676 	uint64_t ashift = vd->vdev_top->vdev_ashift;
1677 	uint64_t cols = vd->vdev_children;
1678 	uint64_t nparity = vd->vdev_nparity;
1679 
1680 	asize = ((psize - 1) >> ashift) + 1;
1681 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1682 	asize = roundup(asize, nparity + 1) << ashift;
1683 
1684 	return (asize);
1685 }
1686 
1687 static void
1688 vdev_raidz_child_done(zio_t *zio)
1689 {
1690 	raidz_col_t *rc = zio->io_private;
1691 
1692 	rc->rc_error = zio->io_error;
1693 	rc->rc_tried = 1;
1694 	rc->rc_skipped = 0;
1695 }
1696 
1697 /*
1698  * Start an IO operation on a RAIDZ VDev
1699  *
1700  * Outline:
1701  * - For write operations:
1702  *   1. Generate the parity data
1703  *   2. Create child zio write operations to each column's vdev, for both
1704  *      data and parity.
1705  *   3. If the column skips any sectors for padding, create optional dummy
1706  *      write zio children for those areas to improve aggregation continuity.
1707  * - For read operations:
1708  *   1. Create child zio read operations to each data column's vdev to read
1709  *      the range of data required for zio.
1710  *   2. If this is a scrub or resilver operation, or if any of the data
1711  *      vdevs have had errors, then create zio read operations to the parity
1712  *      columns' VDevs as well.
1713  */
1714 static void
1715 vdev_raidz_io_start(zio_t *zio)
1716 {
1717 	vdev_t *vd = zio->io_vd;
1718 	vdev_t *tvd = vd->vdev_top;
1719 	vdev_t *cvd;
1720 	raidz_map_t *rm;
1721 	raidz_col_t *rc;
1722 	int c, i;
1723 
1724 	rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
1725 	    tvd->vdev_ashift, vd->vdev_children,
1726 	    vd->vdev_nparity);
1727 
1728 	zio->io_vsd = rm;
1729 	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
1730 
1731 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1732 
1733 	if (zio->io_type == ZIO_TYPE_WRITE) {
1734 		vdev_raidz_generate_parity(rm);
1735 
1736 		for (c = 0; c < rm->rm_cols; c++) {
1737 			rc = &rm->rm_col[c];
1738 			cvd = vd->vdev_child[rc->rc_devidx];
1739 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1740 			    rc->rc_offset, rc->rc_data, rc->rc_size,
1741 			    zio->io_type, zio->io_priority, 0,
1742 			    vdev_raidz_child_done, rc));
1743 		}
1744 
1745 		/*
1746 		 * Generate optional I/Os for any skipped sectors to improve
1747 		 * aggregation contiguity.
1748 		 */
1749 		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1750 			ASSERT(c <= rm->rm_scols);
1751 			if (c == rm->rm_scols)
1752 				c = 0;
1753 			rc = &rm->rm_col[c];
1754 			cvd = vd->vdev_child[rc->rc_devidx];
1755 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1756 			    rc->rc_offset + rc->rc_size, NULL,
1757 			    1 << tvd->vdev_ashift,
1758 			    zio->io_type, zio->io_priority,
1759 			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1760 		}
1761 
1762 		zio_execute(zio);
1763 		return;
1764 	}
1765 
1766 	ASSERT(zio->io_type == ZIO_TYPE_READ);
1767 
1768 	/*
1769 	 * Iterate over the columns in reverse order so that we hit the parity
1770 	 * last -- any errors along the way will force us to read the parity.
1771 	 */
1772 	for (c = rm->rm_cols - 1; c >= 0; c--) {
1773 		rc = &rm->rm_col[c];
1774 		cvd = vd->vdev_child[rc->rc_devidx];
1775 		if (cvd->vdev_avoid_read) {
1776 			if (c >= rm->rm_firstdatacol)
1777 				rm->rm_missingdata++;
1778 			else
1779 				rm->rm_missingparity++;
1780 			rc->rc_error = SET_ERROR(ENXIO);
1781 			rc->rc_skipped = 1;	/* only try if necessary */
1782 			continue;
1783 		}
1784 		if (!vdev_readable(cvd)) {
1785 			if (c >= rm->rm_firstdatacol)
1786 				rm->rm_missingdata++;
1787 			else
1788 				rm->rm_missingparity++;
1789 			rc->rc_error = SET_ERROR(ENXIO);
1790 			rc->rc_tried = 1;	/* don't even try */
1791 			rc->rc_skipped = 1;
1792 			continue;
1793 		}
1794 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1795 			if (c >= rm->rm_firstdatacol)
1796 				rm->rm_missingdata++;
1797 			else
1798 				rm->rm_missingparity++;
1799 			rc->rc_error = SET_ERROR(ESTALE);
1800 			rc->rc_skipped = 1;
1801 			continue;
1802 		}
1803 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
1804 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1805 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1806 			    rc->rc_offset, rc->rc_data, rc->rc_size,
1807 			    zio->io_type, zio->io_priority, 0,
1808 			    vdev_raidz_child_done, rc));
1809 		}
1810 	}
1811 
1812 	zio_execute(zio);
1813 }
1814 
1815 
1816 /*
1817  * Report a checksum error for a child of a RAID-Z device.
1818  */
1819 static void
1820 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1821 {
1822 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1823 
1824 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1825 		zio_bad_cksum_t zbc;
1826 		raidz_map_t *rm = zio->io_vsd;
1827 
1828 		mutex_enter(&vd->vdev_stat_lock);
1829 		vd->vdev_stat.vs_checksum_errors++;
1830 		mutex_exit(&vd->vdev_stat_lock);
1831 
1832 		zbc.zbc_has_cksum = 0;
1833 		zbc.zbc_injected = rm->rm_ecksuminjected;
1834 
1835 		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1836 		    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1837 		    &zbc);
1838 	}
1839 }
1840 
1841 /*
1842  * We keep track of whether or not there were any injected errors, so that
1843  * any ereports we generate can note it.
1844  */
1845 static int
1846 raidz_checksum_verify(zio_t *zio)
1847 {
1848 	zio_bad_cksum_t zbc;
1849 	raidz_map_t *rm = zio->io_vsd;
1850 
1851 	int ret = zio_checksum_error(zio, &zbc);
1852 	if (ret != 0 && zbc.zbc_injected != 0)
1853 		rm->rm_ecksuminjected = 1;
1854 
1855 	return (ret);
1856 }
1857 
1858 /*
1859  * Generate the parity from the data columns. If we tried and were able to
1860  * read the parity without error, verify that the generated parity matches the
1861  * data we read. If it doesn't, we fire off a checksum error. Return the
1862  * number such failures.
1863  */
1864 static int
1865 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1866 {
1867 	void *orig[VDEV_RAIDZ_MAXPARITY];
1868 	int c, ret = 0;
1869 	raidz_col_t *rc;
1870 
1871 	blkptr_t *bp = zio->io_bp;
1872 	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
1873 	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
1874 
1875 	if (checksum == ZIO_CHECKSUM_NOPARITY)
1876 		return (ret);
1877 
1878 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1879 		rc = &rm->rm_col[c];
1880 		if (!rc->rc_tried || rc->rc_error != 0)
1881 			continue;
1882 		orig[c] = zio_buf_alloc(rc->rc_size);
1883 		bcopy(rc->rc_data, orig[c], rc->rc_size);
1884 	}
1885 
1886 	vdev_raidz_generate_parity(rm);
1887 
1888 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1889 		rc = &rm->rm_col[c];
1890 		if (!rc->rc_tried || rc->rc_error != 0)
1891 			continue;
1892 		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1893 			raidz_checksum_error(zio, rc, orig[c]);
1894 			rc->rc_error = SET_ERROR(ECKSUM);
1895 			ret++;
1896 		}
1897 		zio_buf_free(orig[c], rc->rc_size);
1898 	}
1899 
1900 	return (ret);
1901 }
1902 
1903 /*
1904  * Keep statistics on all the ways that we used parity to correct data.
1905  */
1906 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1907 
1908 static int
1909 vdev_raidz_worst_error(raidz_map_t *rm)
1910 {
1911 	int error = 0;
1912 
1913 	for (int c = 0; c < rm->rm_cols; c++)
1914 		error = zio_worst_error(error, rm->rm_col[c].rc_error);
1915 
1916 	return (error);
1917 }
1918 
1919 /*
1920  * Iterate over all combinations of bad data and attempt a reconstruction.
1921  * Note that the algorithm below is non-optimal because it doesn't take into
1922  * account how reconstruction is actually performed. For example, with
1923  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1924  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1925  * cases we'd only use parity information in column 0.
1926  */
1927 static int
1928 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1929 {
1930 	raidz_map_t *rm = zio->io_vsd;
1931 	raidz_col_t *rc;
1932 	void *orig[VDEV_RAIDZ_MAXPARITY];
1933 	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1934 	int *tgts = &tstore[1];
1935 	int current, next, i, c, n;
1936 	int code, ret = 0;
1937 
1938 	ASSERT(total_errors < rm->rm_firstdatacol);
1939 
1940 	/*
1941 	 * This simplifies one edge condition.
1942 	 */
1943 	tgts[-1] = -1;
1944 
1945 	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1946 		/*
1947 		 * Initialize the targets array by finding the first n columns
1948 		 * that contain no error.
1949 		 *
1950 		 * If there were no data errors, we need to ensure that we're
1951 		 * always explicitly attempting to reconstruct at least one
1952 		 * data column. To do this, we simply push the highest target
1953 		 * up into the data columns.
1954 		 */
1955 		for (c = 0, i = 0; i < n; i++) {
1956 			if (i == n - 1 && data_errors == 0 &&
1957 			    c < rm->rm_firstdatacol) {
1958 				c = rm->rm_firstdatacol;
1959 			}
1960 
1961 			while (rm->rm_col[c].rc_error != 0) {
1962 				c++;
1963 				ASSERT3S(c, <, rm->rm_cols);
1964 			}
1965 
1966 			tgts[i] = c++;
1967 		}
1968 
1969 		/*
1970 		 * Setting tgts[n] simplifies the other edge condition.
1971 		 */
1972 		tgts[n] = rm->rm_cols;
1973 
1974 		/*
1975 		 * These buffers were allocated in previous iterations.
1976 		 */
1977 		for (i = 0; i < n - 1; i++) {
1978 			ASSERT(orig[i] != NULL);
1979 		}
1980 
1981 		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
1982 
1983 		current = 0;
1984 		next = tgts[current];
1985 
1986 		while (current != n) {
1987 			tgts[current] = next;
1988 			current = 0;
1989 
1990 			/*
1991 			 * Save off the original data that we're going to
1992 			 * attempt to reconstruct.
1993 			 */
1994 			for (i = 0; i < n; i++) {
1995 				ASSERT(orig[i] != NULL);
1996 				c = tgts[i];
1997 				ASSERT3S(c, >=, 0);
1998 				ASSERT3S(c, <, rm->rm_cols);
1999 				rc = &rm->rm_col[c];
2000 				bcopy(rc->rc_data, orig[i], rc->rc_size);
2001 			}
2002 
2003 			/*
2004 			 * Attempt a reconstruction and exit the outer loop on
2005 			 * success.
2006 			 */
2007 			code = vdev_raidz_reconstruct(rm, tgts, n);
2008 			if (raidz_checksum_verify(zio) == 0) {
2009 				atomic_inc_64(&raidz_corrected[code]);
2010 
2011 				for (i = 0; i < n; i++) {
2012 					c = tgts[i];
2013 					rc = &rm->rm_col[c];
2014 					ASSERT(rc->rc_error == 0);
2015 					if (rc->rc_tried)
2016 						raidz_checksum_error(zio, rc,
2017 						    orig[i]);
2018 					rc->rc_error = SET_ERROR(ECKSUM);
2019 				}
2020 
2021 				ret = code;
2022 				goto done;
2023 			}
2024 
2025 			/*
2026 			 * Restore the original data.
2027 			 */
2028 			for (i = 0; i < n; i++) {
2029 				c = tgts[i];
2030 				rc = &rm->rm_col[c];
2031 				bcopy(orig[i], rc->rc_data, rc->rc_size);
2032 			}
2033 
2034 			do {
2035 				/*
2036 				 * Find the next valid column after the current
2037 				 * position..
2038 				 */
2039 				for (next = tgts[current] + 1;
2040 				    next < rm->rm_cols &&
2041 				    rm->rm_col[next].rc_error != 0; next++)
2042 					continue;
2043 
2044 				ASSERT(next <= tgts[current + 1]);
2045 
2046 				/*
2047 				 * If that spot is available, we're done here.
2048 				 */
2049 				if (next != tgts[current + 1])
2050 					break;
2051 
2052 				/*
2053 				 * Otherwise, find the next valid column after
2054 				 * the previous position.
2055 				 */
2056 				for (c = tgts[current - 1] + 1;
2057 				    rm->rm_col[c].rc_error != 0; c++)
2058 					continue;
2059 
2060 				tgts[current] = c;
2061 				current++;
2062 
2063 			} while (current != n);
2064 		}
2065 	}
2066 	n--;
2067 done:
2068 	for (i = 0; i < n; i++) {
2069 		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
2070 	}
2071 
2072 	return (ret);
2073 }
2074 
2075 /*
2076  * Complete an IO operation on a RAIDZ VDev
2077  *
2078  * Outline:
2079  * - For write operations:
2080  *   1. Check for errors on the child IOs.
2081  *   2. Return, setting an error code if too few child VDevs were written
2082  *      to reconstruct the data later.  Note that partial writes are
2083  *      considered successful if they can be reconstructed at all.
2084  * - For read operations:
2085  *   1. Check for errors on the child IOs.
2086  *   2. If data errors occurred:
2087  *      a. Try to reassemble the data from the parity available.
2088  *      b. If we haven't yet read the parity drives, read them now.
2089  *      c. If all parity drives have been read but the data still doesn't
2090  *         reassemble with a correct checksum, then try combinatorial
2091  *         reconstruction.
2092  *      d. If that doesn't work, return an error.
2093  *   3. If there were unexpected errors or this is a resilver operation,
2094  *      rewrite the vdevs that had errors.
2095  */
2096 static void
2097 vdev_raidz_io_done(zio_t *zio)
2098 {
2099 	vdev_t *vd = zio->io_vd;
2100 	vdev_t *cvd;
2101 	raidz_map_t *rm = zio->io_vsd;
2102 	raidz_col_t *rc;
2103 	int unexpected_errors = 0;
2104 	int parity_errors = 0;
2105 	int parity_untried = 0;
2106 	int data_errors = 0;
2107 	int total_errors = 0;
2108 	int n, c;
2109 	int tgts[VDEV_RAIDZ_MAXPARITY];
2110 	int code;
2111 
2112 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
2113 
2114 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
2115 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
2116 
2117 	for (c = 0; c < rm->rm_cols; c++) {
2118 		rc = &rm->rm_col[c];
2119 
2120 		if (rc->rc_error) {
2121 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
2122 
2123 			if (c < rm->rm_firstdatacol)
2124 				parity_errors++;
2125 			else
2126 				data_errors++;
2127 
2128 			if (!rc->rc_skipped)
2129 				unexpected_errors++;
2130 
2131 			total_errors++;
2132 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
2133 			parity_untried++;
2134 		}
2135 	}
2136 
2137 	if (zio->io_type == ZIO_TYPE_WRITE) {
2138 		/*
2139 		 * XXX -- for now, treat partial writes as a success.
2140 		 * (If we couldn't write enough columns to reconstruct
2141 		 * the data, the I/O failed.  Otherwise, good enough.)
2142 		 *
2143 		 * Now that we support write reallocation, it would be better
2144 		 * to treat partial failure as real failure unless there are
2145 		 * no non-degraded top-level vdevs left, and not update DTLs
2146 		 * if we intend to reallocate.
2147 		 */
2148 		/* XXPOLICY */
2149 		if (total_errors > rm->rm_firstdatacol)
2150 			zio->io_error = vdev_raidz_worst_error(rm);
2151 
2152 		return;
2153 	}
2154 
2155 	ASSERT(zio->io_type == ZIO_TYPE_READ);
2156 	/*
2157 	 * There are three potential phases for a read:
2158 	 *	1. produce valid data from the columns read
2159 	 *	2. read all disks and try again
2160 	 *	3. perform combinatorial reconstruction
2161 	 *
2162 	 * Each phase is progressively both more expensive and less likely to
2163 	 * occur. If we encounter more errors than we can repair or all phases
2164 	 * fail, we have no choice but to return an error.
2165 	 */
2166 
2167 	/*
2168 	 * If the number of errors we saw was correctable -- less than or equal
2169 	 * to the number of parity disks read -- attempt to produce data that
2170 	 * has a valid checksum. Naturally, this case applies in the absence of
2171 	 * any errors.
2172 	 */
2173 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
2174 		if (data_errors == 0) {
2175 			if (raidz_checksum_verify(zio) == 0) {
2176 				/*
2177 				 * If we read parity information (unnecessarily
2178 				 * as it happens since no reconstruction was
2179 				 * needed) regenerate and verify the parity.
2180 				 * We also regenerate parity when resilvering
2181 				 * so we can write it out to the failed device
2182 				 * later.
2183 				 */
2184 				if (parity_errors + parity_untried <
2185 				    rm->rm_firstdatacol ||
2186 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2187 					n = raidz_parity_verify(zio, rm);
2188 					unexpected_errors += n;
2189 					ASSERT(parity_errors + n <=
2190 					    rm->rm_firstdatacol);
2191 				}
2192 				goto done;
2193 			}
2194 		} else {
2195 			/*
2196 			 * We either attempt to read all the parity columns or
2197 			 * none of them. If we didn't try to read parity, we
2198 			 * wouldn't be here in the correctable case. There must
2199 			 * also have been fewer parity errors than parity
2200 			 * columns or, again, we wouldn't be in this code path.
2201 			 */
2202 			ASSERT(parity_untried == 0);
2203 			ASSERT(parity_errors < rm->rm_firstdatacol);
2204 
2205 			/*
2206 			 * Identify the data columns that reported an error.
2207 			 */
2208 			n = 0;
2209 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
2210 				rc = &rm->rm_col[c];
2211 				if (rc->rc_error != 0) {
2212 					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2213 					tgts[n++] = c;
2214 				}
2215 			}
2216 
2217 			ASSERT(rm->rm_firstdatacol >= n);
2218 
2219 			code = vdev_raidz_reconstruct(rm, tgts, n);
2220 
2221 			if (raidz_checksum_verify(zio) == 0) {
2222 				atomic_inc_64(&raidz_corrected[code]);
2223 
2224 				/*
2225 				 * If we read more parity disks than were used
2226 				 * for reconstruction, confirm that the other
2227 				 * parity disks produced correct data. This
2228 				 * routine is suboptimal in that it regenerates
2229 				 * the parity that we already used in addition
2230 				 * to the parity that we're attempting to
2231 				 * verify, but this should be a relatively
2232 				 * uncommon case, and can be optimized if it
2233 				 * becomes a problem. Note that we regenerate
2234 				 * parity when resilvering so we can write it
2235 				 * out to failed devices later.
2236 				 */
2237 				if (parity_errors < rm->rm_firstdatacol - n ||
2238 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2239 					n = raidz_parity_verify(zio, rm);
2240 					unexpected_errors += n;
2241 					ASSERT(parity_errors + n <=
2242 					    rm->rm_firstdatacol);
2243 				}
2244 
2245 				goto done;
2246 			}
2247 		}
2248 	}
2249 
2250 	/*
2251 	 * This isn't a typical situation -- either we got a read error or
2252 	 * a child silently returned bad data. Read every block so we can
2253 	 * try again with as much data and parity as we can track down. If
2254 	 * we've already been through once before, all children will be marked
2255 	 * as tried so we'll proceed to combinatorial reconstruction.
2256 	 */
2257 	unexpected_errors = 1;
2258 	rm->rm_missingdata = 0;
2259 	rm->rm_missingparity = 0;
2260 
2261 	for (c = 0; c < rm->rm_cols; c++) {
2262 		if (rm->rm_col[c].rc_tried)
2263 			continue;
2264 
2265 		zio_vdev_io_redone(zio);
2266 		do {
2267 			rc = &rm->rm_col[c];
2268 			if (rc->rc_tried)
2269 				continue;
2270 			zio_nowait(zio_vdev_child_io(zio, NULL,
2271 			    vd->vdev_child[rc->rc_devidx],
2272 			    rc->rc_offset, rc->rc_data, rc->rc_size,
2273 			    zio->io_type, zio->io_priority, 0,
2274 			    vdev_raidz_child_done, rc));
2275 		} while (++c < rm->rm_cols);
2276 
2277 		return;
2278 	}
2279 
2280 	/*
2281 	 * At this point we've attempted to reconstruct the data given the
2282 	 * errors we detected, and we've attempted to read all columns. There
2283 	 * must, therefore, be one or more additional problems -- silent errors
2284 	 * resulting in invalid data rather than explicit I/O errors resulting
2285 	 * in absent data. We check if there is enough additional data to
2286 	 * possibly reconstruct the data and then perform combinatorial
2287 	 * reconstruction over all possible combinations. If that fails,
2288 	 * we're cooked.
2289 	 */
2290 	if (total_errors > rm->rm_firstdatacol) {
2291 		zio->io_error = vdev_raidz_worst_error(rm);
2292 
2293 	} else if (total_errors < rm->rm_firstdatacol &&
2294 	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2295 		/*
2296 		 * If we didn't use all the available parity for the
2297 		 * combinatorial reconstruction, verify that the remaining
2298 		 * parity is correct.
2299 		 */
2300 		if (code != (1 << rm->rm_firstdatacol) - 1)
2301 			(void) raidz_parity_verify(zio, rm);
2302 	} else {
2303 		/*
2304 		 * We're here because either:
2305 		 *
2306 		 *	total_errors == rm_first_datacol, or
2307 		 *	vdev_raidz_combrec() failed
2308 		 *
2309 		 * In either case, there is enough bad data to prevent
2310 		 * reconstruction.
2311 		 *
2312 		 * Start checksum ereports for all children which haven't
2313 		 * failed, and the IO wasn't speculative.
2314 		 */
2315 		zio->io_error = SET_ERROR(ECKSUM);
2316 
2317 		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2318 			for (c = 0; c < rm->rm_cols; c++) {
2319 				rc = &rm->rm_col[c];
2320 				if (rc->rc_error == 0) {
2321 					zio_bad_cksum_t zbc;
2322 					zbc.zbc_has_cksum = 0;
2323 					zbc.zbc_injected =
2324 					    rm->rm_ecksuminjected;
2325 
2326 					zfs_ereport_start_checksum(
2327 					    zio->io_spa,
2328 					    vd->vdev_child[rc->rc_devidx],
2329 					    zio, rc->rc_offset, rc->rc_size,
2330 					    (void *)(uintptr_t)c, &zbc);
2331 				}
2332 			}
2333 		}
2334 	}
2335 
2336 done:
2337 	zio_checksum_verified(zio);
2338 
2339 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2340 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2341 		/*
2342 		 * Use the good data we have in hand to repair damaged children.
2343 		 */
2344 		for (c = 0; c < rm->rm_cols; c++) {
2345 			rc = &rm->rm_col[c];
2346 			cvd = vd->vdev_child[rc->rc_devidx];
2347 
2348 			if (rc->rc_error == 0)
2349 				continue;
2350 
2351 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2352 			    rc->rc_offset, rc->rc_data, rc->rc_size,
2353 			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2354 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2355 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2356 		}
2357 	}
2358 }
2359 
2360 static void
2361 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2362 {
2363 	if (faulted > vd->vdev_nparity)
2364 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2365 		    VDEV_AUX_NO_REPLICAS);
2366 	else if (degraded + faulted != 0)
2367 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2368 	else
2369 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2370 }
2371 
2372 vdev_ops_t vdev_raidz_ops = {
2373 	vdev_raidz_open,
2374 	vdev_raidz_close,
2375 	vdev_raidz_asize,
2376 	vdev_raidz_io_start,
2377 	vdev_raidz_io_done,
2378 	vdev_raidz_state_change,
2379 	NULL,
2380 	NULL,
2381 	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
2382 	B_FALSE			/* not a leaf vdev */
2383 };
2384