xref: /illumos-gate/usr/src/uts/common/fs/zfs/vdev_raidz.c (revision b327cd3f3b4dab4f29e7140159b1e01ed2ceef2a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
25  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
26  * Copyright (c) 2014 Integros [integros.com]
27  */
28 
29 #include <sys/zfs_context.h>
30 #include <sys/spa.h>
31 #include <sys/vdev_impl.h>
32 #include <sys/vdev_disk.h>
33 #include <sys/vdev_file.h>
34 #include <sys/vdev_raidz.h>
35 #include <sys/zio.h>
36 #include <sys/zio_checksum.h>
37 #include <sys/fs/zfs.h>
38 #include <sys/fm/fs/zfs.h>
39 
40 /*
41  * Virtual device vector for RAID-Z.
42  *
43  * This vdev supports single, double, and triple parity. For single parity,
44  * we use a simple XOR of all the data columns. For double or triple parity,
45  * we use a special case of Reed-Solomon coding. This extends the
46  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
47  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
48  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
49  * former is also based. The latter is designed to provide higher performance
50  * for writes.
51  *
52  * Note that the Plank paper claimed to support arbitrary N+M, but was then
53  * amended six years later identifying a critical flaw that invalidates its
54  * claims. Nevertheless, the technique can be adapted to work for up to
55  * triple parity. For additional parity, the amendment "Note: Correction to
56  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
57  * is viable, but the additional complexity means that write performance will
58  * suffer.
59  *
60  * All of the methods above operate on a Galois field, defined over the
61  * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
62  * can be expressed with a single byte. Briefly, the operations on the
63  * field are defined as follows:
64  *
65  *   o addition (+) is represented by a bitwise XOR
66  *   o subtraction (-) is therefore identical to addition: A + B = A - B
67  *   o multiplication of A by 2 is defined by the following bitwise expression:
68  *
69  *	(A * 2)_7 = A_6
70  *	(A * 2)_6 = A_5
71  *	(A * 2)_5 = A_4
72  *	(A * 2)_4 = A_3 + A_7
73  *	(A * 2)_3 = A_2 + A_7
74  *	(A * 2)_2 = A_1 + A_7
75  *	(A * 2)_1 = A_0
76  *	(A * 2)_0 = A_7
77  *
78  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
79  * As an aside, this multiplication is derived from the error correcting
80  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
81  *
82  * Observe that any number in the field (except for 0) can be expressed as a
83  * power of 2 -- a generator for the field. We store a table of the powers of
84  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
85  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
86  * than field addition). The inverse of a field element A (A^-1) is therefore
87  * A ^ (255 - 1) = A^254.
88  *
89  * The up-to-three parity columns, P, Q, R over several data columns,
90  * D_0, ... D_n-1, can be expressed by field operations:
91  *
92  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
93  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
94  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
95  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
96  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
97  *
98  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival
99  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
100  * independent coefficients. (There are no additional coefficients that have
101  * this property which is why the uncorrected Plank method breaks down.)
102  *
103  * See the reconstruction code below for how P, Q and R can used individually
104  * or in concert to recover missing data columns.
105  */
106 
107 typedef struct raidz_col {
108 	uint64_t rc_devidx;		/* child device index for I/O */
109 	uint64_t rc_offset;		/* device offset */
110 	uint64_t rc_size;		/* I/O size */
111 	void *rc_data;			/* I/O data */
112 	void *rc_gdata;			/* used to store the "good" version */
113 	int rc_error;			/* I/O error for this device */
114 	uint8_t rc_tried;		/* Did we attempt this I/O column? */
115 	uint8_t rc_skipped;		/* Did we skip this I/O column? */
116 } raidz_col_t;
117 
118 typedef struct raidz_map {
119 	uint64_t rm_cols;		/* Regular column count */
120 	uint64_t rm_scols;		/* Count including skipped columns */
121 	uint64_t rm_bigcols;		/* Number of oversized columns */
122 	uint64_t rm_asize;		/* Actual total I/O size */
123 	uint64_t rm_missingdata;	/* Count of missing data devices */
124 	uint64_t rm_missingparity;	/* Count of missing parity devices */
125 	uint64_t rm_firstdatacol;	/* First data column/parity count */
126 	uint64_t rm_nskip;		/* Skipped sectors for padding */
127 	uint64_t rm_skipstart;		/* Column index of padding start */
128 	void *rm_datacopy;		/* rm_asize-buffer of copied data */
129 	uintptr_t rm_reports;		/* # of referencing checksum reports */
130 	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
131 	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
132 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
133 } raidz_map_t;
134 
135 #define	VDEV_RAIDZ_P		0
136 #define	VDEV_RAIDZ_Q		1
137 #define	VDEV_RAIDZ_R		2
138 
139 #define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
140 #define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
141 
142 /*
143  * We provide a mechanism to perform the field multiplication operation on a
144  * 64-bit value all at once rather than a byte at a time. This works by
145  * creating a mask from the top bit in each byte and using that to
146  * conditionally apply the XOR of 0x1d.
147  */
148 #define	VDEV_RAIDZ_64MUL_2(x, mask) \
149 { \
150 	(mask) = (x) & 0x8080808080808080ULL; \
151 	(mask) = ((mask) << 1) - ((mask) >> 7); \
152 	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
153 	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
154 }
155 
156 #define	VDEV_RAIDZ_64MUL_4(x, mask) \
157 { \
158 	VDEV_RAIDZ_64MUL_2((x), mask); \
159 	VDEV_RAIDZ_64MUL_2((x), mask); \
160 }
161 
162 #define	VDEV_LABEL_OFFSET(x)	(x + VDEV_LABEL_START_SIZE)
163 
164 /*
165  * Force reconstruction to use the general purpose method.
166  */
167 int vdev_raidz_default_to_general;
168 
169 /* Powers of 2 in the Galois field defined above. */
170 static const uint8_t vdev_raidz_pow2[256] = {
171 	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
172 	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
173 	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
174 	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
175 	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
176 	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
177 	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
178 	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
179 	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
180 	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
181 	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
182 	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
183 	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
184 	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
185 	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
186 	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
187 	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
188 	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
189 	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
190 	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
191 	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
192 	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
193 	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
194 	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
195 	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
196 	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
197 	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
198 	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
199 	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
200 	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
201 	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
202 	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
203 };
204 /* Logs of 2 in the Galois field defined above. */
205 static const uint8_t vdev_raidz_log2[256] = {
206 	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
207 	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
208 	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
209 	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
210 	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
211 	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
212 	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
213 	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
214 	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
215 	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
216 	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
217 	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
218 	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
219 	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
220 	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
221 	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
222 	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
223 	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
224 	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
225 	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
226 	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
227 	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
228 	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
229 	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
230 	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
231 	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
232 	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
233 	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
234 	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
235 	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
236 	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
237 	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
238 };
239 
240 static void vdev_raidz_generate_parity(raidz_map_t *rm);
241 
242 /*
243  * Multiply a given number by 2 raised to the given power.
244  */
245 static uint8_t
246 vdev_raidz_exp2(uint_t a, int exp)
247 {
248 	if (a == 0)
249 		return (0);
250 
251 	ASSERT(exp >= 0);
252 	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
253 
254 	exp += vdev_raidz_log2[a];
255 	if (exp > 255)
256 		exp -= 255;
257 
258 	return (vdev_raidz_pow2[exp]);
259 }
260 
261 static void
262 vdev_raidz_map_free(raidz_map_t *rm)
263 {
264 	int c;
265 	size_t size;
266 
267 	for (c = 0; c < rm->rm_firstdatacol; c++) {
268 		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
269 
270 		if (rm->rm_col[c].rc_gdata != NULL)
271 			zio_buf_free(rm->rm_col[c].rc_gdata,
272 			    rm->rm_col[c].rc_size);
273 	}
274 
275 	size = 0;
276 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
277 		size += rm->rm_col[c].rc_size;
278 
279 	if (rm->rm_datacopy != NULL)
280 		zio_buf_free(rm->rm_datacopy, size);
281 
282 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
283 }
284 
285 static void
286 vdev_raidz_map_free_vsd(zio_t *zio)
287 {
288 	raidz_map_t *rm = zio->io_vsd;
289 
290 	ASSERT0(rm->rm_freed);
291 	rm->rm_freed = 1;
292 
293 	if (rm->rm_reports == 0)
294 		vdev_raidz_map_free(rm);
295 }
296 
297 /*ARGSUSED*/
298 static void
299 vdev_raidz_cksum_free(void *arg, size_t ignored)
300 {
301 	raidz_map_t *rm = arg;
302 
303 	ASSERT3U(rm->rm_reports, >, 0);
304 
305 	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
306 		vdev_raidz_map_free(rm);
307 }
308 
309 static void
310 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
311 {
312 	raidz_map_t *rm = zcr->zcr_cbdata;
313 	size_t c = zcr->zcr_cbinfo;
314 	size_t x;
315 
316 	const char *good = NULL;
317 	const char *bad = rm->rm_col[c].rc_data;
318 
319 	if (good_data == NULL) {
320 		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
321 		return;
322 	}
323 
324 	if (c < rm->rm_firstdatacol) {
325 		/*
326 		 * The first time through, calculate the parity blocks for
327 		 * the good data (this relies on the fact that the good
328 		 * data never changes for a given logical ZIO)
329 		 */
330 		if (rm->rm_col[0].rc_gdata == NULL) {
331 			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
332 			char *buf;
333 
334 			/*
335 			 * Set up the rm_col[]s to generate the parity for
336 			 * good_data, first saving the parity bufs and
337 			 * replacing them with buffers to hold the result.
338 			 */
339 			for (x = 0; x < rm->rm_firstdatacol; x++) {
340 				bad_parity[x] = rm->rm_col[x].rc_data;
341 				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
342 				    zio_buf_alloc(rm->rm_col[x].rc_size);
343 			}
344 
345 			/* fill in the data columns from good_data */
346 			buf = (char *)good_data;
347 			for (; x < rm->rm_cols; x++) {
348 				rm->rm_col[x].rc_data = buf;
349 				buf += rm->rm_col[x].rc_size;
350 			}
351 
352 			/*
353 			 * Construct the parity from the good data.
354 			 */
355 			vdev_raidz_generate_parity(rm);
356 
357 			/* restore everything back to its original state */
358 			for (x = 0; x < rm->rm_firstdatacol; x++)
359 				rm->rm_col[x].rc_data = bad_parity[x];
360 
361 			buf = rm->rm_datacopy;
362 			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
363 				rm->rm_col[x].rc_data = buf;
364 				buf += rm->rm_col[x].rc_size;
365 			}
366 		}
367 
368 		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
369 		good = rm->rm_col[c].rc_gdata;
370 	} else {
371 		/* adjust good_data to point at the start of our column */
372 		good = good_data;
373 
374 		for (x = rm->rm_firstdatacol; x < c; x++)
375 			good += rm->rm_col[x].rc_size;
376 	}
377 
378 	/* we drop the ereport if it ends up that the data was good */
379 	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
380 }
381 
382 /*
383  * Invoked indirectly by zfs_ereport_start_checksum(), called
384  * below when our read operation fails completely.  The main point
385  * is to keep a copy of everything we read from disk, so that at
386  * vdev_raidz_cksum_finish() time we can compare it with the good data.
387  */
388 static void
389 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
390 {
391 	size_t c = (size_t)(uintptr_t)arg;
392 	caddr_t buf;
393 
394 	raidz_map_t *rm = zio->io_vsd;
395 	size_t size;
396 
397 	/* set up the report and bump the refcount  */
398 	zcr->zcr_cbdata = rm;
399 	zcr->zcr_cbinfo = c;
400 	zcr->zcr_finish = vdev_raidz_cksum_finish;
401 	zcr->zcr_free = vdev_raidz_cksum_free;
402 
403 	rm->rm_reports++;
404 	ASSERT3U(rm->rm_reports, >, 0);
405 
406 	if (rm->rm_datacopy != NULL)
407 		return;
408 
409 	/*
410 	 * It's the first time we're called for this raidz_map_t, so we need
411 	 * to copy the data aside; there's no guarantee that our zio's buffer
412 	 * won't be re-used for something else.
413 	 *
414 	 * Our parity data is already in separate buffers, so there's no need
415 	 * to copy them.
416 	 */
417 
418 	size = 0;
419 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
420 		size += rm->rm_col[c].rc_size;
421 
422 	buf = rm->rm_datacopy = zio_buf_alloc(size);
423 
424 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
425 		raidz_col_t *col = &rm->rm_col[c];
426 
427 		bcopy(col->rc_data, buf, col->rc_size);
428 		col->rc_data = buf;
429 
430 		buf += col->rc_size;
431 	}
432 	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
433 }
434 
435 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
436 	vdev_raidz_map_free_vsd,
437 	vdev_raidz_cksum_report
438 };
439 
440 /*
441  * Divides the IO evenly across all child vdevs; usually, dcols is
442  * the number of children in the target vdev.
443  */
444 static raidz_map_t *
445 vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset,
446     uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
447 {
448 	raidz_map_t *rm;
449 	/* The starting RAIDZ (parent) vdev sector of the block. */
450 	uint64_t b = offset >> unit_shift;
451 	/* The zio's size in units of the vdev's minimum sector size. */
452 	uint64_t s = size >> unit_shift;
453 	/* The first column for this stripe. */
454 	uint64_t f = b % dcols;
455 	/* The starting byte offset on each child vdev. */
456 	uint64_t o = (b / dcols) << unit_shift;
457 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
458 
459 	/*
460 	 * "Quotient": The number of data sectors for this stripe on all but
461 	 * the "big column" child vdevs that also contain "remainder" data.
462 	 */
463 	q = s / (dcols - nparity);
464 
465 	/*
466 	 * "Remainder": The number of partial stripe data sectors in this I/O.
467 	 * This will add a sector to some, but not all, child vdevs.
468 	 */
469 	r = s - q * (dcols - nparity);
470 
471 	/* The number of "big columns" - those which contain remainder data. */
472 	bc = (r == 0 ? 0 : r + nparity);
473 
474 	/*
475 	 * The total number of data and parity sectors associated with
476 	 * this I/O.
477 	 */
478 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
479 
480 	/* acols: The columns that will be accessed. */
481 	/* scols: The columns that will be accessed or skipped. */
482 	if (q == 0) {
483 		/* Our I/O request doesn't span all child vdevs. */
484 		acols = bc;
485 		scols = MIN(dcols, roundup(bc, nparity + 1));
486 	} else {
487 		acols = dcols;
488 		scols = dcols;
489 	}
490 
491 	ASSERT3U(acols, <=, scols);
492 
493 	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
494 
495 	rm->rm_cols = acols;
496 	rm->rm_scols = scols;
497 	rm->rm_bigcols = bc;
498 	rm->rm_skipstart = bc;
499 	rm->rm_missingdata = 0;
500 	rm->rm_missingparity = 0;
501 	rm->rm_firstdatacol = nparity;
502 	rm->rm_datacopy = NULL;
503 	rm->rm_reports = 0;
504 	rm->rm_freed = 0;
505 	rm->rm_ecksuminjected = 0;
506 
507 	asize = 0;
508 
509 	for (c = 0; c < scols; c++) {
510 		col = f + c;
511 		coff = o;
512 		if (col >= dcols) {
513 			col -= dcols;
514 			coff += 1ULL << unit_shift;
515 		}
516 		rm->rm_col[c].rc_devidx = col;
517 		rm->rm_col[c].rc_offset = coff;
518 		rm->rm_col[c].rc_data = NULL;
519 		rm->rm_col[c].rc_gdata = NULL;
520 		rm->rm_col[c].rc_error = 0;
521 		rm->rm_col[c].rc_tried = 0;
522 		rm->rm_col[c].rc_skipped = 0;
523 
524 		if (c >= acols)
525 			rm->rm_col[c].rc_size = 0;
526 		else if (c < bc)
527 			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
528 		else
529 			rm->rm_col[c].rc_size = q << unit_shift;
530 
531 		asize += rm->rm_col[c].rc_size;
532 	}
533 
534 	ASSERT3U(asize, ==, tot << unit_shift);
535 	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
536 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
537 	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
538 	ASSERT3U(rm->rm_nskip, <=, nparity);
539 
540 	for (c = 0; c < rm->rm_firstdatacol; c++)
541 		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
542 
543 	rm->rm_col[c].rc_data = data;
544 
545 	for (c = c + 1; c < acols; c++)
546 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
547 		    rm->rm_col[c - 1].rc_size;
548 
549 	/*
550 	 * If all data stored spans all columns, there's a danger that parity
551 	 * will always be on the same device and, since parity isn't read
552 	 * during normal operation, that that device's I/O bandwidth won't be
553 	 * used effectively. We therefore switch the parity every 1MB.
554 	 *
555 	 * ... at least that was, ostensibly, the theory. As a practical
556 	 * matter unless we juggle the parity between all devices evenly, we
557 	 * won't see any benefit. Further, occasional writes that aren't a
558 	 * multiple of the LCM of the number of children and the minimum
559 	 * stripe width are sufficient to avoid pessimal behavior.
560 	 * Unfortunately, this decision created an implicit on-disk format
561 	 * requirement that we need to support for all eternity, but only
562 	 * for single-parity RAID-Z.
563 	 *
564 	 * If we intend to skip a sector in the zeroth column for padding
565 	 * we must make sure to note this swap. We will never intend to
566 	 * skip the first column since at least one data and one parity
567 	 * column must appear in each row.
568 	 */
569 	ASSERT(rm->rm_cols >= 2);
570 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
571 
572 	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
573 		devidx = rm->rm_col[0].rc_devidx;
574 		o = rm->rm_col[0].rc_offset;
575 		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
576 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
577 		rm->rm_col[1].rc_devidx = devidx;
578 		rm->rm_col[1].rc_offset = o;
579 
580 		if (rm->rm_skipstart == 0)
581 			rm->rm_skipstart = 1;
582 	}
583 
584 	return (rm);
585 }
586 
587 static void
588 vdev_raidz_generate_parity_p(raidz_map_t *rm)
589 {
590 	uint64_t *p, *src, pcount, ccount, i;
591 	int c;
592 
593 	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
594 
595 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
596 		src = rm->rm_col[c].rc_data;
597 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
598 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
599 
600 		if (c == rm->rm_firstdatacol) {
601 			ASSERT(ccount == pcount);
602 			for (i = 0; i < ccount; i++, src++, p++) {
603 				*p = *src;
604 			}
605 		} else {
606 			ASSERT(ccount <= pcount);
607 			for (i = 0; i < ccount; i++, src++, p++) {
608 				*p ^= *src;
609 			}
610 		}
611 	}
612 }
613 
614 static void
615 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
616 {
617 	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
618 	int c;
619 
620 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
621 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
622 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
623 
624 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
625 		src = rm->rm_col[c].rc_data;
626 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
627 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
628 
629 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
630 
631 		if (c == rm->rm_firstdatacol) {
632 			ASSERT(ccnt == pcnt || ccnt == 0);
633 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
634 				*p = *src;
635 				*q = *src;
636 			}
637 			for (; i < pcnt; i++, src++, p++, q++) {
638 				*p = 0;
639 				*q = 0;
640 			}
641 		} else {
642 			ASSERT(ccnt <= pcnt);
643 
644 			/*
645 			 * Apply the algorithm described above by multiplying
646 			 * the previous result and adding in the new value.
647 			 */
648 			for (i = 0; i < ccnt; i++, src++, p++, q++) {
649 				*p ^= *src;
650 
651 				VDEV_RAIDZ_64MUL_2(*q, mask);
652 				*q ^= *src;
653 			}
654 
655 			/*
656 			 * Treat short columns as though they are full of 0s.
657 			 * Note that there's therefore nothing needed for P.
658 			 */
659 			for (; i < pcnt; i++, q++) {
660 				VDEV_RAIDZ_64MUL_2(*q, mask);
661 			}
662 		}
663 	}
664 }
665 
666 static void
667 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
668 {
669 	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
670 	int c;
671 
672 	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
673 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
674 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
675 	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
676 	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
677 
678 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
679 		src = rm->rm_col[c].rc_data;
680 		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
681 		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
682 		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
683 
684 		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
685 
686 		if (c == rm->rm_firstdatacol) {
687 			ASSERT(ccnt == pcnt || ccnt == 0);
688 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
689 				*p = *src;
690 				*q = *src;
691 				*r = *src;
692 			}
693 			for (; i < pcnt; i++, src++, p++, q++, r++) {
694 				*p = 0;
695 				*q = 0;
696 				*r = 0;
697 			}
698 		} else {
699 			ASSERT(ccnt <= pcnt);
700 
701 			/*
702 			 * Apply the algorithm described above by multiplying
703 			 * the previous result and adding in the new value.
704 			 */
705 			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
706 				*p ^= *src;
707 
708 				VDEV_RAIDZ_64MUL_2(*q, mask);
709 				*q ^= *src;
710 
711 				VDEV_RAIDZ_64MUL_4(*r, mask);
712 				*r ^= *src;
713 			}
714 
715 			/*
716 			 * Treat short columns as though they are full of 0s.
717 			 * Note that there's therefore nothing needed for P.
718 			 */
719 			for (; i < pcnt; i++, q++, r++) {
720 				VDEV_RAIDZ_64MUL_2(*q, mask);
721 				VDEV_RAIDZ_64MUL_4(*r, mask);
722 			}
723 		}
724 	}
725 }
726 
727 /*
728  * Generate RAID parity in the first virtual columns according to the number of
729  * parity columns available.
730  */
731 static void
732 vdev_raidz_generate_parity(raidz_map_t *rm)
733 {
734 	switch (rm->rm_firstdatacol) {
735 	case 1:
736 		vdev_raidz_generate_parity_p(rm);
737 		break;
738 	case 2:
739 		vdev_raidz_generate_parity_pq(rm);
740 		break;
741 	case 3:
742 		vdev_raidz_generate_parity_pqr(rm);
743 		break;
744 	default:
745 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
746 	}
747 }
748 
749 static int
750 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
751 {
752 	uint64_t *dst, *src, xcount, ccount, count, i;
753 	int x = tgts[0];
754 	int c;
755 
756 	ASSERT(ntgts == 1);
757 	ASSERT(x >= rm->rm_firstdatacol);
758 	ASSERT(x < rm->rm_cols);
759 
760 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
761 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
762 	ASSERT(xcount > 0);
763 
764 	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
765 	dst = rm->rm_col[x].rc_data;
766 	for (i = 0; i < xcount; i++, dst++, src++) {
767 		*dst = *src;
768 	}
769 
770 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
771 		src = rm->rm_col[c].rc_data;
772 		dst = rm->rm_col[x].rc_data;
773 
774 		if (c == x)
775 			continue;
776 
777 		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
778 		count = MIN(ccount, xcount);
779 
780 		for (i = 0; i < count; i++, dst++, src++) {
781 			*dst ^= *src;
782 		}
783 	}
784 
785 	return (1 << VDEV_RAIDZ_P);
786 }
787 
788 static int
789 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
790 {
791 	uint64_t *dst, *src, xcount, ccount, count, mask, i;
792 	uint8_t *b;
793 	int x = tgts[0];
794 	int c, j, exp;
795 
796 	ASSERT(ntgts == 1);
797 
798 	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
799 	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
800 
801 	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
802 		src = rm->rm_col[c].rc_data;
803 		dst = rm->rm_col[x].rc_data;
804 
805 		if (c == x)
806 			ccount = 0;
807 		else
808 			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
809 
810 		count = MIN(ccount, xcount);
811 
812 		if (c == rm->rm_firstdatacol) {
813 			for (i = 0; i < count; i++, dst++, src++) {
814 				*dst = *src;
815 			}
816 			for (; i < xcount; i++, dst++) {
817 				*dst = 0;
818 			}
819 
820 		} else {
821 			for (i = 0; i < count; i++, dst++, src++) {
822 				VDEV_RAIDZ_64MUL_2(*dst, mask);
823 				*dst ^= *src;
824 			}
825 
826 			for (; i < xcount; i++, dst++) {
827 				VDEV_RAIDZ_64MUL_2(*dst, mask);
828 			}
829 		}
830 	}
831 
832 	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
833 	dst = rm->rm_col[x].rc_data;
834 	exp = 255 - (rm->rm_cols - 1 - x);
835 
836 	for (i = 0; i < xcount; i++, dst++, src++) {
837 		*dst ^= *src;
838 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
839 			*b = vdev_raidz_exp2(*b, exp);
840 		}
841 	}
842 
843 	return (1 << VDEV_RAIDZ_Q);
844 }
845 
846 static int
847 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
848 {
849 	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
850 	void *pdata, *qdata;
851 	uint64_t xsize, ysize, i;
852 	int x = tgts[0];
853 	int y = tgts[1];
854 
855 	ASSERT(ntgts == 2);
856 	ASSERT(x < y);
857 	ASSERT(x >= rm->rm_firstdatacol);
858 	ASSERT(y < rm->rm_cols);
859 
860 	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
861 
862 	/*
863 	 * Move the parity data aside -- we're going to compute parity as
864 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
865 	 * reuse the parity generation mechanism without trashing the actual
866 	 * parity so we make those columns appear to be full of zeros by
867 	 * setting their lengths to zero.
868 	 */
869 	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
870 	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
871 	xsize = rm->rm_col[x].rc_size;
872 	ysize = rm->rm_col[y].rc_size;
873 
874 	rm->rm_col[VDEV_RAIDZ_P].rc_data =
875 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
876 	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
877 	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
878 	rm->rm_col[x].rc_size = 0;
879 	rm->rm_col[y].rc_size = 0;
880 
881 	vdev_raidz_generate_parity_pq(rm);
882 
883 	rm->rm_col[x].rc_size = xsize;
884 	rm->rm_col[y].rc_size = ysize;
885 
886 	p = pdata;
887 	q = qdata;
888 	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
889 	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
890 	xd = rm->rm_col[x].rc_data;
891 	yd = rm->rm_col[y].rc_data;
892 
893 	/*
894 	 * We now have:
895 	 *	Pxy = P + D_x + D_y
896 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
897 	 *
898 	 * We can then solve for D_x:
899 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
900 	 * where
901 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
902 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
903 	 *
904 	 * With D_x in hand, we can easily solve for D_y:
905 	 *	D_y = P + Pxy + D_x
906 	 */
907 
908 	a = vdev_raidz_pow2[255 + x - y];
909 	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
910 	tmp = 255 - vdev_raidz_log2[a ^ 1];
911 
912 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
913 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
914 
915 	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
916 		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
917 		    vdev_raidz_exp2(*q ^ *qxy, bexp);
918 
919 		if (i < ysize)
920 			*yd = *p ^ *pxy ^ *xd;
921 	}
922 
923 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
924 	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
925 	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
926 	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
927 
928 	/*
929 	 * Restore the saved parity data.
930 	 */
931 	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
932 	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
933 
934 	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
935 }
936 
937 /* BEGIN CSTYLED */
938 /*
939  * In the general case of reconstruction, we must solve the system of linear
940  * equations defined by the coeffecients used to generate parity as well as
941  * the contents of the data and parity disks. This can be expressed with
942  * vectors for the original data (D) and the actual data (d) and parity (p)
943  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
944  *
945  *            __   __                     __     __
946  *            |     |         __     __   |  p_0  |
947  *            |  V  |         |  D_0  |   | p_m-1 |
948  *            |     |    x    |   :   | = |  d_0  |
949  *            |  I  |         | D_n-1 |   |   :   |
950  *            |     |         ~~     ~~   | d_n-1 |
951  *            ~~   ~~                     ~~     ~~
952  *
953  * I is simply a square identity matrix of size n, and V is a vandermonde
954  * matrix defined by the coeffecients we chose for the various parity columns
955  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
956  * computation as well as linear separability.
957  *
958  *      __               __               __     __
959  *      |   1   ..  1 1 1 |               |  p_0  |
960  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
961  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
962  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
963  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
964  *      |   :       : : : |   |   :   |   |  d_2  |
965  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
966  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
967  *      |   0   ..  0 0 1 |               | d_n-1 |
968  *      ~~               ~~               ~~     ~~
969  *
970  * Note that I, V, d, and p are known. To compute D, we must invert the
971  * matrix and use the known data and parity values to reconstruct the unknown
972  * data values. We begin by removing the rows in V|I and d|p that correspond
973  * to failed or missing columns; we then make V|I square (n x n) and d|p
974  * sized n by removing rows corresponding to unused parity from the bottom up
975  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
976  * using Gauss-Jordan elimination. In the example below we use m=3 parity
977  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
978  *           __                               __
979  *           |  1   1   1   1   1   1   1   1  |
980  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
981  *           |  19 205 116  29  64  16  4   1  |      / /
982  *           |  1   0   0   0   0   0   0   0  |     / /
983  *           |  0   1   0   0   0   0   0   0  | <--' /
984  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
985  *           |  0   0   0   1   0   0   0   0  |
986  *           |  0   0   0   0   1   0   0   0  |
987  *           |  0   0   0   0   0   1   0   0  |
988  *           |  0   0   0   0   0   0   1   0  |
989  *           |  0   0   0   0   0   0   0   1  |
990  *           ~~                               ~~
991  *           __                               __
992  *           |  1   1   1   1   1   1   1   1  |
993  *           |  19 205 116  29  64  16  4   1  |
994  *           |  1   0   0   0   0   0   0   0  |
995  *  (V|I)' = |  0   0   0   1   0   0   0   0  |
996  *           |  0   0   0   0   1   0   0   0  |
997  *           |  0   0   0   0   0   1   0   0  |
998  *           |  0   0   0   0   0   0   1   0  |
999  *           |  0   0   0   0   0   0   0   1  |
1000  *           ~~                               ~~
1001  *
1002  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1003  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1004  * matrix is not singular.
1005  * __                                                                 __
1006  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1007  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1008  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1009  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1010  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1011  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1012  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1013  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1014  * ~~                                                                 ~~
1015  * __                                                                 __
1016  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1017  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1018  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1019  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1020  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1021  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1022  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1023  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1024  * ~~                                                                 ~~
1025  * __                                                                 __
1026  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1027  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1028  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1029  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1030  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1031  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1032  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1033  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1034  * ~~                                                                 ~~
1035  * __                                                                 __
1036  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1037  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1038  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1039  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1040  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1041  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1042  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1043  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1044  * ~~                                                                 ~~
1045  * __                                                                 __
1046  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1047  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1048  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1049  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1050  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1051  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1052  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1053  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1054  * ~~                                                                 ~~
1055  * __                                                                 __
1056  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1057  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1058  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1059  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1060  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1061  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1062  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1063  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1064  * ~~                                                                 ~~
1065  *                   __                               __
1066  *                   |  0   0   1   0   0   0   0   0  |
1067  *                   | 167 100  5   41 159 169 217 208 |
1068  *                   | 166 100  4   40 158 168 216 209 |
1069  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1070  *                   |  0   0   0   0   1   0   0   0  |
1071  *                   |  0   0   0   0   0   1   0   0  |
1072  *                   |  0   0   0   0   0   0   1   0  |
1073  *                   |  0   0   0   0   0   0   0   1  |
1074  *                   ~~                               ~~
1075  *
1076  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1077  * of the missing data.
1078  *
1079  * As is apparent from the example above, the only non-trivial rows in the
1080  * inverse matrix correspond to the data disks that we're trying to
1081  * reconstruct. Indeed, those are the only rows we need as the others would
1082  * only be useful for reconstructing data known or assumed to be valid. For
1083  * that reason, we only build the coefficients in the rows that correspond to
1084  * targeted columns.
1085  */
1086 /* END CSTYLED */
1087 
1088 static void
1089 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1090     uint8_t **rows)
1091 {
1092 	int i, j;
1093 	int pow;
1094 
1095 	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1096 
1097 	/*
1098 	 * Fill in the missing rows of interest.
1099 	 */
1100 	for (i = 0; i < nmap; i++) {
1101 		ASSERT3S(0, <=, map[i]);
1102 		ASSERT3S(map[i], <=, 2);
1103 
1104 		pow = map[i] * n;
1105 		if (pow > 255)
1106 			pow -= 255;
1107 		ASSERT(pow <= 255);
1108 
1109 		for (j = 0; j < n; j++) {
1110 			pow -= map[i];
1111 			if (pow < 0)
1112 				pow += 255;
1113 			rows[i][j] = vdev_raidz_pow2[pow];
1114 		}
1115 	}
1116 }
1117 
1118 static void
1119 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1120     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1121 {
1122 	int i, j, ii, jj;
1123 	uint8_t log;
1124 
1125 	/*
1126 	 * Assert that the first nmissing entries from the array of used
1127 	 * columns correspond to parity columns and that subsequent entries
1128 	 * correspond to data columns.
1129 	 */
1130 	for (i = 0; i < nmissing; i++) {
1131 		ASSERT3S(used[i], <, rm->rm_firstdatacol);
1132 	}
1133 	for (; i < n; i++) {
1134 		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1135 	}
1136 
1137 	/*
1138 	 * First initialize the storage where we'll compute the inverse rows.
1139 	 */
1140 	for (i = 0; i < nmissing; i++) {
1141 		for (j = 0; j < n; j++) {
1142 			invrows[i][j] = (i == j) ? 1 : 0;
1143 		}
1144 	}
1145 
1146 	/*
1147 	 * Subtract all trivial rows from the rows of consequence.
1148 	 */
1149 	for (i = 0; i < nmissing; i++) {
1150 		for (j = nmissing; j < n; j++) {
1151 			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1152 			jj = used[j] - rm->rm_firstdatacol;
1153 			ASSERT3S(jj, <, n);
1154 			invrows[i][j] = rows[i][jj];
1155 			rows[i][jj] = 0;
1156 		}
1157 	}
1158 
1159 	/*
1160 	 * For each of the rows of interest, we must normalize it and subtract
1161 	 * a multiple of it from the other rows.
1162 	 */
1163 	for (i = 0; i < nmissing; i++) {
1164 		for (j = 0; j < missing[i]; j++) {
1165 			ASSERT0(rows[i][j]);
1166 		}
1167 		ASSERT3U(rows[i][missing[i]], !=, 0);
1168 
1169 		/*
1170 		 * Compute the inverse of the first element and multiply each
1171 		 * element in the row by that value.
1172 		 */
1173 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1174 
1175 		for (j = 0; j < n; j++) {
1176 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1177 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1178 		}
1179 
1180 		for (ii = 0; ii < nmissing; ii++) {
1181 			if (i == ii)
1182 				continue;
1183 
1184 			ASSERT3U(rows[ii][missing[i]], !=, 0);
1185 
1186 			log = vdev_raidz_log2[rows[ii][missing[i]]];
1187 
1188 			for (j = 0; j < n; j++) {
1189 				rows[ii][j] ^=
1190 				    vdev_raidz_exp2(rows[i][j], log);
1191 				invrows[ii][j] ^=
1192 				    vdev_raidz_exp2(invrows[i][j], log);
1193 			}
1194 		}
1195 	}
1196 
1197 	/*
1198 	 * Verify that the data that is left in the rows are properly part of
1199 	 * an identity matrix.
1200 	 */
1201 	for (i = 0; i < nmissing; i++) {
1202 		for (j = 0; j < n; j++) {
1203 			if (j == missing[i]) {
1204 				ASSERT3U(rows[i][j], ==, 1);
1205 			} else {
1206 				ASSERT0(rows[i][j]);
1207 			}
1208 		}
1209 	}
1210 }
1211 
1212 static void
1213 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1214     int *missing, uint8_t **invrows, const uint8_t *used)
1215 {
1216 	int i, j, x, cc, c;
1217 	uint8_t *src;
1218 	uint64_t ccount;
1219 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1220 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1221 	uint8_t log = 0;
1222 	uint8_t val;
1223 	int ll;
1224 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1225 	uint8_t *p, *pp;
1226 	size_t psize;
1227 
1228 	psize = sizeof (invlog[0][0]) * n * nmissing;
1229 	p = kmem_alloc(psize, KM_SLEEP);
1230 
1231 	for (pp = p, i = 0; i < nmissing; i++) {
1232 		invlog[i] = pp;
1233 		pp += n;
1234 	}
1235 
1236 	for (i = 0; i < nmissing; i++) {
1237 		for (j = 0; j < n; j++) {
1238 			ASSERT3U(invrows[i][j], !=, 0);
1239 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1240 		}
1241 	}
1242 
1243 	for (i = 0; i < n; i++) {
1244 		c = used[i];
1245 		ASSERT3U(c, <, rm->rm_cols);
1246 
1247 		src = rm->rm_col[c].rc_data;
1248 		ccount = rm->rm_col[c].rc_size;
1249 		for (j = 0; j < nmissing; j++) {
1250 			cc = missing[j] + rm->rm_firstdatacol;
1251 			ASSERT3U(cc, >=, rm->rm_firstdatacol);
1252 			ASSERT3U(cc, <, rm->rm_cols);
1253 			ASSERT3U(cc, !=, c);
1254 
1255 			dst[j] = rm->rm_col[cc].rc_data;
1256 			dcount[j] = rm->rm_col[cc].rc_size;
1257 		}
1258 
1259 		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1260 
1261 		for (x = 0; x < ccount; x++, src++) {
1262 			if (*src != 0)
1263 				log = vdev_raidz_log2[*src];
1264 
1265 			for (cc = 0; cc < nmissing; cc++) {
1266 				if (x >= dcount[cc])
1267 					continue;
1268 
1269 				if (*src == 0) {
1270 					val = 0;
1271 				} else {
1272 					if ((ll = log + invlog[cc][i]) >= 255)
1273 						ll -= 255;
1274 					val = vdev_raidz_pow2[ll];
1275 				}
1276 
1277 				if (i == 0)
1278 					dst[cc][x] = val;
1279 				else
1280 					dst[cc][x] ^= val;
1281 			}
1282 		}
1283 	}
1284 
1285 	kmem_free(p, psize);
1286 }
1287 
1288 static int
1289 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1290 {
1291 	int n, i, c, t, tt;
1292 	int nmissing_rows;
1293 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1294 	int parity_map[VDEV_RAIDZ_MAXPARITY];
1295 
1296 	uint8_t *p, *pp;
1297 	size_t psize;
1298 
1299 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1300 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1301 	uint8_t *used;
1302 
1303 	int code = 0;
1304 
1305 
1306 	n = rm->rm_cols - rm->rm_firstdatacol;
1307 
1308 	/*
1309 	 * Figure out which data columns are missing.
1310 	 */
1311 	nmissing_rows = 0;
1312 	for (t = 0; t < ntgts; t++) {
1313 		if (tgts[t] >= rm->rm_firstdatacol) {
1314 			missing_rows[nmissing_rows++] =
1315 			    tgts[t] - rm->rm_firstdatacol;
1316 		}
1317 	}
1318 
1319 	/*
1320 	 * Figure out which parity columns to use to help generate the missing
1321 	 * data columns.
1322 	 */
1323 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1324 		ASSERT(tt < ntgts);
1325 		ASSERT(c < rm->rm_firstdatacol);
1326 
1327 		/*
1328 		 * Skip any targeted parity columns.
1329 		 */
1330 		if (c == tgts[tt]) {
1331 			tt++;
1332 			continue;
1333 		}
1334 
1335 		code |= 1 << c;
1336 
1337 		parity_map[i] = c;
1338 		i++;
1339 	}
1340 
1341 	ASSERT(code != 0);
1342 	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1343 
1344 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1345 	    nmissing_rows * n + sizeof (used[0]) * n;
1346 	p = kmem_alloc(psize, KM_SLEEP);
1347 
1348 	for (pp = p, i = 0; i < nmissing_rows; i++) {
1349 		rows[i] = pp;
1350 		pp += n;
1351 		invrows[i] = pp;
1352 		pp += n;
1353 	}
1354 	used = pp;
1355 
1356 	for (i = 0; i < nmissing_rows; i++) {
1357 		used[i] = parity_map[i];
1358 	}
1359 
1360 	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1361 		if (tt < nmissing_rows &&
1362 		    c == missing_rows[tt] + rm->rm_firstdatacol) {
1363 			tt++;
1364 			continue;
1365 		}
1366 
1367 		ASSERT3S(i, <, n);
1368 		used[i] = c;
1369 		i++;
1370 	}
1371 
1372 	/*
1373 	 * Initialize the interesting rows of the matrix.
1374 	 */
1375 	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1376 
1377 	/*
1378 	 * Invert the matrix.
1379 	 */
1380 	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1381 	    invrows, used);
1382 
1383 	/*
1384 	 * Reconstruct the missing data using the generated matrix.
1385 	 */
1386 	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1387 	    invrows, used);
1388 
1389 	kmem_free(p, psize);
1390 
1391 	return (code);
1392 }
1393 
1394 static int
1395 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1396 {
1397 	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1398 	int ntgts;
1399 	int i, c;
1400 	int code;
1401 	int nbadparity, nbaddata;
1402 	int parity_valid[VDEV_RAIDZ_MAXPARITY];
1403 
1404 	/*
1405 	 * The tgts list must already be sorted.
1406 	 */
1407 	for (i = 1; i < nt; i++) {
1408 		ASSERT(t[i] > t[i - 1]);
1409 	}
1410 
1411 	nbadparity = rm->rm_firstdatacol;
1412 	nbaddata = rm->rm_cols - nbadparity;
1413 	ntgts = 0;
1414 	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1415 		if (c < rm->rm_firstdatacol)
1416 			parity_valid[c] = B_FALSE;
1417 
1418 		if (i < nt && c == t[i]) {
1419 			tgts[ntgts++] = c;
1420 			i++;
1421 		} else if (rm->rm_col[c].rc_error != 0) {
1422 			tgts[ntgts++] = c;
1423 		} else if (c >= rm->rm_firstdatacol) {
1424 			nbaddata--;
1425 		} else {
1426 			parity_valid[c] = B_TRUE;
1427 			nbadparity--;
1428 		}
1429 	}
1430 
1431 	ASSERT(ntgts >= nt);
1432 	ASSERT(nbaddata >= 0);
1433 	ASSERT(nbaddata + nbadparity == ntgts);
1434 
1435 	dt = &tgts[nbadparity];
1436 
1437 	/*
1438 	 * See if we can use any of our optimized reconstruction routines.
1439 	 */
1440 	if (!vdev_raidz_default_to_general) {
1441 		switch (nbaddata) {
1442 		case 1:
1443 			if (parity_valid[VDEV_RAIDZ_P])
1444 				return (vdev_raidz_reconstruct_p(rm, dt, 1));
1445 
1446 			ASSERT(rm->rm_firstdatacol > 1);
1447 
1448 			if (parity_valid[VDEV_RAIDZ_Q])
1449 				return (vdev_raidz_reconstruct_q(rm, dt, 1));
1450 
1451 			ASSERT(rm->rm_firstdatacol > 2);
1452 			break;
1453 
1454 		case 2:
1455 			ASSERT(rm->rm_firstdatacol > 1);
1456 
1457 			if (parity_valid[VDEV_RAIDZ_P] &&
1458 			    parity_valid[VDEV_RAIDZ_Q])
1459 				return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1460 
1461 			ASSERT(rm->rm_firstdatacol > 2);
1462 
1463 			break;
1464 		}
1465 	}
1466 
1467 	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1468 	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1469 	ASSERT(code > 0);
1470 	return (code);
1471 }
1472 
1473 static int
1474 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1475     uint64_t *ashift)
1476 {
1477 	vdev_t *cvd;
1478 	uint64_t nparity = vd->vdev_nparity;
1479 	int c;
1480 	int lasterror = 0;
1481 	int numerrors = 0;
1482 
1483 	ASSERT(nparity > 0);
1484 
1485 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
1486 	    vd->vdev_children < nparity + 1) {
1487 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1488 		return (SET_ERROR(EINVAL));
1489 	}
1490 
1491 	vdev_open_children(vd);
1492 
1493 	for (c = 0; c < vd->vdev_children; c++) {
1494 		cvd = vd->vdev_child[c];
1495 
1496 		if (cvd->vdev_open_error != 0) {
1497 			lasterror = cvd->vdev_open_error;
1498 			numerrors++;
1499 			continue;
1500 		}
1501 
1502 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1503 		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1504 		*ashift = MAX(*ashift, cvd->vdev_ashift);
1505 	}
1506 
1507 	*asize *= vd->vdev_children;
1508 	*max_asize *= vd->vdev_children;
1509 
1510 	if (numerrors > nparity) {
1511 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1512 		return (lasterror);
1513 	}
1514 
1515 	return (0);
1516 }
1517 
1518 static void
1519 vdev_raidz_close(vdev_t *vd)
1520 {
1521 	int c;
1522 
1523 	for (c = 0; c < vd->vdev_children; c++)
1524 		vdev_close(vd->vdev_child[c]);
1525 }
1526 
1527 /*
1528  * Handle a read or write I/O to a RAID-Z dump device.
1529  *
1530  * The dump device is in a unique situation compared to other ZFS datasets:
1531  * writing to this device should be as simple and fast as possible.  In
1532  * addition, durability matters much less since the dump will be extracted
1533  * once the machine reboots.  For that reason, this function eschews parity for
1534  * performance and simplicity.  The dump device uses the checksum setting
1535  * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1536  * dataset.
1537  *
1538  * Blocks of size 128 KB have been preallocated for this volume.  I/Os less than
1539  * 128 KB will not fill an entire block; in addition, they may not be properly
1540  * aligned.  In that case, this function uses the preallocated 128 KB block and
1541  * omits reading or writing any "empty" portions of that block, as opposed to
1542  * allocating a fresh appropriately-sized block.
1543  *
1544  * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1545  *
1546  *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1547  *
1548  * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1549  * allocated which spans all five child vdevs.  8 KB of data would be written to
1550  * each of four vdevs, with the fifth containing the parity bits.
1551  *
1552  *       parity    data     data     data     data
1553  *     |   PP   |   XX   |   XX   |   XX   |   XX   |
1554  *         ^        ^        ^        ^        ^
1555  *         |        |        |        |        |
1556  *   8 KB parity    ------8 KB data blocks------
1557  *
1558  * However, when writing to the dump device, the behavior is different:
1559  *
1560  *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
1561  *
1562  * Unlike the normal RAID-Z case in which the block is allocated based on the
1563  * I/O size, reads and writes here always use a 128 KB logical I/O size.  If the
1564  * I/O size is less than 128 KB, only the actual portions of data are written.
1565  * In this example the data is written to the third data vdev since that vdev
1566  * contains the offset [64 KB, 96 KB).
1567  *
1568  *       parity    data     data     data     data
1569  *     |        |        |        |   XX   |        |
1570  *                                    ^
1571  *                                    |
1572  *                             32 KB data block
1573  *
1574  * As a result, an individual I/O may not span all child vdevs; moreover, a
1575  * small I/O may only operate on a single child vdev.
1576  *
1577  * Note that since there are no parity bits calculated or written, this format
1578  * remains the same no matter how many parity bits are used in a normal RAID-Z
1579  * stripe.  On a RAID-Z3 configuration with seven child vdevs, the example above
1580  * would look like:
1581  *
1582  *       parity   parity   parity    data     data     data     data
1583  *     |        |        |        |        |        |   XX   |        |
1584  *                                                      ^
1585  *                                                      |
1586  *                                               32 KB data block
1587  */
1588 int
1589 vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
1590     uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
1591 {
1592 	vdev_t *tvd = vd->vdev_top;
1593 	vdev_t *cvd;
1594 	raidz_map_t *rm;
1595 	raidz_col_t *rc;
1596 	int c, err = 0;
1597 
1598 	uint64_t start, end, colstart, colend;
1599 	uint64_t coloffset, colsize, colskip;
1600 
1601 	int flags = doread ? B_READ : B_WRITE;
1602 
1603 #ifdef	_KERNEL
1604 
1605 	/*
1606 	 * Don't write past the end of the block
1607 	 */
1608 	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
1609 
1610 	start = offset;
1611 	end = start + size;
1612 
1613 	/*
1614 	 * Allocate a RAID-Z map for this block.  Note that this block starts
1615 	 * from the "original" offset, this is, the offset of the extent which
1616 	 * contains the requisite offset of the data being read or written.
1617 	 *
1618 	 * Even if this I/O operation doesn't span the full block size, let's
1619 	 * treat the on-disk format as if the only blocks are the complete 128
1620 	 * KB size.
1621 	 */
1622 	rm = vdev_raidz_map_alloc(data - (offset - origoffset),
1623 	    SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
1624 	    vd->vdev_children, vd->vdev_nparity);
1625 
1626 	coloffset = origoffset;
1627 
1628 	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
1629 	    c++, coloffset += rc->rc_size) {
1630 		rc = &rm->rm_col[c];
1631 		cvd = vd->vdev_child[rc->rc_devidx];
1632 
1633 		/*
1634 		 * Find the start and end of this column in the RAID-Z map,
1635 		 * keeping in mind that the stated size and offset of the
1636 		 * operation may not fill the entire column for this vdev.
1637 		 *
1638 		 * If any portion of the data spans this column, issue the
1639 		 * appropriate operation to the vdev.
1640 		 */
1641 		if (coloffset + rc->rc_size <= start)
1642 			continue;
1643 		if (coloffset >= end)
1644 			continue;
1645 
1646 		colstart = MAX(coloffset, start);
1647 		colend = MIN(end, coloffset + rc->rc_size);
1648 		colsize = colend - colstart;
1649 		colskip = colstart - coloffset;
1650 
1651 		VERIFY3U(colsize, <=, rc->rc_size);
1652 		VERIFY3U(colskip, <=, rc->rc_size);
1653 
1654 		/*
1655 		 * Note that the child vdev will have a vdev label at the start
1656 		 * of its range of offsets, hence the need for
1657 		 * VDEV_LABEL_OFFSET().  See zio_vdev_child_io() for another
1658 		 * example of why this calculation is needed.
1659 		 */
1660 		if ((err = vdev_disk_physio(cvd,
1661 		    ((char *)rc->rc_data) + colskip, colsize,
1662 		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
1663 		    flags, isdump)) != 0)
1664 			break;
1665 	}
1666 
1667 	vdev_raidz_map_free(rm);
1668 #endif	/* KERNEL */
1669 
1670 	return (err);
1671 }
1672 
1673 static uint64_t
1674 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1675 {
1676 	uint64_t asize;
1677 	uint64_t ashift = vd->vdev_top->vdev_ashift;
1678 	uint64_t cols = vd->vdev_children;
1679 	uint64_t nparity = vd->vdev_nparity;
1680 
1681 	asize = ((psize - 1) >> ashift) + 1;
1682 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1683 	asize = roundup(asize, nparity + 1) << ashift;
1684 
1685 	return (asize);
1686 }
1687 
1688 static void
1689 vdev_raidz_child_done(zio_t *zio)
1690 {
1691 	raidz_col_t *rc = zio->io_private;
1692 
1693 	rc->rc_error = zio->io_error;
1694 	rc->rc_tried = 1;
1695 	rc->rc_skipped = 0;
1696 }
1697 
1698 /*
1699  * Start an IO operation on a RAIDZ VDev
1700  *
1701  * Outline:
1702  * - For write operations:
1703  *   1. Generate the parity data
1704  *   2. Create child zio write operations to each column's vdev, for both
1705  *      data and parity.
1706  *   3. If the column skips any sectors for padding, create optional dummy
1707  *      write zio children for those areas to improve aggregation continuity.
1708  * - For read operations:
1709  *   1. Create child zio read operations to each data column's vdev to read
1710  *      the range of data required for zio.
1711  *   2. If this is a scrub or resilver operation, or if any of the data
1712  *      vdevs have had errors, then create zio read operations to the parity
1713  *      columns' VDevs as well.
1714  */
1715 static void
1716 vdev_raidz_io_start(zio_t *zio)
1717 {
1718 	vdev_t *vd = zio->io_vd;
1719 	vdev_t *tvd = vd->vdev_top;
1720 	vdev_t *cvd;
1721 	raidz_map_t *rm;
1722 	raidz_col_t *rc;
1723 	int c, i;
1724 
1725 	rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
1726 	    tvd->vdev_ashift, vd->vdev_children,
1727 	    vd->vdev_nparity);
1728 
1729 	zio->io_vsd = rm;
1730 	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
1731 
1732 	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1733 
1734 	if (zio->io_type == ZIO_TYPE_WRITE) {
1735 		vdev_raidz_generate_parity(rm);
1736 
1737 		for (c = 0; c < rm->rm_cols; c++) {
1738 			rc = &rm->rm_col[c];
1739 			cvd = vd->vdev_child[rc->rc_devidx];
1740 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1741 			    rc->rc_offset, rc->rc_data, rc->rc_size,
1742 			    zio->io_type, zio->io_priority, 0,
1743 			    vdev_raidz_child_done, rc));
1744 		}
1745 
1746 		/*
1747 		 * Generate optional I/Os for any skipped sectors to improve
1748 		 * aggregation contiguity.
1749 		 */
1750 		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1751 			ASSERT(c <= rm->rm_scols);
1752 			if (c == rm->rm_scols)
1753 				c = 0;
1754 			rc = &rm->rm_col[c];
1755 			cvd = vd->vdev_child[rc->rc_devidx];
1756 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1757 			    rc->rc_offset + rc->rc_size, NULL,
1758 			    1 << tvd->vdev_ashift,
1759 			    zio->io_type, zio->io_priority,
1760 			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1761 		}
1762 
1763 		zio_execute(zio);
1764 		return;
1765 	}
1766 
1767 	ASSERT(zio->io_type == ZIO_TYPE_READ);
1768 
1769 	/*
1770 	 * Iterate over the columns in reverse order so that we hit the parity
1771 	 * last -- any errors along the way will force us to read the parity.
1772 	 */
1773 	for (c = rm->rm_cols - 1; c >= 0; c--) {
1774 		rc = &rm->rm_col[c];
1775 		cvd = vd->vdev_child[rc->rc_devidx];
1776 		if (!vdev_readable(cvd)) {
1777 			if (c >= rm->rm_firstdatacol)
1778 				rm->rm_missingdata++;
1779 			else
1780 				rm->rm_missingparity++;
1781 			rc->rc_error = SET_ERROR(ENXIO);
1782 			rc->rc_tried = 1;	/* don't even try */
1783 			rc->rc_skipped = 1;
1784 			continue;
1785 		}
1786 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1787 			if (c >= rm->rm_firstdatacol)
1788 				rm->rm_missingdata++;
1789 			else
1790 				rm->rm_missingparity++;
1791 			rc->rc_error = SET_ERROR(ESTALE);
1792 			rc->rc_skipped = 1;
1793 			continue;
1794 		}
1795 		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
1796 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1797 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1798 			    rc->rc_offset, rc->rc_data, rc->rc_size,
1799 			    zio->io_type, zio->io_priority, 0,
1800 			    vdev_raidz_child_done, rc));
1801 		}
1802 	}
1803 
1804 	zio_execute(zio);
1805 }
1806 
1807 
1808 /*
1809  * Report a checksum error for a child of a RAID-Z device.
1810  */
1811 static void
1812 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1813 {
1814 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1815 
1816 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1817 		zio_bad_cksum_t zbc;
1818 		raidz_map_t *rm = zio->io_vsd;
1819 
1820 		mutex_enter(&vd->vdev_stat_lock);
1821 		vd->vdev_stat.vs_checksum_errors++;
1822 		mutex_exit(&vd->vdev_stat_lock);
1823 
1824 		zbc.zbc_has_cksum = 0;
1825 		zbc.zbc_injected = rm->rm_ecksuminjected;
1826 
1827 		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1828 		    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1829 		    &zbc);
1830 	}
1831 }
1832 
1833 /*
1834  * We keep track of whether or not there were any injected errors, so that
1835  * any ereports we generate can note it.
1836  */
1837 static int
1838 raidz_checksum_verify(zio_t *zio)
1839 {
1840 	zio_bad_cksum_t zbc;
1841 	raidz_map_t *rm = zio->io_vsd;
1842 
1843 	int ret = zio_checksum_error(zio, &zbc);
1844 	if (ret != 0 && zbc.zbc_injected != 0)
1845 		rm->rm_ecksuminjected = 1;
1846 
1847 	return (ret);
1848 }
1849 
1850 /*
1851  * Generate the parity from the data columns. If we tried and were able to
1852  * read the parity without error, verify that the generated parity matches the
1853  * data we read. If it doesn't, we fire off a checksum error. Return the
1854  * number such failures.
1855  */
1856 static int
1857 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1858 {
1859 	void *orig[VDEV_RAIDZ_MAXPARITY];
1860 	int c, ret = 0;
1861 	raidz_col_t *rc;
1862 
1863 	blkptr_t *bp = zio->io_bp;
1864 	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
1865 	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
1866 
1867 	if (checksum == ZIO_CHECKSUM_NOPARITY)
1868 		return (ret);
1869 
1870 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1871 		rc = &rm->rm_col[c];
1872 		if (!rc->rc_tried || rc->rc_error != 0)
1873 			continue;
1874 		orig[c] = zio_buf_alloc(rc->rc_size);
1875 		bcopy(rc->rc_data, orig[c], rc->rc_size);
1876 	}
1877 
1878 	vdev_raidz_generate_parity(rm);
1879 
1880 	for (c = 0; c < rm->rm_firstdatacol; c++) {
1881 		rc = &rm->rm_col[c];
1882 		if (!rc->rc_tried || rc->rc_error != 0)
1883 			continue;
1884 		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1885 			raidz_checksum_error(zio, rc, orig[c]);
1886 			rc->rc_error = SET_ERROR(ECKSUM);
1887 			ret++;
1888 		}
1889 		zio_buf_free(orig[c], rc->rc_size);
1890 	}
1891 
1892 	return (ret);
1893 }
1894 
1895 /*
1896  * Keep statistics on all the ways that we used parity to correct data.
1897  */
1898 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1899 
1900 static int
1901 vdev_raidz_worst_error(raidz_map_t *rm)
1902 {
1903 	int error = 0;
1904 
1905 	for (int c = 0; c < rm->rm_cols; c++)
1906 		error = zio_worst_error(error, rm->rm_col[c].rc_error);
1907 
1908 	return (error);
1909 }
1910 
1911 /*
1912  * Iterate over all combinations of bad data and attempt a reconstruction.
1913  * Note that the algorithm below is non-optimal because it doesn't take into
1914  * account how reconstruction is actually performed. For example, with
1915  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1916  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1917  * cases we'd only use parity information in column 0.
1918  */
1919 static int
1920 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1921 {
1922 	raidz_map_t *rm = zio->io_vsd;
1923 	raidz_col_t *rc;
1924 	void *orig[VDEV_RAIDZ_MAXPARITY];
1925 	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1926 	int *tgts = &tstore[1];
1927 	int current, next, i, c, n;
1928 	int code, ret = 0;
1929 
1930 	ASSERT(total_errors < rm->rm_firstdatacol);
1931 
1932 	/*
1933 	 * This simplifies one edge condition.
1934 	 */
1935 	tgts[-1] = -1;
1936 
1937 	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1938 		/*
1939 		 * Initialize the targets array by finding the first n columns
1940 		 * that contain no error.
1941 		 *
1942 		 * If there were no data errors, we need to ensure that we're
1943 		 * always explicitly attempting to reconstruct at least one
1944 		 * data column. To do this, we simply push the highest target
1945 		 * up into the data columns.
1946 		 */
1947 		for (c = 0, i = 0; i < n; i++) {
1948 			if (i == n - 1 && data_errors == 0 &&
1949 			    c < rm->rm_firstdatacol) {
1950 				c = rm->rm_firstdatacol;
1951 			}
1952 
1953 			while (rm->rm_col[c].rc_error != 0) {
1954 				c++;
1955 				ASSERT3S(c, <, rm->rm_cols);
1956 			}
1957 
1958 			tgts[i] = c++;
1959 		}
1960 
1961 		/*
1962 		 * Setting tgts[n] simplifies the other edge condition.
1963 		 */
1964 		tgts[n] = rm->rm_cols;
1965 
1966 		/*
1967 		 * These buffers were allocated in previous iterations.
1968 		 */
1969 		for (i = 0; i < n - 1; i++) {
1970 			ASSERT(orig[i] != NULL);
1971 		}
1972 
1973 		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
1974 
1975 		current = 0;
1976 		next = tgts[current];
1977 
1978 		while (current != n) {
1979 			tgts[current] = next;
1980 			current = 0;
1981 
1982 			/*
1983 			 * Save off the original data that we're going to
1984 			 * attempt to reconstruct.
1985 			 */
1986 			for (i = 0; i < n; i++) {
1987 				ASSERT(orig[i] != NULL);
1988 				c = tgts[i];
1989 				ASSERT3S(c, >=, 0);
1990 				ASSERT3S(c, <, rm->rm_cols);
1991 				rc = &rm->rm_col[c];
1992 				bcopy(rc->rc_data, orig[i], rc->rc_size);
1993 			}
1994 
1995 			/*
1996 			 * Attempt a reconstruction and exit the outer loop on
1997 			 * success.
1998 			 */
1999 			code = vdev_raidz_reconstruct(rm, tgts, n);
2000 			if (raidz_checksum_verify(zio) == 0) {
2001 				atomic_inc_64(&raidz_corrected[code]);
2002 
2003 				for (i = 0; i < n; i++) {
2004 					c = tgts[i];
2005 					rc = &rm->rm_col[c];
2006 					ASSERT(rc->rc_error == 0);
2007 					if (rc->rc_tried)
2008 						raidz_checksum_error(zio, rc,
2009 						    orig[i]);
2010 					rc->rc_error = SET_ERROR(ECKSUM);
2011 				}
2012 
2013 				ret = code;
2014 				goto done;
2015 			}
2016 
2017 			/*
2018 			 * Restore the original data.
2019 			 */
2020 			for (i = 0; i < n; i++) {
2021 				c = tgts[i];
2022 				rc = &rm->rm_col[c];
2023 				bcopy(orig[i], rc->rc_data, rc->rc_size);
2024 			}
2025 
2026 			do {
2027 				/*
2028 				 * Find the next valid column after the current
2029 				 * position..
2030 				 */
2031 				for (next = tgts[current] + 1;
2032 				    next < rm->rm_cols &&
2033 				    rm->rm_col[next].rc_error != 0; next++)
2034 					continue;
2035 
2036 				ASSERT(next <= tgts[current + 1]);
2037 
2038 				/*
2039 				 * If that spot is available, we're done here.
2040 				 */
2041 				if (next != tgts[current + 1])
2042 					break;
2043 
2044 				/*
2045 				 * Otherwise, find the next valid column after
2046 				 * the previous position.
2047 				 */
2048 				for (c = tgts[current - 1] + 1;
2049 				    rm->rm_col[c].rc_error != 0; c++)
2050 					continue;
2051 
2052 				tgts[current] = c;
2053 				current++;
2054 
2055 			} while (current != n);
2056 		}
2057 	}
2058 	n--;
2059 done:
2060 	for (i = 0; i < n; i++) {
2061 		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
2062 	}
2063 
2064 	return (ret);
2065 }
2066 
2067 /*
2068  * Complete an IO operation on a RAIDZ VDev
2069  *
2070  * Outline:
2071  * - For write operations:
2072  *   1. Check for errors on the child IOs.
2073  *   2. Return, setting an error code if too few child VDevs were written
2074  *      to reconstruct the data later.  Note that partial writes are
2075  *      considered successful if they can be reconstructed at all.
2076  * - For read operations:
2077  *   1. Check for errors on the child IOs.
2078  *   2. If data errors occurred:
2079  *      a. Try to reassemble the data from the parity available.
2080  *      b. If we haven't yet read the parity drives, read them now.
2081  *      c. If all parity drives have been read but the data still doesn't
2082  *         reassemble with a correct checksum, then try combinatorial
2083  *         reconstruction.
2084  *      d. If that doesn't work, return an error.
2085  *   3. If there were unexpected errors or this is a resilver operation,
2086  *      rewrite the vdevs that had errors.
2087  */
2088 static void
2089 vdev_raidz_io_done(zio_t *zio)
2090 {
2091 	vdev_t *vd = zio->io_vd;
2092 	vdev_t *cvd;
2093 	raidz_map_t *rm = zio->io_vsd;
2094 	raidz_col_t *rc;
2095 	int unexpected_errors = 0;
2096 	int parity_errors = 0;
2097 	int parity_untried = 0;
2098 	int data_errors = 0;
2099 	int total_errors = 0;
2100 	int n, c;
2101 	int tgts[VDEV_RAIDZ_MAXPARITY];
2102 	int code;
2103 
2104 	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
2105 
2106 	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
2107 	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
2108 
2109 	for (c = 0; c < rm->rm_cols; c++) {
2110 		rc = &rm->rm_col[c];
2111 
2112 		if (rc->rc_error) {
2113 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
2114 
2115 			if (c < rm->rm_firstdatacol)
2116 				parity_errors++;
2117 			else
2118 				data_errors++;
2119 
2120 			if (!rc->rc_skipped)
2121 				unexpected_errors++;
2122 
2123 			total_errors++;
2124 		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
2125 			parity_untried++;
2126 		}
2127 	}
2128 
2129 	if (zio->io_type == ZIO_TYPE_WRITE) {
2130 		/*
2131 		 * XXX -- for now, treat partial writes as a success.
2132 		 * (If we couldn't write enough columns to reconstruct
2133 		 * the data, the I/O failed.  Otherwise, good enough.)
2134 		 *
2135 		 * Now that we support write reallocation, it would be better
2136 		 * to treat partial failure as real failure unless there are
2137 		 * no non-degraded top-level vdevs left, and not update DTLs
2138 		 * if we intend to reallocate.
2139 		 */
2140 		/* XXPOLICY */
2141 		if (total_errors > rm->rm_firstdatacol)
2142 			zio->io_error = vdev_raidz_worst_error(rm);
2143 
2144 		return;
2145 	}
2146 
2147 	ASSERT(zio->io_type == ZIO_TYPE_READ);
2148 	/*
2149 	 * There are three potential phases for a read:
2150 	 *	1. produce valid data from the columns read
2151 	 *	2. read all disks and try again
2152 	 *	3. perform combinatorial reconstruction
2153 	 *
2154 	 * Each phase is progressively both more expensive and less likely to
2155 	 * occur. If we encounter more errors than we can repair or all phases
2156 	 * fail, we have no choice but to return an error.
2157 	 */
2158 
2159 	/*
2160 	 * If the number of errors we saw was correctable -- less than or equal
2161 	 * to the number of parity disks read -- attempt to produce data that
2162 	 * has a valid checksum. Naturally, this case applies in the absence of
2163 	 * any errors.
2164 	 */
2165 	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
2166 		if (data_errors == 0) {
2167 			if (raidz_checksum_verify(zio) == 0) {
2168 				/*
2169 				 * If we read parity information (unnecessarily
2170 				 * as it happens since no reconstruction was
2171 				 * needed) regenerate and verify the parity.
2172 				 * We also regenerate parity when resilvering
2173 				 * so we can write it out to the failed device
2174 				 * later.
2175 				 */
2176 				if (parity_errors + parity_untried <
2177 				    rm->rm_firstdatacol ||
2178 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2179 					n = raidz_parity_verify(zio, rm);
2180 					unexpected_errors += n;
2181 					ASSERT(parity_errors + n <=
2182 					    rm->rm_firstdatacol);
2183 				}
2184 				goto done;
2185 			}
2186 		} else {
2187 			/*
2188 			 * We either attempt to read all the parity columns or
2189 			 * none of them. If we didn't try to read parity, we
2190 			 * wouldn't be here in the correctable case. There must
2191 			 * also have been fewer parity errors than parity
2192 			 * columns or, again, we wouldn't be in this code path.
2193 			 */
2194 			ASSERT(parity_untried == 0);
2195 			ASSERT(parity_errors < rm->rm_firstdatacol);
2196 
2197 			/*
2198 			 * Identify the data columns that reported an error.
2199 			 */
2200 			n = 0;
2201 			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
2202 				rc = &rm->rm_col[c];
2203 				if (rc->rc_error != 0) {
2204 					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2205 					tgts[n++] = c;
2206 				}
2207 			}
2208 
2209 			ASSERT(rm->rm_firstdatacol >= n);
2210 
2211 			code = vdev_raidz_reconstruct(rm, tgts, n);
2212 
2213 			if (raidz_checksum_verify(zio) == 0) {
2214 				atomic_inc_64(&raidz_corrected[code]);
2215 
2216 				/*
2217 				 * If we read more parity disks than were used
2218 				 * for reconstruction, confirm that the other
2219 				 * parity disks produced correct data. This
2220 				 * routine is suboptimal in that it regenerates
2221 				 * the parity that we already used in addition
2222 				 * to the parity that we're attempting to
2223 				 * verify, but this should be a relatively
2224 				 * uncommon case, and can be optimized if it
2225 				 * becomes a problem. Note that we regenerate
2226 				 * parity when resilvering so we can write it
2227 				 * out to failed devices later.
2228 				 */
2229 				if (parity_errors < rm->rm_firstdatacol - n ||
2230 				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2231 					n = raidz_parity_verify(zio, rm);
2232 					unexpected_errors += n;
2233 					ASSERT(parity_errors + n <=
2234 					    rm->rm_firstdatacol);
2235 				}
2236 
2237 				goto done;
2238 			}
2239 		}
2240 	}
2241 
2242 	/*
2243 	 * This isn't a typical situation -- either we got a read error or
2244 	 * a child silently returned bad data. Read every block so we can
2245 	 * try again with as much data and parity as we can track down. If
2246 	 * we've already been through once before, all children will be marked
2247 	 * as tried so we'll proceed to combinatorial reconstruction.
2248 	 */
2249 	unexpected_errors = 1;
2250 	rm->rm_missingdata = 0;
2251 	rm->rm_missingparity = 0;
2252 
2253 	for (c = 0; c < rm->rm_cols; c++) {
2254 		if (rm->rm_col[c].rc_tried)
2255 			continue;
2256 
2257 		zio_vdev_io_redone(zio);
2258 		do {
2259 			rc = &rm->rm_col[c];
2260 			if (rc->rc_tried)
2261 				continue;
2262 			zio_nowait(zio_vdev_child_io(zio, NULL,
2263 			    vd->vdev_child[rc->rc_devidx],
2264 			    rc->rc_offset, rc->rc_data, rc->rc_size,
2265 			    zio->io_type, zio->io_priority, 0,
2266 			    vdev_raidz_child_done, rc));
2267 		} while (++c < rm->rm_cols);
2268 
2269 		return;
2270 	}
2271 
2272 	/*
2273 	 * At this point we've attempted to reconstruct the data given the
2274 	 * errors we detected, and we've attempted to read all columns. There
2275 	 * must, therefore, be one or more additional problems -- silent errors
2276 	 * resulting in invalid data rather than explicit I/O errors resulting
2277 	 * in absent data. We check if there is enough additional data to
2278 	 * possibly reconstruct the data and then perform combinatorial
2279 	 * reconstruction over all possible combinations. If that fails,
2280 	 * we're cooked.
2281 	 */
2282 	if (total_errors > rm->rm_firstdatacol) {
2283 		zio->io_error = vdev_raidz_worst_error(rm);
2284 
2285 	} else if (total_errors < rm->rm_firstdatacol &&
2286 	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2287 		/*
2288 		 * If we didn't use all the available parity for the
2289 		 * combinatorial reconstruction, verify that the remaining
2290 		 * parity is correct.
2291 		 */
2292 		if (code != (1 << rm->rm_firstdatacol) - 1)
2293 			(void) raidz_parity_verify(zio, rm);
2294 	} else {
2295 		/*
2296 		 * We're here because either:
2297 		 *
2298 		 *	total_errors == rm_first_datacol, or
2299 		 *	vdev_raidz_combrec() failed
2300 		 *
2301 		 * In either case, there is enough bad data to prevent
2302 		 * reconstruction.
2303 		 *
2304 		 * Start checksum ereports for all children which haven't
2305 		 * failed, and the IO wasn't speculative.
2306 		 */
2307 		zio->io_error = SET_ERROR(ECKSUM);
2308 
2309 		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2310 			for (c = 0; c < rm->rm_cols; c++) {
2311 				rc = &rm->rm_col[c];
2312 				if (rc->rc_error == 0) {
2313 					zio_bad_cksum_t zbc;
2314 					zbc.zbc_has_cksum = 0;
2315 					zbc.zbc_injected =
2316 					    rm->rm_ecksuminjected;
2317 
2318 					zfs_ereport_start_checksum(
2319 					    zio->io_spa,
2320 					    vd->vdev_child[rc->rc_devidx],
2321 					    zio, rc->rc_offset, rc->rc_size,
2322 					    (void *)(uintptr_t)c, &zbc);
2323 				}
2324 			}
2325 		}
2326 	}
2327 
2328 done:
2329 	zio_checksum_verified(zio);
2330 
2331 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2332 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2333 		/*
2334 		 * Use the good data we have in hand to repair damaged children.
2335 		 */
2336 		for (c = 0; c < rm->rm_cols; c++) {
2337 			rc = &rm->rm_col[c];
2338 			cvd = vd->vdev_child[rc->rc_devidx];
2339 
2340 			if (rc->rc_error == 0)
2341 				continue;
2342 
2343 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2344 			    rc->rc_offset, rc->rc_data, rc->rc_size,
2345 			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2346 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2347 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2348 		}
2349 	}
2350 }
2351 
2352 static void
2353 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2354 {
2355 	if (faulted > vd->vdev_nparity)
2356 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2357 		    VDEV_AUX_NO_REPLICAS);
2358 	else if (degraded + faulted != 0)
2359 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2360 	else
2361 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2362 }
2363 
2364 vdev_ops_t vdev_raidz_ops = {
2365 	vdev_raidz_open,
2366 	vdev_raidz_close,
2367 	vdev_raidz_asize,
2368 	vdev_raidz_io_start,
2369 	vdev_raidz_io_done,
2370 	vdev_raidz_state_change,
2371 	NULL,
2372 	NULL,
2373 	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
2374 	B_FALSE			/* not a leaf vdev */
2375 };
2376