/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
 * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
#include <sys/vdev_raidz.h>
#include <sys/vdev_raidz_impl.h>
#include <sys/vdev_draid.h>

#ifdef ZFS_DEBUG
#include <sys/vdev.h>	/* For vdev_xlate() in vdev_raidz_io_verify() */
#endif

/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank, on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later to identify a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field with 2^N elements,
 * GF(2^N). In our case we choose N=8, i.e. GF(2^8), so that all elements
 * can be expressed with a single byte. Briefly, the operations on the
 * field are defined as follows:
 *
 *   o addition (+) is represented by a bitwise XOR
 *   o subtraction (-) is therefore identical to addition: A + B = A - B
 *   o multiplication of A by 2 is defined by the following bitwise expression:
 *
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). The inverse of a field element A (A^-1) is therefore
 * A ^ (255 - 1) = A^254.
 *
 * The up-to-three parity columns, P, Q, R over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property, which is why the uncorrected Plank method breaks down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */
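
/*
 * A small illustrative sketch (not part of the driver): multiplying two
 * field elements via the log/exp tables described above. Exponents are
 * taken mod 255 since 2^255 = 1. vdev_raidz_log2[] and vdev_raidz_pow2[]
 * are the 256-entry tables used throughout this file; the helper name
 * gf_mul is hypothetical.
 *
 *	static uint8_t
 *	gf_mul(uint8_t a, uint8_t b)
 *	{
 *		int l;
 *
 *		if (a == 0 || b == 0)
 *			return (0);
 *		l = vdev_raidz_log2[a] + vdev_raidz_log2[b];
 *		if (l >= 255)
 *			l -= 255;
 *		return (vdev_raidz_pow2[l]);
 *	}
 */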

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
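
/*
 * A worked example: applying VDEV_RAIDZ_64MUL_2 to eight 0x80 bytes at once.
 * mask = 0x8080808080808080, and ((mask << 1) - (mask >> 7)) yields
 * 0xffffffffffffffff, so every byte is XORed with 0x1d, while
 * ((x << 1) & 0xfefe...) discards the bits shifted across byte boundaries.
 * The result is 0x1d1d1d1d1d1d1d1d -- in each byte, 0x80 * 2 = 0x1d,
 * exactly what VDEV_RAIDZ_MUL_2() computes one byte at a time.
 */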

static void
vdev_raidz_row_free(raidz_row_t *rr)
{
	for (int c = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];

		if (rc->rc_size != 0)
			abd_free(rc->rc_abd);
		if (rc->rc_orig_data != NULL)
			abd_free(rc->rc_orig_data);
	}

	if (rr->rr_abd_empty != NULL)
		abd_free(rr->rr_abd_empty);

	kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols]));
}

void
vdev_raidz_map_free(raidz_map_t *rm)
{
	for (int i = 0; i < rm->rm_nrows; i++)
		vdev_raidz_row_free(rm->rm_row[i]);

	kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
}

static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	vdev_raidz_map_free(rm);
}

const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	.vsd_free = vdev_raidz_map_free_vsd,
};

/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 *
 * Avoid inlining the function to keep vdev_raidz_io_start(), which
 * is this function's only caller, as small as possible on the stack.
 */
noinline raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
    uint64_t nparity)
{
	raidz_row_t *rr;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = zio->io_offset >> ashift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = zio->io_size >> ashift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << ashift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	raidz_map_t *rm =
	    kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
	rm->rm_nrows = 1;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/*
	 * acols: The columns that will be accessed.
	 * scols: The columns that will be accessed or skipped.
	 */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}
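
	/*
	 * A worked example, assuming a 5-wide RAIDZ1 (dcols = 5,
	 * nparity = 1) and an I/O of s = 10 sectors: q = 10 / 4 = 2 and
	 * r = 10 - 2 * 4 = 2, so the bc = 3 "big columns" each hold
	 * q + 1 = 3 sectors while the remaining two hold q = 2. The map
	 * covers tot = 10 + 1 * (2 + 1) = 13 sectors (3*3 + 2*2 = 13
	 * checks out), and since q > 0, acols = scols = 5.
	 */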

	ASSERT3U(acols, <=, scols);

	rr = kmem_alloc(offsetof(raidz_row_t, rr_col[scols]), KM_SLEEP);
	rm->rm_row[0] = rr;

	rr->rr_cols = acols;
	rr->rr_scols = scols;
	rr->rr_bigcols = bc;
	rr->rr_missingdata = 0;
	rr->rr_missingparity = 0;
	rr->rr_firstdatacol = nparity;
	rr->rr_abd_empty = NULL;
	rr->rr_nempty = 0;
#ifdef ZFS_DEBUG
	rr->rr_offset = zio->io_offset;
	rr->rr_size = zio->io_size;
#endif

	asize = 0;

	for (c = 0; c < scols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << ashift;
		}
		rc->rc_devidx = col;
		rc->rc_offset = coff;
		rc->rc_abd = NULL;
		rc->rc_orig_data = NULL;
		rc->rc_error = 0;
		rc->rc_tried = 0;
		rc->rc_skipped = 0;
		rc->rc_repair = 0;
		rc->rc_need_orig_restore = B_FALSE;

		if (c >= acols)
			rc->rc_size = 0;
		else if (c < bc)
			rc->rc_size = (q + 1) << ashift;
		else
			rc->rc_size = q << ashift;

		asize += rc->rc_size;
	}

	ASSERT3U(asize, ==, tot << ashift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	rm->rm_skipstart = bc;

	for (c = 0; c < rr->rr_firstdatacol; c++)
		rr->rr_col[c].rc_abd =
		    abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);

	for (uint64_t off = 0; c < acols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
		    zio->io_abd, off, rc->rc_size);
		off += rc->rc_size;
	}

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rr->rr_cols >= 2);
	ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);

	if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
		devidx = rr->rr_col[0].rc_devidx;
		o = rr->rr_col[0].rc_offset;
		rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
		rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
		rr->rr_col[1].rc_devidx = devidx;
		rr->rr_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	/* init RAIDZ parity ops */
	rm->rm_ops = vdev_raidz_math_get_ops();

	return (rm);
}

struct pqr_struct {
	uint64_t *p;
	uint64_t *q;
	uint64_t *r;
};
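
/*
 * The p/pq/pqr functions below process the data columns in order and update
 * the running parity using Horner's scheme: before each data word is XORed
 * in, the Q (and R) accumulators are first multiplied by 2 (and 4). Across
 * the columns this computes Q = ((D_0 * 2 + D_1) * 2 + ...) * 2 + D_n-1,
 * matching the formulas in the block comment at the top of this file.
 */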

static int
vdev_raidz_p_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && !pqr->q && !pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++)
		*pqr->p ^= *src;

	return (0);
}

static int
vdev_raidz_pq_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	uint64_t mask;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && pqr->q && !pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
		*pqr->p ^= *src;
		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
		*pqr->q ^= *src;
	}

	return (0);
}

static int
vdev_raidz_pqr_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	uint64_t mask;
	int i, cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && pqr->q && pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
		*pqr->p ^= *src;
		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
		*pqr->q ^= *src;
		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
		*pqr->r ^= *src;
	}

	return (0);
}

static void
vdev_raidz_generate_parity_p(raidz_row_t *rr)
{
	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);

	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
		abd_t *src = rr->rr_col[c].rc_abd;

		if (c == rr->rr_firstdatacol) {
			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
		} else {
			struct pqr_struct pqr = { p, NULL, NULL };
			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
			    vdev_raidz_p_func, &pqr);
		}
	}
}

static void
vdev_raidz_generate_parity_pq(raidz_row_t *rr)
{
	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
	uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
	uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
	    rr->rr_col[VDEV_RAIDZ_Q].rc_size);

	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
		abd_t *src = rr->rr_col[c].rc_abd;

		uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);

		if (c == rr->rr_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
			(void) memcpy(q, p, rr->rr_col[c].rc_size);

			for (uint64_t i = ccnt; i < pcnt; i++) {
				p[i] = 0;
				q[i] = 0;
			}
		} else {
			struct pqr_struct pqr = { p, q, NULL };

			ASSERT(ccnt <= pcnt);
			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
			    vdev_raidz_pq_func, &pqr);

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			uint64_t mask;
			for (uint64_t i = ccnt; i < pcnt; i++) {
				VDEV_RAIDZ_64MUL_2(q[i], mask);
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pqr(raidz_row_t *rr)
{
	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
	uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
	uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd);
	uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
	    rr->rr_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
	    rr->rr_col[VDEV_RAIDZ_R].rc_size);

	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
		abd_t *src = rr->rr_col[c].rc_abd;

		uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);

		if (c == rr->rr_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
			(void) memcpy(q, p, rr->rr_col[c].rc_size);
			(void) memcpy(r, p, rr->rr_col[c].rc_size);

			for (uint64_t i = ccnt; i < pcnt; i++) {
				p[i] = 0;
				q[i] = 0;
				r[i] = 0;
			}
		} else {
			struct pqr_struct pqr = { p, q, r };

			ASSERT(ccnt <= pcnt);
			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
			    vdev_raidz_pqr_func, &pqr);

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			uint64_t mask;
			for (uint64_t i = ccnt; i < pcnt; i++) {
				VDEV_RAIDZ_64MUL_2(q[i], mask);
				VDEV_RAIDZ_64MUL_4(r[i], mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
void
vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr)
{
	ASSERT3U(rr->rr_cols, !=, 0);

	/* Generate using the new math implementation */
	if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL)
		return;

	switch (rr->rr_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rr);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rr);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rr);
		break;
	default:
		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
	}
}

void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	for (int i = 0; i < rm->rm_nrows; i++) {
		raidz_row_t *rr = rm->rm_row[i];
		vdev_raidz_generate_parity_row(rm, rr);
	}
}

/* ARGSUSED */
static int
vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
{
	uint64_t *dst = dbuf;
	uint64_t *src = sbuf;
	int cnt = size / sizeof (src[0]);

	for (int i = 0; i < cnt; i++) {
		dst[i] ^= src[i];
	}

	return (0);
}

/* ARGSUSED */
static int
vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
    void *private)
{
	uint64_t *dst = dbuf;
	uint64_t *src = sbuf;
	uint64_t mask;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++, src++) {
		VDEV_RAIDZ_64MUL_2(*dst, mask);
		*dst ^= *src;
	}

	return (0);
}

/* ARGSUSED */
static int
vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
{
	uint64_t *dst = buf;
	uint64_t mask;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++) {
		/* same operation as vdev_raidz_reconst_q_pre_func() on dst */
		VDEV_RAIDZ_64MUL_2(*dst, mask);
	}

	return (0);
}

struct reconst_q_struct {
	uint64_t *q;
	int exp;
};

static int
vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
{
	struct reconst_q_struct *rq = private;
	uint64_t *dst = buf;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++, rq->q++) {
		int j;
		uint8_t *b;

		*dst ^= *rq->q;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, rq->exp);
		}
	}

	return (0);
}

struct reconst_pq_struct {
	uint8_t *p;
	uint8_t *q;
	uint8_t *pxy;
	uint8_t *qxy;
	int aexp;
	int bexp;
};

static int
vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
{
	struct reconst_pq_struct *rpq = private;
	uint8_t *xd = xbuf;
	uint8_t *yd = ybuf;

	for (int i = 0; i < size;
	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
		*yd = *rpq->p ^ *rpq->pxy ^ *xd;
	}

	return (0);
}

static int
vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
{
	struct reconst_pq_struct *rpq = private;
	uint8_t *xd = xbuf;

	for (int i = 0; i < size;
	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
		/* same operation as vdev_raidz_reconst_pq_func() on xd */
		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
	}

	return (0);
}

static void
vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts)
{
	int x = tgts[0];
	abd_t *dst, *src;

	ASSERT3U(ntgts, ==, 1);
	ASSERT3U(x, >=, rr->rr_firstdatacol);
	ASSERT3U(x, <, rr->rr_cols);

	ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size);

	src = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
	dst = rr->rr_col[x].rc_abd;

	abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size);

	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
		uint64_t size = MIN(rr->rr_col[x].rc_size,
		    rr->rr_col[c].rc_size);

		src = rr->rr_col[c].rc_abd;

		if (c == x)
			continue;

		(void) abd_iterate_func2(dst, src, 0, 0, size,
		    vdev_raidz_reconst_p_func, NULL);
	}
}

static void
vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts)
{
	int x = tgts[0];
	int c, exp;
	abd_t *dst, *src;

	ASSERT(ntgts == 1);

	ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
		uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size,
		    rr->rr_col[c].rc_size);

		src = rr->rr_col[c].rc_abd;
		dst = rr->rr_col[x].rc_abd;

		if (c == rr->rr_firstdatacol) {
			abd_copy(dst, src, size);
			if (rr->rr_col[x].rc_size > size) {
				abd_zero_off(dst, size,
				    rr->rr_col[x].rc_size - size);
			}
		} else {
			ASSERT3U(size, <=, rr->rr_col[x].rc_size);
			(void) abd_iterate_func2(dst, src, 0, 0, size,
			    vdev_raidz_reconst_q_pre_func, NULL);
			(void) abd_iterate_func(dst,
			    size, rr->rr_col[x].rc_size - size,
			    vdev_raidz_reconst_q_pre_tail_func, NULL);
		}
	}

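	/*
	 * At this point dst holds the Horner-style sum of
	 * 2^(rr_cols - 1 - c) * D_c over all data columns c != x.
	 * XORing in Q below leaves 2^(rr_cols - 1 - x) * D_x, so
	 * multiplying by the inverse coefficient -- 2^exp with
	 * exp = 255 - (rr_cols - 1 - x) -- recovers D_x.
	 */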
	src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
	dst = rr->rr_col[x].rc_abd;
	exp = 255 - (rr->rr_cols - 1 - x);

	struct reconst_q_struct rq = { abd_to_buf(src), exp };
	(void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size,
	    vdev_raidz_reconst_q_post_func, &rq);
}

static void
vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
	abd_t *pdata, *qdata;
	uint64_t xsize, ysize;
	int x = tgts[0];
	int y = tgts[1];
	abd_t *xd, *yd;

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rr->rr_firstdatacol);
	ASSERT(y < rr->rr_cols);

	ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
	qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
	xsize = rr->rr_col[x].rc_size;
	ysize = rr->rr_col[y].rc_size;

	rr->rr_col[VDEV_RAIDZ_P].rc_abd =
	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
	rr->rr_col[VDEV_RAIDZ_Q].rc_abd =
	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
	rr->rr_col[x].rc_size = 0;
	rr->rr_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rr);

	rr->rr_col[x].rc_size = xsize;
	rr->rr_col[y].rc_size = ysize;

	p = abd_to_buf(pdata);
	q = abd_to_buf(qdata);
	pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
	qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
	xd = rr->rr_col[x].rc_abd;
	yd = rr->rr_col[y].rc_abd;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^-(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

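	/*
	 * Deriving A and B: substituting D_y = (P + Pxy) + D_x into the
	 * Qxy equation gives
	 *	Q + Qxy = 2^(ndevs - 1 - x) * D_x +
	 *	    2^(ndevs - 1 - y) * ((P + Pxy) + D_x)
	 * and solving for D_x yields the coefficients above, where a
	 * negative exponent denotes a field inverse, 2^-k = 2^(255 - k),
	 * since 2^255 = 1.
	 */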
	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	ASSERT3U(xsize, >=, ysize);
	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };

	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
	    vdev_raidz_reconst_pq_func, &rpq);
	(void) abd_iterate_func(xd, ysize, xsize - ysize,
	    vdev_raidz_reconst_pq_tail_func, &rpq);

	abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
	abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);

	/*
	 * Restore the saved parity data.
	 */
	rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata;
	rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata;
}

/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity, speedy
 * computation, and linear separability.
 *
 *      __               __               __     __
 *      |   1   ..  1 1 1 |               |  p_0  |
 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
 *      |   :       : : : |   |   :   |   |  d_2  |
 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
 *      |   0   ..  0 0 1 |               | d_n-1 |
 *      ~~               ~~               ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
 *           |  19 205 116  29  64  16  4   1  |      / /
 *           |  1   0   0   0   0   0   0   0  |     / /
 *           |  0   1   0   0   0   0   0   0  | <--' /
 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
 *           |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           | 128  64  32  16  8   4   2   1  |
 *           |  19 205 116  29  64  16  4   1  |
 *           |  1   0   0   0   0   0   0   0  |
 *           |  0   1   0   0   0   0   0   0  |
 *  (V|I)' = |  0   0   1   0   0   0   0   0  |
 *           |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 * __                                                                 __
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
 * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 *                   __                               __
 *                   |  0   0   1   0   0   0   0   0  |
 *                   | 167 100  5   41 159 169 217 208 |
 *                   | 166 100  4   40 158 168 216 209 |
 *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
 *                   |  0   0   0   0   1   0   0   0  |
 *                   |  0   0   0   0   0   1   0   0  |
 *                   |  0   0   0   0   0   0   1   0  |
 *                   |  0   0   0   0   0   0   0   1  |
 *                   ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */

static void
vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rr->rr_cols - rr->rr_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

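		/*
		 * map[i] selects the parity flavor (0 = P, 1 = Q, 2 = R),
		 * i.e. the generator g = 2^map[i] (1, 2, or 4). The loop
		 * below emits that row of the Vandermonde matrix,
		 * [g^(n-1) ... g^1 g^0], via the pow2 table.
		 */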
		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}

static void
vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rr->rr_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rr->rr_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rr->rr_firstdatacol);
			jj = used[j] - rr->rr_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data left in the rows is properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}

static void
vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL };
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 };
	uint8_t log = 0;
	uint8_t val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rr->rr_cols);

		ccount = rr->rr_col[c].rc_size;
		ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0);
		if (ccount == 0)
			continue;
		src = abd_to_buf(rr->rr_col[c].rc_abd);
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rr->rr_firstdatacol;
			ASSERT3U(cc, >=, rr->rr_firstdatacol);
			ASSERT3U(cc, <, rr->rr_cols);
			ASSERT3U(cc, !=, c);

			dcount[j] = rr->rr_col[cc].rc_size;
			if (dcount[j] != 0)
				dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd);
		}

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}

static void
vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;
	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	abd_t **bufs = NULL;

	/*
	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
	 * temporary linear ABDs if any non-linear ABDs are found.
	 */
	for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) {
		if (!abd_is_linear(rr->rr_col[i].rc_abd)) {
			bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *),
			    KM_PUSHPAGE);

			for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
				raidz_col_t *col = &rr->rr_col[c];

				bufs[c] = col->rc_abd;
				if (bufs[c] != NULL) {
					col->rc_abd = abd_alloc_linear(
					    col->rc_size, B_TRUE);
					abd_copy(col->rc_abd, bufs[c],
					    col->rc_size);
				}
			}

			break;
		}
	}

	n = rr->rr_cols - rr->rr_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rr->rr_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rr->rr_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rr->rr_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		parity_map[i] = c;
		i++;
	}

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rr->rr_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	/*
	 * copy back from temporary linear abds and free them
	 */
	if (bufs) {
		for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
			raidz_col_t *col = &rr->rr_col[c];

			if (bufs[c] != NULL) {
				abd_copy(bufs[c], col->rc_abd, col->rc_size);
				abd_free(col->rc_abd);
			}
			col->rc_abd = bufs[c];
		}
		kmem_free(bufs, rr->rr_cols * sizeof (abd_t *));
	}
}

static void
vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
    const int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c, ret;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	nbadparity = rr->rr_firstdatacol;
	nbaddata = rr->rr_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rr->rr_cols; c++) {
		if (c < rr->rr_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rr->rr_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rr->rr_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	dt = &tgts[nbadparity];

	/* Reconstruct using the new math implementation */
	ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata);
	if (ret != RAIDZ_ORIGINAL_IMPL)
		return;

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	switch (nbaddata) {
	case 1:
		if (parity_valid[VDEV_RAIDZ_P]) {
			vdev_raidz_reconstruct_p(rr, dt, 1);
			return;
		}

		ASSERT(rr->rr_firstdatacol > 1);

		if (parity_valid[VDEV_RAIDZ_Q]) {
			vdev_raidz_reconstruct_q(rr, dt, 1);
			return;
		}

		ASSERT(rr->rr_firstdatacol > 2);
		break;

	case 2:
		ASSERT(rr->rr_firstdatacol > 1);

		if (parity_valid[VDEV_RAIDZ_P] &&
		    parity_valid[VDEV_RAIDZ_Q]) {
			vdev_raidz_reconstruct_pq(rr, dt, 2);
			return;
		}

		ASSERT(rr->rr_firstdatacol > 2);

		break;
	}

	vdev_raidz_reconstruct_general(rr, tgts, ntgts);
}

static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	vdev_raidz_t *vdrz = vd->vdev_tsd;
	uint64_t nparity = vdrz->vd_nparity;
	int c;
	int lasterror = 0;
	int numerrors = 0;

	ASSERT(nparity > 0);

	if (nparity > VDEV_RAIDZ_MAXPARITY ||
	    vd->vdev_children < nparity + 1) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	vdev_open_children(vd);

	for (c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error != 0) {
			lasterror = cvd->vdev_open_error;
			numerrors++;
			continue;
		}

		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
		*physical_ashift = MAX(*physical_ashift,
		    cvd->vdev_physical_ashift);
	}

	*asize *= vd->vdev_children;
	*max_asize *= vd->vdev_children;

	if (numerrors > nparity) {
		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}

static void
vdev_raidz_close(vdev_t *vd)
{
	for (int c = 0; c < vd->vdev_children; c++) {
		if (vd->vdev_child[c] != NULL)
			vdev_close(vd->vdev_child[c]);
	}
}

static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
	vdev_raidz_t *vdrz = vd->vdev_tsd;
	uint64_t asize;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	uint64_t cols = vdrz->vd_logical_width;
	uint64_t nparity = vdrz->vd_nparity;

	asize = ((psize - 1) >> ashift) + 1;
	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
	asize = roundup(asize, nparity + 1) << ashift;

	return (asize);
}
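
/*
 * For example, with a 5-wide RAIDZ1 (cols = 5, nparity = 1) and
 * psize = 10 sectors: asize starts at 10, grows by the parity overhead
 * 1 * ((10 + 3) / 4) = 3 to 13, and is then rounded up to a multiple of
 * (nparity + 1) = 2, i.e. 14 sectors -- the 13 sectors of the map plus
 * one skip sector, matching vdev_raidz_map_alloc() above.
 */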

/*
 * The allocatable space for a raidz vdev is N * sizeof(smallest child),
 * so each child must provide at least 1/Nth of its asize.
 */
static uint64_t
vdev_raidz_min_asize(vdev_t *vd)
{
	return ((vd->vdev_min_asize + vd->vdev_children - 1) /
	    vd->vdev_children);
}

void
vdev_raidz_child_done(zio_t *zio)
{
	raidz_col_t *rc = zio->io_private;

	rc->rc_error = zio->io_error;
	rc->rc_tried = 1;
	rc->rc_skipped = 0;
}

static void
vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col)
{
#ifdef ZFS_DEBUG
	vdev_t *tvd = vd->vdev_top;

	range_seg64_t logical_rs, physical_rs, remain_rs;
	logical_rs.rs_start = rr->rr_offset;
	logical_rs.rs_end = logical_rs.rs_start +
	    vdev_raidz_asize(vd, rr->rr_size);

	raidz_col_t *rc = &rr->rr_col[col];
	vdev_t *cvd = vd->vdev_child[rc->rc_devidx];

	vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
	ASSERT(vdev_xlate_is_empty(&remain_rs));
	ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
	ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
	/*
	 * It would be nice to assert that rs_end is equal
	 * to rc_offset + rc_size, but there might be an
	 * optional I/O at the end that is not accounted for
	 * in rc_size.
	 */
	if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
		    rc->rc_size + (1 << tvd->vdev_ashift));
	} else {
		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
	}
#endif
}

static void
vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift)
{
	vdev_t *vd = zio->io_vd;
	raidz_map_t *rm = zio->io_vsd;
	int c, i;

	vdev_raidz_generate_parity_row(rm, rr);

	for (int c = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		if (rc->rc_size == 0)
			continue;

		/* Verify physical to logical translation */
		vdev_raidz_io_verify(vd, rr, c);

		zio_nowait(zio_vdev_child_io(zio, NULL,
		    vd->vdev_child[rc->rc_devidx], rc->rc_offset,
		    rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority,
		    0, vdev_raidz_child_done, rc));
	}

	/*
	 * Generate optional I/Os for skip sectors to improve aggregation
	 * contiguity.
	 */
	for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
		ASSERT(c <= rr->rr_scols);
		if (c == rr->rr_scols)
			c = 0;

		raidz_col_t *rc = &rr->rr_col[c];
		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];

		zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
		    rc->rc_offset + rc->rc_size, NULL, 1ULL << ashift,
		    zio->io_type, zio->io_priority,
		    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
	}
}

static void
vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr)
{
	vdev_t *vd = zio->io_vd;

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (int c = rr->rr_cols - 1; c >= 0; c--) {
		raidz_col_t *rc = &rr->rr_col[c];
		if (rc->rc_size == 0)
			continue;
		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
		if (!vdev_readable(cvd)) {
			if (c >= rr->rr_firstdatacol)
				rr->rr_missingdata++;
			else
				rr->rr_missingparity++;
			rc->rc_error = SET_ERROR(ENXIO);
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rr->rr_firstdatacol)
				rr->rr_missingdata++;
			else
				rr->rr_missingparity++;
			rc->rc_error = SET_ERROR(ESTALE);
			rc->rc_skipped = 1;
			continue;
		}
		if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}
}

/*
 * Start an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Generate the parity data
 *   2. Create child zio write operations to each column's vdev, for both
 *      data and parity.
 *   3. If the column skips any sectors for padding, create optional dummy
 *      write zio children for those areas to improve aggregation contiguity.
 * - For read operations:
 *   1. Create child zio read operations to each data column's vdev to read
 *      the range of data required for zio.
 *   2. If this is a scrub or resilver operation, or if any of the data
 *      vdevs have had errors, then create zio read operations to the parity
 *      columns' VDevs as well.
 */
static void
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_raidz_t *vdrz = vd->vdev_tsd;

	raidz_map_t *rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift,
	    vdrz->vd_logical_width, vdrz->vd_nparity);
	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;

	/*
	 * Until raidz expansion is implemented, all maps for a raidz vdev
	 * contain a single row.
	 */
	ASSERT3U(rm->rm_nrows, ==, 1);
	raidz_row_t *rr = rm->rm_row[0];

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_io_start_write(zio, rr, tvd->vdev_ashift);
	} else {
		ASSERT(zio->io_type == ZIO_TYPE_READ);
		vdev_raidz_io_start_read(zio, rr);
	}

	zio_execute(zio);
}

/*
 * Report a checksum error for a child of a RAID-Z device.
 */
static void
raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
{
	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];

	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) &&
	    zio->io_priority != ZIO_PRIORITY_REBUILD) {
		zio_bad_cksum_t zbc;
		raidz_map_t *rm = zio->io_vsd;

		zbc.zbc_has_cksum = 0;
		zbc.zbc_injected = rm->rm_ecksuminjected;

		(void) zfs_ereport_post_checksum(zio->io_spa, vd,
		    &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
		    rc->rc_abd, bad_data, &zbc);
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_checksum_errors++;
		mutex_exit(&vd->vdev_stat_lock);
	}
}

/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(zio_t *zio)
{
	zio_bad_cksum_t zbc;
	raidz_map_t *rm = zio->io_vsd;

	bzero(&zbc, sizeof (zio_bad_cksum_t));

	int ret = zio_checksum_error(zio, &zbc);
	if (ret != 0 && zbc.zbc_injected != 0)
		rm->rm_ecksuminjected = 1;

	return (ret);
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(zio_t *zio, raidz_row_t *rr)
{
	abd_t *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;

	blkptr_t *bp = zio->io_bp;
	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));

	if (checksum == ZIO_CHECKSUM_NOPARITY)
		return (ret);

	for (c = 0; c < rr->rr_firstdatacol; c++) {
		rc = &rr->rr_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;

		orig[c] = abd_alloc_sametype(rc->rc_abd, rc->rc_size);
		abd_copy(orig[c], rc->rc_abd, rc->rc_size);
	}

1727 	/*
1728 	 * This regenerates parity even for !tried || rc_error != 0 columns.
1729 	 * That isn't harmful, but it has the side effect of silently fixing
1730 	 * damage we hadn't detected (i.e. even when we return 0).
1731 	 */
1732 	vdev_raidz_generate_parity_row(rm, rr);
1733 
1734 	for (c = 0; c < rr->rr_firstdatacol; c++) {
1735 		rc = &rr->rr_col[c];
1736 
1737 		if (!rc->rc_tried || rc->rc_error != 0)
1738 			continue;
1739 
1740 		if (abd_cmp(orig[c], rc->rc_abd) != 0) {
1741 			raidz_checksum_error(zio, rc, orig[c]);
1742 			rc->rc_error = SET_ERROR(ECKSUM);
1743 			ret++;
1744 		}
1745 		abd_free(orig[c]);
1746 	}
1747 
1748 	return (ret);
1749 }
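
/*
 * Illustrative sketch (not part of the driver): for single parity, the
 * verify-by-regeneration scheme above reduces to recomputing the XOR of
 * the data columns and comparing it with the parity that was read.  The
 * name xor_parity_verify() and the plain-buffer interface are
 * hypothetical stand-ins for the ABD-based code; memcmp() is assumed to
 * be available in this context.
 */
static int
xor_parity_verify(const unsigned char **data, int ndata,
    const unsigned char *parity_read, unsigned char *scratch, size_t size)
{
	/* Regenerate single parity: P = D0 ^ D1 ^ ... ^ D(ndata-1). */
	for (size_t i = 0; i < size; i++) {
		unsigned char p = 0;
		for (int c = 0; c < ndata; c++)
			p ^= data[c][i];
		scratch[i] = p;
	}

	/*
	 * Compare the regenerated parity against what was read; return
	 * the number of mismatching parity columns (0 or 1 here), as
	 * raidz_parity_verify() does.
	 */
	return (memcmp(scratch, parity_read, size) == 0 ? 0 : 1);
}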
1750 
1751 static int
1752 vdev_raidz_worst_error(raidz_row_t *rr)
1753 {
1754 	int error = 0;
1755 
1756 	for (int c = 0; c < rr->rr_cols; c++)
1757 		error = zio_worst_error(error, rr->rr_col[c].rc_error);
1758 
1759 	return (error);
1760 }
1761 
1762 static void
1763 vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
1764 {
1765 	int unexpected_errors = 0;
1766 	int parity_errors = 0;
1767 	int parity_untried = 0;
1768 	int data_errors = 0;
1769 
1770 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
1771 
1772 	for (int c = 0; c < rr->rr_cols; c++) {
1773 		raidz_col_t *rc = &rr->rr_col[c];
1774 
1775 		if (rc->rc_error) {
1776 			if (c < rr->rr_firstdatacol)
1777 				parity_errors++;
1778 			else
1779 				data_errors++;
1780 
1781 			if (!rc->rc_skipped)
1782 				unexpected_errors++;
1783 		} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
1784 			parity_untried++;
1785 		}
1786 	}
1787 
1788 	/*
1789 	 * If we read more parity disks than were used for
1790 	 * reconstruction, confirm that the other parity disks produced
1791 	 * correct data.
1792 	 *
1793 	 * Note that we also regenerate parity when resilvering so we
1794 	 * can write it out to failed devices later.
1795 	 */
1796 	if (parity_errors + parity_untried <
1797 	    rr->rr_firstdatacol - data_errors ||
1798 	    (zio->io_flags & ZIO_FLAG_RESILVER)) {
1799 		int n = raidz_parity_verify(zio, rr);
1800 		unexpected_errors += n;
1801 		ASSERT3U(parity_errors + n, <=, rr->rr_firstdatacol);
1802 	}
1803 
1804 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
1805 	    (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) {
1806 		/*
1807 		 * Use the good data we have in hand to repair damaged children.
1808 		 */
1809 		for (int c = 0; c < rr->rr_cols; c++) {
1810 			raidz_col_t *rc = &rr->rr_col[c];
1811 			vdev_t *vd = zio->io_vd;
1812 			vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
1813 
1814 			if ((rc->rc_error == 0 || rc->rc_size == 0) &&
1815 			    (rc->rc_repair == 0)) {
1816 				continue;
1817 			}
1818 
1819 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1820 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
1821 			    ZIO_TYPE_WRITE,
1822 			    zio->io_priority == ZIO_PRIORITY_REBUILD ?
1823 			    ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
1824 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
1825 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
1826 		}
1827 	}
1828 }
1829 
1830 static void
1831 raidz_restore_orig_data(raidz_map_t *rm)
1832 {
1833 	for (int i = 0; i < rm->rm_nrows; i++) {
1834 		raidz_row_t *rr = rm->rm_row[i];
1835 		for (int c = 0; c < rr->rr_cols; c++) {
1836 			raidz_col_t *rc = &rr->rr_col[c];
1837 			if (rc->rc_need_orig_restore) {
1838 				abd_copy(rc->rc_abd,
1839 				    rc->rc_orig_data, rc->rc_size);
1840 				rc->rc_need_orig_restore = B_FALSE;
1841 			}
1842 		}
1843 	}
1844 }
1845 
1846 /*
1847  * Returns EINVAL if reconstruction of the block will not be possible.
1848  * Returns ECKSUM if this specific reconstruction failed.
1849  * Returns 0 on successful reconstruction.
1850  */
1851 static int
1852 raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
1853 {
1854 	raidz_map_t *rm = zio->io_vsd;
1855 
1856 	/* Reconstruct each row */
1857 	for (int r = 0; r < rm->rm_nrows; r++) {
1858 		raidz_row_t *rr = rm->rm_row[r];
1859 		int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is column index */
1860 		int t = 0;
1861 		int dead = 0;
1862 		int dead_data = 0;
1863 
1864 		for (int c = 0; c < rr->rr_cols; c++) {
1865 			raidz_col_t *rc = &rr->rr_col[c];
1866 			ASSERT0(rc->rc_need_orig_restore);
1867 			if (rc->rc_error != 0) {
1868 				dead++;
1869 				if (c >= nparity)
1870 					dead_data++;
1871 				continue;
1872 			}
1873 			if (rc->rc_size == 0)
1874 				continue;
1875 			for (int lt = 0; lt < ntgts; lt++) {
1876 				if (rc->rc_devidx == ltgts[lt]) {
1877 					if (rc->rc_orig_data == NULL) {
1878 						rc->rc_orig_data =
1879 						    abd_alloc_linear(
1880 						    rc->rc_size, B_TRUE);
1881 						abd_copy(rc->rc_orig_data,
1882 						    rc->rc_abd, rc->rc_size);
1883 					}
1884 					rc->rc_need_orig_restore = B_TRUE;
1885 
1886 					dead++;
1887 					if (c >= nparity)
1888 						dead_data++;
1889 					my_tgts[t++] = c;
1890 					break;
1891 				}
1892 			}
1893 		}
1894 		if (dead > nparity) {
1895 			/* reconstruction not possible */
1896 			raidz_restore_orig_data(rm);
1897 			return (EINVAL);
1898 		}
1899 		if (dead_data > 0)
1900 			vdev_raidz_reconstruct_row(rm, rr, my_tgts, t);
1901 	}
1902 
1903 	/* Check for success */
1904 	if (raidz_checksum_verify(zio) == 0) {
1905 
1906 		/* Reconstruction succeeded - report errors */
1907 		for (int i = 0; i < rm->rm_nrows; i++) {
1908 			raidz_row_t *rr = rm->rm_row[i];
1909 
1910 			for (int c = 0; c < rr->rr_cols; c++) {
1911 				raidz_col_t *rc = &rr->rr_col[c];
1912 				if (rc->rc_need_orig_restore) {
1913 					/*
1914 					 * Note: if this is a parity column,
1915 					 * we don't really know if it's wrong.
1916 					 * We need to let
1917 					 * vdev_raidz_io_done_verified() check
1918 					 * it, and if we set rc_error, it will
1919 					 * think that it is a "known" error
1920 					 * that doesn't need to be checked
1921 					 * or corrected.
1922 					 */
1923 					if (rc->rc_error == 0 &&
1924 					    c >= rr->rr_firstdatacol) {
1925 						raidz_checksum_error(zio,
1926 						    rc, rc->rc_orig_data);
1927 						rc->rc_error =
1928 						    SET_ERROR(ECKSUM);
1929 					}
1930 					rc->rc_need_orig_restore = B_FALSE;
1931 				}
1932 			}
1933 
1934 			vdev_raidz_io_done_verified(zio, rr);
1935 		}
1936 
1937 		zio_checksum_verified(zio);
1938 
1939 		return (0);
1940 	}
1941 
1942 	/* Reconstruction failed - restore original data */
1943 	raidz_restore_orig_data(rm);
1944 	return (ECKSUM);
1945 }
1946 
1947 /*
1948  * Iterate over all combinations of N bad vdevs and attempt a reconstruction.
1949  * Note that the algorithm below is non-optimal because it doesn't take into
1950  * account how reconstruction is actually performed. For example, with
1951  * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1952  * is targeted as invalid as if columns 1 and 4 are targeted since in both
1953  * cases we'd only use parity information in column 0.
1954  *
1955  * The order that we find the various possible combinations of failed
1956  * disks is dictated by these rules:
1957  * - Examine each "slot" (the "i" in tgts[i])
1958  *   - Try to increment this slot (tgts[i] = tgts[i] + 1)
1959  *   - if we can't increment because it runs into the next slot,
1960  *     reset our slot to the minimum, and examine the next slot
1961  *
1962  *  For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose
1963  *  3 columns to reconstruct), we will generate the following sequence:
1964  *
1965  *  STATE        ACTION
1966  *  0 1 2        special case: skip since these are all parity
1967  *  0 1   3      first slot: reset to 0; middle slot: increment to 2
1968  *  0   2 3      first slot: increment to 1
1969  *    1 2 3      first: reset to 0; middle: reset to 1; last: increment to 4
1970  *  0 1     4    first: reset to 0; middle: increment to 2
1971  *  0   2   4    first: increment to 1
1972  *    1 2   4    first: reset to 0; middle: increment to 3
1973  *  0     3 4    first: increment to 1
1974  *    1   3 4    first: increment to 2
1975  *      2 3 4    first: reset to 0; middle: reset to 1; last: increment to 5
1976  *  0 1       5  first: reset to 0; middle: increment to 2
1977  *  0   2     5  first: increment to 1
1978  *    1 2     5  first: reset to 0; middle: increment to 3
1979  *  0     3   5  first: increment to 1
1980  *    1   3   5  first: increment to 2
1981  *      2 3   5  first: reset to 0; middle: increment to 4
1982  *  0       4 5  first: increment to 1
1983  *    1     4 5  first: increment to 2
1984  *      2   4 5  first: increment to 3
1985  *        3 4 5  done
1986  *
1987  * This strategy works for dRAID but is less efficient when there is a large
1988  * number of child vdevs and therefore permutations to check. Furthermore,
1989  * since the raidz_map_t rows likely do not overlap, reconstruction would be
1990  * possible as long as there are no more than nparity data errors per row.
1991  * These additional permutations are not currently checked but could be as
1992  * a future improvement.
1993  */
1994 static int
1995 vdev_raidz_combrec(zio_t *zio)
1996 {
1997 	int nparity = vdev_get_nparity(zio->io_vd);
1998 	raidz_map_t *rm = zio->io_vsd;
1999 
2000 	/* Check if there's enough data to attempt reconstruction. */
2001 	for (int i = 0; i < rm->rm_nrows; i++) {
2002 		raidz_row_t *rr = rm->rm_row[i];
2003 		int total_errors = 0;
2004 
2005 		for (int c = 0; c < rr->rr_cols; c++) {
2006 			if (rr->rr_col[c].rc_error)
2007 				total_errors++;
2008 		}
2009 
2010 		if (total_errors > nparity)
2011 			return (vdev_raidz_worst_error(rr));
2012 	}
2013 
2014 	for (int num_failures = 1; num_failures <= nparity; num_failures++) {
2015 		int tstore[VDEV_RAIDZ_MAXPARITY + 2];
2016 		int *ltgts = &tstore[1]; /* value is logical child ID */
2017 
2018 		/* Determine number of logical children, n */
2019 		int n = zio->io_vd->vdev_children;
2020 
2021 		ASSERT3U(num_failures, <=, nparity);
2022 		ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);
2023 
2024 		/* Handle corner cases in combrec logic */
2025 		ltgts[-1] = -1;
2026 		for (int i = 0; i < num_failures; i++) {
2027 			ltgts[i] = i;
2028 		}
2029 		ltgts[num_failures] = n;
2030 
2031 		for (;;) {
2032 			int err = raidz_reconstruct(zio, ltgts, num_failures,
2033 			    nparity);
2034 			if (err == EINVAL) {
2035 				/*
2036 				 * Reconstruction is not possible with this
2037 				 * number of failures; try more failures.
2038 				 */
2039 				break;
2040 			} else if (err == 0)
2041 				return (0);
2042 
2043 			/* Compute next targets to try */
2044 			for (int t = 0; ; t++) {
2045 				ASSERT3U(t, <, num_failures);
2046 				ltgts[t]++;
2047 				if (ltgts[t] == n) {
2048 					/* try more failures */
2049 					ASSERT3U(t, ==, num_failures - 1);
2050 					break;
2051 				}
2052 
2053 				ASSERT3U(ltgts[t], <, n);
2054 				ASSERT3U(ltgts[t], <=, ltgts[t + 1]);
2055 
2056 				/*
2057 				 * If that spot is available, we're done here.
2058 				 * Try the next combination.
2059 				 */
2060 				if (ltgts[t] != ltgts[t + 1])
2061 					break;
2062 
2063 				/*
2064 				 * Otherwise, reset this tgt to the minimum,
2065 				 * and move on to the next tgt.
2066 				 */
2067 				ltgts[t] = ltgts[t - 1] + 1;
2068 				ASSERT3U(ltgts[t], ==, t);
2069 			}
2070 
2071 			/* Increase the number of failures and keep trying. */
2072 			if (ltgts[num_failures - 1] == n)
2073 				break;
2074 		}
2075 	}
2076 
2077 	return (ECKSUM);
2078 }
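
/*
 * Illustrative sketch (not part of the driver): the slot increment/reset
 * rules above, extracted into a standalone walk over all combinations.
 * combrec_walk() and its visit() callback are hypothetical names, the
 * reconstruction attempt is abstracted away, and num_failures is assumed
 * to be in [1, VDEV_RAIDZ_MAXPARITY].  For n = 6 and num_failures = 3
 * this visits exactly the STATE sequence tabulated before
 * vdev_raidz_combrec().
 */
static void
combrec_walk(int n, int num_failures, void (*visit)(const int *, int))
{
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *ltgts = &tstore[1];	/* leave room for ltgts[-1] */

	/* Start from the lowest combination: 0, 1, ..., num_failures-1. */
	ltgts[-1] = -1;
	for (int i = 0; i < num_failures; i++)
		ltgts[i] = i;
	ltgts[num_failures] = n;	/* sentinel bounding the last slot */

	for (;;) {
		visit(ltgts, num_failures);

		for (int t = 0; ; t++) {
			ltgts[t]++;
			if (ltgts[t] == n)
				break;	/* last slot ran off the end */
			if (ltgts[t] != ltgts[t + 1])
				break;	/* found a free spot; done */
			/* Collided with the next slot: reset and carry. */
			ltgts[t] = ltgts[t - 1] + 1;
		}
		if (ltgts[num_failures - 1] == n)
			return;		/* all combinations visited */
	}
}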
2079 
2080 void
2081 vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
2082 {
2083 	for (uint64_t row = 0; row < rm->rm_nrows; row++) {
2084 		raidz_row_t *rr = rm->rm_row[row];
2085 		vdev_raidz_reconstruct_row(rm, rr, t, nt);
2086 	}
2087 }
2088 
2089 /*
2090  * Complete a write IO operation on a RAIDZ vdev
2091  *
2092  * Outline:
2093  *   1. Check for errors on the child IOs.
2094  *   2. Return, setting an error code if too few child vdevs were written
2095  *      to reconstruct the data later.  Note that partial writes are
2096  *      considered successful if the data can be reconstructed at all.
2097  */
2098 static void
2099 vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
2100 {
2101 	int total_errors = 0;
2102 
2103 	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
2104 	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
2105 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
2106 
2107 	for (int c = 0; c < rr->rr_cols; c++) {
2108 		raidz_col_t *rc = &rr->rr_col[c];
2109 
2110 		if (rc->rc_error) {
2111 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
2112 
2113 			total_errors++;
2114 		}
2115 	}
2116 
2117 	/*
2118 	 * Treat partial writes as a success. If we couldn't write enough
2119 	 * columns to reconstruct the data, the I/O failed.  Otherwise,
2120 	 * good enough.
2121 	 *
2122 	 * Now that we support write reallocation, it would be better
2123 	 * to treat partial failure as real failure unless there are
2124 	 * no non-degraded top-level vdevs left, and not update DTLs
2125 	 * if we intend to reallocate.
2126 	 */
2127 	if (total_errors > rr->rr_firstdatacol) {
2128 		zio->io_error = zio_worst_error(zio->io_error,
2129 		    vdev_raidz_worst_error(rr));
2130 	}
2131 }
2132 
2133 static void
2134 vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
2135     raidz_row_t *rr)
2136 {
2137 	int parity_errors = 0;
2138 	int parity_untried = 0;
2139 	int data_errors = 0;
2140 	int total_errors = 0;
2141 
2142 	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
2143 	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
2144 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
2145 
2146 	for (int c = 0; c < rr->rr_cols; c++) {
2147 		raidz_col_t *rc = &rr->rr_col[c];
2148 
2149 		if (rc->rc_error) {
2150 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
2151 
2152 			if (c < rr->rr_firstdatacol)
2153 				parity_errors++;
2154 			else
2155 				data_errors++;
2156 
2157 			total_errors++;
2158 		} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
2159 			parity_untried++;
2160 		}
2161 	}
2162 
2163 	/*
2164 	 * If there were data errors and the number of errors we saw was
2165 	 * correctable -- less than or equal to the number of parity disks read
2166 	 * -- reconstruct based on the missing data.
2167 	 */
2168 	if (data_errors != 0 &&
2169 	    total_errors <= rr->rr_firstdatacol - parity_untried) {
2170 		/*
2171 		 * We either attempt to read all the parity columns or
2172 		 * none of them. If we didn't try to read parity, we
2173 		 * wouldn't be here in the correctable case. There must
2174 		 * also have been fewer parity errors than parity
2175 		 * columns or, again, we wouldn't be in this code path.
2176 		 */
2177 		ASSERT(parity_untried == 0);
2178 		ASSERT(parity_errors < rr->rr_firstdatacol);
2179 
2180 		/*
2181 		 * Identify the data columns that reported an error.
2182 		 */
2183 		int n = 0;
2184 		int tgts[VDEV_RAIDZ_MAXPARITY];
2185 		for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
2186 			raidz_col_t *rc = &rr->rr_col[c];
2187 			if (rc->rc_error != 0) {
2188 				ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2189 				tgts[n++] = c;
2190 			}
2191 		}
2192 
2193 		ASSERT(rr->rr_firstdatacol >= n);
2194 
2195 		vdev_raidz_reconstruct_row(rm, rr, tgts, n);
2196 	}
2197 }
2198 
2199 /*
2200  * Return the number of reads issued.
2201  */
2202 static int
2203 vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
2204 {
2205 	vdev_t *vd = zio->io_vd;
2206 	int nread = 0;
2207 
2208 	rr->rr_missingdata = 0;
2209 	rr->rr_missingparity = 0;
2210 
2211 	/*
2212 	 * If this row contains empty sectors which are not required
2213 	 * for a normal read, then allocate an ABD for them now so they
2214 	 * may be read, verified, and any needed repairs performed.
2215 	 */
2216 	if (rr->rr_nempty && rr->rr_abd_empty == NULL)
2217 		vdev_draid_map_alloc_empty(zio, rr);
2218 
2219 	for (int c = 0; c < rr->rr_cols; c++) {
2220 		raidz_col_t *rc = &rr->rr_col[c];
2221 		if (rc->rc_tried || rc->rc_size == 0)
2222 			continue;
2223 
2224 		zio_nowait(zio_vdev_child_io(zio, NULL,
2225 		    vd->vdev_child[rc->rc_devidx],
2226 		    rc->rc_offset, rc->rc_abd, rc->rc_size,
2227 		    zio->io_type, zio->io_priority, 0,
2228 		    vdev_raidz_child_done, rc));
2229 		nread++;
2230 	}
2231 	return (nread);
2232 }
2233 
2234 /*
2235  * We're here because either there were too many errors to even attempt
2236  * reconstruction (total_errors > nparity), or vdev_*_combrec() failed.
2237  * In either case, there is enough bad data to prevent reconstruction.
2238  * Start checksum ereports for all children which haven't failed.
2239  */
2240 static void
2241 vdev_raidz_io_done_unrecoverable(zio_t *zio)
2242 {
2243 	raidz_map_t *rm = zio->io_vsd;
2244 
2245 	for (int i = 0; i < rm->rm_nrows; i++) {
2246 		raidz_row_t *rr = rm->rm_row[i];
2247 
2248 		for (int c = 0; c < rr->rr_cols; c++) {
2249 			raidz_col_t *rc = &rr->rr_col[c];
2250 			vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
2251 
2252 			if (rc->rc_error != 0)
2253 				continue;
2254 
2255 			zio_bad_cksum_t zbc;
2256 			zbc.zbc_has_cksum = 0;
2257 			zbc.zbc_injected = rm->rm_ecksuminjected;
2258 
2259 			(void) zfs_ereport_start_checksum(zio->io_spa,
2260 			    cvd, &zio->io_bookmark, zio, rc->rc_offset,
2261 			    rc->rc_size, &zbc);
2262 			mutex_enter(&cvd->vdev_stat_lock);
2263 			cvd->vdev_stat.vs_checksum_errors++;
2264 			mutex_exit(&cvd->vdev_stat_lock);
2265 		}
2266 	}
2267 }
2268 
2269 void
2270 vdev_raidz_io_done(zio_t *zio)
2271 {
2272 	raidz_map_t *rm = zio->io_vsd;
2273 
2274 	if (zio->io_type == ZIO_TYPE_WRITE) {
2275 		for (int i = 0; i < rm->rm_nrows; i++) {
2276 			vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
2277 		}
2278 	} else {
2279 		for (int i = 0; i < rm->rm_nrows; i++) {
2280 			raidz_row_t *rr = rm->rm_row[i];
2281 			vdev_raidz_io_done_reconstruct_known_missing(zio,
2282 			    rm, rr);
2283 		}
2284 
2285 		if (raidz_checksum_verify(zio) == 0) {
2286 			for (int i = 0; i < rm->rm_nrows; i++) {
2287 				raidz_row_t *rr = rm->rm_row[i];
2288 				vdev_raidz_io_done_verified(zio, rr);
2289 			}
2290 			zio_checksum_verified(zio);
2291 		} else {
2292 			/*
2293 			 * A sequential resilver has no checksum, which makes
2294 			 * combinatorial reconstruction impossible. This code
2295 			 * path is unreachable since raidz_checksum_verify()
2296 			 * has no checksum to verify and must succeed.
2297 			 */
2298 			ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD);
2299 
2300 			/*
2301 			 * This isn't a typical situation -- either we got a
2302 			 * read error or a child silently returned bad data.
2303 			 * Read every block so we can try again with as much
2304 			 * data and parity as we can track down. If we've
2305 			 * already been through once before, all children will
2306 			 * be marked as tried so we'll proceed to combinatorial
2307 			 * reconstruction.
2308 			 */
2309 			int nread = 0;
2310 			for (int i = 0; i < rm->rm_nrows; i++) {
2311 				nread += vdev_raidz_read_all(zio,
2312 				    rm->rm_row[i]);
2313 			}
2314 			if (nread != 0) {
2315 				/*
2316 				 * Normally our stage is VDEV_IO_DONE, but if
2317 				 * we've already called redone(), it will have
2318 				 * changed to VDEV_IO_START, in which case we
2319 				 * don't want to call redone() again.
2320 				 */
2321 				if (zio->io_stage != ZIO_STAGE_VDEV_IO_START)
2322 					zio_vdev_io_redone(zio);
2323 				return;
2324 			}
2325 
2326 			zio->io_error = vdev_raidz_combrec(zio);
2327 			if (zio->io_error == ECKSUM &&
2328 			    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2329 				vdev_raidz_io_done_unrecoverable(zio);
2330 			}
2331 		}
2332 	}
2333 }
2334 
2335 static void
2336 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2337 {
2338 	vdev_raidz_t *vdrz = vd->vdev_tsd;
2339 	if (faulted > vdrz->vd_nparity)
2340 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2341 		    VDEV_AUX_NO_REPLICAS);
2342 	else if (degraded + faulted != 0)
2343 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2344 	else
2345 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2346 }
2347 
2348 /*
2349  * Determine if any portion of the provided block resides on a child vdev
2350  * with a dirty DTL and therefore needs to be resilvered.  The function
2351  * assumes that at least one DTL is dirty, which implies that full stripe
2352  * width blocks must be resilvered.
2353  */
2354 static boolean_t
2355 vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
2356     uint64_t phys_birth)
2357 {
2358 	vdev_raidz_t *vdrz = vd->vdev_tsd;
2359 	uint64_t dcols = vd->vdev_children;
2360 	uint64_t nparity = vdrz->vd_nparity;
2361 	uint64_t ashift = vd->vdev_top->vdev_ashift;
2362 	/* The starting RAIDZ (parent) vdev sector of the block. */
2363 	uint64_t b = DVA_GET_OFFSET(dva) >> ashift;
2364 	/* The zio's size in units of the vdev's minimum sector size. */
2365 	uint64_t s = ((psize - 1) >> ashift) + 1;
2366 	/* The first column for this stripe. */
2367 	uint64_t f = b % dcols;
2368 
2369 	/* Unreachable by sequential resilver. */
2370 	ASSERT3U(phys_birth, !=, TXG_UNKNOWN);
2371 
2372 	if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
2373 		return (B_FALSE);
2374 
2375 	if (s + nparity >= dcols)
2376 		return (B_TRUE);
2377 
2378 	for (uint64_t c = 0; c < s + nparity; c++) {
2379 		uint64_t devidx = (f + c) % dcols;
2380 		vdev_t *cvd = vd->vdev_child[devidx];
2381 
2382 		/*
2383 		 * dsl_scan_need_resilver() already checked vd with
2384 		 * vdev_dtl_contains(). So here just check cvd with
2385 		 * vdev_dtl_empty(), cheaper and a good approximation.
2386 		 */
2387 		if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
2388 			return (B_TRUE);
2389 	}
2390 
2391 	return (B_FALSE);
2392 }
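
/*
 * Illustrative sketch (hypothetical numbers, not driver code): the
 * geometry computed above for a 3 KB block at byte offset 0x2600 on a
 * 5-wide single-parity raidz with 512-byte sectors (ashift = 9).
 */
static void
need_resilver_example(void)
{
	uint64_t dcols = 5, nparity = 1, ashift = 9;
	uint64_t offset = 0x2600, psize = 3072;

	uint64_t b = offset >> ashift;		  /* 19: starting sector */
	uint64_t s = ((psize - 1) >> ashift) + 1; /* 6: sectors of data */
	uint64_t f = b % dcols;			  /* 4: first column */

	/*
	 * s + nparity = 7 >= dcols, so the block spans every child and
	 * would be resilvered without walking the per-child DTLs.
	 */
	boolean_t full_width = (s + nparity >= dcols);
	(void) full_width;
	(void) f;
}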
2393 
2394 static void
2395 vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs,
2396     range_seg64_t *physical_rs, range_seg64_t *remain_rs)
2397 {
2398 	vdev_t *raidvd = cvd->vdev_parent;
2399 	ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
2400 
2401 	uint64_t width = raidvd->vdev_children;
2402 	uint64_t tgt_col = cvd->vdev_id;
2403 	uint64_t ashift = raidvd->vdev_top->vdev_ashift;
2404 
2405 	/* make sure the offsets are block-aligned */
2406 	ASSERT0(logical_rs->rs_start % (1 << ashift));
2407 	ASSERT0(logical_rs->rs_end % (1 << ashift));
2408 	uint64_t b_start = logical_rs->rs_start >> ashift;
2409 	uint64_t b_end = logical_rs->rs_end >> ashift;
2410 
2411 	uint64_t start_row = 0;
2412 	if (b_start > tgt_col) /* avoid underflow */
2413 		start_row = ((b_start - tgt_col - 1) / width) + 1;
2414 
2415 	uint64_t end_row = 0;
2416 	if (b_end > tgt_col)
2417 		end_row = ((b_end - tgt_col - 1) / width) + 1;
2418 
2419 	physical_rs->rs_start = start_row << ashift;
2420 	physical_rs->rs_end = end_row << ashift;
2421 
2422 	ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start);
2423 	ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
2424 	    logical_rs->rs_end - logical_rs->rs_start);
2425 }
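
/*
 * Illustrative sketch (hypothetical numbers, not driver code): the
 * translation above for child column 1 of a 4-wide raidz with
 * ashift = 9, given the logical range [0, 8192).  The 16 logical
 * sectors are laid out round-robin across the 4 children, so this
 * child holds 4 of them and its physical range is [0, 2048).
 */
static void
raidz_xlate_example(void)
{
	uint64_t width = 4, tgt_col = 1, ashift = 9;
	uint64_t b_start = 0;			/* 0 >> ashift */
	uint64_t b_end = 8192 >> ashift;	/* 16 sectors */

	uint64_t start_row = 0;
	if (b_start > tgt_col)			/* 0 > 1 is false */
		start_row = ((b_start - tgt_col - 1) / width) + 1;

	uint64_t end_row = 0;
	if (b_end > tgt_col)			/* ((16 - 2) / 4) + 1 = 4 */
		end_row = ((b_end - tgt_col - 1) / width) + 1;

	/* Physical range: [start_row << 9, end_row << 9) = [0, 2048). */
	(void) start_row;
	(void) end_row;
}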
2426 
2427 /*
2428  * Initialize private RAIDZ specific fields from the nvlist.
2429  */
2430 static int
2431 vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
2432 {
2433 	vdev_raidz_t *vdrz;
2434 	uint64_t nparity;
2435 
2436 	uint_t children;
2437 	nvlist_t **child;
2438 	int error = nvlist_lookup_nvlist_array(nv,
2439 	    ZPOOL_CONFIG_CHILDREN, &child, &children);
2440 	if (error != 0)
2441 		return (SET_ERROR(EINVAL));
2442 
2443 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) {
2444 		if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
2445 			return (SET_ERROR(EINVAL));
2446 
2447 		/*
2448 		 * Previous versions could only support 1 or 2 parity
2449 		 * devices.
2450 		 */
2451 		if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2)
2452 			return (SET_ERROR(EINVAL));
2453 		else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3)
2454 			return (SET_ERROR(EINVAL));
2455 	} else {
2456 		/*
2457 		 * We require the parity to be specified for SPAs that
2458 		 * support multiple parity levels.
2459 		 */
2460 		if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
2461 			return (SET_ERROR(EINVAL));
2462 
2463 		/*
2464 		 * Otherwise, we default to 1 parity device for RAID-Z.
2465 		 */
2466 		nparity = 1;
2467 	}
2468 
2469 	vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
2470 	vdrz->vd_logical_width = children;
2471 	vdrz->vd_nparity = nparity;
2472 
2473 	*tsd = vdrz;
2474 
2475 	return (0);
2476 }
2477 
2478 static void
2479 vdev_raidz_fini(vdev_t *vd)
2480 {
2481 	kmem_free(vd->vdev_tsd, sizeof (vdev_raidz_t));
2482 }
2483 
2484 /*
2485  * Add RAIDZ specific fields to the config nvlist.
2486  */
2487 static void
2488 vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
2489 {
2490 	ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
2491 	vdev_raidz_t *vdrz = vd->vdev_tsd;
2492 
2493 	/*
2494 	 * Make sure someone hasn't managed to sneak a fancy new vdev
2495 	 * into a crufty old storage pool.
2496 	 */
2497 	ASSERT(vdrz->vd_nparity == 1 ||
2498 	    (vdrz->vd_nparity <= 2 &&
2499 	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) ||
2500 	    (vdrz->vd_nparity <= 3 &&
2501 	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3));
2502 
2503 	/*
2504 	 * Note that we'll add these even on storage pools where they
2505 	 * aren't strictly required -- older software will just ignore
2506 	 * it.
2507 	 */
2508 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);
2509 }
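
/*
 * Illustrative sketch (not part of the driver): the parity level added
 * above can be read back with the matching fnvlist lookup, as
 * vdev_raidz_init() does for real configs.  check_nparity_roundtrip()
 * is a hypothetical name.
 */
static uint64_t
check_nparity_roundtrip(uint64_t nparity)
{
	nvlist_t *nv = fnvlist_alloc();

	fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, nparity);
	uint64_t out = fnvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY);
	fnvlist_free(nv);

	return (out);
}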
2510 
2511 static uint64_t
2512 vdev_raidz_nparity(vdev_t *vd)
2513 {
2514 	vdev_raidz_t *vdrz = vd->vdev_tsd;
2515 	return (vdrz->vd_nparity);
2516 }
2517 
2518 static uint64_t
2519 vdev_raidz_ndisks(vdev_t *vd)
2520 {
2521 	return (vd->vdev_children);
2522 }
2523 
2524 vdev_ops_t vdev_raidz_ops = {
2525 	.vdev_op_init = vdev_raidz_init,
2526 	.vdev_op_fini = vdev_raidz_fini,
2527 	.vdev_op_open = vdev_raidz_open,
2528 	.vdev_op_close = vdev_raidz_close,
2529 	.vdev_op_asize = vdev_raidz_asize,
2530 	.vdev_op_min_asize = vdev_raidz_min_asize,
2531 	.vdev_op_min_alloc = NULL,
2532 	.vdev_op_io_start = vdev_raidz_io_start,
2533 	.vdev_op_io_done = vdev_raidz_io_done,
2534 	.vdev_op_state_change = vdev_raidz_state_change,
2535 	.vdev_op_need_resilver = vdev_raidz_need_resilver,
2536 	.vdev_op_hold = NULL,
2537 	.vdev_op_rele = NULL,
2538 	.vdev_op_remap = NULL,
2539 	.vdev_op_xlate = vdev_raidz_xlate,
2540 	.vdev_op_rebuild_asize = NULL,
2541 	.vdev_op_metaslab_init = NULL,
2542 	.vdev_op_config_generate = vdev_raidz_config_generate,
2543 	.vdev_op_nparity = vdev_raidz_nparity,
2544 	.vdev_op_ndisks = vdev_raidz_ndisks,
2545 	.vdev_op_type = VDEV_TYPE_RAIDZ,	/* name of this vdev type */
2546 	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
2547 };
2548