1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
25  * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
26  */
27 
28 #include <sys/zfs_context.h>
29 #include <sys/spa.h>
30 #include <sys/vdev_impl.h>
31 #include <sys/zio.h>
32 #include <sys/zio_checksum.h>
33 #include <sys/abd.h>
34 #include <sys/fs/zfs.h>
35 #include <sys/fm/fs/zfs.h>
36 #include <sys/vdev_raidz.h>
37 #include <sys/vdev_raidz_impl.h>
38 #include <sys/vdev_draid.h>
39 
40 #ifdef ZFS_DEBUG
41 #include <sys/vdev.h>	/* For vdev_xlate() in vdev_raidz_io_verify() */
42 #endif
43 
44 /*
45  * Virtual device vector for RAID-Z.
46  *
47  * This vdev supports single, double, and triple parity. For single parity,
48  * we use a simple XOR of all the data columns. For double or triple parity,
49  * we use a special case of Reed-Solomon coding. This extends the
50  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
51  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
52  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
53  * former is also based. The latter is designed to provide higher performance
54  * for writes.
55  *
56  * Note that the Plank paper claimed to support arbitrary N+M, but was then
57  * amended six years later identifying a critical flaw that invalidates its
58  * claims. Nevertheless, the technique can be adapted to work for up to
59  * triple parity. For additional parity, the amendment "Note: Correction to
60  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
61  * is viable, but the additional complexity means that write performance will
62  * suffer.
63  *
64  * All of the methods above operate on a Galois field, defined over the
65  * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
66  * can be expressed with a single byte. Briefly, the operations on the
67  * field are defined as follows:
68  *
69  *   o addition (+) is represented by a bitwise XOR
70  *   o subtraction (-) is therefore identical to addition: A + B = A - B
71  *   o multiplication of A by 2 is defined by the following bitwise expression:
72  *
73  *	(A * 2)_7 = A_6
74  *	(A * 2)_6 = A_5
75  *	(A * 2)_5 = A_4
76  *	(A * 2)_4 = A_3 + A_7
77  *	(A * 2)_3 = A_2 + A_7
78  *	(A * 2)_2 = A_1 + A_7
79  *	(A * 2)_1 = A_0
80  *	(A * 2)_0 = A_7
81  *
82  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
83  * As an aside, this multiplication is derived from the error correcting
84  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
85  *
86  * Observe that any number in the field (except for 0) can be expressed as a
87  * power of 2 -- a generator for the field. We store a table of the powers of
88  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
89  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
90  * than field addition). The inverse of a field element A (A^-1) is therefore
91  * A ^ (255 - 1) = A^254.
92  *
93  * The up-to-three parity columns, P, Q, R over several data columns,
94  * D_0, ... D_n-1, can be expressed by field operations:
95  *
96  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
97  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
98  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
99  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
100  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
101  *
102  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
103  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
104  * independent coefficients. (There are no additional coefficients that have
105  * this property which is why the uncorrected Plank method breaks down.)
106  *
107  * See the reconstruction code below for how P, Q and R can be used individually
108  * or in concert to recover missing data columns.
109  */
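/*
 * As an illustration (not part of the driver logic), the arithmetic above
 * can be sketched in plain C. gf_mul_slow() is a hypothetical helper that
 * applies the shift-and-conditional-XOR primitive bit by bit, using exactly
 * the multiply-by-2 expression given above:
 *
 *	static uint8_t
 *	gf_mul_slow(uint8_t a, uint8_t b)
 *	{
 *		uint8_t p = 0;
 *
 *		for (int i = 0; i < 8; i++) {
 *			if (b & 1)
 *				p ^= a;
 *			a = (a << 1) ^ ((a & 0x80) ? 0x1d : 0);
 *			b >>= 1;
 *		}
 *		return (p);
 *	}
 *
 * For nonzero a and b, the table-based form the driver exploits computes
 * the same product as
 * vdev_raidz_pow2[(vdev_raidz_log2[a] + vdev_raidz_log2[b]) % 255].
 */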
110 
111 #define	VDEV_RAIDZ_P		0
112 #define	VDEV_RAIDZ_Q		1
113 #define	VDEV_RAIDZ_R		2
114 
115 #define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
116 #define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
117 
118 /*
119  * We provide a mechanism to perform the field multiplication operation on a
120  * 64-bit value all at once rather than a byte at a time. This works by
121  * creating a mask from the top bit in each byte and using that to
122  * conditionally apply the XOR of 0x1d.
123  */
124 #define	VDEV_RAIDZ_64MUL_2(x, mask) \
125 { \
126 	(mask) = (x) & 0x8080808080808080ULL; \
127 	(mask) = ((mask) << 1) - ((mask) >> 7); \
128 	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
129 	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
130 }
131 
132 #define	VDEV_RAIDZ_64MUL_4(x, mask) \
133 { \
134 	VDEV_RAIDZ_64MUL_2((x), mask); \
135 	VDEV_RAIDZ_64MUL_2((x), mask); \
136 }
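/*
 * A worked example of the mask trick (hypothetical input): for
 * x = 0x8003000000000001, the mask starts as 0x8000000000000000 and
 * ((mask << 1) - (mask >> 7)) turns each lane byte that had its top bit
 * set into 0xff and every other lane byte into 0x00, so the final XOR
 * applies 0x1d only in the lanes that overflowed:
 *
 *	uint64_t x = 0x8003000000000001ULL, mask;
 *	VDEV_RAIDZ_64MUL_2(x, mask);
 *
 * x is now 0x1d06000000000002: 0x80 * 2 = 0x1d, 0x03 * 2 = 0x06, and
 * 0x01 * 2 = 0x02, each computed within its own byte lane.
 */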
137 
138 static void
139 vdev_raidz_row_free(raidz_row_t *rr)
140 {
141 	for (int c = 0; c < rr->rr_cols; c++) {
142 		raidz_col_t *rc = &rr->rr_col[c];
143 
144 		if (rc->rc_size != 0)
145 			abd_free(rc->rc_abd);
146 		if (rc->rc_gdata != NULL)
147 			abd_free(rc->rc_gdata);
148 		if (rc->rc_orig_data != NULL)
149 			zio_buf_free(rc->rc_orig_data, rc->rc_size);
150 	}
151 
152 	if (rr->rr_abd_copy != NULL)
153 		abd_free(rr->rr_abd_copy);
154 
155 	if (rr->rr_abd_empty != NULL)
156 		abd_free(rr->rr_abd_empty);
157 
158 	kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols]));
159 }
160 
161 void
162 vdev_raidz_map_free(raidz_map_t *rm)
163 {
164 	for (int i = 0; i < rm->rm_nrows; i++)
165 		vdev_raidz_row_free(rm->rm_row[i]);
166 
167 	kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
168 }
169 
170 static void
171 vdev_raidz_map_free_vsd(zio_t *zio)
172 {
173 	raidz_map_t *rm = zio->io_vsd;
174 
175 	ASSERT0(rm->rm_freed);
176 	rm->rm_freed = B_TRUE;
177 
178 	if (rm->rm_reports == 0) {
179 		vdev_raidz_map_free(rm);
180 	}
181 }
182 
183 /*ARGSUSED*/
184 static void
185 vdev_raidz_cksum_free(void *arg, size_t ignored)
186 {
187 	raidz_map_t *rm = arg;
188 
189 	ASSERT3U(rm->rm_reports, >, 0);
190 
191 	if (--rm->rm_reports == 0 && rm->rm_freed)
192 		vdev_raidz_map_free(rm);
193 }
194 
195 static void
196 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data)
197 {
198 	raidz_map_t *rm = zcr->zcr_cbdata;
199 	const size_t c = zcr->zcr_cbinfo;
200 	size_t x, offset;
201 
202 	if (good_data == NULL) {
203 		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
204 		return;
205 	}
206 
207 	ASSERT3U(rm->rm_nrows, ==, 1);
208 	raidz_row_t *rr = rm->rm_row[0];
209 
210 	const abd_t *good = NULL;
211 	const abd_t *bad = rr->rr_col[c].rc_abd;
212 
213 	if (c < rr->rr_firstdatacol) {
214 		/*
215 		 * The first time through, calculate the parity blocks for
216 		 * the good data (this relies on the fact that the good
217 		 * data never changes for a given logical ZIO)
218 		 */
219 		if (rr->rr_col[0].rc_gdata == NULL) {
220 			abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];
221 
222 			/*
223 			 * Set up the rr_col[]s to generate the parity for
224 			 * good_data, first saving the parity bufs and
225 			 * replacing them with buffers to hold the result.
226 			 */
227 			for (x = 0; x < rr->rr_firstdatacol; x++) {
228 				bad_parity[x] = rr->rr_col[x].rc_abd;
229 				rr->rr_col[x].rc_abd = rr->rr_col[x].rc_gdata =
230 				    abd_alloc_sametype(rr->rr_col[x].rc_abd,
231 				    rr->rr_col[x].rc_size);
232 			}
233 
234 			/* fill in the data columns from good_data */
235 			offset = 0;
236 			for (; x < rr->rr_cols; x++) {
237 				abd_free(rr->rr_col[x].rc_abd);
238 
239 				rr->rr_col[x].rc_abd =
240 				    abd_get_offset_size((abd_t *)good_data,
241 				    offset, rr->rr_col[x].rc_size);
242 				offset += rr->rr_col[x].rc_size;
243 			}
244 
245 			/*
246 			 * Construct the parity from the good data.
247 			 */
248 			vdev_raidz_generate_parity_row(rm, rr);
249 
250 			/* restore everything back to its original state */
251 			for (x = 0; x < rr->rr_firstdatacol; x++)
252 				rr->rr_col[x].rc_abd = bad_parity[x];
253 
254 			offset = 0;
255 			for (x = rr->rr_firstdatacol; x < rr->rr_cols; x++) {
256 				abd_free(rr->rr_col[x].rc_abd);
257 				rr->rr_col[x].rc_abd = abd_get_offset_size(
258 				    rr->rr_abd_copy, offset,
259 				    rr->rr_col[x].rc_size);
260 				offset += rr->rr_col[x].rc_size;
261 			}
262 		}
263 
264 		ASSERT3P(rr->rr_col[c].rc_gdata, !=, NULL);
265 		good = abd_get_offset_size(rr->rr_col[c].rc_gdata, 0,
266 		    rr->rr_col[c].rc_size);
267 	} else {
268 		/* adjust good_data to point at the start of our column */
269 		offset = 0;
270 		for (x = rr->rr_firstdatacol; x < c; x++)
271 			offset += rr->rr_col[x].rc_size;
272 
273 		good = abd_get_offset_size((abd_t *)good_data, offset,
274 		    rr->rr_col[c].rc_size);
275 	}
276 
277 	/* we drop the ereport if it ends up that the data was good */
278 	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
279 	abd_free((abd_t *)good);
280 }
281 
282 /*
283  * Invoked indirectly by zfs_ereport_start_checksum(), called
284  * below when our read operation fails completely.  The main point
285  * is to keep a copy of everything we read from disk, so that at
286  * vdev_raidz_cksum_finish() time we can compare it with the good data.
287  */
288 static void
289 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
290 {
291 	size_t c = (size_t)(uintptr_t)arg;
292 	raidz_map_t *rm = zio->io_vsd;
293 
294 	/* set up the report and bump the refcount  */
295 	zcr->zcr_cbdata = rm;
296 	zcr->zcr_cbinfo = c;
297 	zcr->zcr_finish = vdev_raidz_cksum_finish;
298 	zcr->zcr_free = vdev_raidz_cksum_free;
299 
300 	rm->rm_reports++;
301 	ASSERT3U(rm->rm_reports, >, 0);
302 	ASSERT3U(rm->rm_nrows, ==, 1);
303 
304 	if (rm->rm_row[0]->rr_abd_copy != NULL)
305 		return;
306 
307 	/*
308 	 * It's the first time we're called for this raidz_map_t, so we need
309 	 * to copy the data aside; there's no guarantee that our zio's buffer
310 	 * won't be re-used for something else.
311 	 *
312 	 * Our parity data is already in separate buffers, so there's no need
313 	 * to copy them.
314 	 */
315 	for (int i = 0; i < rm->rm_nrows; i++) {
316 		raidz_row_t *rr = rm->rm_row[i];
317 		size_t offset = 0;
318 		size_t size = 0;
319 
320 		for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++)
321 			size += rr->rr_col[c].rc_size;
322 
323 		rr->rr_abd_copy = abd_alloc_for_io(size, B_FALSE);
324 
325 		for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
326 			raidz_col_t *col = &rr->rr_col[c];
327 			abd_t *tmp = abd_get_offset_size(rr->rr_abd_copy,
328 			    offset, col->rc_size);
329 
330 			abd_copy(tmp, col->rc_abd, col->rc_size);
331 
332 			abd_free(col->rc_abd);
333 			col->rc_abd = tmp;
334 
335 			offset += col->rc_size;
336 		}
337 		ASSERT3U(offset, ==, size);
338 	}
339 }
340 
341 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
342 	.vsd_free = vdev_raidz_map_free_vsd,
343 	.vsd_cksum_report = vdev_raidz_cksum_report
344 };
345 
346 /*
347  * Divides the I/O evenly across all child vdevs; usually, dcols is
348  * the number of children in the target vdev.
349  *
350  * Avoid inlining the function to keep vdev_raidz_io_start(), which
351  * is this function's only caller, as small as possible on the stack.
352  */
353 noinline raidz_map_t *
354 vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
355     uint64_t nparity)
356 {
357 	raidz_row_t *rr;
358 	/* The starting RAIDZ (parent) vdev sector of the block. */
359 	uint64_t b = zio->io_offset >> ashift;
360 	/* The zio's size in units of the vdev's minimum sector size. */
361 	uint64_t s = zio->io_size >> ashift;
362 	/* The first column for this stripe. */
363 	uint64_t f = b % dcols;
364 	/* The starting byte offset on each child vdev. */
365 	uint64_t o = (b / dcols) << ashift;
366 	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
367 
368 	raidz_map_t *rm =
369 	    kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
370 	rm->rm_nrows = 1;
371 
372 	/*
373 	 * "Quotient": The number of data sectors for this stripe on all but
374 	 * the "big column" child vdevs that also contain "remainder" data.
375 	 */
376 	q = s / (dcols - nparity);
377 
378 	/*
379 	 * "Remainder": The number of partial stripe data sectors in this I/O.
380 	 * This will add a sector to some, but not all, child vdevs.
381 	 */
382 	r = s - q * (dcols - nparity);
383 
384 	/* The number of "big columns" - those which contain remainder data. */
385 	bc = (r == 0 ? 0 : r + nparity);
386 
387 	/*
388 	 * The total number of data and parity sectors associated with
389 	 * this I/O.
390 	 */
391 	tot = s + nparity * (q + (r == 0 ? 0 : 1));
392 
393 	/*
394 	 * acols: The columns that will be accessed.
395 	 * scols: The columns that will be accessed or skipped.
396 	 */
397 	if (q == 0) {
398 		/* Our I/O request doesn't span all child vdevs. */
399 		acols = bc;
400 		scols = MIN(dcols, roundup(bc, nparity + 1));
401 	} else {
402 		acols = dcols;
403 		scols = dcols;
404 	}
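	/*
	 * Worked example (hypothetical numbers): with ashift=9, a 3KB write
	 * to a 5-wide raidz1 (dcols = 5, nparity = 1) has s = 6 sectors, so
	 * q = 6 / 4 = 1, r = 6 - 1 * 4 = 2, and bc = 2 + 1 = 3. Columns
	 * 0-2 receive q + 1 = 2 sectors each and columns 3-4 receive 1, for
	 * tot = 6 + 1 * (1 + 1) = 8 sectors; since q > 0, the row spans all
	 * children and acols = scols = 5.
	 */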
405 
406 	ASSERT3U(acols, <=, scols);
407 
408 	rr = kmem_alloc(offsetof(raidz_row_t, rr_col[scols]), KM_SLEEP);
409 	rm->rm_row[0] = rr;
410 
411 	rr->rr_cols = acols;
412 	rr->rr_scols = scols;
413 	rr->rr_bigcols = bc;
414 	rr->rr_missingdata = 0;
415 	rr->rr_missingparity = 0;
416 	rr->rr_firstdatacol = nparity;
417 	rr->rr_abd_copy = NULL;
418 	rr->rr_abd_empty = NULL;
419 	rr->rr_nempty = 0;
420 #ifdef ZFS_DEBUG
421 	rr->rr_offset = zio->io_offset;
422 	rr->rr_size = zio->io_size;
423 #endif
424 
425 	asize = 0;
426 
427 	for (c = 0; c < scols; c++) {
428 		raidz_col_t *rc = &rr->rr_col[c];
429 		col = f + c;
430 		coff = o;
431 		if (col >= dcols) {
432 			col -= dcols;
433 			coff += 1ULL << ashift;
434 		}
435 		rc->rc_devidx = col;
436 		rc->rc_offset = coff;
437 		rc->rc_abd = NULL;
438 		rc->rc_gdata = NULL;
439 		rc->rc_orig_data = NULL;
440 		rc->rc_error = 0;
441 		rc->rc_tried = 0;
442 		rc->rc_skipped = 0;
443 		rc->rc_repair = 0;
444 		rc->rc_need_orig_restore = B_FALSE;
445 
446 		if (c >= acols)
447 			rc->rc_size = 0;
448 		else if (c < bc)
449 			rc->rc_size = (q + 1) << ashift;
450 		else
451 			rc->rc_size = q << ashift;
452 
453 		asize += rc->rc_size;
454 	}
455 
456 	ASSERT3U(asize, ==, tot << ashift);
457 	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
458 	rm->rm_skipstart = bc;
459 
460 	for (c = 0; c < rr->rr_firstdatacol; c++)
461 		rr->rr_col[c].rc_abd =
462 		    abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);
463 
464 	for (uint64_t off = 0; c < acols; c++) {
465 		raidz_col_t *rc = &rr->rr_col[c];
466 		rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
467 		    zio->io_abd, off, rc->rc_size);
468 		off += rc->rc_size;
469 	}
470 
471 	/*
472 	 * If all data stored spans all columns, there's a danger that parity
473 	 * will always be on the same device and, since parity isn't read
474 	 * during normal operation, that device's I/O bandwidth won't be
475 	 * used effectively. We therefore switch the parity every 1MB.
476 	 *
477 	 * ... at least that was, ostensibly, the theory. As a practical
478 	 * matter unless we juggle the parity between all devices evenly, we
479 	 * won't see any benefit. Further, occasional writes that aren't a
480 	 * multiple of the LCM of the number of children and the minimum
481 	 * stripe width are sufficient to avoid pessimal behavior.
482 	 * Unfortunately, this decision created an implicit on-disk format
483 	 * requirement that we need to support for all eternity, but only
484 	 * for single-parity RAID-Z.
485 	 *
486 	 * If we intend to skip a sector in the zeroth column for padding
487 	 * we must make sure to note this swap. We will never intend to
488 	 * skip the first column since at least one data and one parity
489 	 * column must appear in each row.
490 	 */
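	/*
	 * Concretely (hypothetical numbers): under raidz1, a block whose
	 * logical offset has bit 20 set -- i.e. it falls in an odd 1MB
	 * region -- has its parity column and first data column swap
	 * device index and offset below, rotating where parity lands.
	 */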
491 	ASSERT(rr->rr_cols >= 2);
492 	ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
493 
494 	if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
495 		devidx = rr->rr_col[0].rc_devidx;
496 		o = rr->rr_col[0].rc_offset;
497 		rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
498 		rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
499 		rr->rr_col[1].rc_devidx = devidx;
500 		rr->rr_col[1].rc_offset = o;
501 
502 		if (rm->rm_skipstart == 0)
503 			rm->rm_skipstart = 1;
504 	}
505 
506 	/* init RAIDZ parity ops */
507 	rm->rm_ops = vdev_raidz_math_get_ops();
508 
509 	return (rm);
510 }
511 
512 struct pqr_struct {
513 	uint64_t *p;
514 	uint64_t *q;
515 	uint64_t *r;
516 };
517 
518 static int
519 vdev_raidz_p_func(void *buf, size_t size, void *private)
520 {
521 	struct pqr_struct *pqr = private;
522 	const uint64_t *src = buf;
523 	int i, cnt = size / sizeof (src[0]);
524 
525 	ASSERT(pqr->p && !pqr->q && !pqr->r);
526 
527 	for (i = 0; i < cnt; i++, src++, pqr->p++)
528 		*pqr->p ^= *src;
529 
530 	return (0);
531 }
532 
533 static int
534 vdev_raidz_pq_func(void *buf, size_t size, void *private)
535 {
536 	struct pqr_struct *pqr = private;
537 	const uint64_t *src = buf;
538 	uint64_t mask;
539 	int i, cnt = size / sizeof (src[0]);
540 
541 	ASSERT(pqr->p && pqr->q && !pqr->r);
542 
543 	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
544 		*pqr->p ^= *src;
545 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
546 		*pqr->q ^= *src;
547 	}
548 
549 	return (0);
550 }
551 
552 static int
553 vdev_raidz_pqr_func(void *buf, size_t size, void *private)
554 {
555 	struct pqr_struct *pqr = private;
556 	const uint64_t *src = buf;
557 	uint64_t mask;
558 	int i, cnt = size / sizeof (src[0]);
559 
560 	ASSERT(pqr->p && pqr->q && pqr->r);
561 
562 	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
563 		*pqr->p ^= *src;
564 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
565 		*pqr->q ^= *src;
566 		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
567 		*pqr->r ^= *src;
568 	}
569 
570 	return (0);
571 }
572 
573 static void
574 vdev_raidz_generate_parity_p(raidz_row_t *rr)
575 {
576 	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
577 
578 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
579 		abd_t *src = rr->rr_col[c].rc_abd;
580 
581 		if (c == rr->rr_firstdatacol) {
582 			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
583 		} else {
584 			struct pqr_struct pqr = { p, NULL, NULL };
585 			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
586 			    vdev_raidz_p_func, &pqr);
587 		}
588 	}
589 }
590 
591 static void
592 vdev_raidz_generate_parity_pq(raidz_row_t *rr)
593 {
594 	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
595 	uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
596 	uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
597 	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
598 	    rr->rr_col[VDEV_RAIDZ_Q].rc_size);
599 
600 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
601 		abd_t *src = rr->rr_col[c].rc_abd;
602 
603 		uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
604 
605 		if (c == rr->rr_firstdatacol) {
606 			ASSERT(ccnt == pcnt || ccnt == 0);
607 			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
608 			(void) memcpy(q, p, rr->rr_col[c].rc_size);
609 
610 			for (uint64_t i = ccnt; i < pcnt; i++) {
611 				p[i] = 0;
612 				q[i] = 0;
613 			}
614 		} else {
615 			struct pqr_struct pqr = { p, q, NULL };
616 
617 			ASSERT(ccnt <= pcnt);
618 			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
619 			    vdev_raidz_pq_func, &pqr);
620 
621 			/*
622 			 * Treat short columns as though they are full of 0s.
623 			 * Note that there's therefore nothing needed for P.
624 			 */
625 			uint64_t mask;
626 			for (uint64_t i = ccnt; i < pcnt; i++) {
627 				VDEV_RAIDZ_64MUL_2(q[i], mask);
628 			}
629 		}
630 	}
631 }
632 
633 static void
634 vdev_raidz_generate_parity_pqr(raidz_row_t *rr)
635 {
636 	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
637 	uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
638 	uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd);
639 	uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
640 	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
641 	    rr->rr_col[VDEV_RAIDZ_Q].rc_size);
642 	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
643 	    rr->rr_col[VDEV_RAIDZ_R].rc_size);
644 
645 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
646 		abd_t *src = rr->rr_col[c].rc_abd;
647 
648 		uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
649 
650 		if (c == rr->rr_firstdatacol) {
651 			ASSERT(ccnt == pcnt || ccnt == 0);
652 			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
653 			(void) memcpy(q, p, rr->rr_col[c].rc_size);
654 			(void) memcpy(r, p, rr->rr_col[c].rc_size);
655 
656 			for (uint64_t i = ccnt; i < pcnt; i++) {
657 				p[i] = 0;
658 				q[i] = 0;
659 				r[i] = 0;
660 			}
661 		} else {
662 			struct pqr_struct pqr = { p, q, r };
663 
664 			ASSERT(ccnt <= pcnt);
665 			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
666 			    vdev_raidz_pqr_func, &pqr);
667 
668 			/*
669 			 * Treat short columns as though they are full of 0s.
670 			 * Note that there's therefore nothing needed for P.
671 			 */
672 			uint64_t mask;
673 			for (uint64_t i = ccnt; i < pcnt; i++) {
674 				VDEV_RAIDZ_64MUL_2(q[i], mask);
675 				VDEV_RAIDZ_64MUL_4(r[i], mask);
676 			}
677 		}
678 	}
679 }
680 
681 /*
682  * Generate RAID parity in the first virtual columns according to the number of
683  * parity columns available.
684  */
685 void
686 vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr)
687 {
688 	ASSERT3U(rr->rr_cols, !=, 0);
689 
690 	/* Generate using the new math implementation */
691 	if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL)
692 		return;
693 
694 	switch (rr->rr_firstdatacol) {
695 	case 1:
696 		vdev_raidz_generate_parity_p(rr);
697 		break;
698 	case 2:
699 		vdev_raidz_generate_parity_pq(rr);
700 		break;
701 	case 3:
702 		vdev_raidz_generate_parity_pqr(rr);
703 		break;
704 	default:
705 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
706 	}
707 }
708 
709 void
710 vdev_raidz_generate_parity(raidz_map_t *rm)
711 {
712 	for (int i = 0; i < rm->rm_nrows; i++) {
713 		raidz_row_t *rr = rm->rm_row[i];
714 		vdev_raidz_generate_parity_row(rm, rr);
715 	}
716 }
717 
718 /* ARGSUSED */
719 static int
720 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
721 {
722 	uint64_t *dst = dbuf;
723 	uint64_t *src = sbuf;
724 	int cnt = size / sizeof (src[0]);
725 
726 	for (int i = 0; i < cnt; i++) {
727 		dst[i] ^= src[i];
728 	}
729 
730 	return (0);
731 }
732 
733 /* ARGSUSED */
734 static int
735 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
736     void *private)
737 {
738 	uint64_t *dst = dbuf;
739 	uint64_t *src = sbuf;
740 	uint64_t mask;
741 	int cnt = size / sizeof (dst[0]);
742 
743 	for (int i = 0; i < cnt; i++, dst++, src++) {
744 		VDEV_RAIDZ_64MUL_2(*dst, mask);
745 		*dst ^= *src;
746 	}
747 
748 	return (0);
749 }
750 
751 /* ARGSUSED */
752 static int
753 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
754 {
755 	uint64_t *dst = buf;
756 	uint64_t mask;
757 	int cnt = size / sizeof (dst[0]);
758 
759 	for (int i = 0; i < cnt; i++, dst++) {
760 		/* same operation as vdev_raidz_reconst_q_pre_func() on dst */
761 		VDEV_RAIDZ_64MUL_2(*dst, mask);
762 	}
763 
764 	return (0);
765 }
766 
767 struct reconst_q_struct {
768 	uint64_t *q;
769 	int exp;
770 };
771 
772 static int
773 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
774 {
775 	struct reconst_q_struct *rq = private;
776 	uint64_t *dst = buf;
777 	int cnt = size / sizeof (dst[0]);
778 
779 	for (int i = 0; i < cnt; i++, dst++, rq->q++) {
780 		int j;
781 		uint8_t *b;
782 
783 		*dst ^= *rq->q;
784 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
785 			*b = vdev_raidz_exp2(*b, rq->exp);
786 		}
787 	}
788 
789 	return (0);
790 }
791 
792 struct reconst_pq_struct {
793 	uint8_t *p;
794 	uint8_t *q;
795 	uint8_t *pxy;
796 	uint8_t *qxy;
797 	int aexp;
798 	int bexp;
799 };
800 
801 static int
802 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
803 {
804 	struct reconst_pq_struct *rpq = private;
805 	uint8_t *xd = xbuf;
806 	uint8_t *yd = ybuf;
807 
808 	for (int i = 0; i < size;
809 	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
810 		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
811 		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
812 		*yd = *rpq->p ^ *rpq->pxy ^ *xd;
813 	}
814 
815 	return (0);
816 }
817 
818 static int
819 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
820 {
821 	struct reconst_pq_struct *rpq = private;
822 	uint8_t *xd = xbuf;
823 
824 	for (int i = 0; i < size;
825 	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
826 		/* same operation as vdev_raidz_reconst_pq_func() on xd */
827 		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
828 		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
829 	}
830 
831 	return (0);
832 }
833 
834 static int
835 vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts)
836 {
837 	int x = tgts[0];
838 	abd_t *dst, *src;
839 
840 	ASSERT3U(ntgts, ==, 1);
841 	ASSERT3U(x, >=, rr->rr_firstdatacol);
842 	ASSERT3U(x, <, rr->rr_cols);
843 
844 	ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size);
845 
846 	src = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
847 	dst = rr->rr_col[x].rc_abd;
848 
849 	abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size);
850 
851 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
852 		uint64_t size = MIN(rr->rr_col[x].rc_size,
853 		    rr->rr_col[c].rc_size);
854 
855 		src = rr->rr_col[c].rc_abd;
856 
857 		if (c == x)
858 			continue;
859 
860 		(void) abd_iterate_func2(dst, src, 0, 0, size,
861 		    vdev_raidz_reconst_p_func, NULL);
862 	}
863 
864 	return (1 << VDEV_RAIDZ_P);
865 }
866 
867 static int
868 vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts)
869 {
870 	int x = tgts[0];
871 	int c, exp;
872 	abd_t *dst, *src;
873 
874 	ASSERT(ntgts == 1);
875 
876 	ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size);
877 
878 	for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
879 		uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size,
880 		    rr->rr_col[c].rc_size);
881 
882 		src = rr->rr_col[c].rc_abd;
883 		dst = rr->rr_col[x].rc_abd;
884 
885 		if (c == rr->rr_firstdatacol) {
886 			abd_copy(dst, src, size);
887 			if (rr->rr_col[x].rc_size > size) {
888 				abd_zero_off(dst, size,
889 				    rr->rr_col[x].rc_size - size);
890 			}
891 		} else {
892 			ASSERT3U(size, <=, rr->rr_col[x].rc_size);
893 			(void) abd_iterate_func2(dst, src, 0, 0, size,
894 			    vdev_raidz_reconst_q_pre_func, NULL);
895 			(void) abd_iterate_func(dst,
896 			    size, rr->rr_col[x].rc_size - size,
897 			    vdev_raidz_reconst_q_pre_tail_func, NULL);
898 		}
899 	}
900 
901 	src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
902 	dst = rr->rr_col[x].rc_abd;
903 	exp = 255 - (rr->rr_cols - 1 - x);
904 
905 	struct reconst_q_struct rq = { abd_to_buf(src), exp };
906 	(void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size,
907 	    vdev_raidz_reconst_q_post_func, &rq);
908 
909 	return (1 << VDEV_RAIDZ_Q);
910 }
911 
912 static int
913 vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
914 {
915 	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
916 	abd_t *pdata, *qdata;
917 	uint64_t xsize, ysize;
918 	int x = tgts[0];
919 	int y = tgts[1];
920 	abd_t *xd, *yd;
921 
922 	ASSERT(ntgts == 2);
923 	ASSERT(x < y);
924 	ASSERT(x >= rr->rr_firstdatacol);
925 	ASSERT(y < rr->rr_cols);
926 
927 	ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size);
928 
929 	/*
930 	 * Move the parity data aside -- we're going to compute parity as
931 	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
932 	 * reuse the parity generation mechanism without trashing the actual
933 	 * parity so we make those columns appear to be full of zeros by
934 	 * setting their lengths to zero.
935 	 */
936 	pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
937 	qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
938 	xsize = rr->rr_col[x].rc_size;
939 	ysize = rr->rr_col[y].rc_size;
940 
941 	rr->rr_col[VDEV_RAIDZ_P].rc_abd =
942 	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
943 	rr->rr_col[VDEV_RAIDZ_Q].rc_abd =
944 	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
945 	rr->rr_col[x].rc_size = 0;
946 	rr->rr_col[y].rc_size = 0;
947 
948 	vdev_raidz_generate_parity_pq(rr);
949 
950 	rr->rr_col[x].rc_size = xsize;
951 	rr->rr_col[y].rc_size = ysize;
952 
953 	p = abd_to_buf(pdata);
954 	q = abd_to_buf(qdata);
955 	pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
956 	qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
957 	xd = rr->rr_col[x].rc_abd;
958 	yd = rr->rr_col[y].rc_abd;
959 
960 	/*
961 	 * We now have:
962 	 *	Pxy = P + D_x + D_y
963 	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
964 	 *
965 	 * We can then solve for D_x:
966 	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
967 	 * where
968 	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
969 	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
970 	 *
971 	 * With D_x in hand, we can easily solve for D_y:
972 	 *	D_y = P + Pxy + D_x
973 	 */
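	/*
	 * As a scalar sketch (illustrative only) of the per-byte step
	 * applied below by vdev_raidz_reconst_pq_func(), with p, q, pxy,
	 * and qxy naming single bytes at the same offset:
	 *
	 *	dx = vdev_raidz_exp2(p ^ pxy, aexp) ^
	 *	    vdev_raidz_exp2(q ^ qxy, bexp);
	 *	dy = p ^ pxy ^ dx;
	 */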
974 
975 	a = vdev_raidz_pow2[255 + x - y];
976 	b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)];
977 	tmp = 255 - vdev_raidz_log2[a ^ 1];
978 
979 	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
980 	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
981 
982 	ASSERT3U(xsize, >=, ysize);
983 	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
984 
985 	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
986 	    vdev_raidz_reconst_pq_func, &rpq);
987 	(void) abd_iterate_func(xd, ysize, xsize - ysize,
988 	    vdev_raidz_reconst_pq_tail_func, &rpq);
989 
990 	abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
991 	abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
992 
993 	/*
994 	 * Restore the saved parity data.
995 	 */
996 	rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata;
997 	rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata;
998 
999 	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
1000 }
1001 
1002 /* BEGIN CSTYLED */
1003 /*
1004  * In the general case of reconstruction, we must solve the system of linear
1005  * equations defined by the coefficients used to generate parity as well as
1006  * the contents of the data and parity disks. This can be expressed with
1007  * vectors for the original data (D) and the actual data (d) and parity (p)
1008  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
1009  *
1010  *            __   __                     __     __
1011  *            |     |         __     __   |  p_0  |
1012  *            |  V  |         |  D_0  |   | p_m-1 |
1013  *            |     |    x    |   :   | = |  d_0  |
1014  *            |  I  |         | D_n-1 |   |   :   |
1015  *            |     |         ~~     ~~   | d_n-1 |
1016  *            ~~   ~~                     ~~     ~~
1017  *
1018  * I is simply a square identity matrix of size n, and V is a Vandermonde
1019  * matrix defined by the coefficients we chose for the various parity columns
1020  * (1, 2, 4). Note that these values were chosen for simplicity and speed
1021  * of computation as well as for linear separability.
1022  *
1023  *      __               __               __     __
1024  *      |   1   ..  1 1 1 |               |  p_0  |
1025  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
1026  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
1027  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
1028  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
1029  *      |   :       : : : |   |   :   |   |  d_2  |
1030  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
1031  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
1032  *      |   0   ..  0 0 1 |               | d_n-1 |
1033  *      ~~               ~~               ~~     ~~
1034  *
1035  * Note that I, V, d, and p are known. To compute D, we must invert the
1036  * matrix and use the known data and parity values to reconstruct the unknown
1037  * data values. We begin by removing the rows in V|I and d|p that correspond
1038  * to failed or missing columns; we then make V|I square (n x n) and d|p
1039  * sized n by removing rows corresponding to unused parity from the bottom up
1040  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
1041  * using Gauss-Jordan elimination. In the example below we use m=3 parity
1042  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
1043  *           __                               __
1044  *           |  1   1   1   1   1   1   1   1  |
1045  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
1046  *           |  19 205 116  29  64  16  4   1  |      / /
1047  *           |  1   0   0   0   0   0   0   0  |     / /
1048  *           |  0   1   0   0   0   0   0   0  | <--' /
1049  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
1050  *           |  0   0   0   1   0   0   0   0  |
1051  *           |  0   0   0   0   1   0   0   0  |
1052  *           |  0   0   0   0   0   1   0   0  |
1053  *           |  0   0   0   0   0   0   1   0  |
1054  *           |  0   0   0   0   0   0   0   1  |
1055  *           ~~                               ~~
1056  *           __                               __
1057  *           |  1   1   1   1   1   1   1   1  |
1058  *           | 128  64  32  16  8   4   2   1  |
1059  *           |  19 205 116  29  64  16  4   1  |
1060  *           |  1   0   0   0   0   0   0   0  |
1061  *           |  0   1   0   0   0   0   0   0  |
1062  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
1063  *           |  0   0   0   1   0   0   0   0  |
1064  *           |  0   0   0   0   1   0   0   0  |
1065  *           |  0   0   0   0   0   1   0   0  |
1066  *           |  0   0   0   0   0   0   1   0  |
1067  *           |  0   0   0   0   0   0   0   1  |
1068  *           ~~                               ~~
1069  *
1070  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1071  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1072  * matrix is not singular.
1073  * __                                                                 __
1074  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1075  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1076  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1077  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1078  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1079  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1080  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1081  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1082  * ~~                                                                 ~~
1083  * __                                                                 __
1084  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1085  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1086  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1087  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1088  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1089  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1090  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1091  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1092  * ~~                                                                 ~~
1093  * __                                                                 __
1094  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1095  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1096  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1097  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1098  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1099  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1100  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1101  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1102  * ~~                                                                 ~~
1103  * __                                                                 __
1104  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1105  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1106  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1107  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1108  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1109  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1110  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1111  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1112  * ~~                                                                 ~~
1113  * __                                                                 __
1114  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1115  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1116  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1117  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1118  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1119  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1120  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1121  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1122  * ~~                                                                 ~~
1123  * __                                                                 __
1124  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1125  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1126  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1127  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1128  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1129  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1130  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1131  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1132  * ~~                                                                 ~~
1133  *                   __                               __
1134  *                   |  0   0   1   0   0   0   0   0  |
1135  *                   | 167 100  5   41 159 169 217 208 |
1136  *                   | 166 100  4   40 158 168 216 209 |
1137  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1138  *                   |  0   0   0   0   1   0   0   0  |
1139  *                   |  0   0   0   0   0   1   0   0  |
1140  *                   |  0   0   0   0   0   0   1   0  |
1141  *                   |  0   0   0   0   0   0   0   1  |
1142  *                   ~~                               ~~
1143  *
1144  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1145  * of the missing data.
1146  *
1147  * As is apparent from the example above, the only non-trivial rows in the
1148  * inverse matrix correspond to the data disks that we're trying to
1149  * reconstruct. Indeed, those are the only rows we need as the others would
1150  * only be useful for reconstructing data known or assumed to be valid. For
1151  * that reason, we only build the coefficients in the rows that correspond to
1152  * targeted columns.
1153  */
1154 /* END CSTYLED */
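/*
 * A minimal sketch (illustrative, not driver code) of that final step:
 * each reconstructed byte is the GF(2^8) dot product of one row of
 * (V|I)'^-1 with the surviving bytes of (d|p)' at the same offset.
 * Assuming inv[] holds the inverse-matrix row and col[] the surviving
 * column bytes:
 *
 *	uint8_t acc = 0;
 *	for (int j = 0; j < n; j++) {
 *		if (inv[j] != 0 && col[j] != 0)
 *			acc ^= vdev_raidz_pow2[(vdev_raidz_log2[inv[j]] +
 *			    vdev_raidz_log2[col[j]]) % 255];
 *	}
 *
 * vdev_raidz_matrix_reconstruct() below is the same loop, with the logs
 * of the inverse rows precomputed.
 */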
1155 
1156 static void
1157 vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map,
1158     uint8_t **rows)
1159 {
1160 	int i, j;
1161 	int pow;
1162 
1163 	ASSERT(n == rr->rr_cols - rr->rr_firstdatacol);
1164 
1165 	/*
1166 	 * Fill in the missing rows of interest.
1167 	 */
1168 	for (i = 0; i < nmap; i++) {
1169 		ASSERT3S(0, <=, map[i]);
1170 		ASSERT3S(map[i], <=, 2);
1171 
1172 		pow = map[i] * n;
1173 		if (pow > 255)
1174 			pow -= 255;
1175 		ASSERT(pow <= 255);
1176 
1177 		for (j = 0; j < n; j++) {
1178 			pow -= map[i];
1179 			if (pow < 0)
1180 				pow += 255;
1181 			rows[i][j] = vdev_raidz_pow2[pow];
1182 		}
1183 	}
1184 }
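/*
 * For instance, with map[i] = 2 (the R parity row) and n = 4 data columns,
 * the loop above fills rows[i] with 2^6, 2^4, 2^2, 2^0 = 64, 16, 4, 1 --
 * the matching row of the Vandermonde matrix shown earlier.
 */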
1185 
1186 static void
1187 vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing,
1188     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1189 {
1190 	int i, j, ii, jj;
1191 	uint8_t log;
1192 
1193 	/*
1194 	 * Assert that the first nmissing entries from the array of used
1195 	 * columns correspond to parity columns and that subsequent entries
1196 	 * correspond to data columns.
1197 	 */
1198 	for (i = 0; i < nmissing; i++) {
1199 		ASSERT3S(used[i], <, rr->rr_firstdatacol);
1200 	}
1201 	for (; i < n; i++) {
1202 		ASSERT3S(used[i], >=, rr->rr_firstdatacol);
1203 	}
1204 
1205 	/*
1206 	 * First initialize the storage where we'll compute the inverse rows.
1207 	 */
1208 	for (i = 0; i < nmissing; i++) {
1209 		for (j = 0; j < n; j++) {
1210 			invrows[i][j] = (i == j) ? 1 : 0;
1211 		}
1212 	}
1213 
1214 	/*
1215 	 * Subtract all trivial rows from the rows of consequence.
1216 	 */
1217 	for (i = 0; i < nmissing; i++) {
1218 		for (j = nmissing; j < n; j++) {
1219 			ASSERT3U(used[j], >=, rr->rr_firstdatacol);
1220 			jj = used[j] - rr->rr_firstdatacol;
1221 			ASSERT3S(jj, <, n);
1222 			invrows[i][j] = rows[i][jj];
1223 			rows[i][jj] = 0;
1224 		}
1225 	}
1226 
1227 	/*
1228 	 * For each of the rows of interest, we must normalize it and subtract
1229 	 * a multiple of it from the other rows.
1230 	 */
1231 	for (i = 0; i < nmissing; i++) {
1232 		for (j = 0; j < missing[i]; j++) {
1233 			ASSERT0(rows[i][j]);
1234 		}
1235 		ASSERT3U(rows[i][missing[i]], !=, 0);
1236 
1237 		/*
1238 		 * Compute the inverse of the first element and multiply each
1239 		 * element in the row by that value.
1240 		 */
1241 		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1242 
1243 		for (j = 0; j < n; j++) {
1244 			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1245 			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1246 		}
1247 
1248 		for (ii = 0; ii < nmissing; ii++) {
1249 			if (i == ii)
1250 				continue;
1251 
1252 			ASSERT3U(rows[ii][missing[i]], !=, 0);
1253 
1254 			log = vdev_raidz_log2[rows[ii][missing[i]]];
1255 
1256 			for (j = 0; j < n; j++) {
1257 				rows[ii][j] ^=
1258 				    vdev_raidz_exp2(rows[i][j], log);
1259 				invrows[ii][j] ^=
1260 				    vdev_raidz_exp2(invrows[i][j], log);
1261 			}
1262 		}
1263 	}
1264 
1265 	 * Verify that the data left in the rows is properly part of an
1266 	 * identity matrix.
1267 	 * an identity matrix.
1268 	 */
1269 	for (i = 0; i < nmissing; i++) {
1270 		for (j = 0; j < n; j++) {
1271 			if (j == missing[i]) {
1272 				ASSERT3U(rows[i][j], ==, 1);
1273 			} else {
1274 				ASSERT0(rows[i][j]);
1275 			}
1276 		}
1277 	}
1278 }
1279 
1280 static void
1281 vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
1282     int *missing, uint8_t **invrows, const uint8_t *used)
1283 {
1284 	int i, j, x, cc, c;
1285 	uint8_t *src;
1286 	uint64_t ccount;
1287 	uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL };
1288 	uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 };
1289 	uint8_t log = 0;
1290 	uint8_t val;
1291 	int ll;
1292 	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1293 	uint8_t *p, *pp;
1294 	size_t psize;
1295 
1296 	psize = sizeof (invlog[0][0]) * n * nmissing;
1297 	p = kmem_alloc(psize, KM_SLEEP);
1298 
1299 	for (pp = p, i = 0; i < nmissing; i++) {
1300 		invlog[i] = pp;
1301 		pp += n;
1302 	}
1303 
1304 	for (i = 0; i < nmissing; i++) {
1305 		for (j = 0; j < n; j++) {
1306 			ASSERT3U(invrows[i][j], !=, 0);
1307 			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1308 		}
1309 	}
1310 
1311 	for (i = 0; i < n; i++) {
1312 		c = used[i];
1313 		ASSERT3U(c, <, rr->rr_cols);
1314 
1315 		ccount = rr->rr_col[c].rc_size;
1316 		ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0);
1317 		if (ccount == 0)
1318 			continue;
1319 		src = abd_to_buf(rr->rr_col[c].rc_abd);
1320 		for (j = 0; j < nmissing; j++) {
1321 			cc = missing[j] + rr->rr_firstdatacol;
1322 			ASSERT3U(cc, >=, rr->rr_firstdatacol);
1323 			ASSERT3U(cc, <, rr->rr_cols);
1324 			ASSERT3U(cc, !=, c);
1325 
1326 			dcount[j] = rr->rr_col[cc].rc_size;
1327 			if (dcount[j] != 0)
1328 				dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd);
1329 		}
1330 
1331 		for (x = 0; x < ccount; x++, src++) {
1332 			if (*src != 0)
1333 				log = vdev_raidz_log2[*src];
1334 
1335 			for (cc = 0; cc < nmissing; cc++) {
1336 				if (x >= dcount[cc])
1337 					continue;
1338 
1339 				if (*src == 0) {
1340 					val = 0;
1341 				} else {
1342 					if ((ll = log + invlog[cc][i]) >= 255)
1343 						ll -= 255;
1344 					val = vdev_raidz_pow2[ll];
1345 				}
1346 
1347 				if (i == 0)
1348 					dst[cc][x] = val;
1349 				else
1350 					dst[cc][x] ^= val;
1351 			}
1352 		}
1353 	}
1354 
1355 	kmem_free(p, psize);
1356 }
1357 
1358 static int
1359 vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
1360 {
1361 	int n, i, c, t, tt;
1362 	int nmissing_rows;
1363 	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1364 	int parity_map[VDEV_RAIDZ_MAXPARITY];
1365 	uint8_t *p, *pp;
1366 	size_t psize;
1367 	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1368 	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1369 	uint8_t *used;
1370 
1371 	abd_t **bufs = NULL;
1372 
1373 	int code = 0;
1374 
1375 	/*
1376 	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
1377 	 * temporary linear ABDs if any non-linear ABDs are found.
1378 	 */
1379 	for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) {
1380 		if (!abd_is_linear(rr->rr_col[i].rc_abd)) {
1381 			bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *),
1382 			    KM_PUSHPAGE);
1383 
1384 			for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1385 				raidz_col_t *col = &rr->rr_col[c];
1386 
1387 				bufs[c] = col->rc_abd;
1388 				if (bufs[c] != NULL) {
1389 					col->rc_abd = abd_alloc_linear(
1390 					    col->rc_size, B_TRUE);
1391 					abd_copy(col->rc_abd, bufs[c],
1392 					    col->rc_size);
1393 				}
1394 			}
1395 
1396 			break;
1397 		}
1398 	}
1399 
1400 	n = rr->rr_cols - rr->rr_firstdatacol;
1401 
1402 	/*
1403 	 * Figure out which data columns are missing.
1404 	 */
1405 	nmissing_rows = 0;
1406 	for (t = 0; t < ntgts; t++) {
1407 		if (tgts[t] >= rr->rr_firstdatacol) {
1408 			missing_rows[nmissing_rows++] =
1409 			    tgts[t] - rr->rr_firstdatacol;
1410 		}
1411 	}
1412 
1413 	/*
1414 	 * Figure out which parity columns to use to help generate the missing
1415 	 * data columns.
1416 	 */
1417 	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1418 		ASSERT(tt < ntgts);
1419 		ASSERT(c < rr->rr_firstdatacol);
1420 
1421 		/*
1422 		 * Skip any targeted parity columns.
1423 		 */
1424 		if (c == tgts[tt]) {
1425 			tt++;
1426 			continue;
1427 		}
1428 
1429 		code |= 1 << c;
1430 
1431 		parity_map[i] = c;
1432 		i++;
1433 	}
1434 
1435 	ASSERT(code != 0);
1436 	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1437 
1438 	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1439 	    nmissing_rows * n + sizeof (used[0]) * n;
1440 	p = kmem_alloc(psize, KM_SLEEP);
1441 
1442 	for (pp = p, i = 0; i < nmissing_rows; i++) {
1443 		rows[i] = pp;
1444 		pp += n;
1445 		invrows[i] = pp;
1446 		pp += n;
1447 	}
1448 	used = pp;
1449 
1450 	for (i = 0; i < nmissing_rows; i++) {
1451 		used[i] = parity_map[i];
1452 	}
1453 
1454 	for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1455 		if (tt < nmissing_rows &&
1456 		    c == missing_rows[tt] + rr->rr_firstdatacol) {
1457 			tt++;
1458 			continue;
1459 		}
1460 
1461 		ASSERT3S(i, <, n);
1462 		used[i] = c;
1463 		i++;
1464 	}
1465 
1466 	/*
1467 	 * Initialize the interesting rows of the matrix.
1468 	 */
1469 	vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows);
1470 
1471 	/*
1472 	 * Invert the matrix.
1473 	 */
1474 	vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows,
1475 	    invrows, used);
1476 
1477 	/*
1478 	 * Reconstruct the missing data using the generated matrix.
1479 	 */
1480 	vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows,
1481 	    invrows, used);
1482 
1483 	kmem_free(p, psize);
1484 
1485 	/*
1486 	 * copy back from temporary linear abds and free them
1487 	 */
1488 	if (bufs) {
1489 		for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1490 			raidz_col_t *col = &rr->rr_col[c];
1491 
1492 			if (bufs[c] != NULL) {
1493 				abd_copy(bufs[c], col->rc_abd, col->rc_size);
1494 				abd_free(col->rc_abd);
1495 			}
1496 			col->rc_abd = bufs[c];
1497 		}
1498 		kmem_free(bufs, rr->rr_cols * sizeof (abd_t *));
1499 	}
1500 
1501 	return (code);
1502 }
1503 
1504 static int
1505 vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
1506     const int *t, int nt)
1507 {
1508 	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1509 	int ntgts;
1510 	int i, c, ret;
1511 	int code;
1512 	int nbadparity, nbaddata;
1513 	int parity_valid[VDEV_RAIDZ_MAXPARITY];
1514 
1515 	nbadparity = rr->rr_firstdatacol;
1516 	nbaddata = rr->rr_cols - nbadparity;
1517 	ntgts = 0;
1518 	for (i = 0, c = 0; c < rr->rr_cols; c++) {
1519 		if (c < rr->rr_firstdatacol)
1520 			parity_valid[c] = B_FALSE;
1521 
1522 		if (i < nt && c == t[i]) {
1523 			tgts[ntgts++] = c;
1524 			i++;
1525 		} else if (rr->rr_col[c].rc_error != 0) {
1526 			tgts[ntgts++] = c;
1527 		} else if (c >= rr->rr_firstdatacol) {
1528 			nbaddata--;
1529 		} else {
1530 			parity_valid[c] = B_TRUE;
1531 			nbadparity--;
1532 		}
1533 	}
1534 
1535 	ASSERT(ntgts >= nt);
1536 	ASSERT(nbaddata >= 0);
1537 	ASSERT(nbaddata + nbadparity == ntgts);
1538 
1539 	dt = &tgts[nbadparity];
1540 
1541 	/* Reconstruct using the new math implementation */
1542 	ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata);
1543 	if (ret != RAIDZ_ORIGINAL_IMPL)
1544 		return (ret);
1545 
1546 	/*
1547 	 * See if we can use any of our optimized reconstruction routines.
1548 	 */
1549 	switch (nbaddata) {
1550 	case 1:
1551 		if (parity_valid[VDEV_RAIDZ_P])
1552 			return (vdev_raidz_reconstruct_p(rr, dt, 1));
1553 
1554 		ASSERT(rr->rr_firstdatacol > 1);
1555 
1556 		if (parity_valid[VDEV_RAIDZ_Q])
1557 			return (vdev_raidz_reconstruct_q(rr, dt, 1));
1558 
1559 		ASSERT(rr->rr_firstdatacol > 2);
1560 		break;
1561 
1562 	case 2:
1563 		ASSERT(rr->rr_firstdatacol > 1);
1564 
1565 		if (parity_valid[VDEV_RAIDZ_P] &&
1566 		    parity_valid[VDEV_RAIDZ_Q])
1567 			return (vdev_raidz_reconstruct_pq(rr, dt, 2));
1568 
1569 		ASSERT(rr->rr_firstdatacol > 2);
1570 
1571 		break;
1572 	}
1573 
1574 	code = vdev_raidz_reconstruct_general(rr, tgts, ntgts);
1575 	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1576 	ASSERT(code > 0);
1577 	return (code);
1578 }
1579 
1580 static int
1581 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1582     uint64_t *logical_ashift, uint64_t *physical_ashift)
1583 {
1584 	vdev_raidz_t *vdrz = vd->vdev_tsd;
1585 	uint64_t nparity = vdrz->vd_nparity;
1586 	int c;
1587 	int lasterror = 0;
1588 	int numerrors = 0;
1589 
1590 	ASSERT(nparity > 0);
1591 
1592 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
1593 	    vd->vdev_children < nparity + 1) {
1594 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1595 		return (SET_ERROR(EINVAL));
1596 	}
1597 
1598 	vdev_open_children(vd);
1599 
1600 	for (c = 0; c < vd->vdev_children; c++) {
1601 		vdev_t *cvd = vd->vdev_child[c];
1602 
1603 		if (cvd->vdev_open_error != 0) {
1604 			lasterror = cvd->vdev_open_error;
1605 			numerrors++;
1606 			continue;
1607 		}
1608 
1609 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1610 		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1611 		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
1612 		*physical_ashift = MAX(*physical_ashift,
1613 		    cvd->vdev_physical_ashift);
1614 	}
1615 
1616 	*asize *= vd->vdev_children;
1617 	*max_asize *= vd->vdev_children;
1618 
1619 	if (numerrors > nparity) {
1620 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1621 		return (lasterror);
1622 	}
1623 
1624 	return (0);
1625 }
1626 
1627 static void
1628 vdev_raidz_close(vdev_t *vd)
1629 {
1630 	for (int c = 0; c < vd->vdev_children; c++) {
1631 		if (vd->vdev_child[c] != NULL)
1632 			vdev_close(vd->vdev_child[c]);
1633 	}
1634 }
1635 
1636 static uint64_t
1637 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1638 {
1639 	vdev_raidz_t *vdrz = vd->vdev_tsd;
1640 	uint64_t asize;
1641 	uint64_t ashift = vd->vdev_top->vdev_ashift;
1642 	uint64_t cols = vdrz->vd_logical_width;
1643 	uint64_t nparity = vdrz->vd_nparity;
1644 
1645 	asize = ((psize - 1) >> ashift) + 1;
1646 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1647 	asize = roundup(asize, nparity + 1) << ashift;
1648 
1649 	return (asize);
1650 }
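/*
 * For example (hypothetical numbers): a 12KB block on a 5-wide raidz1 with
 * ashift=9 needs 24 data sectors plus 1 * ((24 + 3) / 4) = 6 parity
 * sectors; roundup(30, 2) leaves 30, so 30 sectors (15KB) are allocated.
 */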
1651 
1652 /*
1653  * The allocatable space for a raidz vdev is N * sizeof(smallest child),
1654  * so each child must provide at least 1/Nth of its asize.
1655  */
1656 static uint64_t
1657 vdev_raidz_min_asize(vdev_t *vd)
1658 {
1659 	return ((vd->vdev_min_asize + vd->vdev_children - 1) /
1660 	    vd->vdev_children);
1661 }
1662 
1663 void
1664 vdev_raidz_child_done(zio_t *zio)
1665 {
1666 	raidz_col_t *rc = zio->io_private;
1667 
1668 	rc->rc_error = zio->io_error;
1669 	rc->rc_tried = 1;
1670 	rc->rc_skipped = 0;
1671 }
1672 
1673 static void
1674 vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col)
1675 {
1676 #ifdef ZFS_DEBUG
1677 	vdev_t *tvd = vd->vdev_top;
1678 
1679 	range_seg64_t logical_rs, physical_rs, remain_rs;
1680 	logical_rs.rs_start = rr->rr_offset;
1681 	logical_rs.rs_end = logical_rs.rs_start +
1682 	    vdev_raidz_asize(vd, rr->rr_size);
1683 
1684 	raidz_col_t *rc = &rr->rr_col[col];
1685 	vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
1686 
1687 	vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
1688 	ASSERT(vdev_xlate_is_empty(&remain_rs));
1689 	ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
1690 	ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
1691 	/*
1692 	 * It would be nice to assert that rs_end is equal
1693 	 * to rc_offset + rc_size but there might be an
1694 	 * optional I/O at the end that is not accounted for in
1695 	 * rc_size.
1696 	 */
1697 	if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
1698 		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
1699 		    rc->rc_size + (1 << tvd->vdev_ashift));
1700 	} else {
1701 		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
1702 	}
1703 #endif
1704 }
1705 
1706 static void
1707 vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift)
1708 {
1709 	vdev_t *vd = zio->io_vd;
1710 	raidz_map_t *rm = zio->io_vsd;
1711 	int c, i;
1712 
1713 	vdev_raidz_generate_parity_row(rm, rr);
1714 
1715 	for (int c = 0; c < rr->rr_cols; c++) {
1716 		raidz_col_t *rc = &rr->rr_col[c];
1717 		if (rc->rc_size == 0)
1718 			continue;
1719 
1720 		/* Verify physical to logical translation */
1721 		vdev_raidz_io_verify(vd, rr, c);
1722 
1723 		zio_nowait(zio_vdev_child_io(zio, NULL,
1724 		    vd->vdev_child[rc->rc_devidx], rc->rc_offset,
1725 		    rc->rc_abd, rc->rc_size, zio->io_type, zio->io_priority,
1726 		    0, vdev_raidz_child_done, rc));
1727 	}
1728 
1729 	/*
1730 	 * Generate optional I/Os for skip sectors to improve aggregation
1731 	 * contiguity.
1732 	 */
1733 	for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1734 		ASSERT(c <= rr->rr_scols);
1735 		if (c == rr->rr_scols)
1736 			c = 0;
1737 
1738 		raidz_col_t *rc = &rr->rr_col[c];
1739 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
1740 
1741 		zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1742 		    rc->rc_offset + rc->rc_size, NULL, 1ULL << ashift,
1743 		    zio->io_type, zio->io_priority,
1744 		    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1745 	}
1746 }
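
/*
 * The skip-sector loop above wraps around the row. With hypothetical
 * values rm_skipstart = 5, rm_nskip = 3, and rr_scols = 6, the optional
 * one-sector writes land just past columns 5, 0, and 1: c starts at 5,
 * wraps to 0 when it reaches rr_scols, and continues until rm_nskip
 * padding sectors have been covered.
 */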
1747 
1748 static void
1749 vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr)
1750 {
1751 	vdev_t *vd = zio->io_vd;
1752 
1753 	/*
1754 	 * Iterate over the columns in reverse order so that we hit the parity
1755 	 * last -- any errors along the way will force us to read the parity.
1756 	 */
1757 	for (int c = rr->rr_cols - 1; c >= 0; c--) {
1758 		raidz_col_t *rc = &rr->rr_col[c];
1759 		if (rc->rc_size == 0)
1760 			continue;
1761 		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
1762 		if (!vdev_readable(cvd)) {
1763 			if (c >= rr->rr_firstdatacol)
1764 				rr->rr_missingdata++;
1765 			else
1766 				rr->rr_missingparity++;
1767 			rc->rc_error = SET_ERROR(ENXIO);
1768 			rc->rc_tried = 1;	/* don't even try */
1769 			rc->rc_skipped = 1;
1770 			continue;
1771 		}
1772 		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1773 			if (c >= rr->rr_firstdatacol)
1774 				rr->rr_missingdata++;
1775 			else
1776 				rr->rr_missingparity++;
1777 			rc->rc_error = SET_ERROR(ESTALE);
1778 			rc->rc_skipped = 1;
1779 			continue;
1780 		}
1781 		if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
1782 		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1783 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1784 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
1785 			    zio->io_type, zio->io_priority, 0,
1786 			    vdev_raidz_child_done, rc));
1787 		}
1788 	}
1789 }
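
/*
 * For example (hypothetical healthy raidz2 row, rr_cols = 7,
 * rr_firstdatacol = 2, normal read): the reverse walk above issues child
 * reads for data columns 6 down to 2 and skips parity columns 1 and 0,
 * since rr_missingdata is still zero when they are reached. Any
 * unreadable or stale data column bumps rr_missingdata first, which then
 * forces the parity columns to be read as well.
 */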
1790 
1791 /*
1792  * Start an IO operation on a RAIDZ VDev
1793  *
1794  * Outline:
1795  * - For write operations:
1796  *   1. Generate the parity data
1797  *   2. Create child zio write operations to each column's vdev, for both
1798  *      data and parity.
1799  *   3. If the column skips any sectors for padding, create optional dummy
1800  *      write zio children for those areas to improve aggregation contiguity.
1801  * - For read operations:
1802  *   1. Create child zio read operations to each data column's vdev to read
1803  *      the range of data required for zio.
1804  *   2. If this is a scrub or resilver operation, or if any of the data
1805  *      vdevs have had errors, then create zio read operations to the parity
1806  *      columns' VDevs as well.
1807  */
1808 static void
1809 vdev_raidz_io_start(zio_t *zio)
1810 {
1811 	vdev_t *vd = zio->io_vd;
1812 	vdev_t *tvd = vd->vdev_top;
1813 	vdev_raidz_t *vdrz = vd->vdev_tsd;
1814 	raidz_map_t *rm;
1815 
1816 	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift,
1817 	    vdrz->vd_logical_width, vdrz->vd_nparity);
1818 
1819 	/*
1820 	 * Until raidz expansion is implemented all maps for a raidz vdev
1821 	 * contain a single row.
1822 	 */
1823 	ASSERT3U(rm->rm_nrows, ==, 1);
1824 	raidz_row_t *rr = rm->rm_row[0];
1825 
1826 	zio->io_vsd = rm;
1827 	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
1828 
1829 	if (zio->io_type == ZIO_TYPE_WRITE) {
1830 		vdev_raidz_io_start_write(zio, rr, tvd->vdev_ashift);
1831 	} else {
1832 		ASSERT(zio->io_type == ZIO_TYPE_READ);
1833 		vdev_raidz_io_start_read(zio, rr);
1834 	}
1835 
1836 	zio_execute(zio);
1837 }
1838 
1839 /*
1840  * Report a checksum error for a child of a RAID-Z device.
1841  */
1842 static void
1843 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
1844 {
1845 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1846 
1847 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) &&
1848 	    zio->io_priority != ZIO_PRIORITY_REBUILD) {
1849 		zio_bad_cksum_t zbc;
1850 		raidz_map_t *rm = zio->io_vsd;
1851 
1852 		zbc.zbc_has_cksum = 0;
1853 		zbc.zbc_injected = rm->rm_ecksuminjected;
1854 
1855 		(void) zfs_ereport_post_checksum(zio->io_spa, vd,
1856 		    &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
1857 		    rc->rc_abd, bad_data, &zbc);
1858 		mutex_enter(&vd->vdev_stat_lock);
1859 		vd->vdev_stat.vs_checksum_errors++;
1860 		mutex_exit(&vd->vdev_stat_lock);
1861 	}
1862 }
1863 
1864 /*
1865  * We keep track of whether or not there were any injected errors, so that
1866  * any ereports we generate can note it.
1867  */
1868 static int
1869 raidz_checksum_verify(zio_t *zio)
1870 {
1871 	zio_bad_cksum_t zbc;
1872 	raidz_map_t *rm = zio->io_vsd;
1873 
1874 	bzero(&zbc, sizeof (zio_bad_cksum_t));
1875 
1876 	int ret = zio_checksum_error(zio, &zbc);
1877 	if (ret != 0 && zbc.zbc_injected != 0)
1878 		rm->rm_ecksuminjected = 1;
1879 
1880 	return (ret);
1881 }
1882 
1883 /*
1884  * Generate the parity from the data columns. If we tried and were able to
1885  * read the parity without error, verify that the generated parity matches the
1886  * data we read. If it doesn't, we fire off a checksum error. Return the
1887  * number of such failures.
1888  */
1889 static int
1890 raidz_parity_verify(zio_t *zio, raidz_row_t *rr)
1891 {
1892 	abd_t *orig[VDEV_RAIDZ_MAXPARITY];
1893 	int c, ret = 0;
1894 	raidz_map_t *rm = zio->io_vsd;
1895 	raidz_col_t *rc;
1896 
1897 	blkptr_t *bp = zio->io_bp;
1898 	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
1899 	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
1900 
1901 	if (checksum == ZIO_CHECKSUM_NOPARITY)
1902 		return (ret);
1903 
1904 	for (c = 0; c < rr->rr_firstdatacol; c++) {
1905 		rc = &rr->rr_col[c];
1906 		if (!rc->rc_tried || rc->rc_error != 0)
1907 			continue;
1908 
1909 		orig[c] = abd_alloc_sametype(rc->rc_abd, rc->rc_size);
1910 		abd_copy(orig[c], rc->rc_abd, rc->rc_size);
1911 	}
1912 
1913 	/*
1914 	 * Regenerates parity even for !tried||rc_error!=0 columns.  This
1915 	 * isn't harmful but it does have the side effect of repairing
1916 	 * parity we didn't realize was damaged (i.e. even if we return 0).
1917 	 */
1918 	vdev_raidz_generate_parity_row(rm, rr);
1919 
1920 	for (c = 0; c < rr->rr_firstdatacol; c++) {
1921 		rc = &rr->rr_col[c];
1922 
1923 		if (!rc->rc_tried || rc->rc_error != 0)
1924 			continue;
1925 
1926 		if (abd_cmp(orig[c], rc->rc_abd) != 0) {
1927 			raidz_checksum_error(zio, rc, orig[c]);
1928 			rc->rc_error = SET_ERROR(ECKSUM);
1929 			ret++;
1930 		}
1931 		abd_free(orig[c]);
1932 	}
1933 
1934 	return (ret);
1935 }
1936 
1937 static int
1938 vdev_raidz_worst_error(raidz_row_t *rr)
1939 {
1940 	int error = 0;
1941 
1942 	for (int c = 0; c < rr->rr_cols; c++)
1943 		error = zio_worst_error(error, rr->rr_col[c].rc_error);
1944 
1945 	return (error);
1946 }
1947 
1948 static void
1949 vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
1950 {
1951 	int unexpected_errors = 0;
1952 	int parity_errors = 0;
1953 	int parity_untried = 0;
1954 	int data_errors = 0;
1955 
1956 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
1957 
1958 	for (int c = 0; c < rr->rr_cols; c++) {
1959 		raidz_col_t *rc = &rr->rr_col[c];
1960 
1961 		if (rc->rc_error) {
1962 			if (c < rr->rr_firstdatacol)
1963 				parity_errors++;
1964 			else
1965 				data_errors++;
1966 
1967 			if (!rc->rc_skipped)
1968 				unexpected_errors++;
1969 		} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
1970 			parity_untried++;
1971 		}
1972 	}
1973 
1974 	/*
1975 	 * If we read more parity disks than were used for
1976 	 * reconstruction, confirm that the other parity disks produced
1977 	 * correct data.
1978 	 *
1979 	 * Note that we also regenerate parity when resilvering so we
1980 	 * can write it out to failed devices later.
1981 	 */
1982 	if (parity_errors + parity_untried <
1983 	    rr->rr_firstdatacol - data_errors ||
1984 	    (zio->io_flags & ZIO_FLAG_RESILVER)) {
1985 		int n = raidz_parity_verify(zio, rr);
1986 		unexpected_errors += n;
1987 		ASSERT3U(parity_errors + n, <=, rr->rr_firstdatacol);
1988 	}
1989 
1990 	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
1991 	    (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) {
1992 		/*
1993 		 * Use the good data we have in hand to repair damaged children.
1994 		 */
1995 		for (int c = 0; c < rr->rr_cols; c++) {
1996 			raidz_col_t *rc = &rr->rr_col[c];
1997 			vdev_t *vd = zio->io_vd;
1998 			vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
1999 
2000 			if ((rc->rc_error == 0 || rc->rc_size == 0) &&
2001 			    (rc->rc_repair == 0)) {
2002 				continue;
2003 			}
2004 
2005 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2006 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
2007 			    ZIO_TYPE_WRITE,
2008 			    zio->io_priority == ZIO_PRIORITY_REBUILD ?
2009 			    ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
2010 			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2011 			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2012 		}
2013 	}
2014 }
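
/*
 * A concrete instance of the parity-verify trigger above (illustrative):
 * on raidz2 (rr_firstdatacol = 2), one data column failed to read, so
 * both parity columns were read and the data was reconstructed. Then
 * parity_errors + parity_untried = 0 is less than
 * rr_firstdatacol - data_errors = 1, so the cleanly read parity columns
 * are compared against freshly regenerated parity to confirm they were
 * not silently damaged as well.
 */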
2015 
2016 static void
2017 raidz_restore_orig_data(raidz_map_t *rm)
2018 {
2019 	for (int i = 0; i < rm->rm_nrows; i++) {
2020 		raidz_row_t *rr = rm->rm_row[i];
2021 		for (int c = 0; c < rr->rr_cols; c++) {
2022 			raidz_col_t *rc = &rr->rr_col[c];
2023 			if (rc->rc_need_orig_restore) {
2024 				abd_copy_from_buf(rc->rc_abd,
2025 				    rc->rc_orig_data, rc->rc_size);
2026 				rc->rc_need_orig_restore = B_FALSE;
2027 			}
2028 		}
2029 	}
2030 }
2031 
2032 /*
2033  * returns EINVAL if reconstruction of the block will not be possible
2034  * returns ECKSUM if this specific reconstruction failed
2035  * returns 0 on successful reconstruction
2036  */
2037 static int
2038 raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
2039 {
2040 	raidz_map_t *rm = zio->io_vsd;
2041 
2042 	/* Reconstruct each row */
2043 	for (int r = 0; r < rm->rm_nrows; r++) {
2044 		raidz_row_t *rr = rm->rm_row[r];
2045 		int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */
2046 		int t = 0;
2047 		int dead = 0;
2048 		int dead_data = 0;
2049 
2050 		for (int c = 0; c < rr->rr_cols; c++) {
2051 			raidz_col_t *rc = &rr->rr_col[c];
2052 			ASSERT0(rc->rc_need_orig_restore);
2053 			if (rc->rc_error != 0) {
2054 				dead++;
2055 				if (c >= nparity)
2056 					dead_data++;
2057 				continue;
2058 			}
2059 			if (rc->rc_size == 0)
2060 				continue;
2061 			for (int lt = 0; lt < ntgts; lt++) {
2062 				if (rc->rc_devidx == ltgts[lt]) {
2063 					if (rc->rc_orig_data == NULL) {
2064 						rc->rc_orig_data =
2065 						    zio_buf_alloc(rc->rc_size);
2066 						abd_copy_to_buf(
2067 						    rc->rc_orig_data,
2068 						    rc->rc_abd, rc->rc_size);
2069 					}
2070 					rc->rc_need_orig_restore = B_TRUE;
2071 
2072 					dead++;
2073 					if (c >= nparity)
2074 						dead_data++;
2075 					my_tgts[t++] = c;
2076 					break;
2077 				}
2078 			}
2079 		}
2080 		if (dead > nparity) {
2081 			/* reconstruction not possible */
2082 			raidz_restore_orig_data(rm);
2083 			return (EINVAL);
2084 		}
2085 		rr->rr_code = 0;
2086 		if (dead_data > 0)
2087 			rr->rr_code = vdev_raidz_reconstruct_row(rm, rr,
2088 			    my_tgts, t);
2089 	}
2090 
2091 	/* Check for success */
2092 	if (raidz_checksum_verify(zio) == 0) {
2093 
2094 		/* Reconstruction succeeded - report errors */
2095 		for (int i = 0; i < rm->rm_nrows; i++) {
2096 			raidz_row_t *rr = rm->rm_row[i];
2097 
2098 			for (int c = 0; c < rr->rr_cols; c++) {
2099 				raidz_col_t *rc = &rr->rr_col[c];
2100 				if (rc->rc_need_orig_restore) {
2101 					/*
2102 					 * Note: if this is a parity column,
2103 					 * we don't really know if it's wrong.
2104 					 * We need to let
2105 					 * vdev_raidz_io_done_verified() check
2106 					 * it, and if we set rc_error, it will
2107 					 * think that it is a "known" error
2108 					 * that doesn't need to be checked
2109 					 * or corrected.
2110 					 */
2111 					if (rc->rc_error == 0 &&
2112 					    c >= rr->rr_firstdatacol) {
2113 						raidz_checksum_error(zio,
2114 						    rc, rc->rc_gdata);
2115 						rc->rc_error =
2116 						    SET_ERROR(ECKSUM);
2117 					}
2118 					rc->rc_need_orig_restore = B_FALSE;
2119 				}
2120 			}
2121 
2122 			vdev_raidz_io_done_verified(zio, rr);
2123 		}
2124 
2125 		zio_checksum_verified(zio);
2126 
2127 		return (0);
2128 	}
2129 
2130 	/* Reconstruction failed - restore original data */
2131 	raidz_restore_orig_data(rm);
2132 	return (ECKSUM);
2133 }
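
/*
 * Note the indirection above: ltgts[] holds logical child ids chosen by
 * the combinatorial caller, while my_tgts[] holds this row's column
 * indices that map onto those children. For example (illustrative), if
 * child 4 is being treated as failed and its data landed in column 1 of
 * a row, ltgts[] contains 4 but my_tgts[] passes 1 to
 * vdev_raidz_reconstruct_row().
 */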
2134 
2135 /*
2136  * Iterate over all combinations of N bad vdevs and attempt a reconstruction.
2137  * Note that the algorithm below is non-optimal because it doesn't take into
2138  * account how reconstruction is actually performed. For example, with
2139  * triple-parity RAID-Z the reconstruction procedure is the same whether
2140  * column 4 alone is targeted as invalid or columns 1 and 4 both are, since
2141  * in both cases we'd only use parity information in column 0.
2142  *
2143  * The order that we find the various possible combinations of failed
2144  * disks is dictated by these rules:
2145  * - Examine each "slot" (the "i" in tgts[i])
2146  *   - Try to increment this slot (tgts[i] = tgts[i] + 1)
2147  *   - if we can't increment because it runs into the next slot,
2148  *     reset our slot to the minimum, and examine the next slot
2149  *
2150  *  For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose
2151  *  3 columns to reconstruct), we will generate the following sequence:
2152  *
2153  *  STATE        ACTION
2154  *  0 1 2        special case: skip since these are all parity
2155  *  0 1   3      first slot: reset to 0; middle slot: increment to 2
2156  *  0   2 3      first slot: increment to 1
2157  *    1 2 3      first: reset to 0; middle: reset to 1; last: increment to 4
2158  *  0 1     4    first: reset to 0; middle: increment to 2
2159  *  0   2   4    first: increment to 1
2160  *    1 2   4    first: reset to 0; middle: increment to 3
2161  *  0     3 4    first: increment to 1
2162  *    1   3 4    first: increment to 2
2163  *      2 3 4    first: reset to 0; middle: reset to 1; last: increment to 5
2164  *  0 1       5  first: reset to 0; middle: increment to 2
2165  *  0   2     5  first: increment to 1
2166  *    1 2     5  first: reset to 0; middle: increment to 3
2167  *  0     3   5  first: increment to 1
2168  *    1   3   5  first: increment to 2
2169  *      2 3   5  first: reset to 0; middle: increment to 4
2170  *  0       4 5  first: increment to 1
2171  *    1     4 5  first: increment to 2
2172  *      2   4 5  first: increment to 3
2173  *        3 4 5  done
2174  *
2175  * This strategy works for dRAID but is less efficient when there are a large
2176  * number of child vdevs and therefore permutations to check. Furthermore,
2177  * since the raidz_map_t rows likely do not overlap, reconstruction would be
2178  * possible as long as there are no more than nparity data errors per row.
2179  * These additional permutations are not currently checked but could be as
2180  * a future improvement.
2181  */
2182 static int
2183 vdev_raidz_combrec(zio_t *zio)
2184 {
2185 	int nparity = vdev_get_nparity(zio->io_vd);
2186 	raidz_map_t *rm = zio->io_vsd;
2187 
2188 	/* Check if there's enough data to attempt reconstruction. */
2189 	for (int i = 0; i < rm->rm_nrows; i++) {
2190 		raidz_row_t *rr = rm->rm_row[i];
2191 		int total_errors = 0;
2192 
2193 		for (int c = 0; c < rr->rr_cols; c++) {
2194 			if (rr->rr_col[c].rc_error)
2195 				total_errors++;
2196 		}
2197 
2198 		if (total_errors > nparity)
2199 			return (vdev_raidz_worst_error(rr));
2200 	}
2201 
2202 	for (int num_failures = 1; num_failures <= nparity; num_failures++) {
2203 		int tstore[VDEV_RAIDZ_MAXPARITY + 2];
2204 		int *ltgts = &tstore[1]; /* value is logical child ID */
2205 
2206 		/* Determine number of logical children, n */
2207 		int n = zio->io_vd->vdev_children;
2208 
2209 		ASSERT3U(num_failures, <=, nparity);
2210 		ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);
2211 
2212 		/* Handle corner cases in combrec logic */
2213 		ltgts[-1] = -1;
2214 		for (int i = 0; i < num_failures; i++) {
2215 			ltgts[i] = i;
2216 		}
2217 		ltgts[num_failures] = n;
2218 
2219 		for (;;) {
2220 			int err = raidz_reconstruct(zio, ltgts, num_failures,
2221 			    nparity);
2222 			if (err == EINVAL) {
2223 				/*
2224 				 * Reconstruction is not possible with this
2225 				 * number of failures; try more failures.
2226 				 */
2227 				break;
2228 			} else if (err == 0)
2229 				return (0);
2230 
2231 			/* Compute next targets to try */
2232 			for (int t = 0; ; t++) {
2233 				ASSERT3U(t, <, num_failures);
2234 				ltgts[t]++;
2235 				if (ltgts[t] == n) {
2236 					/* try more failures */
2237 					ASSERT3U(t, ==, num_failures - 1);
2238 					break;
2239 				}
2240 
2241 				ASSERT3U(ltgts[t], <, n);
2242 				ASSERT3U(ltgts[t], <=, ltgts[t + 1]);
2243 
2244 				/*
2245 				 * If that spot is available, we're done here.
2246 				 * Try the next combination.
2247 				 */
2248 				if (ltgts[t] != ltgts[t + 1])
2249 					break;
2250 
2251 				/*
2252 				 * Otherwise, reset this tgt to the minimum,
2253 				 * and move on to the next tgt.
2254 				 */
2255 				ltgts[t] = ltgts[t - 1] + 1;
2256 				ASSERT3U(ltgts[t], ==, t);
2257 			}
2258 
2259 			/* Increase the number of failures and keep trying. */
2260 			if (ltgts[num_failures - 1] == n)
2261 				break;
2262 		}
2263 	}
2264 
2265 	return (ECKSUM);
2266 }
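
/*
 * The "next targets" stepping above is an odometer over combinations.
 * A minimal userland sketch of the same visiting order, with a
 * hypothetical try_combination() callback standing in for
 * raidz_reconstruct() (the kernel loop additionally returns early on
 * success and keeps a -1 sentinel below slot 0 rather than resetting a
 * slot straight to its index):
 *
 *	void
 *	visit_combinations(int n, int k)
 *	{
 *		int t[VDEV_RAIDZ_MAXPARITY + 1];
 *
 *		for (int i = 0; i < k; i++)
 *			t[i] = i;		// start at {0, 1, .., k-1}
 *		t[k] = n;			// sentinel above the top slot
 *		for (;;) {
 *			try_combination(t, k);
 *			int i = 0;
 *			for (;;) {
 *				if (++t[i] == n)	// top slot exhausted
 *					return;
 *				if (t[i] != t[i + 1])	// free spot found
 *					break;
 *				t[i] = i;	// collision: reset, carry up
 *				i++;
 *			}
 *		}
 *	}
 */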
2267 
2268 void
2269 vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
2270 {
2271 	for (uint64_t row = 0; row < rm->rm_nrows; row++) {
2272 		raidz_row_t *rr = rm->rm_row[row];
2273 		vdev_raidz_reconstruct_row(rm, rr, t, nt);
2274 	}
2275 }
2276 
2277 /*
2278  * Complete a write IO operation on a RAIDZ VDev
2279  *
2280  * Outline:
2281  *   1. Check for errors on the child IOs.
2282  *   2. Return, setting an error code if too few child VDevs were written
2283  *      to reconstruct the data later.  Note that partial writes are
2284  *      considered successful if they can be reconstructed at all.
2285  */
2286 static void
2287 vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
2288 {
2289 	int total_errors = 0;
2290 
2291 	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
2292 	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
2293 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
2294 
2295 	for (int c = 0; c < rr->rr_cols; c++) {
2296 		raidz_col_t *rc = &rr->rr_col[c];
2297 
2298 		if (rc->rc_error) {
2299 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
2300 
2301 			total_errors++;
2302 		}
2303 	}
2304 
2305 	/*
2306 	 * Treat partial writes as a success. If we couldn't write enough
2307 	 * columns to reconstruct the data, the I/O failed.  Otherwise,
2308 	 * good enough.
2309 	 *
2310 	 * Now that we support write reallocation, it would be better
2311 	 * to treat partial failure as real failure unless there are
2312 	 * no non-degraded top-level vdevs left, and not update DTLs
2313 	 * if we intend to reallocate.
2314 	 */
2315 	if (total_errors > rr->rr_firstdatacol) {
2316 		zio->io_error = zio_worst_error(zio->io_error,
2317 		    vdev_raidz_worst_error(rr));
2318 	}
2319 }
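
/*
 * For example (illustrative): on raidz2 (rr_firstdatacol = 2), a write
 * that fails on one or two children is still reported as a success,
 * since the surviving columns suffice to reconstruct the data. With
 * three failed columns, total_errors (3) exceeds rr_firstdatacol (2)
 * and the worst child error is propagated to the zio.
 */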
2320 
2321 /*
2322  * return 0 if no reconstruction occurred, otherwise the "code" from
2323  * vdev_raidz_reconstruct_row().
2324  */
2325 static int
2326 vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
2327     raidz_row_t *rr)
2328 {
2329 	int parity_errors = 0;
2330 	int parity_untried = 0;
2331 	int data_errors = 0;
2332 	int total_errors = 0;
2333 	int code = 0;
2334 
2335 	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
2336 	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
2337 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
2338 
2339 	for (int c = 0; c < rr->rr_cols; c++) {
2340 		raidz_col_t *rc = &rr->rr_col[c];
2341 
2342 		if (rc->rc_error) {
2343 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
2344 
2345 			if (c < rr->rr_firstdatacol)
2346 				parity_errors++;
2347 			else
2348 				data_errors++;
2349 
2350 			total_errors++;
2351 		} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
2352 			parity_untried++;
2353 		}
2354 	}
2355 
2356 	/*
2357 	 * If there were data errors and the number of errors we saw was
2358 	 * correctable -- less than or equal to the number of parity disks read
2359 	 * -- reconstruct based on the missing data.
2360 	 */
2361 	if (data_errors != 0 &&
2362 	    total_errors <= rr->rr_firstdatacol - parity_untried) {
2363 		/*
2364 		 * We either attempt to read all the parity columns or
2365 		 * none of them. If we didn't try to read parity, we
2366 		 * wouldn't be here in the correctable case. There must
2367 		 * also have been fewer parity errors than parity
2368 		 * columns or, again, we wouldn't be in this code path.
2369 		 */
2370 		ASSERT(parity_untried == 0);
2371 		ASSERT(parity_errors < rr->rr_firstdatacol);
2372 
2373 		/*
2374 		 * Identify the data columns that reported an error.
2375 		 */
2376 		int n = 0;
2377 		int tgts[VDEV_RAIDZ_MAXPARITY];
2378 		for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
2379 			raidz_col_t *rc = &rr->rr_col[c];
2380 			if (rc->rc_error != 0) {
2381 				ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2382 				tgts[n++] = c;
2383 			}
2384 		}
2385 
2386 		ASSERT(rr->rr_firstdatacol >= n);
2387 
2388 		code = vdev_raidz_reconstruct_row(rm, rr, tgts, n);
2389 	}
2390 
2391 	return (code);
2392 }
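
/*
 * A worked example of the correctability test above (illustrative): on
 * raidz2 (rr_firstdatacol = 2) with both parity columns read
 * (parity_untried = 0), a single failed data column gives
 * total_errors = 1 <= 2 - 0, so that column is recorded in tgts[] and
 * rebuilt from parity. Had three columns failed, 3 <= 2 would be false
 * and reconstruction would be left to the combinatorial path.
 */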
2393 
2394 /*
2395  * Return the number of reads issued.
2396  */
2397 static int
2398 vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
2399 {
2400 	vdev_t *vd = zio->io_vd;
2401 	int nread = 0;
2402 
2403 	rr->rr_missingdata = 0;
2404 	rr->rr_missingparity = 0;
2405 
2406 	/*
2407 	 * If this row contains empty sectors which are not required
2408 	 * for a normal read then allocate an ABD for them now so they
2409 	 * may be read, verified, and any needed repairs performed.
2410 	 */
2411 	if (rr->rr_nempty && rr->rr_abd_empty == NULL)
2412 		vdev_draid_map_alloc_empty(zio, rr);
2413 
2414 	for (int c = 0; c < rr->rr_cols; c++) {
2415 		raidz_col_t *rc = &rr->rr_col[c];
2416 		if (rc->rc_tried || rc->rc_size == 0)
2417 			continue;
2418 
2419 		zio_nowait(zio_vdev_child_io(zio, NULL,
2420 		    vd->vdev_child[rc->rc_devidx],
2421 		    rc->rc_offset, rc->rc_abd, rc->rc_size,
2422 		    zio->io_type, zio->io_priority, 0,
2423 		    vdev_raidz_child_done, rc));
2424 		nread++;
2425 	}
2426 	return (nread);
2427 }
2428 
2429 /*
2430  * We're here because either there were too many errors to even attempt
2431  * reconstruction (total_errors > rr_firstdatacol), or vdev_*_combrec()
2432  * failed. In either case, there is enough bad data to prevent reconstruction.
2433  * Start checksum ereports for all children which haven't failed.
2434  */
2435 static void
2436 vdev_raidz_io_done_unrecoverable(zio_t *zio)
2437 {
2438 	raidz_map_t *rm = zio->io_vsd;
2439 
2440 	for (int i = 0; i < rm->rm_nrows; i++) {
2441 		raidz_row_t *rr = rm->rm_row[i];
2442 
2443 		for (int c = 0; c < rr->rr_cols; c++) {
2444 			raidz_col_t *rc = &rr->rr_col[c];
2445 			vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
2446 
2447 			if (rc->rc_error != 0)
2448 				continue;
2449 
2450 			zio_bad_cksum_t zbc;
2451 			zbc.zbc_has_cksum = 0;
2452 			zbc.zbc_injected = rm->rm_ecksuminjected;
2453 
2454 			(void) zfs_ereport_start_checksum(zio->io_spa,
2455 			    cvd, &zio->io_bookmark, zio, rc->rc_offset,
2456 			    rc->rc_size, (void *)(uintptr_t)c, &zbc);
2457 			mutex_enter(&cvd->vdev_stat_lock);
2458 			cvd->vdev_stat.vs_checksum_errors++;
2459 			mutex_exit(&cvd->vdev_stat_lock);
2460 		}
2461 	}
2462 }
2463 
2464 void
2465 vdev_raidz_io_done(zio_t *zio)
2466 {
2467 	raidz_map_t *rm = zio->io_vsd;
2468 
2469 	if (zio->io_type == ZIO_TYPE_WRITE) {
2470 		for (int i = 0; i < rm->rm_nrows; i++) {
2471 			vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
2472 		}
2473 	} else {
2474 		for (int i = 0; i < rm->rm_nrows; i++) {
2475 			raidz_row_t *rr = rm->rm_row[i];
2476 			rr->rr_code =
2477 			    vdev_raidz_io_done_reconstruct_known_missing(zio,
2478 			    rm, rr);
2479 		}
2480 
2481 		if (raidz_checksum_verify(zio) == 0) {
2482 			for (int i = 0; i < rm->rm_nrows; i++) {
2483 				raidz_row_t *rr = rm->rm_row[i];
2484 				vdev_raidz_io_done_verified(zio, rr);
2485 			}
2486 			zio_checksum_verified(zio);
2487 		} else {
2488 			/*
2489 			 * A sequential resilver has no checksum which makes
2490 			 * combinatorial reconstruction impossible. This code
2491 			 * path is unreachable since raidz_checksum_verify()
2492 			 * has no checksum to verify and must succeed.
2493 			 */
2494 			ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD);
2495 
2496 			/*
2497 			 * This isn't a typical situation -- either we got a
2498 			 * read error or a child silently returned bad data.
2499 			 * Read every block so we can try again with as much
2500 			 * data and parity as we can track down. If we've
2501 			 * already been through once before, all children will
2502 			 * be marked as tried so we'll proceed to combinatorial
2503 			 * reconstruction.
2504 			 */
2505 			int nread = 0;
2506 			for (int i = 0; i < rm->rm_nrows; i++) {
2507 				nread += vdev_raidz_read_all(zio,
2508 				    rm->rm_row[i]);
2509 			}
2510 			if (nread != 0) {
2511 				/*
2512 				 * Normally our stage is VDEV_IO_DONE, but if
2513 				 * we've already called redone(), it will have
2514 				 * changed to VDEV_IO_START, in which case we
2515 				 * don't want to call redone() again.
2516 				 */
2517 				if (zio->io_stage != ZIO_STAGE_VDEV_IO_START)
2518 					zio_vdev_io_redone(zio);
2519 				return;
2520 			}
2521 
2522 			zio->io_error = vdev_raidz_combrec(zio);
2523 			if (zio->io_error == ECKSUM &&
2524 			    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2525 				vdev_raidz_io_done_unrecoverable(zio);
2526 			}
2527 		}
2528 	}
2529 }
2530 
2531 static void
2532 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2533 {
2534 	vdev_raidz_t *vdrz = vd->vdev_tsd;
2535 	if (faulted > vdrz->vd_nparity)
2536 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2537 		    VDEV_AUX_NO_REPLICAS);
2538 	else if (degraded + faulted != 0)
2539 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2540 	else
2541 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2542 }
2543 
2544 /*
2545  * Determine if any portion of the provided block resides on a child vdev
2546  * with a dirty DTL and therefore needs to be resilvered.  The function
2547  * assumes that at least one DTL is dirty, which implies that full stripe
2548  * width blocks must be resilvered.
2549  */
2550 static boolean_t
2551 vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
2552     uint64_t phys_birth)
2553 {
2554 	vdev_raidz_t *vdrz = vd->vdev_tsd;
2555 	uint64_t dcols = vd->vdev_children;
2556 	uint64_t nparity = vdrz->vd_nparity;
2557 	uint64_t ashift = vd->vdev_top->vdev_ashift;
2558 	/* The starting RAIDZ (parent) vdev sector of the block. */
2559 	uint64_t b = DVA_GET_OFFSET(dva) >> ashift;
2560 	/* The zio's size in units of the vdev's minimum sector size. */
2561 	uint64_t s = ((psize - 1) >> ashift) + 1;
2562 	/* The first column for this stripe. */
2563 	uint64_t f = b % dcols;
2564 
2565 	/* Unreachable by sequential resilver. */
2566 	ASSERT3U(phys_birth, !=, TXG_UNKNOWN);
2567 
2568 	if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
2569 		return (B_FALSE);
2570 
2571 	if (s + nparity >= dcols)
2572 		return (B_TRUE);
2573 
2574 	for (uint64_t c = 0; c < s + nparity; c++) {
2575 		uint64_t devidx = (f + c) % dcols;
2576 		vdev_t *cvd = vd->vdev_child[devidx];
2577 
2578 		/*
2579 		 * dsl_scan_need_resilver() already checked vd with
2580 		 * vdev_dtl_contains(). So here just check cvd with
2581 		 * vdev_dtl_empty(), which is cheaper and a good approximation.
2582 		 */
2583 		if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
2584 			return (B_TRUE);
2585 	}
2586 
2587 	return (B_FALSE);
2588 }
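
/*
 * For example (hypothetical geometry): with dcols = 6, nparity = 1,
 * ashift = 12, a 12KB block (s = 3 sectors) starting at column f = 2
 * gives s + nparity = 4 < 6, so only the children holding columns 2, 3,
 * 4, and 5 are checked for a non-empty DTL. A block with
 * s + nparity >= 6 touches every child and is always resilvered.
 */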
2589 
2590 static void
2591 vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs,
2592     range_seg64_t *physical_rs, range_seg64_t *remain_rs)
2593 {
2594 	vdev_t *raidvd = cvd->vdev_parent;
2595 	ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
2596 
2597 	uint64_t width = raidvd->vdev_children;
2598 	uint64_t tgt_col = cvd->vdev_id;
2599 	uint64_t ashift = raidvd->vdev_top->vdev_ashift;
2600 
2601 	/* make sure the offsets are block-aligned */
2602 	ASSERT0(logical_rs->rs_start % (1 << ashift));
2603 	ASSERT0(logical_rs->rs_end % (1 << ashift));
2604 	uint64_t b_start = logical_rs->rs_start >> ashift;
2605 	uint64_t b_end = logical_rs->rs_end >> ashift;
2606 
2607 	uint64_t start_row = 0;
2608 	if (b_start > tgt_col) /* avoid underflow */
2609 		start_row = ((b_start - tgt_col - 1) / width) + 1;
2610 
2611 	uint64_t end_row = 0;
2612 	if (b_end > tgt_col)
2613 		end_row = ((b_end - tgt_col - 1) / width) + 1;
2614 
2615 	physical_rs->rs_start = start_row << ashift;
2616 	physical_rs->rs_end = end_row << ashift;
2617 
2618 	ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start);
2619 	ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
2620 	    logical_rs->rs_end - logical_rs->rs_start);
2621 }
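
/*
 * A worked translation (values chosen for illustration): with width = 4
 * children, tgt_col = 2, and ashift = 12, a logical range starting at
 * parent sector b_start = 10 yields
 * start_row = ((10 - 2 - 1) / 4) + 1 = 2, i.e. physical offset 2 << 12
 * on the child: rows 0 and 1 already placed their column-2 sectors at
 * parent sectors 2 and 6, both below 10.
 */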
2622 
2623 /*
2624  * Initialize private RAIDZ specific fields from the nvlist.
2625  */
2626 static int
2627 vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
2628 {
2629 	vdev_raidz_t *vdrz;
2630 	uint64_t nparity;
2631 
2632 	uint_t children;
2633 	nvlist_t **child;
2634 	int error = nvlist_lookup_nvlist_array(nv,
2635 	    ZPOOL_CONFIG_CHILDREN, &child, &children);
2636 	if (error != 0)
2637 		return (SET_ERROR(EINVAL));
2638 
2639 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) {
2640 		if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
2641 			return (SET_ERROR(EINVAL));
2642 
2643 		/*
2644 		 * Previous versions could only support 1 or 2 parity
2645 		 * devices.
2646 		 */
2647 		if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2)
2648 			return (SET_ERROR(EINVAL));
2649 		else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3)
2650 			return (SET_ERROR(EINVAL));
2651 	} else {
2652 		/*
2653 		 * We require the parity to be specified for SPAs that
2654 		 * support multiple parity levels.
2655 		 */
2656 		if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
2657 			return (SET_ERROR(EINVAL));
2658 
2659 		/*
2660 		 * Otherwise, we default to 1 parity device for RAID-Z.
2661 		 */
2662 		nparity = 1;
2663 	}
2664 
2665 	vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
2666 	vdrz->vd_logical_width = children;
2667 	vdrz->vd_nparity = nparity;
2668 
2669 	*tsd = vdrz;
2670 
2671 	return (0);
2672 }
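
/*
 * For example (hypothetical configs): an nvlist with nparity = 3 is
 * accepted only on pools at SPA_VERSION_RAIDZ3 or later, while an nvlist
 * carrying no ZPOOL_CONFIG_NPARITY at all is only legal on pools
 * predating SPA_VERSION_RAIDZ2, where it implies single parity.
 */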
2673 
2674 static void
2675 vdev_raidz_fini(vdev_t *vd)
2676 {
2677 	kmem_free(vd->vdev_tsd, sizeof (vdev_raidz_t));
2678 }
2679 
2680 /*
2681  * Add RAIDZ specific fields to the config nvlist.
2682  */
2683 static void
2684 vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
2685 {
2686 	ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
2687 	vdev_raidz_t *vdrz = vd->vdev_tsd;
2688 
2689 	/*
2690 	 * Make sure someone hasn't managed to sneak a fancy new vdev
2691 	 * into a crufty old storage pool.
2692 	 */
2693 	ASSERT(vdrz->vd_nparity == 1 ||
2694 	    (vdrz->vd_nparity <= 2 &&
2695 	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) ||
2696 	    (vdrz->vd_nparity <= 3 &&
2697 	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3));
2698 
2699 	/*
2700 	 * Note that we'll add this even on storage pools where it
2701 	 * isn't strictly required -- older software will just ignore
2702 	 * it.
2703 	 */
2704 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);
2705 }
2706 
2707 static uint64_t
2708 vdev_raidz_nparity(vdev_t *vd)
2709 {
2710 	vdev_raidz_t *vdrz = vd->vdev_tsd;
2711 	return (vdrz->vd_nparity);
2712 }
2713 
2714 static uint64_t
2715 vdev_raidz_ndisks(vdev_t *vd)
2716 {
2717 	return (vd->vdev_children);
2718 }
2719 
2720 vdev_ops_t vdev_raidz_ops = {
2721 	.vdev_op_init = vdev_raidz_init,
2722 	.vdev_op_fini = vdev_raidz_fini,
2723 	.vdev_op_open = vdev_raidz_open,
2724 	.vdev_op_close = vdev_raidz_close,
2725 	.vdev_op_asize = vdev_raidz_asize,
2726 	.vdev_op_min_asize = vdev_raidz_min_asize,
2727 	.vdev_op_min_alloc = NULL,
2728 	.vdev_op_io_start = vdev_raidz_io_start,
2729 	.vdev_op_io_done = vdev_raidz_io_done,
2730 	.vdev_op_state_change = vdev_raidz_state_change,
2731 	.vdev_op_need_resilver = vdev_raidz_need_resilver,
2732 	.vdev_op_hold = NULL,
2733 	.vdev_op_rele = NULL,
2734 	.vdev_op_remap = NULL,
2735 	.vdev_op_xlate = vdev_raidz_xlate,
2736 	.vdev_op_rebuild_asize = NULL,
2737 	.vdev_op_metaslab_init = NULL,
2738 	.vdev_op_config_generate = vdev_raidz_config_generate,
2739 	.vdev_op_nparity = vdev_raidz_nparity,
2740 	.vdev_op_ndisks = vdev_raidz_ndisks,
2741 	.vdev_op_type = VDEV_TYPE_RAIDZ,	/* name of this vdev type */
2742 	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
2743 };
2744