xref: /freebsd/sys/contrib/openzfs/module/zfs/vdev_raidz.c (revision e92ffd9b626833ebdbf2742c8ffddc6cd94b963e)
1eda14cbcSMatt Macy /*
2eda14cbcSMatt Macy  * CDDL HEADER START
3eda14cbcSMatt Macy  *
4eda14cbcSMatt Macy  * The contents of this file are subject to the terms of the
5eda14cbcSMatt Macy  * Common Development and Distribution License (the "License").
6eda14cbcSMatt Macy  * You may not use this file except in compliance with the License.
7eda14cbcSMatt Macy  *
8eda14cbcSMatt Macy  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9eda14cbcSMatt Macy  * or http://www.opensolaris.org/os/licensing.
10eda14cbcSMatt Macy  * See the License for the specific language governing permissions
11eda14cbcSMatt Macy  * and limitations under the License.
12eda14cbcSMatt Macy  *
13eda14cbcSMatt Macy  * When distributing Covered Code, include this CDDL HEADER in each
14eda14cbcSMatt Macy  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15eda14cbcSMatt Macy  * If applicable, add the following below this CDDL HEADER, with the
16eda14cbcSMatt Macy  * fields enclosed by brackets "[]" replaced with your own identifying
17eda14cbcSMatt Macy  * information: Portions Copyright [yyyy] [name of copyright owner]
18eda14cbcSMatt Macy  *
19eda14cbcSMatt Macy  * CDDL HEADER END
20eda14cbcSMatt Macy  */
21eda14cbcSMatt Macy 
22eda14cbcSMatt Macy /*
23eda14cbcSMatt Macy  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
242c48331dSMatt Macy  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
25eda14cbcSMatt Macy  * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
26eda14cbcSMatt Macy  */
27eda14cbcSMatt Macy 
28eda14cbcSMatt Macy #include <sys/zfs_context.h>
29eda14cbcSMatt Macy #include <sys/spa.h>
30eda14cbcSMatt Macy #include <sys/vdev_impl.h>
31eda14cbcSMatt Macy #include <sys/zio.h>
32eda14cbcSMatt Macy #include <sys/zio_checksum.h>
33eda14cbcSMatt Macy #include <sys/abd.h>
34eda14cbcSMatt Macy #include <sys/fs/zfs.h>
35eda14cbcSMatt Macy #include <sys/fm/fs/zfs.h>
36eda14cbcSMatt Macy #include <sys/vdev_raidz.h>
37eda14cbcSMatt Macy #include <sys/vdev_raidz_impl.h>
387877fdebSMatt Macy #include <sys/vdev_draid.h>
39eda14cbcSMatt Macy 
40eda14cbcSMatt Macy #ifdef ZFS_DEBUG
41eda14cbcSMatt Macy #include <sys/vdev.h>	/* For vdev_xlate() in vdev_raidz_io_verify() */
42eda14cbcSMatt Macy #endif
43eda14cbcSMatt Macy 
44eda14cbcSMatt Macy /*
45eda14cbcSMatt Macy  * Virtual device vector for RAID-Z.
46eda14cbcSMatt Macy  *
47eda14cbcSMatt Macy  * This vdev supports single, double, and triple parity. For single parity,
48eda14cbcSMatt Macy  * we use a simple XOR of all the data columns. For double or triple parity,
49eda14cbcSMatt Macy  * we use a special case of Reed-Solomon coding. This extends the
50eda14cbcSMatt Macy  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
51eda14cbcSMatt Macy  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
52eda14cbcSMatt Macy  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
53eda14cbcSMatt Macy  * former is also based. The latter is designed to provide higher performance
54eda14cbcSMatt Macy  * for writes.
55eda14cbcSMatt Macy  *
56eda14cbcSMatt Macy  * Note that the Plank paper claimed to support arbitrary N+M, but was then
57eda14cbcSMatt Macy  * amended six years later identifying a critical flaw that invalidates its
58eda14cbcSMatt Macy  * claims. Nevertheless, the technique can be adapted to work for up to
59eda14cbcSMatt Macy  * triple parity. For additional parity, the amendment "Note: Correction to
60eda14cbcSMatt Macy  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
61eda14cbcSMatt Macy  * is viable, but the additional complexity means that write performance will
62eda14cbcSMatt Macy  * suffer.
63eda14cbcSMatt Macy  *
64eda14cbcSMatt Macy  * All of the methods above operate on a Galois field, defined over the
65eda14cbcSMatt Macy  * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
66eda14cbcSMatt Macy  * can be expressed with a single byte. Briefly, the operations on the
67eda14cbcSMatt Macy  * field are defined as follows:
68eda14cbcSMatt Macy  *
69eda14cbcSMatt Macy  *   o addition (+) is represented by a bitwise XOR
70eda14cbcSMatt Macy  *   o subtraction (-) is therefore identical to addition: A + B = A - B
71eda14cbcSMatt Macy  *   o multiplication of A by 2 is defined by the following bitwise expression:
72eda14cbcSMatt Macy  *
73eda14cbcSMatt Macy  *	(A * 2)_7 = A_6
74eda14cbcSMatt Macy  *	(A * 2)_6 = A_5
75eda14cbcSMatt Macy  *	(A * 2)_5 = A_4
76eda14cbcSMatt Macy  *	(A * 2)_4 = A_3 + A_7
77eda14cbcSMatt Macy  *	(A * 2)_3 = A_2 + A_7
78eda14cbcSMatt Macy  *	(A * 2)_2 = A_1 + A_7
79eda14cbcSMatt Macy  *	(A * 2)_1 = A_0
80eda14cbcSMatt Macy  *	(A * 2)_0 = A_7
81eda14cbcSMatt Macy  *
82eda14cbcSMatt Macy  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
83eda14cbcSMatt Macy  * As an aside, this multiplication is derived from the error correcting
84eda14cbcSMatt Macy  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
85eda14cbcSMatt Macy  *
86eda14cbcSMatt Macy  * Observe that any number in the field (except for 0) can be expressed as a
87eda14cbcSMatt Macy  * power of 2 -- a generator for the field. We store a table of the powers of
88eda14cbcSMatt Macy  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
89eda14cbcSMatt Macy  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
90eda14cbcSMatt Macy  * than field addition). The inverse of a field element A (A^-1) is therefore
91eda14cbcSMatt Macy  * A ^ (255 - 1) = A^254.
92eda14cbcSMatt Macy  *
93eda14cbcSMatt Macy  * The up-to-three parity columns, P, Q, R over several data columns,
94eda14cbcSMatt Macy  * D_0, ... D_n-1, can be expressed by field operations:
95eda14cbcSMatt Macy  *
96eda14cbcSMatt Macy  *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
97eda14cbcSMatt Macy  *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
98eda14cbcSMatt Macy  *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
99eda14cbcSMatt Macy  *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
100eda14cbcSMatt Macy  *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
101eda14cbcSMatt Macy  *
102eda14cbcSMatt Macy  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
103eda14cbcSMatt Macy  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
104eda14cbcSMatt Macy  * independent coefficients. (There are no additional coefficients that have
105eda14cbcSMatt Macy  * this property which is why the uncorrected Plank method breaks down.)
106eda14cbcSMatt Macy  *
107eda14cbcSMatt Macy  * See the reconstruction code below for how P, Q and R can used individually
108eda14cbcSMatt Macy  * or in concert to recover missing data columns.
109eda14cbcSMatt Macy  */
110eda14cbcSMatt Macy 
111eda14cbcSMatt Macy #define	VDEV_RAIDZ_P		0
112eda14cbcSMatt Macy #define	VDEV_RAIDZ_Q		1
113eda14cbcSMatt Macy #define	VDEV_RAIDZ_R		2
114eda14cbcSMatt Macy 
115eda14cbcSMatt Macy #define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
116eda14cbcSMatt Macy #define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
117eda14cbcSMatt Macy 
118eda14cbcSMatt Macy /*
119eda14cbcSMatt Macy  * We provide a mechanism to perform the field multiplication operation on a
120eda14cbcSMatt Macy  * 64-bit value all at once rather than a byte at a time. This works by
121eda14cbcSMatt Macy  * creating a mask from the top bit in each byte and using that to
122eda14cbcSMatt Macy  * conditionally apply the XOR of 0x1d.
123eda14cbcSMatt Macy  */
124eda14cbcSMatt Macy #define	VDEV_RAIDZ_64MUL_2(x, mask) \
125eda14cbcSMatt Macy { \
126eda14cbcSMatt Macy 	(mask) = (x) & 0x8080808080808080ULL; \
127eda14cbcSMatt Macy 	(mask) = ((mask) << 1) - ((mask) >> 7); \
128eda14cbcSMatt Macy 	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
129eda14cbcSMatt Macy 	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
130eda14cbcSMatt Macy }
131eda14cbcSMatt Macy 
132eda14cbcSMatt Macy #define	VDEV_RAIDZ_64MUL_4(x, mask) \
133eda14cbcSMatt Macy { \
134eda14cbcSMatt Macy 	VDEV_RAIDZ_64MUL_2((x), mask); \
135eda14cbcSMatt Macy 	VDEV_RAIDZ_64MUL_2((x), mask); \
136eda14cbcSMatt Macy }
137eda14cbcSMatt Macy 
1387877fdebSMatt Macy static void
1397877fdebSMatt Macy vdev_raidz_row_free(raidz_row_t *rr)
140eda14cbcSMatt Macy {
141184c1b94SMartin Matuska 	for (int c = 0; c < rr->rr_cols; c++) {
142184c1b94SMartin Matuska 		raidz_col_t *rc = &rr->rr_col[c];
143eda14cbcSMatt Macy 
144184c1b94SMartin Matuska 		if (rc->rc_size != 0)
145184c1b94SMartin Matuska 			abd_free(rc->rc_abd);
146184c1b94SMartin Matuska 		if (rc->rc_orig_data != NULL)
147f9693befSMartin Matuska 			abd_free(rc->rc_orig_data);
148eda14cbcSMatt Macy 	}
149eda14cbcSMatt Macy 
1507877fdebSMatt Macy 	if (rr->rr_abd_empty != NULL)
1517877fdebSMatt Macy 		abd_free(rr->rr_abd_empty);
152eda14cbcSMatt Macy 
1537877fdebSMatt Macy 	kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols]));
1547877fdebSMatt Macy }
1557877fdebSMatt Macy 
1567877fdebSMatt Macy void
1577877fdebSMatt Macy vdev_raidz_map_free(raidz_map_t *rm)
1587877fdebSMatt Macy {
1597877fdebSMatt Macy 	for (int i = 0; i < rm->rm_nrows; i++)
1607877fdebSMatt Macy 		vdev_raidz_row_free(rm->rm_row[i]);
1617877fdebSMatt Macy 
1627877fdebSMatt Macy 	kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
163eda14cbcSMatt Macy }
164eda14cbcSMatt Macy 
165eda14cbcSMatt Macy static void
166eda14cbcSMatt Macy vdev_raidz_map_free_vsd(zio_t *zio)
167eda14cbcSMatt Macy {
168eda14cbcSMatt Macy 	raidz_map_t *rm = zio->io_vsd;
169eda14cbcSMatt Macy 
170eda14cbcSMatt Macy 	vdev_raidz_map_free(rm);
171eda14cbcSMatt Macy }
172eda14cbcSMatt Macy 
/*
 * Vdev-specific-data ops table for raidz zios: releases the raidz map
 * stored in io_vsd via vdev_raidz_map_free_vsd() when the zio is done.
 */
const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	.vsd_free = vdev_raidz_map_free_vsd,
};
176eda14cbcSMatt Macy 
/*
 * Attach ABDs to the (single) row of a write map.  Parity columns get
 * private linear buffers, padded with a zeroed skip sector when the
 * row's skip sectors wrap around onto the parity columns; data columns
 * reference the zio's own buffer, with gang ABDs appending zero-filled
 * skip sectors so each child vdev receives one contiguous I/O.
 */
static void
vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift)
{
	int c;
	int nwrapped = 0;
	uint64_t off = 0;
	raidz_row_t *rr = rm->rm_row[0];

	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
	ASSERT3U(rm->rm_nrows, ==, 1);

	/*
	 * Pad any parity columns with additional space to account for skip
	 * sectors.
	 */
	if (rm->rm_skipstart < rr->rr_firstdatacol) {
		ASSERT0(rm->rm_skipstart);
		nwrapped = rm->rm_nskip;
	} else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) {
		nwrapped =
		    (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols;
	}

	/*
	 * Optional single skip sectors (rc_size == 0) will be handled in
	 * vdev_raidz_io_start_write().
	 */
	int skipped = rr->rr_scols - rr->rr_cols;

	/* Allocate buffers for the parity columns */
	for (c = 0; c < rr->rr_firstdatacol; c++) {
		raidz_col_t *rc = &rr->rr_col[c];

		/*
		 * Parity columns will pad out a linear ABD to account for
		 * the skip sector. A linear ABD is used here because
		 * parity calculations use the ABD buffer directly to calculate
		 * parity. This avoids doing a memcpy back to the ABD after the
		 * parity has been calculated. By issuing the parity column
		 * with the skip sector we can reduce contention on the child
		 * VDEV queue locks (vq_lock).
		 */
		if (c < nwrapped) {
			rc->rc_abd = abd_alloc_linear(
			    rc->rc_size + (1ULL << ashift), B_FALSE);
			abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift);
			skipped++;
		} else {
			rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
		}
	}

	for (off = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct,
		    zio->io_abd, off, rc->rc_size);

		/*
		 * Generate I/O for skip sectors to improve aggregation
		 * continuity. We will use gang ABD's to reduce contention
		 * on the child VDEV queue locks (vq_lock) by issuing
		 * a single I/O that contains the data and skip sector.
		 *
		 * It is important to make sure that rc_size is not updated
		 * even though we are adding a skip sector to the ABD. When
		 * calculating the parity in vdev_raidz_generate_parity_row()
		 * the rc_size is used to iterate through the ABD's. We can
		 * not have zero'd out skip sectors used for calculating
		 * parity for raidz, because those same sectors are not used
		 * during reconstruction.
		 */
		if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) {
			rc->rc_abd = abd_alloc_gang();
			abd_gang_add(rc->rc_abd, abd, B_TRUE);
			abd_gang_add(rc->rc_abd,
			    abd_get_zeros(1ULL << ashift), B_TRUE);
			skipped++;
		} else {
			rc->rc_abd = abd;
		}
		off += rc->rc_size;
	}

	ASSERT3U(off, ==, zio->io_size);
	ASSERT3S(skipped, ==, rm->rm_nskip);
}
26381b22a98SMartin Matuska 
26481b22a98SMartin Matuska static void
26581b22a98SMartin Matuska vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm)
26681b22a98SMartin Matuska {
26781b22a98SMartin Matuska 	int c;
26881b22a98SMartin Matuska 	raidz_row_t *rr = rm->rm_row[0];
26981b22a98SMartin Matuska 
27081b22a98SMartin Matuska 	ASSERT3U(rm->rm_nrows, ==, 1);
27181b22a98SMartin Matuska 
27281b22a98SMartin Matuska 	/* Allocate buffers for the parity columns */
27381b22a98SMartin Matuska 	for (c = 0; c < rr->rr_firstdatacol; c++)
27481b22a98SMartin Matuska 		rr->rr_col[c].rc_abd =
27581b22a98SMartin Matuska 		    abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);
27681b22a98SMartin Matuska 
27781b22a98SMartin Matuska 	for (uint64_t off = 0; c < rr->rr_cols; c++) {
27881b22a98SMartin Matuska 		raidz_col_t *rc = &rr->rr_col[c];
27981b22a98SMartin Matuska 		rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
28081b22a98SMartin Matuska 		    zio->io_abd, off, rc->rc_size);
28181b22a98SMartin Matuska 		off += rc->rc_size;
28281b22a98SMartin Matuska 	}
28381b22a98SMartin Matuska }
28481b22a98SMartin Matuska 
/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 *
 * Avoid inlining the function to keep vdev_raidz_io_start(), which
 * is this functions only caller, as small as possible on the stack.
 *
 * zio     - the I/O to lay out across the raidz group
 * ashift  - log2 of the vdev's minimum sector size
 * dcols   - total number of columns (data + parity children)
 * nparity - number of parity columns (1, 2, or 3)
 *
 * Returns a single-row raidz_map_t describing the physical layout; the
 * caller owns it and releases it with vdev_raidz_map_free().
 */
noinline raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
    uint64_t nparity)
{
	raidz_row_t *rr;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = zio->io_offset >> ashift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = zio->io_size >> ashift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << ashift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	raidz_map_t *rm =
	    kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
	rm->rm_nrows = 1;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/*
	 * acols: The columns that will be accessed.
	 * scols: The columns that will be accessed or skipped.
	 */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rr = kmem_alloc(offsetof(raidz_row_t, rr_col[scols]), KM_SLEEP);
	rm->rm_row[0] = rr;

	rr->rr_cols = acols;
	rr->rr_scols = scols;
	rr->rr_bigcols = bc;
	rr->rr_missingdata = 0;
	rr->rr_missingparity = 0;
	rr->rr_firstdatacol = nparity;
	rr->rr_abd_empty = NULL;
	rr->rr_nempty = 0;
#ifdef ZFS_DEBUG
	rr->rr_offset = zio->io_offset;
	rr->rr_size = zio->io_size;
#endif

	asize = 0;

	/* Assign each column its child vdev, device offset, and size. */
	for (c = 0; c < scols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		col = f + c;
		coff = o;
		/* Columns past the last child wrap to the next stripe row. */
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << ashift;
		}
		rc->rc_devidx = col;
		rc->rc_offset = coff;
		rc->rc_abd = NULL;
		rc->rc_orig_data = NULL;
		rc->rc_error = 0;
		rc->rc_tried = 0;
		rc->rc_skipped = 0;
		rc->rc_force_repair = 0;
		rc->rc_allow_repair = 1;
		rc->rc_need_orig_restore = B_FALSE;

		if (c >= acols)
			rc->rc_size = 0;
		else if (c < bc)
			rc->rc_size = (q + 1) << ashift;
		else
			rc->rc_size = q << ashift;

		asize += rc->rc_size;
	}

	ASSERT3U(asize, ==, tot << ashift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	rm->rm_skipstart = bc;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rr->rr_cols >= 2);
	ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);

	if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
		devidx = rr->rr_col[0].rc_devidx;
		o = rr->rr_col[0].rc_offset;
		rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
		rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
		rr->rr_col[1].rc_devidx = devidx;
		rr->rr_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_map_alloc_write(zio, rm, ashift);
	} else {
		vdev_raidz_map_alloc_read(zio, rm);
	}

	/* init RAIDZ parity ops */
	rm->rm_ops = vdev_raidz_math_get_ops();

	return (rm);
}
444eda14cbcSMatt Macy 
/*
 * Cursor state shared with the abd_iterate_func() callbacks below:
 * current write positions within the P, Q, and R parity buffers.
 * Parities not being generated are NULL.
 */
struct pqr_struct {
	uint64_t *p;
	uint64_t *q;
	uint64_t *r;
};
450eda14cbcSMatt Macy 
451eda14cbcSMatt Macy static int
452eda14cbcSMatt Macy vdev_raidz_p_func(void *buf, size_t size, void *private)
453eda14cbcSMatt Macy {
454eda14cbcSMatt Macy 	struct pqr_struct *pqr = private;
455eda14cbcSMatt Macy 	const uint64_t *src = buf;
456eda14cbcSMatt Macy 	int i, cnt = size / sizeof (src[0]);
457eda14cbcSMatt Macy 
458eda14cbcSMatt Macy 	ASSERT(pqr->p && !pqr->q && !pqr->r);
459eda14cbcSMatt Macy 
460eda14cbcSMatt Macy 	for (i = 0; i < cnt; i++, src++, pqr->p++)
461eda14cbcSMatt Macy 		*pqr->p ^= *src;
462eda14cbcSMatt Macy 
463eda14cbcSMatt Macy 	return (0);
464eda14cbcSMatt Macy }
465eda14cbcSMatt Macy 
466eda14cbcSMatt Macy static int
467eda14cbcSMatt Macy vdev_raidz_pq_func(void *buf, size_t size, void *private)
468eda14cbcSMatt Macy {
469eda14cbcSMatt Macy 	struct pqr_struct *pqr = private;
470eda14cbcSMatt Macy 	const uint64_t *src = buf;
471eda14cbcSMatt Macy 	uint64_t mask;
472eda14cbcSMatt Macy 	int i, cnt = size / sizeof (src[0]);
473eda14cbcSMatt Macy 
474eda14cbcSMatt Macy 	ASSERT(pqr->p && pqr->q && !pqr->r);
475eda14cbcSMatt Macy 
476eda14cbcSMatt Macy 	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
477eda14cbcSMatt Macy 		*pqr->p ^= *src;
478eda14cbcSMatt Macy 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
479eda14cbcSMatt Macy 		*pqr->q ^= *src;
480eda14cbcSMatt Macy 	}
481eda14cbcSMatt Macy 
482eda14cbcSMatt Macy 	return (0);
483eda14cbcSMatt Macy }
484eda14cbcSMatt Macy 
485eda14cbcSMatt Macy static int
486eda14cbcSMatt Macy vdev_raidz_pqr_func(void *buf, size_t size, void *private)
487eda14cbcSMatt Macy {
488eda14cbcSMatt Macy 	struct pqr_struct *pqr = private;
489eda14cbcSMatt Macy 	const uint64_t *src = buf;
490eda14cbcSMatt Macy 	uint64_t mask;
491eda14cbcSMatt Macy 	int i, cnt = size / sizeof (src[0]);
492eda14cbcSMatt Macy 
493eda14cbcSMatt Macy 	ASSERT(pqr->p && pqr->q && pqr->r);
494eda14cbcSMatt Macy 
495eda14cbcSMatt Macy 	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
496eda14cbcSMatt Macy 		*pqr->p ^= *src;
497eda14cbcSMatt Macy 		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
498eda14cbcSMatt Macy 		*pqr->q ^= *src;
499eda14cbcSMatt Macy 		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
500eda14cbcSMatt Macy 		*pqr->r ^= *src;
501eda14cbcSMatt Macy 	}
502eda14cbcSMatt Macy 
503eda14cbcSMatt Macy 	return (0);
504eda14cbcSMatt Macy }
505eda14cbcSMatt Macy 
506eda14cbcSMatt Macy static void
5077877fdebSMatt Macy vdev_raidz_generate_parity_p(raidz_row_t *rr)
508eda14cbcSMatt Macy {
5097877fdebSMatt Macy 	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
510eda14cbcSMatt Macy 
5117877fdebSMatt Macy 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
5127877fdebSMatt Macy 		abd_t *src = rr->rr_col[c].rc_abd;
513eda14cbcSMatt Macy 
5147877fdebSMatt Macy 		if (c == rr->rr_firstdatacol) {
5157877fdebSMatt Macy 			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
516eda14cbcSMatt Macy 		} else {
517eda14cbcSMatt Macy 			struct pqr_struct pqr = { p, NULL, NULL };
5187877fdebSMatt Macy 			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
519eda14cbcSMatt Macy 			    vdev_raidz_p_func, &pqr);
520eda14cbcSMatt Macy 		}
521eda14cbcSMatt Macy 	}
522eda14cbcSMatt Macy }
523eda14cbcSMatt Macy 
524eda14cbcSMatt Macy static void
5257877fdebSMatt Macy vdev_raidz_generate_parity_pq(raidz_row_t *rr)
526eda14cbcSMatt Macy {
5277877fdebSMatt Macy 	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
5287877fdebSMatt Macy 	uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
5297877fdebSMatt Macy 	uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
5307877fdebSMatt Macy 	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
5317877fdebSMatt Macy 	    rr->rr_col[VDEV_RAIDZ_Q].rc_size);
532eda14cbcSMatt Macy 
5337877fdebSMatt Macy 	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
5347877fdebSMatt Macy 		abd_t *src = rr->rr_col[c].rc_abd;
535eda14cbcSMatt Macy 
5367877fdebSMatt Macy 		uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
537eda14cbcSMatt Macy 
5387877fdebSMatt Macy 		if (c == rr->rr_firstdatacol) {
539eda14cbcSMatt Macy 			ASSERT(ccnt == pcnt || ccnt == 0);
5407877fdebSMatt Macy 			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
5417877fdebSMatt Macy 			(void) memcpy(q, p, rr->rr_col[c].rc_size);
542eda14cbcSMatt Macy 
5437877fdebSMatt Macy 			for (uint64_t i = ccnt; i < pcnt; i++) {
544eda14cbcSMatt Macy 				p[i] = 0;
545eda14cbcSMatt Macy 				q[i] = 0;
546eda14cbcSMatt Macy 			}
547eda14cbcSMatt Macy 		} else {
548eda14cbcSMatt Macy 			struct pqr_struct pqr = { p, q, NULL };
549eda14cbcSMatt Macy 
550eda14cbcSMatt Macy 			ASSERT(ccnt <= pcnt);
5517877fdebSMatt Macy 			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
552eda14cbcSMatt Macy 			    vdev_raidz_pq_func, &pqr);
553eda14cbcSMatt Macy 
554eda14cbcSMatt Macy 			/*
555eda14cbcSMatt Macy 			 * Treat short columns as though they are full of 0s.
556eda14cbcSMatt Macy 			 * Note that there's therefore nothing needed for P.
557eda14cbcSMatt Macy 			 */
5587877fdebSMatt Macy 			uint64_t mask;
5597877fdebSMatt Macy 			for (uint64_t i = ccnt; i < pcnt; i++) {
560eda14cbcSMatt Macy 				VDEV_RAIDZ_64MUL_2(q[i], mask);
561eda14cbcSMatt Macy 			}
562eda14cbcSMatt Macy 		}
563eda14cbcSMatt Macy 	}
564eda14cbcSMatt Macy }
565eda14cbcSMatt Macy 
/*
 * Compute triple parity: P is the XOR of all data columns; Q and R
 * accumulate each column scaled by successive powers of 2 and 4
 * respectively in GF(2^8).  See the block comment at the top of this
 * file for the underlying Reed-Solomon math.
 */
static void
vdev_raidz_generate_parity_pqr(raidz_row_t *rr)
{
	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
	uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
	uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd);
	uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
	    rr->rr_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
	    rr->rr_col[VDEV_RAIDZ_R].rc_size);

	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
		abd_t *src = rr->rr_col[c].rc_abd;

		uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);

		if (c == rr->rr_firstdatacol) {
			/* Seed all three parities from the first column. */
			ASSERT(ccnt == pcnt || ccnt == 0);
			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
			(void) memcpy(q, p, rr->rr_col[c].rc_size);
			(void) memcpy(r, p, rr->rr_col[c].rc_size);

			/* Zero-fill the tail of a short first column. */
			for (uint64_t i = ccnt; i < pcnt; i++) {
				p[i] = 0;
				q[i] = 0;
				r[i] = 0;
			}
		} else {
			struct pqr_struct pqr = { p, q, r };

			ASSERT(ccnt <= pcnt);
			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
			    vdev_raidz_pqr_func, &pqr);

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			uint64_t mask;
			for (uint64_t i = ccnt; i < pcnt; i++) {
				VDEV_RAIDZ_64MUL_2(q[i], mask);
				VDEV_RAIDZ_64MUL_4(r[i], mask);
			}
		}
	}
}
613eda14cbcSMatt Macy 
614eda14cbcSMatt Macy /*
615eda14cbcSMatt Macy  * Generate RAID parity in the first virtual columns according to the number of
616eda14cbcSMatt Macy  * parity columns available.
617eda14cbcSMatt Macy  */
618eda14cbcSMatt Macy void
6197877fdebSMatt Macy vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr)
620eda14cbcSMatt Macy {
6217877fdebSMatt Macy 	ASSERT3U(rr->rr_cols, !=, 0);
6227877fdebSMatt Macy 
623eda14cbcSMatt Macy 	/* Generate using the new math implementation */
6247877fdebSMatt Macy 	if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL)
625eda14cbcSMatt Macy 		return;
626eda14cbcSMatt Macy 
6277877fdebSMatt Macy 	switch (rr->rr_firstdatacol) {
628eda14cbcSMatt Macy 	case 1:
6297877fdebSMatt Macy 		vdev_raidz_generate_parity_p(rr);
630eda14cbcSMatt Macy 		break;
631eda14cbcSMatt Macy 	case 2:
6327877fdebSMatt Macy 		vdev_raidz_generate_parity_pq(rr);
633eda14cbcSMatt Macy 		break;
634eda14cbcSMatt Macy 	case 3:
6357877fdebSMatt Macy 		vdev_raidz_generate_parity_pqr(rr);
636eda14cbcSMatt Macy 		break;
637eda14cbcSMatt Macy 	default:
638eda14cbcSMatt Macy 		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
639eda14cbcSMatt Macy 	}
640eda14cbcSMatt Macy }
641eda14cbcSMatt Macy 
6427877fdebSMatt Macy void
6437877fdebSMatt Macy vdev_raidz_generate_parity(raidz_map_t *rm)
6447877fdebSMatt Macy {
6457877fdebSMatt Macy 	for (int i = 0; i < rm->rm_nrows; i++) {
6467877fdebSMatt Macy 		raidz_row_t *rr = rm->rm_row[i];
6477877fdebSMatt Macy 		vdev_raidz_generate_parity_row(rm, rr);
6487877fdebSMatt Macy 	}
6497877fdebSMatt Macy }
6507877fdebSMatt Macy 
/*
 * abd iterator callback: XOR the source buffer into the destination,
 * one 64-bit word at a time.  Used to rebuild a data column from the
 * P (plain XOR) parity.  Always returns 0 (continue iteration).
 */
static int
vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
{
	(void) private;
	uint64_t *d = dbuf;
	const uint64_t *s = sbuf;

	for (size_t n = size / sizeof (d[0]); n != 0; n--)
		*d++ ^= *s++;

	return (0);
}
665eda14cbcSMatt Macy 
/*
 * abd iterator callback: multiply each byte of the destination by 2 in
 * GF(2^8) (eight bytes at a time via VDEV_RAIDZ_64MUL_2) and then add
 * (XOR) the source.  This accumulates the Q-parity contribution of one
 * data column during single-column reconstruction from Q.
 */
static int
vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
    void *private)
{
	(void) private;
	uint64_t *dst = dbuf;
	uint64_t *src = sbuf;
	uint64_t mask;	/* scratch for VDEV_RAIDZ_64MUL_2 */
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++) {
		VDEV_RAIDZ_64MUL_2(dst[i], mask);
		dst[i] ^= src[i];
	}

	return (0);
}
683eda14cbcSMatt Macy 
/*
 * abd iterator callback: multiply each byte of the buffer by 2 in
 * GF(2^8) with no source to add.  Used for the region beyond the end of
 * a short column, which was treated as zeros at parity-generation time.
 */
static int
vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
{
	(void) private;
	uint64_t *dst = buf;
	uint64_t mask;	/* scratch for VDEV_RAIDZ_64MUL_2 */
	int cnt = size / sizeof (dst[0]);

	/* same operation as vdev_raidz_reconst_q_pre_func() on dst */
	for (int i = 0; i < cnt; i++) {
		VDEV_RAIDZ_64MUL_2(dst[i], mask);
	}

	return (0);
}
699eda14cbcSMatt Macy 
/*
 * Iterator state for vdev_raidz_reconst_q_post_func(): a cursor into the
 * Q parity buffer, advanced in step with the column being rebuilt, plus
 * the GF(2^8) exponent applied to every byte.
 */
struct reconst_q_struct {
	uint64_t *q;	/* next Q parity word to fold in */
	int exp;	/* exponent passed to vdev_raidz_exp2() */
};
704eda14cbcSMatt Macy 
705eda14cbcSMatt Macy static int
706eda14cbcSMatt Macy vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
707eda14cbcSMatt Macy {
708eda14cbcSMatt Macy 	struct reconst_q_struct *rq = private;
709eda14cbcSMatt Macy 	uint64_t *dst = buf;
710eda14cbcSMatt Macy 	int cnt = size / sizeof (dst[0]);
711eda14cbcSMatt Macy 
712eda14cbcSMatt Macy 	for (int i = 0; i < cnt; i++, dst++, rq->q++) {
713eda14cbcSMatt Macy 		int j;
714eda14cbcSMatt Macy 		uint8_t *b;
715eda14cbcSMatt Macy 
716eda14cbcSMatt Macy 		*dst ^= *rq->q;
717eda14cbcSMatt Macy 		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
718eda14cbcSMatt Macy 			*b = vdev_raidz_exp2(*b, rq->exp);
719eda14cbcSMatt Macy 		}
720eda14cbcSMatt Macy 	}
721eda14cbcSMatt Macy 
722eda14cbcSMatt Macy 	return (0);
723eda14cbcSMatt Macy }
724eda14cbcSMatt Macy 
/*
 * Iterator state for vdev_raidz_reconst_pq_func() and
 * vdev_raidz_reconst_pq_tail_func(): cursors into the four parity
 * buffers plus the two GF(2^8) exponents used to solve for D_x.
 */
struct reconst_pq_struct {
	uint8_t *p;	/* saved original P parity */
	uint8_t *q;	/* saved original Q parity */
	uint8_t *pxy;	/* P recomputed with columns x and y zeroed */
	uint8_t *qxy;	/* Q recomputed with columns x and y zeroed */
	int aexp;	/* log2 of coefficient A, applied to P ^ Pxy */
	int bexp;	/* log2 of coefficient B, applied to Q ^ Qxy */
};
733eda14cbcSMatt Macy 
734eda14cbcSMatt Macy static int
735eda14cbcSMatt Macy vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
736eda14cbcSMatt Macy {
737eda14cbcSMatt Macy 	struct reconst_pq_struct *rpq = private;
738eda14cbcSMatt Macy 	uint8_t *xd = xbuf;
739eda14cbcSMatt Macy 	uint8_t *yd = ybuf;
740eda14cbcSMatt Macy 
741eda14cbcSMatt Macy 	for (int i = 0; i < size;
742eda14cbcSMatt Macy 	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
743eda14cbcSMatt Macy 		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
744eda14cbcSMatt Macy 		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
745eda14cbcSMatt Macy 		*yd = *rpq->p ^ *rpq->pxy ^ *xd;
746eda14cbcSMatt Macy 	}
747eda14cbcSMatt Macy 
748eda14cbcSMatt Macy 	return (0);
749eda14cbcSMatt Macy }
750eda14cbcSMatt Macy 
751eda14cbcSMatt Macy static int
752eda14cbcSMatt Macy vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
753eda14cbcSMatt Macy {
754eda14cbcSMatt Macy 	struct reconst_pq_struct *rpq = private;
755eda14cbcSMatt Macy 	uint8_t *xd = xbuf;
756eda14cbcSMatt Macy 
757eda14cbcSMatt Macy 	for (int i = 0; i < size;
758eda14cbcSMatt Macy 	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
759eda14cbcSMatt Macy 		/* same operation as vdev_raidz_reconst_pq_func() on xd */
760eda14cbcSMatt Macy 		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
761eda14cbcSMatt Macy 		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
762eda14cbcSMatt Macy 	}
763eda14cbcSMatt Macy 
764eda14cbcSMatt Macy 	return (0);
765eda14cbcSMatt Macy }
766eda14cbcSMatt Macy 
/*
 * Reconstruct a single missing data column (tgts[0]) from the P (plain
 * XOR) parity: copy P into the target, then XOR in every other data
 * column, leaving the original data.
 */
static void
vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts)
{
	int x = tgts[0];
	abd_t *dst, *src;

	ASSERT3U(ntgts, ==, 1);
	ASSERT3U(x, >=, rr->rr_firstdatacol);
	ASSERT3U(x, <, rr->rr_cols);

	ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size);

	src = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
	dst = rr->rr_col[x].rc_abd;

	/* Seed the target with the P parity contents. */
	abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size);

	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
		/*
		 * Short columns contribute only up to their own length;
		 * past that they were treated as zeros when parity was
		 * generated, so there is nothing to XOR back out.
		 */
		uint64_t size = MIN(rr->rr_col[x].rc_size,
		    rr->rr_col[c].rc_size);

		src = rr->rr_col[c].rc_abd;

		if (c == x)
			continue;

		(void) abd_iterate_func2(dst, src, 0, 0, size,
		    vdev_raidz_reconst_p_func, NULL);
	}
}
797eda14cbcSMatt Macy 
/*
 * Reconstruct a single missing data column (tgts[0]) from the Q parity:
 * re-accumulate the Q contribution of every surviving data column into
 * the target (Horner-style: multiply by 2 in GF(2^8), then add the next
 * column), fold in Q itself, and finally multiply by the inverse of the
 * target column's own coefficient to recover the data.
 */
static void
vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts)
{
	int x = tgts[0];
	int c, exp;
	abd_t *dst, *src;

	ASSERT(ntgts == 1);

	ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
		/* Skip the missing column by giving it a zero length. */
		uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size,
		    rr->rr_col[c].rc_size);

		src = rr->rr_col[c].rc_abd;
		dst = rr->rr_col[x].rc_abd;

		if (c == rr->rr_firstdatacol) {
			/* The first column seeds the accumulator. */
			abd_copy(dst, src, size);
			if (rr->rr_col[x].rc_size > size) {
				abd_zero_off(dst, size,
				    rr->rr_col[x].rc_size - size);
			}
		} else {
			/*
			 * Multiply the accumulator by 2 and add this
			 * column; the tail-only pass covers the region
			 * past the end of a short column, which counts
			 * as zeros.
			 */
			ASSERT3U(size, <=, rr->rr_col[x].rc_size);
			(void) abd_iterate_func2(dst, src, 0, 0, size,
			    vdev_raidz_reconst_q_pre_func, NULL);
			(void) abd_iterate_func(dst,
			    size, rr->rr_col[x].rc_size - size,
			    vdev_raidz_reconst_q_pre_tail_func, NULL);
		}
	}

	/*
	 * Add Q and multiply by 2^(255 - k), i.e. divide by 2^k where
	 * k = rr_cols - 1 - x is the column's coefficient exponent.
	 */
	src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
	dst = rr->rr_col[x].rc_abd;
	exp = 255 - (rr->rr_cols - 1 - x);

	struct reconst_q_struct rq = { abd_to_buf(src), exp };
	(void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size,
	    vdev_raidz_reconst_q_post_func, &rq);
}
840eda14cbcSMatt Macy 
/*
 * Reconstruct two missing data columns (x < y, both data columns) using
 * the P and Q parity.  Parity is recomputed with x and y forced to zero
 * length (Pxy, Qxy), then the resulting 2x2 GF(2^8) system is solved
 * byte-wise via the reconst_pq iterator callbacks.
 */
static void
vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
	abd_t *pdata, *qdata;
	uint64_t xsize, ysize;
	int x = tgts[0];
	int y = tgts[1];
	abd_t *xd, *yd;

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rr->rr_firstdatacol);
	ASSERT(y < rr->rr_cols);

	ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
	qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
	xsize = rr->rr_col[x].rc_size;
	ysize = rr->rr_col[y].rc_size;

	rr->rr_col[VDEV_RAIDZ_P].rc_abd =
	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
	rr->rr_col[VDEV_RAIDZ_Q].rc_abd =
	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
	rr->rr_col[x].rc_size = 0;
	rr->rr_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rr);

	/* Restore the real column lengths before solving. */
	rr->rr_col[x].rc_size = xsize;
	rr->rr_col[y].rc_size = ysize;

	p = abd_to_buf(pdata);
	q = abd_to_buf(qdata);
	pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
	qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
	xd = rr->rr_col[x].rc_abd;
	yd = rr->rr_col[y].rc_abd;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	/* Coefficients A and B above, carried as log2 values. */
	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	ASSERT3U(xsize, >=, ysize);
	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };

	/* Solve both columns for the overlap, then x's tail alone. */
	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
	    vdev_raidz_reconst_pq_func, &rpq);
	(void) abd_iterate_func(xd, ysize, xsize - ysize,
	    vdev_raidz_reconst_pq_tail_func, &rpq);

	abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
	abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);

	/*
	 * Restore the saved parity data.
	 */
	rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata;
	rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata;
}
928eda14cbcSMatt Macy 
929eda14cbcSMatt Macy /* BEGIN CSTYLED */
930eda14cbcSMatt Macy /*
931eda14cbcSMatt Macy  * In the general case of reconstruction, we must solve the system of linear
932eda14cbcSMatt Macy  * equations defined by the coefficients used to generate parity as well as
933eda14cbcSMatt Macy  * the contents of the data and parity disks. This can be expressed with
934eda14cbcSMatt Macy  * vectors for the original data (D) and the actual data (d) and parity (p)
935eda14cbcSMatt Macy  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
936eda14cbcSMatt Macy  *
937eda14cbcSMatt Macy  *            __   __                     __     __
938eda14cbcSMatt Macy  *            |     |         __     __   |  p_0  |
939eda14cbcSMatt Macy  *            |  V  |         |  D_0  |   | p_m-1 |
940eda14cbcSMatt Macy  *            |     |    x    |   :   | = |  d_0  |
941eda14cbcSMatt Macy  *            |  I  |         | D_n-1 |   |   :   |
942eda14cbcSMatt Macy  *            |     |         ~~     ~~   | d_n-1 |
943eda14cbcSMatt Macy  *            ~~   ~~                     ~~     ~~
944eda14cbcSMatt Macy  *
945eda14cbcSMatt Macy  * I is simply a square identity matrix of size n, and V is a vandermonde
946eda14cbcSMatt Macy  * matrix defined by the coefficients we chose for the various parity columns
947eda14cbcSMatt Macy  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
948eda14cbcSMatt Macy  * computation as well as linear separability.
949eda14cbcSMatt Macy  *
950eda14cbcSMatt Macy  *      __               __               __     __
951eda14cbcSMatt Macy  *      |   1   ..  1 1 1 |               |  p_0  |
952eda14cbcSMatt Macy  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
953eda14cbcSMatt Macy  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
954eda14cbcSMatt Macy  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
955eda14cbcSMatt Macy  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
956eda14cbcSMatt Macy  *      |   :       : : : |   |   :   |   |  d_2  |
957eda14cbcSMatt Macy  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
958eda14cbcSMatt Macy  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
959eda14cbcSMatt Macy  *      |   0   ..  0 0 1 |               | d_n-1 |
960eda14cbcSMatt Macy  *      ~~               ~~               ~~     ~~
961eda14cbcSMatt Macy  *
962eda14cbcSMatt Macy  * Note that I, V, d, and p are known. To compute D, we must invert the
963eda14cbcSMatt Macy  * matrix and use the known data and parity values to reconstruct the unknown
964eda14cbcSMatt Macy  * data values. We begin by removing the rows in V|I and d|p that correspond
965eda14cbcSMatt Macy  * to failed or missing columns; we then make V|I square (n x n) and d|p
966eda14cbcSMatt Macy  * sized n by removing rows corresponding to unused parity from the bottom up
967eda14cbcSMatt Macy  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
968eda14cbcSMatt Macy  * using Gauss-Jordan elimination. In the example below we use m=3 parity
969eda14cbcSMatt Macy  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
970eda14cbcSMatt Macy  *           __                               __
971eda14cbcSMatt Macy  *           |  1   1   1   1   1   1   1   1  |
972eda14cbcSMatt Macy  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
973eda14cbcSMatt Macy  *           |  19 205 116  29  64  16  4   1  |      / /
974eda14cbcSMatt Macy  *           |  1   0   0   0   0   0   0   0  |     / /
975eda14cbcSMatt Macy  *           |  0   1   0   0   0   0   0   0  | <--' /
976eda14cbcSMatt Macy  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
977eda14cbcSMatt Macy  *           |  0   0   0   1   0   0   0   0  |
978eda14cbcSMatt Macy  *           |  0   0   0   0   1   0   0   0  |
979eda14cbcSMatt Macy  *           |  0   0   0   0   0   1   0   0  |
980eda14cbcSMatt Macy  *           |  0   0   0   0   0   0   1   0  |
981eda14cbcSMatt Macy  *           |  0   0   0   0   0   0   0   1  |
982eda14cbcSMatt Macy  *           ~~                               ~~
983eda14cbcSMatt Macy  *           __                               __
984eda14cbcSMatt Macy  *           |  1   1   1   1   1   1   1   1  |
985eda14cbcSMatt Macy  *           | 128  64  32  16  8   4   2   1  |
986eda14cbcSMatt Macy  *           |  19 205 116  29  64  16  4   1  |
987eda14cbcSMatt Macy  *           |  1   0   0   0   0   0   0   0  |
988eda14cbcSMatt Macy  *           |  0   1   0   0   0   0   0   0  |
989eda14cbcSMatt Macy  *  (V|I)' = |  0   0   1   0   0   0   0   0  |
990eda14cbcSMatt Macy  *           |  0   0   0   1   0   0   0   0  |
991eda14cbcSMatt Macy  *           |  0   0   0   0   1   0   0   0  |
992eda14cbcSMatt Macy  *           |  0   0   0   0   0   1   0   0  |
993eda14cbcSMatt Macy  *           |  0   0   0   0   0   0   1   0  |
994eda14cbcSMatt Macy  *           |  0   0   0   0   0   0   0   1  |
995eda14cbcSMatt Macy  *           ~~                               ~~
996eda14cbcSMatt Macy  *
997eda14cbcSMatt Macy  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
998eda14cbcSMatt Macy  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
999eda14cbcSMatt Macy  * matrix is not singular.
1000eda14cbcSMatt Macy  * __                                                                 __
1001eda14cbcSMatt Macy  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1002eda14cbcSMatt Macy  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1003eda14cbcSMatt Macy  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1004eda14cbcSMatt Macy  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1005eda14cbcSMatt Macy  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1006eda14cbcSMatt Macy  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1007eda14cbcSMatt Macy  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1008eda14cbcSMatt Macy  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1009eda14cbcSMatt Macy  * ~~                                                                 ~~
1010eda14cbcSMatt Macy  * __                                                                 __
1011eda14cbcSMatt Macy  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1012eda14cbcSMatt Macy  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1013eda14cbcSMatt Macy  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1014eda14cbcSMatt Macy  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1015eda14cbcSMatt Macy  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1016eda14cbcSMatt Macy  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1017eda14cbcSMatt Macy  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1018eda14cbcSMatt Macy  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1019eda14cbcSMatt Macy  * ~~                                                                 ~~
1020eda14cbcSMatt Macy  * __                                                                 __
1021eda14cbcSMatt Macy  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1022eda14cbcSMatt Macy  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1023eda14cbcSMatt Macy  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1024eda14cbcSMatt Macy  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1025eda14cbcSMatt Macy  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1026eda14cbcSMatt Macy  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1027eda14cbcSMatt Macy  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1028eda14cbcSMatt Macy  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1029eda14cbcSMatt Macy  * ~~                                                                 ~~
1030eda14cbcSMatt Macy  * __                                                                 __
1031eda14cbcSMatt Macy  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1032eda14cbcSMatt Macy  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1033eda14cbcSMatt Macy  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1034eda14cbcSMatt Macy  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1035eda14cbcSMatt Macy  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1036eda14cbcSMatt Macy  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1037eda14cbcSMatt Macy  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1038eda14cbcSMatt Macy  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1039eda14cbcSMatt Macy  * ~~                                                                 ~~
1040eda14cbcSMatt Macy  * __                                                                 __
1041eda14cbcSMatt Macy  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1042eda14cbcSMatt Macy  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1043eda14cbcSMatt Macy  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1044eda14cbcSMatt Macy  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1045eda14cbcSMatt Macy  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1046eda14cbcSMatt Macy  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1047eda14cbcSMatt Macy  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1048eda14cbcSMatt Macy  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1049eda14cbcSMatt Macy  * ~~                                                                 ~~
1050eda14cbcSMatt Macy  * __                                                                 __
1051eda14cbcSMatt Macy  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1052eda14cbcSMatt Macy  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1053eda14cbcSMatt Macy  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1054eda14cbcSMatt Macy  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1055eda14cbcSMatt Macy  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1056eda14cbcSMatt Macy  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1057eda14cbcSMatt Macy  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1058eda14cbcSMatt Macy  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1059eda14cbcSMatt Macy  * ~~                                                                 ~~
1060eda14cbcSMatt Macy  *                   __                               __
1061eda14cbcSMatt Macy  *                   |  0   0   1   0   0   0   0   0  |
1062eda14cbcSMatt Macy  *                   | 167 100  5   41 159 169 217 208 |
1063eda14cbcSMatt Macy  *                   | 166 100  4   40 158 168 216 209 |
1064eda14cbcSMatt Macy  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1065eda14cbcSMatt Macy  *                   |  0   0   0   0   1   0   0   0  |
1066eda14cbcSMatt Macy  *                   |  0   0   0   0   0   1   0   0  |
1067eda14cbcSMatt Macy  *                   |  0   0   0   0   0   0   1   0  |
1068eda14cbcSMatt Macy  *                   |  0   0   0   0   0   0   0   1  |
1069eda14cbcSMatt Macy  *                   ~~                               ~~
1070eda14cbcSMatt Macy  *
1071eda14cbcSMatt Macy  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1072eda14cbcSMatt Macy  * of the missing data.
1073eda14cbcSMatt Macy  *
1074eda14cbcSMatt Macy  * As is apparent from the example above, the only non-trivial rows in the
1075eda14cbcSMatt Macy  * inverse matrix correspond to the data disks that we're trying to
1076eda14cbcSMatt Macy  * reconstruct. Indeed, those are the only rows we need as the others would
1077eda14cbcSMatt Macy  * only be useful for reconstructing data known or assumed to be valid. For
1078eda14cbcSMatt Macy  * that reason, we only build the coefficients in the rows that correspond to
1079eda14cbcSMatt Macy  * targeted columns.
1080eda14cbcSMatt Macy  */
1081eda14cbcSMatt Macy /* END CSTYLED */
1082eda14cbcSMatt Macy 
/*
 * Build the rows of the dispersal (Vandermonde-style) matrix that
 * correspond to the parity columns listed in map[].  map[i] is the log2
 * of the parity generator (0 for P, 1 for Q, 2 for R), so row i holds
 * the coefficients (2^map[i])^(n-1-j) for data column j, expressed via
 * the vdev_raidz_pow2 table with exponents kept in [0, 255).
 */
static void
vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rr->rr_cols - rr->rr_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		/* Start one step past the first column's exponent ... */
		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			/* ... and count down map[i] per column, mod 255. */
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
1112eda14cbcSMatt Macy 
/*
 * Invert, via Gauss-Jordan elimination over GF(2^8), the rows of the
 * dispersal matrix needed to reconstruct the nmissing missing data
 * columns.  rows[] comes in holding the parity coefficient rows (from
 * vdev_raidz_matrix_init()); on return invrows[] holds the
 * corresponding rows of the inverse and rows[] has been reduced to
 * identity-matrix rows (verified by the final assertion loop).
 */
static void
vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rr->rr_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rr->rr_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rr->rr_firstdatacol);
			jj = used[j] - rr->rr_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			/*
			 * In GF(2^8) subtraction is XOR, so eliminating
			 * the pivot column from the other rows is an
			 * XOR of the scaled pivot row.
			 */
			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}
1206eda14cbcSMatt Macy 
/*
 * Reconstruct the nmissing missing data columns of a raidz row using the
 * inverted reconstruction matrix computed by vdev_raidz_matrix_invert().
 *
 *   rr       - the raidz row being reconstructed
 *   n        - number of input columns participating
 *   nmissing - number of missing data columns
 *   missing  - indices of the missing columns, relative to rr_firstdatacol
 *   invrows  - one row of the inverted matrix per missing column; every
 *              coefficient must be non-zero
 *   used     - absolute column index supplying each of the n inputs
 *
 * Multiplication in GF(2^8) is done with log/exp tables: a*b is
 * vdev_raidz_pow2[(log2[a] + log2[b]) mod 255], and addition is XOR.
 */
static void
vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL };	/* missing col bufs */
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 };	/* missing col sizes */
	uint8_t log = 0;
	uint8_t val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];	/* logs of invrows entries */
	uint8_t *p, *pp;
	size_t psize;

	/* One n-byte log row per missing column, carved from one allocation. */
	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	/*
	 * Precompute the discrete log of every matrix coefficient so each
	 * GF(2^8) multiply below is one table lookup plus an addition.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	/*
	 * Accumulate each input column into every missing column:
	 * dst[j] = XOR-sum over i of invrows[j][i] * column(used[i]).
	 */
	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rr->rr_cols);

		ccount = rr->rr_col[c].rc_size;
		/* The first input must cover the full first missing column. */
		ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0);
		if (ccount == 0)
			continue;
		src = abd_to_buf(rr->rr_col[c].rc_abd);
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rr->rr_firstdatacol;
			ASSERT3U(cc, >=, rr->rr_firstdatacol);
			ASSERT3U(cc, <, rr->rr_cols);
			ASSERT3U(cc, !=, c);

			dcount[j] = rr->rr_col[cc].rc_size;
			if (dcount[j] != 0)
				dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd);
		}

		for (x = 0; x < ccount; x++, src++) {
			/* log is only consulted when *src != 0 below. */
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				/* Shorter destination columns stop early. */
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					/* val = *src * invrows[cc][i] */
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				/* First input initializes, the rest XOR in. */
				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}
1284eda14cbcSMatt Macy 
/*
 * General-case reconstruction of ntgts failed columns (tgts[], sorted
 * ascending) via matrix inversion over GF(2^8).  This path handles any
 * combination of failures the optimized P/Q routines can't.  Targets
 * below rr_firstdatacol (parity) are not reconstructed here; only the
 * missing data columns are rebuilt.
 */
static void
vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;
	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	abd_t **bufs = NULL;

	/*
	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
	 * temporary linear ABDs if any non-linear ABDs are found.
	 */
	for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) {
		if (!abd_is_linear(rr->rr_col[i].rc_abd)) {
			bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *),
			    KM_PUSHPAGE);

			/* Stash originals and substitute linear copies. */
			for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
				raidz_col_t *col = &rr->rr_col[c];

				bufs[c] = col->rc_abd;
				if (bufs[c] != NULL) {
					col->rc_abd = abd_alloc_linear(
					    col->rc_size, B_TRUE);
					abd_copy(col->rc_abd, bufs[c],
					    col->rc_size);
				}
			}

			break;
		}
	}

	/* Number of data columns in the row. */
	n = rr->rr_cols - rr->rr_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rr->rr_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rr->rr_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rr->rr_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		parity_map[i] = c;
		i++;
	}

	/* One allocation backs rows[], invrows[], and used[]. */
	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	/*
	 * used[] lists the chosen parity columns first, then every
	 * surviving data column, for a total of n input columns.
	 */
	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rr->rr_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	/*
	 * copy back from temporary linear abds and free them
	 */
	if (bufs) {
		for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
			raidz_col_t *col = &rr->rr_col[c];

			if (bufs[c] != NULL) {
				abd_copy(bufs[c], col->rc_abd, col->rc_size);
				abd_free(col->rc_abd);
			}
			col->rc_abd = bufs[c];
		}
		kmem_free(bufs, rr->rr_cols * sizeof (abd_t *));
	}
}
1421eda14cbcSMatt Macy 
/*
 * Reconstruct the columns of a row listed in t[] (nt entries, sorted
 * ascending), plus any additional columns that already have an error
 * recorded.  Tries the vectorized math implementation first, falls back
 * to the hand-optimized P/Q/PQ routines, and finally to the general
 * matrix-inversion path.
 */
static void
vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
    const int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c, ret;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/* Start by assuming everything is bad; prove columns good below. */
	nbadparity = rr->rr_firstdatacol;
	nbaddata = rr->rr_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rr->rr_cols; c++) {
		if (c < rr->rr_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			/* Explicitly requested target. */
			tgts[ntgts++] = c;
			i++;
		} else if (rr->rr_col[c].rc_error != 0) {
			/* Column already known to be in error. */
			tgts[ntgts++] = c;
		} else if (c >= rr->rr_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	/*
	 * tgts[] is sorted, so bad parity columns come first; dt points
	 * at the bad data columns only.
	 */
	dt = &tgts[nbadparity];

	/* Reconstruct using the new math implementation */
	ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata);
	if (ret != RAIDZ_ORIGINAL_IMPL)
		return;

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	switch (nbaddata) {
	case 1:
		if (parity_valid[VDEV_RAIDZ_P]) {
			vdev_raidz_reconstruct_p(rr, dt, 1);
			return;
		}

		ASSERT(rr->rr_firstdatacol > 1);

		if (parity_valid[VDEV_RAIDZ_Q]) {
			vdev_raidz_reconstruct_q(rr, dt, 1);
			return;
		}

		ASSERT(rr->rr_firstdatacol > 2);
		break;

	case 2:
		ASSERT(rr->rr_firstdatacol > 1);

		if (parity_valid[VDEV_RAIDZ_P] &&
		    parity_valid[VDEV_RAIDZ_Q]) {
			vdev_raidz_reconstruct_pq(rr, dt, 2);
			return;
		}

		ASSERT(rr->rr_firstdatacol > 2);

		break;
	}

	/* No shortcut applies; do full matrix reconstruction. */
	vdev_raidz_reconstruct_general(rr, tgts, ntgts);
}
1499eda14cbcSMatt Macy 
1500eda14cbcSMatt Macy static int
1501eda14cbcSMatt Macy vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1502eda14cbcSMatt Macy     uint64_t *logical_ashift, uint64_t *physical_ashift)
1503eda14cbcSMatt Macy {
15047877fdebSMatt Macy 	vdev_raidz_t *vdrz = vd->vdev_tsd;
15057877fdebSMatt Macy 	uint64_t nparity = vdrz->vd_nparity;
1506eda14cbcSMatt Macy 	int c;
1507eda14cbcSMatt Macy 	int lasterror = 0;
1508eda14cbcSMatt Macy 	int numerrors = 0;
1509eda14cbcSMatt Macy 
1510eda14cbcSMatt Macy 	ASSERT(nparity > 0);
1511eda14cbcSMatt Macy 
1512eda14cbcSMatt Macy 	if (nparity > VDEV_RAIDZ_MAXPARITY ||
1513eda14cbcSMatt Macy 	    vd->vdev_children < nparity + 1) {
1514eda14cbcSMatt Macy 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1515eda14cbcSMatt Macy 		return (SET_ERROR(EINVAL));
1516eda14cbcSMatt Macy 	}
1517eda14cbcSMatt Macy 
1518eda14cbcSMatt Macy 	vdev_open_children(vd);
1519eda14cbcSMatt Macy 
1520eda14cbcSMatt Macy 	for (c = 0; c < vd->vdev_children; c++) {
15217877fdebSMatt Macy 		vdev_t *cvd = vd->vdev_child[c];
1522eda14cbcSMatt Macy 
1523eda14cbcSMatt Macy 		if (cvd->vdev_open_error != 0) {
1524eda14cbcSMatt Macy 			lasterror = cvd->vdev_open_error;
1525eda14cbcSMatt Macy 			numerrors++;
1526eda14cbcSMatt Macy 			continue;
1527eda14cbcSMatt Macy 		}
1528eda14cbcSMatt Macy 
1529eda14cbcSMatt Macy 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1530eda14cbcSMatt Macy 		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1531eda14cbcSMatt Macy 		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
1532eda14cbcSMatt Macy 		*physical_ashift = MAX(*physical_ashift,
1533eda14cbcSMatt Macy 		    cvd->vdev_physical_ashift);
1534eda14cbcSMatt Macy 	}
1535eda14cbcSMatt Macy 
1536eda14cbcSMatt Macy 	*asize *= vd->vdev_children;
1537eda14cbcSMatt Macy 	*max_asize *= vd->vdev_children;
1538eda14cbcSMatt Macy 
1539eda14cbcSMatt Macy 	if (numerrors > nparity) {
1540eda14cbcSMatt Macy 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1541eda14cbcSMatt Macy 		return (lasterror);
1542eda14cbcSMatt Macy 	}
1543eda14cbcSMatt Macy 
1544eda14cbcSMatt Macy 	return (0);
1545eda14cbcSMatt Macy }
1546eda14cbcSMatt Macy 
1547eda14cbcSMatt Macy static void
1548eda14cbcSMatt Macy vdev_raidz_close(vdev_t *vd)
1549eda14cbcSMatt Macy {
15507877fdebSMatt Macy 	for (int c = 0; c < vd->vdev_children; c++) {
15517877fdebSMatt Macy 		if (vd->vdev_child[c] != NULL)
1552eda14cbcSMatt Macy 			vdev_close(vd->vdev_child[c]);
1553eda14cbcSMatt Macy 	}
15547877fdebSMatt Macy }
1555eda14cbcSMatt Macy 
1556eda14cbcSMatt Macy static uint64_t
1557eda14cbcSMatt Macy vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1558eda14cbcSMatt Macy {
15597877fdebSMatt Macy 	vdev_raidz_t *vdrz = vd->vdev_tsd;
1560eda14cbcSMatt Macy 	uint64_t asize;
1561eda14cbcSMatt Macy 	uint64_t ashift = vd->vdev_top->vdev_ashift;
15627877fdebSMatt Macy 	uint64_t cols = vdrz->vd_logical_width;
15637877fdebSMatt Macy 	uint64_t nparity = vdrz->vd_nparity;
1564eda14cbcSMatt Macy 
1565eda14cbcSMatt Macy 	asize = ((psize - 1) >> ashift) + 1;
1566eda14cbcSMatt Macy 	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1567eda14cbcSMatt Macy 	asize = roundup(asize, nparity + 1) << ashift;
1568eda14cbcSMatt Macy 
1569eda14cbcSMatt Macy 	return (asize);
1570eda14cbcSMatt Macy }
1571eda14cbcSMatt Macy 
15727877fdebSMatt Macy /*
15737877fdebSMatt Macy  * The allocatable space for a raidz vdev is N * sizeof(smallest child)
15747877fdebSMatt Macy  * so each child must provide at least 1/Nth of its asize.
15757877fdebSMatt Macy  */
15767877fdebSMatt Macy static uint64_t
15777877fdebSMatt Macy vdev_raidz_min_asize(vdev_t *vd)
15787877fdebSMatt Macy {
15797877fdebSMatt Macy 	return ((vd->vdev_min_asize + vd->vdev_children - 1) /
15807877fdebSMatt Macy 	    vd->vdev_children);
15817877fdebSMatt Macy }
15827877fdebSMatt Macy 
15837877fdebSMatt Macy void
1584eda14cbcSMatt Macy vdev_raidz_child_done(zio_t *zio)
1585eda14cbcSMatt Macy {
1586eda14cbcSMatt Macy 	raidz_col_t *rc = zio->io_private;
1587eda14cbcSMatt Macy 
158881b22a98SMartin Matuska 	ASSERT3P(rc->rc_abd, !=, NULL);
1589eda14cbcSMatt Macy 	rc->rc_error = zio->io_error;
1590eda14cbcSMatt Macy 	rc->rc_tried = 1;
1591eda14cbcSMatt Macy 	rc->rc_skipped = 0;
1592eda14cbcSMatt Macy }
1593eda14cbcSMatt Macy 
/*
 * Debug-only sanity check: verify that the physical offset recorded for
 * column 'col' matches vdev_xlate()'s logical-to-physical translation of
 * the row's logical range.  Compiled to nothing unless ZFS_DEBUG.
 */
static void
vdev_raidz_io_verify(vdev_t *vd, raidz_row_t *rr, int col)
{
#ifdef ZFS_DEBUG
	vdev_t *tvd = vd->vdev_top;

	range_seg64_t logical_rs, physical_rs, remain_rs;
	logical_rs.rs_start = rr->rr_offset;
	logical_rs.rs_end = logical_rs.rs_start +
	    vdev_raidz_asize(vd, rr->rr_size);

	raidz_col_t *rc = &rr->rr_col[col];
	vdev_t *cvd = vd->vdev_child[rc->rc_devidx];

	vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
	/* The whole logical range must translate in one piece. */
	ASSERT(vdev_xlate_is_empty(&remain_rs));
	ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
	ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
	/*
	 * It would be nice to assert that rs_end is equal
	 * to rc_offset + rc_size but there might be an
	 * optional I/O at the end that is not accounted in
	 * rc_size.
	 */
	if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
		/* An optional skip sector is exactly one ashift sector. */
		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
		    rc->rc_size + (1 << tvd->vdev_ashift));
	} else {
		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
	}
#endif
}
1626eda14cbcSMatt Macy 
/*
 * Issue the child writes for one raidz row: generate the row's parity,
 * then write every allocated column.  Columns with no data get a
 * one-sector optional write so adjacent writes can be aggregated.
 */
static void
vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr, uint64_t ashift)
{
	vdev_t *vd = zio->io_vd;
	raidz_map_t *rm = zio->io_vsd;

	vdev_raidz_generate_parity_row(rm, rr);

	for (int c = 0; c < rr->rr_scols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];

		/* Verify physical to logical translation */
		vdev_raidz_io_verify(vd, rr, c);

		if (rc->rc_size > 0) {
			ASSERT3P(rc->rc_abd, !=, NULL);
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd,
			    abd_get_size(rc->rc_abd), zio->io_type,
			    zio->io_priority, 0, vdev_raidz_child_done, rc));
		} else {
			/*
			 * Generate optional write for skip sector to improve
			 * aggregation contiguity.
			 */
			ASSERT3P(rc->rc_abd, ==, NULL);
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, NULL, 1ULL << ashift,
			    zio->io_type, zio->io_priority,
			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL,
			    NULL));
		}
	}
}
16627877fdebSMatt Macy 
/*
 * Issue the child reads for one raidz row, recording per-column errors
 * for devices that are unreadable or missing the relevant txg so that
 * reconstruction can be attempted later.
 */
static void
vdev_raidz_io_start_read(zio_t *zio, raidz_row_t *rr)
{
	vdev_t *vd = zio->io_vd;

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (int c = rr->rr_cols - 1; c >= 0; c--) {
		raidz_col_t *rc = &rr->rr_col[c];
		if (rc->rc_size == 0)
			continue;
		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
		if (!vdev_readable(cvd)) {
			/* Device can't be read at all. */
			if (c >= rr->rr_firstdatacol)
				rr->rr_missingdata++;
			else
				rr->rr_missingparity++;
			rc->rc_error = SET_ERROR(ENXIO);
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			/* Device is missing the data for this txg. */
			if (c >= rr->rr_firstdatacol)
				rr->rr_missingdata++;
			else
				rr->rr_missingparity++;
			rc->rc_error = SET_ERROR(ESTALE);
			rc->rc_skipped = 1;
			continue;
		}
		/*
		 * Read data columns always; read parity only if data is
		 * already known missing or this is a scrub/resilver.
		 */
		if (c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}
}
17057877fdebSMatt Macy 
/*
 * Start an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Generate the parity data
 *   2. Create child zio write operations to each column's vdev, for both
 *      data and parity.
 *   3. If the column skips any sectors for padding, create optional dummy
 *      write zio children for those areas to improve aggregation continuity.
 * - For read operations:
 *   1. Create child zio read operations to each data column's vdev to read
 *      the range of data required for zio.
 *   2. If this is a scrub or resilver operation, or if any of the data
 *      vdevs have had errors, then create zio read operations to the parity
 *      columns' VDevs as well.
 */
static void
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_raidz_t *vdrz = vd->vdev_tsd;

	/* Lay the zio out across the columns of this raidz geometry. */
	raidz_map_t *rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift,
	    vdrz->vd_logical_width, vdrz->vd_nparity);
	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;

	/*
	 * Until raidz expansion is implemented all maps for a raidz vdev
	 * contain a single row.
	 */
	ASSERT3U(rm->rm_nrows, ==, 1);
	raidz_row_t *rr = rm->rm_row[0];

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_io_start_write(zio, rr, tvd->vdev_ashift);
	} else {
		ASSERT(zio->io_type == ZIO_TYPE_READ);
		vdev_raidz_io_start_read(zio, rr);
	}

	/* Child I/Os are issued; continue the parent zio's pipeline. */
	zio_execute(zio);
}
1751eda14cbcSMatt Macy 
/*
 * Report a checksum error for a child of a RAID-Z device.
 *
 * bad_data holds the data as read from the child; posting the ereport lets
 * diagnosis compare it against the reconstructed data in rc->rc_abd.
 */
void
vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
{
	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];

	/*
	 * Don't post ereports or bump counters for speculative reads or
	 * sequential-rebuild I/O.
	 */
	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) &&
	    zio->io_priority != ZIO_PRIORITY_REBUILD) {
		zio_bad_cksum_t zbc;
		raidz_map_t *rm = zio->io_vsd;

		zbc.zbc_has_cksum = 0;
		zbc.zbc_injected = rm->rm_ecksuminjected;

		(void) zfs_ereport_post_checksum(zio->io_spa, vd,
		    &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
		    rc->rc_abd, bad_data, &zbc);
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_checksum_errors++;
		mutex_exit(&vd->vdev_stat_lock);
	}
}
1776eda14cbcSMatt Macy 
1777eda14cbcSMatt Macy /*
1778eda14cbcSMatt Macy  * We keep track of whether or not there were any injected errors, so that
1779eda14cbcSMatt Macy  * any ereports we generate can note it.
1780eda14cbcSMatt Macy  */
1781eda14cbcSMatt Macy static int
1782eda14cbcSMatt Macy raidz_checksum_verify(zio_t *zio)
1783eda14cbcSMatt Macy {
1784eda14cbcSMatt Macy 	zio_bad_cksum_t zbc;
1785eda14cbcSMatt Macy 	raidz_map_t *rm = zio->io_vsd;
1786eda14cbcSMatt Macy 
1787eda14cbcSMatt Macy 	bzero(&zbc, sizeof (zio_bad_cksum_t));
1788eda14cbcSMatt Macy 
1789eda14cbcSMatt Macy 	int ret = zio_checksum_error(zio, &zbc);
1790eda14cbcSMatt Macy 	if (ret != 0 && zbc.zbc_injected != 0)
1791eda14cbcSMatt Macy 		rm->rm_ecksuminjected = 1;
1792eda14cbcSMatt Macy 
1793eda14cbcSMatt Macy 	return (ret);
1794eda14cbcSMatt Macy }
1795eda14cbcSMatt Macy 
/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(zio_t *zio, raidz_row_t *rr)
{
	abd_t *orig[VDEV_RAIDZ_MAXPARITY];	/* parity as read from disk */
	int c, ret = 0;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;

	blkptr_t *bp = zio->io_bp;
	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));

	/* NOPARITY blocks carry no parity worth verifying. */
	if (checksum == ZIO_CHECKSUM_NOPARITY)
		return (ret);

	/* Save copies of the successfully-read parity columns. */
	for (c = 0; c < rr->rr_firstdatacol; c++) {
		rc = &rr->rr_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;

		orig[c] = abd_alloc_sametype(rc->rc_abd, rc->rc_size);
		abd_copy(orig[c], rc->rc_abd, rc->rc_size);
	}

	/*
	 * Verify any empty sectors are zero filled to ensure the parity
	 * is calculated correctly even if these non-data sectors are damaged.
	 */
	if (rr->rr_nempty && rr->rr_abd_empty != NULL)
		ret += vdev_draid_map_verify_empty(zio, rr);

	/*
	 * Regenerates parity even for !tried||rc_error!=0 columns.  This
	 * isn't harmful but it does have the side effect of fixing stuff
	 * we didn't realize was necessary (i.e. even if we return 0).
	 */
	vdev_raidz_generate_parity_row(rm, rr);

	/* Compare regenerated parity against what was read. */
	for (c = 0; c < rr->rr_firstdatacol; c++) {
		rc = &rr->rr_col[c];

		if (!rc->rc_tried || rc->rc_error != 0)
			continue;

		if (abd_cmp(orig[c], rc->rc_abd) != 0) {
			vdev_raidz_checksum_error(zio, rc, orig[c]);
			rc->rc_error = SET_ERROR(ECKSUM);
			ret++;
		}
		abd_free(orig[c]);
	}

	return (ret);
}
1856eda14cbcSMatt Macy 
1857eda14cbcSMatt Macy static int
18587877fdebSMatt Macy vdev_raidz_worst_error(raidz_row_t *rr)
1859eda14cbcSMatt Macy {
1860eda14cbcSMatt Macy 	int error = 0;
1861eda14cbcSMatt Macy 
18627877fdebSMatt Macy 	for (int c = 0; c < rr->rr_cols; c++)
18637877fdebSMatt Macy 		error = zio_worst_error(error, rr->rr_col[c].rc_error);
1864eda14cbcSMatt Macy 
1865eda14cbcSMatt Macy 	return (error);
1866eda14cbcSMatt Macy }
1867eda14cbcSMatt Macy 
/*
 * Post-verification handler for one row of a read: if extra parity
 * columns were read beyond what reconstruction required (or we are
 * resilvering), verify them, then use the good data in hand to issue
 * repair writes to any damaged children.
 */
static void
vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
{
	int unexpected_errors = 0;	/* errors on columns not skipped */
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;

	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);

	/* Tally per-column error state for this row. */
	for (int c = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];

		if (rc->rc_error) {
			if (c < rr->rr_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;
		} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * If we read more parity disks than were used for
	 * reconstruction, confirm that the other parity disks produced
	 * correct data.
	 *
	 * Note that we also regenerate parity when resilvering so we
	 * can write it out to failed devices later.
	 */
	if (parity_errors + parity_untried <
	    rr->rr_firstdatacol - data_errors ||
	    (zio->io_flags & ZIO_FLAG_RESILVER)) {
		int n = raidz_parity_verify(zio, rr);
		unexpected_errors += n;
	}

	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (int c = 0; c < rr->rr_cols; c++) {
			raidz_col_t *rc = &rr->rr_col[c];
			vdev_t *vd = zio->io_vd;
			vdev_t *cvd = vd->vdev_child[rc->rc_devidx];

			/*
			 * Skip columns where repair is disallowed, and
			 * columns with nothing to repair unless repair
			 * is being forced.
			 */
			if (!rc->rc_allow_repair) {
				continue;
			} else if (!rc->rc_force_repair &&
			    (rc->rc_error == 0 || rc->rc_size == 0)) {
				continue;
			}

			/*
			 * Issue the repair write; rebuild I/Os keep the
			 * rebuild priority, everything else repairs
			 * asynchronously.
			 */
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    ZIO_TYPE_WRITE,
			    zio->io_priority == ZIO_PRIORITY_REBUILD ?
			    ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}
19367877fdebSMatt Macy 
19377877fdebSMatt Macy static void
19387877fdebSMatt Macy raidz_restore_orig_data(raidz_map_t *rm)
19397877fdebSMatt Macy {
19407877fdebSMatt Macy 	for (int i = 0; i < rm->rm_nrows; i++) {
19417877fdebSMatt Macy 		raidz_row_t *rr = rm->rm_row[i];
19427877fdebSMatt Macy 		for (int c = 0; c < rr->rr_cols; c++) {
19437877fdebSMatt Macy 			raidz_col_t *rc = &rr->rr_col[c];
19447877fdebSMatt Macy 			if (rc->rc_need_orig_restore) {
1945f9693befSMartin Matuska 				abd_copy(rc->rc_abd,
19467877fdebSMatt Macy 				    rc->rc_orig_data, rc->rc_size);
19477877fdebSMatt Macy 				rc->rc_need_orig_restore = B_FALSE;
19487877fdebSMatt Macy 			}
19497877fdebSMatt Macy 		}
19507877fdebSMatt Macy 	}
19517877fdebSMatt Macy }
19527877fdebSMatt Macy 
/*
 * Attempt reconstruction of this block assuming the child vdevs whose
 * ids appear in ltgts[0..ntgts-1] returned bad data.
 *
 * returns EINVAL if reconstruction of the block will not be possible
 * returns ECKSUM if this specific reconstruction failed
 * returns 0 on successful reconstruction
 */
static int
raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
{
	raidz_map_t *rm = zio->io_vsd;

	/* Reconstruct each row */
	for (int r = 0; r < rm->rm_nrows; r++) {
		raidz_row_t *rr = rm->rm_row[r];
		int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */
		int t = 0;
		int dead = 0;
		int dead_data = 0;

		for (int c = 0; c < rr->rr_cols; c++) {
			raidz_col_t *rc = &rr->rr_col[c];
			ASSERT0(rc->rc_need_orig_restore);
			/* Columns with known errors count as dead. */
			if (rc->rc_error != 0) {
				dead++;
				if (c >= nparity)
					dead_data++;
				continue;
			}
			if (rc->rc_size == 0)
				continue;
			for (int lt = 0; lt < ntgts; lt++) {
				if (rc->rc_devidx == ltgts[lt]) {
					/*
					 * Save the original contents before
					 * reconstructing over them, so a
					 * failed attempt can be rolled back
					 * by raidz_restore_orig_data().
					 */
					if (rc->rc_orig_data == NULL) {
						rc->rc_orig_data =
						    abd_alloc_linear(
						    rc->rc_size, B_TRUE);
						abd_copy(rc->rc_orig_data,
						    rc->rc_abd, rc->rc_size);
					}
					rc->rc_need_orig_restore = B_TRUE;

					dead++;
					if (c >= nparity)
						dead_data++;
					my_tgts[t++] = c;
					break;
				}
			}
		}
		if (dead > nparity) {
			/* reconstruction not possible */
			raidz_restore_orig_data(rm);
			return (EINVAL);
		}
		if (dead_data > 0)
			vdev_raidz_reconstruct_row(rm, rr, my_tgts, t);
	}

	/* Check for success */
	if (raidz_checksum_verify(zio) == 0) {

		/* Reconstruction succeeded - report errors */
		for (int i = 0; i < rm->rm_nrows; i++) {
			raidz_row_t *rr = rm->rm_row[i];

			for (int c = 0; c < rr->rr_cols; c++) {
				raidz_col_t *rc = &rr->rr_col[c];
				if (rc->rc_need_orig_restore) {
					/*
					 * Note: if this is a parity column,
					 * we don't really know if it's wrong.
					 * We need to let
					 * vdev_raidz_io_done_verified() check
					 * it, and if we set rc_error, it will
					 * think that it is a "known" error
					 * that doesn't need to be checked
					 * or corrected.
					 */
					if (rc->rc_error == 0 &&
					    c >= rr->rr_firstdatacol) {
						vdev_raidz_checksum_error(zio,
						    rc, rc->rc_orig_data);
						rc->rc_error =
						    SET_ERROR(ECKSUM);
					}
					rc->rc_need_orig_restore = B_FALSE;
				}
			}

			vdev_raidz_io_done_verified(zio, rr);
		}

		zio_checksum_verified(zio);

		return (0);
	}

	/* Reconstruction failed - restore original data */
	raidz_restore_orig_data(rm);
	return (ECKSUM);
}
20537877fdebSMatt Macy 
/*
 * Iterate over all combinations of N bad vdevs and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 *
 * The order that we find the various possible combinations of failed
 * disks is dictated by these rules:
 * - Examine each "slot" (the "i" in tgts[i])
 *   - Try to increment this slot (tgts[i] = tgts[i] + 1)
 *   - if we can't increment because it runs into the next slot,
 *     reset our slot to the minimum, and examine the next slot
 *
 *  For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose
 *  3 columns to reconstruct), we will generate the following sequence:
 *
 *  STATE        ACTION
 *  0 1 2        special case: skip since these are all parity
 *  0 1   3      first slot: reset to 0; middle slot: increment to 2
 *  0   2 3      first slot: increment to 1
 *    1 2 3      first: reset to 0; middle: reset to 1; last: increment to 4
 *  0 1     4    first: reset to 0; middle: increment to 2
 *  0   2   4    first: increment to 1
 *    1 2   4    first: reset to 0; middle: increment to 3
 *  0     3 4    first: increment to 1
 *    1   3 4    first: increment to 2
 *      2 3 4    first: reset to 0; middle: reset to 1; last: increment to 5
 *  0 1       5  first: reset to 0; middle: increment to 2
 *  0   2     5  first: increment to 1
 *    1 2     5  first: reset to 0; middle: increment to 3
 *  0     3   5  first: increment to 1
 *    1   3   5  first: increment to 2
 *      2 3   5  first: reset to 0; middle: increment to 4
 *  0       4 5  first: increment to 1
 *    1     4 5  first: increment to 2
 *      2   4 5  first: increment to 3
 *        3 4 5  done
 *
 * This strategy works for dRAID but is less efficient when there are a large
 * number of child vdevs and therefore permutations to check. Furthermore,
 * since the raidz_map_t rows likely do not overlap reconstruction would be
 * possible as long as there are no more than nparity data errors per row.
 * These additional permutations are not currently checked but could be as
 * a future improvement.
 */
static int
vdev_raidz_combrec(zio_t *zio)
{
	int nparity = vdev_get_nparity(zio->io_vd);
	raidz_map_t *rm = zio->io_vsd;

	/* Check if there's enough data to attempt reconstruction. */
	for (int i = 0; i < rm->rm_nrows; i++) {
		raidz_row_t *rr = rm->rm_row[i];
		int total_errors = 0;

		for (int c = 0; c < rr->rr_cols; c++) {
			if (rr->rr_col[c].rc_error)
				total_errors++;
		}

		if (total_errors > nparity)
			return (vdev_raidz_worst_error(rr));
	}

	for (int num_failures = 1; num_failures <= nparity; num_failures++) {
		int tstore[VDEV_RAIDZ_MAXPARITY + 2];
		int *ltgts = &tstore[1]; /* value is logical child ID */

		/* Determine number of logical children, n */
		int n = zio->io_vd->vdev_children;

		ASSERT3U(num_failures, <=, nparity);
		ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);

		/*
		 * Handle corner cases in combrec logic.  ltgts points one
		 * element into tstore so that ltgts[-1] and
		 * ltgts[num_failures] are valid sentinels for the slot
		 * reset/advance logic below.
		 */
		ltgts[-1] = -1;
		for (int i = 0; i < num_failures; i++) {
			ltgts[i] = i;
		}
		ltgts[num_failures] = n;

		for (;;) {
			int err = raidz_reconstruct(zio, ltgts, num_failures,
			    nparity);
			if (err == EINVAL) {
				/*
				 * Reconstruction not possible with this #
				 * failures; try more failures.
				 */
				break;
			} else if (err == 0)
				return (0);

			/* Compute next targets to try */
			for (int t = 0; ; t++) {
				ASSERT3U(t, <, num_failures);
				ltgts[t]++;
				if (ltgts[t] == n) {
					/* try more failures */
					ASSERT3U(t, ==, num_failures - 1);
					break;
				}

				ASSERT3U(ltgts[t], <, n);
				ASSERT3U(ltgts[t], <=, ltgts[t + 1]);

				/*
				 * If that spot is available, we're done here.
				 * Try the next combination.
				 */
				if (ltgts[t] != ltgts[t + 1])
					break;

				/*
				 * Otherwise, reset this tgt to the minimum,
				 * and move on to the next tgt.
				 */
				ltgts[t] = ltgts[t - 1] + 1;
				ASSERT3U(ltgts[t], ==, t);
			}

			/* Increase the number of failures and keep trying. */
			if (ltgts[num_failures - 1] == n)
				break;
		}
	}

	return (ECKSUM);
}
21867877fdebSMatt Macy 
21877877fdebSMatt Macy void
21887877fdebSMatt Macy vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
21897877fdebSMatt Macy {
21907877fdebSMatt Macy 	for (uint64_t row = 0; row < rm->rm_nrows; row++) {
21917877fdebSMatt Macy 		raidz_row_t *rr = rm->rm_row[row];
21927877fdebSMatt Macy 		vdev_raidz_reconstruct_row(rm, rr, t, nt);
21937877fdebSMatt Macy 	}
21947877fdebSMatt Macy }
21957877fdebSMatt Macy 
21967877fdebSMatt Macy /*
21977877fdebSMatt Macy  * Complete a write IO operation on a RAIDZ VDev
21987877fdebSMatt Macy  *
21997877fdebSMatt Macy  * Outline:
22007877fdebSMatt Macy  *   1. Check for errors on the child IOs.
22017877fdebSMatt Macy  *   2. Return, setting an error code if too few child VDevs were written
22027877fdebSMatt Macy  *      to reconstruct the data later.  Note that partial writes are
22037877fdebSMatt Macy  *      considered successful if they can be reconstructed at all.
22047877fdebSMatt Macy  */
22057877fdebSMatt Macy static void
22067877fdebSMatt Macy vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
22077877fdebSMatt Macy {
22087877fdebSMatt Macy 	int total_errors = 0;
22097877fdebSMatt Macy 
22107877fdebSMatt Macy 	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
22117877fdebSMatt Macy 	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
22127877fdebSMatt Macy 	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
22137877fdebSMatt Macy 
22147877fdebSMatt Macy 	for (int c = 0; c < rr->rr_cols; c++) {
22157877fdebSMatt Macy 		raidz_col_t *rc = &rr->rr_col[c];
22167877fdebSMatt Macy 
22177877fdebSMatt Macy 		if (rc->rc_error) {
22187877fdebSMatt Macy 			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
22197877fdebSMatt Macy 
22207877fdebSMatt Macy 			total_errors++;
22217877fdebSMatt Macy 		}
22227877fdebSMatt Macy 	}
22237877fdebSMatt Macy 
22247877fdebSMatt Macy 	/*
22257877fdebSMatt Macy 	 * Treat partial writes as a success. If we couldn't write enough
22267877fdebSMatt Macy 	 * columns to reconstruct the data, the I/O failed.  Otherwise,
22277877fdebSMatt Macy 	 * good enough.
2228eda14cbcSMatt Macy 	 *
2229eda14cbcSMatt Macy 	 * Now that we support write reallocation, it would be better
2230eda14cbcSMatt Macy 	 * to treat partial failure as real failure unless there are
2231eda14cbcSMatt Macy 	 * no non-degraded top-level vdevs left, and not update DTLs
2232eda14cbcSMatt Macy 	 * if we intend to reallocate.
2233eda14cbcSMatt Macy 	 */
22347877fdebSMatt Macy 	if (total_errors > rr->rr_firstdatacol) {
22357877fdebSMatt Macy 		zio->io_error = zio_worst_error(zio->io_error,
22367877fdebSMatt Macy 		    vdev_raidz_worst_error(rr));
22377877fdebSMatt Macy 	}
2238eda14cbcSMatt Macy }
2239eda14cbcSMatt Macy 
/*
 * After a read, attempt in-place reconstruction of any data columns in
 * this row that reported errors, provided the number of errors seen is
 * correctable with the parity columns that were successfully read.
 */
static void
vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
    raidz_row_t *rr)
{
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;
	int total_errors = 0;

	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);

	/* Tally parity/data errors and untried parity columns. */
	for (int c = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rr->rr_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			total_errors++;
		} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * If there were data errors and the number of errors we saw was
	 * correctable -- less than or equal to the number of parity disks read
	 * -- reconstruct based on the missing data.
	 */
	if (data_errors != 0 &&
	    total_errors <= rr->rr_firstdatacol - parity_untried) {
		/*
		 * We either attempt to read all the parity columns or
		 * none of them. If we didn't try to read parity, we
		 * wouldn't be here in the correctable case. There must
		 * also have been fewer parity errors than parity
		 * columns or, again, we wouldn't be in this code path.
		 */
		ASSERT(parity_untried == 0);
		ASSERT(parity_errors < rr->rr_firstdatacol);

		/*
		 * Identify the data columns that reported an error.
		 */
		int n = 0;
		int tgts[VDEV_RAIDZ_MAXPARITY];
		for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
			raidz_col_t *rc = &rr->rr_col[c];
			if (rc->rc_error != 0) {
				ASSERT(n < VDEV_RAIDZ_MAXPARITY);
				tgts[n++] = c;
			}
		}

		ASSERT(rr->rr_firstdatacol >= n);

		vdev_raidz_reconstruct_row(rm, rr, tgts, n);
	}
}
2305eda14cbcSMatt Macy 
/*
 * Issue reads for every untried, non-empty column in the row.
 * Return the number of reads issued.
 */
static int
vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
{
	vdev_t *vd = zio->io_vd;
	int nread = 0;

	rr->rr_missingdata = 0;
	rr->rr_missingparity = 0;

	/*
	 * If this row contains empty sectors which are not required
	 * for a normal read then allocate an ABD for them now so they
	 * may be read, verified, and any needed repairs performed.
	 */
	if (rr->rr_nempty && rr->rr_abd_empty == NULL)
		vdev_draid_map_alloc_empty(zio, rr);

	for (int c = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		/* Skip columns already read and zero-length columns. */
		if (rc->rc_tried || rc->rc_size == 0)
			continue;

		zio_nowait(zio_vdev_child_io(zio, NULL,
		    vd->vdev_child[rc->rc_devidx],
		    rc->rc_offset, rc->rc_abd, rc->rc_size,
		    zio->io_type, zio->io_priority, 0,
		    vdev_raidz_child_done, rc));
		nread++;
	}
	return (nread);
}
2340eda14cbcSMatt Macy 
2341eda14cbcSMatt Macy /*
23427877fdebSMatt Macy  * We're here because either there were too many errors to even attempt
23437877fdebSMatt Macy  * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec()
23447877fdebSMatt Macy  * failed. In either case, there is enough bad data to prevent reconstruction.
23457877fdebSMatt Macy  * Start checksum ereports for all children which haven't failed.
2346eda14cbcSMatt Macy  */
23477877fdebSMatt Macy static void
23487877fdebSMatt Macy vdev_raidz_io_done_unrecoverable(zio_t *zio)
23497877fdebSMatt Macy {
23507877fdebSMatt Macy 	raidz_map_t *rm = zio->io_vsd;
2351eda14cbcSMatt Macy 
23527877fdebSMatt Macy 	for (int i = 0; i < rm->rm_nrows; i++) {
23537877fdebSMatt Macy 		raidz_row_t *rr = rm->rm_row[i];
2354eda14cbcSMatt Macy 
23557877fdebSMatt Macy 		for (int c = 0; c < rr->rr_cols; c++) {
23567877fdebSMatt Macy 			raidz_col_t *rc = &rr->rr_col[c];
23577877fdebSMatt Macy 			vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
23587877fdebSMatt Macy 
23592c48331dSMatt Macy 			if (rc->rc_error != 0)
23602c48331dSMatt Macy 				continue;
23612c48331dSMatt Macy 
2362eda14cbcSMatt Macy 			zio_bad_cksum_t zbc;
2363eda14cbcSMatt Macy 			zbc.zbc_has_cksum = 0;
23642c48331dSMatt Macy 			zbc.zbc_injected = rm->rm_ecksuminjected;
2365eda14cbcSMatt Macy 
2366ba27dd8bSMartin Matuska 			(void) zfs_ereport_start_checksum(zio->io_spa,
23677877fdebSMatt Macy 			    cvd, &zio->io_bookmark, zio, rc->rc_offset,
2368f9693befSMartin Matuska 			    rc->rc_size, &zbc);
2369eda14cbcSMatt Macy 			mutex_enter(&cvd->vdev_stat_lock);
2370eda14cbcSMatt Macy 			cvd->vdev_stat.vs_checksum_errors++;
2371eda14cbcSMatt Macy 			mutex_exit(&cvd->vdev_stat_lock);
2372eda14cbcSMatt Macy 		}
2373eda14cbcSMatt Macy 	}
2374eda14cbcSMatt Macy }
2375eda14cbcSMatt Macy 
/*
 * I/O-done handler for RAIDZ.  Writes aggregate per-row child errors;
 * reads reconstruct known-missing columns, verify the checksum, and on
 * failure fall back to re-reading every column and finally to
 * combinatorial reconstruction.
 */
void
vdev_raidz_io_done(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	if (zio->io_type == ZIO_TYPE_WRITE) {
		for (int i = 0; i < rm->rm_nrows; i++) {
			vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
		}
	} else {
		/* First repair anything with known (reported) errors. */
		for (int i = 0; i < rm->rm_nrows; i++) {
			raidz_row_t *rr = rm->rm_row[i];
			vdev_raidz_io_done_reconstruct_known_missing(zio,
			    rm, rr);
		}

		if (raidz_checksum_verify(zio) == 0) {
			for (int i = 0; i < rm->rm_nrows; i++) {
				raidz_row_t *rr = rm->rm_row[i];
				vdev_raidz_io_done_verified(zio, rr);
			}
			zio_checksum_verified(zio);
		} else {
			/*
			 * A sequential resilver has no checksum which makes
			 * combinatorial reconstruction impossible. This code
			 * path is unreachable since raidz_checksum_verify()
			 * has no checksum to verify and must succeed.
			 */
			ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD);

			/*
			 * This isn't a typical situation -- either we got a
			 * read error or a child silently returned bad data.
			 * Read every block so we can try again with as much
			 * data and parity as we can track down. If we've
			 * already been through once before, all children will
			 * be marked as tried so we'll proceed to combinatorial
			 * reconstruction.
			 */
			int nread = 0;
			for (int i = 0; i < rm->rm_nrows; i++) {
				nread += vdev_raidz_read_all(zio,
				    rm->rm_row[i]);
			}
			if (nread != 0) {
				/*
				 * Normally our stage is VDEV_IO_DONE, but if
				 * we've already called redone(), it will have
				 * changed to VDEV_IO_START, in which case we
				 * don't want to call redone() again.
				 */
				if (zio->io_stage != ZIO_STAGE_VDEV_IO_START)
					zio_vdev_io_redone(zio);
				return;
			}

			/* All columns read; try every failure combination. */
			zio->io_error = vdev_raidz_combrec(zio);
			if (zio->io_error == ECKSUM &&
			    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
				vdev_raidz_io_done_unrecoverable(zio);
			}
		}
	}
}
2441eda14cbcSMatt Macy 
2442eda14cbcSMatt Macy static void
2443eda14cbcSMatt Macy vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2444eda14cbcSMatt Macy {
24457877fdebSMatt Macy 	vdev_raidz_t *vdrz = vd->vdev_tsd;
24467877fdebSMatt Macy 	if (faulted > vdrz->vd_nparity)
2447eda14cbcSMatt Macy 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2448eda14cbcSMatt Macy 		    VDEV_AUX_NO_REPLICAS);
2449eda14cbcSMatt Macy 	else if (degraded + faulted != 0)
2450eda14cbcSMatt Macy 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2451eda14cbcSMatt Macy 	else
2452eda14cbcSMatt Macy 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2453eda14cbcSMatt Macy }
2454eda14cbcSMatt Macy 
/*
 * Determine if any portion of the provided block resides on a child vdev
 * with a dirty DTL and therefore needs to be resilvered.  The function
 * assumes that at least one DTL is dirty which implies that full stripe
 * width blocks must be resilvered.
 */
static boolean_t
vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
    uint64_t phys_birth)
{
	vdev_raidz_t *vdrz = vd->vdev_tsd;
	uint64_t dcols = vd->vdev_children;
	uint64_t nparity = vdrz->vd_nparity;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = DVA_GET_OFFSET(dva) >> ashift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = ((psize - 1) >> ashift) + 1;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;

	/* Unreachable by sequential resilver. */
	ASSERT3U(phys_birth, !=, TXG_UNKNOWN);

	if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
		return (B_FALSE);

	/* Block spans every child, so some dirty child must hold part. */
	if (s + nparity >= dcols)
		return (B_TRUE);

	for (uint64_t c = 0; c < s + nparity; c++) {
		uint64_t devidx = (f + c) % dcols;
		vdev_t *cvd = vd->vdev_child[devidx];

		/*
		 * dsl_scan_need_resilver() already checked vd with
		 * vdev_dtl_contains(). So here just check cvd with
		 * vdev_dtl_empty(), cheaper and a good approximation.
		 */
		if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
			return (B_TRUE);
	}

	return (B_FALSE);
}
2500eda14cbcSMatt Macy 
2501eda14cbcSMatt Macy static void
25027877fdebSMatt Macy vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs,
25037877fdebSMatt Macy     range_seg64_t *physical_rs, range_seg64_t *remain_rs)
2504eda14cbcSMatt Macy {
2505*e92ffd9bSMartin Matuska 	(void) remain_rs;
2506*e92ffd9bSMartin Matuska 
2507eda14cbcSMatt Macy 	vdev_t *raidvd = cvd->vdev_parent;
2508eda14cbcSMatt Macy 	ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
2509eda14cbcSMatt Macy 
2510eda14cbcSMatt Macy 	uint64_t width = raidvd->vdev_children;
2511eda14cbcSMatt Macy 	uint64_t tgt_col = cvd->vdev_id;
2512eda14cbcSMatt Macy 	uint64_t ashift = raidvd->vdev_top->vdev_ashift;
2513eda14cbcSMatt Macy 
2514eda14cbcSMatt Macy 	/* make sure the offsets are block-aligned */
25157877fdebSMatt Macy 	ASSERT0(logical_rs->rs_start % (1 << ashift));
25167877fdebSMatt Macy 	ASSERT0(logical_rs->rs_end % (1 << ashift));
25177877fdebSMatt Macy 	uint64_t b_start = logical_rs->rs_start >> ashift;
25187877fdebSMatt Macy 	uint64_t b_end = logical_rs->rs_end >> ashift;
2519eda14cbcSMatt Macy 
2520eda14cbcSMatt Macy 	uint64_t start_row = 0;
2521eda14cbcSMatt Macy 	if (b_start > tgt_col) /* avoid underflow */
2522eda14cbcSMatt Macy 		start_row = ((b_start - tgt_col - 1) / width) + 1;
2523eda14cbcSMatt Macy 
2524eda14cbcSMatt Macy 	uint64_t end_row = 0;
2525eda14cbcSMatt Macy 	if (b_end > tgt_col)
2526eda14cbcSMatt Macy 		end_row = ((b_end - tgt_col - 1) / width) + 1;
2527eda14cbcSMatt Macy 
25287877fdebSMatt Macy 	physical_rs->rs_start = start_row << ashift;
25297877fdebSMatt Macy 	physical_rs->rs_end = end_row << ashift;
2530eda14cbcSMatt Macy 
25317877fdebSMatt Macy 	ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start);
25327877fdebSMatt Macy 	ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
25337877fdebSMatt Macy 	    logical_rs->rs_end - logical_rs->rs_start);
25347877fdebSMatt Macy }
25357877fdebSMatt Macy 
25367877fdebSMatt Macy /*
25377877fdebSMatt Macy  * Initialize private RAIDZ specific fields from the nvlist.
25387877fdebSMatt Macy  */
25397877fdebSMatt Macy static int
25407877fdebSMatt Macy vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
25417877fdebSMatt Macy {
25427877fdebSMatt Macy 	vdev_raidz_t *vdrz;
25437877fdebSMatt Macy 	uint64_t nparity;
25447877fdebSMatt Macy 
25457877fdebSMatt Macy 	uint_t children;
25467877fdebSMatt Macy 	nvlist_t **child;
25477877fdebSMatt Macy 	int error = nvlist_lookup_nvlist_array(nv,
25487877fdebSMatt Macy 	    ZPOOL_CONFIG_CHILDREN, &child, &children);
25497877fdebSMatt Macy 	if (error != 0)
25507877fdebSMatt Macy 		return (SET_ERROR(EINVAL));
25517877fdebSMatt Macy 
25527877fdebSMatt Macy 	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) {
25537877fdebSMatt Macy 		if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
25547877fdebSMatt Macy 			return (SET_ERROR(EINVAL));
25557877fdebSMatt Macy 
25567877fdebSMatt Macy 		/*
25577877fdebSMatt Macy 		 * Previous versions could only support 1 or 2 parity
25587877fdebSMatt Macy 		 * device.
25597877fdebSMatt Macy 		 */
25607877fdebSMatt Macy 		if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2)
25617877fdebSMatt Macy 			return (SET_ERROR(EINVAL));
25627877fdebSMatt Macy 		else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3)
25637877fdebSMatt Macy 			return (SET_ERROR(EINVAL));
25647877fdebSMatt Macy 	} else {
25657877fdebSMatt Macy 		/*
25667877fdebSMatt Macy 		 * We require the parity to be specified for SPAs that
25677877fdebSMatt Macy 		 * support multiple parity levels.
25687877fdebSMatt Macy 		 */
25697877fdebSMatt Macy 		if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
25707877fdebSMatt Macy 			return (SET_ERROR(EINVAL));
25717877fdebSMatt Macy 
25727877fdebSMatt Macy 		/*
25737877fdebSMatt Macy 		 * Otherwise, we default to 1 parity device for RAID-Z.
25747877fdebSMatt Macy 		 */
25757877fdebSMatt Macy 		nparity = 1;
25767877fdebSMatt Macy 	}
25777877fdebSMatt Macy 
25787877fdebSMatt Macy 	vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
25797877fdebSMatt Macy 	vdrz->vd_logical_width = children;
25807877fdebSMatt Macy 	vdrz->vd_nparity = nparity;
25817877fdebSMatt Macy 
25827877fdebSMatt Macy 	*tsd = vdrz;
25837877fdebSMatt Macy 
25847877fdebSMatt Macy 	return (0);
25857877fdebSMatt Macy }
25867877fdebSMatt Macy 
25877877fdebSMatt Macy static void
25887877fdebSMatt Macy vdev_raidz_fini(vdev_t *vd)
25897877fdebSMatt Macy {
25907877fdebSMatt Macy 	kmem_free(vd->vdev_tsd, sizeof (vdev_raidz_t));
25917877fdebSMatt Macy }
25927877fdebSMatt Macy 
25937877fdebSMatt Macy /*
25947877fdebSMatt Macy  * Add RAIDZ specific fields to the config nvlist.
25957877fdebSMatt Macy  */
25967877fdebSMatt Macy static void
25977877fdebSMatt Macy vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
25987877fdebSMatt Macy {
25997877fdebSMatt Macy 	ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
26007877fdebSMatt Macy 	vdev_raidz_t *vdrz = vd->vdev_tsd;
26017877fdebSMatt Macy 
26027877fdebSMatt Macy 	/*
26037877fdebSMatt Macy 	 * Make sure someone hasn't managed to sneak a fancy new vdev
26047877fdebSMatt Macy 	 * into a crufty old storage pool.
26057877fdebSMatt Macy 	 */
26067877fdebSMatt Macy 	ASSERT(vdrz->vd_nparity == 1 ||
26077877fdebSMatt Macy 	    (vdrz->vd_nparity <= 2 &&
26087877fdebSMatt Macy 	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) ||
26097877fdebSMatt Macy 	    (vdrz->vd_nparity <= 3 &&
26107877fdebSMatt Macy 	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3));
26117877fdebSMatt Macy 
26127877fdebSMatt Macy 	/*
26137877fdebSMatt Macy 	 * Note that we'll add these even on storage pools where they
26147877fdebSMatt Macy 	 * aren't strictly required -- older software will just ignore
26157877fdebSMatt Macy 	 * it.
26167877fdebSMatt Macy 	 */
26177877fdebSMatt Macy 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);
26187877fdebSMatt Macy }
26197877fdebSMatt Macy 
26207877fdebSMatt Macy static uint64_t
26217877fdebSMatt Macy vdev_raidz_nparity(vdev_t *vd)
26227877fdebSMatt Macy {
26237877fdebSMatt Macy 	vdev_raidz_t *vdrz = vd->vdev_tsd;
26247877fdebSMatt Macy 	return (vdrz->vd_nparity);
26257877fdebSMatt Macy }
26267877fdebSMatt Macy 
26277877fdebSMatt Macy static uint64_t
26287877fdebSMatt Macy vdev_raidz_ndisks(vdev_t *vd)
26297877fdebSMatt Macy {
26307877fdebSMatt Macy 	return (vd->vdev_children);
2631eda14cbcSMatt Macy }
2632eda14cbcSMatt Macy 
2633eda14cbcSMatt Macy vdev_ops_t vdev_raidz_ops = {
26347877fdebSMatt Macy 	.vdev_op_init = vdev_raidz_init,
26357877fdebSMatt Macy 	.vdev_op_fini = vdev_raidz_fini,
2636eda14cbcSMatt Macy 	.vdev_op_open = vdev_raidz_open,
2637eda14cbcSMatt Macy 	.vdev_op_close = vdev_raidz_close,
2638eda14cbcSMatt Macy 	.vdev_op_asize = vdev_raidz_asize,
26397877fdebSMatt Macy 	.vdev_op_min_asize = vdev_raidz_min_asize,
26407877fdebSMatt Macy 	.vdev_op_min_alloc = NULL,
2641eda14cbcSMatt Macy 	.vdev_op_io_start = vdev_raidz_io_start,
2642eda14cbcSMatt Macy 	.vdev_op_io_done = vdev_raidz_io_done,
2643eda14cbcSMatt Macy 	.vdev_op_state_change = vdev_raidz_state_change,
2644eda14cbcSMatt Macy 	.vdev_op_need_resilver = vdev_raidz_need_resilver,
2645eda14cbcSMatt Macy 	.vdev_op_hold = NULL,
2646eda14cbcSMatt Macy 	.vdev_op_rele = NULL,
2647eda14cbcSMatt Macy 	.vdev_op_remap = NULL,
2648eda14cbcSMatt Macy 	.vdev_op_xlate = vdev_raidz_xlate,
26497877fdebSMatt Macy 	.vdev_op_rebuild_asize = NULL,
26507877fdebSMatt Macy 	.vdev_op_metaslab_init = NULL,
26517877fdebSMatt Macy 	.vdev_op_config_generate = vdev_raidz_config_generate,
26527877fdebSMatt Macy 	.vdev_op_nparity = vdev_raidz_nparity,
26537877fdebSMatt Macy 	.vdev_op_ndisks = vdev_raidz_ndisks,
2654eda14cbcSMatt Macy 	.vdev_op_type = VDEV_TYPE_RAIDZ,	/* name of this vdev type */
2655eda14cbcSMatt Macy 	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
2656eda14cbcSMatt Macy };
2657